From d223f98f02099b002903b9b22b56febae16ef80d Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 16 Apr 2019 15:58:54 -0700
Subject: drm/v3d: Add support for compute shader dispatch.

The compute shader dispatch interface is pretty simple -- just pass in
the regs that userspace has passed us, with no CLs to run.  However,
with no CL to run it means that we need to do manual cache flushing of
the L2 after the HW execution completes (for SSBO, atomic, and
image_load_store writes that are the output of compute shaders).

This doesn't yet expose the L2 cache's ability to have a region of the
address space not write back to memory (which could be used for
shared_var storage).

So far, the Mesa side has been tested on V3D v4.2 simpenrose (passing
the ES31 tests), and on the kernel side on 7278 (failing atomic
compswap tests in a way that doesn't reproduce on simpenrose).

v2: Fix excessive allocation for the clean_job (reported by Dan
    Carpenter).  Keep refs on jobs until clean_job is finished, to
    avoid spurious MMU errors if the output BOs are freed by userspace
    before L2 cleaning is finished.

Signed-off-by: Eric Anholt <eric@anholt.net>
Link: https://patchwork.freedesktop.org/patch/msgid/20190416225856.20264-4-eric@anholt.net
Acked-by: Rob Clark <robdclark@gmail.com>
---
 include/uapi/drm/v3d_drm.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include')

diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h
index ea70669d2138..58fbe48c91e9 100644
--- a/include/uapi/drm/v3d_drm.h
+++ b/include/uapi/drm/v3d_drm.h
@@ -37,6 +37,7 @@ extern "C" {
 #define DRM_V3D_GET_PARAM                         0x04
 #define DRM_V3D_GET_BO_OFFSET                     0x05
 #define DRM_V3D_SUBMIT_TFU                        0x06
+#define DRM_V3D_SUBMIT_CSD                        0x07
 
 #define DRM_IOCTL_V3D_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
 #define DRM_IOCTL_V3D_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
@@ -45,6 +46,7 @@ extern "C" {
 #define DRM_IOCTL_V3D_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
 #define DRM_IOCTL_V3D_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
 #define DRM_IOCTL_V3D_SUBMIT_TFU          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
+#define DRM_IOCTL_V3D_SUBMIT_CSD          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CSD, struct drm_v3d_submit_csd)
 
 /**
  * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
@@ -190,6 +192,7 @@ enum drm_v3d_param {
 	DRM_V3D_PARAM_V3D_CORE0_IDENT1,
 	DRM_V3D_PARAM_V3D_CORE0_IDENT2,
 	DRM_V3D_PARAM_SUPPORTS_TFU,
+	DRM_V3D_PARAM_SUPPORTS_CSD,
 };
 
 struct drm_v3d_get_param {
@@ -230,6 +233,31 @@ struct drm_v3d_submit_tfu {
 	__u32 out_sync;
 };
 
+/* Submits a compute shader for dispatch.  This job will block on any
+ * previous compute shaders submitted on this fd, and any other
+ * synchronization must be performed with in_sync/out_sync.
+ */
+struct drm_v3d_submit_csd {
+	__u32 cfg[7];
+	__u32 coef[4];
+
+	/* Pointer to a u32 array of the BOs that are referenced by the job.
+	 */
+	__u64 bo_handles;
+
+	/* Number of BO handles passed in (size is that times 4). */
+	__u32 bo_handle_count;
+
+	/* sync object to block on before running the CSD job.  Each
+	 * CSD job will execute in the order submitted to its FD.
+	 * Synchronization against rendering/TFU jobs or CSD from
+	 * other fds requires using sync objects.
+	 */
+	__u32 in_sync;
+	/* Sync object to signal when the CSD job is done. */
+	__u32 out_sync;
+};
+
 #if defined(__cplusplus)
 }
 #endif
-- 
cgit v1.2.3


From 96354b5ca4ac74d2e47e75987283963001b517f6 Mon Sep 17 00:00:00 2001
From: Philip Yang <Philip.Yang@amd.com>
Date: Wed, 17 Apr 2019 22:15:19 +0000
Subject: drm: increase drm mmap_range size to 1TB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After patch "drm: Use the same mmap-range offset and size for GEM and
TTM", application failed to create bo of system memory because drm
mmap_range size decrease to 64GB from original 1TB. This is not big
enough for applications. Increase the drm mmap_range size to 1TB.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190417221507.933-1-Philip.Yang@amd.com
---
 include/drm/drm_vma_manager.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/drm_vma_manager.h b/include/drm/drm_vma_manager.h
index f4f8ff1cdeec..76ac5e97a559 100644
--- a/include/drm/drm_vma_manager.h
+++ b/include/drm/drm_vma_manager.h
@@ -35,7 +35,7 @@
  */
 #if BITS_PER_LONG == 64
 #define DRM_FILE_PAGE_OFFSET_START ((0xFFFFFFFFUL >> PAGE_SHIFT) + 1)
-#define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFFFFFFUL >> PAGE_SHIFT) * 16)
+#define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFFFFFFUL >> PAGE_SHIFT) * 256)
 #else
 #define DRM_FILE_PAGE_OFFSET_START ((0xFFFFFFFUL >> PAGE_SHIFT) + 1)
 #define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFFFFFUL >> PAGE_SHIFT) * 16)
-- 
cgit v1.2.3


From 7d26097b4beb802fb6ce7cd6389ac4bf12b1a6ae Mon Sep 17 00:00:00 2001
From: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Date: Fri, 1 Mar 2019 13:56:12 +0100
Subject: drm/atomic: Create __drm_atomic_helper_crtc_reset() for subclassing
 crtc_state.

We already have __drm_atomic_helper_connector_reset() and
__drm_atomic_helper_plane_reset(), extend this to crtc as well.

This will allow us to set default values in the crtc_state, without
having to do it in each driver separately.

Of all drivers that need conversion, only nouveau is done in this
commit, because it wrote its own __drm_atomic_helper_crtc_reset(),
clashing with the drm core.

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Ben Skeggs <bskeggs@redhat.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20190301125627.7285-3-maarten.lankhorst@linux.intel.com
---
 drivers/gpu/drm/drm_atomic_state_helper.c | 34 +++++++++++++++++++++++++------
 drivers/gpu/drm/nouveau/dispnv50/head.c   | 13 +++---------
 include/drm/drm_atomic_state_helper.h     |  2 ++
 3 files changed, 33 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c
index 59ffb6b9c745..06a97a268e81 100644
--- a/drivers/gpu/drm/drm_atomic_state_helper.c
+++ b/drivers/gpu/drm/drm_atomic_state_helper.c
@@ -56,6 +56,29 @@
  * for these functions.
  */
 
+/**
+ * __drm_atomic_helper_crtc_reset - reset state on CRTC
+ * @crtc: drm CRTC
+ * @crtc_state: CRTC state to assign
+ *
+ * Initializes the newly allocated @crtc_state and assigns it to
+ * the &drm_crtc->state pointer of @crtc, usually required when
+ * initializing the drivers or when called from the &drm_crtc_funcs.reset
+ * hook.
+ *
+ * This is useful for drivers that subclass the CRTC state.
+ */
+void
+__drm_atomic_helper_crtc_reset(struct drm_crtc *crtc,
+			       struct drm_crtc_state *crtc_state)
+{
+	if (crtc_state)
+		crtc_state->crtc = crtc;
+
+	crtc->state = crtc_state;
+}
+EXPORT_SYMBOL(__drm_atomic_helper_crtc_reset);
+
 /**
  * drm_atomic_helper_crtc_reset - default &drm_crtc_funcs.reset hook for CRTCs
  * @crtc: drm CRTC
@@ -65,14 +88,13 @@
  */
 void drm_atomic_helper_crtc_reset(struct drm_crtc *crtc)
 {
-	if (crtc->state)
-		__drm_atomic_helper_crtc_destroy_state(crtc->state);
-
-	kfree(crtc->state);
-	crtc->state = kzalloc(sizeof(*crtc->state), GFP_KERNEL);
+	struct drm_crtc_state *crtc_state =
+		kzalloc(sizeof(*crtc->state), GFP_KERNEL);
 
 	if (crtc->state)
-		crtc->state->crtc = crtc;
+		crtc->funcs->atomic_destroy_state(crtc, crtc->state);
+
+	__drm_atomic_helper_crtc_reset(crtc, crtc_state);
 }
 EXPORT_SYMBOL(drm_atomic_helper_crtc_reset);
 
diff --git a/drivers/gpu/drm/nouveau/dispnv50/head.c b/drivers/gpu/drm/nouveau/dispnv50/head.c
index 2e7a0c347ddb..93754743090f 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/head.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/head.c
@@ -419,16 +419,6 @@ nv50_head_atomic_duplicate_state(struct drm_crtc *crtc)
 	return &asyh->state;
 }
 
-static void
-__drm_atomic_helper_crtc_reset(struct drm_crtc *crtc,
-			       struct drm_crtc_state *state)
-{
-	if (crtc->state)
-		crtc->funcs->atomic_destroy_state(crtc, crtc->state);
-	crtc->state = state;
-	crtc->state->crtc = crtc;
-}
-
 static void
 nv50_head_reset(struct drm_crtc *crtc)
 {
@@ -437,6 +427,9 @@ nv50_head_reset(struct drm_crtc *crtc)
 	if (WARN_ON(!(asyh = kzalloc(sizeof(*asyh), GFP_KERNEL))))
 		return;
 
+	if (crtc->state)
+		nv50_head_atomic_destroy_state(crtc, crtc->state);
+
 	__drm_atomic_helper_crtc_reset(crtc, &asyh->state);
 }
 
diff --git a/include/drm/drm_atomic_state_helper.h b/include/drm/drm_atomic_state_helper.h
index 66c92cbd8e16..4e6d2e7a40b8 100644
--- a/include/drm/drm_atomic_state_helper.h
+++ b/include/drm/drm_atomic_state_helper.h
@@ -37,6 +37,8 @@ struct drm_private_state;
 struct drm_modeset_acquire_ctx;
 struct drm_device;
 
+void __drm_atomic_helper_crtc_reset(struct drm_crtc *crtc,
+				    struct drm_crtc_state *state);
 void drm_atomic_helper_crtc_reset(struct drm_crtc *crtc);
 void __drm_atomic_helper_crtc_duplicate_state(struct drm_crtc *crtc,
 					      struct drm_crtc_state *state);
-- 
cgit v1.2.3


From 67b886d290052dbf2bcfc876a5ae41a5fe461edf Mon Sep 17 00:00:00 2001
From: "Andrew F. Davis" <afd@ti.com>
Date: Thu, 21 Mar 2019 15:09:56 -0500
Subject: dma-buf: Remove leftover [un]map_atomic comments

The map_atomic/unmap_atomic callbacks have been removed, remove
the related comments.

Fixes: f664a5269542 ("dma-buf: remove kmap_atomic interface")
Signed-off-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190321200957.16938-1-afd@ti.com
---
 include/linux/dma-buf.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 58725f890b5b..e4a8dab2bc54 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -39,11 +39,6 @@ struct dma_buf_attachment;
 
 /**
  * struct dma_buf_ops - operations possible on struct dma_buf
- * @map_atomic: [optional] maps a page from the buffer into kernel address
- *		space, users may not block until the subsequent unmap call.
- *		This callback must not sleep.
- * @unmap_atomic: [optional] unmaps a atomically mapped page from the buffer.
- *		  This Callback must not sleep.
  * @map: [optional] maps a page from the buffer into kernel address space.
  * @unmap: [optional] unmaps a page from the buffer.
  * @vmap: [optional] creates a virtual mapping for the buffer into kernel
-- 
cgit v1.2.3


From d5ae7712b7ffbb435e8f3d98f2123eff4734c77f Mon Sep 17 00:00:00 2001
From: "Andrew F. Davis" <afd@ti.com>
Date: Thu, 21 Mar 2019 15:09:57 -0500
Subject: dma-buf: Update [un]map documentation to match the other functions

Other function have inline documentation, a couple still have
theirs at the top of the structure, update the docs and move
them inline.

Signed-off-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190321200957.16938-2-afd@ti.com
---
 include/linux/dma-buf.h | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index e4a8dab2bc54..a0bd071466fc 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -39,8 +39,6 @@ struct dma_buf_attachment;
 
 /**
  * struct dma_buf_ops - operations possible on struct dma_buf
- * @map: [optional] maps a page from the buffer into kernel address space.
- * @unmap: [optional] unmaps a page from the buffer.
  * @vmap: [optional] creates a virtual mapping for the buffer into kernel
  *	  address space. Same restrictions as for vmap and friends apply.
  * @vunmap: [optional] unmaps a vmap from the buffer
@@ -200,8 +198,6 @@ struct dma_buf_ops {
 	 * to be restarted.
 	 */
 	int (*end_cpu_access)(struct dma_buf *, enum dma_data_direction);
-	void *(*map)(struct dma_buf *, unsigned long);
-	void (*unmap)(struct dma_buf *, unsigned long, void *);
 
 	/**
 	 * @mmap:
@@ -240,6 +236,31 @@ struct dma_buf_ops {
 	 */
 	int (*mmap)(struct dma_buf *, struct vm_area_struct *vma);
 
+	/**
+	 * @map:
+	 *
+	 * Maps a page from the buffer into kernel address space. The page is
+	 * specified by offset into the buffer in PAGE_SIZE units.
+	 *
+	 * This callback is optional.
+	 *
+	 * Returns:
+	 *
+	 * Virtual address pointer where requested page can be accessed. NULL
+	 * on error or when this function is unimplemented by the exporter.
+	 */
+	void *(*map)(struct dma_buf *, unsigned long);
+
+	/**
+	 * @unmap:
+	 *
+	 * Unmaps a page from the buffer. Page offset and address pointer should
+	 * be the same as the one passed to and returned by matching call to map.
+	 *
+	 * This callback is optional.
+	 */
+	void (*unmap)(struct dma_buf *, unsigned long, void *);
+
 	void *(*vmap)(struct dma_buf *);
 	void (*vunmap)(struct dma_buf *, void *vaddr);
 };
-- 
cgit v1.2.3


From 5918045c4ed492fb5813f980dcf89a90fefd0a4e Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Thu, 18 Apr 2019 11:00:21 -0400
Subject: drm/scheduler: rework job destruction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We now destroy finished jobs from the worker thread to make sure that
we never destroy a job currently in timeout processing.
By this we avoid holding lock around ring mirror list in drm_sched_stop
which should solve a deadlock reported by a user.

v2: Remove unused variable.
v4: Move guilty job free into sched code.
v5:
Move sched->hw_rq_count to drm_sched_start to account for counter
decrement in drm_sched_stop even when we don't call resubmit jobs
if guily job did signal.
v6: remove unused variable

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109692

Acked-by: Chunming Zhou <david1.zhou@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1555599624-12285-3-git-send-email-andrey.grodzovsky@amd.com
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   9 +-
 drivers/gpu/drm/etnaviv/etnaviv_dump.c     |   5 -
 drivers/gpu/drm/etnaviv/etnaviv_sched.c    |   2 +-
 drivers/gpu/drm/lima/lima_sched.c          |   2 +-
 drivers/gpu/drm/panfrost/panfrost_job.c    |   2 +-
 drivers/gpu/drm/scheduler/sched_main.c     | 159 +++++++++++++++++------------
 drivers/gpu/drm/v3d/v3d_sched.c            |   2 +-
 include/drm/gpu_scheduler.h                |   6 +-
 8 files changed, 102 insertions(+), 85 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7cee269ec3e3..a0e165c91a78 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3334,7 +3334,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		if (!ring || !ring->sched.thread)
 			continue;
 
-		drm_sched_stop(&ring->sched);
+		drm_sched_stop(&ring->sched, &job->base);
 
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
 		amdgpu_fence_driver_force_completion(ring);
@@ -3343,8 +3343,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	if(job)
 		drm_sched_increase_karma(&job->base);
 
-
-
 	if (!amdgpu_sriov_vf(adev)) {
 
 		if (!need_full_reset)
@@ -3482,8 +3480,7 @@ end:
 	return r;
 }
 
-static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
-					  struct amdgpu_job *job)
+static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
 {
 	int i;
 
@@ -3623,7 +3620,7 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-		amdgpu_device_post_asic_reset(tmp_adev, tmp_adev == adev ? job : NULL);
+		amdgpu_device_post_asic_reset(tmp_adev);
 
 		if (r) {
 			/* bad news, how to tell it to userspace ? */
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
index 33854c94cb85..b24ddc406bba 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
@@ -118,7 +118,6 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
 	unsigned int n_obj, n_bomap_pages;
 	size_t file_size, mmu_size;
 	__le64 *bomap, *bomap_start;
-	unsigned long flags;
 
 	/* Only catch the first event, or when manually re-armed */
 	if (!etnaviv_dump_core)
@@ -135,13 +134,11 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
 		    mmu_size + gpu->buffer.size;
 
 	/* Add in the active command buffers */
-	spin_lock_irqsave(&gpu->sched.job_list_lock, flags);
 	list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
 		submit = to_etnaviv_submit(s_job);
 		file_size += submit->cmdbuf.size;
 		n_obj++;
 	}
-	spin_unlock_irqrestore(&gpu->sched.job_list_lock, flags);
 
 	/* Add in the active buffer objects */
 	list_for_each_entry(vram, &gpu->mmu->mappings, mmu_node) {
@@ -183,14 +180,12 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
 			      gpu->buffer.size,
 			      etnaviv_cmdbuf_get_va(&gpu->buffer));
 
-	spin_lock_irqsave(&gpu->sched.job_list_lock, flags);
 	list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
 		submit = to_etnaviv_submit(s_job);
 		etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD,
 				      submit->cmdbuf.vaddr, submit->cmdbuf.size,
 				      etnaviv_cmdbuf_get_va(&submit->cmdbuf));
 	}
-	spin_unlock_irqrestore(&gpu->sched.job_list_lock, flags);
 
 	/* Reserve space for the bomap */
 	if (n_bomap_pages) {
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 6d24fea1766b..a813c824e154 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -109,7 +109,7 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
 	}
 
 	/* block scheduler */
-	drm_sched_stop(&gpu->sched);
+	drm_sched_stop(&gpu->sched, sched_job);
 
 	if(sched_job)
 		drm_sched_increase_karma(sched_job);
diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index d53bd45f8d96..58a15479d175 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -258,7 +258,7 @@ static struct dma_fence *lima_sched_run_job(struct drm_sched_job *job)
 static void lima_sched_handle_error_task(struct lima_sched_pipe *pipe,
 					 struct lima_sched_task *task)
 {
-	drm_sched_stop(&pipe->base);
+	drm_sched_stop(&pipe->base, &task->base);
 
 	if (task)
 		drm_sched_increase_karma(&task->base);
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index a5716c8fe8b3..9bb9260d9181 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -387,7 +387,7 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
 	mutex_lock(&pfdev->reset_lock);
 
 	for (i = 0; i < NUM_JOB_SLOTS; i++)
-		drm_sched_stop(&pfdev->js->queue[i].sched);
+		drm_sched_stop(&pfdev->js->queue[i].sched, sched_job);
 
 	if (sched_job)
 		drm_sched_increase_karma(sched_job);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 19fc601c9eeb..7816de7e8c82 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -265,32 +265,6 @@ void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
 }
 EXPORT_SYMBOL(drm_sched_resume_timeout);
 
-/* job_finish is called after hw fence signaled
- */
-static void drm_sched_job_finish(struct work_struct *work)
-{
-	struct drm_sched_job *s_job = container_of(work, struct drm_sched_job,
-						   finish_work);
-	struct drm_gpu_scheduler *sched = s_job->sched;
-	unsigned long flags;
-
-	/*
-	 * Canceling the timeout without removing our job from the ring mirror
-	 * list is safe, as we will only end up in this worker if our jobs
-	 * finished fence has been signaled. So even if some another worker
-	 * manages to find this job as the next job in the list, the fence
-	 * signaled check below will prevent the timeout to be restarted.
-	 */
-	cancel_delayed_work_sync(&sched->work_tdr);
-
-	spin_lock_irqsave(&sched->job_list_lock, flags);
-	/* queue TDR for next job */
-	drm_sched_start_timeout(sched);
-	spin_unlock_irqrestore(&sched->job_list_lock, flags);
-
-	sched->ops->free_job(s_job);
-}
-
 static void drm_sched_job_begin(struct drm_sched_job *s_job)
 {
 	struct drm_gpu_scheduler *sched = s_job->sched;
@@ -315,6 +289,13 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	if (job)
 		job->sched->ops->timedout_job(job);
 
+	/*
+	 * Guilty job did complete and hence needs to be manually removed
+	 * See drm_sched_stop doc.
+	 */
+	if (list_empty(&job->node))
+		job->sched->ops->free_job(job);
+
 	spin_lock_irqsave(&sched->job_list_lock, flags);
 	drm_sched_start_timeout(sched);
 	spin_unlock_irqrestore(&sched->job_list_lock, flags);
@@ -371,23 +352,26 @@ EXPORT_SYMBOL(drm_sched_increase_karma);
  * @sched: scheduler instance
  * @bad: bad scheduler job
  *
+ * Stop the scheduler and also removes and frees all completed jobs.
+ * Note: bad job will not be freed as it might be used later and so it's
+ * callers responsibility to release it manually if it's not part of the
+ * mirror list any more.
+ *
  */
-void drm_sched_stop(struct drm_gpu_scheduler *sched)
+void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 {
-	struct drm_sched_job *s_job;
+	struct drm_sched_job *s_job, *tmp;
 	unsigned long flags;
-	struct dma_fence *last_fence =  NULL;
 
 	kthread_park(sched->thread);
 
 	/*
-	 * Verify all the signaled jobs in mirror list are removed from the ring
-	 * by waiting for the latest job to enter the list. This should insure that
-	 * also all the previous jobs that were in flight also already singaled
-	 * and removed from the list.
+	 * Iterate the job list from later to  earlier one and either deactive
+	 * their HW callbacks or remove them from mirror list if they already
+	 * signaled.
+	 * This iteration is thread safe as sched thread is stopped.
 	 */
-	spin_lock_irqsave(&sched->job_list_lock, flags);
-	list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
+	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
 		if (s_job->s_fence->parent &&
 		    dma_fence_remove_callback(s_job->s_fence->parent,
 					      &s_job->cb)) {
@@ -395,16 +379,30 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched)
 			s_job->s_fence->parent = NULL;
 			atomic_dec(&sched->hw_rq_count);
 		} else {
-			 last_fence = dma_fence_get(&s_job->s_fence->finished);
-			 break;
+			/*
+			 * remove job from ring_mirror_list.
+			 * Locking here is for concurrent resume timeout
+			 */
+			spin_lock_irqsave(&sched->job_list_lock, flags);
+			list_del_init(&s_job->node);
+			spin_unlock_irqrestore(&sched->job_list_lock, flags);
+
+			/*
+			 * Wait for job's HW fence callback to finish using s_job
+			 * before releasing it.
+			 *
+			 * Job is still alive so fence refcount at least 1
+			 */
+			dma_fence_wait(&s_job->s_fence->finished, false);
+
+			/*
+			 * We must keep bad job alive for later use during
+			 * recovery by some of the drivers
+			 */
+			if (bad != s_job)
+				sched->ops->free_job(s_job);
 		}
 	}
-	spin_unlock_irqrestore(&sched->job_list_lock, flags);
-
-	if (last_fence) {
-		dma_fence_wait(last_fence, false);
-		dma_fence_put(last_fence);
-	}
 }
 
 EXPORT_SYMBOL(drm_sched_stop);
@@ -418,21 +416,22 @@ EXPORT_SYMBOL(drm_sched_stop);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 {
 	struct drm_sched_job *s_job, *tmp;
+	unsigned long flags;
 	int r;
 
-	if (!full_recovery)
-		goto unpark;
-
 	/*
 	 * Locking the list is not required here as the sched thread is parked
-	 * so no new jobs are being pushed in to HW and in drm_sched_stop we
-	 * flushed all the jobs who were still in mirror list but who already
-	 * signaled and removed them self from the list. Also concurrent
+	 * so no new jobs are being inserted or removed. Also concurrent
 	 * GPU recovers can't run in parallel.
 	 */
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct dma_fence *fence = s_job->s_fence->parent;
 
+		atomic_inc(&sched->hw_rq_count);
+
+		if (!full_recovery)
+			continue;
+
 		if (fence) {
 			r = dma_fence_add_callback(fence, &s_job->cb,
 						   drm_sched_process_job);
@@ -445,9 +444,12 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 			drm_sched_process_job(NULL, &s_job->cb);
 	}
 
-	drm_sched_start_timeout(sched);
+	if (full_recovery) {
+		spin_lock_irqsave(&sched->job_list_lock, flags);
+		drm_sched_start_timeout(sched);
+		spin_unlock_irqrestore(&sched->job_list_lock, flags);
+	}
 
-unpark:
 	kthread_unpark(sched->thread);
 }
 EXPORT_SYMBOL(drm_sched_start);
@@ -464,7 +466,6 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
 	uint64_t guilty_context;
 	bool found_guilty = false;
 
-	/*TODO DO we need spinlock here ? */
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct drm_sched_fence *s_fence = s_job->s_fence;
 
@@ -477,7 +478,6 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
 			dma_fence_set_error(&s_fence->finished, -ECANCELED);
 
 		s_job->s_fence->parent = sched->ops->run_job(s_job);
-		atomic_inc(&sched->hw_rq_count);
 	}
 }
 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
@@ -514,7 +514,6 @@ int drm_sched_job_init(struct drm_sched_job *job,
 		return -ENOMEM;
 	job->id = atomic64_inc_return(&sched->job_id_count);
 
-	INIT_WORK(&job->finish_work, drm_sched_job_finish);
 	INIT_LIST_HEAD(&job->node);
 
 	return 0;
@@ -597,24 +596,53 @@ static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb)
 	struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
 	struct drm_sched_fence *s_fence = s_job->s_fence;
 	struct drm_gpu_scheduler *sched = s_fence->sched;
-	unsigned long flags;
-
-	cancel_delayed_work(&sched->work_tdr);
 
 	atomic_dec(&sched->hw_rq_count);
 	atomic_dec(&sched->num_jobs);
 
-	spin_lock_irqsave(&sched->job_list_lock, flags);
-	/* remove job from ring_mirror_list */
-	list_del_init(&s_job->node);
-	spin_unlock_irqrestore(&sched->job_list_lock, flags);
+	trace_drm_sched_process_job(s_fence);
 
 	drm_sched_fence_finished(s_fence);
-
-	trace_drm_sched_process_job(s_fence);
 	wake_up_interruptible(&sched->wake_up_worker);
+}
+
+/**
+ * drm_sched_cleanup_jobs - destroy finished jobs
+ *
+ * @sched: scheduler instance
+ *
+ * Remove all finished jobs from the mirror list and destroy them.
+ */
+static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched)
+{
+	unsigned long flags;
+
+	/* Don't destroy jobs while the timeout worker is running */
+	if (!cancel_delayed_work(&sched->work_tdr))
+		return;
+
+
+	while (!list_empty(&sched->ring_mirror_list)) {
+		struct drm_sched_job *job;
+
+		job = list_first_entry(&sched->ring_mirror_list,
+				       struct drm_sched_job, node);
+		if (!dma_fence_is_signaled(&job->s_fence->finished))
+			break;
+
+		spin_lock_irqsave(&sched->job_list_lock, flags);
+		/* remove job from ring_mirror_list */
+		list_del_init(&job->node);
+		spin_unlock_irqrestore(&sched->job_list_lock, flags);
+
+		sched->ops->free_job(job);
+	}
+
+	/* queue timeout for next job */
+	spin_lock_irqsave(&sched->job_list_lock, flags);
+	drm_sched_start_timeout(sched);
+	spin_unlock_irqrestore(&sched->job_list_lock, flags);
 
-	schedule_work(&s_job->finish_work);
 }
 
 /**
@@ -656,9 +684,10 @@ static int drm_sched_main(void *param)
 		struct dma_fence *fence;
 
 		wait_event_interruptible(sched->wake_up_worker,
+					 (drm_sched_cleanup_jobs(sched),
 					 (!drm_sched_blocked(sched) &&
 					  (entity = drm_sched_select_entity(sched))) ||
-					 kthread_should_stop());
+					 kthread_should_stop()));
 
 		if (!entity)
 			continue;
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index b4255807b3a7..8c2df6d95283 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -268,7 +268,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
 
 	/* block scheduler */
 	for (q = 0; q < V3D_MAX_QUEUES; q++)
-		drm_sched_stop(&v3d->queue[q].sched);
+		drm_sched_stop(&v3d->queue[q].sched, sched_job);
 
 	if (sched_job)
 		drm_sched_increase_karma(sched_job);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 0daca4d8dad9..9ee0f2735d71 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -167,9 +167,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
  * @sched: the scheduler instance on which this job is scheduled.
  * @s_fence: contains the fences for the scheduling of job.
  * @finish_cb: the callback for the finished fence.
- * @finish_work: schedules the function @drm_sched_job_finish once the job has
- *               finished to remove the job from the
- *               @drm_gpu_scheduler.ring_mirror_list.
  * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
  * @id: a unique id assigned to each job scheduled on the scheduler.
  * @karma: increment on every hang caused by this job. If this exceeds the hang
@@ -188,7 +185,6 @@ struct drm_sched_job {
 	struct drm_gpu_scheduler	*sched;
 	struct drm_sched_fence		*s_fence;
 	struct dma_fence_cb		finish_cb;
-	struct work_struct		finish_work;
 	struct list_head		node;
 	uint64_t			id;
 	atomic_t			karma;
@@ -296,7 +292,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
 		       void *owner);
 void drm_sched_job_cleanup(struct drm_sched_job *job);
 void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
-void drm_sched_stop(struct drm_gpu_scheduler *sched);
+void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
 void drm_sched_increase_karma(struct drm_sched_job *bad);
-- 
cgit v1.2.3


From a5343b8a2ca5799ee6370e3cca77369a4c598221 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Thu, 18 Apr 2019 11:00:23 -0400
Subject: drm/scheduler: Add flag to hint the release of guilty job.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem:
Sched thread's cleanup function races against TO handler
and removes the guilty job from mirror list and we
have no way of differentiating if the job was removed from within the
TO handler or from the sched thread's clean-up function.

Fix:
Add a flag to scheduler to hint the TO handler that the guilty job needs
to be explicitly released.

v2: whitespace fix

Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1555599624-12285-5-git-send-email-andrey.grodzovsky@amd.com
---
 drivers/gpu/drm/scheduler/sched_main.c | 9 +++++++--
 include/drm/gpu_scheduler.h            | 2 ++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 03e6bd8a1a42..f8f0e1c19002 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -293,8 +293,10 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	 * Guilty job did complete and hence needs to be manually removed
 	 * See drm_sched_stop doc.
 	 */
-	if (list_empty(&job->node))
+	if (sched->free_guilty) {
 		job->sched->ops->free_job(job);
+		sched->free_guilty = false;
+	}
 
 	spin_lock_irqsave(&sched->job_list_lock, flags);
 	drm_sched_start_timeout(sched);
@@ -395,10 +397,13 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 
 			/*
 			 * We must keep bad job alive for later use during
-			 * recovery by some of the drivers
+			 * recovery by some of the drivers but leave a hint
+			 * that the guilty job must be released.
 			 */
 			if (bad != s_job)
 				sched->ops->free_job(s_job);
+			else
+				sched->free_guilty = true;
 		}
 	}
 
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 9ee0f2735d71..57b4121c750a 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -259,6 +259,7 @@ struct drm_sched_backend_ops {
  *              guilty and it will be considered for scheduling further.
  * @num_jobs: the number of jobs in queue in the scheduler
  * @ready: marks if the underlying HW is ready to work
+ * @free_guilty: A hit to time out handler to free the guilty job.
  *
  * One scheduler is implemented for each hardware ring.
  */
@@ -279,6 +280,7 @@ struct drm_gpu_scheduler {
 	int				hang_limit;
 	atomic_t                        num_jobs;
 	bool			ready;
+	bool				free_guilty;
 };
 
 int drm_sched_init(struct drm_gpu_scheduler *sched,
-- 
cgit v1.2.3


From 27edadf6df811bcad0f7fa2f89321b4590a425bc Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Mon, 6 May 2019 16:46:29 +0200
Subject: drm/doc: Improve docs for conn_state->best_encoder

It's mandatory and considered core state since ioctls rely on this
working.

Thanks to Laurent for pointing out this gap.

v2: Clarify to "atomic drivers" only.

Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Sean Paul <sean@poorly.run>
Acked-by: Sean Paul <sean@poorly.run>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190506144629.5976-1-daniel.vetter@ffwll.ch
---
 include/drm/drm_connector.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index 02a131202add..f43f40d5888a 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -517,6 +517,10 @@ struct drm_connector_state {
 	 * Used by the atomic helpers to select the encoder, through the
 	 * &drm_connector_helper_funcs.atomic_best_encoder or
 	 * &drm_connector_helper_funcs.best_encoder callbacks.
+	 *
+	 * NOTE: Atomic drivers must fill this out (either themselves or through
+	 * helpers), for otherwise the GETCONNECTOR and GETENCODER IOCTLs will
+	 * not return correct data to userspace.
 	 */
 	struct drm_encoder *best_encoder;
 
-- 
cgit v1.2.3


From 585b000de23ba06818975734da2303bd6ba2dcdd Mon Sep 17 00:00:00 2001
From: Ramalingam C <ramalingam.c@intel.com>
Date: Tue, 7 May 2019 21:57:35 +0530
Subject: drm: move content protection property to mode_config

Content protection property is created once and stored in
drm_mode_config. And attached to all HDCP capable connectors.

Signed-off-by: Ramalingam C <ramalingam.c@intel.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Acked-by: Dave Airlie <airlied@gmail.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20190507162745.25600-2-ramalingam.c@intel.com
---
 drivers/gpu/drm/drm_atomic_uapi.c |  4 ++--
 drivers/gpu/drm/drm_connector.c   | 13 +++++++------
 include/drm/drm_connector.h       |  6 ------
 include/drm/drm_mode_config.h     |  6 ++++++
 4 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c
index 428d82662dc4..4131e669785a 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -732,7 +732,7 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector,
 		state->content_type = val;
 	} else if (property == connector->scaling_mode_property) {
 		state->scaling_mode = val;
-	} else if (property == connector->content_protection_property) {
+	} else if (property == config->content_protection_property) {
 		if (val == DRM_MODE_CONTENT_PROTECTION_ENABLED) {
 			DRM_DEBUG_KMS("only drivers can set CP Enabled\n");
 			return -EINVAL;
@@ -814,7 +814,7 @@ drm_atomic_connector_get_property(struct drm_connector *connector,
 		*val = state->colorspace;
 	} else if (property == connector->scaling_mode_property) {
 		*val = state->scaling_mode;
-	} else if (property == connector->content_protection_property) {
+	} else if (property == config->content_protection_property) {
 		*val = state->content_protection;
 	} else if (property == config->writeback_fb_id_property) {
 		/* Writeback framebuffer is one-shot, write and forget */
diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index 2355124849db..7c0eda9cca60 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -1534,18 +1534,19 @@ int drm_connector_attach_content_protection_property(
 		struct drm_connector *connector)
 {
 	struct drm_device *dev = connector->dev;
-	struct drm_property *prop;
+	struct drm_property *prop =
+			dev->mode_config.content_protection_property;
 
-	prop = drm_property_create_enum(dev, 0, "Content Protection",
-					drm_cp_enum_list,
-					ARRAY_SIZE(drm_cp_enum_list));
+	if (!prop)
+		prop = drm_property_create_enum(dev, 0, "Content Protection",
+						drm_cp_enum_list,
+						ARRAY_SIZE(drm_cp_enum_list));
 	if (!prop)
 		return -ENOMEM;
 
 	drm_object_attach_property(&connector->base, prop,
 				   DRM_MODE_CONTENT_PROTECTION_UNDESIRED);
-
-	connector->content_protection_property = prop;
+	dev->mode_config.content_protection_property = prop;
 
 	return 0;
 }
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index 02a131202add..5e41942e5679 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -1061,12 +1061,6 @@ struct drm_connector {
 	 */
 	struct drm_property *vrr_capable_property;
 
-	/**
-	 * @content_protection_property: DRM ENUM property for content
-	 * protection. See drm_connector_attach_content_protection_property().
-	 */
-	struct drm_property *content_protection_property;
-
 	/**
 	 * @colorspace_property: Connector property to set the suitable
 	 * colorspace supported by the sink.
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 7f60e8eb269a..5764ee3c7453 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -836,6 +836,12 @@ struct drm_mode_config {
 	 */
 	struct drm_property *writeback_out_fence_ptr_property;
 
+	/**
+	 * @content_protection_property: DRM ENUM property for content
+	 * protection. See drm_connector_attach_content_protection_property().
+	 */
+	struct drm_property *content_protection_property;
+
 	/* dumb ioctl parameters */
 	uint32_t preferred_depth, prefer_shadow;
 
-- 
cgit v1.2.3


From 0de655cae416b20ed7876adb480e5c65810274ea Mon Sep 17 00:00:00 2001
From: Ramalingam C <ramalingam.c@intel.com>
Date: Tue, 7 May 2019 21:57:37 +0530
Subject: drm: generic fn converting be24 to cpu and vice versa

Existing functions for converting a 3bytes(be24) of big endian value
into u32 of little endian and vice versa are renamed as

s/drm_hdcp2_seq_num_to_u32/drm_hdcp_be24_to_cpu
s/drm_hdcp2_u32_to_seq_num/drm_hdcp_cpu_to_be24

Signed-off-by: Ramalingam C <ramalingam.c@intel.com>
Suggested-by: Daniel Vetter <daniel@ffwll.ch>
cc: Tomas Winkler <tomas.winkler@intel.com>
Acked-by: Dave Airlie <airlied@gmail.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20190507162745.25600-4-ramalingam.c@intel.com
---
 drivers/gpu/drm/i915/intel_hdcp.c | 5 +++--
 drivers/misc/mei/hdcp/mei_hdcp.c  | 2 +-
 include/drm/drm_hdcp.h            | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/intel_hdcp.c b/drivers/gpu/drm/i915/intel_hdcp.c
index b8c8d6d1a33d..c308dfee9ca4 100644
--- a/drivers/gpu/drm/i915/intel_hdcp.c
+++ b/drivers/gpu/drm/i915/intel_hdcp.c
@@ -1305,7 +1305,7 @@ int hdcp2_propagate_stream_management_info(struct intel_connector *connector)
 
 	/* Prepare RepeaterAuth_Stream_Manage msg */
 	msgs.stream_manage.msg_id = HDCP_2_2_REP_STREAM_MANAGE;
-	drm_hdcp2_u32_to_seq_num(msgs.stream_manage.seq_num_m, hdcp->seq_num_m);
+	drm_hdcp_cpu_to_be24(msgs.stream_manage.seq_num_m, hdcp->seq_num_m);
 
 	/* K no of streams is fixed as 1. Stored as big-endian. */
 	msgs.stream_manage.k = cpu_to_be16(1);
@@ -1370,7 +1370,8 @@ int hdcp2_authenticate_repeater_topology(struct intel_connector *connector)
 	}
 
 	/* Converting and Storing the seq_num_v to local variable as DWORD */
-	seq_num_v = drm_hdcp2_seq_num_to_u32(msgs.recvid_list.seq_num_v);
+	seq_num_v =
+		drm_hdcp_be24_to_cpu((const u8 *)msgs.recvid_list.seq_num_v);
 
 	if (seq_num_v < hdcp->seq_num_v) {
 		/* Roll over of the seq_num_v from repeater. Reauthenticate. */
diff --git a/drivers/misc/mei/hdcp/mei_hdcp.c b/drivers/misc/mei/hdcp/mei_hdcp.c
index 90b6ae8e9dae..2f192d6d8b54 100644
--- a/drivers/misc/mei/hdcp/mei_hdcp.c
+++ b/drivers/misc/mei/hdcp/mei_hdcp.c
@@ -576,7 +576,7 @@ static int mei_hdcp_verify_mprime(struct device *dev,
 
 	memcpy(verify_mprime_in.m_prime, stream_ready->m_prime,
 	       HDCP_2_2_MPRIME_LEN);
-	drm_hdcp2_u32_to_seq_num(verify_mprime_in.seq_num_m, data->seq_num_m);
+	drm_hdcp_cpu_to_be24(verify_mprime_in.seq_num_m, data->seq_num_m);
 	memcpy(verify_mprime_in.streams, data->streams,
 	       (data->k * sizeof(struct hdcp2_streamid_type)));
 
diff --git a/include/drm/drm_hdcp.h b/include/drm/drm_hdcp.h
index f243408ecf26..1cc66df05a43 100644
--- a/include/drm/drm_hdcp.h
+++ b/include/drm/drm_hdcp.h
@@ -252,13 +252,13 @@ struct hdcp2_rep_stream_ready {
  * host format and back
  */
 static inline
-u32 drm_hdcp2_seq_num_to_u32(u8 seq_num[HDCP_2_2_SEQ_NUM_LEN])
+u32 drm_hdcp_be24_to_cpu(const u8 seq_num[HDCP_2_2_SEQ_NUM_LEN])
 {
 	return (u32)(seq_num[2] | seq_num[1] << 8 | seq_num[0] << 16);
 }
 
 static inline
-void drm_hdcp2_u32_to_seq_num(u8 seq_num[HDCP_2_2_SEQ_NUM_LEN], u32 val)
+void drm_hdcp_cpu_to_be24(u8 seq_num[HDCP_2_2_SEQ_NUM_LEN], u32 val)
 {
 	seq_num[0] = val >> 16;
 	seq_num[1] = val >> 8;
-- 
cgit v1.2.3


From 6498bf5800a302ef69e7f4914e727893f278bb2f Mon Sep 17 00:00:00 2001
From: Ramalingam C <ramalingam.c@intel.com>
Date: Tue, 7 May 2019 21:57:38 +0530
Subject: drm: revocation check at drm subsystem

On every hdcp revocation check request SRM is read from fw file
/lib/firmware/display_hdcp_srm.bin

SRM table is parsed and stored at drm_hdcp.c, with functions exported
for the services for revocation check from drivers (which
implements the HDCP authentication)

This patch handles the HDCP1.4 and 2.2 versions of SRM table.

v2:
  moved the uAPI to request_firmware_direct() [Daniel]
v3:
  kdoc added. [Daniel]
  srm_header unified and bit field definitions are removed. [Daniel]
  locking improved. [Daniel]
  vrl length violation is fixed. [Daniel]
v4:
  s/__swab16/be16_to_cpu [Daniel]
  be24_to_cpu is done through a global func [Daniel]
  Unused variables are removed. [Daniel]
  unchecked return values are dropped from static funcs [Daniel]

Signed-off-by: Ramalingam C <ramalingam.c@intel.com>
Acked-by: Satyeshwar Singh <satyeshwar.singh@intel.com>
Reviewed-by: Daniel Vetter <daniel@ffwll.ch>
Acked-by: Dave Airlie <airlied@gmail.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20190507162745.25600-5-ramalingam.c@intel.com
---
 Documentation/gpu/drm-kms-helpers.rst |   6 +
 drivers/gpu/drm/Makefile              |   2 +-
 drivers/gpu/drm/drm_hdcp.c            | 333 ++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/drm_internal.h        |   4 +
 drivers/gpu/drm/drm_sysfs.c           |   2 +
 include/drm/drm_hdcp.h                |  24 +++
 6 files changed, 370 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/drm_hdcp.c

(limited to 'include')

diff --git a/Documentation/gpu/drm-kms-helpers.rst b/Documentation/gpu/drm-kms-helpers.rst
index 14102ae035dc..0fe726a6ee67 100644
--- a/Documentation/gpu/drm-kms-helpers.rst
+++ b/Documentation/gpu/drm-kms-helpers.rst
@@ -181,6 +181,12 @@ Panel Helper Reference
 .. kernel-doc:: drivers/gpu/drm/drm_panel_orientation_quirks.c
    :export:
 
+HDCP Helper Functions Reference
+===============================
+
+.. kernel-doc:: drivers/gpu/drm/drm_hdcp.c
+   :export:
+
 Display Port Helper Functions Reference
 =======================================
 
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index 72f5036d9bfa..dd02e9dec810 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -17,7 +17,7 @@ drm-y       :=	drm_auth.o drm_cache.o \
 		drm_plane.o drm_color_mgmt.o drm_print.o \
 		drm_dumb_buffers.o drm_mode_config.o drm_vblank.o \
 		drm_syncobj.o drm_lease.o drm_writeback.o drm_client.o \
-		drm_atomic_uapi.o
+		drm_atomic_uapi.o drm_hdcp.o
 
 drm-$(CONFIG_DRM_LEGACY) += drm_legacy_misc.o drm_bufs.o drm_context.o drm_dma.o drm_scatter.o drm_lock.o
 drm-$(CONFIG_DRM_LIB_RANDOM) += lib/drm_random.o
diff --git a/drivers/gpu/drm/drm_hdcp.c b/drivers/gpu/drm/drm_hdcp.c
new file mode 100644
index 000000000000..5e5409505c31
--- /dev/null
+++ b/drivers/gpu/drm/drm_hdcp.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Intel Corporation.
+ *
+ * Authors:
+ * Ramalingam C <ramalingam.c@intel.com>
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/firmware.h>
+
+#include <drm/drm_hdcp.h>
+#include <drm/drm_sysfs.h>
+#include <drm/drm_print.h>
+#include <drm/drm_device.h>
+
+struct hdcp_srm {
+	u32 revoked_ksv_cnt;
+	u8 *revoked_ksv_list;
+
+	/* Mutex to protect above struct member */
+	struct mutex mutex;
+} *srm_data;
+
+static inline void drm_hdcp_print_ksv(const u8 *ksv)
+{
+	DRM_DEBUG("\t%#02x, %#02x, %#02x, %#02x, %#02x\n",
+		  ksv[0], ksv[1], ksv[2], ksv[3], ksv[4]);
+}
+
+static u32 drm_hdcp_get_revoked_ksv_count(const u8 *buf, u32 vrls_length)
+{
+	u32 parsed_bytes = 0, ksv_count = 0, vrl_ksv_cnt, vrl_sz;
+
+	while (parsed_bytes < vrls_length) {
+		vrl_ksv_cnt = *buf;
+		ksv_count += vrl_ksv_cnt;
+
+		vrl_sz = (vrl_ksv_cnt * DRM_HDCP_KSV_LEN) + 1;
+		buf += vrl_sz;
+		parsed_bytes += vrl_sz;
+	}
+
+	/*
+	 * When vrls are not valid, ksvs are not considered.
+	 * Hence SRM will be discarded.
+	 */
+	if (parsed_bytes != vrls_length)
+		ksv_count = 0;
+
+	return ksv_count;
+}
+
+static u32 drm_hdcp_get_revoked_ksvs(const u8 *buf, u8 *revoked_ksv_list,
+				     u32 vrls_length)
+{
+	u32 parsed_bytes = 0, ksv_count = 0;
+	u32 vrl_ksv_cnt, vrl_ksv_sz, vrl_idx = 0;
+
+	do {
+		vrl_ksv_cnt = *buf;
+		vrl_ksv_sz = vrl_ksv_cnt * DRM_HDCP_KSV_LEN;
+
+		buf++;
+
+		DRM_DEBUG("vrl: %d, Revoked KSVs: %d\n", vrl_idx++,
+			  vrl_ksv_cnt);
+		memcpy(revoked_ksv_list, buf, vrl_ksv_sz);
+
+		ksv_count += vrl_ksv_cnt;
+		revoked_ksv_list += vrl_ksv_sz;
+		buf += vrl_ksv_sz;
+
+		parsed_bytes += (vrl_ksv_sz + 1);
+	} while (parsed_bytes < vrls_length);
+
+	return ksv_count;
+}
+
+static inline u32 get_vrl_length(const u8 *buf)
+{
+	return drm_hdcp_be24_to_cpu(buf);
+}
+
+static int drm_hdcp_parse_hdcp1_srm(const u8 *buf, size_t count)
+{
+	struct hdcp_srm_header *header;
+	u32 vrl_length, ksv_count;
+
+	if (count < (sizeof(struct hdcp_srm_header) +
+	    DRM_HDCP_1_4_VRL_LENGTH_SIZE + DRM_HDCP_1_4_DCP_SIG_SIZE)) {
+		DRM_ERROR("Invalid blob length\n");
+		return -EINVAL;
+	}
+
+	header = (struct hdcp_srm_header *)buf;
+	DRM_DEBUG("SRM ID: 0x%x, SRM Ver: 0x%x, SRM Gen No: 0x%x\n",
+		  header->srm_id,
+		  be16_to_cpu(header->srm_version), header->srm_gen_no);
+
+	WARN_ON(header->reserved);
+
+	buf = buf + sizeof(*header);
+	vrl_length = get_vrl_length(buf);
+	if (count < (sizeof(struct hdcp_srm_header) + vrl_length) ||
+	    vrl_length < (DRM_HDCP_1_4_VRL_LENGTH_SIZE +
+			  DRM_HDCP_1_4_DCP_SIG_SIZE)) {
+		DRM_ERROR("Invalid blob length or vrl length\n");
+		return -EINVAL;
+	}
+
+	/* Length of the all vrls combined */
+	vrl_length -= (DRM_HDCP_1_4_VRL_LENGTH_SIZE +
+		       DRM_HDCP_1_4_DCP_SIG_SIZE);
+
+	if (!vrl_length) {
+		DRM_ERROR("No vrl found\n");
+		return -EINVAL;
+	}
+
+	buf += DRM_HDCP_1_4_VRL_LENGTH_SIZE;
+	ksv_count = drm_hdcp_get_revoked_ksv_count(buf, vrl_length);
+	if (!ksv_count) {
+		DRM_DEBUG("Revoked KSV count is 0\n");
+		return count;
+	}
+
+	kfree(srm_data->revoked_ksv_list);
+	srm_data->revoked_ksv_list = kcalloc(ksv_count, DRM_HDCP_KSV_LEN,
+					     GFP_KERNEL);
+	if (!srm_data->revoked_ksv_list) {
+		DRM_ERROR("Out of Memory\n");
+		return -ENOMEM;
+	}
+
+	if (drm_hdcp_get_revoked_ksvs(buf, srm_data->revoked_ksv_list,
+				      vrl_length) != ksv_count) {
+		srm_data->revoked_ksv_cnt = 0;
+		kfree(srm_data->revoked_ksv_list);
+		return -EINVAL;
+	}
+
+	srm_data->revoked_ksv_cnt = ksv_count;
+	return count;
+}
+
+static int drm_hdcp_parse_hdcp2_srm(const u8 *buf, size_t count)
+{
+	struct hdcp_srm_header *header;
+	u32 vrl_length, ksv_count, ksv_sz;
+
+	if (count < (sizeof(struct hdcp_srm_header) +
+	    DRM_HDCP_2_VRL_LENGTH_SIZE + DRM_HDCP_2_DCP_SIG_SIZE)) {
+		DRM_ERROR("Invalid blob length\n");
+		return -EINVAL;
+	}
+
+	header = (struct hdcp_srm_header *)buf;
+	DRM_DEBUG("SRM ID: 0x%x, SRM Ver: 0x%x, SRM Gen No: 0x%x\n",
+		  header->srm_id & DRM_HDCP_SRM_ID_MASK,
+		  be16_to_cpu(header->srm_version), header->srm_gen_no);
+
+	if (header->reserved)
+		return -EINVAL;
+
+	buf = buf + sizeof(*header);
+	vrl_length = get_vrl_length(buf);
+
+	if (count < (sizeof(struct hdcp_srm_header) + vrl_length) ||
+	    vrl_length < (DRM_HDCP_2_VRL_LENGTH_SIZE +
+	    DRM_HDCP_2_DCP_SIG_SIZE)) {
+		DRM_ERROR("Invalid blob length or vrl length\n");
+		return -EINVAL;
+	}
+
+	/* Length of the all vrls combined */
+	vrl_length -= (DRM_HDCP_2_VRL_LENGTH_SIZE +
+		       DRM_HDCP_2_DCP_SIG_SIZE);
+
+	if (!vrl_length) {
+		DRM_ERROR("No vrl found\n");
+		return -EINVAL;
+	}
+
+	buf += DRM_HDCP_2_VRL_LENGTH_SIZE;
+	ksv_count = (*buf << 2) | DRM_HDCP_2_KSV_COUNT_2_LSBITS(*(buf + 1));
+	if (!ksv_count) {
+		DRM_DEBUG("Revoked KSV count is 0\n");
+		return count;
+	}
+
+	kfree(srm_data->revoked_ksv_list);
+	srm_data->revoked_ksv_list = kcalloc(ksv_count, DRM_HDCP_KSV_LEN,
+					     GFP_KERNEL);
+	if (!srm_data->revoked_ksv_list) {
+		DRM_ERROR("Out of Memory\n");
+		return -ENOMEM;
+	}
+
+	ksv_sz = ksv_count * DRM_HDCP_KSV_LEN;
+	buf += DRM_HDCP_2_NO_OF_DEV_PLUS_RESERVED_SZ;
+
+	DRM_DEBUG("Revoked KSVs: %d\n", ksv_count);
+	memcpy(srm_data->revoked_ksv_list, buf, ksv_sz);
+
+	srm_data->revoked_ksv_cnt = ksv_count;
+	return count;
+}
+
+static inline bool is_srm_version_hdcp1(const u8 *buf)
+{
+	return *buf == (u8)(DRM_HDCP_1_4_SRM_ID << 4);
+}
+
+static inline bool is_srm_version_hdcp2(const u8 *buf)
+{
+	return *buf == (u8)(DRM_HDCP_2_SRM_ID << 4 | DRM_HDCP_2_INDICATOR);
+}
+
+static void drm_hdcp_srm_update(const u8 *buf, size_t count)
+{
+	if (count < sizeof(struct hdcp_srm_header))
+		return;
+
+	if (is_srm_version_hdcp1(buf))
+		drm_hdcp_parse_hdcp1_srm(buf, count);
+	else if (is_srm_version_hdcp2(buf))
+		drm_hdcp_parse_hdcp2_srm(buf, count);
+}
+
+void drm_hdcp_request_srm(struct drm_device *drm_dev)
+{
+	char fw_name[36] = "display_hdcp_srm.bin";
+	const struct firmware *fw;
+
+	int ret;
+
+	ret = request_firmware_direct(&fw, (const char *)fw_name,
+				      drm_dev->dev);
+	if (ret < 0)
+		goto exit;
+
+	if (fw->size && fw->data)
+		drm_hdcp_srm_update(fw->data, fw->size);
+
+exit:
+	release_firmware(fw);
+}
+
+/**
+ * drm_hdcp_check_ksvs_revoked - Check the revoked status of the IDs
+ *
+ * @drm_dev: drm_device for which HDCP revocation check is requested
+ * @ksvs: List of KSVs (HDCP receiver IDs)
+ * @ksv_count: KSV count passed in through @ksvs
+ *
+ * This function reads the HDCP System renewability Message(SRM Table)
+ * from userspace as a firmware and parses it for the revoked HDCP
+ * KSVs(Receiver IDs) detected by DCP LLC. Once the revoked KSVs are known,
+ * revoked state of the KSVs in the list passed in by display drivers are
+ * decided and response is sent.
+ *
+ * SRM should be presented in the name of "display_hdcp_srm.bin".
+ *
+ * Returns:
+ * TRUE on any of the KSV is revoked, else FALSE.
+ */
+bool drm_hdcp_check_ksvs_revoked(struct drm_device *drm_dev, u8 *ksvs,
+				 u32 ksv_count)
+{
+	u32 rev_ksv_cnt, cnt, i, j;
+	u8 *rev_ksv_list;
+
+	if (!srm_data)
+		return false;
+
+	mutex_lock(&srm_data->mutex);
+	drm_hdcp_request_srm(drm_dev);
+
+	rev_ksv_cnt = srm_data->revoked_ksv_cnt;
+	rev_ksv_list = srm_data->revoked_ksv_list;
+
+	/* If the Revoked ksv list is empty */
+	if (!rev_ksv_cnt || !rev_ksv_list) {
+		mutex_unlock(&srm_data->mutex);
+		return false;
+	}
+
+	for  (cnt = 0; cnt < ksv_count; cnt++) {
+		rev_ksv_list = srm_data->revoked_ksv_list;
+		for (i = 0; i < rev_ksv_cnt; i++) {
+			for (j = 0; j < DRM_HDCP_KSV_LEN; j++)
+				if (ksvs[j] != rev_ksv_list[j]) {
+					break;
+				} else if (j == (DRM_HDCP_KSV_LEN - 1)) {
+					DRM_DEBUG("Revoked KSV is ");
+					drm_hdcp_print_ksv(ksvs);
+					mutex_unlock(&srm_data->mutex);
+					return true;
+				}
+			/* Move the offset to next KSV in the revoked list */
+			rev_ksv_list += DRM_HDCP_KSV_LEN;
+		}
+
+		/* Iterate to next ksv_offset */
+		ksvs += DRM_HDCP_KSV_LEN;
+	}
+	mutex_unlock(&srm_data->mutex);
+	return false;
+}
+EXPORT_SYMBOL_GPL(drm_hdcp_check_ksvs_revoked);
+
+int drm_setup_hdcp_srm(struct class *drm_class)
+{
+	srm_data = kzalloc(sizeof(*srm_data), GFP_KERNEL);
+	if (!srm_data)
+		return -ENOMEM;
+	mutex_init(&srm_data->mutex);
+
+	return 0;
+}
+
+void drm_teardown_hdcp_srm(struct class *drm_class)
+{
+	if (srm_data) {
+		kfree(srm_data->revoked_ksv_list);
+		kfree(srm_data);
+	}
+}
diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h
index e19ac7ca602d..476a422414f6 100644
--- a/drivers/gpu/drm/drm_internal.h
+++ b/drivers/gpu/drm/drm_internal.h
@@ -201,3 +201,7 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
 void drm_framebuffer_print_info(struct drm_printer *p, unsigned int indent,
 				const struct drm_framebuffer *fb);
 int drm_framebuffer_debugfs_init(struct drm_minor *minor);
+
+/* drm_hdcp.c */
+int drm_setup_hdcp_srm(struct class *drm_class);
+void drm_teardown_hdcp_srm(struct class *drm_class);
diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index ecb7b33002bb..18b1ac442997 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -78,6 +78,7 @@ int drm_sysfs_init(void)
 	}
 
 	drm_class->devnode = drm_devnode;
+	drm_setup_hdcp_srm(drm_class);
 	return 0;
 }
 
@@ -90,6 +91,7 @@ void drm_sysfs_destroy(void)
 {
 	if (IS_ERR_OR_NULL(drm_class))
 		return;
+	drm_teardown_hdcp_srm(drm_class);
 	class_remove_file(drm_class, &class_attr_version.attr);
 	class_destroy(drm_class);
 	drm_class = NULL;
diff --git a/include/drm/drm_hdcp.h b/include/drm/drm_hdcp.h
index 1cc66df05a43..2f0335d0a50f 100644
--- a/include/drm/drm_hdcp.h
+++ b/include/drm/drm_hdcp.h
@@ -265,4 +265,28 @@ void drm_hdcp_cpu_to_be24(u8 seq_num[HDCP_2_2_SEQ_NUM_LEN], u32 val)
 	seq_num[2] = val;
 }
 
+#define DRM_HDCP_SRM_GEN1_MAX_BYTES		(5 * 1024)
+#define DRM_HDCP_1_4_SRM_ID			0x8
+#define DRM_HDCP_SRM_ID_MASK			(0xF << 4)
+#define DRM_HDCP_1_4_VRL_LENGTH_SIZE		3
+#define DRM_HDCP_1_4_DCP_SIG_SIZE		40
+#define DRM_HDCP_2_SRM_ID			0x9
+#define DRM_HDCP_2_INDICATOR			0x1
+#define DRM_HDCP_2_INDICATOR_MASK		0xF
+#define DRM_HDCP_2_VRL_LENGTH_SIZE		3
+#define DRM_HDCP_2_DCP_SIG_SIZE			384
+#define DRM_HDCP_2_NO_OF_DEV_PLUS_RESERVED_SZ	4
+#define DRM_HDCP_2_KSV_COUNT_2_LSBITS(byte)	(((byte) & 0xC) >> 6)
+
+struct hdcp_srm_header {
+	u8 srm_id;
+	u8 reserved;
+	__be16 srm_version;
+	u8 srm_gen_no;
+} __packed;
+
+struct drm_device;
+
+bool drm_hdcp_check_ksvs_revoked(struct drm_device *dev,
+				 u8 *ksvs, u32 ksv_count);
 #endif
-- 
cgit v1.2.3


From c16fd9be70faf3c49a61700efd16018dd910e390 Mon Sep 17 00:00:00 2001
From: Ramalingam C <ramalingam.c@intel.com>
Date: Tue, 7 May 2019 21:57:40 +0530
Subject: drm/hdcp: gathering hdcp related code into drm_hdcp.c

Considering the significant size of hdcp related code in drm, all
hdcp related codes are moved into separate file called drm_hdcp.c.

v2:
  Rebased.
v2:
  Rebased.

Signed-off-by: Ramalingam C <ramalingam.c@intel.com>
Suggested-by: Daniel Vetter <daniel@ffwll.ch>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Acked-by: Dave Airlie <airlied@gmail.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20190507162745.25600-7-ramalingam.c@intel.com
---
 drivers/gpu/drm/drm_connector.c | 44 --------------------------------------
 drivers/gpu/drm/drm_hdcp.c      | 47 +++++++++++++++++++++++++++++++++++++++++
 include/drm/drm_connector.h     |  2 --
 include/drm/drm_hdcp.h          |  3 +++
 4 files changed, 50 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index 7c0eda9cca60..764c7903edf6 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -823,13 +823,6 @@ static const struct drm_prop_enum_list drm_tv_subconnector_enum_list[] = {
 DRM_ENUM_NAME_FN(drm_get_tv_subconnector_name,
 		 drm_tv_subconnector_enum_list)
 
-static struct drm_prop_enum_list drm_cp_enum_list[] = {
-	{ DRM_MODE_CONTENT_PROTECTION_UNDESIRED, "Undesired" },
-	{ DRM_MODE_CONTENT_PROTECTION_DESIRED, "Desired" },
-	{ DRM_MODE_CONTENT_PROTECTION_ENABLED, "Enabled" },
-};
-DRM_ENUM_NAME_FN(drm_get_content_protection_name, drm_cp_enum_list)
-
 static const struct drm_prop_enum_list hdmi_colorspaces[] = {
 	/* For Default case, driver will set the colorspace */
 	{ DRM_MODE_COLORIMETRY_DEFAULT, "Default" },
@@ -1515,43 +1508,6 @@ int drm_connector_attach_scaling_mode_property(struct drm_connector *connector,
 }
 EXPORT_SYMBOL(drm_connector_attach_scaling_mode_property);
 
-/**
- * drm_connector_attach_content_protection_property - attach content protection
- * property
- *
- * @connector: connector to attach CP property on.
- *
- * This is used to add support for content protection on select connectors.
- * Content Protection is intentionally vague to allow for different underlying
- * technologies, however it is most implemented by HDCP.
- *
- * The content protection will be set to &drm_connector_state.content_protection
- *
- * Returns:
- * Zero on success, negative errno on failure.
- */
-int drm_connector_attach_content_protection_property(
-		struct drm_connector *connector)
-{
-	struct drm_device *dev = connector->dev;
-	struct drm_property *prop =
-			dev->mode_config.content_protection_property;
-
-	if (!prop)
-		prop = drm_property_create_enum(dev, 0, "Content Protection",
-						drm_cp_enum_list,
-						ARRAY_SIZE(drm_cp_enum_list));
-	if (!prop)
-		return -ENOMEM;
-
-	drm_object_attach_property(&connector->base, prop,
-				   DRM_MODE_CONTENT_PROTECTION_UNDESIRED);
-	dev->mode_config.content_protection_property = prop;
-
-	return 0;
-}
-EXPORT_SYMBOL(drm_connector_attach_content_protection_property);
-
 /**
  * drm_mode_create_aspect_ratio_property - create aspect ratio property
  * @dev: DRM device
diff --git a/drivers/gpu/drm/drm_hdcp.c b/drivers/gpu/drm/drm_hdcp.c
index 5e5409505c31..0da7b3718bad 100644
--- a/drivers/gpu/drm/drm_hdcp.c
+++ b/drivers/gpu/drm/drm_hdcp.c
@@ -17,6 +17,9 @@
 #include <drm/drm_sysfs.h>
 #include <drm/drm_print.h>
 #include <drm/drm_device.h>
+#include <drm/drm_property.h>
+#include <drm/drm_mode_object.h>
+#include <drm/drm_connector.h>
 
 struct hdcp_srm {
 	u32 revoked_ksv_cnt;
@@ -331,3 +334,47 @@ void drm_teardown_hdcp_srm(struct class *drm_class)
 		kfree(srm_data);
 	}
 }
+
+static struct drm_prop_enum_list drm_cp_enum_list[] = {
+	{ DRM_MODE_CONTENT_PROTECTION_UNDESIRED, "Undesired" },
+	{ DRM_MODE_CONTENT_PROTECTION_DESIRED, "Desired" },
+	{ DRM_MODE_CONTENT_PROTECTION_ENABLED, "Enabled" },
+};
+DRM_ENUM_NAME_FN(drm_get_content_protection_name, drm_cp_enum_list)
+
+/**
+ * drm_connector_attach_content_protection_property - attach content protection
+ * property
+ *
+ * @connector: connector to attach CP property on.
+ *
+ * This is used to add support for content protection on select connectors.
+ * Content Protection is intentionally vague to allow for different underlying
+ * technologies, however it is most implemented by HDCP.
+ *
+ * The content protection will be set to &drm_connector_state.content_protection
+ *
+ * Returns:
+ * Zero on success, negative errno on failure.
+ */
+int drm_connector_attach_content_protection_property(
+		struct drm_connector *connector)
+{
+	struct drm_device *dev = connector->dev;
+	struct drm_property *prop =
+			dev->mode_config.content_protection_property;
+
+	if (!prop)
+		prop = drm_property_create_enum(dev, 0, "Content Protection",
+						drm_cp_enum_list,
+						ARRAY_SIZE(drm_cp_enum_list));
+	if (!prop)
+		return -ENOMEM;
+
+	drm_object_attach_property(&connector->base, prop,
+				   DRM_MODE_CONTENT_PROTECTION_UNDESIRED);
+	dev->mode_config.content_protection_property = prop;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_connector_attach_content_protection_property);
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index 5e41942e5679..da9e1f087fe4 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -1339,8 +1339,6 @@ int drm_connector_attach_scaling_mode_property(struct drm_connector *connector,
 					       u32 scaling_mode_mask);
 int drm_connector_attach_vrr_capable_property(
 		struct drm_connector *connector);
-int drm_connector_attach_content_protection_property(
-		struct drm_connector *connector);
 int drm_mode_create_aspect_ratio_property(struct drm_device *dev);
 int drm_mode_create_colorspace_property(struct drm_connector *connector);
 int drm_mode_create_content_type_property(struct drm_device *dev);
diff --git a/include/drm/drm_hdcp.h b/include/drm/drm_hdcp.h
index 2f0335d0a50f..13771a496e2b 100644
--- a/include/drm/drm_hdcp.h
+++ b/include/drm/drm_hdcp.h
@@ -286,7 +286,10 @@ struct hdcp_srm_header {
 } __packed;
 
 struct drm_device;
+struct drm_connector;
 
 bool drm_hdcp_check_ksvs_revoked(struct drm_device *dev,
 				 u8 *ksvs, u32 ksv_count);
+int drm_connector_attach_content_protection_property(
+		struct drm_connector *connector);
 #endif
-- 
cgit v1.2.3


From 09ed79d6d75f06cc963a78f25463251b0a758dc7 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Tue, 7 May 2019 10:01:47 -0700
Subject: percpu_ref: introduce PERCPU_REF_ALLOW_REINIT flag

In most cases percpu reference counters are not switched to the
percpu mode after they reach the atomic mode. Some obvious exceptions
are reference counters which are initialized into the atomic
mode (using PERCPU_REF_INIT_ATOMIC and PERCPU_REF_INIT_DEAD flags),
and there are few other exceptions.

But in most cases there is no way back, and once the reference counter
is switched to the atomic mode, there is no reason to wait for
percpu_ref_exit() to release the percpu memory. Of course, the size
of a single counter is not so big, but because it can pin the whole
percpu block in memory, the memory footprint can be noticeable
(e.g. on my 32 CPUs machine a percpu block is 8Mb large).

To make releasing of the percpu memory as early as possible, let's
introduce the PERCPU_REF_ALLOW_REINIT flag with the following semantics:
it has to be set in order to switch a percpu reference counter to the
percpu mode after the initialization. PERCPU_REF_INIT_ATOMIC and
PERCPU_REF_INIT_DEAD flags will implicitly assume PERCPU_REF_ALLOW_REINIT.

This patch doesn't introduce any functional change to avoid any
regressions. It will be done later in the patchset after adjusting
all call sites, which are reviving percpu counters.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
---
 include/linux/percpu-refcount.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index b297cd1cd4f1..0f0240af8520 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -75,14 +75,21 @@ enum {
 	 * operation using percpu_ref_switch_to_percpu().  If initialized
 	 * with this flag, the ref will stay in atomic mode until
 	 * percpu_ref_switch_to_percpu() is invoked on it.
+	 * Implies ALLOW_REINIT.
 	 */
 	PERCPU_REF_INIT_ATOMIC	= 1 << 0,
 
 	/*
 	 * Start dead w/ ref == 0 in atomic mode.  Must be revived with
-	 * percpu_ref_reinit() before used.  Implies INIT_ATOMIC.
+	 * percpu_ref_reinit() before used.  Implies INIT_ATOMIC and
+	 * ALLOW_REINIT.
 	 */
 	PERCPU_REF_INIT_DEAD	= 1 << 1,
+
+	/*
+	 * Allow switching from atomic mode to percpu mode.
+	 */
+	PERCPU_REF_ALLOW_REINIT	= 1 << 2,
 };
 
 struct percpu_ref {
-- 
cgit v1.2.3


From 7d9ab9b6adffd9c474c1274acb5f6208f9a09cf3 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Tue, 7 May 2019 10:01:50 -0700
Subject: percpu_ref: release percpu memory early without
 PERCPU_REF_ALLOW_REINIT

Release percpu memory after finishing the switch to the atomic mode
if only PERCPU_REF_ALLOW_REINIT isn't set.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
---
 include/linux/percpu-refcount.h |  1 +
 lib/percpu-refcount.c           | 13 +++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 0f0240af8520..7aef0abc194a 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -102,6 +102,7 @@ struct percpu_ref {
 	percpu_ref_func_t	*release;
 	percpu_ref_func_t	*confirm_switch;
 	bool			force_atomic:1;
+	bool			allow_reinit:1;
 	struct rcu_head		rcu;
 };
 
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 9877682e49c7..501b517bd3db 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -69,11 +69,14 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
 		return -ENOMEM;
 
 	ref->force_atomic = flags & PERCPU_REF_INIT_ATOMIC;
+	ref->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT;
 
-	if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD))
+	if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) {
 		ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
-	else
+		ref->allow_reinit = true;
+	} else {
 		start_count += PERCPU_COUNT_BIAS;
+	}
 
 	if (flags & PERCPU_REF_INIT_DEAD)
 		ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
@@ -119,6 +122,9 @@ static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
 	ref->confirm_switch = NULL;
 	wake_up_all(&percpu_ref_switch_waitq);
 
+	if (!ref->allow_reinit)
+		percpu_ref_exit(ref);
+
 	/* drop ref from percpu_ref_switch_to_atomic() */
 	percpu_ref_put(ref);
 }
@@ -194,6 +200,9 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 	if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
 		return;
 
+	if (WARN_ON_ONCE(!ref->allow_reinit))
+		return;
+
 	atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
 
 	/*
-- 
cgit v1.2.3


From 1aa3750885fbcece5a0c9e6bbcd014ac526cea41 Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Fri, 10 May 2019 17:02:55 +0300
Subject: drm/i915/icl: More workaround for port F detection due to broken VBTs

Add another ICL-Y PCIID that proved to have only 5 ports to the
corresponding PCIID list.

Meanwhile I'm trying to get a complete list of all PCIIDs with less than
6 ports and/or get a VBT fix to mark these ports non-existent, but until
then the only way is to go one-by-one.

This fixes the following error on machines with less than 6 port:

	[drm:intel_power_well_enable [i915]] enabling AUX F
	------------[ cut here ]------------
	WARN_ON(intel_wait_for_register(&dev_priv->uncore, regs->driver, (0x1 << ((pw_idx) * 2)), (0x1 << ((pw_idx) * 2)), 1))

(Internal reference: BSpec/Index/20584/Issues, HSD/1306084116)

Cc: Mika Kahola <mika.kahola@intel.com>
References: https://bugs.freedesktop.org/show_bug.cgi?id=108915
Signed-off-by: Imre Deak <imre.deak@intel.com>
Reviewed-by: Mika Kahola <mika.kahola@intel.com>
Tested-by: Mika Kahola <mika.kahola@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190510140255.25215-1-imre.deak@intel.com
---
 include/drm/i915_pciids.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h
index 6477da22af28..6d60ea68c171 100644
--- a/include/drm/i915_pciids.h
+++ b/include/drm/i915_pciids.h
@@ -559,7 +559,6 @@
 #define INTEL_ICL_PORT_F_IDS(info) \
 	INTEL_VGA_DEVICE(0x8A50, info), \
 	INTEL_VGA_DEVICE(0x8A5C, info), \
-	INTEL_VGA_DEVICE(0x8A5D, info), \
 	INTEL_VGA_DEVICE(0x8A59, info),	\
 	INTEL_VGA_DEVICE(0x8A58, info),	\
 	INTEL_VGA_DEVICE(0x8A52, info), \
@@ -573,7 +572,8 @@
 
 #define INTEL_ICL_11_IDS(info) \
 	INTEL_ICL_PORT_F_IDS(info), \
-	INTEL_VGA_DEVICE(0x8A51, info)
+	INTEL_VGA_DEVICE(0x8A51, info), \
+	INTEL_VGA_DEVICE(0x8A5D, info)
 
 /* EHL */
 #define INTEL_EHL_IDS(info) \
-- 
cgit v1.2.3


From bc93475587850c097bfbc08f2ffa81171e0a1602 Mon Sep 17 00:00:00 2001
From: Noralf Trønnes <noralf@tronnes.org>
Date: Mon, 6 May 2019 20:01:31 +0200
Subject: drm/fb-helper: No need to cache rotation and sw_rotations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Getting rotation info is cheap so we can do it on demand.

This is done in preparation for the removal of struct drm_fb_helper_crtc.

Cc: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190506180139.6913-4-noralf@tronnes.org
---
 drivers/gpu/drm/drm_fb_helper.c | 131 ++++++++++++++++++++--------------------
 include/drm/drm_fb_helper.h     |   8 ---
 2 files changed, 65 insertions(+), 74 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index c521e7490380..1bfbd16c23de 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -388,6 +388,49 @@ int drm_fb_helper_debug_leave(struct fb_info *info)
 }
 EXPORT_SYMBOL(drm_fb_helper_debug_leave);
 
+/* Check if the plane can hw rotate to match panel orientation */
+static bool drm_fb_helper_panel_rotation(struct drm_mode_set *modeset,
+					 unsigned int *rotation)
+{
+	struct drm_connector *connector = modeset->connectors[0];
+	struct drm_plane *plane = modeset->crtc->primary;
+	u64 valid_mask = 0;
+	unsigned int i;
+
+	if (!modeset->num_connectors)
+		return false;
+
+	switch (connector->display_info.panel_orientation) {
+	case DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP:
+		*rotation = DRM_MODE_ROTATE_180;
+		break;
+	case DRM_MODE_PANEL_ORIENTATION_LEFT_UP:
+		*rotation = DRM_MODE_ROTATE_90;
+		break;
+	case DRM_MODE_PANEL_ORIENTATION_RIGHT_UP:
+		*rotation = DRM_MODE_ROTATE_270;
+		break;
+	default:
+		*rotation = DRM_MODE_ROTATE_0;
+	}
+
+	/*
+	 * TODO: support 90 / 270 degree hardware rotation,
+	 * depending on the hardware this may require the framebuffer
+	 * to be in a specific tiling format.
+	 */
+	if (*rotation != DRM_MODE_ROTATE_180 || !plane->rotation_property)
+		return false;
+
+	for (i = 0; i < plane->rotation_property->num_values; i++)
+		valid_mask |= (1ULL << plane->rotation_property->values[i]);
+
+	if (!(*rotation & valid_mask))
+		return false;
+
+	return true;
+}
+
 static int restore_fbdev_mode_atomic(struct drm_fb_helper *fb_helper, bool active)
 {
 	struct drm_device *dev = fb_helper->dev;
@@ -428,10 +471,13 @@ retry:
 	for (i = 0; i < fb_helper->crtc_count; i++) {
 		struct drm_mode_set *mode_set = &fb_helper->crtc_info[i].mode_set;
 		struct drm_plane *primary = mode_set->crtc->primary;
+		unsigned int rotation;
 
-		/* Cannot fail as we've already gotten the plane state above */
-		plane_state = drm_atomic_get_new_plane_state(state, primary);
-		plane_state->rotation = fb_helper->crtc_info[i].rotation;
+		if (drm_fb_helper_panel_rotation(mode_set, &rotation)) {
+			/* Cannot fail as we've already gotten the plane state above */
+			plane_state = drm_atomic_get_new_plane_state(state, primary);
+			plane_state->rotation = rotation;
+		}
 
 		ret = __drm_atomic_helper_set_config(mode_set, state);
 		if (ret != 0)
@@ -881,7 +927,6 @@ int drm_fb_helper_init(struct drm_device *dev,
 		if (!fb_helper->crtc_info[i].mode_set.connectors)
 			goto out_free;
 		fb_helper->crtc_info[i].mode_set.num_connectors = 0;
-		fb_helper->crtc_info[i].rotation = DRM_MODE_ROTATE_0;
 	}
 
 	i = 0;
@@ -2510,62 +2555,6 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
 	return best_score;
 }
 
-/*
- * This function checks if rotation is necessary because of panel orientation
- * and if it is, if it is supported.
- * If rotation is necessary and supported, it gets set in fb_crtc.rotation.
- * If rotation is necessary but not supported, a DRM_MODE_ROTATE_* flag gets
- * or-ed into fb_helper->sw_rotations. In drm_setup_crtcs_fb() we check if only
- * one bit is set and then we set fb_info.fbcon_rotate_hint to make fbcon do
- * the unsupported rotation.
- */
-static void drm_setup_crtc_rotation(struct drm_fb_helper *fb_helper,
-				    struct drm_fb_helper_crtc *fb_crtc,
-				    struct drm_connector *connector)
-{
-	struct drm_plane *plane = fb_crtc->mode_set.crtc->primary;
-	uint64_t valid_mask = 0;
-	int i, rotation;
-
-	fb_crtc->rotation = DRM_MODE_ROTATE_0;
-
-	switch (connector->display_info.panel_orientation) {
-	case DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP:
-		rotation = DRM_MODE_ROTATE_180;
-		break;
-	case DRM_MODE_PANEL_ORIENTATION_LEFT_UP:
-		rotation = DRM_MODE_ROTATE_90;
-		break;
-	case DRM_MODE_PANEL_ORIENTATION_RIGHT_UP:
-		rotation = DRM_MODE_ROTATE_270;
-		break;
-	default:
-		rotation = DRM_MODE_ROTATE_0;
-	}
-
-	/*
-	 * TODO: support 90 / 270 degree hardware rotation,
-	 * depending on the hardware this may require the framebuffer
-	 * to be in a specific tiling format.
-	 */
-	if (rotation != DRM_MODE_ROTATE_180 || !plane->rotation_property) {
-		fb_helper->sw_rotations |= rotation;
-		return;
-	}
-
-	for (i = 0; i < plane->rotation_property->num_values; i++)
-		valid_mask |= (1ULL << plane->rotation_property->values[i]);
-
-	if (!(rotation & valid_mask)) {
-		fb_helper->sw_rotations |= rotation;
-		return;
-	}
-
-	fb_crtc->rotation = rotation;
-	/* Rotating in hardware, fbcon should not rotate */
-	fb_helper->sw_rotations |= DRM_MODE_ROTATE_0;
-}
-
 static struct drm_fb_helper_crtc *
 drm_fb_helper_crtc(struct drm_fb_helper *fb_helper, struct drm_crtc *crtc)
 {
@@ -2812,7 +2801,6 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 		drm_fb_helper_modeset_release(fb_helper,
 					      &fb_helper->crtc_info[i].mode_set);
 
-	fb_helper->sw_rotations = 0;
 	drm_fb_helper_for_each_connector(fb_helper, i) {
 		struct drm_display_mode *mode = modes[i];
 		struct drm_fb_helper_crtc *fb_crtc = crtcs[i];
@@ -2832,7 +2820,6 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 			modeset->mode = drm_mode_duplicate(dev,
 							   fb_crtc->desired_mode);
 			drm_connector_get(connector);
-			drm_setup_crtc_rotation(fb_helper, fb_crtc, connector);
 			modeset->connectors[modeset->num_connectors++] = connector;
 			modeset->x = offset->x;
 			modeset->y = offset->y;
@@ -2855,11 +2842,23 @@ out:
 static void drm_setup_crtcs_fb(struct drm_fb_helper *fb_helper)
 {
 	struct fb_info *info = fb_helper->fbdev;
+	unsigned int rotation, sw_rotations = 0;
 	int i;
 
-	for (i = 0; i < fb_helper->crtc_count; i++)
-		if (fb_helper->crtc_info[i].mode_set.num_connectors)
-			fb_helper->crtc_info[i].mode_set.fb = fb_helper->fb;
+	for (i = 0; i < fb_helper->crtc_count; i++) {
+		struct drm_mode_set *modeset = &fb_helper->crtc_info[i].mode_set;
+
+		if (!modeset->num_connectors)
+			continue;
+
+		modeset->fb = fb_helper->fb;
+
+		if (drm_fb_helper_panel_rotation(modeset, &rotation))
+			/* Rotating in hardware, fbcon should not rotate */
+			sw_rotations |= DRM_MODE_ROTATE_0;
+		else
+			sw_rotations |= rotation;
+	}
 
 	mutex_lock(&fb_helper->dev->mode_config.mutex);
 	drm_fb_helper_for_each_connector(fb_helper, i) {
@@ -2875,7 +2874,7 @@ static void drm_setup_crtcs_fb(struct drm_fb_helper *fb_helper)
 	}
 	mutex_unlock(&fb_helper->dev->mode_config.mutex);
 
-	switch (fb_helper->sw_rotations) {
+	switch (sw_rotations) {
 	case DRM_MODE_ROTATE_0:
 		info->fbcon_rotate_hint = FB_ROTATE_UR;
 		break;
diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h
index 40af2866c26a..b0a14aef2e39 100644
--- a/include/drm/drm_fb_helper.h
+++ b/include/drm/drm_fb_helper.h
@@ -51,7 +51,6 @@ struct drm_fb_helper_crtc {
 	struct drm_mode_set mode_set;
 	struct drm_display_mode *desired_mode;
 	int x, y;
-	int rotation;
 };
 
 /**
@@ -151,13 +150,6 @@ struct drm_fb_helper {
 	struct drm_fb_helper_crtc *crtc_info;
 	int connector_count;
 	int connector_info_alloc_count;
-	/**
-	 * @sw_rotations:
-	 * Bitmask of all rotations requested for panel-orientation which
-	 * could not be handled in hardware. If only one bit is set
-	 * fbdev->fbcon_rotate_hint gets set to the requested rotation.
-	 */
-	int sw_rotations;
 	/**
 	 * @connector_info:
 	 *
-- 
cgit v1.2.3


From c8de0d5ff978a581717a1576e676b68bdc86c996 Mon Sep 17 00:00:00 2001
From: Noralf Trønnes <noralf@tronnes.org>
Date: Mon, 6 May 2019 20:01:32 +0200
Subject: drm/fb-helper: Remove drm_fb_helper_crtc->{x, y, desired_mode}
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The values are already present in the modeset.

This is done in preparation for the removal of struct drm_fb_helper_crtc.

Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190506180139.6913-5-noralf@tronnes.org
---
 drivers/gpu/drm/drm_fb_helper.c | 12 ++++--------
 include/drm/drm_fb_helper.h     |  2 --
 2 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 1bfbd16c23de..c259a28522f8 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -2031,16 +2031,16 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper,
 		 */
 		bool lastv = true, lasth = true;
 
-		desired_mode = fb_helper->crtc_info[i].desired_mode;
 		mode_set = &fb_helper->crtc_info[i].mode_set;
+		desired_mode = mode_set->mode;
 
 		if (!desired_mode)
 			continue;
 
 		crtc_count++;
 
-		x = fb_helper->crtc_info[i].x;
-		y = fb_helper->crtc_info[i].y;
+		x = mode_set->x;
+		y = mode_set->y;
 
 		sizes.surface_width  = max_t(u32, desired_mode->hdisplay + x, sizes.surface_width);
 		sizes.surface_height = max_t(u32, desired_mode->vdisplay + y, sizes.surface_height);
@@ -2814,11 +2814,7 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 			DRM_DEBUG_KMS("desired mode %s set on crtc %d (%d,%d)\n",
 				      mode->name, fb_crtc->mode_set.crtc->base.id, offset->x, offset->y);
 
-			fb_crtc->desired_mode = mode;
-			fb_crtc->x = offset->x;
-			fb_crtc->y = offset->y;
-			modeset->mode = drm_mode_duplicate(dev,
-							   fb_crtc->desired_mode);
+			modeset->mode = drm_mode_duplicate(dev, mode);
 			drm_connector_get(connector);
 			modeset->connectors[modeset->num_connectors++] = connector;
 			modeset->x = offset->x;
diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h
index b0a14aef2e39..2af1c6d3e147 100644
--- a/include/drm/drm_fb_helper.h
+++ b/include/drm/drm_fb_helper.h
@@ -49,8 +49,6 @@ struct drm_fb_offset {
 
 struct drm_fb_helper_crtc {
 	struct drm_mode_set mode_set;
-	struct drm_display_mode *desired_mode;
-	int x, y;
 };
 
 /**
-- 
cgit v1.2.3


From 85438a8ddf031f416758ecf8c080ff92075acb01 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 8 May 2019 10:26:11 +0200
Subject: drm: Add |struct drm_gem_vram_object| and helpers

The type |struct drm_gem_vram_object| implements a GEM object for simple
framebuffer devices with dedicated video memory. The BO is either located
in VRAM or system memory.

The implementation has been created from the respective code in ast,
bochs and mgag200. These drivers copy their implementation from each
other; except for the names of several data types. The helpers are
currently build with TTM, but this is considered an implementation
detail and may change in future updates.

v5:
	* do WARN_ON_ONCE for pin-count mismatches
	* allocate only 2 entries in placements array
v4:
	* cleanups from checkpatch.pl
	* removed several fixed-size types from interfaces
	* DRM_VRAM_HELPER now selects DRM_TTM
	* remove separate config option for GEM VRAM
v2:
	* rename to |struct drm_gem_vram_object|
	* move drm_is_gem_ttm() to a later patch in the series
	* add drm_gem_vram_kmap_at()
	* return is_iomem from kmap functions
	* redefine TTM placement flags for public interface
	* documentation fixes

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: http://patchwork.freedesktop.org/patch/msgid/20190508082630.15116-2-tzimmermann@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 Documentation/gpu/drm-mm.rst             |  15 ++
 drivers/gpu/drm/Kconfig                  |   7 +
 drivers/gpu/drm/Makefile                 |   4 +
 drivers/gpu/drm/drm_gem_vram_helper.c    | 412 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/drm_vram_helper_common.c |   6 +
 include/drm/drm_gem_vram_helper.h        |  92 +++++++
 6 files changed, 536 insertions(+)
 create mode 100644 drivers/gpu/drm/drm_gem_vram_helper.c
 create mode 100644 drivers/gpu/drm/drm_vram_helper_common.c
 create mode 100644 include/drm/drm_gem_vram_helper.h

(limited to 'include')

diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
index 54a696d961a7..a3b685089512 100644
--- a/Documentation/gpu/drm-mm.rst
+++ b/Documentation/gpu/drm-mm.rst
@@ -380,6 +380,21 @@ GEM CMA Helper Functions Reference
 .. kernel-doc:: drivers/gpu/drm/drm_gem_cma_helper.c
    :export:
 
+VRAM Helper Function Reference
+==============================
+
+GEM VRAM Helper Functions Reference
+-----------------------------------
+
+.. kernel-doc:: drivers/gpu/drm/drm_gem_vram_helper.c
+   :doc: overview
+
+.. kernel-doc:: include/drm/drm_gem_vram_helper.h
+   :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_gem_vram_helper.c
+   :export:
+
 VMA Offset Manager
 ==================
 
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 2267e84d5cb4..13d7b321db0d 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -160,6 +160,13 @@ config DRM_TTM
 	  GPU memory types. Will be enabled automatically if a device driver
 	  uses it.
 
+config DRM_VRAM_HELPER
+	tristate
+	depends on DRM
+	select DRM_TTM
+	help
+	  Helpers for VRAM memory management
+
 config DRM_GEM_CMA_HELPER
 	bool
 	depends on DRM
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index 72f5036d9bfa..ed49b2480766 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -32,6 +32,10 @@ drm-$(CONFIG_AGP) += drm_agpsupport.o
 drm-$(CONFIG_DEBUG_FS) += drm_debugfs.o drm_debugfs_crc.o
 drm-$(CONFIG_DRM_LOAD_EDID_FIRMWARE) += drm_edid_load.o
 
+drm_vram_helper-y := drm_gem_vram_helper.o \
+		     drm_vram_helper_common.o
+obj-$(CONFIG_DRM_VRAM_HELPER) += drm_vram_helper.o
+
 drm_kms_helper-y := drm_crtc_helper.o drm_dp_helper.o drm_dsc.o drm_probe_helper.o \
 		drm_plane_helper.o drm_dp_mst_topology.o drm_atomic_helper.o \
 		drm_kms_helper_common.o drm_dp_dual_mode_helper.o \
diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
new file mode 100644
index 000000000000..761a546412f8
--- /dev/null
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -0,0 +1,412 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <drm/drm_gem_vram_helper.h>
+#include <drm/ttm/ttm_page_alloc.h>
+
+/**
+ * DOC: overview
+ *
+ * This library provides a GEM buffer object that is backed by video RAM
+ * (VRAM). It can be used for framebuffer devices with dedicated memory.
+ */
+
+/*
+ * Buffer-objects helpers
+ */
+
+static void drm_gem_vram_cleanup(struct drm_gem_vram_object *gbo)
+{
+	/* We got here via ttm_bo_put(), which means that the
+	 * TTM buffer object in 'bo' has already been cleaned
+	 * up; only release the GEM object.
+	 */
+	drm_gem_object_release(&gbo->gem);
+}
+
+static void drm_gem_vram_destroy(struct drm_gem_vram_object *gbo)
+{
+	drm_gem_vram_cleanup(gbo);
+	kfree(gbo);
+}
+
+static void ttm_buffer_object_destroy(struct ttm_buffer_object *bo)
+{
+	struct drm_gem_vram_object *gbo = drm_gem_vram_of_bo(bo);
+
+	drm_gem_vram_destroy(gbo);
+}
+
+static void drm_gem_vram_placement(struct drm_gem_vram_object *gbo,
+				   unsigned long pl_flag)
+{
+	unsigned int i;
+	unsigned int c = 0;
+
+	gbo->placement.placement = gbo->placements;
+	gbo->placement.busy_placement = gbo->placements;
+
+	if (pl_flag & TTM_PL_FLAG_VRAM)
+		gbo->placements[c++].flags = TTM_PL_FLAG_WC |
+					     TTM_PL_FLAG_UNCACHED |
+					     TTM_PL_FLAG_VRAM;
+
+	if (pl_flag & TTM_PL_FLAG_SYSTEM)
+		gbo->placements[c++].flags = TTM_PL_MASK_CACHING |
+					     TTM_PL_FLAG_SYSTEM;
+
+	if (!c)
+		gbo->placements[c++].flags = TTM_PL_MASK_CACHING |
+					     TTM_PL_FLAG_SYSTEM;
+
+	gbo->placement.num_placement = c;
+	gbo->placement.num_busy_placement = c;
+
+	for (i = 0; i < c; ++i) {
+		gbo->placements[i].fpfn = 0;
+		gbo->placements[i].lpfn = 0;
+	}
+}
+
+static int drm_gem_vram_init(struct drm_device *dev,
+			     struct ttm_bo_device *bdev,
+			     struct drm_gem_vram_object *gbo,
+			     size_t size, unsigned long pg_align,
+			     bool interruptible)
+{
+	int ret;
+	size_t acc_size;
+
+	ret = drm_gem_object_init(dev, &gbo->gem, size);
+	if (ret)
+		return ret;
+
+	acc_size = ttm_bo_dma_acc_size(bdev, size, sizeof(*gbo));
+
+	gbo->bo.bdev = bdev;
+	drm_gem_vram_placement(gbo, TTM_PL_FLAG_VRAM | TTM_PL_FLAG_SYSTEM);
+
+	ret = ttm_bo_init(bdev, &gbo->bo, size, ttm_bo_type_device,
+			  &gbo->placement, pg_align, interruptible, acc_size,
+			  NULL, NULL, ttm_buffer_object_destroy);
+	if (ret)
+		goto err_drm_gem_object_release;
+
+	return 0;
+
+err_drm_gem_object_release:
+	drm_gem_object_release(&gbo->gem);
+	return ret;
+}
+
+/**
+ * drm_gem_vram_create() - Creates a VRAM-backed GEM object
+ * @dev:		the DRM device
+ * @bdev:		the TTM BO device backing the object
+ * @size:		the buffer size in bytes
+ * @pg_align:		the buffer's alignment in multiples of the page size
+ * @interruptible:	sleep interruptible if waiting for memory
+ *
+ * Returns:
+ * A new instance of &struct drm_gem_vram_object on success, or
+ * an ERR_PTR()-encoded error code otherwise.
+ */
+struct drm_gem_vram_object *drm_gem_vram_create(struct drm_device *dev,
+						struct ttm_bo_device *bdev,
+						size_t size,
+						unsigned long pg_align,
+						bool interruptible)
+{
+	struct drm_gem_vram_object *gbo;
+	int ret;
+
+	gbo = kzalloc(sizeof(*gbo), GFP_KERNEL);
+	if (!gbo)
+		return ERR_PTR(-ENOMEM);
+
+	ret = drm_gem_vram_init(dev, bdev, gbo, size, pg_align, interruptible);
+	if (ret < 0)
+		goto err_kfree;
+
+	return gbo;
+
+err_kfree:
+	kfree(gbo);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(drm_gem_vram_create);
+
+/**
+ * drm_gem_vram_put() - Releases a reference to a VRAM-backed GEM object
+ * @gbo:	the GEM VRAM object
+ *
+ * See ttm_bo_put() for more information.
+ */
+void drm_gem_vram_put(struct drm_gem_vram_object *gbo)
+{
+	ttm_bo_put(&gbo->bo);
+}
+EXPORT_SYMBOL(drm_gem_vram_put);
+
+/**
+ * drm_gem_vram_reserve() - Reserves a VRAM-backed GEM object
+ * @gbo:	the GEM VRAM object
+ * @no_wait:	don't wait for buffer object to become available
+ *
+ * See ttm_bo_reserve() for more information.
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise
+ */
+int drm_gem_vram_reserve(struct drm_gem_vram_object *gbo, bool no_wait)
+{
+	return ttm_bo_reserve(&gbo->bo, true, no_wait, NULL);
+}
+EXPORT_SYMBOL(drm_gem_vram_reserve);
+
+/**
+ * drm_gem_vram_unreserve() - \
+	Release a reservation acquired by drm_gem_vram_reserve()
+ * @gbo:	the GEM VRAM object
+ *
+ * See ttm_bo_unreserve() for more information.
+ */
+void drm_gem_vram_unreserve(struct drm_gem_vram_object *gbo)
+{
+	ttm_bo_unreserve(&gbo->bo);
+}
+EXPORT_SYMBOL(drm_gem_vram_unreserve);
+
+/**
+ * drm_gem_vram_mmap_offset() - Returns a GEM VRAM object's mmap offset
+ * @gbo:	the GEM VRAM object
+ *
+ * See drm_vma_node_offset_addr() for more information.
+ *
+ * Returns:
+ * The buffer object's offset for userspace mappings on success, or
+ * 0 if no offset is allocated.
+ */
+u64 drm_gem_vram_mmap_offset(struct drm_gem_vram_object *gbo)
+{
+	return drm_vma_node_offset_addr(&gbo->bo.vma_node);
+}
+EXPORT_SYMBOL(drm_gem_vram_mmap_offset);
+
+/**
+ * drm_gem_vram_offset() - \
+	Returns a GEM VRAM object's offset in video memory
+ * @gbo:	the GEM VRAM object
+ *
+ * This function returns the buffer object's offset in the device's video
+ * memory. The buffer object has to be pinned to %TTM_PL_VRAM.
+ *
+ * Returns:
+ * The buffer object's offset in video memory on success, or
+ * a negative errno code otherwise.
+ */
+s64 drm_gem_vram_offset(struct drm_gem_vram_object *gbo)
+{
+	if (WARN_ON_ONCE(!gbo->pin_count))
+		return (s64)-ENODEV;
+	return gbo->bo.offset;
+}
+EXPORT_SYMBOL(drm_gem_vram_offset);
+
+/**
+ * drm_gem_vram_pin() - Pins a GEM VRAM object in a region.
+ * @gbo:	the GEM VRAM object
+ * @pl_flag:	a bitmask of possible memory regions
+ *
+ * Pinning a buffer object ensures that it is not evicted from
+ * a memory region. A pinned buffer object has to be unpinned before
+ * it can be pinned to another region.
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_gem_vram_pin(struct drm_gem_vram_object *gbo, unsigned long pl_flag)
+{
+	int i, ret;
+	struct ttm_operation_ctx ctx = { false, false };
+
+	if (gbo->pin_count) {
+		++gbo->pin_count;
+		return 0;
+	}
+
+	drm_gem_vram_placement(gbo, pl_flag);
+	for (i = 0; i < gbo->placement.num_placement; ++i)
+		gbo->placements[i].flags |= TTM_PL_FLAG_NO_EVICT;
+
+	ret = ttm_bo_validate(&gbo->bo, &gbo->placement, &ctx);
+	if (ret < 0)
+		return ret;
+
+	gbo->pin_count = 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_gem_vram_pin);
+
+/**
+ * drm_gem_vram_unpin() - Unpins a GEM VRAM object
+ * @gbo:	the GEM VRAM object
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_gem_vram_unpin(struct drm_gem_vram_object *gbo)
+{
+	int i, ret;
+	struct ttm_operation_ctx ctx = { false, false };
+
+	if (WARN_ON_ONCE(!gbo->pin_count))
+		return 0;
+
+	--gbo->pin_count;
+	if (gbo->pin_count)
+		return 0;
+
+	for (i = 0; i < gbo->placement.num_placement ; ++i)
+		gbo->placements[i].flags &= ~TTM_PL_FLAG_NO_EVICT;
+
+	ret = ttm_bo_validate(&gbo->bo, &gbo->placement, &ctx);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_gem_vram_unpin);
+
+/**
+ * drm_gem_vram_push_to_system() - \
+	Unpins a GEM VRAM object and moves it to system memory
+ * @gbo:	the GEM VRAM object
+ *
+ * This operation only works if the caller holds the final pin on the
+ * buffer object.
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_gem_vram_push_to_system(struct drm_gem_vram_object *gbo)
+{
+	int i, ret;
+	struct ttm_operation_ctx ctx = { false, false };
+
+	if (WARN_ON_ONCE(!gbo->pin_count))
+		return 0;
+
+	--gbo->pin_count;
+	if (gbo->pin_count)
+		return 0;
+
+	if (gbo->kmap.virtual)
+		ttm_bo_kunmap(&gbo->kmap);
+
+	drm_gem_vram_placement(gbo, TTM_PL_FLAG_SYSTEM);
+	for (i = 0; i < gbo->placement.num_placement ; ++i)
+		gbo->placements[i].flags |= TTM_PL_FLAG_NO_EVICT;
+
+	ret = ttm_bo_validate(&gbo->bo, &gbo->placement, &ctx);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_gem_vram_push_to_system);
+
+/**
+ * drm_gem_vram_kmap_at() - Maps a GEM VRAM object into kernel address space
+ * @gbo:	the GEM VRAM object
+ * @map:	establish a mapping if necessary
+ * @is_iomem:	returns true if the mapped memory is I/O memory, or false \
+	otherwise; can be NULL
+ * @kmap:	the mapping's kmap object
+ *
+ * This function maps the buffer object into the kernel's address space
+ * or returns the current mapping. If the parameter map is false, the
+ * function only queries the current mapping, but does not establish a
+ * new one.
+ *
+ * Returns:
+ * The buffers virtual address if mapped, or
+ * NULL if not mapped, or
+ * an ERR_PTR()-encoded error code otherwise.
+ */
+void *drm_gem_vram_kmap_at(struct drm_gem_vram_object *gbo, bool map,
+			   bool *is_iomem, struct ttm_bo_kmap_obj *kmap)
+{
+	int ret;
+
+	if (kmap->virtual || !map)
+		goto out;
+
+	ret = ttm_bo_kmap(&gbo->bo, 0, gbo->bo.num_pages, kmap);
+	if (ret)
+		return ERR_PTR(ret);
+
+out:
+	if (!is_iomem)
+		return kmap->virtual;
+	if (!kmap->virtual) {
+		*is_iomem = false;
+		return NULL;
+	}
+	return ttm_kmap_obj_virtual(kmap, is_iomem);
+}
+EXPORT_SYMBOL(drm_gem_vram_kmap_at);
+
+/**
+ * drm_gem_vram_kmap() - Maps a GEM VRAM object into kernel address space
+ * @gbo:	the GEM VRAM object
+ * @map:	establish a mapping if necessary
+ * @is_iomem:	returns true if the mapped memory is I/O memory, or false \
+	otherwise; can be NULL
+ *
+ * This function maps the buffer object into the kernel's address space
+ * or returns the current mapping. If the parameter map is false, the
+ * function only queries the current mapping, but does not establish a
+ * new one.
+ *
+ * Returns:
+ * The buffers virtual address if mapped, or
+ * NULL if not mapped, or
+ * an ERR_PTR()-encoded error code otherwise.
+ */
+void *drm_gem_vram_kmap(struct drm_gem_vram_object *gbo, bool map,
+			bool *is_iomem)
+{
+	return drm_gem_vram_kmap_at(gbo, map, is_iomem, &gbo->kmap);
+}
+EXPORT_SYMBOL(drm_gem_vram_kmap);
+
+/**
+ * drm_gem_vram_kunmap_at() - Unmaps a GEM VRAM object
+ * @gbo:	the GEM VRAM object
+ * @kmap:	the mapping's kmap object
+ */
+void drm_gem_vram_kunmap_at(struct drm_gem_vram_object *gbo,
+			    struct ttm_bo_kmap_obj *kmap)
+{
+	if (!kmap->virtual)
+		return;
+
+	ttm_bo_kunmap(kmap);
+	kmap->virtual = NULL;
+}
+EXPORT_SYMBOL(drm_gem_vram_kunmap_at);
+
+/**
+ * drm_gem_vram_kunmap() - Unmaps a GEM VRAM object
+ * @gbo:	the GEM VRAM object
+ */
+void drm_gem_vram_kunmap(struct drm_gem_vram_object *gbo)
+{
+	drm_gem_vram_kunmap_at(gbo, &gbo->kmap);
+}
+EXPORT_SYMBOL(drm_gem_vram_kunmap);
diff --git a/drivers/gpu/drm/drm_vram_helper_common.c b/drivers/gpu/drm/drm_vram_helper_common.c
new file mode 100644
index 000000000000..7c25daca18d0
--- /dev/null
+++ b/drivers/gpu/drm/drm_vram_helper_common.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("DRM VRAM memory-management helpers");
+MODULE_LICENSE("GPL");
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
new file mode 100644
index 000000000000..5802dd694939
--- /dev/null
+++ b/include/drm/drm_gem_vram_helper.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef DRM_GEM_VRAM_HELPER_H
+#define DRM_GEM_VRAM_HELPER_H
+
+#include <drm/drm_gem.h>
+#include <drm/ttm/ttm_bo_api.h>
+#include <drm/ttm/ttm_placement.h>
+#include <linux/kernel.h> /* for container_of() */
+
+struct filp;
+
+#define DRM_GEM_VRAM_PL_FLAG_VRAM	TTM_PL_FLAG_VRAM
+#define DRM_GEM_VRAM_PL_FLAG_SYSTEM	TTM_PL_FLAG_SYSTEM
+
+/*
+ * Buffer-object helpers
+ */
+
+/**
+ * struct drm_gem_vram_object - GEM object backed by VRAM
+ * @gem:	GEM object
+ * @bo:		TTM buffer object
+ * @kmap:	Mapping information for @bo
+ * @placement:	TTM placement information. Supported placements are \
+	%TTM_PL_VRAM and %TTM_PL_SYSTEM
+ * @placements:	TTM placement information.
+ * @pin_count:	Pin counter
+ *
+ * The type struct drm_gem_vram_object represents a GEM object that is
+ * backed by VRAM. It can be used for simple framebuffer devices with
+ * dedicated memory. The buffer object can be evicted to system memory if
+ * video memory becomes scarce.
+ */
+struct drm_gem_vram_object {
+	struct drm_gem_object gem;
+	struct ttm_buffer_object bo;
+	struct ttm_bo_kmap_obj kmap;
+
+	/* Supported placements are %TTM_PL_VRAM and %TTM_PL_SYSTEM */
+	struct ttm_placement placement;
+	struct ttm_place placements[2];
+
+	int pin_count;
+};
+
+/**
+ * Returns the container of type &struct drm_gem_vram_object
+ * for field bo.
+ * @bo:		the VRAM buffer object
+ * Returns:	The containing GEM VRAM object
+ */
+static inline struct drm_gem_vram_object *drm_gem_vram_of_bo(
+	struct ttm_buffer_object *bo)
+{
+	return container_of(bo, struct drm_gem_vram_object, bo);
+}
+
+/**
+ * Returns the container of type &struct drm_gem_vram_object
+ * for field gem.
+ * @gem:	the GEM object
+ * Returns:	The containing GEM VRAM object
+ */
+static inline struct drm_gem_vram_object *drm_gem_vram_of_gem(
+	struct drm_gem_object *gem)
+{
+	return container_of(gem, struct drm_gem_vram_object, gem);
+}
+
+struct drm_gem_vram_object *drm_gem_vram_create(struct drm_device *dev,
+						struct ttm_bo_device *bdev,
+						size_t size,
+						unsigned long pg_align,
+						bool interruptible);
+void drm_gem_vram_put(struct drm_gem_vram_object *gbo);
+int drm_gem_vram_reserve(struct drm_gem_vram_object *gbo, bool no_wait);
+void drm_gem_vram_unreserve(struct drm_gem_vram_object *gbo);
+u64 drm_gem_vram_mmap_offset(struct drm_gem_vram_object *gbo);
+s64 drm_gem_vram_offset(struct drm_gem_vram_object *gbo);
+int drm_gem_vram_pin(struct drm_gem_vram_object *gbo, unsigned long pl_flag);
+int drm_gem_vram_unpin(struct drm_gem_vram_object *gbo);
+int drm_gem_vram_push_to_system(struct drm_gem_vram_object *gbo);
+void *drm_gem_vram_kmap_at(struct drm_gem_vram_object *gbo, bool map,
+			   bool *is_iomem, struct ttm_bo_kmap_obj *kmap);
+void *drm_gem_vram_kmap(struct drm_gem_vram_object *gbo, bool map,
+			bool *is_iomem);
+void drm_gem_vram_kunmap_at(struct drm_gem_vram_object *gbo,
+			    struct ttm_bo_kmap_obj *kmap);
+void drm_gem_vram_kunmap(struct drm_gem_vram_object *gbo);
+
+#endif
-- 
cgit v1.2.3


From 6c812bc5074bbc969e78fadfbba7ac41fe61ddeb Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 8 May 2019 10:26:12 +0200
Subject: drm: Add |struct drm_gem_vram_object| callbacks for |struct
 ttm_bo_driver|

The provided helpers can be used for the respective callback functions
in |struct ttm_bo_driver|.

v2:
	* drm_is_gem_vram() is now a private function
	* documentation fixes

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: http://patchwork.freedesktop.org/patch/msgid/20190508082630.15116-3-tzimmermann@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/drm_gem_vram_helper.c | 50 +++++++++++++++++++++++++++++++++++
 include/drm/drm_gem_vram_helper.h     | 10 +++++++
 2 files changed, 60 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index 761a546412f8..d6c4e0d0a1c5 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -410,3 +410,53 @@ void drm_gem_vram_kunmap(struct drm_gem_vram_object *gbo)
 	drm_gem_vram_kunmap_at(gbo, &gbo->kmap);
 }
 EXPORT_SYMBOL(drm_gem_vram_kunmap);
+
+/*
+ * Helpers for struct ttm_bo_driver
+ */
+
+static bool drm_is_gem_vram(struct ttm_buffer_object *bo)
+{
+	return (bo->destroy == ttm_buffer_object_destroy);
+}
+
+/**
+ * drm_gem_vram_bo_driver_evict_flags() - \
+	Implements &struct ttm_bo_driver.evict_flags
+ * @bo:	TTM buffer object. Refers to &struct drm_gem_vram_object.bo
+ * @pl:	TTM placement information.
+ */
+void drm_gem_vram_bo_driver_evict_flags(struct ttm_buffer_object *bo,
+					struct ttm_placement *pl)
+{
+	struct drm_gem_vram_object *gbo;
+
+	/* TTM may pass BOs that are not GEM VRAM BOs. */
+	if (!drm_is_gem_vram(bo))
+		return;
+
+	gbo = drm_gem_vram_of_bo(bo);
+	drm_gem_vram_placement(gbo, TTM_PL_FLAG_SYSTEM);
+	*pl = gbo->placement;
+}
+EXPORT_SYMBOL(drm_gem_vram_bo_driver_evict_flags);
+
+/**
+ * drm_gem_vram_bo_driver_verify_access() - \
+	Implements &struct ttm_bo_driver.verify_access
+ * @bo:		TTM buffer object. Refers to &struct drm_gem_vram_object.bo
+ * @filp:	File pointer.
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative errno code otherwise.
+ */
+int drm_gem_vram_bo_driver_verify_access(struct ttm_buffer_object *bo,
+					 struct file *filp)
+{
+	struct drm_gem_vram_object *gbo = drm_gem_vram_of_bo(bo);
+
+	return drm_vma_node_verify_access(&gbo->gem.vma_node,
+					  filp->private_data);
+}
+EXPORT_SYMBOL(drm_gem_vram_bo_driver_verify_access);
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index 5802dd694939..1915d3958304 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -89,4 +89,14 @@ void drm_gem_vram_kunmap_at(struct drm_gem_vram_object *gbo,
 			    struct ttm_bo_kmap_obj *kmap);
 void drm_gem_vram_kunmap(struct drm_gem_vram_object *gbo);
 
+/*
+ * Helpers for struct ttm_bo_driver
+ */
+
+void drm_gem_vram_bo_driver_evict_flags(struct ttm_buffer_object *bo,
+					struct ttm_placement *pl);
+
+int drm_gem_vram_bo_driver_verify_access(struct ttm_buffer_object *bo,
+					 struct file *filp);
+
 #endif
-- 
cgit v1.2.3


From 737000fd9c7d32dd378733cc4bba17555b241be0 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 8 May 2019 10:26:13 +0200
Subject: drm: Add |struct drm_gem_vram_object| callbacks for |struct
 drm_driver|

The provided helpers can be used for the respective callback functions
in |struct drm_driver|.

v4:
	* cleanups from checkpatch.pl
v2:
	* documentation fixes

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: http://patchwork.freedesktop.org/patch/msgid/20190508082630.15116-4-tzimmermann@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/drm_gem_vram_helper.c | 49 +++++++++++++++++++++++++++++++++++
 include/drm/drm_gem_vram_helper.h     | 10 +++++++
 2 files changed, 59 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index d6c4e0d0a1c5..264028374336 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -460,3 +460,52 @@ int drm_gem_vram_bo_driver_verify_access(struct ttm_buffer_object *bo,
 					  filp->private_data);
 }
 EXPORT_SYMBOL(drm_gem_vram_bo_driver_verify_access);
+
+/*
+ * Helpers for struct drm_driver
+ */
+
+/**
+ * drm_gem_vram_driver_gem_free_object_unlocked() - \
+	Implements &struct drm_driver.gem_free_object_unlocked
+ * @gem:	GEM object. Refers to &struct drm_gem_vram_object.gem
+ */
+void drm_gem_vram_driver_gem_free_object_unlocked(struct drm_gem_object *gem)
+{
+	struct drm_gem_vram_object *gbo = drm_gem_vram_of_gem(gem);
+
+	drm_gem_vram_put(gbo);
+}
+EXPORT_SYMBOL(drm_gem_vram_driver_gem_free_object_unlocked);
+
+/**
+ * drm_gem_vram_driver_dumb_mmap_offset() - \
+	Implements &struct drm_driver.dumb_mmap_offset
+ * @file:	DRM file pointer.
+ * @dev:	DRM device.
+ * @handle:	GEM handle
+ * @offset:	Returns the mapping's memory offset on success
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative errno code otherwise.
+ */
+int drm_gem_vram_driver_dumb_mmap_offset(struct drm_file *file,
+					 struct drm_device *dev,
+					 uint32_t handle, uint64_t *offset)
+{
+	struct drm_gem_object *gem;
+	struct drm_gem_vram_object *gbo;
+
+	gem = drm_gem_object_lookup(file, handle);
+	if (!gem)
+		return -ENOENT;
+
+	gbo = drm_gem_vram_of_gem(gem);
+	*offset = drm_gem_vram_mmap_offset(gbo);
+
+	drm_gem_object_put_unlocked(gem);
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_gem_vram_driver_dumb_mmap_offset);
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index 1915d3958304..b87cb2e9d9da 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -99,4 +99,14 @@ void drm_gem_vram_bo_driver_evict_flags(struct ttm_buffer_object *bo,
 int drm_gem_vram_bo_driver_verify_access(struct ttm_buffer_object *bo,
 					 struct file *filp);
 
+/*
+ * Helpers for struct drm_driver
+ */
+
+void drm_gem_vram_driver_gem_free_object_unlocked(struct drm_gem_object *gem);
+
+int drm_gem_vram_driver_dumb_mmap_offset(struct drm_file *file,
+					 struct drm_device *dev,
+					 uint32_t handle, uint64_t *offset);
+
 #endif
-- 
cgit v1.2.3


From fed1eec080b9b23f5a80a0c4f19ae49a2d14c69e Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 8 May 2019 10:26:14 +0200
Subject: drm: Add drm_gem_vram_fill_create_dumb() to create dumb buffers

The helper function drm_gem_vram_fill_create_dumb() implements most of
struct drm_driver.dumb_create() for GEM-VRAM buffer objects. It's not a
full implementation of the callback, as several driver-specific parameters
are still required.

v4:
	* cleanups from checkpatch.pl
v2:
	* documentation fixes

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: http://patchwork.freedesktop.org/patch/msgid/20190508082630.15116-5-tzimmermann@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/drm_gem_vram_helper.c | 62 +++++++++++++++++++++++++++++++++++
 include/drm/drm_gem_vram_helper.h     |  8 +++++
 2 files changed, 70 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index 264028374336..aba073279ca2 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <drm/drm_gem_vram_helper.h>
+#include <drm/drm_mode.h>
 #include <drm/ttm/ttm_page_alloc.h>
 
 /**
@@ -411,6 +412,67 @@ void drm_gem_vram_kunmap(struct drm_gem_vram_object *gbo)
 }
 EXPORT_SYMBOL(drm_gem_vram_kunmap);
 
+/**
+ * drm_gem_vram_fill_create_dumb() - \
+	Helper for implementing &struct drm_driver.dumb_create
+ * @file:		the DRM file
+ * @dev:		the DRM device
+ * @bdev:		the TTM BO device managing the buffer object
+ * @pg_align:		the buffer's alignment in multiples of the page size
+ * @interruptible:	sleep interruptible if waiting for memory
+ * @args:		the arguments as provided to \
+				&struct drm_driver.dumb_create
+ *
+ * This helper function fills &struct drm_mode_create_dumb, which is used
+ * by &struct drm_driver.dumb_create. Implementations of this interface
+ * should forwards their arguments to this helper, plus the driver-specific
+ * parameters.
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_gem_vram_fill_create_dumb(struct drm_file *file,
+				  struct drm_device *dev,
+				  struct ttm_bo_device *bdev,
+				  unsigned long pg_align,
+				  bool interruptible,
+				  struct drm_mode_create_dumb *args)
+{
+	size_t pitch, size;
+	struct drm_gem_vram_object *gbo;
+	int ret;
+	u32 handle;
+
+	pitch = args->width * ((args->bpp + 7) / 8);
+	size = pitch * args->height;
+
+	size = roundup(size, PAGE_SIZE);
+	if (!size)
+		return -EINVAL;
+
+	gbo = drm_gem_vram_create(dev, bdev, size, pg_align, interruptible);
+	if (IS_ERR(gbo))
+		return PTR_ERR(gbo);
+
+	ret = drm_gem_handle_create(file, &gbo->gem, &handle);
+	if (ret)
+		goto err_drm_gem_object_put_unlocked;
+
+	drm_gem_object_put_unlocked(&gbo->gem);
+
+	args->pitch = pitch;
+	args->size = size;
+	args->handle = handle;
+
+	return 0;
+
+err_drm_gem_object_put_unlocked:
+	drm_gem_object_put_unlocked(&gbo->gem);
+	return ret;
+}
+EXPORT_SYMBOL(drm_gem_vram_fill_create_dumb);
+
 /*
  * Helpers for struct ttm_bo_driver
  */
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index b87cb2e9d9da..8e54f721c9f8 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -8,6 +8,7 @@
 #include <drm/ttm/ttm_placement.h>
 #include <linux/kernel.h> /* for container_of() */
 
+struct drm_mode_create_dumb;
 struct filp;
 
 #define DRM_GEM_VRAM_PL_FLAG_VRAM	TTM_PL_FLAG_VRAM
@@ -89,6 +90,13 @@ void drm_gem_vram_kunmap_at(struct drm_gem_vram_object *gbo,
 			    struct ttm_bo_kmap_obj *kmap);
 void drm_gem_vram_kunmap(struct drm_gem_vram_object *gbo);
 
+int drm_gem_vram_fill_create_dumb(struct drm_file *file,
+				  struct drm_device *dev,
+				  struct ttm_bo_device *bdev,
+				  unsigned long pg_align,
+				  bool interruptible,
+				  struct drm_mode_create_dumb *args);
+
 /*
  * Helpers for struct ttm_bo_driver
  */
-- 
cgit v1.2.3


From 1f460b497890d6bf54aba6e41fe24bba2217141a Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 8 May 2019 10:26:15 +0200
Subject: drm: Add simple PRIME helpers for GEM VRAM

These basic helper functions for GEM VRAM allow for pinning and mapping
GEM VRAM objects via the PRIME interfaces. It's not a full implementation,
but complete enough for generic fbcon.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: http://patchwork.freedesktop.org/patch/msgid/20190508082630.15116-6-tzimmermann@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/drm_gem_vram_helper.c | 98 +++++++++++++++++++++++++++++++++++
 include/drm/drm_gem_vram_helper.h     | 20 +++++++
 2 files changed, 118 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index aba073279ca2..15c6193e4985 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -2,6 +2,7 @@
 
 #include <drm/drm_gem_vram_helper.h>
 #include <drm/drm_mode.h>
+#include <drm/drm_prime.h>
 #include <drm/ttm/ttm_page_alloc.h>
 
 /**
@@ -571,3 +572,100 @@ int drm_gem_vram_driver_dumb_mmap_offset(struct drm_file *file,
 	return 0;
 }
 EXPORT_SYMBOL(drm_gem_vram_driver_dumb_mmap_offset);
+
+/*
+ * PRIME helpers for struct drm_driver
+ */
+
+/**
+ * drm_gem_vram_driver_gem_prime_pin() - \
+	Implements &struct drm_driver.gem_prime_pin
+ * @gem:	The GEM object to pin
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative errno code otherwise.
+ */
+int drm_gem_vram_driver_gem_prime_pin(struct drm_gem_object *gem)
+{
+	struct drm_gem_vram_object *gbo = drm_gem_vram_of_gem(gem);
+
+	return drm_gem_vram_pin(gbo, DRM_GEM_VRAM_PL_FLAG_VRAM);
+}
+EXPORT_SYMBOL(drm_gem_vram_driver_gem_prime_pin);
+
+/**
+ * drm_gem_vram_driver_gem_prime_unpin() - \
+	Implements &struct drm_driver.gem_prime_unpin
+ * @gem:	The GEM object to unpin
+ */
+void drm_gem_vram_driver_gem_prime_unpin(struct drm_gem_object *gem)
+{
+	struct drm_gem_vram_object *gbo = drm_gem_vram_of_gem(gem);
+
+	drm_gem_vram_unpin(gbo);
+}
+EXPORT_SYMBOL(drm_gem_vram_driver_gem_prime_unpin);
+
+/**
+ * drm_gem_vram_driver_gem_prime_vmap() - \
+	Implements &struct drm_driver.gem_prime_vmap
+ * @gem:	The GEM object to map
+ *
+ * Returns:
+ * The buffers virtual address on success, or
+ * NULL otherwise.
+ */
+void *drm_gem_vram_driver_gem_prime_vmap(struct drm_gem_object *gem)
+{
+	struct drm_gem_vram_object *gbo = drm_gem_vram_of_gem(gem);
+	int ret;
+	void *base;
+
+	ret = drm_gem_vram_pin(gbo, DRM_GEM_VRAM_PL_FLAG_VRAM);
+	if (ret)
+		return NULL;
+	base = drm_gem_vram_kmap(gbo, true, NULL);
+	if (IS_ERR(base)) {
+		drm_gem_vram_unpin(gbo);
+		return NULL;
+	}
+	return base;
+}
+EXPORT_SYMBOL(drm_gem_vram_driver_gem_prime_vmap);
+
+/**
+ * drm_gem_vram_driver_gem_prime_vunmap() - \
+	Implements &struct drm_driver.gem_prime_vunmap
+ * @gem:	The GEM object to unmap
+ * @vaddr:	The mapping's base address
+ */
+void drm_gem_vram_driver_gem_prime_vunmap(struct drm_gem_object *gem,
+					  void *vaddr)
+{
+	struct drm_gem_vram_object *gbo = drm_gem_vram_of_gem(gem);
+
+	drm_gem_vram_kunmap(gbo);
+	drm_gem_vram_unpin(gbo);
+}
+EXPORT_SYMBOL(drm_gem_vram_driver_gem_prime_vunmap);
+
+/**
+ * drm_gem_vram_driver_gem_prime_mmap() - \
+	Implements &struct drm_driver.gem_prime_mmap
+ * @gem:	The GEM object to map
+ * @vma:	The VMA describing the mapping
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative errno code otherwise.
+ */
+int drm_gem_vram_driver_gem_prime_mmap(struct drm_gem_object *gem,
+				       struct vm_area_struct *vma)
+{
+	struct drm_gem_vram_object *gbo = drm_gem_vram_of_gem(gem);
+
+	gbo->gem.vma_node.vm_node.start = gbo->bo.vma_node.vm_node.start;
+	return drm_gem_prime_mmap(gem, vma);
+}
+EXPORT_SYMBOL(drm_gem_vram_driver_gem_prime_mmap);
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index 8e54f721c9f8..9a8bd98b623f 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -116,5 +116,25 @@ void drm_gem_vram_driver_gem_free_object_unlocked(struct drm_gem_object *gem);
 int drm_gem_vram_driver_dumb_mmap_offset(struct drm_file *file,
 					 struct drm_device *dev,
 					 uint32_t handle, uint64_t *offset);
+/*
+ * PRIME helpers for struct drm_driver
+ */
+
+int drm_gem_vram_driver_gem_prime_pin(struct drm_gem_object *obj);
+void drm_gem_vram_driver_gem_prime_unpin(struct drm_gem_object *obj);
+void *drm_gem_vram_driver_gem_prime_vmap(struct drm_gem_object *obj);
+void drm_gem_vram_driver_gem_prime_vunmap(struct drm_gem_object *obj,
+					  void *vaddr);
+int drm_gem_vram_driver_gem_prime_mmap(struct drm_gem_object *obj,
+				       struct vm_area_struct *vma);
+
+#define DRM_GEM_VRAM_DRIVER_PRIME \
+	.gem_prime_export = drm_gem_prime_export, \
+	.gem_prime_import = drm_gem_prime_import, \
+	.gem_prime_pin	  = drm_gem_vram_driver_gem_prime_pin, \
+	.gem_prime_unpin  = drm_gem_vram_driver_gem_prime_unpin, \
+	.gem_prime_vmap	  = drm_gem_vram_driver_gem_prime_vmap, \
+	.gem_prime_vunmap = drm_gem_vram_driver_gem_prime_vunmap, \
+	.gem_prime_mmap	  = drm_gem_vram_driver_gem_prime_mmap
 
 #endif
-- 
cgit v1.2.3


From 96352eca5c7c36c3e0eb97a6d68fb170ce643ea5 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 8 May 2019 10:26:16 +0200
Subject: drm: Add VRAM MM, a simple memory manager for dedicated VRAM

The VRAM MM memory manager is a helper library that manages dedicated video
memory of simple framebuffer devices. It is supported to be used with
struct drm_gem_vram_object, but does not depend on it.

The implementation is based on the respective code from ast, bochs, and
mgag200. These drivers share the exact same implementation except for type
names. The helpers are currently build with TTM. This may change in future
revisions.

v4:
	* cleanups from checkpatch.pl
v2:
	* renamed to struct drm_vram_mm
	* add drm_vram_mm_mmap() helper
	* documentation fixes

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: http://patchwork.freedesktop.org/patch/msgid/20190508082630.15116-7-tzimmermann@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 Documentation/gpu/drm-mm.rst         |  13 ++-
 drivers/gpu/drm/Makefile             |   3 +-
 drivers/gpu/drm/drm_vram_mm_helper.c | 210 +++++++++++++++++++++++++++++++++++
 include/drm/drm_vram_mm_helper.h     |  69 ++++++++++++
 4 files changed, 293 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/drm_vram_mm_helper.c
 create mode 100644 include/drm/drm_vram_mm_helper.h

(limited to 'include')

diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
index a3b685089512..eba50afbda42 100644
--- a/Documentation/gpu/drm-mm.rst
+++ b/Documentation/gpu/drm-mm.rst
@@ -79,7 +79,6 @@ count for the TTM, which will call your initialization function.
 
 See the radeon_ttm.c file for an example of usage.
 
-
 The Graphics Execution Manager (GEM)
 ====================================
 
@@ -395,6 +394,18 @@ GEM VRAM Helper Functions Reference
 .. kernel-doc:: drivers/gpu/drm/drm_gem_vram_helper.c
    :export:
 
+VRAM MM Helper Functions Reference
+----------------------------------
+
+.. kernel-doc:: drivers/gpu/drm/drm_vram_mm_helper.c
+   :doc: overview
+
+.. kernel-doc:: include/drm/drm_vram_mm_helper.h
+   :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_vram_mm_helper.c
+   :export:
+
 VMA Offset Manager
 ==================
 
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index ed49b2480766..4c3dc4268b65 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -33,7 +33,8 @@ drm-$(CONFIG_DEBUG_FS) += drm_debugfs.o drm_debugfs_crc.o
 drm-$(CONFIG_DRM_LOAD_EDID_FIRMWARE) += drm_edid_load.o
 
 drm_vram_helper-y := drm_gem_vram_helper.o \
-		     drm_vram_helper_common.o
+		     drm_vram_helper_common.o \
+		     drm_vram_mm_helper.o
 obj-$(CONFIG_DRM_VRAM_HELPER) += drm_vram_helper.o
 
 drm_kms_helper-y := drm_crtc_helper.o drm_dp_helper.o drm_dsc.o drm_probe_helper.o \
diff --git a/drivers/gpu/drm/drm_vram_mm_helper.c b/drivers/gpu/drm/drm_vram_mm_helper.c
new file mode 100644
index 000000000000..d17c5169b018
--- /dev/null
+++ b/drivers/gpu/drm/drm_vram_mm_helper.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <drm/drm_vram_mm_helper.h>
+#include <drm/drmP.h>
+#include <drm/ttm/ttm_page_alloc.h>
+
+/**
+ * DOC: overview
+ *
+ * The data structure &struct drm_vram_mm and its helpers implement a memory
+ * manager for simple framebuffer devices with dedicated video memory. Buffer
+ * objects are either placed in video RAM or evicted to system memory. These
+ * helper functions work well with &struct drm_gem_vram_object.
+ */
+
+/*
+ * TTM TT
+ */
+
+static void backend_func_destroy(struct ttm_tt *tt)
+{
+	ttm_tt_fini(tt);
+	kfree(tt);
+}
+
+static struct ttm_backend_func backend_func = {
+	.destroy = backend_func_destroy
+};
+
+/*
+ * TTM BO device
+ */
+
+static struct ttm_tt *bo_driver_ttm_tt_create(struct ttm_buffer_object *bo,
+					      uint32_t page_flags)
+{
+	struct ttm_tt *tt;
+	int ret;
+
+	tt = kzalloc(sizeof(*tt), GFP_KERNEL);
+	if (!tt)
+		return NULL;
+
+	tt->func = &backend_func;
+
+	ret = ttm_tt_init(tt, bo, page_flags);
+	if (ret < 0)
+		goto err_ttm_tt_init;
+
+	return tt;
+
+err_ttm_tt_init:
+	kfree(tt);
+	return NULL;
+}
+
+static int bo_driver_init_mem_type(struct ttm_bo_device *bdev, uint32_t type,
+				   struct ttm_mem_type_manager *man)
+{
+	switch (type) {
+	case TTM_PL_SYSTEM:
+		man->flags = TTM_MEMTYPE_FLAG_MAPPABLE;
+		man->available_caching = TTM_PL_MASK_CACHING;
+		man->default_caching = TTM_PL_FLAG_CACHED;
+		break;
+	case TTM_PL_VRAM:
+		man->func = &ttm_bo_manager_func;
+		man->flags = TTM_MEMTYPE_FLAG_FIXED |
+			     TTM_MEMTYPE_FLAG_MAPPABLE;
+		man->available_caching = TTM_PL_FLAG_UNCACHED |
+					 TTM_PL_FLAG_WC;
+		man->default_caching = TTM_PL_FLAG_WC;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void bo_driver_evict_flags(struct ttm_buffer_object *bo,
+				  struct ttm_placement *placement)
+{
+	struct drm_vram_mm *vmm = drm_vram_mm_of_bdev(bo->bdev);
+
+	if (vmm->funcs && vmm->funcs->evict_flags)
+		vmm->funcs->evict_flags(bo, placement);
+}
+
+static int bo_driver_verify_access(struct ttm_buffer_object *bo,
+				   struct file *filp)
+{
+	struct drm_vram_mm *vmm = drm_vram_mm_of_bdev(bo->bdev);
+
+	if (!vmm->funcs || !vmm->funcs->verify_access)
+		return 0;
+	return vmm->funcs->verify_access(bo, filp);
+}
+
+static int bo_driver_io_mem_reserve(struct ttm_bo_device *bdev,
+				    struct ttm_mem_reg *mem)
+{
+	struct ttm_mem_type_manager *man = bdev->man + mem->mem_type;
+	struct drm_vram_mm *vmm = drm_vram_mm_of_bdev(bdev);
+
+	if (!(man->flags & TTM_MEMTYPE_FLAG_MAPPABLE))
+		return -EINVAL;
+
+	mem->bus.addr = NULL;
+	mem->bus.size = mem->num_pages << PAGE_SHIFT;
+
+	switch (mem->mem_type) {
+	case TTM_PL_SYSTEM:	/* nothing to do */
+		mem->bus.offset = 0;
+		mem->bus.base = 0;
+		mem->bus.is_iomem = false;
+		break;
+	case TTM_PL_VRAM:
+		mem->bus.offset = mem->start << PAGE_SHIFT;
+		mem->bus.base = vmm->vram_base;
+		mem->bus.is_iomem = true;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void bo_driver_io_mem_free(struct ttm_bo_device *bdev,
+				  struct ttm_mem_reg *mem)
+{ }
+
+static struct ttm_bo_driver bo_driver = {
+	.ttm_tt_create = bo_driver_ttm_tt_create,
+	.ttm_tt_populate = ttm_pool_populate,
+	.ttm_tt_unpopulate = ttm_pool_unpopulate,
+	.init_mem_type = bo_driver_init_mem_type,
+	.eviction_valuable = ttm_bo_eviction_valuable,
+	.evict_flags = bo_driver_evict_flags,
+	.verify_access = bo_driver_verify_access,
+	.io_mem_reserve = bo_driver_io_mem_reserve,
+	.io_mem_free = bo_driver_io_mem_free,
+};
+
+/*
+ * struct drm_vram_mm
+ */
+
+/**
+ * drm_vram_mm_init() - Initialize an instance of VRAM MM.
+ * @vmm:	the VRAM MM instance to initialize
+ * @dev:	the DRM device
+ * @vram_base:	the base address of the video memory
+ * @vram_size:	the size of the video memory in bytes
+ * @funcs:	callback functions for buffer objects
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_vram_mm_init(struct drm_vram_mm *vmm, struct drm_device *dev,
+		     uint64_t vram_base, size_t vram_size,
+		     const struct drm_vram_mm_funcs *funcs)
+{
+	int ret;
+
+	vmm->vram_base = vram_base;
+	vmm->vram_size = vram_size;
+	vmm->funcs = funcs;
+
+	ret = ttm_bo_device_init(&vmm->bdev, &bo_driver,
+				 dev->anon_inode->i_mapping,
+				 true);
+	if (ret)
+		return ret;
+
+	ret = ttm_bo_init_mm(&vmm->bdev, TTM_PL_VRAM, vram_size >> PAGE_SHIFT);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_vram_mm_init);
+
+/**
+ * drm_vram_mm_cleanup() - Cleans up an initialized instance of VRAM MM.
+ * @vmm:	the VRAM MM instance to clean up
+ */
+void drm_vram_mm_cleanup(struct drm_vram_mm *vmm)
+{
+	ttm_bo_device_release(&vmm->bdev);
+}
+EXPORT_SYMBOL(drm_vram_mm_cleanup);
+
+/**
+ * drm_vram_mm_mmap() - Helper for implementing &struct file_operations.mmap()
+ * @filp:	the mapping's file structure
+ * @vma:	the mapping's memory area
+ * @vmm:	the VRAM MM instance
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_vram_mm_mmap(struct file *filp, struct vm_area_struct *vma,
+		     struct drm_vram_mm *vmm)
+{
+	return ttm_bo_mmap(filp, vma, &vmm->bdev);
+}
+EXPORT_SYMBOL(drm_vram_mm_mmap);
diff --git a/include/drm/drm_vram_mm_helper.h b/include/drm/drm_vram_mm_helper.h
new file mode 100644
index 000000000000..5d45c6447fa4
--- /dev/null
+++ b/include/drm/drm_vram_mm_helper.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef DRM_VRAM_MM_HELPER_H
+#define DRM_VRAM_MM_HELPER_H
+
+#include <drm/ttm/ttm_bo_driver.h>
+
+struct drm_device;
+
+/**
+ * struct drm_vram_mm_funcs - Callback functions for &struct drm_vram_mm
+ * @evict_flags:	Provides an implementation for struct \
+	&ttm_bo_driver.evict_flags
+ * @verify_access:	Provides an implementation for \
+	struct &ttm_bo_driver.verify_access
+ *
+ * These callback function integrate VRAM MM with TTM buffer objects. New
+ * functions can be added if necessary.
+ */
+struct drm_vram_mm_funcs {
+	void (*evict_flags)(struct ttm_buffer_object *bo,
+			    struct ttm_placement *placement);
+	int (*verify_access)(struct ttm_buffer_object *bo, struct file *filp);
+};
+
+/**
+ * struct drm_vram_mm - An instance of VRAM MM
+ * @vram_base:	Base address of the managed video memory
+ * @vram_size:	Size of the managed video memory in bytes
+ * @bdev:	The TTM BO device.
+ * @funcs:	TTM BO functions
+ *
+ * The fields &struct drm_vram_mm.vram_base and
+ * &struct drm_vram_mm.vrm_size are managed by VRAM MM, but are
+ * available for public read access. Use the field
+ * &struct drm_vram_mm.bdev to access the TTM BO device.
+ */
+struct drm_vram_mm {
+	uint64_t vram_base;
+	size_t vram_size;
+
+	struct ttm_bo_device bdev;
+
+	const struct drm_vram_mm_funcs *funcs;
+};
+
+/**
+ * drm_vram_mm_of_bdev() - \
+	Returns the container of type &struct ttm_bo_device for field bdev.
+ * @bdev:	the TTM BO device
+ *
+ * Returns:
+ * The containing instance of &struct drm_vram_mm
+ */
+static inline struct drm_vram_mm *drm_vram_mm_of_bdev(
+	struct ttm_bo_device *bdev)
+{
+	return container_of(bdev, struct drm_vram_mm, bdev);
+}
+
+int drm_vram_mm_init(struct drm_vram_mm *vmm, struct drm_device *dev,
+		     uint64_t vram_base, size_t vram_size,
+		     const struct drm_vram_mm_funcs *funcs);
+void drm_vram_mm_cleanup(struct drm_vram_mm *vmm);
+
+int drm_vram_mm_mmap(struct file *filp, struct vm_area_struct *vma,
+		     struct drm_vram_mm *vmm);
+
+#endif
-- 
cgit v1.2.3


From 5c9dcacfe56673555540933017c54e8f39e947cb Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 8 May 2019 10:26:17 +0200
Subject: drm: Add default instance for VRAM MM callback functions

VRAM MM is most likely be used with GEM VRAM. The latter now provides the
required instance of struct drm_vram_mm_funcs for drivers to use.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: http://patchwork.freedesktop.org/patch/msgid/20190508082630.15116-8-tzimmermann@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/drm_gem_vram_helper.c | 14 ++++++++++++++
 include/drm/drm_gem_vram_helper.h     |  3 +++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index 15c6193e4985..c21b1f920e0a 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -3,6 +3,7 @@
 #include <drm/drm_gem_vram_helper.h>
 #include <drm/drm_mode.h>
 #include <drm/drm_prime.h>
+#include <drm/drm_vram_mm_helper.h>
 #include <drm/ttm/ttm_page_alloc.h>
 
 /**
@@ -524,6 +525,19 @@ int drm_gem_vram_bo_driver_verify_access(struct ttm_buffer_object *bo,
 }
 EXPORT_SYMBOL(drm_gem_vram_bo_driver_verify_access);
 
+/**
+ * drm_gem_vram_mm_funcs - Functions for &struct drm_vram_mm
+ *
+ * Most users of @struct drm_gem_vram_object will also use
+ * @struct drm_vram_mm. This instance of &struct drm_vram_mm_funcs
+ * can be used to connect both.
+ */
+const struct drm_vram_mm_funcs drm_gem_vram_mm_funcs = {
+	.evict_flags = drm_gem_vram_bo_driver_evict_flags,
+	.verify_access = drm_gem_vram_bo_driver_verify_access
+};
+EXPORT_SYMBOL(drm_gem_vram_mm_funcs);
+
 /*
  * Helpers for struct drm_driver
  */
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index 9a8bd98b623f..7971656afe87 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -9,6 +9,7 @@
 #include <linux/kernel.h> /* for container_of() */
 
 struct drm_mode_create_dumb;
+struct drm_vram_mm_funcs;
 struct filp;
 
 #define DRM_GEM_VRAM_PL_FLAG_VRAM	TTM_PL_FLAG_VRAM
@@ -107,6 +108,8 @@ void drm_gem_vram_bo_driver_evict_flags(struct ttm_buffer_object *bo,
 int drm_gem_vram_bo_driver_verify_access(struct ttm_buffer_object *bo,
 					 struct file *filp);
 
+extern const struct drm_vram_mm_funcs drm_gem_vram_mm_funcs;
+
 /*
  * Helpers for struct drm_driver
  */
-- 
cgit v1.2.3


From 59f5989ad42b6edd089b47895986ef15259822dc Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 8 May 2019 10:26:18 +0200
Subject: drm: Integrate VRAM MM into struct drm_device

There's now a pointer to struct drm_vram_mm stored in struct drm_device.
DRM drivers that use VRAM MM should use this field to refer to their
instance of the data structure. Appropriate helpers are now provided as
well.

Adding struct drm_vram_mm to struct drm_device further avoids wrappers
and boilerplate code in drivers. This patch implements default functions
for callbacks in struct drm_driver and struct file_operations that use
the struct drm_vram_mm stored in struct drm_device. Drivers that need to
provide their own implementations can still do so.

The patch also adds documentation for the VRAM helper library in general.

v5:
	* set .llseek to no_llseek() from DRM_VRAM_MM_FILE_OPERATIONS
v4:
	* cleanups from checkpatch.pl
	* document VRAM helper library

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: http://patchwork.freedesktop.org/patch/msgid/20190508082630.15116-9-tzimmermann@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 Documentation/gpu/drm-mm.rst             |  6 +++
 drivers/gpu/drm/drm_gem_vram_helper.c    | 28 ++++++++++
 drivers/gpu/drm/drm_vram_helper_common.c | 92 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/drm_vram_mm_helper.c     | 85 +++++++++++++++++++++++++++++
 include/drm/drm_device.h                 |  4 ++
 include/drm/drm_gem_vram_helper.h        | 19 ++++++-
 include/drm/drm_vram_mm_helper.h         | 33 ++++++++++++
 7 files changed, 266 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
index eba50afbda42..c8ebd4f66a6a 100644
--- a/Documentation/gpu/drm-mm.rst
+++ b/Documentation/gpu/drm-mm.rst
@@ -382,6 +382,12 @@ GEM CMA Helper Functions Reference
 VRAM Helper Function Reference
 ==============================
 
+.. kernel-doc:: drivers/gpu/drm/drm_vram_helper_common.c
+   :doc: overview
+
+.. kernel-doc:: include/drm/drm_gem_vram_helper.h
+   :internal:
+
 GEM VRAM Helper Functions Reference
 -----------------------------------
 
diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index c21b1f920e0a..8f142b810eb4 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <drm/drm_gem_vram_helper.h>
+#include <drm/drm_device.h>
 #include <drm/drm_mode.h>
 #include <drm/drm_prime.h>
 #include <drm/drm_vram_mm_helper.h>
@@ -555,6 +556,33 @@ void drm_gem_vram_driver_gem_free_object_unlocked(struct drm_gem_object *gem)
 }
 EXPORT_SYMBOL(drm_gem_vram_driver_gem_free_object_unlocked);
 
+/**
+ * drm_gem_vram_driver_create_dumb() - \
+	Implements &struct drm_driver.dumb_create
+ * @file:		the DRM file
+ * @dev:		the DRM device
+ * @args:		the arguments as provided to \
+				&struct drm_driver.dumb_create
+ *
+ * This function requires the driver to use @drm_device.vram_mm for its
+ * instance of VRAM MM.
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_gem_vram_driver_dumb_create(struct drm_file *file,
+				    struct drm_device *dev,
+				    struct drm_mode_create_dumb *args)
+{
+	if (WARN_ONCE(!dev->vram_mm, "VRAM MM not initialized"))
+		return -EINVAL;
+
+	return drm_gem_vram_fill_create_dumb(file, dev, &dev->vram_mm->bdev, 0,
+					     false, args);
+}
+EXPORT_SYMBOL(drm_gem_vram_driver_dumb_create);
+
 /**
  * drm_gem_vram_driver_dumb_mmap_offset() - \
 	Implements &struct drm_driver.dumb_mmap_offset
diff --git a/drivers/gpu/drm/drm_vram_helper_common.c b/drivers/gpu/drm/drm_vram_helper_common.c
index 7c25daca18d0..3b47f7002115 100644
--- a/drivers/gpu/drm/drm_vram_helper_common.c
+++ b/drivers/gpu/drm/drm_vram_helper_common.c
@@ -2,5 +2,97 @@
 
 #include <linux/module.h>
 
+/**
+ * DOC: overview
+ *
+ * This library provides &struct drm_gem_vram_object (GEM VRAM), a GEM
+ * buffer object that is backed by video RAM. It can be used for
+ * framebuffer devices with dedicated memory. The video RAM can be
+ * managed with &struct drm_vram_mm (VRAM MM). Both data structures are
+ * supposed to be used together, but can also be used individually.
+ *
+ * With the GEM interface userspace applications create, manage and destroy
+ * graphics buffers, such as an on-screen framebuffer. GEM does not provide
+ * an implementation of these interfaces. It's up to the DRM driver to
+ * provide an implementation that suits the hardware. If the hardware device
+ * contains dedicated video memory, the DRM driver can use the VRAM helper
+ * library. Each active buffer object is stored in video RAM. Active
+ * buffer are used for drawing the current frame, typically something like
+ * the frame's scanout buffer or the cursor image. If there's no more space
+ * left in VRAM, inactive GEM objects can be moved to system memory.
+ *
+ * The easiest way to use the VRAM helper library is to call
+ * drm_vram_helper_alloc_mm(). The function allocates and initializes an
+ * instance of &struct drm_vram_mm in &struct drm_device.vram_mm . Use
+ * &DRM_GEM_VRAM_DRIVER to initialize &struct drm_driver and
+ * &DRM_VRAM_MM_FILE_OPERATIONS to initialize &struct file_operations;
+ * as illustrated below.
+ *
+ * .. code-block:: c
+ *
+ *	struct file_operations fops ={
+ *		.owner = THIS_MODULE,
+ *		DRM_VRAM_MM_FILE_OPERATION
+ *	};
+ *	struct drm_driver drv = {
+ *		.driver_feature = DRM_ ... ,
+ *		.fops = &fops,
+ *		DRM_GEM_VRAM_DRIVER
+ *	};
+ *
+ *	int init_drm_driver()
+ *	{
+ *		struct drm_device *dev;
+ *		uint64_t vram_base;
+ *		unsigned long vram_size;
+ *		int ret;
+ *
+ *		// setup device, vram base and size
+ *		// ...
+ *
+ *		ret = drm_vram_helper_alloc_mm(dev, vram_base, vram_size,
+ *					       &drm_gem_vram_mm_funcs);
+ *		if (ret)
+ *			return ret;
+ *		return 0;
+ *	}
+ *
+ * This creates an instance of &struct drm_vram_mm, exports DRM userspace
+ * interfaces for GEM buffer management and initializes file operations to
+ * allow for accessing created GEM buffers. With this setup, the DRM driver
+ * manages an area of video RAM with VRAM MM and provides GEM VRAM objects
+ * to userspace.
+ *
+ * To clean up the VRAM memory management, call drm_vram_helper_release_mm()
+ * in the driver's clean-up code.
+ *
+ * .. code-block:: c
+ *
+ *	void fini_drm_driver()
+ *	{
+ *		struct drm_device *dev = ...;
+ *
+ *		drm_vram_helper_release_mm(dev);
+ *	}
+ *
+ * For drawing or scanout operations, buffer object have to be pinned in video
+ * RAM. Call drm_gem_vram_pin() with &DRM_GEM_VRAM_PL_FLAG_VRAM or
+ * &DRM_GEM_VRAM_PL_FLAG_SYSTEM to pin a buffer object in video RAM or system
+ * memory. Call drm_gem_vram_unpin() to release the pinned object afterwards.
+ * If you have to evict a buffer object from video RAM (e.g., for freeing up
+ * memory), unpin the buffer and call drm_gem_vram_push_to_system().
+ *
+ * A buffer object that is pinned in video RAM has a fixed address within that
+ * memory region. Call drm_gem_vram_offset() to retrieve this value. Typically
+ * it's used to program the hardware's scanout engine for framebuffers, set
+ * the cursor overlay's image for a mouse cursor, or use it as input to the
+ * hardware's draing engine.
+ *
+ * To access a buffer object's memory from the DRM driver, call
+ * drm_gem_vram_kmap(). It (optionally) maps the buffer into kernel address
+ * space and returns the memory address. Use drm_gem_vram_kunmap() to
+ * release the mapping.
+ */
+
 MODULE_DESCRIPTION("DRM VRAM memory-management helpers");
 MODULE_LICENSE("GPL");
diff --git a/drivers/gpu/drm/drm_vram_mm_helper.c b/drivers/gpu/drm/drm_vram_mm_helper.c
index d17c5169b018..c94a6dc5ade7 100644
--- a/drivers/gpu/drm/drm_vram_mm_helper.c
+++ b/drivers/gpu/drm/drm_vram_mm_helper.c
@@ -208,3 +208,88 @@ int drm_vram_mm_mmap(struct file *filp, struct vm_area_struct *vma,
 	return ttm_bo_mmap(filp, vma, &vmm->bdev);
 }
 EXPORT_SYMBOL(drm_vram_mm_mmap);
+
+/*
+ * Helpers for integration with struct drm_device
+ */
+
+/**
+ * drm_vram_helper_alloc_mm - Allocates a device's instance of \
+	&struct drm_vram_mm
+ * @dev:	the DRM device
+ * @vram_base:	the base address of the video memory
+ * @vram_size:	the size of the video memory in bytes
+ * @funcs:	callback functions for buffer objects
+ *
+ * Returns:
+ * The new instance of &struct drm_vram_mm on success, or
+ * an ERR_PTR()-encoded errno code otherwise.
+ */
+struct drm_vram_mm *drm_vram_helper_alloc_mm(
+	struct drm_device *dev, uint64_t vram_base, size_t vram_size,
+	const struct drm_vram_mm_funcs *funcs)
+{
+	int ret;
+
+	if (WARN_ON(dev->vram_mm))
+		return dev->vram_mm;
+
+	dev->vram_mm = kzalloc(sizeof(*dev->vram_mm), GFP_KERNEL);
+	if (!dev->vram_mm)
+		return ERR_PTR(-ENOMEM);
+
+	ret = drm_vram_mm_init(dev->vram_mm, dev, vram_base, vram_size, funcs);
+	if (ret)
+		goto err_kfree;
+
+	return dev->vram_mm;
+
+err_kfree:
+	kfree(dev->vram_mm);
+	dev->vram_mm = NULL;
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(drm_vram_helper_alloc_mm);
+
+/**
+ * drm_vram_helper_release_mm - Releases a device's instance of \
+	&struct drm_vram_mm
+ * @dev:	the DRM device
+ */
+void drm_vram_helper_release_mm(struct drm_device *dev)
+{
+	if (!dev->vram_mm)
+		return;
+
+	drm_vram_mm_cleanup(dev->vram_mm);
+	kfree(dev->vram_mm);
+	dev->vram_mm = NULL;
+}
+EXPORT_SYMBOL(drm_vram_helper_release_mm);
+
+/*
+ * Helpers for &struct file_operations
+ */
+
+/**
+ * drm_vram_mm_file_operations_mmap() - \
+	Implements &struct file_operations.mmap()
+ * @filp:	the mapping's file structure
+ * @vma:	the mapping's memory area
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_vram_mm_file_operations_mmap(
+	struct file *filp, struct vm_area_struct *vma)
+{
+	struct drm_file *file_priv = filp->private_data;
+	struct drm_device *dev = file_priv->minor->dev;
+
+	if (WARN_ONCE(!dev->vram_mm, "VRAM MM not initialized"))
+		return -EINVAL;
+
+	return drm_vram_mm_mmap(filp, vma, dev->vram_mm);
+}
+EXPORT_SYMBOL(drm_vram_mm_file_operations_mmap);
diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
index 7f9ef709b2b6..1acfc3bbd3fb 100644
--- a/include/drm/drm_device.h
+++ b/include/drm/drm_device.h
@@ -17,6 +17,7 @@ struct drm_vblank_crtc;
 struct drm_sg_mem;
 struct drm_local_map;
 struct drm_vma_offset_manager;
+struct drm_vram_mm;
 struct drm_fb_helper;
 
 struct inode;
@@ -286,6 +287,9 @@ struct drm_device {
 	/** @vma_offset_manager: GEM information */
 	struct drm_vma_offset_manager *vma_offset_manager;
 
+	/** @vram_mm: VRAM MM memory manager */
+	struct drm_vram_mm *vram_mm;
+
 	/**
 	 * @switch_power_state:
 	 *
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index 7971656afe87..b056f189ba62 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -11,6 +11,7 @@
 struct drm_mode_create_dumb;
 struct drm_vram_mm_funcs;
 struct filp;
+struct vm_area_struct;
 
 #define DRM_GEM_VRAM_PL_FLAG_VRAM	TTM_PL_FLAG_VRAM
 #define DRM_GEM_VRAM_PL_FLAG_SYSTEM	TTM_PL_FLAG_SYSTEM
@@ -115,10 +116,26 @@ extern const struct drm_vram_mm_funcs drm_gem_vram_mm_funcs;
  */
 
 void drm_gem_vram_driver_gem_free_object_unlocked(struct drm_gem_object *gem);
-
+int drm_gem_vram_driver_dumb_create(struct drm_file *file,
+				    struct drm_device *dev,
+				    struct drm_mode_create_dumb *args);
 int drm_gem_vram_driver_dumb_mmap_offset(struct drm_file *file,
 					 struct drm_device *dev,
 					 uint32_t handle, uint64_t *offset);
+
+/**
+ * define DRM_GEM_VRAM_DRIVER - default callback functions for \
+	&struct drm_driver
+ *
+ * Drivers that use VRAM MM and GEM VRAM can use this macro to initialize
+ * &struct drm_driver with default functions.
+ */
+#define DRM_GEM_VRAM_DRIVER \
+	.gem_free_object_unlocked = \
+		drm_gem_vram_driver_gem_free_object_unlocked, \
+	.dumb_create		  = drm_gem_vram_driver_dumb_create, \
+	.dumb_map_offset	  = drm_gem_vram_driver_dumb_mmap_offset
+
 /*
  * PRIME helpers for struct drm_driver
  */
diff --git a/include/drm/drm_vram_mm_helper.h b/include/drm/drm_vram_mm_helper.h
index 5d45c6447fa4..a8ffd8599b08 100644
--- a/include/drm/drm_vram_mm_helper.h
+++ b/include/drm/drm_vram_mm_helper.h
@@ -66,4 +66,37 @@ void drm_vram_mm_cleanup(struct drm_vram_mm *vmm);
 int drm_vram_mm_mmap(struct file *filp, struct vm_area_struct *vma,
 		     struct drm_vram_mm *vmm);
 
+/*
+ * Helpers for integration with struct drm_device
+ */
+
+struct drm_vram_mm *drm_vram_helper_alloc_mm(
+	struct drm_device *dev, uint64_t vram_base, size_t vram_size,
+	const struct drm_vram_mm_funcs *funcs);
+void drm_vram_helper_release_mm(struct drm_device *dev);
+
+/*
+ * Helpers for &struct file_operations
+ */
+
+int drm_vram_mm_file_operations_mmap(
+	struct file *filp, struct vm_area_struct *vma);
+
+/**
+ * define DRM_VRAM_MM_FILE_OPERATIONS - default callback functions for \
+	&struct file_operations
+ *
+ * Drivers that use VRAM MM can use this macro to initialize
+ * &struct file_operations with default functions.
+ */
+#define DRM_VRAM_MM_FILE_OPERATIONS \
+	.llseek		= no_llseek, \
+	.read		= drm_read, \
+	.poll		= drm_poll, \
+	.unlocked_ioctl = drm_ioctl, \
+	.compat_ioctl	= drm_compat_ioctl, \
+	.mmap		= drm_vram_mm_file_operations_mmap, \
+	.open		= drm_open, \
+	.release	= drm_release \
+
 #endif
-- 
cgit v1.2.3


From 75b3f1cb50bdbdc7fd557ca3ed63b3eb87c2bab3 Mon Sep 17 00:00:00 2001
From: James Clarke <jrtc27@jrtc27.com>
Date: Tue, 15 Jan 2019 15:04:18 +0000
Subject: drm: Fix drm.h uapi header for GNU/kFreeBSD

Like GNU/Linux, GNU/kFreeBSD's sys/types.h does not define the uintX_t
types, which differs from the BSDs' headers. Thus we should include
stdint.h to ensure we have all the required integer types.

Signed-off-by: James Clarke <jrtc27@jrtc27.com>
Signed-off-by: Eric Anholt <eric@anholt.net>
Link: https://patchwork.freedesktop.org/patch/msgid/20190115150418.68080-1-jrtc27@jrtc27.com
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 include/uapi/drm/drm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
index 661d73f9a919..8a5b2f8f8eb9 100644
--- a/include/uapi/drm/drm.h
+++ b/include/uapi/drm/drm.h
@@ -50,6 +50,7 @@ typedef unsigned int drm_handle_t;
 
 #else /* One of the BSDs */
 
+#include <stdint.h>
 #include <sys/ioccom.h>
 #include <sys/types.h>
 typedef int8_t   __s8;
-- 
cgit v1.2.3


From 82ff2fb5d184e95c7877c58359cef4f5d43df9c1 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Thu, 16 May 2019 18:27:45 +0200
Subject: drm: Add drm_gem_vram_{pin/unpin}_reserved() and convert mgag200

The new interfaces drm_gem_vram_{pin/unpin}_reserved() are variants of the
GEM VRAM pin/unpin functions that do not reserve the BO during validation.
The mgag200 driver requires this behavior for its cursor handling. The
patch also converts the driver to use the new interfaces.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: http://patchwork.freedesktop.org/patch/msgid/20190516162746.11636-2-tzimmermann@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/drm_gem_vram_helper.c    | 75 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/mgag200/mgag200_cursor.c | 18 ++++----
 include/drm/drm_gem_vram_helper.h        |  3 ++
 3 files changed, 88 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index 8f142b810eb4..a002c03eaf4c 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -254,6 +254,47 @@ int drm_gem_vram_pin(struct drm_gem_vram_object *gbo, unsigned long pl_flag)
 }
 EXPORT_SYMBOL(drm_gem_vram_pin);
 
+/**
+ * drm_gem_vram_pin_reserved() - Pins a GEM VRAM object in a region.
+ * @gbo:	the GEM VRAM object
+ * @pl_flag:	a bitmask of possible memory regions
+ *
+ * Pinning a buffer object ensures that it is not evicted from
+ * a memory region. A pinned buffer object has to be unpinned before
+ * it can be pinned to another region.
+ *
+ * This function pins a GEM VRAM object that has already been
+ * reserved. Use drm_gem_vram_pin() if possible.
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_gem_vram_pin_reserved(struct drm_gem_vram_object *gbo,
+			      unsigned long pl_flag)
+{
+	int i, ret;
+	struct ttm_operation_ctx ctx = { false, false };
+
+	if (gbo->pin_count) {
+		++gbo->pin_count;
+		return 0;
+	}
+
+	drm_gem_vram_placement(gbo, pl_flag);
+	for (i = 0; i < gbo->placement.num_placement; ++i)
+		gbo->placements[i].flags |= TTM_PL_FLAG_NO_EVICT;
+
+	ret = ttm_bo_validate(&gbo->bo, &gbo->placement, &ctx);
+	if (ret < 0)
+		return ret;
+
+	gbo->pin_count = 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_gem_vram_pin_reserved);
+
 /**
  * drm_gem_vram_unpin() - Unpins a GEM VRAM object
  * @gbo:	the GEM VRAM object
@@ -285,6 +326,40 @@ int drm_gem_vram_unpin(struct drm_gem_vram_object *gbo)
 }
 EXPORT_SYMBOL(drm_gem_vram_unpin);
 
+/**
+ * drm_gem_vram_unpin_reserved() - Unpins a GEM VRAM object
+ * @gbo:	the GEM VRAM object
+ *
+ * This function unpins a GEM VRAM object that has already been
+ * reserved. Use drm_gem_vram_unpin() if possible.
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_gem_vram_unpin_reserved(struct drm_gem_vram_object *gbo)
+{
+	int i, ret;
+	struct ttm_operation_ctx ctx = { false, false };
+
+	if (WARN_ON_ONCE(!gbo->pin_count))
+		return 0;
+
+	--gbo->pin_count;
+	if (gbo->pin_count)
+		return 0;
+
+	for (i = 0; i < gbo->placement.num_placement ; ++i)
+		gbo->placements[i].flags &= ~TTM_PL_FLAG_NO_EVICT;
+
+	ret = ttm_bo_validate(&gbo->bo, &gbo->placement, &ctx);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_gem_vram_unpin_reserved);
+
 /**
  * drm_gem_vram_push_to_system() - \
 	Unpins a GEM VRAM object and moves it to system memory
diff --git a/drivers/gpu/drm/mgag200/mgag200_cursor.c b/drivers/gpu/drm/mgag200/mgag200_cursor.c
index 6c1a9d724d85..1c4fc85315a0 100644
--- a/drivers/gpu/drm/mgag200/mgag200_cursor.c
+++ b/drivers/gpu/drm/mgag200/mgag200_cursor.c
@@ -23,9 +23,9 @@ static void mga_hide_cursor(struct mga_device *mdev)
 	WREG8(MGA_CURPOSXL, 0);
 	WREG8(MGA_CURPOSXH, 0);
 	if (mdev->cursor.pixels_1->pin_count)
-		drm_gem_vram_unpin(mdev->cursor.pixels_1);
+		drm_gem_vram_unpin_reserved(mdev->cursor.pixels_1);
 	if (mdev->cursor.pixels_2->pin_count)
-		drm_gem_vram_unpin(mdev->cursor.pixels_2);
+		drm_gem_vram_unpin_reserved(mdev->cursor.pixels_2);
 }
 
 int mga_crtc_cursor_set(struct drm_crtc *crtc,
@@ -96,26 +96,28 @@ int mga_crtc_cursor_set(struct drm_crtc *crtc,
 
 	/* Move cursor buffers into VRAM if they aren't already */
 	if (!pixels_1->pin_count) {
-		ret = drm_gem_vram_pin(pixels_1, DRM_GEM_VRAM_PL_FLAG_VRAM);
+		ret = drm_gem_vram_pin_reserved(pixels_1,
+						DRM_GEM_VRAM_PL_FLAG_VRAM);
 		if (ret)
 			goto out1;
 		gpu_addr = drm_gem_vram_offset(pixels_1);
 		if (gpu_addr < 0) {
-			drm_gem_vram_unpin(pixels_1);
+			drm_gem_vram_unpin_reserved(pixels_1);
 			goto out1;
 		}
 		mdev->cursor.pixels_1_gpu_addr = gpu_addr;
 	}
 	if (!pixels_2->pin_count) {
-		ret = drm_gem_vram_pin(pixels_2, DRM_GEM_VRAM_PL_FLAG_VRAM);
+		ret = drm_gem_vram_pin_reserved(pixels_2,
+						DRM_GEM_VRAM_PL_FLAG_VRAM);
 		if (ret) {
-			drm_gem_vram_unpin(pixels_1);
+			drm_gem_vram_unpin_reserved(pixels_1);
 			goto out1;
 		}
 		gpu_addr = drm_gem_vram_offset(pixels_2);
 		if (gpu_addr < 0) {
-			drm_gem_vram_unpin(pixels_1);
-			drm_gem_vram_unpin(pixels_2);
+			drm_gem_vram_unpin_reserved(pixels_1);
+			drm_gem_vram_unpin_reserved(pixels_2);
 			goto out1;
 		}
 		mdev->cursor.pixels_2_gpu_addr = gpu_addr;
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index b056f189ba62..ff1a81761543 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -82,7 +82,10 @@ void drm_gem_vram_unreserve(struct drm_gem_vram_object *gbo);
 u64 drm_gem_vram_mmap_offset(struct drm_gem_vram_object *gbo);
 s64 drm_gem_vram_offset(struct drm_gem_vram_object *gbo);
 int drm_gem_vram_pin(struct drm_gem_vram_object *gbo, unsigned long pl_flag);
+int drm_gem_vram_pin_reserved(struct drm_gem_vram_object *gbo,
+			      unsigned long pl_flag);
 int drm_gem_vram_unpin(struct drm_gem_vram_object *gbo);
+int drm_gem_vram_unpin_reserved(struct drm_gem_vram_object *gbo);
 int drm_gem_vram_push_to_system(struct drm_gem_vram_object *gbo);
 void *drm_gem_vram_kmap_at(struct drm_gem_vram_object *gbo, bool map,
 			   bool *is_iomem, struct ttm_bo_kmap_obj *kmap);
-- 
cgit v1.2.3


From 2c61a5459932f68af51306302ab128a623f37e96 Mon Sep 17 00:00:00 2001
From: Anson Huang <anson.huang@nxp.com>
Date: Sun, 12 May 2019 10:17:08 +0000
Subject: dt-bindings: clock: imx8mm: Add GPIO clocks

Add macro for the GPIO clocks of the i.MX8MM.

Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
Reviewed-by: Dong Aisheng <aisheng.dong@nxp.com>
Signed-off-by: Shawn Guo <shawnguo@kernel.org>
---
 include/dt-bindings/clock/imx8mm-clock.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/imx8mm-clock.h b/include/dt-bindings/clock/imx8mm-clock.h
index 1b4353e7b486..fe47798f95df 100644
--- a/include/dt-bindings/clock/imx8mm-clock.h
+++ b/include/dt-bindings/clock/imx8mm-clock.h
@@ -239,6 +239,12 @@
 
 #define IMX8MM_CLK_NAND_USDHC_BUS_RAWNAND_CLK	222
 
-#define IMX8MM_CLK_END				223
+#define IMX8MM_CLK_GPIO1_ROOT			223
+#define IMX8MM_CLK_GPIO2_ROOT			224
+#define IMX8MM_CLK_GPIO3_ROOT			225
+#define IMX8MM_CLK_GPIO4_ROOT			226
+#define IMX8MM_CLK_GPIO5_ROOT			227
+
+#define IMX8MM_CLK_END				228
 
 #endif
-- 
cgit v1.2.3


From 05c452c115bffa12f78346723f0282a4264ed200 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Thu, 16 May 2019 12:31:47 +0200
Subject: drm: Remove users of drm_format_num_planes

drm_format_num_planes() is basically a lookup in the drm_format_info table
plus an access to the num_planes field of the appropriate entry.

Most drivers are using this function while having access to the entry
already, which means that we will perform an unnecessary lookup. Removing
the call to drm_format_num_planes is therefore more efficient.

Some drivers will not have access to that entry in the function, but in
this case the overhead is minimal (we just have to call drm_format_info()
to perform the lookup) and we can even avoid multiple, inefficient lookups
in some places that need multiple fields from the drm_format_info
structure.

Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/5ffcec9d14a50ed538e37d565f546802452ee672.1558002671.git-series.maxime.ripard@bootlin.com
---
 drivers/gpu/drm/arm/malidp_mw.c             |  2 +-
 drivers/gpu/drm/armada/armada_fb.c          |  3 ++-
 drivers/gpu/drm/drm_fourcc.c                | 16 ----------------
 drivers/gpu/drm/mediatek/mtk_drm_fb.c       |  6 ++++--
 drivers/gpu/drm/meson/meson_overlay.c       |  2 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_formats.c |  9 ++++++---
 drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c    |  3 ++-
 drivers/gpu/drm/msm/msm_fb.c                |  8 ++++++--
 drivers/gpu/drm/omapdrm/omap_fb.c           |  4 +++-
 drivers/gpu/drm/rockchip/rockchip_drm_fb.c  |  6 +++---
 drivers/gpu/drm/tegra/fb.c                  |  3 ++-
 drivers/gpu/drm/vc4/vc4_plane.c             |  2 +-
 drivers/gpu/drm/zte/zx_plane.c              |  4 +---
 include/drm/drm_fourcc.h                    |  1 -
 14 files changed, 32 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/arm/malidp_mw.c b/drivers/gpu/drm/arm/malidp_mw.c
index 5f102bdaf841..2e812525025d 100644
--- a/drivers/gpu/drm/arm/malidp_mw.c
+++ b/drivers/gpu/drm/arm/malidp_mw.c
@@ -158,7 +158,7 @@ malidp_mw_encoder_atomic_check(struct drm_encoder *encoder,
 		return -EINVAL;
 	}
 
-	n_planes = drm_format_num_planes(fb->format->format);
+	n_planes = fb->format->num_planes;
 	for (i = 0; i < n_planes; i++) {
 		struct drm_gem_cma_object *obj = drm_fb_cma_get_gem_obj(fb, i);
 		/* memory write buffers are never rotated */
diff --git a/drivers/gpu/drm/armada/armada_fb.c b/drivers/gpu/drm/armada/armada_fb.c
index 058ac7d9920f..a2f6472eb482 100644
--- a/drivers/gpu/drm/armada/armada_fb.c
+++ b/drivers/gpu/drm/armada/armada_fb.c
@@ -87,6 +87,7 @@ struct armada_framebuffer *armada_framebuffer_create(struct drm_device *dev,
 struct drm_framebuffer *armada_fb_create(struct drm_device *dev,
 	struct drm_file *dfile, const struct drm_mode_fb_cmd2 *mode)
 {
+	const struct drm_format_info *info = drm_get_format_info(dev, mode);
 	struct armada_gem_object *obj;
 	struct armada_framebuffer *dfb;
 	int ret;
@@ -97,7 +98,7 @@ struct drm_framebuffer *armada_fb_create(struct drm_device *dev,
 		mode->pitches[2]);
 
 	/* We can only handle a single plane at the moment */
-	if (drm_format_num_planes(mode->pixel_format) > 1 &&
+	if (info->num_planes > 1 &&
 	    (mode->handles[0] != mode->handles[1] ||
 	     mode->handles[0] != mode->handles[2])) {
 		ret = -EINVAL;
diff --git a/drivers/gpu/drm/drm_fourcc.c b/drivers/gpu/drm/drm_fourcc.c
index 6ea55fb4526d..873c0001d8c8 100644
--- a/drivers/gpu/drm/drm_fourcc.c
+++ b/drivers/gpu/drm/drm_fourcc.c
@@ -332,22 +332,6 @@ drm_get_format_info(struct drm_device *dev,
 }
 EXPORT_SYMBOL(drm_get_format_info);
 
-/**
- * drm_format_num_planes - get the number of planes for format
- * @format: pixel format (DRM_FORMAT_*)
- *
- * Returns:
- * The number of planes used by the specified pixel format.
- */
-int drm_format_num_planes(uint32_t format)
-{
-	const struct drm_format_info *info;
-
-	info = drm_format_info(format);
-	return info ? info->num_planes : 1;
-}
-EXPORT_SYMBOL(drm_format_num_planes);
-
 /**
  * drm_format_plane_cpp - determine the bytes per pixel value
  * @format: pixel format (DRM_FORMAT_*)
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_fb.c b/drivers/gpu/drm/mediatek/mtk_drm_fb.c
index e20fcaef2851..68fdef8b12bd 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_fb.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_fb.c
@@ -32,10 +32,11 @@ static struct drm_framebuffer *mtk_drm_framebuffer_init(struct drm_device *dev,
 					const struct drm_mode_fb_cmd2 *mode,
 					struct drm_gem_object *obj)
 {
+	const struct drm_format_info *info = drm_get_format_info(dev, mode);
 	struct drm_framebuffer *fb;
 	int ret;
 
-	if (drm_format_num_planes(mode->pixel_format) != 1)
+	if (info->num_planes != 1)
 		return ERR_PTR(-EINVAL);
 
 	fb = kzalloc(sizeof(*fb), GFP_KERNEL);
@@ -88,6 +89,7 @@ struct drm_framebuffer *mtk_drm_mode_fb_create(struct drm_device *dev,
 					       struct drm_file *file,
 					       const struct drm_mode_fb_cmd2 *cmd)
 {
+	const struct drm_format_info *info = drm_get_format_info(dev, cmd);
 	struct drm_framebuffer *fb;
 	struct drm_gem_object *gem;
 	unsigned int width = cmd->width;
@@ -95,7 +97,7 @@ struct drm_framebuffer *mtk_drm_mode_fb_create(struct drm_device *dev,
 	unsigned int size, bpp;
 	int ret;
 
-	if (drm_format_num_planes(cmd->pixel_format) != 1)
+	if (info->num_planes != 1)
 		return ERR_PTR(-EINVAL);
 
 	gem = drm_gem_object_lookup(file, cmd->handles[0]);
diff --git a/drivers/gpu/drm/meson/meson_overlay.c b/drivers/gpu/drm/meson/meson_overlay.c
index bdbf925ff3e8..fb8515b2860c 100644
--- a/drivers/gpu/drm/meson/meson_overlay.c
+++ b/drivers/gpu/drm/meson/meson_overlay.c
@@ -458,7 +458,7 @@ static void meson_overlay_atomic_update(struct drm_plane *plane,
 	}
 
 	/* Update Canvas with buffer address */
-	priv->viu.vd1_planes = drm_format_num_planes(fb->format->format);
+	priv->viu.vd1_planes = fb->format->num_planes;
 
 	switch (priv->viu.vd1_planes) {
 	case 3:
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_formats.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_formats.c
index f59fe1a9f4b9..c3d491e8d44b 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_formats.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_formats.c
@@ -1040,10 +1040,11 @@ int dpu_format_check_modified_format(
 		const struct drm_mode_fb_cmd2 *cmd,
 		struct drm_gem_object **bos)
 {
-	int ret, i, num_base_fmt_planes;
+	const struct drm_format_info *info;
 	const struct dpu_format *fmt;
 	struct dpu_hw_fmt_layout layout;
 	uint32_t bos_total_size = 0;
+	int ret, i;
 
 	if (!msm_fmt || !cmd || !bos) {
 		DRM_ERROR("invalid arguments\n");
@@ -1051,14 +1052,16 @@ int dpu_format_check_modified_format(
 	}
 
 	fmt = to_dpu_format(msm_fmt);
-	num_base_fmt_planes = drm_format_num_planes(fmt->base.pixel_format);
+	info = drm_format_info(fmt->base.pixel_format);
+	if (!info)
+		return -EINVAL;
 
 	ret = dpu_format_get_plane_sizes(fmt, cmd->width, cmd->height,
 			&layout, cmd->pitches);
 	if (ret)
 		return ret;
 
-	for (i = 0; i < num_base_fmt_planes; i++) {
+	for (i = 0; i < info->num_planes; i++) {
 		if (!bos[i]) {
 			DRM_ERROR("invalid handle for plane %d\n", i);
 			return -EINVAL;
diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
index 6153514db04c..72ab8d89efa4 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
@@ -127,13 +127,14 @@ uint32_t mdp5_smp_calculate(struct mdp5_smp *smp,
 		const struct mdp_format *format,
 		u32 width, bool hdecim)
 {
+	const struct drm_format_info *info = drm_format_info(format->base.pixel_format);
 	struct mdp5_kms *mdp5_kms = get_kms(smp);
 	int rev = mdp5_cfg_get_hw_rev(mdp5_kms->cfg);
 	int i, hsub, nplanes, nlines;
 	u32 fmt = format->base.pixel_format;
 	uint32_t blkcfg = 0;
 
-	nplanes = drm_format_num_planes(fmt);
+	nplanes = info->num_planes;
 	hsub = drm_format_horz_chroma_subsampling(fmt);
 
 	/* different if BWC (compressed framebuffer?) enabled: */
diff --git a/drivers/gpu/drm/msm/msm_fb.c b/drivers/gpu/drm/msm/msm_fb.c
index 136058978e0f..432beddafb9e 100644
--- a/drivers/gpu/drm/msm/msm_fb.c
+++ b/drivers/gpu/drm/msm/msm_fb.c
@@ -106,9 +106,11 @@ const struct msm_format *msm_framebuffer_format(struct drm_framebuffer *fb)
 struct drm_framebuffer *msm_framebuffer_create(struct drm_device *dev,
 		struct drm_file *file, const struct drm_mode_fb_cmd2 *mode_cmd)
 {
+	const struct drm_format_info *info = drm_get_format_info(dev,
+								 mode_cmd);
 	struct drm_gem_object *bos[4] = {0};
 	struct drm_framebuffer *fb;
-	int ret, i, n = drm_format_num_planes(mode_cmd->pixel_format);
+	int ret, i, n = info->num_planes;
 
 	for (i = 0; i < n; i++) {
 		bos[i] = drm_gem_object_lookup(file, mode_cmd->handles[i]);
@@ -135,6 +137,8 @@ out_unref:
 static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev,
 		const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object **bos)
 {
+	const struct drm_format_info *info = drm_get_format_info(dev,
+								 mode_cmd);
 	struct msm_drm_private *priv = dev->dev_private;
 	struct msm_kms *kms = priv->kms;
 	struct msm_framebuffer *msm_fb = NULL;
@@ -147,7 +151,7 @@ static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev,
 			dev, mode_cmd, mode_cmd->width, mode_cmd->height,
 			(char *)&mode_cmd->pixel_format);
 
-	n = drm_format_num_planes(mode_cmd->pixel_format);
+	n = info->num_planes;
 	hsub = drm_format_horz_chroma_subsampling(mode_cmd->pixel_format);
 	vsub = drm_format_vert_chroma_subsampling(mode_cmd->pixel_format);
 
diff --git a/drivers/gpu/drm/omapdrm/omap_fb.c b/drivers/gpu/drm/omapdrm/omap_fb.c
index 4f8eb9d08f99..cfb641363a32 100644
--- a/drivers/gpu/drm/omapdrm/omap_fb.c
+++ b/drivers/gpu/drm/omapdrm/omap_fb.c
@@ -298,7 +298,9 @@ void omap_framebuffer_describe(struct drm_framebuffer *fb, struct seq_file *m)
 struct drm_framebuffer *omap_framebuffer_create(struct drm_device *dev,
 		struct drm_file *file, const struct drm_mode_fb_cmd2 *mode_cmd)
 {
-	unsigned int num_planes = drm_format_num_planes(mode_cmd->pixel_format);
+	const struct drm_format_info *info = drm_get_format_info(dev,
+								 mode_cmd);
+	unsigned int num_planes = info->num_planes;
 	struct drm_gem_object *bos[4];
 	struct drm_framebuffer *fb;
 	int i;
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_fb.c b/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
index 97438bbbe389..606d176d5d96 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
@@ -74,19 +74,19 @@ static struct drm_framebuffer *
 rockchip_user_fb_create(struct drm_device *dev, struct drm_file *file_priv,
 			const struct drm_mode_fb_cmd2 *mode_cmd)
 {
+	const struct drm_format_info *info = drm_get_format_info(dev,
+								 mode_cmd);
 	struct drm_framebuffer *fb;
 	struct drm_gem_object *objs[ROCKCHIP_MAX_FB_BUFFER];
 	struct drm_gem_object *obj;
 	unsigned int hsub;
 	unsigned int vsub;
-	int num_planes;
+	int num_planes = min_t(int, info->num_planes, ROCKCHIP_MAX_FB_BUFFER);
 	int ret;
 	int i;
 
 	hsub = drm_format_horz_chroma_subsampling(mode_cmd->pixel_format);
 	vsub = drm_format_vert_chroma_subsampling(mode_cmd->pixel_format);
-	num_planes = min(drm_format_num_planes(mode_cmd->pixel_format),
-			 ROCKCHIP_MAX_FB_BUFFER);
 
 	for (i = 0; i < num_planes; i++) {
 		unsigned int width = mode_cmd->width / (i ? hsub : 1);
diff --git a/drivers/gpu/drm/tegra/fb.c b/drivers/gpu/drm/tegra/fb.c
index 1dd83a757dba..da0747e317b7 100644
--- a/drivers/gpu/drm/tegra/fb.c
+++ b/drivers/gpu/drm/tegra/fb.c
@@ -131,6 +131,7 @@ struct drm_framebuffer *tegra_fb_create(struct drm_device *drm,
 					struct drm_file *file,
 					const struct drm_mode_fb_cmd2 *cmd)
 {
+	const struct drm_format_info *info = drm_get_format_info(drm, cmd);
 	unsigned int hsub, vsub, i;
 	struct tegra_bo *planes[4];
 	struct drm_gem_object *gem;
@@ -140,7 +141,7 @@ struct drm_framebuffer *tegra_fb_create(struct drm_device *drm,
 	hsub = drm_format_horz_chroma_subsampling(cmd->pixel_format);
 	vsub = drm_format_vert_chroma_subsampling(cmd->pixel_format);
 
-	for (i = 0; i < drm_format_num_planes(cmd->pixel_format); i++) {
+	for (i = 0; i < info->num_planes; i++) {
 		unsigned int width = cmd->width / (i ? hsub : 1);
 		unsigned int height = cmd->height / (i ? vsub : 1);
 		unsigned int size, bpp;
diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index 4d918d3e4858..e3c0a350cb77 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -592,7 +592,7 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 	u32 ctl0_offset = vc4_state->dlist_count;
 	const struct hvs_format *format = vc4_get_hvs_format(fb->format->format);
 	u64 base_format_mod = fourcc_mod_broadcom_mod(fb->modifier);
-	int num_planes = drm_format_num_planes(format->drm);
+	int num_planes = fb->format->num_planes;
 	u32 h_subsample, v_subsample;
 	bool mix_plane_alpha;
 	bool covers_screen;
diff --git a/drivers/gpu/drm/zte/zx_plane.c b/drivers/gpu/drm/zte/zx_plane.c
index 83d236fd893c..c6a8be444300 100644
--- a/drivers/gpu/drm/zte/zx_plane.c
+++ b/drivers/gpu/drm/zte/zx_plane.c
@@ -199,7 +199,6 @@ static void zx_vl_plane_atomic_update(struct drm_plane *plane,
 	u32 dst_x, dst_y, dst_w, dst_h;
 	uint32_t format;
 	int fmt;
-	int num_planes;
 	int i;
 
 	if (!fb)
@@ -218,9 +217,8 @@ static void zx_vl_plane_atomic_update(struct drm_plane *plane,
 	dst_h = drm_rect_height(dst);
 
 	/* Set up data address registers for Y, Cb and Cr planes */
-	num_planes = drm_format_num_planes(format);
 	paddr_reg = layer + VL_Y;
-	for (i = 0; i < num_planes; i++) {
+	for (i = 0; i < fb->format->num_planes; i++) {
 		cma_obj = drm_fb_cma_get_gem_obj(fb, i);
 		paddr = cma_obj->paddr + fb->offsets[i];
 		paddr += src_y * fb->pitches[i];
diff --git a/include/drm/drm_fourcc.h b/include/drm/drm_fourcc.h
index b3d9d88ab290..41779b327d91 100644
--- a/include/drm/drm_fourcc.h
+++ b/include/drm/drm_fourcc.h
@@ -268,7 +268,6 @@ drm_get_format_info(struct drm_device *dev,
 uint32_t drm_mode_legacy_fb_format(uint32_t bpp, uint32_t depth);
 uint32_t drm_driver_legacy_fb_format(struct drm_device *dev,
 				     uint32_t bpp, uint32_t depth);
-int drm_format_num_planes(uint32_t format);
 int drm_format_plane_cpp(uint32_t format, int plane);
 int drm_format_horz_chroma_subsampling(uint32_t format);
 int drm_format_vert_chroma_subsampling(uint32_t format);
-- 
cgit v1.2.3


From f3e9632cb6241a6c098427510ad3ee041365ae64 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Thu, 16 May 2019 12:31:48 +0200
Subject: drm: Remove users of drm_format_(horz|vert)_chroma_subsampling

drm_format_horz_chroma_subsampling and drm_format_vert_chroma_subsampling
are basically a lookup in the drm_format_info table plus an access to the
hsub and vsub fields of the appropriate entry.

Most drivers are using this function while having access to the entry
already, which means that we will perform an unnecessary lookup. Removing
the call to these functions is therefore more efficient.

Some drivers will not have access to that entry in the function, but in
this case the overhead is minimal (we just have to call drm_format_info()
to perform the lookup) and we can even avoid multiple, inefficient lookups
in some places that need multiple fields from the drm_format_info
structure.

This is amplified by the fact that most of the time the callers will have
to retrieve both the vsub and hsub fields, meaning that they would perform
twice the lookup.

Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/6b3cceb8161e2c1d40c2681de99202328b0a8abc.1558002671.git-series.maxime.ripard@bootlin.com
---
 drivers/gpu/drm/arm/malidp_planes.c             |  6 ++---
 drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c |  9 ++-----
 drivers/gpu/drm/drm_fourcc.c                    | 34 -------------------------
 drivers/gpu/drm/imx/ipuv3-plane.c               | 15 +++++------
 drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c       |  9 ++-----
 drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c      | 24 ++++++++---------
 drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c        |  2 +-
 drivers/gpu/drm/msm/msm_fb.c                    |  8 ++----
 drivers/gpu/drm/rockchip/rockchip_drm_fb.c      |  9 ++-----
 drivers/gpu/drm/rockchip/rockchip_drm_vop.c     | 10 +++-----
 drivers/gpu/drm/tegra/fb.c                      |  9 +++----
 drivers/gpu/drm/vc4/vc4_plane.c                 | 13 +++-------
 include/drm/drm_fourcc.h                        |  2 --
 13 files changed, 38 insertions(+), 112 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/arm/malidp_planes.c b/drivers/gpu/drm/arm/malidp_planes.c
index d42e0ea9a303..8f89813d08c1 100644
--- a/drivers/gpu/drm/arm/malidp_planes.c
+++ b/drivers/gpu/drm/arm/malidp_planes.c
@@ -233,8 +233,7 @@ bool malidp_format_mod_supported(struct drm_device *drm,
 			}
 		}
 
-		if ((drm_format_horz_chroma_subsampling(format) != 1) ||
-		    (drm_format_vert_chroma_subsampling(format) != 1)) {
+		if ((info->hsub != 1) || (info->vsub != 1)) {
 			if (!(format == DRM_FORMAT_YUV420_10BIT &&
 			      (map->features & MALIDP_DEVICE_AFBC_YUV_420_10_SUPPORT_SPLIT))) {
 				DRM_DEBUG_KMS("Formats which are sub-sampled should never be split\n");
@@ -244,8 +243,7 @@ bool malidp_format_mod_supported(struct drm_device *drm,
 	}
 
 	if (modifier & AFBC_CBR) {
-		if ((drm_format_horz_chroma_subsampling(format) == 1) ||
-		    (drm_format_vert_chroma_subsampling(format) == 1)) {
+		if ((info->hsub == 1) || (info->vsub == 1)) {
 			DRM_DEBUG_KMS("Formats which are not sub-sampled should not have CBR set\n");
 			return false;
 		}
diff --git a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c
index e836e2de35ce..fdd607ad27fe 100644
--- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c
+++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c
@@ -603,8 +603,6 @@ static int atmel_hlcdc_plane_atomic_check(struct drm_plane *p,
 	const struct drm_display_mode *mode;
 	struct drm_crtc_state *crtc_state;
 	unsigned int tmp;
-	int hsub = 1;
-	int vsub = 1;
 	int ret;
 	int i;
 
@@ -642,13 +640,10 @@ static int atmel_hlcdc_plane_atomic_check(struct drm_plane *p,
 	if (state->nplanes > ATMEL_HLCDC_LAYER_MAX_PLANES)
 		return -EINVAL;
 
-	hsub = drm_format_horz_chroma_subsampling(fb->format->format);
-	vsub = drm_format_vert_chroma_subsampling(fb->format->format);
-
 	for (i = 0; i < state->nplanes; i++) {
 		unsigned int offset = 0;
-		int xdiv = i ? hsub : 1;
-		int ydiv = i ? vsub : 1;
+		int xdiv = i ? fb->format->hsub : 1;
+		int ydiv = i ? fb->format->vsub : 1;
 
 		state->bpp[i] = fb->format->cpp[i];
 		if (!state->bpp[i])
diff --git a/drivers/gpu/drm/drm_fourcc.c b/drivers/gpu/drm/drm_fourcc.c
index 873c0001d8c8..e4a2c8372c8b 100644
--- a/drivers/gpu/drm/drm_fourcc.c
+++ b/drivers/gpu/drm/drm_fourcc.c
@@ -352,40 +352,6 @@ int drm_format_plane_cpp(uint32_t format, int plane)
 }
 EXPORT_SYMBOL(drm_format_plane_cpp);
 
-/**
- * drm_format_horz_chroma_subsampling - get the horizontal chroma subsampling factor
- * @format: pixel format (DRM_FORMAT_*)
- *
- * Returns:
- * The horizontal chroma subsampling factor for the
- * specified pixel format.
- */
-int drm_format_horz_chroma_subsampling(uint32_t format)
-{
-	const struct drm_format_info *info;
-
-	info = drm_format_info(format);
-	return info ? info->hsub : 1;
-}
-EXPORT_SYMBOL(drm_format_horz_chroma_subsampling);
-
-/**
- * drm_format_vert_chroma_subsampling - get the vertical chroma subsampling factor
- * @format: pixel format (DRM_FORMAT_*)
- *
- * Returns:
- * The vertical chroma subsampling factor for the
- * specified pixel format.
- */
-int drm_format_vert_chroma_subsampling(uint32_t format)
-{
-	const struct drm_format_info *info;
-
-	info = drm_format_info(format);
-	return info ? info->vsub : 1;
-}
-EXPORT_SYMBOL(drm_format_vert_chroma_subsampling);
-
 /**
  * drm_format_plane_width - width of the plane given the first plane
  * @width: width of the first plane
diff --git a/drivers/gpu/drm/imx/ipuv3-plane.c b/drivers/gpu/drm/imx/ipuv3-plane.c
index d7a727a6e3d7..8f101a0763e3 100644
--- a/drivers/gpu/drm/imx/ipuv3-plane.c
+++ b/drivers/gpu/drm/imx/ipuv3-plane.c
@@ -115,8 +115,8 @@ drm_plane_state_to_ubo(struct drm_plane_state *state)
 	cma_obj = drm_fb_cma_get_gem_obj(fb, 1);
 	BUG_ON(!cma_obj);
 
-	x /= drm_format_horz_chroma_subsampling(fb->format->format);
-	y /= drm_format_vert_chroma_subsampling(fb->format->format);
+	x /= fb->format->hsub;
+	y /= fb->format->vsub;
 
 	return cma_obj->paddr + fb->offsets[1] + fb->pitches[1] * y +
 	       fb->format->cpp[1] * x - eba;
@@ -134,8 +134,8 @@ drm_plane_state_to_vbo(struct drm_plane_state *state)
 	cma_obj = drm_fb_cma_get_gem_obj(fb, 2);
 	BUG_ON(!cma_obj);
 
-	x /= drm_format_horz_chroma_subsampling(fb->format->format);
-	y /= drm_format_vert_chroma_subsampling(fb->format->format);
+	x /= fb->format->hsub;
+	y /= fb->format->vsub;
 
 	return cma_obj->paddr + fb->offsets[2] + fb->pitches[2] * y +
 	       fb->format->cpp[2] * x - eba;
@@ -352,7 +352,6 @@ static int ipu_plane_atomic_check(struct drm_plane *plane,
 	struct drm_framebuffer *old_fb = old_state->fb;
 	unsigned long eba, ubo, vbo, old_ubo, old_vbo, alpha_eba;
 	bool can_position = (plane->type == DRM_PLANE_TYPE_OVERLAY);
-	int hsub, vsub;
 	int ret;
 
 	/* Ok to disable */
@@ -471,10 +470,8 @@ static int ipu_plane_atomic_check(struct drm_plane *plane,
 		 * The x/y offsets must be even in case of horizontal/vertical
 		 * chroma subsampling.
 		 */
-		hsub = drm_format_horz_chroma_subsampling(fb->format->format);
-		vsub = drm_format_vert_chroma_subsampling(fb->format->format);
-		if (((state->src.x1 >> 16) & (hsub - 1)) ||
-		    ((state->src.y1 >> 16) & (vsub - 1)))
+		if (((state->src.x1 >> 16) & (fb->format->hsub - 1)) ||
+		    ((state->src.y1 >> 16) & (fb->format->vsub - 1)))
 			return -EINVAL;
 		break;
 	case DRM_FORMAT_RGB565_A8:
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c
index da1f727d7495..7994de952353 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c
@@ -557,14 +557,9 @@ static void _dpu_plane_setup_scaler(struct dpu_plane *pdpu,
 		struct dpu_plane_state *pstate,
 		const struct dpu_format *fmt, bool color_fill)
 {
-	uint32_t chroma_subsmpl_h, chroma_subsmpl_v;
+	const struct drm_format_info *info = drm_format_info(fmt->base.pixel_format);
 
 	/* don't chroma subsample if decimating */
-	chroma_subsmpl_h =
-		drm_format_horz_chroma_subsampling(fmt->base.pixel_format);
-	chroma_subsmpl_v =
-		drm_format_vert_chroma_subsampling(fmt->base.pixel_format);
-
 	/* update scaler. calculate default config for QSEED3 */
 	_dpu_plane_setup_scaler3(pdpu, pstate,
 			drm_rect_width(&pdpu->pipe_cfg.src_rect),
@@ -572,7 +567,7 @@ static void _dpu_plane_setup_scaler(struct dpu_plane *pdpu,
 			drm_rect_width(&pdpu->pipe_cfg.dst_rect),
 			drm_rect_height(&pdpu->pipe_cfg.dst_rect),
 			&pstate->scaler3_cfg, fmt,
-			chroma_subsmpl_h, chroma_subsmpl_v);
+			info->hsub, info->vsub);
 }
 
 /**
diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c
index be13140967b4..9d9fb6c5fd68 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c
@@ -650,10 +650,10 @@ static int calc_scalex_steps(struct drm_plane *plane,
 		uint32_t pixel_format, uint32_t src, uint32_t dest,
 		uint32_t phasex_steps[COMP_MAX])
 {
+	const struct drm_format_info *info = drm_format_info(pixel_format);
 	struct mdp5_kms *mdp5_kms = get_kms(plane);
 	struct device *dev = mdp5_kms->dev->dev;
 	uint32_t phasex_step;
-	unsigned int hsub;
 	int ret;
 
 	ret = calc_phase_step(src, dest, &phasex_step);
@@ -662,11 +662,9 @@ static int calc_scalex_steps(struct drm_plane *plane,
 		return ret;
 	}
 
-	hsub = drm_format_horz_chroma_subsampling(pixel_format);
-
 	phasex_steps[COMP_0]   = phasex_step;
 	phasex_steps[COMP_3]   = phasex_step;
-	phasex_steps[COMP_1_2] = phasex_step / hsub;
+	phasex_steps[COMP_1_2] = phasex_step / info->hsub;
 
 	return 0;
 }
@@ -675,10 +673,10 @@ static int calc_scaley_steps(struct drm_plane *plane,
 		uint32_t pixel_format, uint32_t src, uint32_t dest,
 		uint32_t phasey_steps[COMP_MAX])
 {
+	const struct drm_format_info *info = drm_format_info(pixel_format);
 	struct mdp5_kms *mdp5_kms = get_kms(plane);
 	struct device *dev = mdp5_kms->dev->dev;
 	uint32_t phasey_step;
-	unsigned int vsub;
 	int ret;
 
 	ret = calc_phase_step(src, dest, &phasey_step);
@@ -687,11 +685,9 @@ static int calc_scaley_steps(struct drm_plane *plane,
 		return ret;
 	}
 
-	vsub = drm_format_vert_chroma_subsampling(pixel_format);
-
 	phasey_steps[COMP_0]   = phasey_step;
 	phasey_steps[COMP_3]   = phasey_step;
-	phasey_steps[COMP_1_2] = phasey_step / vsub;
+	phasey_steps[COMP_1_2] = phasey_step / info->vsub;
 
 	return 0;
 }
@@ -699,8 +695,9 @@ static int calc_scaley_steps(struct drm_plane *plane,
 static uint32_t get_scale_config(const struct mdp_format *format,
 		uint32_t src, uint32_t dst, bool horz)
 {
+	const struct drm_format_info *info = drm_format_info(format->base.pixel_format);
 	bool scaling = format->is_yuv ? true : (src != dst);
-	uint32_t sub, pix_fmt = format->base.pixel_format;
+	uint32_t sub;
 	uint32_t ya_filter, uv_filter;
 	bool yuv = format->is_yuv;
 
@@ -708,8 +705,7 @@ static uint32_t get_scale_config(const struct mdp_format *format,
 		return 0;
 
 	if (yuv) {
-		sub = horz ? drm_format_horz_chroma_subsampling(pix_fmt) :
-			     drm_format_vert_chroma_subsampling(pix_fmt);
+		sub = horz ? info->hsub : info->vsub;
 		uv_filter = ((src / sub) <= dst) ?
 				   SCALE_FILTER_BIL : SCALE_FILTER_PCMN;
 	}
@@ -754,7 +750,7 @@ static void mdp5_write_pixel_ext(struct mdp5_kms *mdp5_kms, enum mdp5_pipe pipe,
 	uint32_t src_w, int pe_left[COMP_MAX], int pe_right[COMP_MAX],
 	uint32_t src_h, int pe_top[COMP_MAX], int pe_bottom[COMP_MAX])
 {
-	uint32_t pix_fmt = format->base.pixel_format;
+	const struct drm_format_info *info = drm_format_info(format->base.pixel_format);
 	uint32_t lr, tb, req;
 	int i;
 
@@ -763,8 +759,8 @@ static void mdp5_write_pixel_ext(struct mdp5_kms *mdp5_kms, enum mdp5_pipe pipe,
 		uint32_t roi_h = src_h;
 
 		if (format->is_yuv && i == COMP_1_2) {
-			roi_w /= drm_format_horz_chroma_subsampling(pix_fmt);
-			roi_h /= drm_format_vert_chroma_subsampling(pix_fmt);
+			roi_w /= info->hsub;
+			roi_h /= info->vsub;
 		}
 
 		lr  = (pe_left[i] >= 0) ?
diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
index 72ab8d89efa4..b30b2f4efc60 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
@@ -135,7 +135,7 @@ uint32_t mdp5_smp_calculate(struct mdp5_smp *smp,
 	uint32_t blkcfg = 0;
 
 	nplanes = info->num_planes;
-	hsub = drm_format_horz_chroma_subsampling(fmt);
+	hsub = info->hsub;
 
 	/* different if BWC (compressed framebuffer?) enabled: */
 	nlines = 2;
diff --git a/drivers/gpu/drm/msm/msm_fb.c b/drivers/gpu/drm/msm/msm_fb.c
index 432beddafb9e..f69c0afd6ec6 100644
--- a/drivers/gpu/drm/msm/msm_fb.c
+++ b/drivers/gpu/drm/msm/msm_fb.c
@@ -145,16 +145,12 @@ static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev,
 	struct drm_framebuffer *fb;
 	const struct msm_format *format;
 	int ret, i, n;
-	unsigned int hsub, vsub;
 
 	DBG("create framebuffer: dev=%p, mode_cmd=%p (%dx%d@%4.4s)",
 			dev, mode_cmd, mode_cmd->width, mode_cmd->height,
 			(char *)&mode_cmd->pixel_format);
 
 	n = info->num_planes;
-	hsub = drm_format_horz_chroma_subsampling(mode_cmd->pixel_format);
-	vsub = drm_format_vert_chroma_subsampling(mode_cmd->pixel_format);
-
 	format = kms->funcs->get_format(kms, mode_cmd->pixel_format,
 			mode_cmd->modifier[0]);
 	if (!format) {
@@ -180,8 +176,8 @@ static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev,
 	}
 
 	for (i = 0; i < n; i++) {
-		unsigned int width = mode_cmd->width / (i ? hsub : 1);
-		unsigned int height = mode_cmd->height / (i ? vsub : 1);
+		unsigned int width = mode_cmd->width / (i ? info->hsub : 1);
+		unsigned int height = mode_cmd->height / (i ? info->vsub : 1);
 		unsigned int min_size;
 
 		min_size = (height - 1) * mode_cmd->pitches[i]
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_fb.c b/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
index 606d176d5d96..c318fae28581 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
@@ -79,18 +79,13 @@ rockchip_user_fb_create(struct drm_device *dev, struct drm_file *file_priv,
 	struct drm_framebuffer *fb;
 	struct drm_gem_object *objs[ROCKCHIP_MAX_FB_BUFFER];
 	struct drm_gem_object *obj;
-	unsigned int hsub;
-	unsigned int vsub;
 	int num_planes = min_t(int, info->num_planes, ROCKCHIP_MAX_FB_BUFFER);
 	int ret;
 	int i;
 
-	hsub = drm_format_horz_chroma_subsampling(mode_cmd->pixel_format);
-	vsub = drm_format_vert_chroma_subsampling(mode_cmd->pixel_format);
-
 	for (i = 0; i < num_planes; i++) {
-		unsigned int width = mode_cmd->width / (i ? hsub : 1);
-		unsigned int height = mode_cmd->height / (i ? vsub : 1);
+		unsigned int width = mode_cmd->width / (i ? info->hsub : 1);
+		unsigned int height = mode_cmd->height / (i ? info->vsub : 1);
 		unsigned int min_size;
 
 		obj = drm_gem_object_lookup(file_priv, mode_cmd->handles[i]);
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
index c709f64b6f80..4189ca17f381 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
@@ -320,11 +320,9 @@ static void scl_vop_cal_scl_fac(struct vop *vop, const struct vop_win_data *win,
 	uint16_t yrgb_hor_scl_mode, yrgb_ver_scl_mode;
 	uint16_t cbcr_hor_scl_mode = SCALE_NONE;
 	uint16_t cbcr_ver_scl_mode = SCALE_NONE;
-	int hsub = drm_format_horz_chroma_subsampling(info->format);
-	int vsub = drm_format_vert_chroma_subsampling(info->format);
 	bool is_yuv = false;
-	uint16_t cbcr_src_w = src_w / hsub;
-	uint16_t cbcr_src_h = src_h / vsub;
+	uint16_t cbcr_src_w = src_w / info->hsub;
+	uint16_t cbcr_src_h = src_h / info->vsub;
 	uint16_t vsu_mode;
 	uint16_t lb_mode;
 	uint32_t val;
@@ -828,8 +826,8 @@ static void vop_plane_atomic_update(struct drm_plane *plane,
 		    (state->rotation & DRM_MODE_REFLECT_X) ? 1 : 0);
 
 	if (is_yuv) {
-		int hsub = drm_format_horz_chroma_subsampling(fb->format->format);
-		int vsub = drm_format_vert_chroma_subsampling(fb->format->format);
+		int hsub = fb->format->hsub;
+		int vsub = fb->format->vsub;
 		int bpp = fb->format->cpp[1];
 
 		uv_obj = fb->obj[1];
diff --git a/drivers/gpu/drm/tegra/fb.c b/drivers/gpu/drm/tegra/fb.c
index da0747e317b7..94fb75089d87 100644
--- a/drivers/gpu/drm/tegra/fb.c
+++ b/drivers/gpu/drm/tegra/fb.c
@@ -132,18 +132,15 @@ struct drm_framebuffer *tegra_fb_create(struct drm_device *drm,
 					const struct drm_mode_fb_cmd2 *cmd)
 {
 	const struct drm_format_info *info = drm_get_format_info(drm, cmd);
-	unsigned int hsub, vsub, i;
 	struct tegra_bo *planes[4];
 	struct drm_gem_object *gem;
 	struct drm_framebuffer *fb;
+	unsigned int i;
 	int err;
 
-	hsub = drm_format_horz_chroma_subsampling(cmd->pixel_format);
-	vsub = drm_format_vert_chroma_subsampling(cmd->pixel_format);
-
 	for (i = 0; i < info->num_planes; i++) {
-		unsigned int width = cmd->width / (i ? hsub : 1);
-		unsigned int height = cmd->height / (i ? vsub : 1);
+		unsigned int width = cmd->width / (i ? info->hsub : 1);
+		unsigned int height = cmd->height / (i ? info->vsub : 1);
 		unsigned int size, bpp;
 
 		gem = drm_gem_object_lookup(file, cmd->handles[i]);
diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index e3c0a350cb77..be2274924b34 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -310,10 +310,10 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 	struct drm_framebuffer *fb = state->fb;
 	struct drm_gem_cma_object *bo = drm_fb_cma_get_gem_obj(fb, 0);
 	u32 subpixel_src_mask = (1 << 16) - 1;
-	u32 format = fb->format->format;
 	int num_planes = fb->format->num_planes;
 	struct drm_crtc_state *crtc_state;
-	u32 h_subsample, v_subsample;
+	u32 h_subsample = fb->format->hsub;
+	u32 v_subsample = fb->format->vsub;
 	int i, ret;
 
 	crtc_state = drm_atomic_get_existing_crtc_state(state->state,
@@ -328,9 +328,6 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 	if (ret)
 		return ret;
 
-	h_subsample = drm_format_horz_chroma_subsampling(format);
-	v_subsample = drm_format_vert_chroma_subsampling(format);
-
 	for (i = 0; i < num_planes; i++)
 		vc4_state->offsets[i] = bo->paddr + fb->offsets[i];
 
@@ -593,7 +590,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 	const struct hvs_format *format = vc4_get_hvs_format(fb->format->format);
 	u64 base_format_mod = fourcc_mod_broadcom_mod(fb->modifier);
 	int num_planes = fb->format->num_planes;
-	u32 h_subsample, v_subsample;
+	u32 h_subsample = fb->format->hsub;
+	u32 v_subsample = fb->format->vsub;
 	bool mix_plane_alpha;
 	bool covers_screen;
 	u32 scl0, scl1, pitch0;
@@ -623,9 +621,6 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 		scl1 = vc4_get_scl_field(state, 0);
 	}
 
-	h_subsample = drm_format_horz_chroma_subsampling(format->drm);
-	v_subsample = drm_format_vert_chroma_subsampling(format->drm);
-
 	rotation = drm_rotation_simplify(state->rotation,
 					 DRM_MODE_ROTATE_0 |
 					 DRM_MODE_REFLECT_X |
diff --git a/include/drm/drm_fourcc.h b/include/drm/drm_fourcc.h
index 41779b327d91..eeec449d6c6a 100644
--- a/include/drm/drm_fourcc.h
+++ b/include/drm/drm_fourcc.h
@@ -269,8 +269,6 @@ uint32_t drm_mode_legacy_fb_format(uint32_t bpp, uint32_t depth);
 uint32_t drm_driver_legacy_fb_format(struct drm_device *dev,
 				     uint32_t bpp, uint32_t depth);
 int drm_format_plane_cpp(uint32_t format, int plane);
-int drm_format_horz_chroma_subsampling(uint32_t format);
-int drm_format_vert_chroma_subsampling(uint32_t format);
 int drm_format_plane_width(int width, uint32_t format, int plane);
 int drm_format_plane_height(int height, uint32_t format, int plane);
 unsigned int drm_format_info_block_width(const struct drm_format_info *info,
-- 
cgit v1.2.3


From 24c478ead0bf50a758e9dbecc7356e9eebf20271 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Thu, 16 May 2019 12:31:49 +0200
Subject: drm/fourcc: Pass the format_info pointer to drm_format_plane_cpp

So far, the drm_format_plane_cpp function was operating on the format's
fourcc and was doing a lookup to retrieve the drm_format_info structure and
return the cpp.

However, this is inefficient since in most cases, we will have the
drm_format_info pointer already available so we shouldn't have to perform a
new lookup. Some drm_fourcc functions also already operate on the
drm_format_info pointer for that reason, so the API is quite inconsistent
there.

Let's follow the latter pattern and remove the extra lookup while being a
bit more consistent. In order to be extra consistent, also rename that
function to drm_format_info_plane_cpp and to a static function in the
header to match the current policy.

Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/32aa13e53dbc98a90207fd290aa8e79f785fb11e.1558002671.git-series.maxime.ripard@bootlin.com
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c     |  4 +++-
 drivers/gpu/drm/arm/malidp_hw.c            |  3 ++-
 drivers/gpu/drm/arm/malidp_planes.c        |  2 +-
 drivers/gpu/drm/drm_client.c               |  3 ++-
 drivers/gpu/drm/drm_fb_helper.c            |  2 +-
 drivers/gpu/drm/drm_format_helper.c        |  4 ++--
 drivers/gpu/drm/drm_fourcc.c               | 20 --------------------
 drivers/gpu/drm/i915/intel_sprite.c        |  3 ++-
 drivers/gpu/drm/mediatek/mtk_drm_fb.c      |  2 +-
 drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c  |  3 ++-
 drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c   |  2 +-
 drivers/gpu/drm/msm/msm_fb.c               |  2 +-
 drivers/gpu/drm/radeon/radeon_fb.c         |  4 +++-
 drivers/gpu/drm/rockchip/rockchip_drm_fb.c |  2 +-
 drivers/gpu/drm/stm/ltdc.c                 |  2 +-
 drivers/gpu/drm/tegra/fb.c                 |  2 +-
 drivers/gpu/drm/zte/zx_plane.c             |  2 +-
 include/drm/drm_fourcc.h                   | 18 +++++++++++++++++-
 18 files changed, 42 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
index e47609218839..06e73a343724 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
@@ -121,6 +121,8 @@ static int amdgpufb_create_pinned_object(struct amdgpu_fbdev *rfbdev,
 					 struct drm_mode_fb_cmd2 *mode_cmd,
 					 struct drm_gem_object **gobj_p)
 {
+	const struct drm_format_info *info = drm_get_format_info(dev,
+								 mode_cmd);
 	struct amdgpu_device *adev = rfbdev->adev;
 	struct drm_gem_object *gobj = NULL;
 	struct amdgpu_bo *abo = NULL;
@@ -131,7 +133,7 @@ static int amdgpufb_create_pinned_object(struct amdgpu_fbdev *rfbdev,
 	int height = mode_cmd->height;
 	u32 cpp;
 
-	cpp = drm_format_plane_cpp(mode_cmd->pixel_format, 0);
+	cpp = drm_format_info_plane_cpp(info, 0);
 
 	/* need to align pitch with crtc limits */
 	mode_cmd->pitches[0] = amdgpu_align_pitch(adev, mode_cmd->width, cpp,
diff --git a/drivers/gpu/drm/arm/malidp_hw.c b/drivers/gpu/drm/arm/malidp_hw.c
index 8df12e9a33bb..1c9e869f4c52 100644
--- a/drivers/gpu/drm/arm/malidp_hw.c
+++ b/drivers/gpu/drm/arm/malidp_hw.c
@@ -382,7 +382,8 @@ static void malidp500_modeset(struct malidp_hw_device *hwdev, struct videomode *
 
 int malidp_format_get_bpp(u32 fmt)
 {
-	int bpp = drm_format_plane_cpp(fmt, 0) * 8;
+	const struct drm_format_info *info = drm_format_info(fmt);
+	int bpp = drm_format_info_plane_cpp(info, 0) * 8;
 
 	if (bpp == 0) {
 		switch (fmt) {
diff --git a/drivers/gpu/drm/arm/malidp_planes.c b/drivers/gpu/drm/arm/malidp_planes.c
index 8f89813d08c1..361c02988375 100644
--- a/drivers/gpu/drm/arm/malidp_planes.c
+++ b/drivers/gpu/drm/arm/malidp_planes.c
@@ -227,7 +227,7 @@ bool malidp_format_mod_supported(struct drm_device *drm,
 
 	if (modifier & AFBC_SPLIT) {
 		if (!info->is_yuv) {
-			if (drm_format_plane_cpp(format, 0) <= 2) {
+			if (drm_format_info_plane_cpp(info, 0) <= 2) {
 				DRM_DEBUG_KMS("RGB formats <= 16bpp are not supported with SPLIT\n");
 				return false;
 			}
diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c
index f20d1dda3961..169d8eeaa662 100644
--- a/drivers/gpu/drm/drm_client.c
+++ b/drivers/gpu/drm/drm_client.c
@@ -243,6 +243,7 @@ static void drm_client_buffer_delete(struct drm_client_buffer *buffer)
 static struct drm_client_buffer *
 drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, u32 format)
 {
+	const struct drm_format_info *info = drm_format_info(format);
 	struct drm_mode_create_dumb dumb_args = { };
 	struct drm_device *dev = client->dev;
 	struct drm_client_buffer *buffer;
@@ -258,7 +259,7 @@ drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, u
 
 	dumb_args.width = width;
 	dumb_args.height = height;
-	dumb_args.bpp = drm_format_plane_cpp(format, 0) * 8;
+	dumb_args.bpp = drm_format_info_plane_cpp(info, 0) * 8;
 	ret = drm_mode_create_dumb(dev, &dumb_args, client->file);
 	if (ret)
 		goto err_delete;
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index a52b48fafbd7..e2d9a6da14a5 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -813,7 +813,7 @@ static void drm_fb_helper_dirty_blit_real(struct drm_fb_helper *fb_helper,
 					  struct drm_clip_rect *clip)
 {
 	struct drm_framebuffer *fb = fb_helper->fb;
-	unsigned int cpp = drm_format_plane_cpp(fb->format->format, 0);
+	unsigned int cpp = drm_format_info_plane_cpp(fb->format, 0);
 	size_t offset = clip->y1 * fb->pitches[0] + clip->x1 * cpp;
 	void *src = fb_helper->fbdev->screen_buffer + offset;
 	void *dst = fb_helper->buffer->vaddr + offset;
diff --git a/drivers/gpu/drm/drm_format_helper.c b/drivers/gpu/drm/drm_format_helper.c
index a18da35145b7..8ad66aa1362a 100644
--- a/drivers/gpu/drm/drm_format_helper.c
+++ b/drivers/gpu/drm/drm_format_helper.c
@@ -36,7 +36,7 @@ static unsigned int clip_offset(struct drm_rect *clip,
 void drm_fb_memcpy(void *dst, void *vaddr, struct drm_framebuffer *fb,
 		   struct drm_rect *clip)
 {
-	unsigned int cpp = drm_format_plane_cpp(fb->format->format, 0);
+	unsigned int cpp = drm_format_info_plane_cpp(fb->format, 0);
 	size_t len = (clip->x2 - clip->x1) * cpp;
 	unsigned int y, lines = clip->y2 - clip->y1;
 
@@ -63,7 +63,7 @@ void drm_fb_memcpy_dstclip(void __iomem *dst, void *vaddr,
 			   struct drm_framebuffer *fb,
 			   struct drm_rect *clip)
 {
-	unsigned int cpp = drm_format_plane_cpp(fb->format->format, 0);
+	unsigned int cpp = drm_format_info_plane_cpp(fb->format, 0);
 	unsigned int offset = clip_offset(clip, fb->pitches[0], cpp);
 	size_t len = (clip->x2 - clip->x1) * cpp;
 	unsigned int y, lines = clip->y2 - clip->y1;
diff --git a/drivers/gpu/drm/drm_fourcc.c b/drivers/gpu/drm/drm_fourcc.c
index e4a2c8372c8b..5f63fc74e265 100644
--- a/drivers/gpu/drm/drm_fourcc.c
+++ b/drivers/gpu/drm/drm_fourcc.c
@@ -332,26 +332,6 @@ drm_get_format_info(struct drm_device *dev,
 }
 EXPORT_SYMBOL(drm_get_format_info);
 
-/**
- * drm_format_plane_cpp - determine the bytes per pixel value
- * @format: pixel format (DRM_FORMAT_*)
- * @plane: plane index
- *
- * Returns:
- * The bytes per pixel value for the specified plane.
- */
-int drm_format_plane_cpp(uint32_t format, int plane)
-{
-	const struct drm_format_info *info;
-
-	info = drm_format_info(format);
-	if (!info || plane >= info->num_planes)
-		return 0;
-
-	return info->cpp[plane];
-}
-EXPORT_SYMBOL(drm_format_plane_cpp);
-
 /**
  * drm_format_plane_width - width of the plane given the first plane
  * @width: width of the first plane
diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
index 2913e89280d7..e35601b1f878 100644
--- a/drivers/gpu/drm/i915/intel_sprite.c
+++ b/drivers/gpu/drm/i915/intel_sprite.c
@@ -325,7 +325,8 @@ skl_plane_max_stride(struct intel_plane *plane,
 		     u32 pixel_format, u64 modifier,
 		     unsigned int rotation)
 {
-	int cpp = drm_format_plane_cpp(pixel_format, 0);
+	const struct drm_format_info *info = drm_format_info(pixel_format);
+	int cpp = drm_format_info_plane_cpp(info, 0);
 
 	/*
 	 * "The stride in bytes must not exceed the
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_fb.c b/drivers/gpu/drm/mediatek/mtk_drm_fb.c
index 68fdef8b12bd..0d5334a5a9a7 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_fb.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_fb.c
@@ -104,7 +104,7 @@ struct drm_framebuffer *mtk_drm_mode_fb_create(struct drm_device *dev,
 	if (!gem)
 		return ERR_PTR(-ENOENT);
 
-	bpp = drm_format_plane_cpp(cmd->pixel_format, 0);
+	bpp = drm_format_info_plane_cpp(info, 0);
 	size = (height - 1) * cmd->pitches[0] + width * bpp;
 	size += cmd->offsets[0];
 
diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c
index f33d7007e830..59b24e0c5070 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c
@@ -782,6 +782,7 @@ static void get_roi(struct drm_crtc *crtc, uint32_t *roi_w, uint32_t *roi_h)
 
 static void mdp5_crtc_restore_cursor(struct drm_crtc *crtc)
 {
+	const struct drm_format_info *info = drm_format_info(DRM_FORMAT_ARGB8888);
 	struct mdp5_crtc_state *mdp5_cstate = to_mdp5_crtc_state(crtc->state);
 	struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
 	struct mdp5_kms *mdp5_kms = get_kms(crtc);
@@ -800,7 +801,7 @@ static void mdp5_crtc_restore_cursor(struct drm_crtc *crtc)
 	width = mdp5_crtc->cursor.width;
 	height = mdp5_crtc->cursor.height;
 
-	stride = width * drm_format_plane_cpp(DRM_FORMAT_ARGB8888, 0);
+	stride = width * drm_format_info_plane_cpp(info, 0);
 
 	get_roi(crtc, &roi_w, &roi_h);
 
diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
index b30b2f4efc60..1ca294694597 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
@@ -158,7 +158,7 @@ uint32_t mdp5_smp_calculate(struct mdp5_smp *smp,
 	for (i = 0; i < nplanes; i++) {
 		int n, fetch_stride, cpp;
 
-		cpp = drm_format_plane_cpp(fmt, i);
+		cpp = drm_format_info_plane_cpp(info, i);
 		fetch_stride = width * cpp / (i ? hsub : 1);
 
 		n = DIV_ROUND_UP(fetch_stride * nlines, smp->blk_size);
diff --git a/drivers/gpu/drm/msm/msm_fb.c b/drivers/gpu/drm/msm/msm_fb.c
index f69c0afd6ec6..29e45f2144b5 100644
--- a/drivers/gpu/drm/msm/msm_fb.c
+++ b/drivers/gpu/drm/msm/msm_fb.c
@@ -181,7 +181,7 @@ static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev,
 		unsigned int min_size;
 
 		min_size = (height - 1) * mode_cmd->pitches[i]
-			 + width * drm_format_plane_cpp(mode_cmd->pixel_format, i)
+			 + width * drm_format_info_plane_cpp(info, i)
 			 + mode_cmd->offsets[i];
 
 		if (bos[i]->size < min_size) {
diff --git a/drivers/gpu/drm/radeon/radeon_fb.c b/drivers/gpu/drm/radeon/radeon_fb.c
index 1298b84cb1c7..dbf596fc4339 100644
--- a/drivers/gpu/drm/radeon/radeon_fb.c
+++ b/drivers/gpu/drm/radeon/radeon_fb.c
@@ -125,6 +125,7 @@ static int radeonfb_create_pinned_object(struct radeon_fbdev *rfbdev,
 					 struct drm_mode_fb_cmd2 *mode_cmd,
 					 struct drm_gem_object **gobj_p)
 {
+	const struct drm_format_info *info;
 	struct radeon_device *rdev = rfbdev->rdev;
 	struct drm_gem_object *gobj = NULL;
 	struct radeon_bo *rbo = NULL;
@@ -135,7 +136,8 @@ static int radeonfb_create_pinned_object(struct radeon_fbdev *rfbdev,
 	int height = mode_cmd->height;
 	u32 cpp;
 
-	cpp = drm_format_plane_cpp(mode_cmd->pixel_format, 0);
+	info = drm_get_format_info(rdev->ddev, mode_cmd);
+	cpp = drm_format_info_plane_cpp(info, 0);
 
 	/* need to align pitch with crtc limits */
 	mode_cmd->pitches[0] = radeon_align_pitch(rdev, mode_cmd->width, cpp,
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_fb.c b/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
index c318fae28581..57873c99ae29 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
@@ -98,7 +98,7 @@ rockchip_user_fb_create(struct drm_device *dev, struct drm_file *file_priv,
 
 		min_size = (height - 1) * mode_cmd->pitches[i] +
 			mode_cmd->offsets[i] +
-			width * drm_format_plane_cpp(mode_cmd->pixel_format, i);
+			width * drm_format_info_plane_cpp(info, i);
 
 		if (obj->size < min_size) {
 			drm_gem_object_put_unlocked(obj);
diff --git a/drivers/gpu/drm/stm/ltdc.c b/drivers/gpu/drm/stm/ltdc.c
index 97912e2c663d..e7abe54b9746 100644
--- a/drivers/gpu/drm/stm/ltdc.c
+++ b/drivers/gpu/drm/stm/ltdc.c
@@ -784,7 +784,7 @@ static void ltdc_plane_atomic_update(struct drm_plane *plane,
 
 	/* Configures the color frame buffer pitch in bytes & line length */
 	pitch_in_bytes = fb->pitches[0];
-	line_length = drm_format_plane_cpp(fb->format->format, 0) *
+	line_length = drm_format_info_plane_cpp(fb->format, 0) *
 		      (x1 - x0 + 1) + (ldev->caps.bus_width >> 3) - 1;
 	val = ((pitch_in_bytes << 16) | line_length);
 	reg_update_bits(ldev->regs, LTDC_L1CFBLR + lofs,
diff --git a/drivers/gpu/drm/tegra/fb.c b/drivers/gpu/drm/tegra/fb.c
index 94fb75089d87..d1042196a30f 100644
--- a/drivers/gpu/drm/tegra/fb.c
+++ b/drivers/gpu/drm/tegra/fb.c
@@ -149,7 +149,7 @@ struct drm_framebuffer *tegra_fb_create(struct drm_device *drm,
 			goto unreference;
 		}
 
-		bpp = drm_format_plane_cpp(cmd->pixel_format, i);
+		bpp = drm_format_info_plane_cpp(info, i);
 
 		size = (height - 1) * cmd->pitches[i] +
 		       width * bpp + cmd->offsets[i];
diff --git a/drivers/gpu/drm/zte/zx_plane.c b/drivers/gpu/drm/zte/zx_plane.c
index c6a8be444300..d97a4dff515d 100644
--- a/drivers/gpu/drm/zte/zx_plane.c
+++ b/drivers/gpu/drm/zte/zx_plane.c
@@ -222,7 +222,7 @@ static void zx_vl_plane_atomic_update(struct drm_plane *plane,
 		cma_obj = drm_fb_cma_get_gem_obj(fb, i);
 		paddr = cma_obj->paddr + fb->offsets[i];
 		paddr += src_y * fb->pitches[i];
-		paddr += src_x * drm_format_plane_cpp(format, i);
+		paddr += src_x * drm_format_info_plane_cpp(fb->format, i);
 		zx_writel(paddr_reg, paddr);
 		paddr_reg += 4;
 	}
diff --git a/include/drm/drm_fourcc.h b/include/drm/drm_fourcc.h
index eeec449d6c6a..6b5a82b31bc4 100644
--- a/include/drm/drm_fourcc.h
+++ b/include/drm/drm_fourcc.h
@@ -260,6 +260,23 @@ drm_format_info_is_yuv_sampling_444(const struct drm_format_info *info)
 	return info->is_yuv && info->hsub == 1 && info->vsub == 1;
 }
 
+/**
+ * drm_format_info_plane_cpp - determine the bytes per pixel value
+ * @format: pixel format info
+ * @plane: plane index
+ *
+ * Returns:
+ * The bytes per pixel value for the specified plane.
+ */
+static inline
+int drm_format_info_plane_cpp(const struct drm_format_info *info, int plane)
+{
+	if (!info || plane >= info->num_planes)
+		return 0;
+
+	return info->cpp[plane];
+}
+
 const struct drm_format_info *__drm_format_info(u32 format);
 const struct drm_format_info *drm_format_info(u32 format);
 const struct drm_format_info *
@@ -268,7 +285,6 @@ drm_get_format_info(struct drm_device *dev,
 uint32_t drm_mode_legacy_fb_format(uint32_t bpp, uint32_t depth);
 uint32_t drm_driver_legacy_fb_format(struct drm_device *dev,
 				     uint32_t bpp, uint32_t depth);
-int drm_format_plane_cpp(uint32_t format, int plane);
 int drm_format_plane_width(int width, uint32_t format, int plane);
 int drm_format_plane_height(int height, uint32_t format, int plane);
 unsigned int drm_format_info_block_width(const struct drm_format_info *info,
-- 
cgit v1.2.3


From bf39607c16141811d0f5fe67e231364c96a87e09 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Thu, 16 May 2019 12:31:50 +0200
Subject: drm/fourcc: Pass the format_info pointer to
 drm_format_plane_width/height

So far, the drm_format_plane_height/width functions were operating on the
format's fourcc and was doing a lookup to retrieve the drm_format_info
structure and return the cpp.

However, this is inefficient since in most cases, we will have the
drm_format_info pointer already available so we shouldn't have to perform a
new lookup. Some drm_fourcc functions also already operate on the
drm_format_info pointer for that reason, so the API is quite inconsistent
there.

Let's follow the latter pattern and remove the extra lookup while being a
bit more consistent.

In order to be extra consistent, also rename that function to
drm_format_info_plane_cpp and to a static function in the header to match
the current policy. The parameters order have also be changed to match the
other functions prototype.

Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/514af1d489d80b8b1767e3716b663ce5103da6eb.1558002671.git-series.maxime.ripard@bootlin.com
---
 drivers/gpu/drm/drm_fourcc.c          | 48 -----------------------------------
 drivers/gpu/drm/meson/meson_overlay.c | 12 ++++-----
 include/drm/drm_fourcc.h              | 46 +++++++++++++++++++++++++++++++--
 3 files changed, 50 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_fourcc.c b/drivers/gpu/drm/drm_fourcc.c
index 5f63fc74e265..35b459d186c5 100644
--- a/drivers/gpu/drm/drm_fourcc.c
+++ b/drivers/gpu/drm/drm_fourcc.c
@@ -332,54 +332,6 @@ drm_get_format_info(struct drm_device *dev,
 }
 EXPORT_SYMBOL(drm_get_format_info);
 
-/**
- * drm_format_plane_width - width of the plane given the first plane
- * @width: width of the first plane
- * @format: pixel format
- * @plane: plane index
- *
- * Returns:
- * The width of @plane, given that the width of the first plane is @width.
- */
-int drm_format_plane_width(int width, uint32_t format, int plane)
-{
-	const struct drm_format_info *info;
-
-	info = drm_format_info(format);
-	if (!info || plane >= info->num_planes)
-		return 0;
-
-	if (plane == 0)
-		return width;
-
-	return width / info->hsub;
-}
-EXPORT_SYMBOL(drm_format_plane_width);
-
-/**
- * drm_format_plane_height - height of the plane given the first plane
- * @height: height of the first plane
- * @format: pixel format
- * @plane: plane index
- *
- * Returns:
- * The height of @plane, given that the height of the first plane is @height.
- */
-int drm_format_plane_height(int height, uint32_t format, int plane)
-{
-	const struct drm_format_info *info;
-
-	info = drm_format_info(format);
-	if (!info || plane >= info->num_planes)
-		return 0;
-
-	if (plane == 0)
-		return height;
-
-	return height / info->vsub;
-}
-EXPORT_SYMBOL(drm_format_plane_height);
-
 /**
  * drm_format_info_block_width - width in pixels of block.
  * @info: pixel format info
diff --git a/drivers/gpu/drm/meson/meson_overlay.c b/drivers/gpu/drm/meson/meson_overlay.c
index fb8515b2860c..55b3f2f2e608 100644
--- a/drivers/gpu/drm/meson/meson_overlay.c
+++ b/drivers/gpu/drm/meson/meson_overlay.c
@@ -466,8 +466,8 @@ static void meson_overlay_atomic_update(struct drm_plane *plane,
 		priv->viu.vd1_addr2 = gem->paddr + fb->offsets[2];
 		priv->viu.vd1_stride2 = fb->pitches[2];
 		priv->viu.vd1_height2 =
-			drm_format_plane_height(fb->height,
-						fb->format->format, 2);
+			drm_format_info_plane_height(fb->format,
+						fb->height, 2);
 		DRM_DEBUG("plane 2 addr 0x%x stride %d height %d\n",
 			 priv->viu.vd1_addr2,
 			 priv->viu.vd1_stride2,
@@ -478,8 +478,8 @@ static void meson_overlay_atomic_update(struct drm_plane *plane,
 		priv->viu.vd1_addr1 = gem->paddr + fb->offsets[1];
 		priv->viu.vd1_stride1 = fb->pitches[1];
 		priv->viu.vd1_height1 =
-			drm_format_plane_height(fb->height,
-						fb->format->format, 1);
+			drm_format_info_plane_height(fb->format,
+						fb->height, 1);
 		DRM_DEBUG("plane 1 addr 0x%x stride %d height %d\n",
 			 priv->viu.vd1_addr1,
 			 priv->viu.vd1_stride1,
@@ -490,8 +490,8 @@ static void meson_overlay_atomic_update(struct drm_plane *plane,
 		priv->viu.vd1_addr0 = gem->paddr + fb->offsets[0];
 		priv->viu.vd1_stride0 = fb->pitches[0];
 		priv->viu.vd1_height0 =
-			drm_format_plane_height(fb->height,
-						fb->format->format, 0);
+			drm_format_info_plane_height(fb->format,
+						fb->height, 0);
 		DRM_DEBUG("plane 0 addr 0x%x stride %d height %d\n",
 			 priv->viu.vd1_addr0,
 			 priv->viu.vd1_stride0,
diff --git a/include/drm/drm_fourcc.h b/include/drm/drm_fourcc.h
index 6b5a82b31bc4..4ef8ccb5d236 100644
--- a/include/drm/drm_fourcc.h
+++ b/include/drm/drm_fourcc.h
@@ -277,6 +277,50 @@ int drm_format_info_plane_cpp(const struct drm_format_info *info, int plane)
 	return info->cpp[plane];
 }
 
+/**
+ * drm_format_info_plane_width - width of the plane given the first plane
+ * @format: pixel format info
+ * @width: width of the first plane
+ * @plane: plane index
+ *
+ * Returns:
+ * The width of @plane, given that the width of the first plane is @width.
+ */
+static inline
+int drm_format_info_plane_width(const struct drm_format_info *info, int width,
+				int plane)
+{
+	if (!info || plane >= info->num_planes)
+		return 0;
+
+	if (plane == 0)
+		return width;
+
+	return width / info->hsub;
+}
+
+/**
+ * drm_format_info_plane_height - height of the plane given the first plane
+ * @format: pixel format info
+ * @height: height of the first plane
+ * @plane: plane index
+ *
+ * Returns:
+ * The height of @plane, given that the height of the first plane is @height.
+ */
+static inline
+int drm_format_info_plane_height(const struct drm_format_info *info, int height,
+				 int plane)
+{
+	if (!info || plane >= info->num_planes)
+		return 0;
+
+	if (plane == 0)
+		return height;
+
+	return height / info->vsub;
+}
+
 const struct drm_format_info *__drm_format_info(u32 format);
 const struct drm_format_info *drm_format_info(u32 format);
 const struct drm_format_info *
@@ -285,8 +329,6 @@ drm_get_format_info(struct drm_device *dev,
 uint32_t drm_mode_legacy_fb_format(uint32_t bpp, uint32_t depth);
 uint32_t drm_driver_legacy_fb_format(struct drm_device *dev,
 				     uint32_t bpp, uint32_t depth);
-int drm_format_plane_width(int width, uint32_t format, int plane);
-int drm_format_plane_height(int height, uint32_t format, int plane);
 unsigned int drm_format_info_block_width(const struct drm_format_info *info,
 					 int plane);
 unsigned int drm_format_info_block_height(const struct drm_format_info *info,
-- 
cgit v1.2.3


From b0f986b4b025c8036ab2c660460621c1d17656b5 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Thu, 16 May 2019 12:31:52 +0200
Subject: drm: Remove users of drm_format_info_plane_cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

drm_format_info_plane_cpp() basically just returns the cpp array content
found in the drm_format_info structure.

Since it's pretty trivial, let's remove the function and have the users use
the array directly

Suggested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Reviewed-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/c0a78c87cd0410a1819edad2794ad06543c85bb5.1558002671.git-series.maxime.ripard@bootlin.com
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c     |  2 +-
 drivers/gpu/drm/arm/malidp_hw.c            |  2 +-
 drivers/gpu/drm/arm/malidp_planes.c        |  2 +-
 drivers/gpu/drm/drm_client.c               |  2 +-
 drivers/gpu/drm/drm_fb_helper.c            |  2 +-
 drivers/gpu/drm/drm_format_helper.c        |  4 ++--
 drivers/gpu/drm/i915/intel_sprite.c        |  2 +-
 drivers/gpu/drm/mediatek/mtk_drm_fb.c      |  2 +-
 drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c  |  2 +-
 drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c   |  2 +-
 drivers/gpu/drm/msm/msm_fb.c               |  2 +-
 drivers/gpu/drm/radeon/radeon_fb.c         |  2 +-
 drivers/gpu/drm/rockchip/rockchip_drm_fb.c |  2 +-
 drivers/gpu/drm/stm/ltdc.c                 |  2 +-
 drivers/gpu/drm/tegra/fb.c                 |  2 +-
 drivers/gpu/drm/zte/zx_plane.c             |  2 +-
 include/drm/drm_fourcc.h                   | 17 -----------------
 17 files changed, 17 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
index 6edae6458be8..2e2869299a84 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
@@ -133,7 +133,7 @@ static int amdgpufb_create_pinned_object(struct amdgpu_fbdev *rfbdev,
 	u32 cpp;
 
 	info = drm_get_format_info(adev->ddev, mode_cmd);
-	cpp = drm_format_info_plane_cpp(info, 0);
+	cpp = info->cpp[0];
 
 	/* need to align pitch with crtc limits */
 	mode_cmd->pitches[0] = amdgpu_align_pitch(adev, mode_cmd->width, cpp,
diff --git a/drivers/gpu/drm/arm/malidp_hw.c b/drivers/gpu/drm/arm/malidp_hw.c
index 1c9e869f4c52..53391c0f87eb 100644
--- a/drivers/gpu/drm/arm/malidp_hw.c
+++ b/drivers/gpu/drm/arm/malidp_hw.c
@@ -383,7 +383,7 @@ static void malidp500_modeset(struct malidp_hw_device *hwdev, struct videomode *
 int malidp_format_get_bpp(u32 fmt)
 {
 	const struct drm_format_info *info = drm_format_info(fmt);
-	int bpp = drm_format_info_plane_cpp(info, 0) * 8;
+	int bpp = info->cpp[0] * 8;
 
 	if (bpp == 0) {
 		switch (fmt) {
diff --git a/drivers/gpu/drm/arm/malidp_planes.c b/drivers/gpu/drm/arm/malidp_planes.c
index 361c02988375..07ceb4ee14e3 100644
--- a/drivers/gpu/drm/arm/malidp_planes.c
+++ b/drivers/gpu/drm/arm/malidp_planes.c
@@ -227,7 +227,7 @@ bool malidp_format_mod_supported(struct drm_device *drm,
 
 	if (modifier & AFBC_SPLIT) {
 		if (!info->is_yuv) {
-			if (drm_format_info_plane_cpp(info, 0) <= 2) {
+			if (info->cpp[0] <= 2) {
 				DRM_DEBUG_KMS("RGB formats <= 16bpp are not supported with SPLIT\n");
 				return false;
 			}
diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c
index 169d8eeaa662..5abcd83da6a6 100644
--- a/drivers/gpu/drm/drm_client.c
+++ b/drivers/gpu/drm/drm_client.c
@@ -259,7 +259,7 @@ drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, u
 
 	dumb_args.width = width;
 	dumb_args.height = height;
-	dumb_args.bpp = drm_format_info_plane_cpp(info, 0) * 8;
+	dumb_args.bpp = info->cpp[0] * 8;
 	ret = drm_mode_create_dumb(dev, &dumb_args, client->file);
 	if (ret)
 		goto err_delete;
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index e2d9a6da14a5..302cf5f8bcce 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -813,7 +813,7 @@ static void drm_fb_helper_dirty_blit_real(struct drm_fb_helper *fb_helper,
 					  struct drm_clip_rect *clip)
 {
 	struct drm_framebuffer *fb = fb_helper->fb;
-	unsigned int cpp = drm_format_info_plane_cpp(fb->format, 0);
+	unsigned int cpp = fb->format->cpp[0];
 	size_t offset = clip->y1 * fb->pitches[0] + clip->x1 * cpp;
 	void *src = fb_helper->fbdev->screen_buffer + offset;
 	void *dst = fb_helper->buffer->vaddr + offset;
diff --git a/drivers/gpu/drm/drm_format_helper.c b/drivers/gpu/drm/drm_format_helper.c
index 8ad66aa1362a..0897cb9aeaff 100644
--- a/drivers/gpu/drm/drm_format_helper.c
+++ b/drivers/gpu/drm/drm_format_helper.c
@@ -36,7 +36,7 @@ static unsigned int clip_offset(struct drm_rect *clip,
 void drm_fb_memcpy(void *dst, void *vaddr, struct drm_framebuffer *fb,
 		   struct drm_rect *clip)
 {
-	unsigned int cpp = drm_format_info_plane_cpp(fb->format, 0);
+	unsigned int cpp = fb->format->cpp[0];
 	size_t len = (clip->x2 - clip->x1) * cpp;
 	unsigned int y, lines = clip->y2 - clip->y1;
 
@@ -63,7 +63,7 @@ void drm_fb_memcpy_dstclip(void __iomem *dst, void *vaddr,
 			   struct drm_framebuffer *fb,
 			   struct drm_rect *clip)
 {
-	unsigned int cpp = drm_format_info_plane_cpp(fb->format, 0);
+	unsigned int cpp = fb->format->cpp[0];
 	unsigned int offset = clip_offset(clip, fb->pitches[0], cpp);
 	size_t len = (clip->x2 - clip->x1) * cpp;
 	unsigned int y, lines = clip->y2 - clip->y1;
diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
index e35601b1f878..c1647c0cc217 100644
--- a/drivers/gpu/drm/i915/intel_sprite.c
+++ b/drivers/gpu/drm/i915/intel_sprite.c
@@ -326,7 +326,7 @@ skl_plane_max_stride(struct intel_plane *plane,
 		     unsigned int rotation)
 {
 	const struct drm_format_info *info = drm_format_info(pixel_format);
-	int cpp = drm_format_info_plane_cpp(info, 0);
+	int cpp = info->cpp[0];
 
 	/*
 	 * "The stride in bytes must not exceed the
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_fb.c b/drivers/gpu/drm/mediatek/mtk_drm_fb.c
index 0d5334a5a9a7..b5e2f230da00 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_fb.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_fb.c
@@ -104,7 +104,7 @@ struct drm_framebuffer *mtk_drm_mode_fb_create(struct drm_device *dev,
 	if (!gem)
 		return ERR_PTR(-ENOENT);
 
-	bpp = drm_format_info_plane_cpp(info, 0);
+	bpp = info->cpp[0];
 	size = (height - 1) * cmd->pitches[0] + width * bpp;
 	size += cmd->offsets[0];
 
diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c
index 59b24e0c5070..c3751c95b452 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c
@@ -801,7 +801,7 @@ static void mdp5_crtc_restore_cursor(struct drm_crtc *crtc)
 	width = mdp5_crtc->cursor.width;
 	height = mdp5_crtc->cursor.height;
 
-	stride = width * drm_format_info_plane_cpp(info, 0);
+	stride = width * info->cpp[0];
 
 	get_roi(crtc, &roi_w, &roi_h);
 
diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
index 1ca294694597..2834837f4d3e 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
@@ -158,7 +158,7 @@ uint32_t mdp5_smp_calculate(struct mdp5_smp *smp,
 	for (i = 0; i < nplanes; i++) {
 		int n, fetch_stride, cpp;
 
-		cpp = drm_format_info_plane_cpp(info, i);
+		cpp = info->cpp[i];
 		fetch_stride = width * cpp / (i ? hsub : 1);
 
 		n = DIV_ROUND_UP(fetch_stride * nlines, smp->blk_size);
diff --git a/drivers/gpu/drm/msm/msm_fb.c b/drivers/gpu/drm/msm/msm_fb.c
index 29e45f2144b5..68fa2c8f61e6 100644
--- a/drivers/gpu/drm/msm/msm_fb.c
+++ b/drivers/gpu/drm/msm/msm_fb.c
@@ -181,7 +181,7 @@ static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev,
 		unsigned int min_size;
 
 		min_size = (height - 1) * mode_cmd->pitches[i]
-			 + width * drm_format_info_plane_cpp(info, i)
+			 + width * info->cpp[i]
 			 + mode_cmd->offsets[i];
 
 		if (bos[i]->size < min_size) {
diff --git a/drivers/gpu/drm/radeon/radeon_fb.c b/drivers/gpu/drm/radeon/radeon_fb.c
index dbf596fc4339..287e3f92102a 100644
--- a/drivers/gpu/drm/radeon/radeon_fb.c
+++ b/drivers/gpu/drm/radeon/radeon_fb.c
@@ -137,7 +137,7 @@ static int radeonfb_create_pinned_object(struct radeon_fbdev *rfbdev,
 	u32 cpp;
 
 	info = drm_get_format_info(rdev->ddev, mode_cmd);
-	cpp = drm_format_info_plane_cpp(info, 0);
+	cpp = info->cpp[0];
 
 	/* need to align pitch with crtc limits */
 	mode_cmd->pitches[0] = radeon_align_pitch(rdev, mode_cmd->width, cpp,
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_fb.c b/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
index 57873c99ae29..31030cf81bc9 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_fb.c
@@ -98,7 +98,7 @@ rockchip_user_fb_create(struct drm_device *dev, struct drm_file *file_priv,
 
 		min_size = (height - 1) * mode_cmd->pitches[i] +
 			mode_cmd->offsets[i] +
-			width * drm_format_info_plane_cpp(info, i);
+			width * info->cpp[i];
 
 		if (obj->size < min_size) {
 			drm_gem_object_put_unlocked(obj);
diff --git a/drivers/gpu/drm/stm/ltdc.c b/drivers/gpu/drm/stm/ltdc.c
index e7abe54b9746..14eb8c40953b 100644
--- a/drivers/gpu/drm/stm/ltdc.c
+++ b/drivers/gpu/drm/stm/ltdc.c
@@ -784,7 +784,7 @@ static void ltdc_plane_atomic_update(struct drm_plane *plane,
 
 	/* Configures the color frame buffer pitch in bytes & line length */
 	pitch_in_bytes = fb->pitches[0];
-	line_length = drm_format_info_plane_cpp(fb->format, 0) *
+	line_length = fb->format->cpp[0] *
 		      (x1 - x0 + 1) + (ldev->caps.bus_width >> 3) - 1;
 	val = ((pitch_in_bytes << 16) | line_length);
 	reg_update_bits(ldev->regs, LTDC_L1CFBLR + lofs,
diff --git a/drivers/gpu/drm/tegra/fb.c b/drivers/gpu/drm/tegra/fb.c
index d1042196a30f..57cc26e1da01 100644
--- a/drivers/gpu/drm/tegra/fb.c
+++ b/drivers/gpu/drm/tegra/fb.c
@@ -149,7 +149,7 @@ struct drm_framebuffer *tegra_fb_create(struct drm_device *drm,
 			goto unreference;
 		}
 
-		bpp = drm_format_info_plane_cpp(info, i);
+		bpp = info->cpp[i];
 
 		size = (height - 1) * cmd->pitches[i] +
 		       width * bpp + cmd->offsets[i];
diff --git a/drivers/gpu/drm/zte/zx_plane.c b/drivers/gpu/drm/zte/zx_plane.c
index d97a4dff515d..706452f9b276 100644
--- a/drivers/gpu/drm/zte/zx_plane.c
+++ b/drivers/gpu/drm/zte/zx_plane.c
@@ -222,7 +222,7 @@ static void zx_vl_plane_atomic_update(struct drm_plane *plane,
 		cma_obj = drm_fb_cma_get_gem_obj(fb, i);
 		paddr = cma_obj->paddr + fb->offsets[i];
 		paddr += src_y * fb->pitches[i];
-		paddr += src_x * drm_format_info_plane_cpp(fb->format, i);
+		paddr += src_x * fb->format->cpp[i];
 		zx_writel(paddr_reg, paddr);
 		paddr_reg += 4;
 	}
diff --git a/include/drm/drm_fourcc.h b/include/drm/drm_fourcc.h
index 4ef8ccb5d236..405466692bd2 100644
--- a/include/drm/drm_fourcc.h
+++ b/include/drm/drm_fourcc.h
@@ -260,23 +260,6 @@ drm_format_info_is_yuv_sampling_444(const struct drm_format_info *info)
 	return info->is_yuv && info->hsub == 1 && info->vsub == 1;
 }
 
-/**
- * drm_format_info_plane_cpp - determine the bytes per pixel value
- * @format: pixel format info
- * @plane: plane index
- *
- * Returns:
- * The bytes per pixel value for the specified plane.
- */
-static inline
-int drm_format_info_plane_cpp(const struct drm_format_info *info, int plane)
-{
-	if (!info || plane >= info->num_planes)
-		return 0;
-
-	return info->cpp[plane];
-}
-
 /**
  * drm_format_info_plane_width - width of the plane given the first plane
  * @format: pixel format info
-- 
cgit v1.2.3


From ec9964b4803300fb86f8e8fd9b421e59f7a71dc5 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Mon, 13 May 2019 09:56:34 +0200
Subject: Platform: OLPC: Move EC-specific functionality out from x86

Move the olpc-ec driver away from the X86 OLPC platform so that it could be
used by the ARM based laptops too. Notably, the driver for the OLPC battery,
which is also used on the ARM models, builds on this driver's interface.

It is actually plaform independent: the OLPC EC commands with their argument
and responses are mostly the same despite the delivery mechanism is
different.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 arch/x86/include/asm/olpc.h         |  31 ----------
 arch/x86/platform/olpc/olpc.c       | 119 ++++++------------------------------
 drivers/platform/olpc/olpc-ec.c     |  99 +++++++++++++++++++++++++++++-
 drivers/power/supply/olpc_battery.c |   1 -
 include/linux/olpc-ec.h             |  32 +++++++++-
 5 files changed, 145 insertions(+), 137 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index c2bf1de5d901..6fe76282aceb 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -9,12 +9,10 @@
 struct olpc_platform_t {
 	int flags;
 	uint32_t boardrev;
-	int ecver;
 };
 
 #define OLPC_F_PRESENT		0x01
 #define OLPC_F_DCON		0x02
-#define OLPC_F_EC_WIDE_SCI	0x04
 
 #ifdef CONFIG_OLPC
 
@@ -64,13 +62,6 @@ static inline int olpc_board_at_least(uint32_t rev)
 	return olpc_platform_info.boardrev >= rev;
 }
 
-extern void olpc_ec_wakeup_set(u16 value);
-extern void olpc_ec_wakeup_clear(u16 value);
-extern bool olpc_ec_wakeup_available(void);
-
-extern int olpc_ec_mask_write(u16 bits);
-extern int olpc_ec_sci_query(u16 *sci_value);
-
 #else
 
 static inline int machine_is_olpc(void)
@@ -83,14 +74,6 @@ static inline int olpc_has_dcon(void)
 	return 0;
 }
 
-static inline void olpc_ec_wakeup_set(u16 value) { }
-static inline void olpc_ec_wakeup_clear(u16 value) { }
-
-static inline bool olpc_ec_wakeup_available(void)
-{
-	return false;
-}
-
 #endif
 
 #ifdef CONFIG_OLPC_XO1_PM
@@ -101,20 +84,6 @@ extern void olpc_xo1_pm_wakeup_clear(u16 value);
 
 extern int pci_olpc_init(void);
 
-/* SCI source values */
-
-#define EC_SCI_SRC_EMPTY	0x00
-#define EC_SCI_SRC_GAME		0x01
-#define EC_SCI_SRC_BATTERY	0x02
-#define EC_SCI_SRC_BATSOC	0x04
-#define EC_SCI_SRC_BATERR	0x08
-#define EC_SCI_SRC_EBOOK	0x10	/* XO-1 only */
-#define EC_SCI_SRC_WLAN		0x20	/* XO-1 only */
-#define EC_SCI_SRC_ACPWR	0x40
-#define EC_SCI_SRC_BATCRIT	0x80
-#define EC_SCI_SRC_GPWAKE	0x100	/* XO-1.5 only */
-#define EC_SCI_SRC_ALL		0x1FF
-
 /* GPIO assignments */
 
 #define OLPC_GPIO_MIC_AC	1
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index f0e920fb98ad..c6c62b4f251f 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -30,9 +30,6 @@
 struct olpc_platform_t olpc_platform_info;
 EXPORT_SYMBOL_GPL(olpc_platform_info);
 
-/* EC event mask to be applied during suspend (defining wakeup sources). */
-static u16 ec_wakeup_mask;
-
 /* what the timeout *should* be (in ms) */
 #define EC_BASE_TIMEOUT 20
 
@@ -186,83 +183,6 @@ err:
 	return ret;
 }
 
-void olpc_ec_wakeup_set(u16 value)
-{
-	ec_wakeup_mask |= value;
-}
-EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set);
-
-void olpc_ec_wakeup_clear(u16 value)
-{
-	ec_wakeup_mask &= ~value;
-}
-EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear);
-
-/*
- * Returns true if the compile and runtime configurations allow for EC events
- * to wake the system.
- */
-bool olpc_ec_wakeup_available(void)
-{
-	if (!machine_is_olpc())
-		return false;
-
-	/*
-	 * XO-1 EC wakeups are available when olpc-xo1-sci driver is
-	 * compiled in
-	 */
-#ifdef CONFIG_OLPC_XO1_SCI
-	if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */
-		return true;
-#endif
-
-	/*
-	 * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is
-	 * compiled in
-	 */
-#ifdef CONFIG_OLPC_XO15_SCI
-	if (olpc_platform_info.boardrev >= olpc_board_pre(0xd0)) /* XO-1.5 */
-		return true;
-#endif
-
-	return false;
-}
-EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available);
-
-int olpc_ec_mask_write(u16 bits)
-{
-	if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
-		__be16 ec_word = cpu_to_be16(bits);
-		return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *) &ec_word, 2,
-				   NULL, 0);
-	} else {
-		unsigned char ec_byte = bits & 0xff;
-		return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0);
-	}
-}
-EXPORT_SYMBOL_GPL(olpc_ec_mask_write);
-
-int olpc_ec_sci_query(u16 *sci_value)
-{
-	int ret;
-
-	if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
-		__be16 ec_word;
-		ret = olpc_ec_cmd(EC_EXT_SCI_QUERY,
-			NULL, 0, (void *) &ec_word, 2);
-		if (ret == 0)
-			*sci_value = be16_to_cpu(ec_word);
-	} else {
-		unsigned char ec_byte;
-		ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1);
-		if (ret == 0)
-			*sci_value = ec_byte;
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(olpc_ec_sci_query);
-
 static bool __init check_ofw_architecture(struct device_node *root)
 {
 	const char *olpc_arch;
@@ -296,6 +216,10 @@ static bool __init platform_detect(void)
 	if (success) {
 		olpc_platform_info.boardrev = get_board_revision(root);
 		olpc_platform_info.flags |= OLPC_F_PRESENT;
+
+		pr_info("OLPC board revision %s%X\n",
+			((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
+			olpc_platform_info.boardrev >> 4);
 	}
 
 	of_node_put(root);
@@ -315,27 +239,8 @@ static int __init add_xo1_platform_devices(void)
 	return PTR_ERR_OR_ZERO(pdev);
 }
 
-static int olpc_xo1_ec_probe(struct platform_device *pdev)
-{
-	/* get the EC revision */
-	olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
-			(unsigned char *) &olpc_platform_info.ecver, 1);
-
-	/* EC version 0x5f adds support for wide SCI mask */
-	if (olpc_platform_info.ecver >= 0x5f)
-		olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI;
-
-	pr_info("OLPC board revision %s%X (EC=%x)\n",
-			((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
-			olpc_platform_info.boardrev >> 4,
-			olpc_platform_info.ecver);
-
-	return 0;
-}
 static int olpc_xo1_ec_suspend(struct platform_device *pdev)
 {
-	olpc_ec_mask_write(ec_wakeup_mask);
-
 	/*
 	 * Squelch SCIs while suspended.  This is a fix for
 	 * <http://dev.laptop.org/ticket/1835>.
@@ -359,15 +264,27 @@ static int olpc_xo1_ec_resume(struct platform_device *pdev)
 }
 
 static struct olpc_ec_driver ec_xo1_driver = {
-	.probe = olpc_xo1_ec_probe,
 	.suspend = olpc_xo1_ec_suspend,
 	.resume = olpc_xo1_ec_resume,
 	.ec_cmd = olpc_xo1_ec_cmd,
+#ifdef CONFIG_OLPC_XO1_SCI
+	/*
+	 * XO-1 EC wakeups are available when olpc-xo1-sci driver is
+	 * compiled in
+	 */
+	.wakeup_available = true,
+#endif
 };
 
 static struct olpc_ec_driver ec_xo1_5_driver = {
-	.probe = olpc_xo1_ec_probe,
 	.ec_cmd = olpc_xo1_ec_cmd,
+#ifdef CONFIG_OLPC_XO1_5_SCI
+	/*
+	 * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is
+	 * compiled in
+	 */
+	.wakeup_available = true,
+#endif
 };
 
 static int __init olpc_init(void)
diff --git a/drivers/platform/olpc/olpc-ec.c b/drivers/platform/olpc/olpc-ec.c
index 981955dce926..2a647455a368 100644
--- a/drivers/platform/olpc/olpc-ec.c
+++ b/drivers/platform/olpc/olpc-ec.c
@@ -32,6 +32,7 @@ struct ec_cmd_desc {
 
 struct olpc_ec_priv {
 	struct olpc_ec_driver *drv;
+	u8 version;
 	struct work_struct worker;
 	struct mutex cmd_lock;
 
@@ -41,6 +42,12 @@ struct olpc_ec_priv {
 
 	struct dentry *dbgfs_dir;
 
+	/*
+	 * EC event mask to be applied during suspend (defining wakeup
+	 * sources).
+	 */
+	u16 ec_wakeup_mask;
+
 	/*
 	 * Running an EC command while suspending means we don't always finish
 	 * the command before the machine suspends.  This means that the EC
@@ -149,6 +156,88 @@ int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen)
 }
 EXPORT_SYMBOL_GPL(olpc_ec_cmd);
 
+void olpc_ec_wakeup_set(u16 value)
+{
+	struct olpc_ec_priv *ec = ec_priv;
+
+	if (WARN_ON(!ec))
+		return;
+
+	ec->ec_wakeup_mask |= value;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set);
+
+void olpc_ec_wakeup_clear(u16 value)
+{
+	struct olpc_ec_priv *ec = ec_priv;
+
+	if (WARN_ON(!ec))
+		return;
+
+	ec->ec_wakeup_mask &= ~value;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear);
+
+int olpc_ec_mask_write(u16 bits)
+{
+	struct olpc_ec_priv *ec = ec_priv;
+
+	if (WARN_ON(!ec))
+		return -ENODEV;
+
+	/* EC version 0x5f adds support for wide SCI mask */
+	if (ec->version >= 0x5f) {
+		__be16 ec_word = cpu_to_be16(bits);
+
+		return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *)&ec_word, 2, NULL, 0);
+	} else {
+		u8 ec_byte = bits & 0xff;
+
+		return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0);
+	}
+}
+EXPORT_SYMBOL_GPL(olpc_ec_mask_write);
+
+/*
+ * Returns true if the compile and runtime configurations allow for EC events
+ * to wake the system.
+ */
+bool olpc_ec_wakeup_available(void)
+{
+	if (WARN_ON(!ec_driver))
+		return false;
+
+	return ec_driver->wakeup_available;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available);
+
+int olpc_ec_sci_query(u16 *sci_value)
+{
+	struct olpc_ec_priv *ec = ec_priv;
+	int ret;
+
+	if (WARN_ON(!ec))
+		return -ENODEV;
+
+	/* EC version 0x5f adds support for wide SCI mask */
+	if (ec->version >= 0x5f) {
+		__be16 ec_word;
+
+		ret = olpc_ec_cmd(EC_EXT_SCI_QUERY, NULL, 0, (void *)&ec_word, 2);
+		if (ret == 0)
+			*sci_value = be16_to_cpu(ec_word);
+	} else {
+		u8 ec_byte;
+
+		ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1);
+		if (ret == 0)
+			*sci_value = ec_byte;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_sci_query);
+
 #ifdef CONFIG_DEBUG_FS
 
 /*
@@ -276,14 +365,16 @@ static int olpc_ec_probe(struct platform_device *pdev)
 	ec_priv = ec;
 	platform_set_drvdata(pdev, ec);
 
-	err = ec_driver->probe ? ec_driver->probe(pdev) : 0;
+	/* get the EC revision */
+	err = olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, &ec->version, 1);
 	if (err) {
 		ec_priv = NULL;
 		kfree(ec);
-	} else {
-		ec->dbgfs_dir = olpc_ec_setup_debugfs();
+		return err;
 	}
 
+	ec->dbgfs_dir = olpc_ec_setup_debugfs();
+
 	return err;
 }
 
@@ -293,6 +384,8 @@ static int olpc_ec_suspend(struct device *dev)
 	struct olpc_ec_priv *ec = platform_get_drvdata(pdev);
 	int err = 0;
 
+	olpc_ec_mask_write(ec->ec_wakeup_mask);
+
 	if (ec_driver->suspend)
 		err = ec_driver->suspend(pdev);
 	if (!err)
diff --git a/drivers/power/supply/olpc_battery.c b/drivers/power/supply/olpc_battery.c
index 7720e4c2ac0b..066ec9a11153 100644
--- a/drivers/power/supply/olpc_battery.c
+++ b/drivers/power/supply/olpc_battery.c
@@ -20,7 +20,6 @@
 #include <linux/jiffies.h>
 #include <linux/sched.h>
 #include <linux/olpc-ec.h>
-#include <asm/olpc.h>
 
 
 #define EC_BAT_VOLTAGE	0x10	/* uint16_t,	*9.76/32,    mV   */
diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h
index 79bdc6328c52..7fa3d27f7fee 100644
--- a/include/linux/olpc-ec.h
+++ b/include/linux/olpc-ec.h
@@ -16,14 +16,28 @@
 #define EC_SCI_QUERY			0x84
 #define EC_EXT_SCI_QUERY		0x85
 
+/* SCI source values */
+#define EC_SCI_SRC_EMPTY        0x00
+#define EC_SCI_SRC_GAME         0x01
+#define EC_SCI_SRC_BATTERY      0x02
+#define EC_SCI_SRC_BATSOC       0x04
+#define EC_SCI_SRC_BATERR       0x08
+#define EC_SCI_SRC_EBOOK        0x10    /* XO-1 only */
+#define EC_SCI_SRC_WLAN         0x20    /* XO-1 only */
+#define EC_SCI_SRC_ACPWR        0x40
+#define EC_SCI_SRC_BATCRIT      0x80
+#define EC_SCI_SRC_GPWAKE       0x100   /* XO-1.5 only */
+#define EC_SCI_SRC_ALL          0x1FF
+
 struct platform_device;
 
 struct olpc_ec_driver {
-	int (*probe)(struct platform_device *);
 	int (*suspend)(struct platform_device *);
 	int (*resume)(struct platform_device *);
 
 	int (*ec_cmd)(u8, u8 *, size_t, u8 *, size_t, void *);
+
+	bool wakeup_available;
 };
 
 #ifdef CONFIG_OLPC
@@ -33,11 +47,27 @@ extern void olpc_ec_driver_register(struct olpc_ec_driver *drv, void *arg);
 extern int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf,
 		size_t outlen);
 
+extern void olpc_ec_wakeup_set(u16 value);
+extern void olpc_ec_wakeup_clear(u16 value);
+
+extern int olpc_ec_mask_write(u16 bits);
+extern int olpc_ec_sci_query(u16 *sci_value);
+
+extern bool olpc_ec_wakeup_available(void);
+
 #else
 
 static inline int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf,
 		size_t outlen) { return -ENODEV; }
 
+static inline void olpc_ec_wakeup_set(u16 value) { }
+static inline void olpc_ec_wakeup_clear(u16 value) { }
+
+static inline bool olpc_ec_wakeup_available(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_OLPC */
 
 #endif /* _LINUX_OLPC_EC_H */
-- 
cgit v1.2.3


From 8097548f3af9ec990169574ad9d874052b78bff8 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Mon, 13 May 2019 09:56:36 +0200
Subject: Platform: OLPC: Use BIT() and GENMASK() for event masks

Just a cosmetic tidy-up.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/olpc-ec.h | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h
index 7fa3d27f7fee..f7b6a7eda232 100644
--- a/include/linux/olpc-ec.h
+++ b/include/linux/olpc-ec.h
@@ -2,6 +2,8 @@
 #ifndef _LINUX_OLPC_EC_H
 #define _LINUX_OLPC_EC_H
 
+#include <linux/bits.h>
+
 /* XO-1 EC commands */
 #define EC_FIRMWARE_REV			0x08
 #define EC_WRITE_SCI_MASK		0x1b
@@ -17,17 +19,16 @@
 #define EC_EXT_SCI_QUERY		0x85
 
 /* SCI source values */
-#define EC_SCI_SRC_EMPTY        0x00
-#define EC_SCI_SRC_GAME         0x01
-#define EC_SCI_SRC_BATTERY      0x02
-#define EC_SCI_SRC_BATSOC       0x04
-#define EC_SCI_SRC_BATERR       0x08
-#define EC_SCI_SRC_EBOOK        0x10    /* XO-1 only */
-#define EC_SCI_SRC_WLAN         0x20    /* XO-1 only */
-#define EC_SCI_SRC_ACPWR        0x40
-#define EC_SCI_SRC_BATCRIT      0x80
-#define EC_SCI_SRC_GPWAKE       0x100   /* XO-1.5 only */
-#define EC_SCI_SRC_ALL          0x1FF
+#define EC_SCI_SRC_GAME         BIT(0)
+#define EC_SCI_SRC_BATTERY      BIT(1)
+#define EC_SCI_SRC_BATSOC       BIT(2)
+#define EC_SCI_SRC_BATERR       BIT(3)
+#define EC_SCI_SRC_EBOOK        BIT(4)    /* XO-1 only */
+#define EC_SCI_SRC_WLAN         BIT(5)    /* XO-1 only */
+#define EC_SCI_SRC_ACPWR        BIT(6)
+#define EC_SCI_SRC_BATCRIT      BIT(7)
+#define EC_SCI_SRC_GPWAKE       BIT(8)   /* XO-1.5 only */
+#define EC_SCI_SRC_ALL          GENMASK(8, 0)
 
 struct platform_device;
 
-- 
cgit v1.2.3


From 0c3d931b3ab9efeea4948b5373c62095449d0101 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Mon, 13 May 2019 09:56:37 +0200
Subject: Platform: OLPC: Add XO-1.75 EC driver

It's based off the driver from the OLPC kernel sources. Somewhat
modernized and cleaned up, for better or worse.

Modified to plug into the olpc-ec driver infrastructure (so that battery
interface and debugfs could be reused) and the SPI slave framework.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 arch/x86/Kconfig                      |   1 +
 drivers/platform/Kconfig              |   2 +
 drivers/platform/Makefile             |   2 +-
 drivers/platform/olpc/Kconfig         |  14 +
 drivers/platform/olpc/Makefile        |   3 +-
 drivers/platform/olpc/olpc-xo175-ec.c | 752 ++++++++++++++++++++++++++++++++++
 include/linux/olpc-ec.h               |   4 +-
 7 files changed, 774 insertions(+), 4 deletions(-)
 create mode 100644 drivers/platform/olpc/Kconfig
 create mode 100644 drivers/platform/olpc/olpc-xo175-ec.c

(limited to 'include')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2bbbd4d1ba31..cb1c073b3c7e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2698,6 +2698,7 @@ config OLPC
 	select OF
 	select OF_PROMTREE
 	select IRQ_DOMAIN
+	select OLPC_EC
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
diff --git a/drivers/platform/Kconfig b/drivers/platform/Kconfig
index d4c2e424a700..4313d73d3618 100644
--- a/drivers/platform/Kconfig
+++ b/drivers/platform/Kconfig
@@ -10,3 +10,5 @@ source "drivers/platform/goldfish/Kconfig"
 source "drivers/platform/chrome/Kconfig"
 
 source "drivers/platform/mellanox/Kconfig"
+
+source "drivers/platform/olpc/Kconfig"
diff --git a/drivers/platform/Makefile b/drivers/platform/Makefile
index 4b2ce58bcd9c..6fda58c021ca 100644
--- a/drivers/platform/Makefile
+++ b/drivers/platform/Makefile
@@ -6,6 +6,6 @@
 obj-$(CONFIG_X86)		+= x86/
 obj-$(CONFIG_MELLANOX_PLATFORM)	+= mellanox/
 obj-$(CONFIG_MIPS)		+= mips/
-obj-$(CONFIG_OLPC)		+= olpc/
+obj-$(CONFIG_OLPC_EC)		+= olpc/
 obj-$(CONFIG_GOLDFISH)		+= goldfish/
 obj-$(CONFIG_CHROME_PLATFORMS)	+= chrome/
diff --git a/drivers/platform/olpc/Kconfig b/drivers/platform/olpc/Kconfig
new file mode 100644
index 000000000000..559f843199d7
--- /dev/null
+++ b/drivers/platform/olpc/Kconfig
@@ -0,0 +1,14 @@
+config OLPC_EC
+	bool
+
+config OLPC_XO175_EC
+	tristate "OLPC XO 1.75 Embedded Controller"
+	depends on ARCH_MMP || COMPILE_TEST
+	select SPI_SLAVE
+	select OLPC_EC
+	help
+	  Include support for the OLPC XO Embedded Controller (EC). The EC
+	  provides various platform services, including support for the power,
+	  button, restart, shutdown and battery charging status.
+
+	  Unless you have an OLPC XO laptop, you will want to say N.
diff --git a/drivers/platform/olpc/Makefile b/drivers/platform/olpc/Makefile
index dc8b26bc7209..01fe6ba01665 100644
--- a/drivers/platform/olpc/Makefile
+++ b/drivers/platform/olpc/Makefile
@@ -1,4 +1,5 @@
 #
 # OLPC XO platform-specific drivers
 #
-obj-$(CONFIG_OLPC)		+= olpc-ec.o
+obj-$(CONFIG_OLPC_EC)		+= olpc-ec.o
+obj-$(CONFIG_OLPC_XO175_EC)	+= olpc-xo175-ec.o
diff --git a/drivers/platform/olpc/olpc-xo175-ec.c b/drivers/platform/olpc/olpc-xo175-ec.c
new file mode 100644
index 000000000000..344d14f3da54
--- /dev/null
+++ b/drivers/platform/olpc/olpc-xo175-ec.c
@@ -0,0 +1,752 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Driver for the OLPC XO-1.75 Embedded Controller.
+ *
+ * The EC protocol is documented at:
+ * http://wiki.laptop.org/go/XO_1.75_HOST_to_EC_Protocol
+ *
+ * Copyright (C) 2010 One Laptop per Child Foundation.
+ * Copyright (C) 2018 Lubomir Rintel <lkundrak@v3.sk>
+ */
+
+#include <linux/completion.h>
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/input.h>
+#include <linux/kfifo.h>
+#include <linux/module.h>
+#include <linux/olpc-ec.h>
+#include <linux/platform_device.h>
+#include <linux/power_supply.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/spi/spi.h>
+
+struct ec_cmd_t {
+	u8 cmd;
+	u8 bytes_returned;
+};
+
+enum ec_chan_t {
+	CHAN_NONE = 0,
+	CHAN_SWITCH,
+	CHAN_CMD_RESP,
+	CHAN_KEYBOARD,
+	CHAN_TOUCHPAD,
+	CHAN_EVENT,
+	CHAN_DEBUG,
+	CHAN_CMD_ERROR,
+};
+
+/*
+ * EC events
+ */
+#define EVENT_AC_CHANGE			1  /* AC plugged/unplugged */
+#define EVENT_BATTERY_STATUS		2  /* Battery low/full/error/gone */
+#define EVENT_BATTERY_CRITICAL		3  /* Battery critical voltage */
+#define EVENT_BATTERY_SOC_CHANGE	4  /* 1% SOC Change */
+#define EVENT_BATTERY_ERROR		5  /* Abnormal error, query for cause */
+#define EVENT_POWER_PRESSED		6  /* Power button was pressed */
+#define EVENT_POWER_PRESS_WAKE		7  /* Woken up with a power button */
+#define EVENT_TIMED_HOST_WAKE		8  /* Host wake timer */
+#define EVENT_OLS_HIGH_LIMIT		9  /* OLS crossed dark threshold */
+#define EVENT_OLS_LOW_LIMIT		10 /* OLS crossed light threshold */
+
+/*
+ * EC commands
+ * (from http://dev.laptop.org/git/users/rsmith/ec-1.75/tree/ec_cmd.h)
+ */
+#define CMD_GET_API_VERSION		0x08 /* out: u8 */
+#define CMD_READ_VOLTAGE		0x10 /* out: u16, *9.76/32, mV */
+#define CMD_READ_CURRENT		0x11 /* out: s16, *15.625/120, mA */
+#define CMD_READ_ACR			0x12 /* out: s16, *6250/15, uAh */
+#define CMD_READ_BATT_TEMPERATURE	0x13 /* out: u16, *100/256, deg C */
+#define CMD_READ_AMBIENT_TEMPERATURE	0x14 /* unimplemented, no hardware */
+#define CMD_READ_BATTERY_STATUS		0x15 /* out: u8, bitmask */
+#define CMD_READ_SOC			0x16 /* out: u8, percentage */
+#define CMD_READ_GAUGE_ID		0x17 /* out: u8 * 8 */
+#define CMD_READ_GAUGE_DATA		0x18 /* in: u8 addr, out: u8 data */
+#define CMD_READ_BOARD_ID		0x19 /* out: u16 (platform id) */
+#define CMD_READ_BATT_ERR_CODE		0x1f /* out: u8, error bitmask */
+#define CMD_SET_DCON_POWER		0x26 /* in: u8 */
+#define CMD_RESET_EC			0x28 /* none */
+#define CMD_READ_BATTERY_TYPE		0x2c /* out: u8 */
+#define CMD_SET_AUTOWAK			0x33 /* out: u8 */
+#define CMD_SET_EC_WAKEUP_TIMER		0x36 /* in: u32, out: ? */
+#define CMD_READ_EXT_SCI_MASK		0x37 /* ? */
+#define CMD_WRITE_EXT_SCI_MASK		0x38 /* ? */
+#define CMD_CLEAR_EC_WAKEUP_TIMER	0x39 /* none */
+#define CMD_ENABLE_RUNIN_DISCHARGE	0x3B /* none */
+#define CMD_DISABLE_RUNIN_DISCHARGE	0x3C /* none */
+#define CMD_READ_MPPT_ACTIVE		0x3d /* out: u8 */
+#define CMD_READ_MPPT_LIMIT		0x3e /* out: u8 */
+#define CMD_SET_MPPT_LIMIT		0x3f /* in: u8 */
+#define CMD_DISABLE_MPPT		0x40 /* none */
+#define CMD_ENABLE_MPPT			0x41 /* none */
+#define CMD_READ_VIN			0x42 /* out: u16 */
+#define CMD_EXT_SCI_QUERY		0x43 /* ? */
+#define RSP_KEYBOARD_DATA		0x48 /* ? */
+#define RSP_TOUCHPAD_DATA		0x49 /* ? */
+#define CMD_GET_FW_VERSION		0x4a /* out: u8 * 16 */
+#define CMD_POWER_CYCLE			0x4b /* none */
+#define CMD_POWER_OFF			0x4c /* none */
+#define CMD_RESET_EC_SOFT		0x4d /* none */
+#define CMD_READ_GAUGE_U16		0x4e /* ? */
+#define CMD_ENABLE_MOUSE		0x4f /* ? */
+#define CMD_ECHO			0x52 /* in: u8 * 5, out: u8 * 5 */
+#define CMD_GET_FW_DATE			0x53 /* out: u8 * 16 */
+#define CMD_GET_FW_USER			0x54 /* out: u8 * 16 */
+#define CMD_TURN_OFF_POWER		0x55 /* none (same as 0x4c) */
+#define CMD_READ_OLS			0x56 /* out: u16 */
+#define CMD_OLS_SMT_LEDON		0x57 /* none */
+#define CMD_OLS_SMT_LEDOFF		0x58 /* none */
+#define CMD_START_OLS_ASSY		0x59 /* none */
+#define CMD_STOP_OLS_ASSY		0x5a /* none */
+#define CMD_OLS_SMTTEST_STOP		0x5b /* none */
+#define CMD_READ_VIN_SCALED		0x5c /* out: u16 */
+#define CMD_READ_BAT_MIN_W		0x5d /* out: u16 */
+#define CMD_READ_BAR_MAX_W		0x5e /* out: u16 */
+#define CMD_RESET_BAT_MINMAX_W		0x5f /* none */
+#define CMD_READ_LOCATION		0x60 /* in: u16 addr, out: u8 data */
+#define CMD_WRITE_LOCATION		0x61 /* in: u16 addr, u8 data */
+#define CMD_KEYBOARD_CMD		0x62 /* in: u8, out: ? */
+#define CMD_TOUCHPAD_CMD		0x63 /* in: u8, out: ? */
+#define CMD_GET_FW_HASH			0x64 /* out: u8 * 16 */
+#define CMD_SUSPEND_HINT		0x65 /* in: u8 */
+#define CMD_ENABLE_WAKE_TIMER		0x66 /* in: u8 */
+#define CMD_SET_WAKE_TIMER		0x67 /* in: 32 */
+#define CMD_ENABLE_WAKE_AUTORESET	0x68 /* in: u8 */
+#define CMD_OLS_SET_LIMITS		0x69 /* in: u16, u16 */
+#define CMD_OLS_GET_LIMITS		0x6a /* out: u16, u16 */
+#define CMD_OLS_SET_CEILING		0x6b /* in: u16 */
+#define CMD_OLS_GET_CEILING		0x6c /* out: u16 */
+
+/*
+ * Accepted EC commands, and how many bytes they return. There are plenty
+ * of EC commands that are no longer implemented, or are implemented only on
+ * certain older boards.
+ */
+static const struct ec_cmd_t olpc_xo175_ec_cmds[] = {
+	{ CMD_GET_API_VERSION, 1 },
+	{ CMD_READ_VOLTAGE, 2 },
+	{ CMD_READ_CURRENT, 2 },
+	{ CMD_READ_ACR, 2 },
+	{ CMD_READ_BATT_TEMPERATURE, 2 },
+	{ CMD_READ_BATTERY_STATUS, 1 },
+	{ CMD_READ_SOC, 1 },
+	{ CMD_READ_GAUGE_ID, 8 },
+	{ CMD_READ_GAUGE_DATA, 1 },
+	{ CMD_READ_BOARD_ID, 2 },
+	{ CMD_READ_BATT_ERR_CODE, 1 },
+	{ CMD_SET_DCON_POWER, 0 },
+	{ CMD_RESET_EC, 0 },
+	{ CMD_READ_BATTERY_TYPE, 1 },
+	{ CMD_ENABLE_RUNIN_DISCHARGE, 0 },
+	{ CMD_DISABLE_RUNIN_DISCHARGE, 0 },
+	{ CMD_READ_MPPT_ACTIVE, 1 },
+	{ CMD_READ_MPPT_LIMIT, 1 },
+	{ CMD_SET_MPPT_LIMIT, 0 },
+	{ CMD_DISABLE_MPPT, 0 },
+	{ CMD_ENABLE_MPPT, 0 },
+	{ CMD_READ_VIN, 2 },
+	{ CMD_GET_FW_VERSION, 16 },
+	{ CMD_POWER_CYCLE, 0 },
+	{ CMD_POWER_OFF, 0 },
+	{ CMD_RESET_EC_SOFT, 0 },
+	{ CMD_ECHO, 5 },
+	{ CMD_GET_FW_DATE, 16 },
+	{ CMD_GET_FW_USER, 16 },
+	{ CMD_TURN_OFF_POWER, 0 },
+	{ CMD_READ_OLS, 2 },
+	{ CMD_OLS_SMT_LEDON, 0 },
+	{ CMD_OLS_SMT_LEDOFF, 0 },
+	{ CMD_START_OLS_ASSY, 0 },
+	{ CMD_STOP_OLS_ASSY, 0 },
+	{ CMD_OLS_SMTTEST_STOP, 0 },
+	{ CMD_READ_VIN_SCALED, 2 },
+	{ CMD_READ_BAT_MIN_W, 2 },
+	{ CMD_READ_BAR_MAX_W, 2 },
+	{ CMD_RESET_BAT_MINMAX_W, 0 },
+	{ CMD_READ_LOCATION, 1 },
+	{ CMD_WRITE_LOCATION, 0 },
+	{ CMD_GET_FW_HASH, 16 },
+	{ CMD_SUSPEND_HINT, 0 },
+	{ CMD_ENABLE_WAKE_TIMER, 0 },
+	{ CMD_SET_WAKE_TIMER, 0 },
+	{ CMD_ENABLE_WAKE_AUTORESET, 0 },
+	{ CMD_OLS_SET_LIMITS, 0 },
+	{ CMD_OLS_GET_LIMITS, 4 },
+	{ CMD_OLS_SET_CEILING, 0 },
+	{ CMD_OLS_GET_CEILING, 2 },
+	{ CMD_READ_EXT_SCI_MASK, 2 },
+	{ CMD_WRITE_EXT_SCI_MASK, 0 },
+
+	{ }
+};
+
+#define EC_MAX_CMD_DATA_LEN	5
+#define EC_MAX_RESP_LEN		16
+
+#define LOG_BUF_SIZE		128
+
+#define PM_WAKEUP_TIME		1000
+
+#define EC_ALL_EVENTS		GENMASK(15, 0)
+
+enum ec_state_t {
+	CMD_STATE_IDLE = 0,
+	CMD_STATE_WAITING_FOR_SWITCH,
+	CMD_STATE_CMD_IN_TX_FIFO,
+	CMD_STATE_CMD_SENT,
+	CMD_STATE_RESP_RECEIVED,
+	CMD_STATE_ERROR_RECEIVED,
+};
+
+struct olpc_xo175_ec_cmd {
+	u8 command;
+	u8 nr_args;
+	u8 data_len;
+	u8 args[EC_MAX_CMD_DATA_LEN];
+};
+
+struct olpc_xo175_ec_resp {
+	u8 channel;
+	u8 byte;
+};
+
+struct olpc_xo175_ec {
+	bool suspended;
+
+	/* SPI related stuff. */
+	struct spi_device *spi;
+	struct spi_transfer xfer;
+	struct spi_message msg;
+	union {
+		struct olpc_xo175_ec_cmd cmd;
+		struct olpc_xo175_ec_resp resp;
+	} tx_buf, rx_buf;
+
+	/* GPIO for the CMD signals. */
+	struct gpio_desc *gpio_cmd;
+
+	/* Command handling related state. */
+	spinlock_t cmd_state_lock;
+	int cmd_state;
+	bool cmd_running;
+	struct completion cmd_done;
+	struct olpc_xo175_ec_cmd cmd;
+	u8 resp_data[EC_MAX_RESP_LEN];
+	int expected_resp_len;
+	int resp_len;
+
+	/* Power button. */
+	struct input_dev *pwrbtn;
+
+	/* Debug handling. */
+	char logbuf[LOG_BUF_SIZE];
+	int logbuf_len;
+};
+
+static struct platform_device *olpc_ec;
+
+static int olpc_xo175_ec_resp_len(u8 cmd)
+{
+	const struct ec_cmd_t *p;
+
+	for (p = olpc_xo175_ec_cmds; p->cmd; p++) {
+		if (p->cmd == cmd)
+			return p->bytes_returned;
+	}
+
+	return -EINVAL;
+}
+
+static void olpc_xo175_ec_flush_logbuf(struct olpc_xo175_ec *priv)
+{
+	dev_dbg(&priv->spi->dev, "got debug string [%*pE]\n",
+				priv->logbuf_len, priv->logbuf);
+	priv->logbuf_len = 0;
+}
+
+static void olpc_xo175_ec_complete(void *arg);
+
+static void olpc_xo175_ec_send_command(struct olpc_xo175_ec *priv, void *cmd,
+								size_t cmdlen)
+{
+	int ret;
+
+	memcpy(&priv->tx_buf, cmd, cmdlen);
+	priv->xfer.len = cmdlen;
+
+	spi_message_init_with_transfers(&priv->msg, &priv->xfer, 1);
+
+	priv->msg.complete = olpc_xo175_ec_complete;
+	priv->msg.context = priv;
+
+	ret = spi_async(priv->spi, &priv->msg);
+	if (ret)
+		dev_err(&priv->spi->dev, "spi_async() failed %d\n", ret);
+}
+
+static void olpc_xo175_ec_read_packet(struct olpc_xo175_ec *priv)
+{
+	u8 nonce[] = {0xA5, 0x5A};
+
+	olpc_xo175_ec_send_command(priv, nonce, sizeof(nonce));
+}
+
+static void olpc_xo175_ec_complete(void *arg)
+{
+	struct olpc_xo175_ec *priv = arg;
+	struct device *dev = &priv->spi->dev;
+	struct power_supply *psy;
+	unsigned long flags;
+	u8 channel;
+	u8 byte;
+	int ret;
+
+	ret = priv->msg.status;
+	if (ret) {
+		dev_err(dev, "SPI transfer failed: %d\n", ret);
+
+		spin_lock_irqsave(&priv->cmd_state_lock, flags);
+		if (priv->cmd_running) {
+			priv->resp_len = 0;
+			priv->cmd_state = CMD_STATE_ERROR_RECEIVED;
+			complete(&priv->cmd_done);
+		}
+		spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+
+		if (ret != -EINTR)
+			olpc_xo175_ec_read_packet(priv);
+
+		return;
+	}
+
+	channel = priv->rx_buf.resp.channel;
+	byte = priv->rx_buf.resp.byte;
+
+	switch (channel) {
+	case CHAN_NONE:
+		spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+		if (!priv->cmd_running) {
+			/* We can safely ignore these */
+			dev_err(dev, "spurious FIFO read packet\n");
+			spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+			return;
+		}
+
+		priv->cmd_state = CMD_STATE_CMD_SENT;
+		if (!priv->expected_resp_len)
+			complete(&priv->cmd_done);
+		olpc_xo175_ec_read_packet(priv);
+
+		spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+		return;
+
+	case CHAN_SWITCH:
+		spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+		if (!priv->cmd_running) {
+			/* Just go with the flow */
+			dev_err(dev, "spurious SWITCH packet\n");
+			memset(&priv->cmd, 0, sizeof(priv->cmd));
+			priv->cmd.command = CMD_ECHO;
+		}
+
+		priv->cmd_state = CMD_STATE_CMD_IN_TX_FIFO;
+
+		/* Throw command into TxFIFO */
+		gpiod_set_value_cansleep(priv->gpio_cmd, 0);
+		olpc_xo175_ec_send_command(priv, &priv->cmd, sizeof(priv->cmd));
+
+		spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+		return;
+
+	case CHAN_CMD_RESP:
+		spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+		if (!priv->cmd_running) {
+			dev_err(dev, "spurious response packet\n");
+		} else if (priv->resp_len >= priv->expected_resp_len) {
+			dev_err(dev, "too many response packets\n");
+		} else {
+			priv->resp_data[priv->resp_len++] = byte;
+			if (priv->resp_len == priv->expected_resp_len) {
+				priv->cmd_state = CMD_STATE_RESP_RECEIVED;
+				complete(&priv->cmd_done);
+			}
+		}
+
+		spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+		break;
+
+	case CHAN_CMD_ERROR:
+		spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+		if (!priv->cmd_running) {
+			dev_err(dev, "spurious cmd error packet\n");
+		} else {
+			priv->resp_data[0] = byte;
+			priv->resp_len = 1;
+			priv->cmd_state = CMD_STATE_ERROR_RECEIVED;
+			complete(&priv->cmd_done);
+		}
+		spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+		break;
+
+	case CHAN_KEYBOARD:
+		dev_warn(dev, "keyboard is not supported\n");
+		break;
+
+	case CHAN_TOUCHPAD:
+		dev_warn(dev, "touchpad is not supported\n");
+		break;
+
+	case CHAN_EVENT:
+		dev_dbg(dev, "got event %.2x\n", byte);
+		switch (byte) {
+		case EVENT_AC_CHANGE:
+			psy = power_supply_get_by_name("olpc-ac");
+			if (psy) {
+				power_supply_changed(psy);
+				power_supply_put(psy);
+			}
+			break;
+		case EVENT_BATTERY_STATUS:
+		case EVENT_BATTERY_CRITICAL:
+		case EVENT_BATTERY_SOC_CHANGE:
+		case EVENT_BATTERY_ERROR:
+			psy = power_supply_get_by_name("olpc-battery");
+			if (psy) {
+				power_supply_changed(psy);
+				power_supply_put(psy);
+			}
+			break;
+		case EVENT_POWER_PRESSED:
+			input_report_key(priv->pwrbtn, KEY_POWER, 1);
+			input_sync(priv->pwrbtn);
+			input_report_key(priv->pwrbtn, KEY_POWER, 0);
+			input_sync(priv->pwrbtn);
+			/* fall through */
+		case EVENT_POWER_PRESS_WAKE:
+		case EVENT_TIMED_HOST_WAKE:
+			pm_wakeup_event(priv->pwrbtn->dev.parent,
+						PM_WAKEUP_TIME);
+			break;
+		default:
+			dev_dbg(dev, "ignored unknown event %.2x\n", byte);
+			break;
+		}
+		break;
+
+	case CHAN_DEBUG:
+		if (byte == '\n') {
+			olpc_xo175_ec_flush_logbuf(priv);
+		} else if (isprint(byte)) {
+			priv->logbuf[priv->logbuf_len++] = byte;
+			if (priv->logbuf_len == LOG_BUF_SIZE)
+				olpc_xo175_ec_flush_logbuf(priv);
+		}
+		break;
+
+	default:
+		dev_warn(dev, "unknown channel: %d, %.2x\n", channel, byte);
+		break;
+	}
+
+	/* Most non-command packets get the TxFIFO refilled and an ACK. */
+	olpc_xo175_ec_read_packet(priv);
+}
+
+/*
+ * This function is protected with a mutex. We can safely assume that
+ * there will be only one instance of this function running at a time.
+ * One of the ways in which we enforce this is by waiting until we get
+ * all response bytes back from the EC, rather than just the number that
+ * the caller requests (otherwise, we might start a new command while an
+ * old command's response bytes are still incoming).
+ */
+static int olpc_xo175_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *resp,
+					size_t resp_len, void *ec_cb_arg)
+{
+	struct olpc_xo175_ec *priv = ec_cb_arg;
+	struct device *dev = &priv->spi->dev;
+	unsigned long flags;
+	size_t nr_bytes;
+	int ret = 0;
+
+	dev_dbg(dev, "CMD %x, %zd bytes expected\n", cmd, resp_len);
+
+	if (inlen > 5) {
+		dev_err(dev, "command len %zd too big!\n", resp_len);
+		return -EOVERFLOW;
+	}
+
+	/* Suspending in the middle of an EC command hoses things badly! */
+	if (WARN_ON(priv->suspended))
+		return -EBUSY;
+
+	/* Ensure a valid command and return bytes */
+	ret = olpc_xo175_ec_resp_len(cmd);
+	if (ret < 0) {
+		dev_err_ratelimited(dev, "unknown command 0x%x\n", cmd);
+
+		/*
+		 * Assume the best in our callers, and allow unknown commands
+		 * through. I'm not the charitable type, but it was beaten
+		 * into me. Just maintain a minimum standard of sanity.
+		 */
+		if (resp_len > sizeof(priv->resp_data)) {
+			dev_err(dev, "response too big: %zd!\n", resp_len);
+			return -EOVERFLOW;
+		}
+		nr_bytes = resp_len;
+	} else {
+		nr_bytes = (size_t)ret;
+	}
+	resp_len = min(resp_len, nr_bytes);
+
+	spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+	/* Initialize the state machine */
+	init_completion(&priv->cmd_done);
+	priv->cmd_running = true;
+	priv->cmd_state = CMD_STATE_WAITING_FOR_SWITCH;
+	memset(&priv->cmd, 0, sizeof(priv->cmd));
+	priv->cmd.command = cmd;
+	priv->cmd.nr_args = inlen;
+	priv->cmd.data_len = 0;
+	memcpy(priv->cmd.args, inbuf, inlen);
+	priv->expected_resp_len = nr_bytes;
+	priv->resp_len = 0;
+
+	/* Tickle the cmd gpio to get things started */
+	gpiod_set_value_cansleep(priv->gpio_cmd, 1);
+
+	spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+
+	/* The irq handler should do the rest */
+	if (!wait_for_completion_timeout(&priv->cmd_done,
+			msecs_to_jiffies(4000))) {
+		dev_err(dev, "EC cmd error: timeout in STATE %d\n",
+				priv->cmd_state);
+		gpiod_set_value_cansleep(priv->gpio_cmd, 0);
+		spi_slave_abort(priv->spi);
+		olpc_xo175_ec_read_packet(priv);
+		return -ETIMEDOUT;
+	}
+
+	spin_lock_irqsave(&priv->cmd_state_lock, flags);
+
+	/* Deal with the results. */
+	if (priv->cmd_state == CMD_STATE_ERROR_RECEIVED) {
+		/* EC-provided error is in the single response byte */
+		dev_err(dev, "command 0x%x returned error 0x%x\n",
+						cmd, priv->resp_data[0]);
+		ret = -EREMOTEIO;
+	} else if (priv->resp_len != nr_bytes) {
+		dev_err(dev, "command 0x%x returned %d bytes, expected %zd bytes\n",
+						cmd, priv->resp_len, nr_bytes);
+		ret = -EREMOTEIO;
+	} else {
+		/*
+		 * We may have 8 bytes in priv->resp, but we only care about
+		 * what we've been asked for. If the caller asked for only 2
+		 * bytes, give them that. We've guaranteed that
+		 * resp_len <= priv->resp_len and priv->resp_len == nr_bytes.
+		 */
+		memcpy(resp, priv->resp_data, resp_len);
+	}
+
+	/* This should already be low, but just in case. */
+	gpiod_set_value_cansleep(priv->gpio_cmd, 0);
+	priv->cmd_running = false;
+
+	spin_unlock_irqrestore(&priv->cmd_state_lock, flags);
+
+	return ret;
+}
+
+static int olpc_xo175_ec_set_event_mask(unsigned int mask)
+{
+	u8 args[2];
+
+	args[0] = mask >> 0;
+	args[1] = mask >> 8;
+	return olpc_ec_cmd(CMD_WRITE_EXT_SCI_MASK, args, 2, NULL, 0);
+}
+
+static void olpc_xo175_ec_power_off(void)
+{
+	while (1) {
+		olpc_ec_cmd(CMD_POWER_OFF, NULL, 0, NULL, 0);
+		mdelay(1000);
+	}
+}
+
+static int __maybe_unused olpc_xo175_ec_suspend(struct device *dev)
+{
+	struct olpc_xo175_ec *priv = dev_get_drvdata(dev);
+	static struct {
+		u8 suspend;
+		u32 suspend_count;
+	} __packed hintargs;
+	static unsigned int suspend_count;
+
+	/*
+	 * SOC_SLEEP is not wired to the EC on B3 and earlier boards.
+	 * This command lets the EC know instead. The suspend count doesn't seem
+	 * to be used anywhere but in the EC debug output.
+	 */
+	hintargs.suspend = 1;
+	hintargs.suspend_count = suspend_count++;
+	olpc_ec_cmd(CMD_SUSPEND_HINT, (void *)&hintargs, sizeof(hintargs),
+								NULL, 0);
+
+	/*
+	 * After we've sent the suspend hint, don't allow further EC commands
+	 * to be run until we've resumed. Userspace tasks should be frozen,
+	 * but kernel threads and interrupts could still schedule EC commands.
+	 */
+	priv->suspended = true;
+
+	return 0;
+}
+
+static int __maybe_unused olpc_xo175_ec_resume_noirq(struct device *dev)
+{
+	struct olpc_xo175_ec *priv = dev_get_drvdata(dev);
+
+	priv->suspended = false;
+
+	return 0;
+}
+
+static int __maybe_unused olpc_xo175_ec_resume(struct device *dev)
+{
+	u8 x = 0;
+
+	/*
+	 * The resume hint is only needed if no other commands are
+	 * being sent during resume. all it does is tell the EC
+	 * the SoC is definitely awake.
+	 */
+	olpc_ec_cmd(CMD_SUSPEND_HINT, &x, 1, NULL, 0);
+
+	/* Enable all EC events while we're awake */
+	olpc_xo175_ec_set_event_mask(EC_ALL_EVENTS);
+
+	return 0;
+}
+
+static struct olpc_ec_driver olpc_xo175_ec_driver = {
+	.ec_cmd = olpc_xo175_ec_cmd,
+};
+
+static int olpc_xo175_ec_remove(struct spi_device *spi)
+{
+	if (pm_power_off == olpc_xo175_ec_power_off)
+		pm_power_off = NULL;
+
+	spi_slave_abort(spi);
+
+	platform_device_unregister(olpc_ec);
+	olpc_ec = NULL;
+
+	return 0;
+}
+
+static int olpc_xo175_ec_probe(struct spi_device *spi)
+{
+	struct olpc_xo175_ec *priv;
+	int ret;
+
+	if (olpc_ec) {
+		dev_err(&spi->dev, "OLPC EC already registered.\n");
+		return -EBUSY;
+	}
+
+	priv = devm_kzalloc(&spi->dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->gpio_cmd = devm_gpiod_get(&spi->dev, "cmd", GPIOD_OUT_LOW);
+	if (IS_ERR(priv->gpio_cmd)) {
+		dev_err(&spi->dev, "failed to get cmd gpio: %ld\n",
+					PTR_ERR(priv->gpio_cmd));
+		return PTR_ERR(priv->gpio_cmd);
+	}
+
+	priv->spi = spi;
+
+	spin_lock_init(&priv->cmd_state_lock);
+	priv->cmd_state = CMD_STATE_IDLE;
+	init_completion(&priv->cmd_done);
+
+	priv->logbuf_len = 0;
+
+	/* Set up power button input device */
+	priv->pwrbtn = devm_input_allocate_device(&spi->dev);
+	if (!priv->pwrbtn)
+		return -ENOMEM;
+	priv->pwrbtn->name = "Power Button";
+	priv->pwrbtn->dev.parent = &spi->dev;
+	input_set_capability(priv->pwrbtn, EV_KEY, KEY_POWER);
+	ret = input_register_device(priv->pwrbtn);
+	if (ret) {
+		dev_err(&spi->dev, "error registering input device: %d\n", ret);
+		return ret;
+	}
+
+	spi_set_drvdata(spi, priv);
+
+	priv->xfer.rx_buf = &priv->rx_buf;
+	priv->xfer.tx_buf = &priv->tx_buf;
+
+	olpc_xo175_ec_read_packet(priv);
+
+	olpc_ec_driver_register(&olpc_xo175_ec_driver, priv);
+	olpc_ec = platform_device_register_resndata(&spi->dev, "olpc-ec", -1,
+							NULL, 0, NULL, 0);
+
+	/* Enable all EC events while we're awake */
+	olpc_xo175_ec_set_event_mask(EC_ALL_EVENTS);
+
+	if (pm_power_off == NULL)
+		pm_power_off = olpc_xo175_ec_power_off;
+
+	dev_info(&spi->dev, "OLPC XO-1.75 Embedded Controller driver\n");
+
+	return 0;
+}
+
+static const struct dev_pm_ops olpc_xo175_ec_pm_ops = {
+	SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, olpc_xo175_ec_resume_noirq)
+	SET_RUNTIME_PM_OPS(olpc_xo175_ec_suspend, olpc_xo175_ec_resume, NULL)
+};
+
+static const struct of_device_id olpc_xo175_ec_of_match[] = {
+	{ .compatible = "olpc,xo1.75-ec" },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, olpc_xo175_ec_of_match);
+
+static struct spi_driver olpc_xo175_ec_spi_driver = {
+	.driver = {
+		.name	= "olpc-xo175-ec",
+		.of_match_table = olpc_xo175_ec_of_match,
+		.pm = &olpc_xo175_ec_pm_ops,
+	},
+	.probe		= olpc_xo175_ec_probe,
+	.remove		= olpc_xo175_ec_remove,
+};
+module_spi_driver(olpc_xo175_ec_spi_driver);
+
+MODULE_DESCRIPTION("OLPC XO-1.75 Embedded Controller driver");
+MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>"); /* Functionality */
+MODULE_AUTHOR("Lubomir Rintel <lkundrak@v3.sk>"); /* Bugs */
+MODULE_LICENSE("GPL");
diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h
index f7b6a7eda232..c4602364e909 100644
--- a/include/linux/olpc-ec.h
+++ b/include/linux/olpc-ec.h
@@ -41,7 +41,7 @@ struct olpc_ec_driver {
 	bool wakeup_available;
 };
 
-#ifdef CONFIG_OLPC
+#ifdef CONFIG_OLPC_EC
 
 extern void olpc_ec_driver_register(struct olpc_ec_driver *drv, void *arg);
 
@@ -69,6 +69,6 @@ static inline bool olpc_ec_wakeup_available(void)
 	return false;
 }
 
-#endif /* CONFIG_OLPC */
+#endif /* CONFIG_OLPC_EC */
 
 #endif /* _LINUX_OLPC_EC_H */
-- 
cgit v1.2.3


From 5d1d046e2868fc876a69231eb2f24f000b521f1c Mon Sep 17 00:00:00 2001
From: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Date: Sat, 27 Apr 2019 10:10:24 +0300
Subject: soc: fsl: qbman_portals: add APIs to retrieve the probing status

Add a couple of new APIs to check the probing status of the required
cpu bound qman and bman portals:
 'int bman_portals_probed()' and 'int qman_portals_probed()'.
They return the following values.
 *  1 if qman/bman portals were all probed correctly
 *  0 if qman/bman portals were not yet probed
 * -1 if probing of qman/bman portals failed
Portals are considered successful probed if no error occurred during
the probing of any of the portals and if enough portals were probed
to have one available for each cpu.
The error handling paths were slightly rearranged in order to fit this
new functionality without being too intrusive.
Drivers that use qman/bman portal driver services are required to use
these APIs before calling any functions exported by these drivers or
otherwise they will crash the kernel.
First user will be the dpaa1 ethernet driver, coming in a subsequent
patch.

Signed-off-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Signed-off-by: Li Yang <leoyang.li@nxp.com>
---
 drivers/soc/fsl/qbman/bman_portal.c | 20 ++++++++++++++++----
 drivers/soc/fsl/qbman/qman_portal.c | 21 +++++++++++++++++----
 include/soc/fsl/bman.h              |  8 ++++++++
 include/soc/fsl/qman.h              |  9 +++++++++
 4 files changed, 50 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/fsl/qbman/bman_portal.c b/drivers/soc/fsl/qbman/bman_portal.c
index 2c95cf59f3e7..cf4f10d6f590 100644
--- a/drivers/soc/fsl/qbman/bman_portal.c
+++ b/drivers/soc/fsl/qbman/bman_portal.c
@@ -32,6 +32,7 @@
 
 static struct bman_portal *affine_bportals[NR_CPUS];
 static struct cpumask portal_cpus;
+static int __bman_portals_probed;
 /* protect bman global registers and global data shared among portals */
 static DEFINE_SPINLOCK(bman_lock);
 
@@ -87,6 +88,12 @@ static int bman_online_cpu(unsigned int cpu)
 	return 0;
 }
 
+int bman_portals_probed(void)
+{
+	return __bman_portals_probed;
+}
+EXPORT_SYMBOL_GPL(bman_portals_probed);
+
 static int bman_portal_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -104,8 +111,10 @@ static int bman_portal_probe(struct platform_device *pdev)
 	}
 
 	pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL);
-	if (!pcfg)
+	if (!pcfg) {
+		__bman_portals_probed = -1;
 		return -ENOMEM;
+	}
 
 	pcfg->dev = dev;
 
@@ -113,14 +122,14 @@ static int bman_portal_probe(struct platform_device *pdev)
 					     DPAA_PORTAL_CE);
 	if (!addr_phys[0]) {
 		dev_err(dev, "Can't get %pOF property 'reg::CE'\n", node);
-		return -ENXIO;
+		goto err_ioremap1;
 	}
 
 	addr_phys[1] = platform_get_resource(pdev, IORESOURCE_MEM,
 					     DPAA_PORTAL_CI);
 	if (!addr_phys[1]) {
 		dev_err(dev, "Can't get %pOF property 'reg::CI'\n", node);
-		return -ENXIO;
+		goto err_ioremap1;
 	}
 
 	pcfg->cpu = -1;
@@ -128,7 +137,7 @@ static int bman_portal_probe(struct platform_device *pdev)
 	irq = platform_get_irq(pdev, 0);
 	if (irq <= 0) {
 		dev_err(dev, "Can't get %pOF IRQ'\n", node);
-		return -ENXIO;
+		goto err_ioremap1;
 	}
 	pcfg->irq = irq;
 
@@ -150,6 +159,7 @@ static int bman_portal_probe(struct platform_device *pdev)
 	spin_lock(&bman_lock);
 	cpu = cpumask_next_zero(-1, &portal_cpus);
 	if (cpu >= nr_cpu_ids) {
+		__bman_portals_probed = 1;
 		/* unassigned portal, skip init */
 		spin_unlock(&bman_lock);
 		return 0;
@@ -175,6 +185,8 @@ err_portal_init:
 err_ioremap2:
 	memunmap(pcfg->addr_virt_ce);
 err_ioremap1:
+	 __bman_portals_probed = -1;
+
 	return -ENXIO;
 }
 
diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c
index 661c9b234d32..e2186b681d87 100644
--- a/drivers/soc/fsl/qbman/qman_portal.c
+++ b/drivers/soc/fsl/qbman/qman_portal.c
@@ -38,6 +38,7 @@ EXPORT_SYMBOL(qman_dma_portal);
 #define CONFIG_FSL_DPA_PIRQ_FAST  1
 
 static struct cpumask portal_cpus;
+static int __qman_portals_probed;
 /* protect qman global registers and global data shared among portals */
 static DEFINE_SPINLOCK(qman_lock);
 
@@ -220,6 +221,12 @@ static int qman_online_cpu(unsigned int cpu)
 	return 0;
 }
 
+int qman_portals_probed(void)
+{
+	return __qman_portals_probed;
+}
+EXPORT_SYMBOL_GPL(qman_portals_probed);
+
 static int qman_portal_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -238,8 +245,10 @@ static int qman_portal_probe(struct platform_device *pdev)
 	}
 
 	pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL);
-	if (!pcfg)
+	if (!pcfg) {
+		__qman_portals_probed = -1;
 		return -ENOMEM;
+	}
 
 	pcfg->dev = dev;
 
@@ -247,19 +256,20 @@ static int qman_portal_probe(struct platform_device *pdev)
 					     DPAA_PORTAL_CE);
 	if (!addr_phys[0]) {
 		dev_err(dev, "Can't get %pOF property 'reg::CE'\n", node);
-		return -ENXIO;
+		goto err_ioremap1;
 	}
 
 	addr_phys[1] = platform_get_resource(pdev, IORESOURCE_MEM,
 					     DPAA_PORTAL_CI);
 	if (!addr_phys[1]) {
 		dev_err(dev, "Can't get %pOF property 'reg::CI'\n", node);
-		return -ENXIO;
+		goto err_ioremap1;
 	}
 
 	err = of_property_read_u32(node, "cell-index", &val);
 	if (err) {
 		dev_err(dev, "Can't get %pOF property 'cell-index'\n", node);
+		__qman_portals_probed = -1;
 		return err;
 	}
 	pcfg->channel = val;
@@ -267,7 +277,7 @@ static int qman_portal_probe(struct platform_device *pdev)
 	irq = platform_get_irq(pdev, 0);
 	if (irq <= 0) {
 		dev_err(dev, "Can't get %pOF IRQ\n", node);
-		return -ENXIO;
+		goto err_ioremap1;
 	}
 	pcfg->irq = irq;
 
@@ -291,6 +301,7 @@ static int qman_portal_probe(struct platform_device *pdev)
 	spin_lock(&qman_lock);
 	cpu = cpumask_next_zero(-1, &portal_cpus);
 	if (cpu >= nr_cpu_ids) {
+		__qman_portals_probed = 1;
 		/* unassigned portal, skip init */
 		spin_unlock(&qman_lock);
 		return 0;
@@ -321,6 +332,8 @@ err_portal_init:
 err_ioremap2:
 	memunmap(pcfg->addr_virt_ce);
 err_ioremap1:
+	__qman_portals_probed = -1;
+
 	return -ENXIO;
 }
 
diff --git a/include/soc/fsl/bman.h b/include/soc/fsl/bman.h
index 5b99cb2ea5ef..173e4049d963 100644
--- a/include/soc/fsl/bman.h
+++ b/include/soc/fsl/bman.h
@@ -133,5 +133,13 @@ int bman_acquire(struct bman_pool *pool, struct bm_buffer *bufs, u8 num);
  * failed to probe or 0 if the bman driver did not probed yet.
  */
 int bman_is_probed(void);
+/**
+ * bman_portals_probed - Check if all cpu bound bman portals are probed
+ *
+ * Returns 1 if all the required cpu bound bman portals successfully probed,
+ * -1 if probe errors appeared or 0 if the bman portals did not yet finished
+ * probing.
+ */
+int bman_portals_probed(void);
 
 #endif	/* __FSL_BMAN_H */
diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h
index 5cc7af06c1ba..aa31c05a103a 100644
--- a/include/soc/fsl/qman.h
+++ b/include/soc/fsl/qman.h
@@ -1194,6 +1194,15 @@ int qman_release_cgrid(u32 id);
  */
 int qman_is_probed(void);
 
+/**
+ * qman_portals_probed - Check if all cpu bound qman portals are probed
+ *
+ * Returns 1 if all the required cpu bound qman portals successfully probed,
+ * -1 if probe errors appeared or 0 if the qman portals did not yet finished
+ * probing.
+ */
+int qman_portals_probed(void);
+
 /**
  * qman_dqrr_get_ithresh - Get coalesce interrupt threshold
  * @portal: portal to get the value for
-- 
cgit v1.2.3


From 9a0f780958bbcb85604636fa340e2a1efaa4f432 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms+renesas@verge.net.au>
Date: Mon, 13 May 2019 13:39:51 +0200
Subject: dmaengine: sudmac: remove unused driver

SUDMAC driver was introduced in v3.10 but was never integrated for use
by any platform. As it is unused remove it.

Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
Acked-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/sh/Kconfig  |   6 -
 drivers/dma/sh/Makefile |   1 -
 drivers/dma/sh/sudmac.c | 414 ------------------------------------------------
 include/linux/sudmac.h  |  52 ------
 4 files changed, 473 deletions(-)
 delete mode 100644 drivers/dma/sh/sudmac.c
 delete mode 100644 include/linux/sudmac.h

(limited to 'include')

diff --git a/drivers/dma/sh/Kconfig b/drivers/dma/sh/Kconfig
index 4d6b02b3b1f1..54d5d0369d3c 100644
--- a/drivers/dma/sh/Kconfig
+++ b/drivers/dma/sh/Kconfig
@@ -47,9 +47,3 @@ config RENESAS_USB_DMAC
 	help
 	  This driver supports the USB-DMA controller found in the Renesas
 	  SoCs.
-
-config SUDMAC
-	tristate "Renesas SUDMAC support"
-	depends on SH_DMAE_BASE
-	help
-	  Enable support for the Renesas SUDMAC controllers.
diff --git a/drivers/dma/sh/Makefile b/drivers/dma/sh/Makefile
index 42110dd57a56..112fbd22bb3f 100644
--- a/drivers/dma/sh/Makefile
+++ b/drivers/dma/sh/Makefile
@@ -15,4 +15,3 @@ obj-$(CONFIG_SH_DMAE) += shdma.o
 
 obj-$(CONFIG_RCAR_DMAC) += rcar-dmac.o
 obj-$(CONFIG_RENESAS_USB_DMAC) += usb-dmac.o
-obj-$(CONFIG_SUDMAC) += sudmac.o
diff --git a/drivers/dma/sh/sudmac.c b/drivers/dma/sh/sudmac.c
deleted file mode 100644
index 30cc3553cb8b..000000000000
--- a/drivers/dma/sh/sudmac.c
+++ /dev/null
@@ -1,414 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Renesas SUDMAC support
- *
- * Copyright (C) 2013 Renesas Solutions Corp.
- *
- * based on drivers/dma/sh/shdma.c:
- * Copyright (C) 2011-2012 Guennadi Liakhovetski <g.liakhovetski@gmx.de>
- * Copyright (C) 2009 Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
- * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved.
- * Copyright (C) 2007 Freescale Semiconductor, Inc. All rights reserved.
- */
-
-#include <linux/dmaengine.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/sudmac.h>
-
-struct sudmac_chan {
-	struct shdma_chan shdma_chan;
-	void __iomem *base;
-	char dev_id[16];	/* unique name per DMAC of channel */
-
-	u32 offset;		/* for CFG, BA, BBC, CA, CBC, DEN */
-	u32 cfg;
-	u32 dint_end_bit;
-};
-
-struct sudmac_device {
-	struct shdma_dev shdma_dev;
-	struct sudmac_pdata *pdata;
-	void __iomem *chan_reg;
-};
-
-struct sudmac_regs {
-	u32 base_addr;
-	u32 base_byte_count;
-};
-
-struct sudmac_desc {
-	struct sudmac_regs hw;
-	struct shdma_desc shdma_desc;
-};
-
-#define to_chan(schan) container_of(schan, struct sudmac_chan, shdma_chan)
-#define to_desc(sdesc) container_of(sdesc, struct sudmac_desc, shdma_desc)
-#define to_sdev(sc) container_of(sc->shdma_chan.dma_chan.device, \
-				 struct sudmac_device, shdma_dev.dma_dev)
-
-/* SUDMAC register */
-#define SUDMAC_CH0CFG		0x00
-#define SUDMAC_CH0BA		0x10
-#define SUDMAC_CH0BBC		0x18
-#define SUDMAC_CH0CA		0x20
-#define SUDMAC_CH0CBC		0x28
-#define SUDMAC_CH0DEN		0x30
-#define SUDMAC_DSTSCLR		0x38
-#define SUDMAC_DBUFCTRL		0x3C
-#define SUDMAC_DINTCTRL		0x40
-#define SUDMAC_DINTSTS		0x44
-#define SUDMAC_DINTSTSCLR	0x48
-#define SUDMAC_CH0SHCTRL	0x50
-
-/* Definitions for the sudmac_channel.config */
-#define SUDMAC_SENDBUFM	0x1000 /* b12: Transmit Buffer Mode */
-#define SUDMAC_RCVENDM	0x0100 /* b8: Receive Data Transfer End Mode */
-#define SUDMAC_LBA_WAIT	0x0030 /* b5-4: Local Bus Access Wait */
-
-/* Definitions for the sudmac_channel.dint_end_bit */
-#define SUDMAC_CH1ENDE	0x0002 /* b1: Ch1 DMA Transfer End Int Enable */
-#define SUDMAC_CH0ENDE	0x0001 /* b0: Ch0 DMA Transfer End Int Enable */
-
-#define SUDMAC_DRV_NAME "sudmac"
-
-static void sudmac_writel(struct sudmac_chan *sc, u32 data, u32 reg)
-{
-	iowrite32(data, sc->base + reg);
-}
-
-static u32 sudmac_readl(struct sudmac_chan *sc, u32 reg)
-{
-	return ioread32(sc->base + reg);
-}
-
-static bool sudmac_is_busy(struct sudmac_chan *sc)
-{
-	u32 den = sudmac_readl(sc, SUDMAC_CH0DEN + sc->offset);
-
-	if (den)
-		return true; /* working */
-
-	return false; /* waiting */
-}
-
-static void sudmac_set_reg(struct sudmac_chan *sc, struct sudmac_regs *hw,
-			   struct shdma_desc *sdesc)
-{
-	sudmac_writel(sc, sc->cfg, SUDMAC_CH0CFG + sc->offset);
-	sudmac_writel(sc, hw->base_addr, SUDMAC_CH0BA + sc->offset);
-	sudmac_writel(sc, hw->base_byte_count, SUDMAC_CH0BBC + sc->offset);
-}
-
-static void sudmac_start(struct sudmac_chan *sc)
-{
-	u32 dintctrl = sudmac_readl(sc, SUDMAC_DINTCTRL);
-
-	sudmac_writel(sc, dintctrl | sc->dint_end_bit, SUDMAC_DINTCTRL);
-	sudmac_writel(sc, 1, SUDMAC_CH0DEN + sc->offset);
-}
-
-static void sudmac_start_xfer(struct shdma_chan *schan,
-			      struct shdma_desc *sdesc)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	struct sudmac_desc *sd = to_desc(sdesc);
-
-	sudmac_set_reg(sc, &sd->hw, sdesc);
-	sudmac_start(sc);
-}
-
-static bool sudmac_channel_busy(struct shdma_chan *schan)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-
-	return sudmac_is_busy(sc);
-}
-
-static void sudmac_setup_xfer(struct shdma_chan *schan, int slave_id)
-{
-}
-
-static const struct sudmac_slave_config *sudmac_find_slave(
-	struct sudmac_chan *sc, int slave_id)
-{
-	struct sudmac_device *sdev = to_sdev(sc);
-	struct sudmac_pdata *pdata = sdev->pdata;
-	const struct sudmac_slave_config *cfg;
-	int i;
-
-	for (i = 0, cfg = pdata->slave; i < pdata->slave_num; i++, cfg++)
-		if (cfg->slave_id == slave_id)
-			return cfg;
-
-	return NULL;
-}
-
-static int sudmac_set_slave(struct shdma_chan *schan, int slave_id,
-			    dma_addr_t slave_addr, bool try)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	const struct sudmac_slave_config *cfg = sudmac_find_slave(sc, slave_id);
-
-	if (!cfg)
-		return -ENODEV;
-
-	return 0;
-}
-
-static inline void sudmac_dma_halt(struct sudmac_chan *sc)
-{
-	u32 dintctrl = sudmac_readl(sc, SUDMAC_DINTCTRL);
-
-	sudmac_writel(sc, 0, SUDMAC_CH0DEN + sc->offset);
-	sudmac_writel(sc, dintctrl & ~sc->dint_end_bit, SUDMAC_DINTCTRL);
-	sudmac_writel(sc, sc->dint_end_bit, SUDMAC_DINTSTSCLR);
-}
-
-static int sudmac_desc_setup(struct shdma_chan *schan,
-			     struct shdma_desc *sdesc,
-			     dma_addr_t src, dma_addr_t dst, size_t *len)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	struct sudmac_desc *sd = to_desc(sdesc);
-
-	dev_dbg(sc->shdma_chan.dev, "%s: src=%pad, dst=%pad, len=%zu\n",
-		__func__, &src, &dst, *len);
-
-	if (*len > schan->max_xfer_len)
-		*len = schan->max_xfer_len;
-
-	if (dst)
-		sd->hw.base_addr = dst;
-	else if (src)
-		sd->hw.base_addr = src;
-	sd->hw.base_byte_count = *len;
-
-	return 0;
-}
-
-static void sudmac_halt(struct shdma_chan *schan)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-
-	sudmac_dma_halt(sc);
-}
-
-static bool sudmac_chan_irq(struct shdma_chan *schan, int irq)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	u32 dintsts = sudmac_readl(sc, SUDMAC_DINTSTS);
-
-	if (!(dintsts & sc->dint_end_bit))
-		return false;
-
-	/* DMA stop */
-	sudmac_dma_halt(sc);
-
-	return true;
-}
-
-static size_t sudmac_get_partial(struct shdma_chan *schan,
-				 struct shdma_desc *sdesc)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	struct sudmac_desc *sd = to_desc(sdesc);
-	u32 current_byte_count = sudmac_readl(sc, SUDMAC_CH0CBC + sc->offset);
-
-	return sd->hw.base_byte_count - current_byte_count;
-}
-
-static bool sudmac_desc_completed(struct shdma_chan *schan,
-				  struct shdma_desc *sdesc)
-{
-	struct sudmac_chan *sc = to_chan(schan);
-	struct sudmac_desc *sd = to_desc(sdesc);
-	u32 current_addr = sudmac_readl(sc, SUDMAC_CH0CA + sc->offset);
-
-	return sd->hw.base_addr + sd->hw.base_byte_count == current_addr;
-}
-
-static int sudmac_chan_probe(struct sudmac_device *su_dev, int id, int irq,
-			     unsigned long flags)
-{
-	struct shdma_dev *sdev = &su_dev->shdma_dev;
-	struct platform_device *pdev = to_platform_device(sdev->dma_dev.dev);
-	struct sudmac_chan *sc;
-	struct shdma_chan *schan;
-	int err;
-
-	sc = devm_kzalloc(&pdev->dev, sizeof(struct sudmac_chan), GFP_KERNEL);
-	if (!sc)
-		return -ENOMEM;
-
-	schan = &sc->shdma_chan;
-	schan->max_xfer_len = 64 * 1024 * 1024 - 1;
-
-	shdma_chan_probe(sdev, schan, id);
-
-	sc->base = su_dev->chan_reg;
-
-	/* get platform_data */
-	sc->offset = su_dev->pdata->channel->offset;
-	if (su_dev->pdata->channel->config & SUDMAC_TX_BUFFER_MODE)
-		sc->cfg |= SUDMAC_SENDBUFM;
-	if (su_dev->pdata->channel->config & SUDMAC_RX_END_MODE)
-		sc->cfg |= SUDMAC_RCVENDM;
-	sc->cfg |= (su_dev->pdata->channel->wait << 4) & SUDMAC_LBA_WAIT;
-
-	if (su_dev->pdata->channel->dint_end_bit & SUDMAC_DMA_BIT_CH0)
-		sc->dint_end_bit |= SUDMAC_CH0ENDE;
-	if (su_dev->pdata->channel->dint_end_bit & SUDMAC_DMA_BIT_CH1)
-		sc->dint_end_bit |= SUDMAC_CH1ENDE;
-
-	/* set up channel irq */
-	if (pdev->id >= 0)
-		snprintf(sc->dev_id, sizeof(sc->dev_id), "sudmac%d.%d",
-			 pdev->id, id);
-	else
-		snprintf(sc->dev_id, sizeof(sc->dev_id), "sudmac%d", id);
-
-	err = shdma_request_irq(schan, irq, flags, sc->dev_id);
-	if (err) {
-		dev_err(sdev->dma_dev.dev,
-			"DMA channel %d request_irq failed %d\n", id, err);
-		goto err_no_irq;
-	}
-
-	return 0;
-
-err_no_irq:
-	/* remove from dmaengine device node */
-	shdma_chan_remove(schan);
-	return err;
-}
-
-static void sudmac_chan_remove(struct sudmac_device *su_dev)
-{
-	struct shdma_chan *schan;
-	int i;
-
-	shdma_for_each_chan(schan, &su_dev->shdma_dev, i) {
-		BUG_ON(!schan);
-
-		shdma_chan_remove(schan);
-	}
-}
-
-static dma_addr_t sudmac_slave_addr(struct shdma_chan *schan)
-{
-	/* SUDMAC doesn't need the address */
-	return 0;
-}
-
-static struct shdma_desc *sudmac_embedded_desc(void *buf, int i)
-{
-	return &((struct sudmac_desc *)buf)[i].shdma_desc;
-}
-
-static const struct shdma_ops sudmac_shdma_ops = {
-	.desc_completed = sudmac_desc_completed,
-	.halt_channel = sudmac_halt,
-	.channel_busy = sudmac_channel_busy,
-	.slave_addr = sudmac_slave_addr,
-	.desc_setup = sudmac_desc_setup,
-	.set_slave = sudmac_set_slave,
-	.setup_xfer = sudmac_setup_xfer,
-	.start_xfer = sudmac_start_xfer,
-	.embedded_desc = sudmac_embedded_desc,
-	.chan_irq = sudmac_chan_irq,
-	.get_partial = sudmac_get_partial,
-};
-
-static int sudmac_probe(struct platform_device *pdev)
-{
-	struct sudmac_pdata *pdata = dev_get_platdata(&pdev->dev);
-	int err, i;
-	struct sudmac_device *su_dev;
-	struct dma_device *dma_dev;
-	struct resource *chan, *irq_res;
-
-	/* get platform data */
-	if (!pdata)
-		return -ENODEV;
-
-	irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
-	if (!irq_res)
-		return -ENODEV;
-
-	err = -ENOMEM;
-	su_dev = devm_kzalloc(&pdev->dev, sizeof(struct sudmac_device),
-			      GFP_KERNEL);
-	if (!su_dev)
-		return err;
-
-	dma_dev = &su_dev->shdma_dev.dma_dev;
-
-	chan = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	su_dev->chan_reg = devm_ioremap_resource(&pdev->dev, chan);
-	if (IS_ERR(su_dev->chan_reg))
-		return PTR_ERR(su_dev->chan_reg);
-
-	dma_cap_set(DMA_SLAVE, dma_dev->cap_mask);
-
-	su_dev->shdma_dev.ops = &sudmac_shdma_ops;
-	su_dev->shdma_dev.desc_size = sizeof(struct sudmac_desc);
-	err = shdma_init(&pdev->dev, &su_dev->shdma_dev, pdata->channel_num);
-	if (err < 0)
-		return err;
-
-	/* platform data */
-	su_dev->pdata = dev_get_platdata(&pdev->dev);
-
-	platform_set_drvdata(pdev, su_dev);
-
-	/* Create DMA Channel */
-	for (i = 0; i < pdata->channel_num; i++) {
-		err = sudmac_chan_probe(su_dev, i, irq_res->start, IRQF_SHARED);
-		if (err)
-			goto chan_probe_err;
-	}
-
-	err = dma_async_device_register(&su_dev->shdma_dev.dma_dev);
-	if (err < 0)
-		goto chan_probe_err;
-
-	return err;
-
-chan_probe_err:
-	sudmac_chan_remove(su_dev);
-
-	shdma_cleanup(&su_dev->shdma_dev);
-
-	return err;
-}
-
-static int sudmac_remove(struct platform_device *pdev)
-{
-	struct sudmac_device *su_dev = platform_get_drvdata(pdev);
-	struct dma_device *dma_dev = &su_dev->shdma_dev.dma_dev;
-
-	dma_async_device_unregister(dma_dev);
-	sudmac_chan_remove(su_dev);
-	shdma_cleanup(&su_dev->shdma_dev);
-
-	return 0;
-}
-
-static struct platform_driver sudmac_driver = {
-	.driver		= {
-		.name	= SUDMAC_DRV_NAME,
-	},
-	.probe		= sudmac_probe,
-	.remove		= sudmac_remove,
-};
-module_platform_driver(sudmac_driver);
-
-MODULE_AUTHOR("Yoshihiro Shimoda");
-MODULE_DESCRIPTION("Renesas SUDMAC driver");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("platform:" SUDMAC_DRV_NAME);
diff --git a/include/linux/sudmac.h b/include/linux/sudmac.h
deleted file mode 100644
index 377b8a5788fa..000000000000
--- a/include/linux/sudmac.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Header for the SUDMAC driver
- *
- * Copyright (C) 2013 Renesas Solutions Corp.
- *
- * This is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- */
-#ifndef SUDMAC_H
-#define SUDMAC_H
-
-#include <linux/dmaengine.h>
-#include <linux/shdma-base.h>
-#include <linux/types.h>
-
-/* Used by slave DMA clients to request DMA to/from a specific peripheral */
-struct sudmac_slave {
-	struct shdma_slave	shdma_slave;	/* Set by the platform */
-};
-
-/*
- * Supplied by platforms to specify, how a DMA channel has to be configured for
- * a certain peripheral
- */
-struct sudmac_slave_config {
-	int		slave_id;
-};
-
-struct sudmac_channel {
-	unsigned long	offset;
-	unsigned long	config;
-	unsigned long	wait;		/* The configuable range is 0 to 3 */
-	unsigned long	dint_end_bit;
-};
-
-struct sudmac_pdata {
-	const struct sudmac_slave_config *slave;
-	int slave_num;
-	const struct sudmac_channel *channel;
-	int channel_num;
-};
-
-/* Definitions for the sudmac_channel.config */
-#define SUDMAC_TX_BUFFER_MODE	BIT(0)
-#define SUDMAC_RX_END_MODE	BIT(1)
-
-/* Definitions for the sudmac_channel.dint_end_bit */
-#define SUDMAC_DMA_BIT_CH0	BIT(0)
-#define SUDMAC_DMA_BIT_CH1	BIT(1)
-
-#endif
-- 
cgit v1.2.3


From 7e5f7bb08b8cefd3a7e8961861f47fe1f0e830d4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 May 2019 13:44:57 +0100
Subject: unexport simple_dname()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/d_path.c            | 1 -
 fs/internal.h          | 1 +
 include/linux/dcache.h | 1 -
 3 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/d_path.c b/fs/d_path.c
index e8fce6b1174f..a7d0a96b35ce 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -316,7 +316,6 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
 		end = ERR_PTR(-ENAMETOOLONG);
 	return end;
 }
-EXPORT_SYMBOL(simple_dname);
 
 /*
  * Write full pathname from the root of the filesystem into the buffer.
diff --git a/fs/internal.h b/fs/internal.h
index 0010889f2e85..1ac2b8f6c621 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -160,6 +160,7 @@ extern int d_set_mounted(struct dentry *dentry);
 extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
 extern struct dentry *d_alloc_cursor(struct dentry *);
 extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
+extern char *simple_dname(struct dentry *, char *, int);
 
 /*
  * read_write.c
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f14e587c5d5d..361305ddd75e 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -291,7 +291,6 @@ static inline unsigned d_count(const struct dentry *dentry)
  */
 extern __printf(4, 5)
 char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
-extern char *simple_dname(struct dentry *, char *, int);
 
 extern char *__d_path(const struct path *, const struct path *, char *, int);
 extern char *d_absolute_path(const struct path *, char *, int);
-- 
cgit v1.2.3


From 8d9a8543be03096eb6d25663f33f3183bd62b3a6 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 19 Sep 2018 19:12:14 +0200
Subject: dt-bindings: tegra186-gpio: Remove unused definitions

Now that all users of the old definitions have been updated to use the
Tegra186 specific prefix, remove the unused definitions.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/dt-bindings/gpio/tegra186-gpio.h | 41 --------------------------------
 1 file changed, 41 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/gpio/tegra186-gpio.h b/include/dt-bindings/gpio/tegra186-gpio.h
index cabc5712e745..0782b05e2775 100644
--- a/include/dt-bindings/gpio/tegra186-gpio.h
+++ b/include/dt-bindings/gpio/tegra186-gpio.h
@@ -41,34 +41,6 @@
 #define TEGRA186_MAIN_GPIO(port, offset) \
 	((TEGRA186_MAIN_GPIO_PORT_##port * 8) + offset)
 
-/* need to keep these for backwards-compatibility */
-#define TEGRA_MAIN_GPIO_PORT_A 0
-#define TEGRA_MAIN_GPIO_PORT_B 1
-#define TEGRA_MAIN_GPIO_PORT_C 2
-#define TEGRA_MAIN_GPIO_PORT_D 3
-#define TEGRA_MAIN_GPIO_PORT_E 4
-#define TEGRA_MAIN_GPIO_PORT_F 5
-#define TEGRA_MAIN_GPIO_PORT_G 6
-#define TEGRA_MAIN_GPIO_PORT_H 7
-#define TEGRA_MAIN_GPIO_PORT_I 8
-#define TEGRA_MAIN_GPIO_PORT_J 9
-#define TEGRA_MAIN_GPIO_PORT_K 10
-#define TEGRA_MAIN_GPIO_PORT_L 11
-#define TEGRA_MAIN_GPIO_PORT_M 12
-#define TEGRA_MAIN_GPIO_PORT_N 13
-#define TEGRA_MAIN_GPIO_PORT_O 14
-#define TEGRA_MAIN_GPIO_PORT_P 15
-#define TEGRA_MAIN_GPIO_PORT_Q 16
-#define TEGRA_MAIN_GPIO_PORT_R 17
-#define TEGRA_MAIN_GPIO_PORT_T 18
-#define TEGRA_MAIN_GPIO_PORT_X 19
-#define TEGRA_MAIN_GPIO_PORT_Y 20
-#define TEGRA_MAIN_GPIO_PORT_BB 21
-#define TEGRA_MAIN_GPIO_PORT_CC 22
-
-#define TEGRA_MAIN_GPIO(port, offset) \
-	((TEGRA_MAIN_GPIO_PORT_##port * 8) + offset)
-
 /* GPIOs implemented by AON GPIO controller */
 #define TEGRA186_AON_GPIO_PORT_S 0
 #define TEGRA186_AON_GPIO_PORT_U 1
@@ -82,17 +54,4 @@
 #define TEGRA186_AON_GPIO(port, offset) \
 	((TEGRA186_AON_GPIO_PORT_##port * 8) + offset)
 
-/* need to keep these for backwards-compatibility */
-#define TEGRA_AON_GPIO_PORT_S 0
-#define TEGRA_AON_GPIO_PORT_U 1
-#define TEGRA_AON_GPIO_PORT_V 2
-#define TEGRA_AON_GPIO_PORT_W 3
-#define TEGRA_AON_GPIO_PORT_Z 4
-#define TEGRA_AON_GPIO_PORT_AA 5
-#define TEGRA_AON_GPIO_PORT_EE 6
-#define TEGRA_AON_GPIO_PORT_FF 7
-
-#define TEGRA_AON_GPIO(port, offset) \
-	((TEGRA_AON_GPIO_PORT_##port * 8) + offset)
-
 #endif
-- 
cgit v1.2.3


From 7793a108964924dd2802ee8c5dff2e9f3a9cd54a Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Tue, 21 May 2019 13:09:06 +0200
Subject: drm/fourcc: Fix the parameters name in the documentation

We introduced new functions in the commit bf39607c1614 ("drm/fourcc: Pass
the format_info pointer to drm_format_plane_width/height") based on
previous ones but with a slightly different prototype. However, the
documentation wasn't changed to reflect that change.

Fixes: bf39607c1614 ("drm/fourcc: Pass the format_info pointer to drm_format_plane_width/height")
Reported-by: kbuild test robot <lkp@intel.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190521110906.15268-1-maxime.ripard@bootlin.com
---
 include/drm/drm_fourcc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_fourcc.h b/include/drm/drm_fourcc.h
index 405466692bd2..306d1efeb5e0 100644
--- a/include/drm/drm_fourcc.h
+++ b/include/drm/drm_fourcc.h
@@ -262,7 +262,7 @@ drm_format_info_is_yuv_sampling_444(const struct drm_format_info *info)
 
 /**
  * drm_format_info_plane_width - width of the plane given the first plane
- * @format: pixel format info
+ * @info: pixel format info
  * @width: width of the first plane
  * @plane: plane index
  *
@@ -284,7 +284,7 @@ int drm_format_info_plane_width(const struct drm_format_info *info, int width,
 
 /**
  * drm_format_info_plane_height - height of the plane given the first plane
- * @format: pixel format info
+ * @info: pixel format info
  * @height: height of the first plane
  * @plane: plane index
  *
-- 
cgit v1.2.3


From d2183c6f1958e6b6dfdde279f4cee04280710e34 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Mon, 20 May 2019 09:05:25 +0300
Subject: RDMA/umem: Move page_shift from ib_umem to ib_odp_umem

This value has always been set to PAGE_SHIFT in the core code, the only
thing that does differently was the ODP path. Move the value into the ODP
struct and still use it for ODP, but change all the non-ODP things to just
use PAGE_SHIFT/PAGE_SIZE/PAGE_MASK directly.

Reviewed-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/umem.c           |  3 +-
 drivers/infiniband/core/umem_odp.c       | 79 +++++++++++++++-----------------
 drivers/infiniband/hw/hns/hns_roce_cq.c  |  3 +-
 drivers/infiniband/hw/hns/hns_roce_srq.c | 10 ++--
 drivers/infiniband/hw/mlx4/mr.c          |  8 ++--
 drivers/infiniband/hw/mlx4/srq.c         |  2 +-
 drivers/infiniband/hw/mlx5/mem.c         | 20 ++++----
 drivers/infiniband/hw/mlx5/mr.c          |  5 +-
 drivers/infiniband/hw/mlx5/odp.c         | 23 ++++------
 drivers/infiniband/hw/nes/nes_verbs.c    |  9 ++--
 include/rdma/ib_umem.h                   | 19 ++------
 include/rdma/ib_umem_odp.h               | 20 ++++++++
 12 files changed, 99 insertions(+), 102 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index e7ea819fcb11..7edc5839606b 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -244,7 +244,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	umem->context    = context;
 	umem->length     = size;
 	umem->address    = addr;
-	umem->page_shift = PAGE_SHIFT;
 	umem->writable   = ib_access_writable(access);
 	umem->owning_mm = mm = current->mm;
 	mmgrab(mm);
@@ -385,7 +384,7 @@ int ib_umem_page_count(struct ib_umem *umem)
 
 	n = 0;
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
-		n += sg_dma_len(sg) >> umem->page_shift;
+		n += sg_dma_len(sg) >> PAGE_SHIFT;
 
 	return n;
 }
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index f962b5bbfa40..c3b3c523401f 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -59,7 +59,7 @@ static u64 node_start(struct umem_odp_node *n)
 	struct ib_umem_odp *umem_odp =
 			container_of(n, struct ib_umem_odp, interval_tree);
 
-	return ib_umem_start(&umem_odp->umem);
+	return ib_umem_start(umem_odp);
 }
 
 /* Note that the representation of the intervals in the interval tree
@@ -72,7 +72,7 @@ static u64 node_last(struct umem_odp_node *n)
 	struct ib_umem_odp *umem_odp =
 			container_of(n, struct ib_umem_odp, interval_tree);
 
-	return ib_umem_end(&umem_odp->umem) - 1;
+	return ib_umem_end(umem_odp) - 1;
 }
 
 INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
@@ -107,8 +107,6 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
 static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
 					       u64 start, u64 end, void *cookie)
 {
-	struct ib_umem *umem = &umem_odp->umem;
-
 	/*
 	 * Increase the number of notifiers running, to
 	 * prevent any further fault handling on this MR.
@@ -119,8 +117,8 @@ static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
 	 * all pending page faults. */
 	smp_wmb();
 	complete_all(&umem_odp->notifier_completion);
-	umem->context->invalidate_range(umem_odp, ib_umem_start(umem),
-					ib_umem_end(umem));
+	umem_odp->umem.context->invalidate_range(
+		umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp));
 	return 0;
 }
 
@@ -205,10 +203,9 @@ static const struct mmu_notifier_ops ib_umem_notifiers = {
 static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
 {
 	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-	struct ib_umem *umem = &umem_odp->umem;
 
 	down_write(&per_mm->umem_rwsem);
-	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+	if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
 		rbt_ib_umem_insert(&umem_odp->interval_tree,
 				   &per_mm->umem_tree);
 	up_write(&per_mm->umem_rwsem);
@@ -217,10 +214,9 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
 static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
 {
 	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-	struct ib_umem *umem = &umem_odp->umem;
 
 	down_write(&per_mm->umem_rwsem);
-	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+	if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
 		rbt_ib_umem_remove(&umem_odp->interval_tree,
 				   &per_mm->umem_tree);
 	complete_all(&umem_odp->notifier_completion);
@@ -351,7 +347,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root,
 	umem->context    = ctx;
 	umem->length     = size;
 	umem->address    = addr;
-	umem->page_shift = PAGE_SHIFT;
+	odp_data->page_shift = PAGE_SHIFT;
 	umem->writable   = root->umem.writable;
 	umem->is_odp = 1;
 	odp_data->per_mm = per_mm;
@@ -405,18 +401,19 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
 	struct mm_struct *mm = umem->owning_mm;
 	int ret_val;
 
+	umem_odp->page_shift = PAGE_SHIFT;
 	if (access & IB_ACCESS_HUGETLB) {
 		struct vm_area_struct *vma;
 		struct hstate *h;
 
 		down_read(&mm->mmap_sem);
-		vma = find_vma(mm, ib_umem_start(umem));
+		vma = find_vma(mm, ib_umem_start(umem_odp));
 		if (!vma || !is_vm_hugetlb_page(vma)) {
 			up_read(&mm->mmap_sem);
 			return -EINVAL;
 		}
 		h = hstate_vma(vma);
-		umem->page_shift = huge_page_shift(h);
+		umem_odp->page_shift = huge_page_shift(h);
 		up_read(&mm->mmap_sem);
 	}
 
@@ -424,16 +421,16 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
 
 	init_completion(&umem_odp->notifier_completion);
 
-	if (ib_umem_num_pages(umem)) {
+	if (ib_umem_odp_num_pages(umem_odp)) {
 		umem_odp->page_list =
 			vzalloc(array_size(sizeof(*umem_odp->page_list),
-					   ib_umem_num_pages(umem)));
+					   ib_umem_odp_num_pages(umem_odp)));
 		if (!umem_odp->page_list)
 			return -ENOMEM;
 
 		umem_odp->dma_list =
 			vzalloc(array_size(sizeof(*umem_odp->dma_list),
-					   ib_umem_num_pages(umem)));
+					   ib_umem_odp_num_pages(umem_odp)));
 		if (!umem_odp->dma_list) {
 			ret_val = -ENOMEM;
 			goto out_page_list;
@@ -456,16 +453,14 @@ out_page_list:
 
 void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 {
-	struct ib_umem *umem = &umem_odp->umem;
-
 	/*
 	 * Ensure that no more pages are mapped in the umem.
 	 *
 	 * It is the driver's responsibility to ensure, before calling us,
 	 * that the hardware will not attempt to access the MR any more.
 	 */
-	ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem),
-				    ib_umem_end(umem));
+	ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+				    ib_umem_end(umem_odp));
 
 	remove_umem_from_per_mm(umem_odp);
 	put_per_mm(umem_odp);
@@ -498,8 +493,8 @@ static int ib_umem_odp_map_dma_single_page(
 		u64 access_mask,
 		unsigned long current_seq)
 {
-	struct ib_umem *umem = &umem_odp->umem;
-	struct ib_device *dev = umem->context->device;
+	struct ib_ucontext *context = umem_odp->umem.context;
+	struct ib_device *dev = context->device;
 	dma_addr_t dma_addr;
 	int remove_existing_mapping = 0;
 	int ret = 0;
@@ -514,10 +509,9 @@ static int ib_umem_odp_map_dma_single_page(
 		goto out;
 	}
 	if (!(umem_odp->dma_list[page_index])) {
-		dma_addr = ib_dma_map_page(dev,
-					   page,
-					   0, BIT(umem->page_shift),
-					   DMA_BIDIRECTIONAL);
+		dma_addr =
+			ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift),
+					DMA_BIDIRECTIONAL);
 		if (ib_dma_mapping_error(dev, dma_addr)) {
 			ret = -EFAULT;
 			goto out;
@@ -540,11 +534,12 @@ out:
 
 	if (remove_existing_mapping) {
 		ib_umem_notifier_start_account(umem_odp);
-		umem->context->invalidate_range(
+		context->invalidate_range(
 			umem_odp,
-			ib_umem_start(umem) + (page_index << umem->page_shift),
-			ib_umem_start(umem) +
-				((page_index + 1) << umem->page_shift));
+			ib_umem_start(umem_odp) +
+				(page_index << umem_odp->page_shift),
+			ib_umem_start(umem_odp) +
+				((page_index + 1) << umem_odp->page_shift));
 		ib_umem_notifier_end_account(umem_odp);
 		ret = -EAGAIN;
 	}
@@ -581,27 +576,26 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 			      u64 bcnt, u64 access_mask,
 			      unsigned long current_seq)
 {
-	struct ib_umem *umem = &umem_odp->umem;
 	struct task_struct *owning_process  = NULL;
 	struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
 	struct page       **local_page_list = NULL;
 	u64 page_mask, off;
-	int j, k, ret = 0, start_idx, npages = 0, page_shift;
-	unsigned int flags = 0;
+	int j, k, ret = 0, start_idx, npages = 0;
+	unsigned int flags = 0, page_shift;
 	phys_addr_t p = 0;
 
 	if (access_mask == 0)
 		return -EINVAL;
 
-	if (user_virt < ib_umem_start(umem) ||
-	    user_virt + bcnt > ib_umem_end(umem))
+	if (user_virt < ib_umem_start(umem_odp) ||
+	    user_virt + bcnt > ib_umem_end(umem_odp))
 		return -EFAULT;
 
 	local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
 	if (!local_page_list)
 		return -ENOMEM;
 
-	page_shift = umem->page_shift;
+	page_shift = umem_odp->page_shift;
 	page_mask = ~(BIT(page_shift) - 1);
 	off = user_virt & (~page_mask);
 	user_virt = user_virt & page_mask;
@@ -621,7 +615,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 	if (access_mask & ODP_WRITE_ALLOWED_BIT)
 		flags |= FOLL_WRITE;
 
-	start_idx = (user_virt - ib_umem_start(umem)) >> page_shift;
+	start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift;
 	k = start_idx;
 
 	while (bcnt > 0) {
@@ -711,21 +705,20 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 				 u64 bound)
 {
-	struct ib_umem *umem = &umem_odp->umem;
 	int idx;
 	u64 addr;
-	struct ib_device *dev = umem->context->device;
+	struct ib_device *dev = umem_odp->umem.context->device;
 
-	virt  = max_t(u64, virt,  ib_umem_start(umem));
-	bound = min_t(u64, bound, ib_umem_end(umem));
+	virt = max_t(u64, virt, ib_umem_start(umem_odp));
+	bound = min_t(u64, bound, ib_umem_end(umem_odp));
 	/* Note that during the run of this function, the
 	 * notifiers_count of the MR is > 0, preventing any racing
 	 * faults from completion. We might be racing with other
 	 * invalidations, so we must make sure we free each page only
 	 * once. */
 	mutex_lock(&umem_odp->umem_mutex);
-	for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) {
-		idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
+	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
+		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 		if (umem_odp->page_list[idx]) {
 			struct page *page = umem_odp->page_list[idx];
 			dma_addr_t dma = umem_odp->dma_list[idx];
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 9caf35061721..6e81ff3f1813 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -235,8 +235,7 @@ static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev,
 					&buf->hr_mtt);
 	} else {
 		ret = hns_roce_mtt_init(hr_dev, ib_umem_page_count(*umem),
-				(*umem)->page_shift,
-				&buf->hr_mtt);
+					PAGE_SHIFT, &buf->hr_mtt);
 	}
 	if (ret)
 		goto err_buf;
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index b3421b1f21e0..ad15b41da30a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -264,8 +264,7 @@ int hns_roce_create_srq(struct ib_srq *ib_srq,
 		} else
 			ret = hns_roce_mtt_init(hr_dev,
 						ib_umem_page_count(srq->umem),
-						srq->umem->page_shift,
-						&srq->mtt);
+						PAGE_SHIFT, &srq->mtt);
 		if (ret)
 			goto err_buf;
 
@@ -291,10 +290,9 @@ int hns_roce_create_srq(struct ib_srq *ib_srq,
 			ret = hns_roce_mtt_init(hr_dev, npages,
 						page_shift, &srq->idx_que.mtt);
 		} else {
-			ret = hns_roce_mtt_init(hr_dev,
-				       ib_umem_page_count(srq->idx_que.umem),
-				       srq->idx_que.umem->page_shift,
-				       &srq->idx_que.mtt);
+			ret = hns_roce_mtt_init(
+				hr_dev, ib_umem_page_count(srq->idx_que.umem),
+				PAGE_SHIFT, &srq->idx_que.mtt);
 		}
 
 		if (ret) {
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 355205a28544..b0b94dedb848 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -258,7 +258,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
 				       int *num_of_mtts)
 {
 	u64 block_shift = MLX4_MAX_MTT_SHIFT;
-	u64 min_shift = umem->page_shift;
+	u64 min_shift = PAGE_SHIFT;
 	u64 last_block_aligned_end = 0;
 	u64 current_block_start = 0;
 	u64 first_block_start = 0;
@@ -295,8 +295,8 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
 			 * in access to the wrong data.
 			 */
 			misalignment_bits =
-			(start_va & (~(((u64)(BIT(umem->page_shift))) - 1ULL)))
-			^ current_block_start;
+				(start_va & (~(((u64)(PAGE_SIZE)) - 1ULL))) ^
+				current_block_start;
 			block_shift = min(alignment_of(misalignment_bits),
 					  block_shift);
 		}
@@ -514,7 +514,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
 			goto release_mpt_entry;
 		}
 		n = ib_umem_page_count(mmr->umem);
-		shift = mmr->umem->page_shift;
+		shift = PAGE_SHIFT;
 
 		err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
 					      virt_addr, length, n, shift,
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 4bf2946b9759..c9f555e04c9f 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -115,7 +115,7 @@ int mlx4_ib_create_srq(struct ib_srq *ib_srq,
 			return PTR_ERR(srq->umem);
 
 		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
-				    srq->umem->page_shift, &srq->mtt);
+				    PAGE_SHIFT, &srq->mtt);
 		if (err)
 			goto err_buf;
 
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 9f90be296ee0..fe1a76d8531c 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -55,9 +55,10 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 	int i = 0;
 	struct scatterlist *sg;
 	int entry;
-	unsigned long page_shift = umem->page_shift;
 
 	if (umem->is_odp) {
+		unsigned int page_shift = to_ib_umem_odp(umem)->page_shift;
+
 		*ncont = ib_umem_page_count(umem);
 		*count = *ncont << (page_shift - PAGE_SHIFT);
 		*shift = page_shift;
@@ -67,15 +68,15 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 		return;
 	}
 
-	addr = addr >> page_shift;
+	addr = addr >> PAGE_SHIFT;
 	tmp = (unsigned long)addr;
 	m = find_first_bit(&tmp, BITS_PER_LONG);
 	if (max_page_shift)
-		m = min_t(unsigned long, max_page_shift - page_shift, m);
+		m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m);
 
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> page_shift;
-		pfn = sg_dma_address(sg) >> page_shift;
+		len = sg_dma_len(sg) >> PAGE_SHIFT;
+		pfn = sg_dma_address(sg) >> PAGE_SHIFT;
 		if (base + p != pfn) {
 			/* If either the offset or the new
 			 * base are unaligned update m
@@ -107,7 +108,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 
 		*ncont = 0;
 	}
-	*shift = page_shift + m;
+	*shift = PAGE_SHIFT + m;
 	*count = i;
 }
 
@@ -140,8 +141,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 			    int page_shift, size_t offset, size_t num_pages,
 			    __be64 *pas, int access_flags)
 {
-	unsigned long umem_page_shift = umem->page_shift;
-	int shift = page_shift - umem_page_shift;
+	int shift = page_shift - PAGE_SHIFT;
 	int mask = (1 << shift) - 1;
 	int i, k, idx;
 	u64 cur = 0;
@@ -165,7 +165,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 
 	i = 0;
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> umem_page_shift;
+		len = sg_dma_len(sg) >> PAGE_SHIFT;
 		base = sg_dma_address(sg);
 
 		/* Skip elements below offset */
@@ -184,7 +184,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 
 		for (; k < len; k++) {
 			if (!(i & mask)) {
-				cur = base + (k << umem_page_shift);
+				cur = base + (k << PAGE_SHIFT);
 				cur |= access_flags;
 				idx = (i >> shift) - offset;
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 5f09699fab98..4d033796dcfc 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1606,8 +1606,9 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 		synchronize_srcu(&dev->mr_srcu);
 		/* Destroy all page mappings */
 		if (umem_odp->page_list)
-			mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem),
-						 ib_umem_end(umem));
+			mlx5_ib_invalidate_range(umem_odp,
+						 ib_umem_start(umem_odp),
+						 ib_umem_end(umem_odp));
 		else
 			mlx5_ib_free_implicit_mr(mr);
 		/*
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 91507a2e9290..d0c6f9cc97ef 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -150,7 +150,7 @@ static struct ib_umem_odp *odp_lookup(u64 start, u64 length,
 		if (!rb)
 			goto not_found;
 		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
-		if (ib_umem_start(&odp->umem) > start + length)
+		if (ib_umem_start(odp) > start + length)
 			goto not_found;
 	}
 not_found:
@@ -200,7 +200,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
 static void mr_leaf_free_action(struct work_struct *work)
 {
 	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
-	int idx = ib_umem_start(&odp->umem) >> MLX5_IMR_MTT_SHIFT;
+	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
 
 	mr->parent = NULL;
@@ -224,7 +224,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
 				    sizeof(struct mlx5_mtt)) - 1;
 	u64 idx = 0, blk_start_idx = 0;
-	struct ib_umem *umem;
 	int in_block = 0;
 	u64 addr;
 
@@ -232,15 +231,14 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 		pr_err("invalidation called on NULL umem or non-ODP umem\n");
 		return;
 	}
-	umem = &umem_odp->umem;
 
 	mr = umem_odp->private;
 
 	if (!mr || !mr->ibmr.pd)
 		return;
 
-	start = max_t(u64, ib_umem_start(umem), start);
-	end = min_t(u64, ib_umem_end(umem), end);
+	start = max_t(u64, ib_umem_start(umem_odp), start);
+	end = min_t(u64, ib_umem_end(umem_odp), end);
 
 	/*
 	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
@@ -249,8 +247,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 	 * but they will write 0s as well, so no difference in the end result.
 	 */
 
-	for (addr = start; addr < end; addr += BIT(umem->page_shift)) {
-		idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
+	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
+		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 		/*
 		 * Strive to write the MTTs in chunks, but avoid overwriting
 		 * non-existing MTTs. The huristic here can be improved to
@@ -544,13 +542,12 @@ static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end,
 			void *cookie)
 {
 	struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie;
-	struct ib_umem *umem = &umem_odp->umem;
 
 	if (mr->parent != imr)
 		return 0;
 
-	ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem),
-				    ib_umem_end(umem));
+	ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+				    ib_umem_end(umem_odp));
 
 	if (umem_odp->dying)
 		return 0;
@@ -602,9 +599,9 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 	}
 
 next_mr:
-	size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt);
+	size = min_t(size_t, bcnt, ib_umem_end(odp) - io_virt);
 
-	page_shift = mr->umem->page_shift;
+	page_shift = odp->page_shift;
 	page_mask = ~(BIT(page_shift) - 1);
 	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
 	access_mask = ODP_READ_ALLOWED_BIT;
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 49024326a518..ad2b8322cc3f 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -2112,10 +2112,11 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		return (struct ib_mr *)region;
 	}
 
-	nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u,"
-			" offset = %u, page size = %lu.\n",
-			(unsigned long int)start, (unsigned long int)virt, (u32)length,
-			ib_umem_offset(region), BIT(region->page_shift));
+	nes_debug(
+		NES_DBG_MR,
+		"User base = 0x%lX, Virt base = 0x%lX, length = %u, offset = %u, page size = %lu.\n",
+		(unsigned long)start, (unsigned long)virt, (u32)length,
+		ib_umem_offset(region), PAGE_SIZE);
 
 	skip_pages = ((u32)ib_umem_offset(region)) >> 12;
 
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 040d853077c6..1052d0d62be7 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -46,7 +46,6 @@ struct ib_umem {
 	struct mm_struct       *owning_mm;
 	size_t			length;
 	unsigned long		address;
-	int			page_shift;
 	u32 writable : 1;
 	u32 is_odp : 1;
 	struct work_struct	work;
@@ -58,24 +57,14 @@ struct ib_umem {
 /* Returns the offset of the umem start relative to the first page. */
 static inline int ib_umem_offset(struct ib_umem *umem)
 {
-	return umem->address & (BIT(umem->page_shift) - 1);
-}
-
-/* Returns the first page of an ODP umem. */
-static inline unsigned long ib_umem_start(struct ib_umem *umem)
-{
-	return umem->address - ib_umem_offset(umem);
-}
-
-/* Returns the address of the page after the last one of an ODP umem. */
-static inline unsigned long ib_umem_end(struct ib_umem *umem)
-{
-	return ALIGN(umem->address + umem->length, BIT(umem->page_shift));
+	return umem->address & ~PAGE_MASK;
 }
 
 static inline size_t ib_umem_num_pages(struct ib_umem *umem)
 {
-	return (ib_umem_end(umem) - ib_umem_start(umem)) >> umem->page_shift;
+	return (ALIGN(umem->address + umem->length, PAGE_SIZE) -
+		ALIGN_DOWN(umem->address, PAGE_SIZE)) >>
+	       PAGE_SHIFT;
 }
 
 #ifdef CONFIG_INFINIBAND_USER_MEM
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index eeec4e53c448..479db5c98ff6 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -76,6 +76,7 @@ struct ib_umem_odp {
 
 	struct completion	notifier_completion;
 	int			dying;
+	unsigned int		page_shift;
 	struct work_struct	work;
 };
 
@@ -84,6 +85,25 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
 	return container_of(umem, struct ib_umem_odp, umem);
 }
 
+/* Returns the first page of an ODP umem. */
+static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp)
+{
+	return ALIGN_DOWN(umem_odp->umem.address, 1UL << umem_odp->page_shift);
+}
+
+/* Returns the address of the page after the last one of an ODP umem. */
+static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp)
+{
+	return ALIGN(umem_odp->umem.address + umem_odp->umem.length,
+		     1UL << umem_odp->page_shift);
+}
+
+static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
+{
+	return (ib_umem_end(umem_odp) - ib_umem_start(umem_odp)) >>
+	       umem_odp->page_shift;
+}
+
 /*
  * The lower 2 bits of the DMA address signal the R/W permissions for
  * the entry. To upgrade the permissions, provide the appropriate
-- 
cgit v1.2.3


From 890ac8d97e6722a9e4a66a0bd836d1b028d075fe Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 20 May 2019 09:54:21 +0300
Subject: RDMA/core: Make ib_destroy_cq() void

Kernel destroy CQ flows can't fail and the returned value of
ib_destroy_cq() is not interested in those flows.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/rdma/ib_verbs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0742095355f2..ec6446864b08 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -3858,9 +3858,9 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata);
  *
  * NOTE: for user cq use ib_destroy_cq_user with valid udata!
  */
-static inline int ib_destroy_cq(struct ib_cq *cq)
+static inline void ib_destroy_cq(struct ib_cq *cq)
 {
-	return ib_destroy_cq_user(cq, NULL);
+	ib_destroy_cq_user(cq, NULL);
 }
 
 /**
-- 
cgit v1.2.3


From 7f3f317a66cac307f6fbc1b5dd74902fb1b48860 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 21 May 2019 22:11:25 +0100
Subject: drm/i915: Restore control over ppgtt for context creation ABI

Having hid the partially exposed new ABI from the PR, put it back again
for completion of context recovery. A significant part of context
recovery is the ability to reuse as much of the old context as is
feasible (to avoid expensive reconstruction). The biggest chunk kept
hidden at the moment is fine-control over the ctx->ppgtt (the GPU page
tables and associated translation tables and kernel maps), so make
control over the ctx->ppgtt explicit.

This allows userspace to create and share virtual memory address spaces
(within the limits of a single fd) between contexts they own, along with
the ability to query the contexts for the vm state.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190521211134.16117-1-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_drv.c         |  2 ++
 drivers/gpu/drm/i915/i915_gem_context.c |  5 -----
 include/uapi/drm/i915_drm.h             | 15 +++++++++++++++
 3 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 2c7a4318d13c..5061cb32856b 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -3164,6 +3164,8 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_PERF_ADD_CONFIG, i915_perf_add_config_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_PERF_REMOVE_CONFIG, i915_perf_remove_config_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_QUERY, i915_query_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_VM_CREATE, i915_gem_vm_create_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_VM_DESTROY, i915_gem_vm_destroy_ioctl, DRM_RENDER_ALLOW),
 };
 
 static struct drm_driver driver = {
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 65cefc520e79..413c4529191d 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -98,7 +98,6 @@
 #include "i915_user_extensions.h"
 
 #define I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE (1 << 1)
-#define I915_CONTEXT_PARAM_VM 0x9
 
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
 
@@ -966,8 +965,6 @@ static int get_ppgtt(struct drm_i915_file_private *file_priv,
 	struct i915_hw_ppgtt *ppgtt;
 	int ret;
 
-	return -EINVAL; /* nothing to see here; please move along */
-
 	if (!ctx->ppgtt)
 		return -ENODEV;
 
@@ -1066,8 +1063,6 @@ static int set_ppgtt(struct drm_i915_file_private *file_priv,
 	struct i915_hw_ppgtt *ppgtt, *old;
 	int err;
 
-	return -EINVAL; /* nothing to see here; please move along */
-
 	if (args->size)
 		return -EINVAL;
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 3a73f5316766..d6ad4a15b2b9 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -355,6 +355,8 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_PERF_ADD_CONFIG	0x37
 #define DRM_I915_PERF_REMOVE_CONFIG	0x38
 #define DRM_I915_QUERY			0x39
+#define DRM_I915_GEM_VM_CREATE		0x3a
+#define DRM_I915_GEM_VM_DESTROY		0x3b
 /* Must be kept compact -- no holes */
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
@@ -415,6 +417,8 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_PERF_ADD_CONFIG	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_ADD_CONFIG, struct drm_i915_perf_oa_config)
 #define DRM_IOCTL_I915_PERF_REMOVE_CONFIG	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_REMOVE_CONFIG, __u64)
 #define DRM_IOCTL_I915_QUERY			DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_QUERY, struct drm_i915_query)
+#define DRM_IOCTL_I915_GEM_VM_CREATE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_VM_CREATE, struct drm_i915_gem_vm_control)
+#define DRM_IOCTL_I915_GEM_VM_DESTROY	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_VM_DESTROY, struct drm_i915_gem_vm_control)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -1507,6 +1511,17 @@ struct drm_i915_gem_context_param {
  * On creation, all new contexts are marked as recoverable.
  */
 #define I915_CONTEXT_PARAM_RECOVERABLE	0x8
+
+	/*
+	 * The id of the associated virtual memory address space (ppGTT) of
+	 * this context. Can be retrieved and passed to another context
+	 * (on the same fd) for both to use the same ppGTT and so share
+	 * address layouts, and avoid reloading the page tables on context
+	 * switches between themselves.
+	 *
+	 * See DRM_I915_GEM_VM_CREATE and DRM_I915_GEM_VM_DESTROY.
+	 */
+#define I915_CONTEXT_PARAM_VM		0x9
 /* Must be kept compact -- no holes and well documented */
 
 	__u64 value;
-- 
cgit v1.2.3


From 976b55f0e1db5cb8fccb0a42f68ea77ae42604a6 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 21 May 2019 22:11:26 +0100
Subject: drm/i915: Allow a context to define its set of engines

Over the last few years, we have debated how to extend the user API to
support an increase in the number of engines, that may be sparse and
even be heterogeneous within a class (not all video decoders created
equal). We settled on using (class, instance) tuples to identify a
specific engine, with an API for the user to construct a map of engines
to capabilities. Into this picture, we then add a challenge of virtual
engines; one user engine that maps behind the scenes to any number of
physical engines. To keep it general, we want the user to have full
control over that mapping. To that end, we allow the user to constrain a
context to define the set of engines that it can access, order fully
controlled by the user via (class, instance). With such precise control
in context setup, we can continue to use the existing execbuf uABI of
specifying a single index; only now it doesn't automagically map onto
the engines, it uses the user defined engine map from the context.

v2: Fixup freeing of local on success of get_engines()
v3: Allow empty engines[]
v4: s/nengine/num_engines/
v5: Replace 64 limit on num_engines with a note that execbuf is
currently limited to only using the first 64 engines.
v6: Actually use the engines_mutex to guard the ctx->engines.

Testcase: igt/gem_ctx_engines
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190521211134.16117-2-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_context.c       | 265 ++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_gem_context.h       |  18 ++
 drivers/gpu/drm/i915/i915_gem_context_types.h |   1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c    |   5 +-
 drivers/gpu/drm/i915/i915_utils.h             |  34 ++++
 include/uapi/drm/i915_drm.h                   |  31 +++
 6 files changed, 341 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 413c4529191d..c501e2848ca0 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -90,7 +90,6 @@
 #include <drm/i915_drm.h>
 
 #include "gt/intel_lrc_reg.h"
-#include "gt/intel_workarounds.h"
 
 #include "i915_drv.h"
 #include "i915_globals.h"
@@ -141,15 +140,31 @@ static void lut_close(struct i915_gem_context *ctx)
 }
 
 static struct intel_context *
-lookup_user_engine(struct i915_gem_context *ctx, u16 class, u16 instance)
+lookup_user_engine(struct i915_gem_context *ctx,
+		   unsigned long flags,
+		   const struct i915_engine_class_instance *ci)
+#define LOOKUP_USER_INDEX BIT(0)
 {
-	struct intel_engine_cs *engine;
+	int idx;
 
-	engine = intel_engine_lookup_user(ctx->i915, class, instance);
-	if (!engine)
+	if (!!(flags & LOOKUP_USER_INDEX) != i915_gem_context_user_engines(ctx))
 		return ERR_PTR(-EINVAL);
 
-	return i915_gem_context_get_engine(ctx, engine->id);
+	if (!i915_gem_context_user_engines(ctx)) {
+		struct intel_engine_cs *engine;
+
+		engine = intel_engine_lookup_user(ctx->i915,
+						  ci->engine_class,
+						  ci->engine_instance);
+		if (!engine)
+			return ERR_PTR(-EINVAL);
+
+		idx = engine->id;
+	} else {
+		idx = ci->engine_instance;
+	}
+
+	return i915_gem_context_get_engine(ctx, idx);
 }
 
 static inline int new_hw_id(struct drm_i915_private *i915, gfp_t gfp)
@@ -257,6 +272,17 @@ static void free_engines(struct i915_gem_engines *e)
 	__free_engines(e, e->num_engines);
 }
 
+static void free_engines_rcu(struct work_struct *wrk)
+{
+	struct i915_gem_engines *e =
+		container_of(wrk, struct i915_gem_engines, rcu.work);
+	struct drm_i915_private *i915 = e->i915;
+
+	mutex_lock(&i915->drm.struct_mutex);
+	free_engines(e);
+	mutex_unlock(&i915->drm.struct_mutex);
+}
+
 static struct i915_gem_engines *default_engines(struct i915_gem_context *ctx)
 {
 	struct intel_engine_cs *engine;
@@ -1352,9 +1378,7 @@ static int set_sseu(struct i915_gem_context *ctx,
 	if (user_sseu.flags || user_sseu.rsvd)
 		return -EINVAL;
 
-	ce = lookup_user_engine(ctx,
-				user_sseu.engine.engine_class,
-				user_sseu.engine.engine_instance);
+	ce = lookup_user_engine(ctx, 0, &user_sseu.engine);
 	if (IS_ERR(ce))
 		return PTR_ERR(ce);
 
@@ -1379,6 +1403,217 @@ out_ce:
 	return ret;
 }
 
+struct set_engines {
+	struct i915_gem_context *ctx;
+	struct i915_gem_engines *engines;
+};
+
+static const i915_user_extension_fn set_engines__extensions[] = {
+};
+
+static int
+set_engines(struct i915_gem_context *ctx,
+	    const struct drm_i915_gem_context_param *args)
+{
+	struct i915_context_param_engines __user *user =
+		u64_to_user_ptr(args->value);
+	struct set_engines set = { .ctx = ctx };
+	unsigned int num_engines, n;
+	u64 extensions;
+	int err;
+
+	if (!args->size) { /* switch back to legacy user_ring_map */
+		if (!i915_gem_context_user_engines(ctx))
+			return 0;
+
+		set.engines = default_engines(ctx);
+		if (IS_ERR(set.engines))
+			return PTR_ERR(set.engines);
+
+		goto replace;
+	}
+
+	BUILD_BUG_ON(!IS_ALIGNED(sizeof(*user), sizeof(*user->engines)));
+	if (args->size < sizeof(*user) ||
+	    !IS_ALIGNED(args->size, sizeof(*user->engines))) {
+		DRM_DEBUG("Invalid size for engine array: %d\n",
+			  args->size);
+		return -EINVAL;
+	}
+
+	/*
+	 * Note that I915_EXEC_RING_MASK limits execbuf to only using the
+	 * first 64 engines defined here.
+	 */
+	num_engines = (args->size - sizeof(*user)) / sizeof(*user->engines);
+
+	set.engines = kmalloc(struct_size(set.engines, engines, num_engines),
+			      GFP_KERNEL);
+	if (!set.engines)
+		return -ENOMEM;
+
+	set.engines->i915 = ctx->i915;
+	for (n = 0; n < num_engines; n++) {
+		struct i915_engine_class_instance ci;
+		struct intel_engine_cs *engine;
+
+		if (copy_from_user(&ci, &user->engines[n], sizeof(ci))) {
+			__free_engines(set.engines, n);
+			return -EFAULT;
+		}
+
+		if (ci.engine_class == (u16)I915_ENGINE_CLASS_INVALID &&
+		    ci.engine_instance == (u16)I915_ENGINE_CLASS_INVALID_NONE) {
+			set.engines->engines[n] = NULL;
+			continue;
+		}
+
+		engine = intel_engine_lookup_user(ctx->i915,
+						  ci.engine_class,
+						  ci.engine_instance);
+		if (!engine) {
+			DRM_DEBUG("Invalid engine[%d]: { class:%d, instance:%d }\n",
+				  n, ci.engine_class, ci.engine_instance);
+			__free_engines(set.engines, n);
+			return -ENOENT;
+		}
+
+		set.engines->engines[n] = intel_context_create(ctx, engine);
+		if (!set.engines->engines[n]) {
+			__free_engines(set.engines, n);
+			return -ENOMEM;
+		}
+	}
+	set.engines->num_engines = num_engines;
+
+	err = -EFAULT;
+	if (!get_user(extensions, &user->extensions))
+		err = i915_user_extensions(u64_to_user_ptr(extensions),
+					   set_engines__extensions,
+					   ARRAY_SIZE(set_engines__extensions),
+					   &set);
+	if (err) {
+		free_engines(set.engines);
+		return err;
+	}
+
+replace:
+	mutex_lock(&ctx->engines_mutex);
+	if (args->size)
+		i915_gem_context_set_user_engines(ctx);
+	else
+		i915_gem_context_clear_user_engines(ctx);
+	rcu_swap_protected(ctx->engines, set.engines, 1);
+	mutex_unlock(&ctx->engines_mutex);
+
+	INIT_RCU_WORK(&set.engines->rcu, free_engines_rcu);
+	queue_rcu_work(system_wq, &set.engines->rcu);
+
+	return 0;
+}
+
+static struct i915_gem_engines *
+__copy_engines(struct i915_gem_engines *e)
+{
+	struct i915_gem_engines *copy;
+	unsigned int n;
+
+	copy = kmalloc(struct_size(e, engines, e->num_engines), GFP_KERNEL);
+	if (!copy)
+		return ERR_PTR(-ENOMEM);
+
+	copy->i915 = e->i915;
+	for (n = 0; n < e->num_engines; n++) {
+		if (e->engines[n])
+			copy->engines[n] = intel_context_get(e->engines[n]);
+		else
+			copy->engines[n] = NULL;
+	}
+	copy->num_engines = n;
+
+	return copy;
+}
+
+static int
+get_engines(struct i915_gem_context *ctx,
+	    struct drm_i915_gem_context_param *args)
+{
+	struct i915_context_param_engines __user *user;
+	struct i915_gem_engines *e;
+	size_t n, count, size;
+	int err = 0;
+
+	err = mutex_lock_interruptible(&ctx->engines_mutex);
+	if (err)
+		return err;
+
+	e = NULL;
+	if (i915_gem_context_user_engines(ctx))
+		e = __copy_engines(i915_gem_context_engines(ctx));
+	mutex_unlock(&ctx->engines_mutex);
+	if (IS_ERR_OR_NULL(e)) {
+		args->size = 0;
+		return PTR_ERR_OR_ZERO(e);
+	}
+
+	count = e->num_engines;
+
+	/* Be paranoid in case we have an impedance mismatch */
+	if (!check_struct_size(user, engines, count, &size)) {
+		err = -EINVAL;
+		goto err_free;
+	}
+	if (overflows_type(size, args->size)) {
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	if (!args->size) {
+		args->size = size;
+		goto err_free;
+	}
+
+	if (args->size < size) {
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	user = u64_to_user_ptr(args->value);
+	if (!access_ok(user, size)) {
+		err = -EFAULT;
+		goto err_free;
+	}
+
+	if (put_user(0, &user->extensions)) {
+		err = -EFAULT;
+		goto err_free;
+	}
+
+	for (n = 0; n < count; n++) {
+		struct i915_engine_class_instance ci = {
+			.engine_class = I915_ENGINE_CLASS_INVALID,
+			.engine_instance = I915_ENGINE_CLASS_INVALID_NONE,
+		};
+
+		if (e->engines[n]) {
+			ci.engine_class = e->engines[n]->engine->uabi_class;
+			ci.engine_instance = e->engines[n]->engine->instance;
+		}
+
+		if (copy_to_user(&user->engines[n], &ci, sizeof(ci))) {
+			err = -EFAULT;
+			goto err_free;
+		}
+	}
+
+	args->size = size;
+
+err_free:
+	INIT_RCU_WORK(&e->rcu, free_engines_rcu);
+	queue_rcu_work(system_wq, &e->rcu);
+	return err;
+}
+
 static int ctx_setparam(struct drm_i915_file_private *fpriv,
 			struct i915_gem_context *ctx,
 			struct drm_i915_gem_context_param *args)
@@ -1452,6 +1687,10 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
 		ret = set_ppgtt(fpriv, ctx, args);
 		break;
 
+	case I915_CONTEXT_PARAM_ENGINES:
+		ret = set_engines(ctx, args);
+		break;
+
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	default:
 		ret = -EINVAL;
@@ -1596,9 +1835,7 @@ static int get_sseu(struct i915_gem_context *ctx,
 	if (user_sseu.flags || user_sseu.rsvd)
 		return -EINVAL;
 
-	ce = lookup_user_engine(ctx,
-				user_sseu.engine.engine_class,
-				user_sseu.engine.engine_instance);
+	ce = lookup_user_engine(ctx, 0, &user_sseu.engine);
 	if (IS_ERR(ce))
 		return PTR_ERR(ce);
 
@@ -1682,6 +1919,10 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		ret = get_ppgtt(file_priv, ctx, args);
 		break;
 
+	case I915_CONTEXT_PARAM_ENGINES:
+		ret = get_engines(ctx, args);
+		break;
+
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	default:
 		ret = -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 272e183ebc0c..9ad4a6362438 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -112,6 +112,24 @@ static inline void i915_gem_context_set_force_single_submission(struct i915_gem_
 	__set_bit(CONTEXT_FORCE_SINGLE_SUBMISSION, &ctx->flags);
 }
 
+static inline bool
+i915_gem_context_user_engines(const struct i915_gem_context *ctx)
+{
+	return test_bit(CONTEXT_USER_ENGINES, &ctx->flags);
+}
+
+static inline void
+i915_gem_context_set_user_engines(struct i915_gem_context *ctx)
+{
+	set_bit(CONTEXT_USER_ENGINES, &ctx->flags);
+}
+
+static inline void
+i915_gem_context_clear_user_engines(struct i915_gem_context *ctx)
+{
+	clear_bit(CONTEXT_USER_ENGINES, &ctx->flags);
+}
+
 int __i915_gem_context_pin_hw_id(struct i915_gem_context *ctx);
 static inline int i915_gem_context_pin_hw_id(struct i915_gem_context *ctx)
 {
diff --git a/drivers/gpu/drm/i915/i915_gem_context_types.h b/drivers/gpu/drm/i915/i915_gem_context_types.h
index d5cb4f121aad..fb965ded2508 100644
--- a/drivers/gpu/drm/i915/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/i915_gem_context_types.h
@@ -146,6 +146,7 @@ struct i915_gem_context {
 #define CONTEXT_BANNED			0
 #define CONTEXT_CLOSED			1
 #define CONTEXT_FORCE_SINGLE_SUBMISSION	2
+#define CONTEXT_USER_ENGINES		3
 
 	/**
 	 * @hw_id: - unique identifier for the context
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 679f7c1561ba..d6c5220addd0 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -2165,7 +2165,10 @@ eb_select_engine(struct i915_execbuffer *eb,
 	unsigned int idx;
 	int err;
 
-	idx = eb_select_legacy_ring(eb, file, args);
+	if (i915_gem_context_user_engines(eb->gem_context))
+		idx = args->flags & I915_EXEC_RING_MASK;
+	else
+		idx = eb_select_legacy_ring(eb, file, args);
 
 	ce = i915_gem_context_get_engine(eb->gem_context, idx);
 	if (IS_ERR(ce))
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 5c94c7ab4607..e52866084891 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -26,6 +26,7 @@
 #define __I915_UTILS_H
 
 #include <linux/list.h>
+#include <linux/overflow.h>
 #include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
@@ -78,6 +79,39 @@
 #define overflows_type(x, T) \
 	(sizeof(x) > sizeof(T) && (x) >> BITS_PER_TYPE(T))
 
+static inline bool
+__check_struct_size(size_t base, size_t arr, size_t count, size_t *size)
+{
+	size_t sz;
+
+	if (check_mul_overflow(count, arr, &sz))
+		return false;
+
+	if (check_add_overflow(sz, base, &sz))
+		return false;
+
+	*size = sz;
+	return true;
+}
+
+/**
+ * check_struct_size() - Calculate size of structure with trailing array.
+ * @p: Pointer to the structure.
+ * @member: Name of the array member.
+ * @n: Number of elements in the array.
+ * @sz: Total size of structure and array
+ *
+ * Calculates size of memory needed for structure @p followed by an
+ * array of @n @member elements, like struct_size() but reports
+ * whether it overflowed, and the resultant size in @sz
+ *
+ * Return: false if the calculation overflowed.
+ */
+#define check_struct_size(p, member, n, sz) \
+	likely(__check_struct_size(sizeof(*(p)), \
+				   sizeof(*(p)->member) + __must_be_array((p)->member), \
+				   n, sz))
+
 #define ptr_mask_bits(ptr, n) ({					\
 	unsigned long __v = (unsigned long)(ptr);			\
 	(typeof(ptr))(__v & -BIT(n));					\
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index d6ad4a15b2b9..8e1bb22926e4 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -136,6 +136,7 @@ enum drm_i915_gem_engine_class {
 struct i915_engine_class_instance {
 	__u16 engine_class; /* see enum drm_i915_gem_engine_class */
 	__u16 engine_instance;
+#define I915_ENGINE_CLASS_INVALID_NONE -1
 };
 
 /**
@@ -1522,6 +1523,26 @@ struct drm_i915_gem_context_param {
 	 * See DRM_I915_GEM_VM_CREATE and DRM_I915_GEM_VM_DESTROY.
 	 */
 #define I915_CONTEXT_PARAM_VM		0x9
+
+/*
+ * I915_CONTEXT_PARAM_ENGINES:
+ *
+ * Bind this context to operate on this subset of available engines. Henceforth,
+ * the I915_EXEC_RING selector for DRM_IOCTL_I915_GEM_EXECBUFFER2 operates as
+ * an index into this array of engines; I915_EXEC_DEFAULT selecting engine[0]
+ * and upwards. Slots 0...N are filled in using the specified (class, instance).
+ * Use
+ *	engine_class: I915_ENGINE_CLASS_INVALID,
+ *	engine_instance: I915_ENGINE_CLASS_INVALID_NONE
+ * to specify a gap in the array that can be filled in later, e.g. by a
+ * virtual engine used for load balancing.
+ *
+ * Setting the number of engines bound to the context to 0, by passing a zero
+ * sized argument, will revert back to default settings.
+ *
+ * See struct i915_context_param_engines.
+ */
+#define I915_CONTEXT_PARAM_ENGINES	0xa
 /* Must be kept compact -- no holes and well documented */
 
 	__u64 value;
@@ -1585,6 +1606,16 @@ struct drm_i915_gem_context_param_sseu {
 	__u32 rsvd;
 };
 
+struct i915_context_param_engines {
+	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
+	struct i915_engine_class_instance engines[0];
+} __attribute__((packed));
+
+#define I915_DEFINE_CONTEXT_PARAM_ENGINES(name__, N__) struct { \
+	__u64 extensions; \
+	struct i915_engine_class_instance engines[N__]; \
+} __attribute__((packed)) name__
+
 struct drm_i915_gem_context_create_ext_setparam {
 #define I915_CONTEXT_CREATE_EXT_SETPARAM 0
 	struct i915_user_extension base;
-- 
cgit v1.2.3


From e620f7b3a26389dfce2663ad4e64c2271ad1a815 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 21 May 2019 22:11:27 +0100
Subject: drm/i915: Extend I915_CONTEXT_PARAM_SSEU to support local
 ctx->engine[]

Allow the user to specify a local engine index (as opposed to
class:index) that they can use to refer to a preset engine inside the
ctx->engine[] array defined by an earlier I915_CONTEXT_PARAM_ENGINES.
This will be useful for setting SSEU parameters on virtual engines that
are local to the context and do not have a valid global class:instance
lookup.

Note that due to the ambiguity in using class:instance with
ctx->engines[], if a user supplied engine map is active the user must
specify the engine to alter by its index into the ctx->engines[].

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190521211134.16117-3-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_context.c | 24 ++++++++++++++++++++----
 include/uapi/drm/i915_drm.h             |  3 ++-
 2 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index c501e2848ca0..d38d8a45a5cf 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1363,6 +1363,7 @@ static int set_sseu(struct i915_gem_context *ctx,
 	struct drm_i915_gem_context_param_sseu user_sseu;
 	struct intel_context *ce;
 	struct intel_sseu sseu;
+	unsigned long lookup;
 	int ret;
 
 	if (args->size < sizeof(user_sseu))
@@ -1375,10 +1376,17 @@ static int set_sseu(struct i915_gem_context *ctx,
 			   sizeof(user_sseu)))
 		return -EFAULT;
 
-	if (user_sseu.flags || user_sseu.rsvd)
+	if (user_sseu.rsvd)
 		return -EINVAL;
 
-	ce = lookup_user_engine(ctx, 0, &user_sseu.engine);
+	if (user_sseu.flags & ~(I915_CONTEXT_SSEU_FLAG_ENGINE_INDEX))
+		return -EINVAL;
+
+	lookup = 0;
+	if (user_sseu.flags & I915_CONTEXT_SSEU_FLAG_ENGINE_INDEX)
+		lookup |= LOOKUP_USER_INDEX;
+
+	ce = lookup_user_engine(ctx, lookup, &user_sseu.engine);
 	if (IS_ERR(ce))
 		return PTR_ERR(ce);
 
@@ -1821,6 +1829,7 @@ static int get_sseu(struct i915_gem_context *ctx,
 {
 	struct drm_i915_gem_context_param_sseu user_sseu;
 	struct intel_context *ce;
+	unsigned long lookup;
 	int err;
 
 	if (args->size == 0)
@@ -1832,10 +1841,17 @@ static int get_sseu(struct i915_gem_context *ctx,
 			   sizeof(user_sseu)))
 		return -EFAULT;
 
-	if (user_sseu.flags || user_sseu.rsvd)
+	if (user_sseu.rsvd)
 		return -EINVAL;
 
-	ce = lookup_user_engine(ctx, 0, &user_sseu.engine);
+	if (user_sseu.flags & ~(I915_CONTEXT_SSEU_FLAG_ENGINE_INDEX))
+		return -EINVAL;
+
+	lookup = 0;
+	if (user_sseu.flags & I915_CONTEXT_SSEU_FLAG_ENGINE_INDEX)
+		lookup |= LOOKUP_USER_INDEX;
+
+	ce = lookup_user_engine(ctx, lookup, &user_sseu.engine);
 	if (IS_ERR(ce))
 		return PTR_ERR(ce);
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 8e1bb22926e4..82bd488ed0d1 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1576,9 +1576,10 @@ struct drm_i915_gem_context_param_sseu {
 	struct i915_engine_class_instance engine;
 
 	/*
-	 * Unused for now. Must be cleared to zero.
+	 * Unknown flags must be cleared to zero.
 	 */
 	__u32 flags;
+#define I915_CONTEXT_SSEU_FLAG_ENGINE_INDEX (1u << 0)
 
 	/*
 	 * Mask of slices to enable for the context. Valid values are a subset
-- 
cgit v1.2.3


From 8319f44c0525708c26ac7724da897cff3dbb0f84 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 21 May 2019 22:11:28 +0100
Subject: drm/i915: Re-expose SINGLE_TIMELINE flags for context creation

The SINGLE_TIMELINE flag can be used to create a context such that all
engine instances within that context share a common timeline. This can
be useful for mixing operations between real and virtual engines, or
when using a composite context for a single client API context.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190521211134.16117-4-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_context.c | 4 ----
 include/uapi/drm/i915_drm.h             | 3 ++-
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index d38d8a45a5cf..d391d474820f 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -96,8 +96,6 @@
 #include "i915_trace.h"
 #include "i915_user_extensions.h"
 
-#define I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE (1 << 1)
-
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
 
 static struct i915_global_gem_context {
@@ -505,8 +503,6 @@ i915_gem_create_context(struct drm_i915_private *dev_priv, unsigned int flags)
 
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
 
-	BUILD_BUG_ON(I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE &
-		     ~I915_CONTEXT_CREATE_FLAGS_UNKNOWN);
 	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE &&
 	    !HAS_EXECLISTS(dev_priv))
 		return ERR_PTR(-EINVAL);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 82bd488ed0d1..957ba8e60e02 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1469,8 +1469,9 @@ struct drm_i915_gem_context_create_ext {
 	__u32 ctx_id; /* output: id of new context*/
 	__u32 flags;
 #define I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS	(1u << 0)
+#define I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE	(1u << 1)
 #define I915_CONTEXT_CREATE_FLAGS_UNKNOWN \
-	(-(I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS << 1))
+	(-(I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE << 1))
 	__u64 extensions;
 };
 
-- 
cgit v1.2.3


From b81dde719439c8f09bb61e742ed95bfc4b33946b Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 21 May 2019 22:11:29 +0100
Subject: drm/i915: Allow userspace to clone contexts on creation

A usecase arose out of handling context recovery in mesa, whereby they
wish to recreate a context with fresh logical state but preserving all
other details of the original. Currently, they create a new context and
iterate over which bits they want to copy across, but it would much more
convenient if they were able to just pass in a target context to clone
during creation. This essentially extends the setparam during creation
to pull the details from a target context instead of the user supplied
parameters.

The ideal here is that we don't expose control over anything more than
can be obtained via CONTEXT_PARAM. That is userspace retains explicit
control over all features, and this api is just convenience.

For example, you could replace

	struct context_param p = { .param = CONTEXT_PARAM_VM };

	param.ctx_id = old_id;
	gem_context_get_param(&p.param);

	new_id = gem_context_create();

	param.ctx_id = new_id;
	gem_context_set_param(&p.param);

	gem_vm_destroy(param.value); /* drop the ref to VM_ID handle */

with

	struct create_ext_param p = {
	  { .name = CONTEXT_CREATE_CLONE },
	  .clone_id = old_id,
	  .flags = CLONE_FLAGS_VM
	}
	new_id = gem_context_create_ext(&p);

and not have to worry about stray namespace pollution etc.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190521211134.16117-5-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem_context.c | 206 ++++++++++++++++++++++++++++++++
 include/uapi/drm/i915_drm.h             |  15 +++
 2 files changed, 221 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index d391d474820f..24736fcd463d 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1723,8 +1723,214 @@ static int create_setparam(struct i915_user_extension __user *ext, void *data)
 	return ctx_setparam(arg->fpriv, arg->ctx, &local.param);
 }
 
+static int clone_engines(struct i915_gem_context *dst,
+			 struct i915_gem_context *src)
+{
+	struct i915_gem_engines *e = i915_gem_context_lock_engines(src);
+	struct i915_gem_engines *clone;
+	bool user_engines;
+	unsigned long n;
+
+	clone = kmalloc(struct_size(e, engines, e->num_engines), GFP_KERNEL);
+	if (!clone)
+		goto err_unlock;
+
+	clone->i915 = dst->i915;
+	for (n = 0; n < e->num_engines; n++) {
+		if (!e->engines[n]) {
+			clone->engines[n] = NULL;
+			continue;
+		}
+
+		clone->engines[n] =
+			intel_context_create(dst, e->engines[n]->engine);
+		if (!clone->engines[n]) {
+			__free_engines(clone, n);
+			goto err_unlock;
+		}
+	}
+	clone->num_engines = n;
+
+	user_engines = i915_gem_context_user_engines(src);
+	i915_gem_context_unlock_engines(src);
+
+	free_engines(dst->engines);
+	RCU_INIT_POINTER(dst->engines, clone);
+	if (user_engines)
+		i915_gem_context_set_user_engines(dst);
+	else
+		i915_gem_context_clear_user_engines(dst);
+	return 0;
+
+err_unlock:
+	i915_gem_context_unlock_engines(src);
+	return -ENOMEM;
+}
+
+static int clone_flags(struct i915_gem_context *dst,
+		       struct i915_gem_context *src)
+{
+	dst->user_flags = src->user_flags;
+	return 0;
+}
+
+static int clone_schedattr(struct i915_gem_context *dst,
+			   struct i915_gem_context *src)
+{
+	dst->sched = src->sched;
+	return 0;
+}
+
+static int clone_sseu(struct i915_gem_context *dst,
+		      struct i915_gem_context *src)
+{
+	struct i915_gem_engines *e = i915_gem_context_lock_engines(src);
+	struct i915_gem_engines *clone;
+	unsigned long n;
+	int err;
+
+	clone = dst->engines; /* no locking required; sole access */
+	if (e->num_engines != clone->num_engines) {
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	for (n = 0; n < e->num_engines; n++) {
+		struct intel_context *ce = e->engines[n];
+
+		if (clone->engines[n]->engine->class != ce->engine->class) {
+			/* Must have compatible engine maps! */
+			err = -EINVAL;
+			goto unlock;
+		}
+
+		/* serialises with set_sseu */
+		err = intel_context_lock_pinned(ce);
+		if (err)
+			goto unlock;
+
+		clone->engines[n]->sseu = ce->sseu;
+		intel_context_unlock_pinned(ce);
+	}
+
+	err = 0;
+unlock:
+	i915_gem_context_unlock_engines(src);
+	return err;
+}
+
+static int clone_timeline(struct i915_gem_context *dst,
+			  struct i915_gem_context *src)
+{
+	if (src->timeline) {
+		GEM_BUG_ON(src->timeline == dst->timeline);
+
+		if (dst->timeline)
+			i915_timeline_put(dst->timeline);
+		dst->timeline = i915_timeline_get(src->timeline);
+	}
+
+	return 0;
+}
+
+static int clone_vm(struct i915_gem_context *dst,
+		    struct i915_gem_context *src)
+{
+	struct i915_hw_ppgtt *ppgtt;
+
+	rcu_read_lock();
+	do {
+		ppgtt = READ_ONCE(src->ppgtt);
+		if (!ppgtt)
+			break;
+
+		if (!kref_get_unless_zero(&ppgtt->ref))
+			continue;
+
+		/*
+		 * This ppgtt may have be reallocated between
+		 * the read and the kref, and reassigned to a third
+		 * context. In order to avoid inadvertent sharing
+		 * of this ppgtt with that third context (and not
+		 * src), we have to confirm that we have the same
+		 * ppgtt after passing through the strong memory
+		 * barrier implied by a successful
+		 * kref_get_unless_zero().
+		 *
+		 * Once we have acquired the current ppgtt of src,
+		 * we no longer care if it is released from src, as
+		 * it cannot be reallocated elsewhere.
+		 */
+
+		if (ppgtt == READ_ONCE(src->ppgtt))
+			break;
+
+		i915_ppgtt_put(ppgtt);
+	} while (1);
+	rcu_read_unlock();
+
+	if (ppgtt) {
+		__assign_ppgtt(dst, ppgtt);
+		i915_ppgtt_put(ppgtt);
+	}
+
+	return 0;
+}
+
+static int create_clone(struct i915_user_extension __user *ext, void *data)
+{
+	static int (* const fn[])(struct i915_gem_context *dst,
+				  struct i915_gem_context *src) = {
+#define MAP(x, y) [ilog2(I915_CONTEXT_CLONE_##x)] = y
+		MAP(ENGINES, clone_engines),
+		MAP(FLAGS, clone_flags),
+		MAP(SCHEDATTR, clone_schedattr),
+		MAP(SSEU, clone_sseu),
+		MAP(TIMELINE, clone_timeline),
+		MAP(VM, clone_vm),
+#undef MAP
+	};
+	struct drm_i915_gem_context_create_ext_clone local;
+	const struct create_ext *arg = data;
+	struct i915_gem_context *dst = arg->ctx;
+	struct i915_gem_context *src;
+	int err, bit;
+
+	if (copy_from_user(&local, ext, sizeof(local)))
+		return -EFAULT;
+
+	BUILD_BUG_ON(GENMASK(BITS_PER_TYPE(local.flags) - 1, ARRAY_SIZE(fn)) !=
+		     I915_CONTEXT_CLONE_UNKNOWN);
+
+	if (local.flags & I915_CONTEXT_CLONE_UNKNOWN)
+		return -EINVAL;
+
+	if (local.rsvd)
+		return -EINVAL;
+
+	rcu_read_lock();
+	src = __i915_gem_context_lookup_rcu(arg->fpriv, local.clone_id);
+	rcu_read_unlock();
+	if (!src)
+		return -ENOENT;
+
+	GEM_BUG_ON(src == dst);
+
+	for (bit = 0; bit < ARRAY_SIZE(fn); bit++) {
+		if (!(local.flags & BIT(bit)))
+			continue;
+
+		err = fn[bit](dst, src);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static const i915_user_extension_fn create_extensions[] = {
 	[I915_CONTEXT_CREATE_EXT_SETPARAM] = create_setparam,
+	[I915_CONTEXT_CREATE_EXT_CLONE] = create_clone,
 };
 
 static bool client_is_banned(struct drm_i915_file_private *file_priv)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 957ba8e60e02..62396d575e28 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1624,6 +1624,21 @@ struct drm_i915_gem_context_create_ext_setparam {
 	struct drm_i915_gem_context_param param;
 };
 
+struct drm_i915_gem_context_create_ext_clone {
+#define I915_CONTEXT_CREATE_EXT_CLONE 1
+	struct i915_user_extension base;
+	__u32 clone_id;
+	__u32 flags;
+#define I915_CONTEXT_CLONE_ENGINES	(1u << 0)
+#define I915_CONTEXT_CLONE_FLAGS	(1u << 1)
+#define I915_CONTEXT_CLONE_SCHEDATTR	(1u << 2)
+#define I915_CONTEXT_CLONE_SSEU		(1u << 3)
+#define I915_CONTEXT_CLONE_TIMELINE	(1u << 4)
+#define I915_CONTEXT_CLONE_VM		(1u << 5)
+#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_VM << 1)
+	__u64 rsvd;
+};
+
 struct drm_i915_gem_context_destroy {
 	__u32 ctx_id;
 	__u32 pad;
-- 
cgit v1.2.3


From 6d06779e86724322d79eb53b26989edd9db188f6 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 21 May 2019 22:11:30 +0100
Subject: drm/i915: Load balancing across a virtual engine

Having allowed the user to define a set of engines that they will want
to only use, we go one step further and allow them to bind those engines
into a single virtual instance. Submitting a batch to the virtual engine
will then forward it to any one of the set in a manner as best to
distribute load.  The virtual engine has a single timeline across all
engines (it operates as a single queue), so it is not able to concurrently
run batches across multiple engines by itself; that is left up to the user
to submit multiple concurrent batches to multiple queues. Multiple users
will be load balanced across the system.

The mechanism used for load balancing in this patch is a late greedy
balancer. When a request is ready for execution, it is added to each
engine's queue, and when an engine is ready for its next request it
claims it from the virtual engine. The first engine to do so, wins, i.e.
the request is executed at the earliest opportunity (idle moment) in the
system.

As not all HW is created equal, the user is still able to skip the
virtual engine and execute the batch on a specific engine, all within the
same queue. It will then be executed in order on the correct engine,
with execution on other virtual engines being moved away due to the load
detection.

A couple of areas for potential improvement left!

- The virtual engine always take priority over equal-priority tasks.
Mostly broken up by applying FQ_CODEL rules for prioritising new clients,
and hopefully the virtual and real engines are not then congested (i.e.
all work is via virtual engines, or all work is to the real engine).

- We require the breadcrumb irq around every virtual engine request. For
normal engines, we eliminate the need for the slow round trip via
interrupt by using the submit fence and queueing in order. For virtual
engines, we have to allow any job to transfer to a new ring, and cannot
coalesce the submissions, so require the completion fence instead,
forcing the persistent use of interrupts.

- We only drip feed single requests through each virtual engine and onto
the physical engines, even if there was enough work to fill all ELSP,
leaving small stalls with an idle CS event at the end of every request.
Could we be greedy and fill both slots? Being lazy is virtuous for load
distribution on less-than-full workloads though.

Other areas of improvement are more general, such as reducing lock
contention, reducing dispatch overhead, looking at direct submission
rather than bouncing around tasklets etc.

sseu: Lift the restriction to allow sseu to be reconfigured on virtual
engines composed of RENDER_CLASS (rcs).

v2: macroize check_user_mbz()
v3: Cancel virtual engines on wedging
v4: Commence commenting
v5: Replace 64b sibling_mask with a list of class:instance
v6: Drop the one-element array in the uabi
v7: Assert it is an virtual engine in to_virtual_engine()
v8: Skip over holes in [class][inst] so we can selftest with (vcs0, vcs2)

Link: https://github.com/intel/media-driver/pull/283
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190521211134.16117-6-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/gt/intel_engine_types.h |   8 +
 drivers/gpu/drm/i915/gt/intel_lrc.c          | 680 ++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/gt/intel_lrc.h          |   9 +
 drivers/gpu/drm/i915/gt/selftest_lrc.c       | 180 +++++++
 drivers/gpu/drm/i915/i915_gem.h              |   5 +
 drivers/gpu/drm/i915/i915_gem_context.c      | 116 ++++-
 drivers/gpu/drm/i915/i915_scheduler.c        |  19 +-
 drivers/gpu/drm/i915/i915_timeline_types.h   |   1 +
 include/uapi/drm/i915_drm.h                  |  39 ++
 9 files changed, 1029 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index e381c1c73902..7b47e00fa082 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -227,6 +227,7 @@ struct intel_engine_execlists {
 	 * @queue: queue of requests, in priority lists
 	 */
 	struct rb_root_cached queue;
+	struct rb_root_cached virtual;
 
 	/**
 	 * @csb_write: control register for Context Switch buffer
@@ -445,6 +446,7 @@ struct intel_engine_cs {
 #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
 #define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
 #define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4)
+#define I915_ENGINE_IS_VIRTUAL       BIT(5)
 	unsigned int flags;
 
 	/*
@@ -534,6 +536,12 @@ intel_engine_needs_breadcrumb_tasklet(const struct intel_engine_cs *engine)
 	return engine->flags & I915_ENGINE_NEEDS_BREADCRUMB_TASKLET;
 }
 
+static inline bool
+intel_engine_is_virtual(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_IS_VIRTUAL;
+}
+
 #define instdone_slice_mask(dev_priv__) \
 	(IS_GEN(dev_priv__, 7) ? \
 	 1 : RUNTIME_INFO(dev_priv__)->sseu.slice_mask)
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index f263a8374273..affa5e2dfce1 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -164,6 +164,42 @@
 #define WA_TAIL_DWORDS 2
 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 
+struct virtual_engine {
+	struct intel_engine_cs base;
+	struct intel_context context;
+
+	/*
+	 * We allow only a single request through the virtual engine at a time
+	 * (each request in the timeline waits for the completion fence of
+	 * the previous before being submitted). By restricting ourselves to
+	 * only submitting a single request, each request is placed on to a
+	 * physical to maximise load spreading (by virtue of the late greedy
+	 * scheduling -- each real engine takes the next available request
+	 * upon idling).
+	 */
+	struct i915_request *request;
+
+	/*
+	 * We keep a rbtree of available virtual engines inside each physical
+	 * engine, sorted by priority. Here we preallocate the nodes we need
+	 * for the virtual engine, indexed by physical_engine->id.
+	 */
+	struct ve_node {
+		struct rb_node rb;
+		int prio;
+	} nodes[I915_NUM_ENGINES];
+
+	/* And finally, which physical engines this virtual engine maps onto. */
+	unsigned int num_siblings;
+	struct intel_engine_cs *siblings[0];
+};
+
+static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
+{
+	GEM_BUG_ON(!intel_engine_is_virtual(engine));
+	return container_of(engine, struct virtual_engine, base);
+}
+
 static int execlists_context_deferred_alloc(struct intel_context *ce,
 					    struct intel_engine_cs *engine);
 static void execlists_init_reg_state(u32 *reg_state,
@@ -216,7 +252,8 @@ static int queue_prio(const struct intel_engine_execlists *execlists)
 }
 
 static inline bool need_preempt(const struct intel_engine_cs *engine,
-				const struct i915_request *rq)
+				const struct i915_request *rq,
+				struct rb_node *rb)
 {
 	int last_prio;
 
@@ -251,6 +288,25 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
 	    rq_prio(list_next_entry(rq, link)) > last_prio)
 		return true;
 
+	if (rb) {
+		struct virtual_engine *ve =
+			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+		bool preempt = false;
+
+		if (engine == ve->siblings[0]) { /* only preempt one sibling */
+			struct i915_request *next;
+
+			rcu_read_lock();
+			next = READ_ONCE(ve->request);
+			if (next)
+				preempt = rq_prio(next) > last_prio;
+			rcu_read_unlock();
+		}
+
+		if (preempt)
+			return preempt;
+	}
+
 	/*
 	 * If the inflight context did not trigger the preemption, then maybe
 	 * it was the set of queued requests? Pick the highest priority in
@@ -369,6 +425,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
 	list_for_each_entry_safe_reverse(rq, rn,
 					 &engine->timeline.requests,
 					 link) {
+		struct intel_engine_cs *owner;
+
 		if (i915_request_completed(rq))
 			break;
 
@@ -377,16 +435,29 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
 
 		GEM_BUG_ON(rq->hw_context->active);
 
-		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
-		if (rq_prio(rq) != prio) {
-			prio = rq_prio(rq);
-			pl = i915_sched_lookup_priolist(engine, prio);
-		}
-		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
-
-		list_add(&rq->sched.link, pl);
+		/*
+		 * Push the request back into the queue for later resubmission.
+		 * If this request is not native to this physical engine (i.e.
+		 * it came from a virtual source), push it back onto the virtual
+		 * engine so that it can be moved across onto another physical
+		 * engine as load dictates.
+		 */
+		owner = rq->hw_context->engine;
+		if (likely(owner == engine)) {
+			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
+			if (rq_prio(rq) != prio) {
+				prio = rq_prio(rq);
+				pl = i915_sched_lookup_priolist(engine, prio);
+			}
+			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
 
-		active = rq;
+			list_add(&rq->sched.link, pl);
+			active = rq;
+		} else {
+			rq->engine = owner;
+			owner->submit_request(rq);
+			active = NULL;
+		}
 	}
 
 	return active;
@@ -622,6 +693,90 @@ static void complete_preempt_context(struct intel_engine_execlists *execlists)
 						  execlists));
 }
 
+static void virtual_update_register_offsets(u32 *regs,
+					    struct intel_engine_cs *engine)
+{
+	u32 base = engine->mmio_base;
+
+	/* Must match execlists_init_reg_state()! */
+
+	regs[CTX_CONTEXT_CONTROL] =
+		i915_mmio_reg_offset(RING_CONTEXT_CONTROL(base));
+	regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
+	regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
+	regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
+	regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));
+
+	regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
+	regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
+	regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
+	regs[CTX_SECOND_BB_HEAD_U] =
+		i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
+	regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
+	regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));
+
+	regs[CTX_CTX_TIMESTAMP] =
+		i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
+	regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 3));
+	regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 3));
+	regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 2));
+	regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 2));
+	regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 1));
+	regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 1));
+	regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
+	regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 0));
+
+	if (engine->class == RENDER_CLASS) {
+		regs[CTX_RCS_INDIRECT_CTX] =
+			i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
+		regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
+			i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
+		regs[CTX_BB_PER_CTX_PTR] =
+			i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));
+
+		regs[CTX_R_PWR_CLK_STATE] =
+			i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
+	}
+}
+
+static bool virtual_matches(const struct virtual_engine *ve,
+			    const struct i915_request *rq,
+			    const struct intel_engine_cs *engine)
+{
+	const struct intel_engine_cs *active;
+
+	/*
+	 * We track when the HW has completed saving the context image
+	 * (i.e. when we have seen the final CS event switching out of
+	 * the context) and must not overwrite the context image before
+	 * then. This restricts us to only using the active engine
+	 * while the previous virtualized request is inflight (so
+	 * we reuse the register offsets). This is a very small
+	 * hystersis on the greedy seelction algorithm.
+	 */
+	active = READ_ONCE(ve->context.active);
+	if (active && active != engine)
+		return false;
+
+	return true;
+}
+
+static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
+				     struct intel_engine_cs *engine)
+{
+	struct intel_engine_cs *old = ve->siblings[0];
+
+	/* All unattached (rq->engine == old) must already be completed */
+
+	spin_lock(&old->breadcrumbs.irq_lock);
+	if (!list_empty(&ve->context.signal_link)) {
+		list_move_tail(&ve->context.signal_link,
+			       &engine->breadcrumbs.signalers);
+		intel_engine_queue_breadcrumbs(engine);
+	}
+	spin_unlock(&old->breadcrumbs.irq_lock);
+}
+
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -654,6 +809,26 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * and context switches) submission.
 	 */
 
+	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
+		struct virtual_engine *ve =
+			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+		struct i915_request *rq = READ_ONCE(ve->request);
+
+		if (!rq) { /* lazily cleanup after another engine handled rq */
+			rb_erase_cached(rb, &execlists->virtual);
+			RB_CLEAR_NODE(rb);
+			rb = rb_first_cached(&execlists->virtual);
+			continue;
+		}
+
+		if (!virtual_matches(ve, rq, engine)) {
+			rb = rb_next(rb);
+			continue;
+		}
+
+		break;
+	}
+
 	if (last) {
 		/*
 		 * Don't resubmit or switch until all outstanding
@@ -675,7 +850,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
 			return;
 
-		if (need_preempt(engine, last)) {
+		if (need_preempt(engine, last, rb)) {
 			inject_preempt_context(engine);
 			return;
 		}
@@ -715,6 +890,92 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		last->tail = last->wa_tail;
 	}
 
+	while (rb) { /* XXX virtual is always taking precedence */
+		struct virtual_engine *ve =
+			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+		struct i915_request *rq;
+
+		spin_lock(&ve->base.timeline.lock);
+
+		rq = ve->request;
+		if (unlikely(!rq)) { /* lost the race to a sibling */
+			spin_unlock(&ve->base.timeline.lock);
+			rb_erase_cached(rb, &execlists->virtual);
+			RB_CLEAR_NODE(rb);
+			rb = rb_first_cached(&execlists->virtual);
+			continue;
+		}
+
+		GEM_BUG_ON(rq != ve->request);
+		GEM_BUG_ON(rq->engine != &ve->base);
+		GEM_BUG_ON(rq->hw_context != &ve->context);
+
+		if (rq_prio(rq) >= queue_prio(execlists)) {
+			if (!virtual_matches(ve, rq, engine)) {
+				spin_unlock(&ve->base.timeline.lock);
+				rb = rb_next(rb);
+				continue;
+			}
+
+			if (last && !can_merge_rq(last, rq)) {
+				spin_unlock(&ve->base.timeline.lock);
+				return; /* leave this rq for another engine */
+			}
+
+			GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
+				  engine->name,
+				  rq->fence.context,
+				  rq->fence.seqno,
+				  i915_request_completed(rq) ? "!" :
+				  i915_request_started(rq) ? "*" :
+				  "",
+				  yesno(engine != ve->siblings[0]));
+
+			ve->request = NULL;
+			ve->base.execlists.queue_priority_hint = INT_MIN;
+			rb_erase_cached(rb, &execlists->virtual);
+			RB_CLEAR_NODE(rb);
+
+			rq->engine = engine;
+
+			if (engine != ve->siblings[0]) {
+				u32 *regs = ve->context.lrc_reg_state;
+				unsigned int n;
+
+				GEM_BUG_ON(READ_ONCE(ve->context.active));
+				virtual_update_register_offsets(regs, engine);
+
+				if (!list_empty(&ve->context.signals))
+					virtual_xfer_breadcrumbs(ve, engine);
+
+				/*
+				 * Move the bound engine to the top of the list
+				 * for future execution. We then kick this
+				 * tasklet first before checking others, so that
+				 * we preferentially reuse this set of bound
+				 * registers.
+				 */
+				for (n = 1; n < ve->num_siblings; n++) {
+					if (ve->siblings[n] == engine) {
+						swap(ve->siblings[n],
+						     ve->siblings[0]);
+						break;
+					}
+				}
+
+				GEM_BUG_ON(ve->siblings[0] != engine);
+			}
+
+			__i915_request_submit(rq);
+			trace_i915_request_in(rq, port_index(port, execlists));
+			submit = true;
+			last = rq;
+		}
+
+		spin_unlock(&ve->base.timeline.lock);
+		break;
+	}
+
 	while ((rb = rb_first_cached(&execlists->queue))) {
 		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
@@ -1838,6 +2099,25 @@ static void reset_csb_pointers(struct intel_engine_execlists *execlists)
 			       &execlists->csb_status[reset_value]);
 }
 
+static struct i915_request *active_request(struct i915_request *rq)
+{
+	const struct list_head * const list = &rq->engine->timeline.requests;
+	const struct intel_context * const context = rq->hw_context;
+	struct i915_request *active = NULL;
+
+	list_for_each_entry_from_reverse(rq, list, link) {
+		if (i915_request_completed(rq))
+			break;
+
+		if (rq->hw_context != context)
+			break;
+
+		active = rq;
+	}
+
+	return active;
+}
+
 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1858,7 +2138,8 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	if (!port_isset(execlists->port))
 		goto out_clear;
 
-	ce = port_request(execlists->port)->hw_context;
+	rq = port_request(execlists->port);
+	ce = rq->hw_context;
 
 	/*
 	 * Catch up with any missed context-switch interrupts.
@@ -1871,16 +2152,10 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	 */
 	execlists_cancel_port_requests(execlists);
 
-	/* Push back any incomplete requests for replay after the reset. */
-	rq = __unwind_incomplete_requests(engine);
+	rq = active_request(rq);
 	if (!rq)
 		goto out_replay;
 
-	if (rq->hw_context != ce) { /* caught just before a CS event */
-		rq = NULL;
-		goto out_replay;
-	}
-
 	/*
 	 * If this request hasn't started yet, e.g. it is waiting on a
 	 * semaphore, we need to avoid skipping the request or else we
@@ -1927,13 +2202,16 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	}
 	execlists_init_reg_state(regs, ce, engine, ce->ring);
 
-	/* Rerun the request; its payload has been neutered (if guilty). */
 out_replay:
+	/* Rerun the request; its payload has been neutered (if guilty). */
 	ce->ring->head =
 		rq ? intel_ring_wrap(ce->ring, rq->head) : ce->ring->tail;
 	intel_ring_update_space(ce->ring);
 	__execlists_update_reg_state(ce, engine);
 
+	/* Push back any incomplete requests for replay after the reset. */
+	__unwind_incomplete_requests(engine);
+
 out_clear:
 	execlists_clear_all_active(execlists);
 }
@@ -2007,6 +2285,26 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 		i915_priolist_free(p);
 	}
 
+	/* Cancel all attached virtual engines */
+	while ((rb = rb_first_cached(&execlists->virtual))) {
+		struct virtual_engine *ve =
+			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+
+		rb_erase_cached(rb, &execlists->virtual);
+		RB_CLEAR_NODE(rb);
+
+		spin_lock(&ve->base.timeline.lock);
+		if (ve->request) {
+			ve->request->engine = engine;
+			__i915_request_submit(ve->request);
+			dma_fence_set_error(&ve->request->fence, -EIO);
+			i915_request_mark_complete(ve->request);
+			ve->base.execlists.queue_priority_hint = INT_MIN;
+			ve->request = NULL;
+		}
+		spin_unlock(&ve->base.timeline.lock);
+	}
+
 	/* Remaining _unready_ requests will be nop'ed when submitted */
 
 	execlists->queue_priority_hint = INT_MIN;
@@ -2491,12 +2789,15 @@ static void execlists_init_reg_state(u32 *regs,
 	bool rcs = engine->class == RENDER_CLASS;
 	u32 base = engine->mmio_base;
 
-	/* A context is actually a big batch buffer with several
+	/*
+	 * A context is actually a big batch buffer with several
 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
 	 * values we are setting here are only for the first context restore:
 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
 	 * we are not initializing here).
+	 *
+	 * Must keep consistent with virtual_update_register_offsets().
 	 */
 	regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
 				 MI_LRI_FORCE_POSTED;
@@ -2715,6 +3016,320 @@ error_deref_obj:
 	return ret;
 }
 
+static void virtual_context_destroy(struct kref *kref)
+{
+	struct virtual_engine *ve =
+		container_of(kref, typeof(*ve), context.ref);
+	unsigned int n;
+
+	GEM_BUG_ON(ve->request);
+	GEM_BUG_ON(ve->context.active);
+
+	for (n = 0; n < ve->num_siblings; n++) {
+		struct intel_engine_cs *sibling = ve->siblings[n];
+		struct rb_node *node = &ve->nodes[sibling->id].rb;
+
+		if (RB_EMPTY_NODE(node))
+			continue;
+
+		spin_lock_irq(&sibling->timeline.lock);
+
+		/* Detachment is lazily performed in the execlists tasklet */
+		if (!RB_EMPTY_NODE(node))
+			rb_erase_cached(node, &sibling->execlists.virtual);
+
+		spin_unlock_irq(&sibling->timeline.lock);
+	}
+	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
+
+	if (ve->context.state)
+		__execlists_context_fini(&ve->context);
+
+	i915_timeline_fini(&ve->base.timeline);
+	kfree(ve);
+}
+
+static void virtual_engine_initial_hint(struct virtual_engine *ve)
+{
+	int swp;
+
+	/*
+	 * Pick a random sibling on starting to help spread the load around.
+	 *
+	 * New contexts are typically created with exactly the same order
+	 * of siblings, and often started in batches. Due to the way we iterate
+	 * the array of sibling when submitting requests, sibling[0] is
+	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
+	 * randomised across the system, we also help spread the load by the
+	 * first engine we inspect being different each time.
+	 *
+	 * NB This does not force us to execute on this engine, it will just
+	 * typically be the first we inspect for submission.
+	 */
+	swp = prandom_u32_max(ve->num_siblings);
+	if (!swp)
+		return;
+
+	swap(ve->siblings[swp], ve->siblings[0]);
+	virtual_update_register_offsets(ve->context.lrc_reg_state,
+					ve->siblings[0]);
+}
+
+static int virtual_context_pin(struct intel_context *ce)
+{
+	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
+	int err;
+
+	/* Note: we must use a real engine class for setting up reg state */
+	err = __execlists_context_pin(ce, ve->siblings[0]);
+	if (err)
+		return err;
+
+	virtual_engine_initial_hint(ve);
+	return 0;
+}
+
+static void virtual_context_enter(struct intel_context *ce)
+{
+	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
+	unsigned int n;
+
+	for (n = 0; n < ve->num_siblings; n++)
+		intel_engine_pm_get(ve->siblings[n]);
+}
+
+static void virtual_context_exit(struct intel_context *ce)
+{
+	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
+	unsigned int n;
+
+	ce->saturated = 0;
+	for (n = 0; n < ve->num_siblings; n++)
+		intel_engine_pm_put(ve->siblings[n]);
+}
+
+static const struct intel_context_ops virtual_context_ops = {
+	.pin = virtual_context_pin,
+	.unpin = execlists_context_unpin,
+
+	.enter = virtual_context_enter,
+	.exit = virtual_context_exit,
+
+	.destroy = virtual_context_destroy,
+};
+
+static void virtual_submission_tasklet(unsigned long data)
+{
+	struct virtual_engine * const ve = (struct virtual_engine *)data;
+	const int prio = ve->base.execlists.queue_priority_hint;
+	unsigned int n;
+
+	local_irq_disable();
+	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
+		struct intel_engine_cs *sibling = ve->siblings[n];
+		struct ve_node * const node = &ve->nodes[sibling->id];
+		struct rb_node **parent, *rb;
+		bool first;
+
+		spin_lock(&sibling->timeline.lock);
+
+		if (!RB_EMPTY_NODE(&node->rb)) {
+			/*
+			 * Cheat and avoid rebalancing the tree if we can
+			 * reuse this node in situ.
+			 */
+			first = rb_first_cached(&sibling->execlists.virtual) ==
+				&node->rb;
+			if (prio == node->prio || (prio > node->prio && first))
+				goto submit_engine;
+
+			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
+		}
+
+		rb = NULL;
+		first = true;
+		parent = &sibling->execlists.virtual.rb_root.rb_node;
+		while (*parent) {
+			struct ve_node *other;
+
+			rb = *parent;
+			other = rb_entry(rb, typeof(*other), rb);
+			if (prio > other->prio) {
+				parent = &rb->rb_left;
+			} else {
+				parent = &rb->rb_right;
+				first = false;
+			}
+		}
+
+		rb_link_node(&node->rb, rb, parent);
+		rb_insert_color_cached(&node->rb,
+				       &sibling->execlists.virtual,
+				       first);
+
+submit_engine:
+		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
+		node->prio = prio;
+		if (first && prio > sibling->execlists.queue_priority_hint) {
+			sibling->execlists.queue_priority_hint = prio;
+			tasklet_hi_schedule(&sibling->execlists.tasklet);
+		}
+
+		spin_unlock(&sibling->timeline.lock);
+	}
+	local_irq_enable();
+}
+
+static void virtual_submit_request(struct i915_request *rq)
+{
+	struct virtual_engine *ve = to_virtual_engine(rq->engine);
+
+	GEM_TRACE("%s: rq=%llx:%lld\n",
+		  ve->base.name,
+		  rq->fence.context,
+		  rq->fence.seqno);
+
+	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
+
+	GEM_BUG_ON(ve->request);
+	ve->base.execlists.queue_priority_hint = rq_prio(rq);
+	WRITE_ONCE(ve->request, rq);
+
+	tasklet_schedule(&ve->base.execlists.tasklet);
+}
+
+struct intel_context *
+intel_execlists_create_virtual(struct i915_gem_context *ctx,
+			       struct intel_engine_cs **siblings,
+			       unsigned int count)
+{
+	struct virtual_engine *ve;
+	unsigned int n;
+	int err;
+
+	if (count == 0)
+		return ERR_PTR(-EINVAL);
+
+	if (count == 1)
+		return intel_context_create(ctx, siblings[0]);
+
+	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
+	if (!ve)
+		return ERR_PTR(-ENOMEM);
+
+	ve->base.i915 = ctx->i915;
+	ve->base.id = -1;
+	ve->base.class = OTHER_CLASS;
+	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
+	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
+	ve->base.flags = I915_ENGINE_IS_VIRTUAL;
+
+	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
+
+	err = i915_timeline_init(ctx->i915, &ve->base.timeline, NULL);
+	if (err)
+		goto err_put;
+	i915_timeline_set_subclass(&ve->base.timeline, TIMELINE_VIRTUAL);
+
+	intel_engine_init_execlists(&ve->base);
+
+	ve->base.cops = &virtual_context_ops;
+	ve->base.request_alloc = execlists_request_alloc;
+
+	ve->base.schedule = i915_schedule;
+	ve->base.submit_request = virtual_submit_request;
+
+	ve->base.execlists.queue_priority_hint = INT_MIN;
+	tasklet_init(&ve->base.execlists.tasklet,
+		     virtual_submission_tasklet,
+		     (unsigned long)ve);
+
+	intel_context_init(&ve->context, ctx, &ve->base);
+
+	for (n = 0; n < count; n++) {
+		struct intel_engine_cs *sibling = siblings[n];
+
+		GEM_BUG_ON(!is_power_of_2(sibling->mask));
+		if (sibling->mask & ve->base.mask) {
+			DRM_DEBUG("duplicate %s entry in load balancer\n",
+				  sibling->name);
+			err = -EINVAL;
+			goto err_put;
+		}
+
+		/*
+		 * The virtual engine implementation is tightly coupled to
+		 * the execlists backend -- we push out request directly
+		 * into a tree inside each physical engine. We could support
+		 * layering if we handle cloning of the requests and
+		 * submitting a copy into each backend.
+		 */
+		if (sibling->execlists.tasklet.func !=
+		    execlists_submission_tasklet) {
+			err = -ENODEV;
+			goto err_put;
+		}
+
+		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
+		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
+
+		ve->siblings[ve->num_siblings++] = sibling;
+		ve->base.mask |= sibling->mask;
+
+		/*
+		 * All physical engines must be compatible for their emission
+		 * functions (as we build the instructions during request
+		 * construction and do not alter them before submission
+		 * on the physical engine). We use the engine class as a guide
+		 * here, although that could be refined.
+		 */
+		if (ve->base.class != OTHER_CLASS) {
+			if (ve->base.class != sibling->class) {
+				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
+					  sibling->class, ve->base.class);
+				err = -EINVAL;
+				goto err_put;
+			}
+			continue;
+		}
+
+		ve->base.class = sibling->class;
+		ve->base.uabi_class = sibling->uabi_class;
+		snprintf(ve->base.name, sizeof(ve->base.name),
+			 "v%dx%d", ve->base.class, count);
+		ve->base.context_size = sibling->context_size;
+
+		ve->base.emit_bb_start = sibling->emit_bb_start;
+		ve->base.emit_flush = sibling->emit_flush;
+		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
+		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
+		ve->base.emit_fini_breadcrumb_dw =
+			sibling->emit_fini_breadcrumb_dw;
+	}
+
+	return &ve->context;
+
+err_put:
+	intel_context_put(&ve->context);
+	return ERR_PTR(err);
+}
+
+struct intel_context *
+intel_execlists_clone_virtual(struct i915_gem_context *ctx,
+			      struct intel_engine_cs *src)
+{
+	struct virtual_engine *se = to_virtual_engine(src);
+	struct intel_context *dst;
+
+	dst = intel_execlists_create_virtual(ctx,
+					     se->siblings,
+					     se->num_siblings);
+	if (IS_ERR(dst))
+		return dst;
+
+	return dst;
+}
+
 void intel_execlists_show_requests(struct intel_engine_cs *engine,
 				   struct drm_printer *m,
 				   void (*show_request)(struct drm_printer *m,
@@ -2772,6 +3387,29 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
 		show_request(m, last, "\t\tQ ");
 	}
 
+	last = NULL;
+	count = 0;
+	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
+		struct virtual_engine *ve =
+			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+		struct i915_request *rq = READ_ONCE(ve->request);
+
+		if (rq) {
+			if (count++ < max - 1)
+				show_request(m, rq, "\t\tV ");
+			else
+				last = rq;
+		}
+	}
+	if (last) {
+		if (count > max) {
+			drm_printf(m,
+				   "\t\t...skipping %d virtual requests...\n",
+				   count - max);
+		}
+		show_request(m, last, "\t\tV ");
+	}
+
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.h b/drivers/gpu/drm/i915/gt/intel_lrc.h
index a0dc907a7249..5530606052e5 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.h
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.h
@@ -114,4 +114,13 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
 							const char *prefix),
 				   unsigned int max);
 
+struct intel_context *
+intel_execlists_create_virtual(struct i915_gem_context *ctx,
+			       struct intel_engine_cs **siblings,
+			       unsigned int count);
+
+struct intel_context *
+intel_execlists_clone_virtual(struct i915_gem_context *ctx,
+			      struct intel_engine_cs *src);
+
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 5b3d8e33f1cf..f880271fb9ba 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -1310,6 +1310,185 @@ err_unlock:
 	return err;
 }
 
+static int nop_virtual_engine(struct drm_i915_private *i915,
+			      struct intel_engine_cs **siblings,
+			      unsigned int nsibling,
+			      unsigned int nctx,
+			      unsigned int flags)
+#define CHAIN BIT(0)
+{
+	IGT_TIMEOUT(end_time);
+	struct i915_request *request[16];
+	struct i915_gem_context *ctx[16];
+	struct intel_context *ve[16];
+	unsigned long n, prime, nc;
+	struct igt_live_test t;
+	ktime_t times[2] = {};
+	int err;
+
+	GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ctx));
+
+	for (n = 0; n < nctx; n++) {
+		ctx[n] = kernel_context(i915);
+		if (!ctx[n]) {
+			err = -ENOMEM;
+			nctx = n;
+			goto out;
+		}
+
+		ve[n] = intel_execlists_create_virtual(ctx[n],
+						       siblings, nsibling);
+		if (IS_ERR(ve[n])) {
+			kernel_context_close(ctx[n]);
+			err = PTR_ERR(ve[n]);
+			nctx = n;
+			goto out;
+		}
+
+		err = intel_context_pin(ve[n]);
+		if (err) {
+			intel_context_put(ve[n]);
+			kernel_context_close(ctx[n]);
+			nctx = n;
+			goto out;
+		}
+	}
+
+	err = igt_live_test_begin(&t, i915, __func__, ve[0]->engine->name);
+	if (err)
+		goto out;
+
+	for_each_prime_number_from(prime, 1, 8192) {
+		times[1] = ktime_get_raw();
+
+		if (flags & CHAIN) {
+			for (nc = 0; nc < nctx; nc++) {
+				for (n = 0; n < prime; n++) {
+					request[nc] =
+						i915_request_create(ve[nc]);
+					if (IS_ERR(request[nc])) {
+						err = PTR_ERR(request[nc]);
+						goto out;
+					}
+
+					i915_request_add(request[nc]);
+				}
+			}
+		} else {
+			for (n = 0; n < prime; n++) {
+				for (nc = 0; nc < nctx; nc++) {
+					request[nc] =
+						i915_request_create(ve[nc]);
+					if (IS_ERR(request[nc])) {
+						err = PTR_ERR(request[nc]);
+						goto out;
+					}
+
+					i915_request_add(request[nc]);
+				}
+			}
+		}
+
+		for (nc = 0; nc < nctx; nc++) {
+			if (i915_request_wait(request[nc],
+					      I915_WAIT_LOCKED,
+					      HZ / 10) < 0) {
+				pr_err("%s(%s): wait for %llx:%lld timed out\n",
+				       __func__, ve[0]->engine->name,
+				       request[nc]->fence.context,
+				       request[nc]->fence.seqno);
+
+				GEM_TRACE("%s(%s) failed at request %llx:%lld\n",
+					  __func__, ve[0]->engine->name,
+					  request[nc]->fence.context,
+					  request[nc]->fence.seqno);
+				GEM_TRACE_DUMP();
+				i915_gem_set_wedged(i915);
+				break;
+			}
+		}
+
+		times[1] = ktime_sub(ktime_get_raw(), times[1]);
+		if (prime == 1)
+			times[0] = times[1];
+
+		if (__igt_timeout(end_time, NULL))
+			break;
+	}
+
+	err = igt_live_test_end(&t);
+	if (err)
+		goto out;
+
+	pr_info("Requestx%d latencies on %s: 1 = %lluns, %lu = %lluns\n",
+		nctx, ve[0]->engine->name, ktime_to_ns(times[0]),
+		prime, div64_u64(ktime_to_ns(times[1]), prime));
+
+out:
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+
+	for (nc = 0; nc < nctx; nc++) {
+		intel_context_unpin(ve[nc]);
+		intel_context_put(ve[nc]);
+		kernel_context_close(ctx[nc]);
+	}
+	return err;
+}
+
+static int live_virtual_engine(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned int class, inst;
+	int err = -ENODEV;
+
+	if (USES_GUC_SUBMISSION(i915))
+		return 0;
+
+	mutex_lock(&i915->drm.struct_mutex);
+
+	for_each_engine(engine, i915, id) {
+		err = nop_virtual_engine(i915, &engine, 1, 1, 0);
+		if (err) {
+			pr_err("Failed to wrap engine %s: err=%d\n",
+			       engine->name, err);
+			goto out_unlock;
+		}
+	}
+
+	for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
+		int nsibling, n;
+
+		nsibling = 0;
+		for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
+			if (!i915->engine_class[class][inst])
+				continue;
+
+			siblings[nsibling++] = i915->engine_class[class][inst];
+		}
+		if (nsibling < 2)
+			continue;
+
+		for (n = 1; n <= nsibling + 1; n++) {
+			err = nop_virtual_engine(i915, siblings, nsibling,
+						 n, 0);
+			if (err)
+				goto out_unlock;
+		}
+
+		err = nop_virtual_engine(i915, siblings, nsibling, n, CHAIN);
+		if (err)
+			goto out_unlock;
+	}
+
+out_unlock:
+	mutex_unlock(&i915->drm.struct_mutex);
+	return err;
+}
+
 int intel_execlists_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
@@ -1322,6 +1501,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_chain_preempt),
 		SUBTEST(live_preempt_hang),
 		SUBTEST(live_preempt_smoke),
+		SUBTEST(live_virtual_engine),
 	};
 
 	if (!HAS_EXECLISTS(i915))
diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
index 67f8a4a807a0..fe82d3571072 100644
--- a/drivers/gpu/drm/i915/i915_gem.h
+++ b/drivers/gpu/drm/i915/i915_gem.h
@@ -91,4 +91,9 @@ static inline bool __tasklet_enable(struct tasklet_struct *t)
 	return atomic_dec_and_test(&t->count);
 }
 
+static inline bool __tasklet_is_scheduled(struct tasklet_struct *t)
+{
+	return test_bit(TASKLET_STATE_SCHED, &t->state);
+}
+
 #endif /* __I915_GEM_H__ */
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 24736fcd463d..68db6b28e606 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -86,6 +86,7 @@
  */
 
 #include <linux/log2.h>
+#include <linux/nospec.h>
 
 #include <drm/i915_drm.h>
 
@@ -1218,7 +1219,6 @@ __intel_context_reconfigure_sseu(struct intel_context *ce,
 	int ret;
 
 	GEM_BUG_ON(INTEL_GEN(ce->gem_context->i915) < 8);
-	GEM_BUG_ON(ce->engine->id != RCS0);
 
 	ret = intel_context_lock_pinned(ce);
 	if (ret)
@@ -1412,7 +1412,100 @@ struct set_engines {
 	struct i915_gem_engines *engines;
 };
 
+static int
+set_engines__load_balance(struct i915_user_extension __user *base, void *data)
+{
+	struct i915_context_engines_load_balance __user *ext =
+		container_of_user(base, typeof(*ext), base);
+	const struct set_engines *set = data;
+	struct intel_engine_cs *stack[16];
+	struct intel_engine_cs **siblings;
+	struct intel_context *ce;
+	u16 num_siblings, idx;
+	unsigned int n;
+	int err;
+
+	if (!HAS_EXECLISTS(set->ctx->i915))
+		return -ENODEV;
+
+	if (USES_GUC_SUBMISSION(set->ctx->i915))
+		return -ENODEV; /* not implement yet */
+
+	if (get_user(idx, &ext->engine_index))
+		return -EFAULT;
+
+	if (idx >= set->engines->num_engines) {
+		DRM_DEBUG("Invalid placement value, %d >= %d\n",
+			  idx, set->engines->num_engines);
+		return -EINVAL;
+	}
+
+	idx = array_index_nospec(idx, set->engines->num_engines);
+	if (set->engines->engines[idx]) {
+		DRM_DEBUG("Invalid placement[%d], already occupied\n", idx);
+		return -EEXIST;
+	}
+
+	if (get_user(num_siblings, &ext->num_siblings))
+		return -EFAULT;
+
+	err = check_user_mbz(&ext->flags);
+	if (err)
+		return err;
+
+	err = check_user_mbz(&ext->mbz64);
+	if (err)
+		return err;
+
+	siblings = stack;
+	if (num_siblings > ARRAY_SIZE(stack)) {
+		siblings = kmalloc_array(num_siblings,
+					 sizeof(*siblings),
+					 GFP_KERNEL);
+		if (!siblings)
+			return -ENOMEM;
+	}
+
+	for (n = 0; n < num_siblings; n++) {
+		struct i915_engine_class_instance ci;
+
+		if (copy_from_user(&ci, &ext->engines[n], sizeof(ci))) {
+			err = -EFAULT;
+			goto out_siblings;
+		}
+
+		siblings[n] = intel_engine_lookup_user(set->ctx->i915,
+						       ci.engine_class,
+						       ci.engine_instance);
+		if (!siblings[n]) {
+			DRM_DEBUG("Invalid sibling[%d]: { class:%d, inst:%d }\n",
+				  n, ci.engine_class, ci.engine_instance);
+			err = -EINVAL;
+			goto out_siblings;
+		}
+	}
+
+	ce = intel_execlists_create_virtual(set->ctx, siblings, n);
+	if (IS_ERR(ce)) {
+		err = PTR_ERR(ce);
+		goto out_siblings;
+	}
+
+	if (cmpxchg(&set->engines->engines[idx], NULL, ce)) {
+		intel_context_put(ce);
+		err = -EEXIST;
+		goto out_siblings;
+	}
+
+out_siblings:
+	if (siblings != stack)
+		kfree(siblings);
+
+	return err;
+}
+
 static const i915_user_extension_fn set_engines__extensions[] = {
+	[I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance,
 };
 
 static int
@@ -1737,14 +1830,29 @@ static int clone_engines(struct i915_gem_context *dst,
 
 	clone->i915 = dst->i915;
 	for (n = 0; n < e->num_engines; n++) {
+		struct intel_engine_cs *engine;
+
 		if (!e->engines[n]) {
 			clone->engines[n] = NULL;
 			continue;
 		}
+		engine = e->engines[n]->engine;
 
-		clone->engines[n] =
-			intel_context_create(dst, e->engines[n]->engine);
-		if (!clone->engines[n]) {
+		/*
+		 * Virtual engines are singletons; they can only exist
+		 * inside a single context, because they embed their
+		 * HW context... As each virtual context implies a single
+		 * timeline (each engine can only dequeue a single request
+		 * at any time), it would be surprising for two contexts
+		 * to use the same engine. So let's create a copy of
+		 * the virtual engine instead.
+		 */
+		if (intel_engine_is_virtual(engine))
+			clone->engines[n] =
+				intel_execlists_clone_virtual(dst, engine);
+		else
+			clone->engines[n] = intel_context_create(dst, engine);
+		if (IS_ERR_OR_NULL(clone->engines[n])) {
 			__free_engines(clone, n);
 			goto err_unlock;
 		}
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index d215dcdf0b1e..78ceb56d7801 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -150,17 +150,26 @@ sched_lock_engine(const struct i915_sched_node *node,
 		  struct intel_engine_cs *locked,
 		  struct sched_cache *cache)
 {
-	struct intel_engine_cs *engine = node_to_request(node)->engine;
+	const struct i915_request *rq = node_to_request(node);
+	struct intel_engine_cs *engine;
 
 	GEM_BUG_ON(!locked);
 
-	if (engine != locked) {
+	/*
+	 * Virtual engines complicate acquiring the engine timeline lock,
+	 * as their rq->engine pointer is not stable until under that
+	 * engine lock. The simple ploy we use is to take the lock then
+	 * check that the rq still belongs to the newly locked engine.
+	 */
+	while (locked != (engine = READ_ONCE(rq->engine))) {
 		spin_unlock(&locked->timeline.lock);
 		memset(cache, 0, sizeof(*cache));
 		spin_lock(&engine->timeline.lock);
+		locked = engine;
 	}
 
-	return engine;
+	GEM_BUG_ON(locked != engine);
+	return locked;
 }
 
 static inline int rq_prio(const struct i915_request *rq)
@@ -272,6 +281,7 @@ static void __i915_schedule(struct i915_sched_node *node,
 	spin_lock(&engine->timeline.lock);
 
 	/* Fifo and depth-first replacement ensure our deps execute before us */
+	engine = sched_lock_engine(node, engine, &cache);
 	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
 		INIT_LIST_HEAD(&dep->dfs_link);
 
@@ -283,8 +293,11 @@ static void __i915_schedule(struct i915_sched_node *node,
 		if (prio <= node->attr.priority || node_signaled(node))
 			continue;
 
+		GEM_BUG_ON(node_to_request(node)->engine != engine);
+
 		node->attr.priority = prio;
 		if (!list_empty(&node->link)) {
+			GEM_BUG_ON(intel_engine_is_virtual(engine));
 			if (!cache.priolist)
 				cache.priolist =
 					i915_sched_lookup_priolist(engine,
diff --git a/drivers/gpu/drm/i915/i915_timeline_types.h b/drivers/gpu/drm/i915/i915_timeline_types.h
index 5256a0b5c5f7..1688705f4a2b 100644
--- a/drivers/gpu/drm/i915/i915_timeline_types.h
+++ b/drivers/gpu/drm/i915/i915_timeline_types.h
@@ -26,6 +26,7 @@ struct i915_timeline {
 	spinlock_t lock;
 #define TIMELINE_CLIENT 0 /* default subclass */
 #define TIMELINE_ENGINE 1
+#define TIMELINE_VIRTUAL 2
 	struct mutex mutex; /* protects the flow of requests */
 
 	unsigned int pin_count;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 62396d575e28..f9770948161c 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -137,6 +137,7 @@ struct i915_engine_class_instance {
 	__u16 engine_class; /* see enum drm_i915_gem_engine_class */
 	__u16 engine_instance;
 #define I915_ENGINE_CLASS_INVALID_NONE -1
+#define I915_ENGINE_CLASS_INVALID_VIRTUAL -2
 };
 
 /**
@@ -1608,8 +1609,46 @@ struct drm_i915_gem_context_param_sseu {
 	__u32 rsvd;
 };
 
+/*
+ * i915_context_engines_load_balance:
+ *
+ * Enable load balancing across this set of engines.
+ *
+ * Into the I915_EXEC_DEFAULT slot [0], a virtual engine is created that when
+ * used will proxy the execbuffer request onto one of the set of engines
+ * in such a way as to distribute the load evenly across the set.
+ *
+ * The set of engines must be compatible (e.g. the same HW class) as they
+ * will share the same logical GPU context and ring.
+ *
+ * To intermix rendering with the virtual engine and direct rendering onto
+ * the backing engines (bypassing the load balancing proxy), the context must
+ * be defined to use a single timeline for all engines.
+ */
+struct i915_context_engines_load_balance {
+	struct i915_user_extension base;
+
+	__u16 engine_index;
+	__u16 num_siblings;
+	__u32 flags; /* all undefined flags must be zero */
+
+	__u64 mbz64; /* reserved for future use; must be zero */
+
+	struct i915_engine_class_instance engines[0];
+} __attribute__((packed));
+
+#define I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(name__, N__) struct { \
+	struct i915_user_extension base; \
+	__u16 engine_index; \
+	__u16 num_siblings; \
+	__u32 flags; \
+	__u64 mbz64; \
+	struct i915_engine_class_instance engines[N__]; \
+} __attribute__((packed)) name__
+
 struct i915_context_param_engines {
 	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
+#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */
 	struct i915_engine_class_instance engines[0];
 } __attribute__((packed));
 
-- 
cgit v1.2.3


From ee1136908e9b28173f9794be25465a13b2bb9b18 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 21 May 2019 22:11:33 +0100
Subject: drm/i915/execlists: Virtual engine bonding

Some users require that when a master batch is executed on one particular
engine, a companion batch is run simultaneously on a specific slave
engine. For this purpose, we introduce virtual engine bonding, allowing
maps of master:slaves to be constructed to constrain which physical
engines a virtual engine may select given a fence on a master engine.

For the moment, we continue to ignore the issue of preemption deferring
the master request for later. Ideally, we would like to then also remove
the slave and run something else rather than have it stall the pipeline.
With load balancing, we should be able to move workload around it, but
there is a similar stall on the master pipeline while it may wait for
the slave to be executed. At the cost of more latency for the bonded
request, it may be interesting to launch both on their engines in
lockstep. (Bubbles abound.)

Opens: Also what about bonding an engine as its own master? It doesn't
break anything internally, so allow the silliness.

v2: Emancipate the bonds
v3: Couple in delayed scheduling for the selftests
v4: Handle invalid mutually exclusive bonding
v5: Mention what the uapi does
v6: s/nbond/num_bonds/

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190521211134.16117-9-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |   7 +
 drivers/gpu/drm/i915/gt/intel_lrc.c           |  98 +++++++++++++
 drivers/gpu/drm/i915/gt/intel_lrc.h           |   4 +
 drivers/gpu/drm/i915/gt/selftest_lrc.c        | 191 ++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_context.c       |  84 +++++++++++
 drivers/gpu/drm/i915/selftests/lib_sw_fence.c |   3 +
 include/uapi/drm/i915_drm.h                   |  44 ++++++
 7 files changed, 431 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 7b47e00fa082..f3fc2e8acc90 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -405,6 +405,13 @@ struct intel_engine_cs {
 	 */
 	void		(*submit_request)(struct i915_request *rq);
 
+	/*
+	 * Called on signaling of a SUBMIT_FENCE, passing along the signaling
+	 * request down to the bonded pairs.
+	 */
+	void            (*bond_execute)(struct i915_request *rq,
+					struct dma_fence *signal);
+
 	/*
 	 * Call when the priority on a request has changed and it and its
 	 * dependencies may need rescheduling. Note the request itself may
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 0b4d29d4816b..1f7bee0cae0c 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -189,6 +189,18 @@ struct virtual_engine {
 		int prio;
 	} nodes[I915_NUM_ENGINES];
 
+	/*
+	 * Keep track of bonded pairs -- restrictions upon on our selection
+	 * of physical engines any particular request may be submitted to.
+	 * If we receive a submit-fence from a master engine, we will only
+	 * use one of sibling_mask physical engines.
+	 */
+	struct ve_bond {
+		const struct intel_engine_cs *master;
+		intel_engine_mask_t sibling_mask;
+	} *bonds;
+	unsigned int num_bonds;
+
 	/* And finally, which physical engines this virtual engine maps onto. */
 	unsigned int num_siblings;
 	struct intel_engine_cs *siblings[0];
@@ -960,6 +972,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			rb_erase_cached(rb, &execlists->virtual);
 			RB_CLEAR_NODE(rb);
 
+			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
 			rq->engine = engine;
 
 			if (engine != ve->siblings[0]) {
@@ -3069,6 +3082,8 @@ static void virtual_context_destroy(struct kref *kref)
 	if (ve->context.state)
 		__execlists_context_fini(&ve->context);
 
+	kfree(ve->bonds);
+
 	i915_timeline_fini(&ve->base.timeline);
 	kfree(ve);
 }
@@ -3265,6 +3280,38 @@ static void virtual_submit_request(struct i915_request *rq)
 	tasklet_schedule(&ve->base.execlists.tasklet);
 }
 
+static struct ve_bond *
+virtual_find_bond(struct virtual_engine *ve,
+		  const struct intel_engine_cs *master)
+{
+	int i;
+
+	for (i = 0; i < ve->num_bonds; i++) {
+		if (ve->bonds[i].master == master)
+			return &ve->bonds[i];
+	}
+
+	return NULL;
+}
+
+static void
+virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
+{
+	struct virtual_engine *ve = to_virtual_engine(rq->engine);
+	struct ve_bond *bond;
+
+	bond = virtual_find_bond(ve, to_request(signal)->engine);
+	if (bond) {
+		intel_engine_mask_t old, new, cmp;
+
+		cmp = READ_ONCE(rq->execution_mask);
+		do {
+			old = cmp;
+			new = cmp & bond->sibling_mask;
+		} while ((cmp = cmpxchg(&rq->execution_mask, old, new)) != old);
+	}
+}
+
 struct intel_context *
 intel_execlists_create_virtual(struct i915_gem_context *ctx,
 			       struct intel_engine_cs **siblings,
@@ -3305,6 +3352,7 @@ intel_execlists_create_virtual(struct i915_gem_context *ctx,
 
 	ve->base.schedule = i915_schedule;
 	ve->base.submit_request = virtual_submit_request;
+	ve->base.bond_execute = virtual_bond_execute;
 
 	ve->base.execlists.queue_priority_hint = INT_MIN;
 	tasklet_init(&ve->base.execlists.tasklet,
@@ -3394,9 +3442,59 @@ intel_execlists_clone_virtual(struct i915_gem_context *ctx,
 	if (IS_ERR(dst))
 		return dst;
 
+	if (se->num_bonds) {
+		struct virtual_engine *de = to_virtual_engine(dst->engine);
+
+		de->bonds = kmemdup(se->bonds,
+				    sizeof(*se->bonds) * se->num_bonds,
+				    GFP_KERNEL);
+		if (!de->bonds) {
+			intel_context_put(dst);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		de->num_bonds = se->num_bonds;
+	}
+
 	return dst;
 }
 
+int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
+				     const struct intel_engine_cs *master,
+				     const struct intel_engine_cs *sibling)
+{
+	struct virtual_engine *ve = to_virtual_engine(engine);
+	struct ve_bond *bond;
+	int n;
+
+	/* Sanity check the sibling is part of the virtual engine */
+	for (n = 0; n < ve->num_siblings; n++)
+		if (sibling == ve->siblings[n])
+			break;
+	if (n == ve->num_siblings)
+		return -EINVAL;
+
+	bond = virtual_find_bond(ve, master);
+	if (bond) {
+		bond->sibling_mask |= sibling->mask;
+		return 0;
+	}
+
+	bond = krealloc(ve->bonds,
+			sizeof(*bond) * (ve->num_bonds + 1),
+			GFP_KERNEL);
+	if (!bond)
+		return -ENOMEM;
+
+	bond[ve->num_bonds].master = master;
+	bond[ve->num_bonds].sibling_mask = sibling->mask;
+
+	ve->bonds = bond;
+	ve->num_bonds++;
+
+	return 0;
+}
+
 void intel_execlists_show_requests(struct intel_engine_cs *engine,
 				   struct drm_printer *m,
 				   void (*show_request)(struct drm_printer *m,
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.h b/drivers/gpu/drm/i915/gt/intel_lrc.h
index 5530606052e5..e029aee87adf 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.h
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.h
@@ -123,4 +123,8 @@ struct intel_context *
 intel_execlists_clone_virtual(struct i915_gem_context *ctx,
 			      struct intel_engine_cs *src);
 
+int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
+				     const struct intel_engine_cs *master,
+				     const struct intel_engine_cs *sibling);
+
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 61637f525690..a8c50900e2d4 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -13,6 +13,7 @@
 #include "selftests/igt_gem_utils.h"
 #include "selftests/igt_live_test.h"
 #include "selftests/igt_spinner.h"
+#include "selftests/lib_sw_fence.h"
 #include "selftests/mock_context.h"
 
 static int live_sanitycheck(void *arg)
@@ -1619,6 +1620,195 @@ out_unlock:
 	return err;
 }
 
+static int bond_virtual_engine(struct drm_i915_private *i915,
+			       unsigned int class,
+			       struct intel_engine_cs **siblings,
+			       unsigned int nsibling,
+			       unsigned int flags)
+#define BOND_SCHEDULE BIT(0)
+{
+	struct intel_engine_cs *master;
+	struct i915_gem_context *ctx;
+	struct i915_request *rq[16];
+	enum intel_engine_id id;
+	unsigned long n;
+	int err;
+
+	GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1);
+
+	ctx = kernel_context(i915);
+	if (!ctx)
+		return -ENOMEM;
+
+	err = 0;
+	rq[0] = ERR_PTR(-ENOMEM);
+	for_each_engine(master, i915, id) {
+		struct i915_sw_fence fence = {};
+
+		if (master->class == class)
+			continue;
+
+		memset_p((void *)rq, ERR_PTR(-EINVAL), ARRAY_SIZE(rq));
+
+		rq[0] = igt_request_alloc(ctx, master);
+		if (IS_ERR(rq[0])) {
+			err = PTR_ERR(rq[0]);
+			goto out;
+		}
+		i915_request_get(rq[0]);
+
+		if (flags & BOND_SCHEDULE) {
+			onstack_fence_init(&fence);
+			err = i915_sw_fence_await_sw_fence_gfp(&rq[0]->submit,
+							       &fence,
+							       GFP_KERNEL);
+		}
+		i915_request_add(rq[0]);
+		if (err < 0)
+			goto out;
+
+		for (n = 0; n < nsibling; n++) {
+			struct intel_context *ve;
+
+			ve = intel_execlists_create_virtual(ctx,
+							    siblings,
+							    nsibling);
+			if (IS_ERR(ve)) {
+				err = PTR_ERR(ve);
+				onstack_fence_fini(&fence);
+				goto out;
+			}
+
+			err = intel_virtual_engine_attach_bond(ve->engine,
+							       master,
+							       siblings[n]);
+			if (err) {
+				intel_context_put(ve);
+				onstack_fence_fini(&fence);
+				goto out;
+			}
+
+			err = intel_context_pin(ve);
+			intel_context_put(ve);
+			if (err) {
+				onstack_fence_fini(&fence);
+				goto out;
+			}
+
+			rq[n + 1] = i915_request_create(ve);
+			intel_context_unpin(ve);
+			if (IS_ERR(rq[n + 1])) {
+				err = PTR_ERR(rq[n + 1]);
+				onstack_fence_fini(&fence);
+				goto out;
+			}
+			i915_request_get(rq[n + 1]);
+
+			err = i915_request_await_execution(rq[n + 1],
+							   &rq[0]->fence,
+							   ve->engine->bond_execute);
+			i915_request_add(rq[n + 1]);
+			if (err < 0) {
+				onstack_fence_fini(&fence);
+				goto out;
+			}
+		}
+		onstack_fence_fini(&fence);
+
+		if (i915_request_wait(rq[0],
+				      I915_WAIT_LOCKED,
+				      HZ / 10) < 0) {
+			pr_err("Master request did not execute (on %s)!\n",
+			       rq[0]->engine->name);
+			err = -EIO;
+			goto out;
+		}
+
+		for (n = 0; n < nsibling; n++) {
+			if (i915_request_wait(rq[n + 1],
+					      I915_WAIT_LOCKED,
+					      MAX_SCHEDULE_TIMEOUT) < 0) {
+				err = -EIO;
+				goto out;
+			}
+
+			if (rq[n + 1]->engine != siblings[n]) {
+				pr_err("Bonded request did not execute on target engine: expected %s, used %s; master was %s\n",
+				       siblings[n]->name,
+				       rq[n + 1]->engine->name,
+				       rq[0]->engine->name);
+				err = -EINVAL;
+				goto out;
+			}
+		}
+
+		for (n = 0; !IS_ERR(rq[n]); n++)
+			i915_request_put(rq[n]);
+		rq[0] = ERR_PTR(-ENOMEM);
+	}
+
+out:
+	for (n = 0; !IS_ERR(rq[n]); n++)
+		i915_request_put(rq[n]);
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+
+	kernel_context_close(ctx);
+	return err;
+}
+
+static int live_virtual_bond(void *arg)
+{
+	static const struct phase {
+		const char *name;
+		unsigned int flags;
+	} phases[] = {
+		{ "", 0 },
+		{ "schedule", BOND_SCHEDULE },
+		{ },
+	};
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
+	unsigned int class, inst;
+	int err = 0;
+
+	if (USES_GUC_SUBMISSION(i915))
+		return 0;
+
+	mutex_lock(&i915->drm.struct_mutex);
+
+	for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
+		const struct phase *p;
+		int nsibling;
+
+		nsibling = 0;
+		for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
+			if (!i915->engine_class[class][inst])
+				break;
+
+			GEM_BUG_ON(nsibling == ARRAY_SIZE(siblings));
+			siblings[nsibling++] = i915->engine_class[class][inst];
+		}
+		if (nsibling < 2)
+			continue;
+
+		for (p = phases; p->name; p++) {
+			err = bond_virtual_engine(i915,
+						  class, siblings, nsibling,
+						  p->flags);
+			if (err) {
+				pr_err("%s(%s): failed class=%d, nsibling=%d, err=%d\n",
+				       __func__, p->name, class, nsibling, err);
+				goto out_unlock;
+			}
+		}
+	}
+
+out_unlock:
+	mutex_unlock(&i915->drm.struct_mutex);
+	return err;
+}
+
 int intel_execlists_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
@@ -1633,6 +1823,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_preempt_smoke),
 		SUBTEST(live_virtual_engine),
 		SUBTEST(live_virtual_mask),
+		SUBTEST(live_virtual_bond),
 	};
 
 	if (!HAS_EXECLISTS(i915))
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 68db6b28e606..5d2f8ba92b59 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1504,8 +1504,92 @@ out_siblings:
 	return err;
 }
 
+static int
+set_engines__bond(struct i915_user_extension __user *base, void *data)
+{
+	struct i915_context_engines_bond __user *ext =
+		container_of_user(base, typeof(*ext), base);
+	const struct set_engines *set = data;
+	struct i915_engine_class_instance ci;
+	struct intel_engine_cs *virtual;
+	struct intel_engine_cs *master;
+	u16 idx, num_bonds;
+	int err, n;
+
+	if (get_user(idx, &ext->virtual_index))
+		return -EFAULT;
+
+	if (idx >= set->engines->num_engines) {
+		DRM_DEBUG("Invalid index for virtual engine: %d >= %d\n",
+			  idx, set->engines->num_engines);
+		return -EINVAL;
+	}
+
+	idx = array_index_nospec(idx, set->engines->num_engines);
+	if (!set->engines->engines[idx]) {
+		DRM_DEBUG("Invalid engine at %d\n", idx);
+		return -EINVAL;
+	}
+	virtual = set->engines->engines[idx]->engine;
+
+	err = check_user_mbz(&ext->flags);
+	if (err)
+		return err;
+
+	for (n = 0; n < ARRAY_SIZE(ext->mbz64); n++) {
+		err = check_user_mbz(&ext->mbz64[n]);
+		if (err)
+			return err;
+	}
+
+	if (copy_from_user(&ci, &ext->master, sizeof(ci)))
+		return -EFAULT;
+
+	master = intel_engine_lookup_user(set->ctx->i915,
+					  ci.engine_class, ci.engine_instance);
+	if (!master) {
+		DRM_DEBUG("Unrecognised master engine: { class:%u, instance:%u }\n",
+			  ci.engine_class, ci.engine_instance);
+		return -EINVAL;
+	}
+
+	if (get_user(num_bonds, &ext->num_bonds))
+		return -EFAULT;
+
+	for (n = 0; n < num_bonds; n++) {
+		struct intel_engine_cs *bond;
+
+		if (copy_from_user(&ci, &ext->engines[n], sizeof(ci)))
+			return -EFAULT;
+
+		bond = intel_engine_lookup_user(set->ctx->i915,
+						ci.engine_class,
+						ci.engine_instance);
+		if (!bond) {
+			DRM_DEBUG("Unrecognised engine[%d] for bonding: { class:%d, instance: %d }\n",
+				  n, ci.engine_class, ci.engine_instance);
+			return -EINVAL;
+		}
+
+		/*
+		 * A non-virtual engine has no siblings to choose between; and
+		 * a submit fence will always be directed to the one engine.
+		 */
+		if (intel_engine_is_virtual(virtual)) {
+			err = intel_virtual_engine_attach_bond(virtual,
+							       master,
+							       bond);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
 static const i915_user_extension_fn set_engines__extensions[] = {
 	[I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance,
+	[I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond,
 };
 
 static int
diff --git a/drivers/gpu/drm/i915/selftests/lib_sw_fence.c b/drivers/gpu/drm/i915/selftests/lib_sw_fence.c
index 2bfa72c1654b..b976c12817c5 100644
--- a/drivers/gpu/drm/i915/selftests/lib_sw_fence.c
+++ b/drivers/gpu/drm/i915/selftests/lib_sw_fence.c
@@ -45,6 +45,9 @@ void __onstack_fence_init(struct i915_sw_fence *fence,
 
 void onstack_fence_fini(struct i915_sw_fence *fence)
 {
+	if (!fence->flags)
+		return;
+
 	i915_sw_fence_commit(fence);
 	i915_sw_fence_fini(fence);
 }
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index f9770948161c..e2da9027bcdf 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1543,6 +1543,10 @@ struct drm_i915_gem_context_param {
  * sized argument, will revert back to default settings.
  *
  * See struct i915_context_param_engines.
+ *
+ * Extensions:
+ *   i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE)
+ *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
  */
 #define I915_CONTEXT_PARAM_ENGINES	0xa
 /* Must be kept compact -- no holes and well documented */
@@ -1646,9 +1650,49 @@ struct i915_context_engines_load_balance {
 	struct i915_engine_class_instance engines[N__]; \
 } __attribute__((packed)) name__
 
+/*
+ * i915_context_engines_bond:
+ *
+ * Constructed bonded pairs for execution within a virtual engine.
+ *
+ * All engines are equal, but some are more equal than others. Given
+ * the distribution of resources in the HW, it may be preferable to run
+ * a request on a given subset of engines in parallel to a request on a
+ * specific engine. We enable this selection of engines within a virtual
+ * engine by specifying bonding pairs, for any given master engine we will
+ * only execute on one of the corresponding siblings within the virtual engine.
+ *
+ * To execute a request in parallel on the master engine and a sibling requires
+ * coordination with a I915_EXEC_FENCE_SUBMIT.
+ */
+struct i915_context_engines_bond {
+	struct i915_user_extension base;
+
+	struct i915_engine_class_instance master;
+
+	__u16 virtual_index; /* index of virtual engine in ctx->engines[] */
+	__u16 num_bonds;
+
+	__u64 flags; /* all undefined flags must be zero */
+	__u64 mbz64[4]; /* reserved for future use; must be zero */
+
+	struct i915_engine_class_instance engines[0];
+} __attribute__((packed));
+
+#define I915_DEFINE_CONTEXT_ENGINES_BOND(name__, N__) struct { \
+	struct i915_user_extension base; \
+	struct i915_engine_class_instance master; \
+	__u16 virtual_index; \
+	__u16 num_bonds; \
+	__u64 flags; \
+	__u64 mbz64[4]; \
+	struct i915_engine_class_instance engines[N__]; \
+} __attribute__((packed)) name__
+
 struct i915_context_param_engines {
 	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
 #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */
+#define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */
 	struct i915_engine_class_instance engines[0];
 } __attribute__((packed));
 
-- 
cgit v1.2.3


From a88b6e4cbafd6f23b3450c087acdbe23d90e7606 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 21 May 2019 22:11:34 +0100
Subject: drm/i915: Allow specification of parallel execbuf

There is a desire to split a task onto two engines and have them run at
the same time, e.g. scanline interleaving to spread the workload evenly.
Through the use of the out-fence from the first execbuf, we can
coordinate secondary execbuf to only become ready simultaneously with
the first, so that with all things idle the second execbufs are executed
in parallel with the first. The key difference here between the new
EXEC_FENCE_SUBMIT and the existing EXEC_FENCE_IN is that the in-fence
waits for the completion of the first request (so that all of its
rendering results are visible to the second execbuf, the more common
userspace fence requirement).

Since we only have a single input fence slot, userspace cannot mix an
in-fence and a submit-fence. It has to use one or the other! This is not
such a harsh requirement, since by virtue of the submit-fence, the
secondary execbuf inherit all of the dependencies from the first
request, and for the application the dependencies should be common
between the primary and secondary execbuf.

Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Testcase: igt/gem_exec_fence/parallel
Link: https://github.com/intel/media-driver/pull/546
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190521211134.16117-10-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_drv.c            |  1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 25 ++++++++++++++++++++++++-
 include/uapi/drm/i915_drm.h                | 17 ++++++++++++++++-
 3 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 5061cb32856b..83d2eb9e74cb 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -443,6 +443,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_CAPTURE:
 	case I915_PARAM_HAS_EXEC_BATCH_FIRST:
 	case I915_PARAM_HAS_EXEC_FENCE_ARRAY:
+	case I915_PARAM_HAS_EXEC_SUBMIT_FENCE:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index d6c5220addd0..7ce25b54c57b 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -2318,6 +2318,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 {
 	struct i915_execbuffer eb;
 	struct dma_fence *in_fence = NULL;
+	struct dma_fence *exec_fence = NULL;
 	struct sync_file *out_fence = NULL;
 	int out_fence_fd = -1;
 	int err;
@@ -2360,11 +2361,24 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 			return -EINVAL;
 	}
 
+	if (args->flags & I915_EXEC_FENCE_SUBMIT) {
+		if (in_fence) {
+			err = -EINVAL;
+			goto err_in_fence;
+		}
+
+		exec_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
+		if (!exec_fence) {
+			err = -EINVAL;
+			goto err_in_fence;
+		}
+	}
+
 	if (args->flags & I915_EXEC_FENCE_OUT) {
 		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
 		if (out_fence_fd < 0) {
 			err = out_fence_fd;
-			goto err_in_fence;
+			goto err_exec_fence;
 		}
 	}
 
@@ -2494,6 +2508,13 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 			goto err_request;
 	}
 
+	if (exec_fence) {
+		err = i915_request_await_execution(eb.request, exec_fence,
+						   eb.engine->bond_execute);
+		if (err < 0)
+			goto err_request;
+	}
+
 	if (fences) {
 		err = await_fence_array(&eb, fences);
 		if (err)
@@ -2555,6 +2576,8 @@ err_destroy:
 err_out_fence:
 	if (out_fence_fd != -1)
 		put_unused_fd(out_fence_fd);
+err_exec_fence:
+	dma_fence_put(exec_fence);
 err_in_fence:
 	dma_fence_put(in_fence);
 	return err;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index e2da9027bcdf..bdb00ec1f8be 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -604,6 +604,12 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_MMAP_GTT_COHERENT	52
 
+/*
+ * Query whether DRM_I915_GEM_EXECBUFFER2 supports coordination of parallel
+ * execution through use of explicit fence support.
+ * See I915_EXEC_FENCE_OUT and I915_EXEC_FENCE_SUBMIT.
+ */
+#define I915_PARAM_HAS_EXEC_SUBMIT_FENCE 53
 /* Must be kept compact -- no holes and well documented */
 
 typedef struct drm_i915_getparam {
@@ -1126,7 +1132,16 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_FENCE_ARRAY   (1<<19)
 
-#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_ARRAY<<1))
+/*
+ * Setting I915_EXEC_FENCE_SUBMIT implies that lower_32_bits(rsvd2) represent
+ * a sync_file fd to wait upon (in a nonblocking manner) prior to executing
+ * the batch.
+ *
+ * Returns -EINVAL if the sync_file fd cannot be found.
+ */
+#define I915_EXEC_FENCE_SUBMIT		(1 << 20)
+
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_SUBMIT << 1))
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
cgit v1.2.3


From 4ef69160b3ed62379ef853c512d3785bf2ea57e6 Mon Sep 17 00:00:00 2001
From: Anson Huang <anson.huang@nxp.com>
Date: Wed, 15 May 2019 01:09:24 +0000
Subject: dt-bindings: clock: imx8mq: Add SNVS clock

Add macro for the SNVS clock of the i.MX8MQ.

Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
Reviewed-by: Leonard Crestez <leonard.crestez@nxp.com>
Signed-off-by: Shawn Guo <shawnguo@kernel.org>
---
 include/dt-bindings/clock/imx8mq-clock.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/imx8mq-clock.h b/include/dt-bindings/clock/imx8mq-clock.h
index 6677e920dc2d..0233bb1e6bf8 100644
--- a/include/dt-bindings/clock/imx8mq-clock.h
+++ b/include/dt-bindings/clock/imx8mq-clock.h
@@ -400,5 +400,7 @@
 #define IMX8MQ_CLK_GPIO4_ROOT			262
 #define IMX8MQ_CLK_GPIO5_ROOT			263
 
-#define IMX8MQ_CLK_END				264
+#define IMX8MQ_CLK_SNVS_ROOT			264
+
+#define IMX8MQ_CLK_END				265
 #endif /* __DT_BINDINGS_CLOCK_IMX8MQ_H */
-- 
cgit v1.2.3


From 81da87f63a1edebcf8cbb811d387e353d9f89c7a Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 21 May 2019 13:08:29 +0200
Subject: drm: Replace drm_gem_vram_push_to_system() with kunmap + unpin

The push-to-system function forces a buffer out of video RAM. This decision
should rather be made by the memory manager. By replacing the function with
calls to the kunmap and unpin functions, the buffer's memory becomes available,
but the buffer remains in VRAM until it's evicted by a pin operation.

This patch replaces the remaining instances of drm_gem_vram_push_to_system()
in ast and mgag200, and removes the function from DRM.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: http://patchwork.freedesktop.org/patch/msgid/20190521110831.20200-2-tzimmermann@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/ast/ast_mode.c           | 14 ++++++---
 drivers/gpu/drm/drm_gem_vram_helper.c    | 50 --------------------------------
 drivers/gpu/drm/drm_vram_helper_common.c |  2 --
 drivers/gpu/drm/mgag200/mgag200_mode.c   | 15 ++++++----
 include/drm/drm_gem_vram_helper.h        |  1 -
 5 files changed, 20 insertions(+), 62 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c
index 9aca9135a5cc..fbafe3994654 100644
--- a/drivers/gpu/drm/ast/ast_mode.c
+++ b/drivers/gpu/drm/ast/ast_mode.c
@@ -521,7 +521,6 @@ static void ast_crtc_dpms(struct drm_crtc *crtc, int mode)
 	}
 }
 
-/* ast is different - we will force move buffers out of VRAM */
 static int ast_crtc_do_set_base(struct drm_crtc *crtc,
 				struct drm_framebuffer *fb,
 				int x, int y, int atomic)
@@ -534,12 +533,15 @@ static int ast_crtc_do_set_base(struct drm_crtc *crtc,
 	s64 gpu_addr;
 	void *base;
 
-	/* push the previous fb to system ram */
 	if (!atomic && fb) {
 		ast_fb = to_ast_framebuffer(fb);
 		obj = ast_fb->obj;
 		gbo = drm_gem_vram_of_gem(obj);
-		drm_gem_vram_push_to_system(gbo);
+
+		/* unmap if console */
+		if (&ast->fbdev->afb == ast_fb)
+			drm_gem_vram_kunmap(gbo);
+		drm_gem_vram_unpin(gbo);
 	}
 
 	ast_fb = to_ast_framebuffer(crtc->primary->fb);
@@ -622,11 +624,15 @@ static void ast_crtc_disable(struct drm_crtc *crtc)
 	DRM_DEBUG_KMS("\n");
 	ast_crtc_dpms(crtc, DRM_MODE_DPMS_OFF);
 	if (crtc->primary->fb) {
+		struct ast_private *ast = crtc->dev->dev_private;
 		struct ast_framebuffer *ast_fb = to_ast_framebuffer(crtc->primary->fb);
 		struct drm_gem_object *obj = ast_fb->obj;
 		struct drm_gem_vram_object *gbo = drm_gem_vram_of_gem(obj);
 
-		drm_gem_vram_push_to_system(gbo);
+		/* unmap if console */
+		if (&ast->fbdev->afb == ast_fb)
+			drm_gem_vram_kunmap(gbo);
+		drm_gem_vram_unpin(gbo);
 	}
 	crtc->primary->fb = NULL;
 }
diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index bde8237e8021..b341b70832be 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -379,56 +379,6 @@ int drm_gem_vram_unpin_reserved(struct drm_gem_vram_object *gbo)
 }
 EXPORT_SYMBOL(drm_gem_vram_unpin_reserved);
 
-/**
- * drm_gem_vram_push_to_system() - \
-	Unpins a GEM VRAM object and moves it to system memory
- * @gbo:	the GEM VRAM object
- *
- * This operation only works if the caller holds the final pin on the
- * buffer object.
- *
- * Returns:
- * 0 on success, or
- * a negative error code otherwise.
- */
-int drm_gem_vram_push_to_system(struct drm_gem_vram_object *gbo)
-{
-	int i, ret;
-	struct ttm_operation_ctx ctx = { false, false };
-
-	ret = ttm_bo_reserve(&gbo->bo, true, false, NULL);
-	if (ret < 0)
-		return ret;
-
-	if (WARN_ON_ONCE(!gbo->pin_count))
-		goto out;
-
-	--gbo->pin_count;
-	if (gbo->pin_count)
-		goto out;
-
-	if (gbo->kmap.virtual)
-		ttm_bo_kunmap(&gbo->kmap);
-
-	drm_gem_vram_placement(gbo, TTM_PL_FLAG_SYSTEM);
-	for (i = 0; i < gbo->placement.num_placement ; ++i)
-		gbo->placements[i].flags |= TTM_PL_FLAG_NO_EVICT;
-
-	ret = ttm_bo_validate(&gbo->bo, &gbo->placement, &ctx);
-	if (ret)
-		goto err_ttm_bo_unreserve;
-
-out:
-	ttm_bo_unreserve(&gbo->bo);
-
-	return 0;
-
-err_ttm_bo_unreserve:
-	ttm_bo_unreserve(&gbo->bo);
-	return ret;
-}
-EXPORT_SYMBOL(drm_gem_vram_push_to_system);
-
 /**
  * drm_gem_vram_kmap_at() - Maps a GEM VRAM object into kernel address space
  * @gbo:	the GEM VRAM object
diff --git a/drivers/gpu/drm/drm_vram_helper_common.c b/drivers/gpu/drm/drm_vram_helper_common.c
index 3b47f7002115..e9c9f9a80ba3 100644
--- a/drivers/gpu/drm/drm_vram_helper_common.c
+++ b/drivers/gpu/drm/drm_vram_helper_common.c
@@ -79,8 +79,6 @@
  * RAM. Call drm_gem_vram_pin() with &DRM_GEM_VRAM_PL_FLAG_VRAM or
  * &DRM_GEM_VRAM_PL_FLAG_SYSTEM to pin a buffer object in video RAM or system
  * memory. Call drm_gem_vram_unpin() to release the pinned object afterwards.
- * If you have to evict a buffer object from video RAM (e.g., for freeing up
- * memory), unpin the buffer and call drm_gem_vram_push_to_system().
  *
  * A buffer object that is pinned in video RAM has a fixed address within that
  * memory region. Call drm_gem_vram_offset() to retrieve this value. Typically
diff --git a/drivers/gpu/drm/mgag200/mgag200_mode.c b/drivers/gpu/drm/mgag200/mgag200_mode.c
index e79872c968bf..1c8e0bfac015 100644
--- a/drivers/gpu/drm/mgag200/mgag200_mode.c
+++ b/drivers/gpu/drm/mgag200/mgag200_mode.c
@@ -858,8 +858,6 @@ static void mga_set_start_address(struct drm_crtc *crtc, unsigned offset)
 	WREG_ECRT(0x0, ((u8)(addr >> 16) & 0xf) | crtcext0);
 }
 
-
-/* ast is different - we will force move buffers out of VRAM */
 static int mga_crtc_do_set_base(struct drm_crtc *crtc,
 				struct drm_framebuffer *fb,
 				int x, int y, int atomic)
@@ -872,12 +870,15 @@ static int mga_crtc_do_set_base(struct drm_crtc *crtc,
 	s64 gpu_addr;
 	void *base;
 
-	/* push the previous fb to system ram */
 	if (!atomic && fb) {
 		mga_fb = to_mga_framebuffer(fb);
 		obj = mga_fb->obj;
 		gbo = drm_gem_vram_of_gem(obj);
-		drm_gem_vram_push_to_system(gbo);
+
+		/* unmap if console */
+		if (&mdev->mfbdev->mfb == mga_fb)
+			drm_gem_vram_kunmap(gbo);
+		drm_gem_vram_unpin(gbo);
 	}
 
 	mga_fb = to_mga_framebuffer(crtc->primary->fb);
@@ -1425,11 +1426,15 @@ static void mga_crtc_disable(struct drm_crtc *crtc)
 	DRM_DEBUG_KMS("\n");
 	mga_crtc_dpms(crtc, DRM_MODE_DPMS_OFF);
 	if (crtc->primary->fb) {
+		struct mga_device *mdev = crtc->dev->dev_private;
 		struct mga_framebuffer *mga_fb = to_mga_framebuffer(crtc->primary->fb);
 		struct drm_gem_object *obj = mga_fb->obj;
 		struct drm_gem_vram_object *gbo = drm_gem_vram_of_gem(obj);
 
-		drm_gem_vram_push_to_system(gbo);
+		/* unmap if console */
+		if (&mdev->mfbdev->mfb == mga_fb)
+			drm_gem_vram_kunmap(gbo);
+		drm_gem_vram_unpin(gbo);
 	}
 	crtc->primary->fb = NULL;
 }
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index ff1a81761543..22e52d7dfbb7 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -86,7 +86,6 @@ int drm_gem_vram_pin_reserved(struct drm_gem_vram_object *gbo,
 			      unsigned long pl_flag);
 int drm_gem_vram_unpin(struct drm_gem_vram_object *gbo);
 int drm_gem_vram_unpin_reserved(struct drm_gem_vram_object *gbo);
-int drm_gem_vram_push_to_system(struct drm_gem_vram_object *gbo);
 void *drm_gem_vram_kmap_at(struct drm_gem_vram_object *gbo, bool map,
 			   bool *is_iomem, struct ttm_bo_kmap_obj *kmap);
 void *drm_gem_vram_kmap(struct drm_gem_vram_object *gbo, bool map,
-- 
cgit v1.2.3


From cb1f8814bbfcc2d0d7c499530e3d878ba04e4af2 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 21 May 2019 13:08:30 +0200
Subject: drm: Rename reserve/unreserve to lock/unlock in GEM VRAM helpers

To align with the rest of DRM terminology, the GEM VRAM helpers now use
lock and unlock in places where reserve and unreserve where used before.
All callers have been adapted.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: http://patchwork.freedesktop.org/patch/msgid/20190521110831.20200-3-tzimmermann@suse.de
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/ast/ast_fb.c             | 11 ++++-----
 drivers/gpu/drm/ast/ast_mode.c           | 12 +++++-----
 drivers/gpu/drm/drm_gem_vram_helper.c    | 32 ++++++++++++-------------
 drivers/gpu/drm/mgag200/mgag200_cursor.c | 40 ++++++++++++++++----------------
 drivers/gpu/drm/mgag200/mgag200_fb.c     | 11 ++++-----
 include/drm/drm_gem_vram_helper.h        |  8 +++----
 6 files changed, 56 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/ast/ast_fb.c b/drivers/gpu/drm/ast/ast_fb.c
index 505e602855c0..05f45222b702 100644
--- a/drivers/gpu/drm/ast/ast_fb.c
+++ b/drivers/gpu/drm/ast/ast_fb.c
@@ -62,13 +62,12 @@ static void ast_dirty_update(struct ast_fbdev *afbdev,
 	obj = afbdev->afb.obj;
 	gbo = drm_gem_vram_of_gem(obj);
 
-	/*
-	 * try and reserve the BO, if we fail with busy
-	 * then the BO is being moved and we should
-	 * store up the damage until later.
+	/* Try to lock the BO. If we fail with -EBUSY then
+	 * the BO is being moved and we should store up the
+	 * damage until later.
 	 */
 	if (drm_can_sleep())
-		ret = drm_gem_vram_reserve(gbo, true);
+		ret = drm_gem_vram_lock(gbo, true);
 	if (ret) {
 		if (ret != -EBUSY)
 			return;
@@ -127,7 +126,7 @@ static void ast_dirty_update(struct ast_fbdev *afbdev,
 		drm_gem_vram_kunmap(gbo);
 
 out:
-	drm_gem_vram_unreserve(gbo);
+	drm_gem_vram_unlock(gbo);
 }
 
 static void ast_fillrect(struct fb_info *info,
diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c
index fbafe3994654..fb700d620b64 100644
--- a/drivers/gpu/drm/ast/ast_mode.c
+++ b/drivers/gpu/drm/ast/ast_mode.c
@@ -1201,7 +1201,7 @@ static int ast_cursor_set(struct drm_crtc *crtc,
 	}
 	gbo = drm_gem_vram_of_gem(obj);
 
-	ret = drm_gem_vram_reserve(gbo, false);
+	ret = drm_gem_vram_lock(gbo, false);
 	if (ret)
 		goto fail;
 
@@ -1209,7 +1209,7 @@ static int ast_cursor_set(struct drm_crtc *crtc,
 	src = drm_gem_vram_kmap_at(gbo, true, &src_isiomem, &uobj_map);
 	if (IS_ERR(src)) {
 		ret = PTR_ERR(src);
-		goto fail_unreserve;
+		goto fail_unlock;
 	}
 	if (src_isiomem == true)
 		DRM_ERROR("src cursor bo should be in main memory\n");
@@ -1218,7 +1218,7 @@ static int ast_cursor_set(struct drm_crtc *crtc,
 				   false, &dst_isiomem, &ast->cache_kmap);
 	if (IS_ERR(dst)) {
 		ret = PTR_ERR(dst);
-		goto fail_unreserve;
+		goto fail_unlock;
 	}
 	if (dst_isiomem == false)
 		DRM_ERROR("dst bo should be in VRAM\n");
@@ -1229,7 +1229,7 @@ static int ast_cursor_set(struct drm_crtc *crtc,
 	csum = copy_cursor_image(src, dst, width, height);
 
 	drm_gem_vram_kunmap_at(gbo, &uobj_map);
-	drm_gem_vram_unreserve(gbo);
+	drm_gem_vram_unlock(gbo);
 
 	/* write checksum + signature */
 	{
@@ -1262,8 +1262,8 @@ static int ast_cursor_set(struct drm_crtc *crtc,
 	drm_gem_object_put_unlocked(obj);
 	return 0;
 
-fail_unreserve:
-	drm_gem_vram_unreserve(gbo);
+fail_unlock:
+	drm_gem_vram_unlock(gbo);
 fail:
 	drm_gem_object_put_unlocked(obj);
 	return ret;
diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index b341b70832be..aefb0c361486 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -152,7 +152,7 @@ void drm_gem_vram_put(struct drm_gem_vram_object *gbo)
 EXPORT_SYMBOL(drm_gem_vram_put);
 
 /**
- * drm_gem_vram_reserve() - Reserves a VRAM-backed GEM object
+ * drm_gem_vram_lock() - Locks a VRAM-backed GEM object
  * @gbo:	the GEM VRAM object
  * @no_wait:	don't wait for buffer object to become available
  *
@@ -162,24 +162,24 @@ EXPORT_SYMBOL(drm_gem_vram_put);
  * 0 on success, or
  * a negative error code otherwise
  */
-int drm_gem_vram_reserve(struct drm_gem_vram_object *gbo, bool no_wait)
+int drm_gem_vram_lock(struct drm_gem_vram_object *gbo, bool no_wait)
 {
 	return ttm_bo_reserve(&gbo->bo, true, no_wait, NULL);
 }
-EXPORT_SYMBOL(drm_gem_vram_reserve);
+EXPORT_SYMBOL(drm_gem_vram_lock);
 
 /**
- * drm_gem_vram_unreserve() - \
-	Release a reservation acquired by drm_gem_vram_reserve()
+ * drm_gem_vram_unlock() - \
+	Release a reservation acquired by drm_gem_vram_lock()
  * @gbo:	the GEM VRAM object
  *
  * See ttm_bo_unreserve() for more information.
  */
-void drm_gem_vram_unreserve(struct drm_gem_vram_object *gbo)
+void drm_gem_vram_unlock(struct drm_gem_vram_object *gbo)
 {
 	ttm_bo_unreserve(&gbo->bo);
 }
-EXPORT_SYMBOL(drm_gem_vram_unreserve);
+EXPORT_SYMBOL(drm_gem_vram_unlock);
 
 /**
  * drm_gem_vram_mmap_offset() - Returns a GEM VRAM object's mmap offset
@@ -263,7 +263,7 @@ err_ttm_bo_unreserve:
 EXPORT_SYMBOL(drm_gem_vram_pin);
 
 /**
- * drm_gem_vram_pin_reserved() - Pins a GEM VRAM object in a region.
+ * drm_gem_vram_pin_locked() - Pins a GEM VRAM object in a region.
  * @gbo:	the GEM VRAM object
  * @pl_flag:	a bitmask of possible memory regions
  *
@@ -272,14 +272,14 @@ EXPORT_SYMBOL(drm_gem_vram_pin);
  * it can be pinned to another region.
  *
  * This function pins a GEM VRAM object that has already been
- * reserved. Use drm_gem_vram_pin() if possible.
+ * locked. Use drm_gem_vram_pin() if possible.
  *
  * Returns:
  * 0 on success, or
  * a negative error code otherwise.
  */
-int drm_gem_vram_pin_reserved(struct drm_gem_vram_object *gbo,
-			      unsigned long pl_flag)
+int drm_gem_vram_pin_locked(struct drm_gem_vram_object *gbo,
+			    unsigned long pl_flag)
 {
 	int i, ret;
 	struct ttm_operation_ctx ctx = { false, false };
@@ -301,7 +301,7 @@ int drm_gem_vram_pin_reserved(struct drm_gem_vram_object *gbo,
 
 	return 0;
 }
-EXPORT_SYMBOL(drm_gem_vram_pin_reserved);
+EXPORT_SYMBOL(drm_gem_vram_pin_locked);
 
 /**
  * drm_gem_vram_unpin() - Unpins a GEM VRAM object
@@ -346,17 +346,17 @@ err_ttm_bo_unreserve:
 EXPORT_SYMBOL(drm_gem_vram_unpin);
 
 /**
- * drm_gem_vram_unpin_reserved() - Unpins a GEM VRAM object
+ * drm_gem_vram_unpin_locked() - Unpins a GEM VRAM object
  * @gbo:	the GEM VRAM object
  *
  * This function unpins a GEM VRAM object that has already been
- * reserved. Use drm_gem_vram_unpin() if possible.
+ * locked. Use drm_gem_vram_unpin() if possible.
  *
  * Returns:
  * 0 on success, or
  * a negative error code otherwise.
  */
-int drm_gem_vram_unpin_reserved(struct drm_gem_vram_object *gbo)
+int drm_gem_vram_unpin_locked(struct drm_gem_vram_object *gbo)
 {
 	int i, ret;
 	struct ttm_operation_ctx ctx = { false, false };
@@ -377,7 +377,7 @@ int drm_gem_vram_unpin_reserved(struct drm_gem_vram_object *gbo)
 
 	return 0;
 }
-EXPORT_SYMBOL(drm_gem_vram_unpin_reserved);
+EXPORT_SYMBOL(drm_gem_vram_unpin_locked);
 
 /**
  * drm_gem_vram_kmap_at() - Maps a GEM VRAM object into kernel address space
diff --git a/drivers/gpu/drm/mgag200/mgag200_cursor.c b/drivers/gpu/drm/mgag200/mgag200_cursor.c
index 1c4fc85315a0..06a8c076cb33 100644
--- a/drivers/gpu/drm/mgag200/mgag200_cursor.c
+++ b/drivers/gpu/drm/mgag200/mgag200_cursor.c
@@ -23,9 +23,9 @@ static void mga_hide_cursor(struct mga_device *mdev)
 	WREG8(MGA_CURPOSXL, 0);
 	WREG8(MGA_CURPOSXH, 0);
 	if (mdev->cursor.pixels_1->pin_count)
-		drm_gem_vram_unpin_reserved(mdev->cursor.pixels_1);
+		drm_gem_vram_unpin_locked(mdev->cursor.pixels_1);
 	if (mdev->cursor.pixels_2->pin_count)
-		drm_gem_vram_unpin_reserved(mdev->cursor.pixels_2);
+		drm_gem_vram_unpin_locked(mdev->cursor.pixels_2);
 }
 
 int mga_crtc_cursor_set(struct drm_crtc *crtc,
@@ -80,53 +80,53 @@ int mga_crtc_cursor_set(struct drm_crtc *crtc,
 	if (!obj)
 		return -ENOENT;
 
-	ret = drm_gem_vram_reserve(pixels_1, true);
+	ret = drm_gem_vram_lock(pixels_1, true);
 	if (ret) {
 		WREG8(MGA_CURPOSXL, 0);
 		WREG8(MGA_CURPOSXH, 0);
 		goto out_unref;
 	}
-	ret = drm_gem_vram_reserve(pixels_2, true);
+	ret = drm_gem_vram_lock(pixels_2, true);
 	if (ret) {
 		WREG8(MGA_CURPOSXL, 0);
 		WREG8(MGA_CURPOSXH, 0);
-		drm_gem_vram_unreserve(pixels_1);
-		goto out_unreserve1;
+		drm_gem_vram_unlock(pixels_1);
+		goto out_unlock1;
 	}
 
 	/* Move cursor buffers into VRAM if they aren't already */
 	if (!pixels_1->pin_count) {
-		ret = drm_gem_vram_pin_reserved(pixels_1,
-						DRM_GEM_VRAM_PL_FLAG_VRAM);
+		ret = drm_gem_vram_pin_locked(pixels_1,
+					      DRM_GEM_VRAM_PL_FLAG_VRAM);
 		if (ret)
 			goto out1;
 		gpu_addr = drm_gem_vram_offset(pixels_1);
 		if (gpu_addr < 0) {
-			drm_gem_vram_unpin_reserved(pixels_1);
+			drm_gem_vram_unpin_locked(pixels_1);
 			goto out1;
 		}
 		mdev->cursor.pixels_1_gpu_addr = gpu_addr;
 	}
 	if (!pixels_2->pin_count) {
-		ret = drm_gem_vram_pin_reserved(pixels_2,
-						DRM_GEM_VRAM_PL_FLAG_VRAM);
+		ret = drm_gem_vram_pin_locked(pixels_2,
+					      DRM_GEM_VRAM_PL_FLAG_VRAM);
 		if (ret) {
-			drm_gem_vram_unpin_reserved(pixels_1);
+			drm_gem_vram_unpin_locked(pixels_1);
 			goto out1;
 		}
 		gpu_addr = drm_gem_vram_offset(pixels_2);
 		if (gpu_addr < 0) {
-			drm_gem_vram_unpin_reserved(pixels_1);
-			drm_gem_vram_unpin_reserved(pixels_2);
+			drm_gem_vram_unpin_locked(pixels_1);
+			drm_gem_vram_unpin_locked(pixels_2);
 			goto out1;
 		}
 		mdev->cursor.pixels_2_gpu_addr = gpu_addr;
 	}
 
 	gbo = drm_gem_vram_of_gem(obj);
-	ret = drm_gem_vram_reserve(gbo, true);
+	ret = drm_gem_vram_lock(gbo, true);
 	if (ret) {
-		dev_err(&dev->pdev->dev, "failed to reserve user bo\n");
+		dev_err(&dev->pdev->dev, "failed to lock user bo\n");
 		goto out1;
 	}
 	src = drm_gem_vram_kmap(gbo, true, NULL);
@@ -250,13 +250,13 @@ int mga_crtc_cursor_set(struct drm_crtc *crtc,
  out3:
 	drm_gem_vram_kunmap(gbo);
  out2:
-	drm_gem_vram_unreserve(gbo);
+	drm_gem_vram_unlock(gbo);
  out1:
 	if (ret)
 		mga_hide_cursor(mdev);
-	drm_gem_vram_unreserve(pixels_1);
-out_unreserve1:
-	drm_gem_vram_unreserve(pixels_2);
+	drm_gem_vram_unlock(pixels_1);
+out_unlock1:
+	drm_gem_vram_unlock(pixels_2);
 out_unref:
 	drm_gem_object_put_unlocked(obj);
 
diff --git a/drivers/gpu/drm/mgag200/mgag200_fb.c b/drivers/gpu/drm/mgag200/mgag200_fb.c
index 87217bdce9f8..97c575a9a86f 100644
--- a/drivers/gpu/drm/mgag200/mgag200_fb.c
+++ b/drivers/gpu/drm/mgag200/mgag200_fb.c
@@ -36,13 +36,12 @@ static void mga_dirty_update(struct mga_fbdev *mfbdev,
 	obj = mfbdev->mfb.obj;
 	gbo = drm_gem_vram_of_gem(obj);
 
-	/*
-	 * try and reserve the BO, if we fail with busy
-	 * then the BO is being moved and we should
-	 * store up the damage until later.
+	/* Try to lock the BO. If we fail with -EBUSY then
+	 * the BO is being moved and we should store up the
+	 * damage until later.
 	 */
 	if (drm_can_sleep())
-		ret = drm_gem_vram_reserve(gbo, true);
+		ret = drm_gem_vram_lock(gbo, true);
 	if (ret) {
 		if (ret != -EBUSY)
 			return;
@@ -101,7 +100,7 @@ static void mga_dirty_update(struct mga_fbdev *mfbdev,
 		drm_gem_vram_kunmap(gbo);
 
 out:
-	drm_gem_vram_unreserve(gbo);
+	drm_gem_vram_unlock(gbo);
 }
 
 static void mga_fillrect(struct fb_info *info,
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index 22e52d7dfbb7..4d1d2c1bf32b 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -77,15 +77,15 @@ struct drm_gem_vram_object *drm_gem_vram_create(struct drm_device *dev,
 						unsigned long pg_align,
 						bool interruptible);
 void drm_gem_vram_put(struct drm_gem_vram_object *gbo);
-int drm_gem_vram_reserve(struct drm_gem_vram_object *gbo, bool no_wait);
-void drm_gem_vram_unreserve(struct drm_gem_vram_object *gbo);
+int drm_gem_vram_lock(struct drm_gem_vram_object *gbo, bool no_wait);
+void drm_gem_vram_unlock(struct drm_gem_vram_object *gbo);
 u64 drm_gem_vram_mmap_offset(struct drm_gem_vram_object *gbo);
 s64 drm_gem_vram_offset(struct drm_gem_vram_object *gbo);
 int drm_gem_vram_pin(struct drm_gem_vram_object *gbo, unsigned long pl_flag);
-int drm_gem_vram_pin_reserved(struct drm_gem_vram_object *gbo,
+int drm_gem_vram_pin_locked(struct drm_gem_vram_object *gbo,
 			      unsigned long pl_flag);
 int drm_gem_vram_unpin(struct drm_gem_vram_object *gbo);
-int drm_gem_vram_unpin_reserved(struct drm_gem_vram_object *gbo);
+int drm_gem_vram_unpin_locked(struct drm_gem_vram_object *gbo);
 void *drm_gem_vram_kmap_at(struct drm_gem_vram_object *gbo, bool map,
 			   bool *is_iomem, struct ttm_bo_kmap_obj *kmap);
 void *drm_gem_vram_kmap(struct drm_gem_vram_object *gbo, bool map,
-- 
cgit v1.2.3


From d825c565304f33d5f717a35f74d5109ff4820cae Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Sun, 19 May 2019 21:55:22 +0200
Subject: drm/gma500: remove empty gma_drm.h header file

The header file gma_drm.h is empty so remove it and
drop all uses of the file.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Patrik Jakobsson <patrik.r.jakobsson@gmail.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Patrik Jakobsson <patrik.r.jakobsson@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190519195526.3422-2-sam@ravnborg.org
---
 drivers/gpu/drm/gma500/cdv_device.c      |  2 +-
 drivers/gpu/drm/gma500/gem.c             |  2 +-
 drivers/gpu/drm/gma500/intel_bios.c      |  2 +-
 drivers/gpu/drm/gma500/intel_gmbus.c     |  2 +-
 drivers/gpu/drm/gma500/mid_bios.c        |  2 +-
 drivers/gpu/drm/gma500/oaktrail_device.c |  2 +-
 drivers/gpu/drm/gma500/psb_device.c      |  2 +-
 drivers/gpu/drm/gma500/psb_drv.h         |  2 +-
 drivers/gpu/drm/gma500/psb_intel_sdvo.c  |  2 +-
 include/drm/gma_drm.h                    | 25 -------------------------
 10 files changed, 9 insertions(+), 34 deletions(-)
 delete mode 100644 include/drm/gma_drm.h

(limited to 'include')

diff --git a/drivers/gpu/drm/gma500/cdv_device.c b/drivers/gpu/drm/gma500/cdv_device.c
index 34b85767e4da..6ba605ac7123 100644
--- a/drivers/gpu/drm/gma500/cdv_device.c
+++ b/drivers/gpu/drm/gma500/cdv_device.c
@@ -20,7 +20,7 @@
 #include <linux/backlight.h>
 #include <drm/drmP.h>
 #include <drm/drm.h>
-#include <drm/gma_drm.h>
+
 #include "psb_drv.h"
 #include "psb_reg.h"
 #include "psb_intel_reg.h"
diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c
index 576f1b272f23..cfc5b559c77b 100644
--- a/drivers/gpu/drm/gma500/gem.c
+++ b/drivers/gpu/drm/gma500/gem.c
@@ -25,8 +25,8 @@
 
 #include <drm/drmP.h>
 #include <drm/drm.h>
-#include <drm/gma_drm.h>
 #include <drm/drm_vma_manager.h>
+
 #include "psb_drv.h"
 
 void psb_gem_free_object(struct drm_gem_object *obj)
diff --git a/drivers/gpu/drm/gma500/intel_bios.c b/drivers/gpu/drm/gma500/intel_bios.c
index 63bde4e86c6a..45c17c6f58f8 100644
--- a/drivers/gpu/drm/gma500/intel_bios.c
+++ b/drivers/gpu/drm/gma500/intel_bios.c
@@ -20,7 +20,7 @@
  */
 #include <drm/drmP.h>
 #include <drm/drm.h>
-#include <drm/gma_drm.h>
+
 #include "psb_drv.h"
 #include "psb_intel_drv.h"
 #include "psb_intel_reg.h"
diff --git a/drivers/gpu/drm/gma500/intel_gmbus.c b/drivers/gpu/drm/gma500/intel_gmbus.c
index e7e22187c539..b1bf314ab150 100644
--- a/drivers/gpu/drm/gma500/intel_gmbus.c
+++ b/drivers/gpu/drm/gma500/intel_gmbus.c
@@ -30,8 +30,8 @@
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
 #include <drm/drmP.h>
+
 #include "psb_intel_drv.h"
-#include <drm/gma_drm.h>
 #include "psb_drv.h"
 #include "psb_intel_reg.h"
 
diff --git a/drivers/gpu/drm/gma500/mid_bios.c b/drivers/gpu/drm/gma500/mid_bios.c
index 237041a37532..e00778b002ec 100644
--- a/drivers/gpu/drm/gma500/mid_bios.c
+++ b/drivers/gpu/drm/gma500/mid_bios.c
@@ -25,7 +25,7 @@
 
 #include <drm/drmP.h>
 #include <drm/drm.h>
-#include <drm/gma_drm.h>
+
 #include "psb_drv.h"
 #include "mid_bios.h"
 
diff --git a/drivers/gpu/drm/gma500/oaktrail_device.c b/drivers/gpu/drm/gma500/oaktrail_device.c
index ba30b43a3412..f0872e2c22d5 100644
--- a/drivers/gpu/drm/gma500/oaktrail_device.c
+++ b/drivers/gpu/drm/gma500/oaktrail_device.c
@@ -22,7 +22,7 @@
 #include <linux/dmi.h>
 #include <drm/drmP.h>
 #include <drm/drm.h>
-#include <drm/gma_drm.h>
+
 #include "psb_drv.h"
 #include "psb_reg.h"
 #include "psb_intel_reg.h"
diff --git a/drivers/gpu/drm/gma500/psb_device.c b/drivers/gpu/drm/gma500/psb_device.c
index dc0f8527570c..9fd418985d70 100644
--- a/drivers/gpu/drm/gma500/psb_device.c
+++ b/drivers/gpu/drm/gma500/psb_device.c
@@ -20,7 +20,7 @@
 #include <linux/backlight.h>
 #include <drm/drmP.h>
 #include <drm/drm.h>
-#include <drm/gma_drm.h>
+
 #include "psb_drv.h"
 #include "psb_reg.h"
 #include "psb_intel_reg.h"
diff --git a/drivers/gpu/drm/gma500/psb_drv.h b/drivers/gpu/drm/gma500/psb_drv.h
index 941b238bdcc9..fbccf2abdad7 100644
--- a/drivers/gpu/drm/gma500/psb_drv.h
+++ b/drivers/gpu/drm/gma500/psb_drv.h
@@ -24,7 +24,7 @@
 #include <linux/mm_types.h>
 
 #include <drm/drmP.h>
-#include <drm/gma_drm.h>
+
 #include "psb_reg.h"
 #include "psb_intel_drv.h"
 #include "gma_display.h"
diff --git a/drivers/gpu/drm/gma500/psb_intel_sdvo.c b/drivers/gpu/drm/gma500/psb_intel_sdvo.c
index dd3cec0e3190..5090ef5f348d 100644
--- a/drivers/gpu/drm/gma500/psb_intel_sdvo.c
+++ b/drivers/gpu/drm/gma500/psb_intel_sdvo.c
@@ -32,8 +32,8 @@
 #include <drm/drmP.h>
 #include <drm/drm_crtc.h>
 #include <drm/drm_edid.h>
+
 #include "psb_intel_drv.h"
-#include <drm/gma_drm.h>
 #include "psb_drv.h"
 #include "psb_intel_sdvo_regs.h"
 #include "psb_intel_reg.h"
diff --git a/include/drm/gma_drm.h b/include/drm/gma_drm.h
deleted file mode 100644
index 87ac5e6ca551..000000000000
--- a/include/drm/gma_drm.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/**************************************************************************
- * Copyright (c) 2007-2011, Intel Corporation.
- * All Rights Reserved.
- * Copyright (c) 2008, Tungsten Graphics Inc.  Cedar Park, TX., USA.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
- **************************************************************************/
-
-#ifndef _GMA_DRM_H_
-#define _GMA_DRM_H_
-
-#endif
-- 
cgit v1.2.3


From c5d3e39caa456b1e061644b739131f2b54c84c08 Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Date: Wed, 22 May 2019 10:00:54 +0100
Subject: drm/i915: Engine discovery query

Engine discovery query allows userspace to enumerate engines, probe their
configuration features, all without needing to maintain the internal PCI
ID based database.

A new query for the generic i915 query ioctl is added named
DRM_I915_QUERY_ENGINE_INFO, together with accompanying structure
drm_i915_query_engine_info. The address of latter should be passed to the
kernel in the query.data_ptr field, and should be large enough for the
kernel to fill out all known engines as struct drm_i915_engine_info
elements trailing the query.

As with other queries, setting the item query length to zero allows
userspace to query minimum required buffer size.

Enumerated engines have common type mask which can be used to query all
hardware engines, versus engines userspace can submit to using the execbuf
uAPI.

Engines also have capabilities which are per engine class namespace of
bits describing features not present on all engine instances.

v2:
 * Fixed HEVC assignment.
 * Reorder some fields, rename type to flags, increase width. (Lionel)
 * No need to allocate temporary storage if we do it engine by engine.
   (Lionel)

v3:
 * Describe engine flags and mark mbz fields. (Lionel)
 * HEVC only applies to VCS.

v4:
 * Squash SFC flag into main patch.
 * Tidy some comments.

v5:
 * Add uabi_ prefix to engine capabilities. (Chris Wilson)
 * Report exact size of engine info array. (Chris Wilson)
 * Drop the engine flags. (Joonas Lahtinen)
 * Added some more reserved fields.
 * Move flags after class/instance.

v6:
 * Do not check engine info array was zeroed by userspace but zero the
   unused fields for them instead.

v7:
 * Simplify length calculation loop. (Lionel Landwerlin)

v8:
 * Remove MBZ comments where not applicable.
 * Rename ABI flags to match engine class define naming.
 * Rename SFC ABI flag to reflect it applies to VCS and VECS.
 * SFC is wired to even _logical_ engine instances.
 * SFC applies to VCS and VECS.
 * HEVC is present on all instances on Gen11. (Tony)
 * Simplify length calculation even more. (Chris Wilson)
 * Move info_ptr assigment closer to loop for clarity. (Chris Wilson)
 * Use vdbox_sfc_access from runtime info.
 * Rebase for RUNTIME_INFO.
 * Refactor for lower indentation.
 * Rename uAPI class/instance to engine_class/instance to avoid C++
   keyword.

v9:
 * Rebase for s/num_rings/num_engines/ in RUNTIME_INFO.

v10:
 * Use new copy_query_item.

v11:
 * Consolidate with struct i915_engine_class_instnace.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Tony Ye <tony.ye@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v7
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20190522090054.6007-1-tvrtko.ursulin@linux.intel.com
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c    | 41 +++++++++++++++++++++++
 drivers/gpu/drm/i915/gt/intel_engine_types.h |  2 ++
 drivers/gpu/drm/i915/i915_query.c            | 49 ++++++++++++++++++++++++++++
 include/uapi/drm/i915_drm.h                  | 42 ++++++++++++++++++++++++
 4 files changed, 134 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 4c3753c1b573..2590f5904b67 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -349,6 +349,45 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
 	return 0;
 }
 
+static void __setup_engine_capabilities(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *i915 = engine->i915;
+
+	if (engine->class == VIDEO_DECODE_CLASS) {
+		/*
+		 * HEVC support is present on first engine instance
+		 * before Gen11 and on all instances afterwards.
+		 */
+		if (INTEL_GEN(i915) >= 11 ||
+		    (INTEL_GEN(i915) >= 9 && engine->instance == 0))
+			engine->uabi_capabilities |=
+				I915_VIDEO_CLASS_CAPABILITY_HEVC;
+
+		/*
+		 * SFC block is present only on even logical engine
+		 * instances.
+		 */
+		if ((INTEL_GEN(i915) >= 11 &&
+		     RUNTIME_INFO(i915)->vdbox_sfc_access & engine->mask) ||
+		    (INTEL_GEN(i915) >= 9 && engine->instance == 0))
+			engine->uabi_capabilities |=
+				I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC;
+	} else if (engine->class == VIDEO_ENHANCEMENT_CLASS) {
+		if (INTEL_GEN(i915) >= 9)
+			engine->uabi_capabilities |=
+				I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC;
+	}
+}
+
+static void intel_setup_engine_capabilities(struct drm_i915_private *i915)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	for_each_engine(engine, i915, id)
+		__setup_engine_capabilities(engine);
+}
+
 /**
  * intel_engines_cleanup() - free the resources allocated for Command Streamers
  * @i915: the i915 devic
@@ -414,6 +453,8 @@ int intel_engines_init_mmio(struct drm_i915_private *i915)
 
 	i915_check_and_clear_faults(i915);
 
+	intel_setup_engine_capabilities(i915);
+
 	return 0;
 
 cleanup:
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index f3fc2e8acc90..40e774acc2cd 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -280,6 +280,8 @@ struct intel_engine_cs {
 	u32 context_size;
 	u32 mmio_base;
 
+	u32 uabi_capabilities;
+
 	struct intel_sseu sseu;
 
 	struct intel_ring *buffer;
diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
index 782183b78f49..414d0a6d1f70 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -96,9 +96,58 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
 	return total_length;
 }
 
+static int
+query_engine_info(struct drm_i915_private *i915,
+		  struct drm_i915_query_item *query_item)
+{
+	struct drm_i915_query_engine_info __user *query_ptr =
+				u64_to_user_ptr(query_item->data_ptr);
+	struct drm_i915_engine_info __user *info_ptr;
+	struct drm_i915_query_engine_info query;
+	struct drm_i915_engine_info info = { };
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	int len, ret;
+
+	if (query_item->flags)
+		return -EINVAL;
+
+	len = sizeof(struct drm_i915_query_engine_info) +
+	      RUNTIME_INFO(i915)->num_engines *
+	      sizeof(struct drm_i915_engine_info);
+
+	ret = copy_query_item(&query, sizeof(query), len, query_item);
+	if (ret != 0)
+		return ret;
+
+	if (query.num_engines || query.rsvd[0] || query.rsvd[1] ||
+	    query.rsvd[2])
+		return -EINVAL;
+
+	info_ptr = &query_ptr->engines[0];
+
+	for_each_engine(engine, i915, id) {
+		info.engine.engine_class = engine->uabi_class;
+		info.engine.engine_instance = engine->instance;
+		info.capabilities = engine->uabi_capabilities;
+
+		if (__copy_to_user(info_ptr, &info, sizeof(info)))
+			return -EFAULT;
+
+		query.num_engines++;
+		info_ptr++;
+	}
+
+	if (__copy_to_user(query_ptr, &query, sizeof(query)))
+		return -EFAULT;
+
+	return len;
+}
+
 static int (* const i915_query_funcs[])(struct drm_i915_private *dev_priv,
 					struct drm_i915_query_item *query_item) = {
 	query_topology_info,
+	query_engine_info,
 };
 
 int i915_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index bdb00ec1f8be..328d05e77d9f 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1982,6 +1982,7 @@ struct drm_i915_perf_oa_config {
 struct drm_i915_query_item {
 	__u64 query_id;
 #define DRM_I915_QUERY_TOPOLOGY_INFO    1
+#define DRM_I915_QUERY_ENGINE_INFO	2
 /* Must be kept compact -- no holes and well documented */
 
 	/*
@@ -2080,6 +2081,47 @@ struct drm_i915_query_topology_info {
 	__u8 data[];
 };
 
+/**
+ * struct drm_i915_engine_info
+ *
+ * Describes one engine and it's capabilities as known to the driver.
+ */
+struct drm_i915_engine_info {
+	/** Engine class and instance. */
+	struct i915_engine_class_instance engine;
+
+	/** Reserved field. */
+	__u32 rsvd0;
+
+	/** Engine flags. */
+	__u64 flags;
+
+	/** Capabilities of this engine. */
+	__u64 capabilities;
+#define I915_VIDEO_CLASS_CAPABILITY_HEVC		(1 << 0)
+#define I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC	(1 << 1)
+
+	/** Reserved fields. */
+	__u64 rsvd1[4];
+};
+
+/**
+ * struct drm_i915_query_engine_info
+ *
+ * Engine info query enumerates all engines known to the driver by filling in
+ * an array of struct drm_i915_engine_info structures.
+ */
+struct drm_i915_query_engine_info {
+	/** Number of struct drm_i915_engine_info structs following. */
+	__u32 num_engines;
+
+	/** MBZ */
+	__u32 rsvd[3];
+
+	/** Marker for drm_i915_engine_info structures. */
+	struct drm_i915_engine_info engines[];
+};
+
 #if defined(__cplusplus)
 }
 #endif
-- 
cgit v1.2.3


From f13e143e7444bffc53f5c2904aeed76646da69d6 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Tue, 3 Jul 2018 16:42:26 +0200
Subject: dma-buf: start caching of sg_table objects v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To allow a smooth transition from pinning buffer objects to dynamic
invalidation we first start to cache the sg_table for an attachment.

v2: keep closer to the DRM implementation

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.kernel.org/patch/10943053/
---
 drivers/dma-buf/dma-buf.c | 27 +++++++++++++++++++++++++--
 include/linux/dma-buf.h   | 13 +++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 3ae6c0c2cc02..f4104a21b069 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -576,6 +576,7 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf,
 	list_add(&attach->node, &dmabuf->attachments);
 
 	mutex_unlock(&dmabuf->lock);
+
 	return attach;
 
 err_attach:
@@ -598,6 +599,9 @@ void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach)
 	if (WARN_ON(!dmabuf || !attach))
 		return;
 
+	if (attach->sgt)
+		dmabuf->ops->unmap_dma_buf(attach, attach->sgt, attach->dir);
+
 	mutex_lock(&dmabuf->lock);
 	list_del(&attach->node);
 	if (dmabuf->ops->detach)
@@ -633,10 +637,27 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach,
 	if (WARN_ON(!attach || !attach->dmabuf))
 		return ERR_PTR(-EINVAL);
 
+	if (attach->sgt) {
+		/*
+		 * Two mappings with different directions for the same
+		 * attachment are not allowed.
+		 */
+		if (attach->dir != direction &&
+		    attach->dir != DMA_BIDIRECTIONAL)
+			return ERR_PTR(-EBUSY);
+
+		return attach->sgt;
+	}
+
 	sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction);
 	if (!sg_table)
 		sg_table = ERR_PTR(-ENOMEM);
 
+	if (!IS_ERR(sg_table) && attach->dmabuf->ops->cache_sgt_mapping) {
+		attach->sgt = sg_table;
+		attach->dir = direction;
+	}
+
 	return sg_table;
 }
 EXPORT_SYMBOL_GPL(dma_buf_map_attachment);
@@ -660,8 +681,10 @@ void dma_buf_unmap_attachment(struct dma_buf_attachment *attach,
 	if (WARN_ON(!attach || !attach->dmabuf || !sg_table))
 		return;
 
-	attach->dmabuf->ops->unmap_dma_buf(attach, sg_table,
-						direction);
+	if (attach->sgt == sg_table)
+		return;
+
+	attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction);
 }
 EXPORT_SYMBOL_GPL(dma_buf_unmap_attachment);
 
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index a0bd071466fc..8a327566d7f4 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -44,6 +44,15 @@ struct dma_buf_attachment;
  * @vunmap: [optional] unmaps a vmap from the buffer
  */
 struct dma_buf_ops {
+	/**
+	  * @cache_sgt_mapping:
+	  *
+	  * If true the framework will cache the first mapping made for each
+	  * attachment. This avoids creating mappings for attachments multiple
+	  * times.
+	  */
+	bool cache_sgt_mapping;
+
 	/**
 	 * @attach:
 	 *
@@ -323,6 +332,8 @@ struct dma_buf {
  * @dmabuf: buffer for this attachment.
  * @dev: device attached to the buffer.
  * @node: list of dma_buf_attachment.
+ * @sgt: cached mapping.
+ * @dir: direction of cached mapping.
  * @priv: exporter specific attachment data.
  *
  * This structure holds the attachment information between the dma_buf buffer
@@ -338,6 +349,8 @@ struct dma_buf_attachment {
 	struct dma_buf *dmabuf;
 	struct device *dev;
 	struct list_head node;
+	struct sg_table *sgt;
+	enum dma_data_direction dir;
 	void *priv;
 };
 
-- 
cgit v1.2.3


From fbb5d0353c62d10c3699ec844d2d015a762952d7 Mon Sep 17 00:00:00 2001
From: Uma Shankar <uma.shankar@intel.com>
Date: Thu, 16 May 2019 19:40:06 +0530
Subject: drm: Add HDR source metadata property
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds a blob property to get HDR metadata
information from userspace. This will be send as part
of AVI Infoframe to panel.

It also implements get() and set() functions for HDR output
metadata property.The blob data is received from userspace and
saved in connector state, the same is returned as blob in get
property call to userspace.

v2: Rebase and modified the metadata structure elements
as per Ville's POC changes.

v3: No Change

v4: Addressed Shashank's review comments

v5: Rebase.

v6: Addressed Brian Starkey's review comments, defined
new structure with header for dynamic metadata scalability.
Merge get/set property functions for metadata in this patch.

v7: Addressed Jonas Karlman review comments and defined separate
structure for infoframe to better align with CTA 861.G spec. Added
Shashank's RB.

v8: Addressed Ville's review comments. Moved sink metadata structure
out of uapi headers as suggested by Jonas Karlman.

v9: Rebase and addressed Jonas Karlman review comments.

v10: Addressed Ville's review comments, dropped the metdata_changed
state variable as its not needed anymore.

Signed-off-by: Uma Shankar <uma.shankar@intel.com>
Reviewed-by: Shashank Sharma <shashank.sharma@intel.com>
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-2-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_atomic_uapi.c | 12 ++++++++++++
 drivers/gpu/drm/drm_connector.c   |  6 ++++++
 include/drm/drm_connector.h       | 10 ++++++++++
 include/drm/drm_mode_config.h     |  7 +++++++
 include/linux/hdmi.h              | 26 ++++++++++++++++++++++++++
 include/uapi/drm/drm_mode.h       | 23 +++++++++++++++++++++++
 6 files changed, 84 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c
index 428d82662dc4..125605ff45af 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -676,6 +676,8 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector,
 {
 	struct drm_device *dev = connector->dev;
 	struct drm_mode_config *config = &dev->mode_config;
+	bool replaced = false;
+	int ret;
 
 	if (property == config->prop_crtc_id) {
 		struct drm_crtc *crtc = drm_crtc_find(dev, file_priv, val);
@@ -726,6 +728,13 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector,
 		 */
 		if (state->link_status != DRM_LINK_STATUS_GOOD)
 			state->link_status = val;
+	} else if (property == config->hdr_output_metadata_property) {
+		ret = drm_atomic_replace_property_blob_from_id(dev,
+				&state->hdr_output_metadata,
+				val,
+				sizeof(struct hdr_output_metadata), -1,
+				&replaced);
+		return ret;
 	} else if (property == config->aspect_ratio_property) {
 		state->picture_aspect_ratio = val;
 	} else if (property == config->content_type_property) {
@@ -814,6 +823,9 @@ drm_atomic_connector_get_property(struct drm_connector *connector,
 		*val = state->colorspace;
 	} else if (property == connector->scaling_mode_property) {
 		*val = state->scaling_mode;
+	} else if (property == config->hdr_output_metadata_property) {
+		*val = state->hdr_output_metadata ?
+			state->hdr_output_metadata->base.id : 0;
 	} else if (property == connector->content_protection_property) {
 		*val = state->content_protection;
 	} else if (property == config->writeback_fb_id_property) {
diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index b34c3d38bf15..365ace0c0c9e 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -1058,6 +1058,12 @@ int drm_connector_create_standard_properties(struct drm_device *dev)
 		return -ENOMEM;
 	dev->mode_config.non_desktop_property = prop;
 
+	prop = drm_property_create(dev, DRM_MODE_PROP_BLOB,
+				   "HDR_OUTPUT_METADATA", 0);
+	if (!prop)
+		return -ENOMEM;
+	dev->mode_config.hdr_output_metadata_property = prop;
+
 	return 0;
 }
 
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index f43f40d5888a..f0e987df4c1e 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -603,6 +603,12 @@ struct drm_connector_state {
 	 * and the connector bpc limitations obtained from edid.
 	 */
 	u8 max_bpc;
+
+	/**
+	 * @hdr_output_metadata:
+	 * DRM blob property for HDR output metadata
+	 */
+	struct drm_property_blob *hdr_output_metadata;
 };
 
 /**
@@ -1243,6 +1249,10 @@ struct drm_connector {
 	 * &drm_mode_config.connector_free_work.
 	 */
 	struct llist_node free_node;
+
+	/* HDR metdata */
+	struct hdr_output_metadata hdr_output_metadata;
+	struct hdr_sink_metadata hdr_sink_metadata;
 };
 
 #define obj_to_connector(x) container_of(x, struct drm_connector, base)
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 7f60e8eb269a..c031b5a9d8d1 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -836,6 +836,13 @@ struct drm_mode_config {
 	 */
 	struct drm_property *writeback_out_fence_ptr_property;
 
+	/**
+	 * hdr_output_metadata_property: Connector property containing hdr
+	 * metatda. This will be provided by userspace compositors based
+	 * on HDR content
+	 */
+	struct drm_property *hdr_output_metadata_property;
+
 	/* dumb ioctl parameters */
 	uint32_t preferred_depth, prefer_shadow;
 
diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h
index 927ad6451105..6780476dcbff 100644
--- a/include/linux/hdmi.h
+++ b/include/linux/hdmi.h
@@ -152,6 +152,16 @@ enum hdmi_content_type {
 	HDMI_CONTENT_TYPE_GAME,
 };
 
+enum hdmi_metadata_type {
+	HDMI_STATIC_METADATA_TYPE1 = 1,
+};
+
+enum hdmi_eotf {
+	HDMI_EOTF_TRADITIONAL_GAMMA_SDR,
+	HDMI_EOTF_TRADITIONAL_GAMMA_HDR,
+	HDMI_EOTF_SMPTE_ST2084,
+};
+
 struct hdmi_avi_infoframe {
 	enum hdmi_infoframe_type type;
 	unsigned char version;
@@ -320,6 +330,22 @@ struct hdmi_vendor_infoframe {
 	unsigned int s3d_ext_data;
 };
 
+/* HDR Metadata as per 861.G spec */
+struct hdr_static_metadata {
+	__u8 eotf;
+	__u8 metadata_type;
+	__u16 max_cll;
+	__u16 max_fall;
+	__u16 min_cll;
+};
+
+struct hdr_sink_metadata {
+	__u32 metadata_type;
+	union {
+		struct hdr_static_metadata hdmi_type1;
+	};
+};
+
 int hdmi_vendor_infoframe_init(struct hdmi_vendor_infoframe *frame);
 ssize_t hdmi_vendor_infoframe_pack(struct hdmi_vendor_infoframe *frame,
 				   void *buffer, size_t size);
diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index 83cd1636b9be..997a7e05c0c6 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@ -630,6 +630,29 @@ struct drm_color_lut {
 	__u16 reserved;
 };
 
+/* HDR Metadata Infoframe as per 861.G spec */
+struct hdr_metadata_infoframe {
+	__u8 eotf;
+	__u8 metadata_type;
+	struct {
+		__u16 x, y;
+		} display_primaries[3];
+	struct {
+		__u16 x, y;
+		} white_point;
+	__u16 max_display_mastering_luminance;
+	__u16 min_display_mastering_luminance;
+	__u16 max_cll;
+	__u16 max_fall;
+};
+
+struct hdr_output_metadata {
+	__u32 metadata_type;
+	union {
+		struct hdr_metadata_infoframe hdmi_metadata_type1;
+	};
+};
+
 #define DRM_MODE_PAGE_FLIP_EVENT 0x01
 #define DRM_MODE_PAGE_FLIP_ASYNC 0x02
 #define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4
-- 
cgit v1.2.3


From 2cdbfd66a82969770ce1a7032fb1e2155a08cee8 Mon Sep 17 00:00:00 2001
From: Uma Shankar <uma.shankar@intel.com>
Date: Thu, 16 May 2019 19:40:09 +0530
Subject: drm: Enable HDR infoframe support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enable Dynamic Range and Mastering Infoframe for HDR
content, which is defined in CEA 861.3 spec.

The metadata will be computed based on blending
policy in userspace compositors and passed as a connector
property blob to driver. The same will be sent as infoframe
to panel which support HDR.

Added the const version of infoframe for DRM metadata
for HDR.

v2: Rebase and added Ville's POC changes.

v3: No Change

v4: Addressed Shashank's review comments and merged the
patch making drm infoframe function arguments as constant.

v5: Rebase

v6: Fixed checkpatch warnings with --strict option. Addressed
Shashank's review comments and added his RB.

v7: Addressed Brian Starkey's review comments. Merged 2 patches
into one.

v8: Addressed Jonas Karlman review comments.

v9: Addressed Jonas Karlman review comments.

v10: Addressed Ville's review comments.

v11: Added BUILD_BUG_ON and sizeof instead of magic numbers as
per Ville's comments.

Signed-off-by: Uma Shankar <uma.shankar@intel.com>
Reviewed-by: Shashank Sharma <shashank.sharma@intel.com>
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-5-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_edid.c |  72 +++++++++++++++++
 drivers/video/hdmi.c       | 190 +++++++++++++++++++++++++++++++++++++++++++++
 include/drm/drm_edid.h     |   5 ++
 include/linux/hdmi.h       |  28 +++++++
 4 files changed, 295 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index a5ef9f45fee0..73560c9437cd 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -4904,6 +4904,78 @@ static bool is_hdmi2_sink(struct drm_connector *connector)
 		connector->display_info.color_formats & DRM_COLOR_FORMAT_YCRCB420;
 }
 
+static inline bool is_eotf_supported(u8 output_eotf, u8 sink_eotf)
+{
+	return sink_eotf & BIT(output_eotf);
+}
+
+/**
+ * drm_hdmi_infoframe_set_hdr_metadata() - fill an HDMI DRM infoframe with
+ *                                         HDR metadata from userspace
+ * @frame: HDMI DRM infoframe
+ * @hdr_metadata: hdr_source_metadata info from userspace
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int
+drm_hdmi_infoframe_set_hdr_metadata(struct hdmi_drm_infoframe *frame,
+				    const struct drm_connector_state *conn_state)
+{
+	struct drm_connector *connector;
+	struct hdr_output_metadata *hdr_metadata;
+	int err;
+
+	if (!frame || !conn_state)
+		return -EINVAL;
+
+	connector = conn_state->connector;
+
+	if (!conn_state->hdr_output_metadata)
+		return -EINVAL;
+
+	hdr_metadata = conn_state->hdr_output_metadata->data;
+
+	if (!hdr_metadata || !connector)
+		return -EINVAL;
+
+	/* Sink EOTF is Bit map while infoframe is absolute values */
+	if (!is_eotf_supported(hdr_metadata->hdmi_metadata_type1.eotf,
+	    connector->hdr_sink_metadata.hdmi_type1.eotf)) {
+		DRM_DEBUG_KMS("EOTF Not Supported\n");
+		return -EINVAL;
+	}
+
+	err = hdmi_drm_infoframe_init(frame);
+	if (err < 0)
+		return err;
+
+	frame->eotf = hdr_metadata->hdmi_metadata_type1.eotf;
+	frame->metadata_type = hdr_metadata->hdmi_metadata_type1.metadata_type;
+
+	BUILD_BUG_ON(sizeof(frame->display_primaries) !=
+		     sizeof(hdr_metadata->hdmi_metadata_type1.display_primaries));
+	BUILD_BUG_ON(sizeof(frame->white_point) !=
+		     sizeof(hdr_metadata->hdmi_metadata_type1.white_point));
+
+	memcpy(&frame->display_primaries,
+	       &hdr_metadata->hdmi_metadata_type1.display_primaries,
+	       sizeof(frame->display_primaries));
+
+	memcpy(&frame->white_point,
+	       &hdr_metadata->hdmi_metadata_type1.white_point,
+	       sizeof(frame->white_point));
+
+	frame->max_display_mastering_luminance =
+		hdr_metadata->hdmi_metadata_type1.max_display_mastering_luminance;
+	frame->min_display_mastering_luminance =
+		hdr_metadata->hdmi_metadata_type1.min_display_mastering_luminance;
+	frame->max_fall = hdr_metadata->hdmi_metadata_type1.max_fall;
+	frame->max_cll = hdr_metadata->hdmi_metadata_type1.max_cll;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_hdmi_infoframe_set_hdr_metadata);
+
 /**
  * drm_hdmi_avi_infoframe_from_display_mode() - fill an HDMI AVI infoframe with
  *                                              data from a DRM display mode
diff --git a/drivers/video/hdmi.c b/drivers/video/hdmi.c
index 799ae49774f5..481f0367dfd3 100644
--- a/drivers/video/hdmi.c
+++ b/drivers/video/hdmi.c
@@ -650,6 +650,150 @@ hdmi_vendor_any_infoframe_check_only(const union hdmi_vendor_any_infoframe *fram
 	return 0;
 }
 
+/**
+ * hdmi_drm_infoframe_init() - initialize an HDMI Dynaminc Range and
+ * mastering infoframe
+ * @frame: HDMI DRM infoframe
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */
+int hdmi_drm_infoframe_init(struct hdmi_drm_infoframe *frame)
+{
+	memset(frame, 0, sizeof(*frame));
+
+	frame->type = HDMI_INFOFRAME_TYPE_DRM;
+	frame->version = 1;
+	frame->length = HDMI_DRM_INFOFRAME_SIZE;
+
+	return 0;
+}
+EXPORT_SYMBOL(hdmi_drm_infoframe_init);
+
+static int hdmi_drm_infoframe_check_only(const struct hdmi_drm_infoframe *frame)
+{
+	if (frame->type != HDMI_INFOFRAME_TYPE_DRM ||
+	    frame->version != 1)
+		return -EINVAL;
+
+	if (frame->length != HDMI_DRM_INFOFRAME_SIZE)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * hdmi_drm_infoframe_check() - check a HDMI DRM infoframe
+ * @frame: HDMI DRM infoframe
+ *
+ * Validates that the infoframe is consistent.
+ * Returns 0 on success or a negative error code on failure.
+ */
+int hdmi_drm_infoframe_check(struct hdmi_drm_infoframe *frame)
+{
+	return hdmi_drm_infoframe_check_only(frame);
+}
+EXPORT_SYMBOL(hdmi_drm_infoframe_check);
+
+/**
+ * hdmi_drm_infoframe_pack_only() - write HDMI DRM infoframe to binary buffer
+ * @frame: HDMI DRM infoframe
+ * @buffer: destination buffer
+ * @size: size of buffer
+ *
+ * Packs the information contained in the @frame structure into a binary
+ * representation that can be written into the corresponding controller
+ * registers. Also computes the checksum as required by section 5.3.5 of
+ * the HDMI 1.4 specification.
+ *
+ * Returns the number of bytes packed into the binary buffer or a negative
+ * error code on failure.
+ */
+ssize_t hdmi_drm_infoframe_pack_only(const struct hdmi_drm_infoframe *frame,
+				     void *buffer, size_t size)
+{
+	u8 *ptr = buffer;
+	size_t length;
+	int i;
+
+	length = HDMI_INFOFRAME_HEADER_SIZE + frame->length;
+
+	if (size < length)
+		return -ENOSPC;
+
+	memset(buffer, 0, size);
+
+	ptr[0] = frame->type;
+	ptr[1] = frame->version;
+	ptr[2] = frame->length;
+	ptr[3] = 0; /* checksum */
+
+	/* start infoframe payload */
+	ptr += HDMI_INFOFRAME_HEADER_SIZE;
+
+	*ptr++ = frame->eotf;
+	*ptr++ = frame->metadata_type;
+
+	for (i = 0; i < 3; i++) {
+		*ptr++ = frame->display_primaries[i].x;
+		*ptr++ = frame->display_primaries[i].x >> 8;
+		*ptr++ = frame->display_primaries[i].y;
+		*ptr++ = frame->display_primaries[i].y >> 8;
+	}
+
+	*ptr++ = frame->white_point.x;
+	*ptr++ = frame->white_point.x >> 8;
+
+	*ptr++ = frame->white_point.y;
+	*ptr++ = frame->white_point.y >> 8;
+
+	*ptr++ = frame->max_display_mastering_luminance;
+	*ptr++ = frame->max_display_mastering_luminance >> 8;
+
+	*ptr++ = frame->min_display_mastering_luminance;
+	*ptr++ = frame->min_display_mastering_luminance >> 8;
+
+	*ptr++ = frame->max_cll;
+	*ptr++ = frame->max_cll >> 8;
+
+	*ptr++ = frame->max_fall;
+	*ptr++ = frame->max_fall >> 8;
+
+	hdmi_infoframe_set_checksum(buffer, length);
+
+	return length;
+}
+EXPORT_SYMBOL(hdmi_drm_infoframe_pack_only);
+
+/**
+ * hdmi_drm_infoframe_pack() - check a HDMI DRM infoframe,
+ *                             and write it to binary buffer
+ * @frame: HDMI DRM infoframe
+ * @buffer: destination buffer
+ * @size: size of buffer
+ *
+ * Validates that the infoframe is consistent and updates derived fields
+ * (eg. length) based on other fields, after which it packs the information
+ * contained in the @frame structure into a binary representation that
+ * can be written into the corresponding controller registers. This function
+ * also computes the checksum as required by section 5.3.5 of the HDMI 1.4
+ * specification.
+ *
+ * Returns the number of bytes packed into the binary buffer or a negative
+ * error code on failure.
+ */
+ssize_t hdmi_drm_infoframe_pack(struct hdmi_drm_infoframe *frame,
+				void *buffer, size_t size)
+{
+	int ret;
+
+	ret = hdmi_drm_infoframe_check(frame);
+	if (ret)
+		return ret;
+
+	return hdmi_drm_infoframe_pack_only(frame, buffer, size);
+}
+EXPORT_SYMBOL(hdmi_drm_infoframe_pack);
+
 /*
  * hdmi_vendor_any_infoframe_check() - check a vendor infoframe
  */
@@ -758,6 +902,10 @@ hdmi_infoframe_pack_only(const union hdmi_infoframe *frame, void *buffer, size_t
 		length = hdmi_avi_infoframe_pack_only(&frame->avi,
 						      buffer, size);
 		break;
+	case HDMI_INFOFRAME_TYPE_DRM:
+		length = hdmi_drm_infoframe_pack_only(&frame->drm,
+						      buffer, size);
+		break;
 	case HDMI_INFOFRAME_TYPE_SPD:
 		length = hdmi_spd_infoframe_pack_only(&frame->spd,
 						      buffer, size);
@@ -806,6 +954,9 @@ hdmi_infoframe_pack(union hdmi_infoframe *frame,
 	case HDMI_INFOFRAME_TYPE_AVI:
 		length = hdmi_avi_infoframe_pack(&frame->avi, buffer, size);
 		break;
+	case HDMI_INFOFRAME_TYPE_DRM:
+		length = hdmi_drm_infoframe_pack(&frame->drm, buffer, size);
+		break;
 	case HDMI_INFOFRAME_TYPE_SPD:
 		length = hdmi_spd_infoframe_pack(&frame->spd, buffer, size);
 		break;
@@ -838,6 +989,8 @@ static const char *hdmi_infoframe_type_get_name(enum hdmi_infoframe_type type)
 		return "Source Product Description (SPD)";
 	case HDMI_INFOFRAME_TYPE_AUDIO:
 		return "Audio";
+	case HDMI_INFOFRAME_TYPE_DRM:
+		return "Dynamic Range and Mastering";
 	}
 	return "Reserved";
 }
@@ -1284,6 +1437,40 @@ static void hdmi_audio_infoframe_log(const char *level,
 			frame->downmix_inhibit ? "Yes" : "No");
 }
 
+/**
+ * hdmi_drm_infoframe_log() - log info of HDMI DRM infoframe
+ * @level: logging level
+ * @dev: device
+ * @frame: HDMI DRM infoframe
+ */
+static void hdmi_drm_infoframe_log(const char *level,
+				   struct device *dev,
+				   const struct hdmi_drm_infoframe *frame)
+{
+	int i;
+
+	hdmi_infoframe_log_header(level, dev,
+				  (struct hdmi_any_infoframe *)frame);
+	hdmi_log("length: %d\n", frame->length);
+	hdmi_log("metadata type: %d\n", frame->metadata_type);
+	hdmi_log("eotf: %d\n", frame->eotf);
+	for (i = 0; i < 3; i++) {
+		hdmi_log("x[%d]: %d\n", i, frame->display_primaries[i].x);
+		hdmi_log("y[%d]: %d\n", i, frame->display_primaries[i].y);
+	}
+
+	hdmi_log("white point x: %d\n", frame->white_point.x);
+	hdmi_log("white point y: %d\n", frame->white_point.y);
+
+	hdmi_log("max_display_mastering_luminance: %d\n",
+		 frame->max_display_mastering_luminance);
+	hdmi_log("min_display_mastering_luminance: %d\n",
+		 frame->min_display_mastering_luminance);
+
+	hdmi_log("max_cll: %d\n", frame->max_cll);
+	hdmi_log("max_fall: %d\n", frame->max_fall);
+}
+
 static const char *
 hdmi_3d_structure_get_name(enum hdmi_3d_structure s3d_struct)
 {
@@ -1372,6 +1559,9 @@ void hdmi_infoframe_log(const char *level,
 	case HDMI_INFOFRAME_TYPE_VENDOR:
 		hdmi_vendor_any_infoframe_log(level, dev, &frame->vendor);
 		break;
+	case HDMI_INFOFRAME_TYPE_DRM:
+		hdmi_drm_infoframe_log(level, dev, &frame->drm);
+		break;
 	}
 }
 EXPORT_SYMBOL(hdmi_infoframe_log);
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 9d3b5b93102c..0e21e91c4314 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -25,6 +25,7 @@
 
 #include <linux/types.h>
 #include <linux/hdmi.h>
+#include <drm/drm_mode.h>
 
 struct drm_device;
 struct i2c_adapter;
@@ -370,6 +371,10 @@ drm_hdmi_avi_infoframe_quant_range(struct hdmi_avi_infoframe *frame,
 				   const struct drm_display_mode *mode,
 				   enum hdmi_quantization_range rgb_quant_range);
 
+int
+drm_hdmi_infoframe_set_hdr_metadata(struct hdmi_drm_infoframe *frame,
+				    const struct drm_connector_state *conn_state);
+
 /**
  * drm_eld_mnl - Get ELD monitor name length in bytes.
  * @eld: pointer to an eld memory structure with mnl set
diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h
index 6780476dcbff..bcf3c6c3499e 100644
--- a/include/linux/hdmi.h
+++ b/include/linux/hdmi.h
@@ -47,6 +47,7 @@ enum hdmi_infoframe_type {
 	HDMI_INFOFRAME_TYPE_AVI = 0x82,
 	HDMI_INFOFRAME_TYPE_SPD = 0x83,
 	HDMI_INFOFRAME_TYPE_AUDIO = 0x84,
+	HDMI_INFOFRAME_TYPE_DRM = 0x87,
 };
 
 #define HDMI_IEEE_OUI 0x000c03
@@ -55,6 +56,7 @@ enum hdmi_infoframe_type {
 #define HDMI_AVI_INFOFRAME_SIZE    13
 #define HDMI_SPD_INFOFRAME_SIZE    25
 #define HDMI_AUDIO_INFOFRAME_SIZE  10
+#define HDMI_DRM_INFOFRAME_SIZE    26
 
 #define HDMI_INFOFRAME_SIZE(type)	\
 	(HDMI_INFOFRAME_HEADER_SIZE + HDMI_ ## type ## _INFOFRAME_SIZE)
@@ -185,12 +187,37 @@ struct hdmi_avi_infoframe {
 	unsigned short right_bar;
 };
 
+/* DRM Infoframe as per CTA 861.G spec */
+struct hdmi_drm_infoframe {
+	enum hdmi_infoframe_type type;
+	unsigned char version;
+	unsigned char length;
+	enum hdmi_eotf eotf;
+	enum hdmi_metadata_type metadata_type;
+	struct {
+		u16 x, y;
+	} display_primaries[3];
+	struct {
+		u16 x, y;
+	} white_point;
+	u16 max_display_mastering_luminance;
+	u16 min_display_mastering_luminance;
+	u16 max_cll;
+	u16 max_fall;
+};
+
 int hdmi_avi_infoframe_init(struct hdmi_avi_infoframe *frame);
 ssize_t hdmi_avi_infoframe_pack(struct hdmi_avi_infoframe *frame, void *buffer,
 				size_t size);
 ssize_t hdmi_avi_infoframe_pack_only(const struct hdmi_avi_infoframe *frame,
 				     void *buffer, size_t size);
 int hdmi_avi_infoframe_check(struct hdmi_avi_infoframe *frame);
+int hdmi_drm_infoframe_init(struct hdmi_drm_infoframe *frame);
+ssize_t hdmi_drm_infoframe_pack(struct hdmi_drm_infoframe *frame, void *buffer,
+				size_t size);
+ssize_t hdmi_drm_infoframe_pack_only(const struct hdmi_drm_infoframe *frame,
+				     void *buffer, size_t size);
+int hdmi_drm_infoframe_check(struct hdmi_drm_infoframe *frame);
 
 enum hdmi_spd_sdi {
 	HDMI_SPD_SDI_UNKNOWN,
@@ -381,6 +408,7 @@ union hdmi_infoframe {
 	struct hdmi_spd_infoframe spd;
 	union hdmi_vendor_any_infoframe vendor;
 	struct hdmi_audio_infoframe audio;
+	struct hdmi_drm_infoframe drm;
 };
 
 ssize_t hdmi_infoframe_pack(union hdmi_infoframe *frame, void *buffer,
-- 
cgit v1.2.3


From b5e3eed1eeb363c148e2935d9d3c12c30a280de6 Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Thu, 16 May 2019 19:40:12 +0530
Subject: drm: Add HLG EOTF
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ADD HLG EOTF to the list of EOTF transfer functions supported.
Hybrid Log-Gamma (HLG) is a high dynamic range (HDR) standard.
HLG defines a nonlinear transfer function in which the lower
half of the signal values use a gamma curve and the upper half
of the signal values use a logarithmic curve.

v2: Rebase

v3: Fixed a warning message

v4: Addressed Shashank's review comments

v5: Addressed Jonas Karlman's review comment and dropped the i915
tag from header.

Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Uma Shankar <uma.shankar@intel.com>
Reviewed-by: Shashank Sharma <shashank.sharma@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-8-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_edid.c | 3 ++-
 include/linux/hdmi.h       | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 73560c9437cd..262510c2a670 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -3854,7 +3854,8 @@ static uint8_t eotf_supported(const u8 *edid_ext)
 	return edid_ext[2] &
 		(BIT(HDMI_EOTF_TRADITIONAL_GAMMA_SDR) |
 		 BIT(HDMI_EOTF_TRADITIONAL_GAMMA_HDR) |
-		 BIT(HDMI_EOTF_SMPTE_ST2084));
+		 BIT(HDMI_EOTF_SMPTE_ST2084) |
+		 BIT(HDMI_EOTF_BT_2100_HLG));
 }
 
 static uint8_t hdr_metadata_type(const u8 *edid_ext)
diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h
index bcf3c6c3499e..ee55ba589cdc 100644
--- a/include/linux/hdmi.h
+++ b/include/linux/hdmi.h
@@ -162,6 +162,7 @@ enum hdmi_eotf {
 	HDMI_EOTF_TRADITIONAL_GAMMA_SDR,
 	HDMI_EOTF_TRADITIONAL_GAMMA_HDR,
 	HDMI_EOTF_SMPTE_ST2084,
+	HDMI_EOTF_BT_2100_HLG,
 };
 
 struct hdmi_avi_infoframe {
-- 
cgit v1.2.3


From 4d432f956d4f3fe5a7082e11ee029d43f6f64457 Mon Sep 17 00:00:00 2001
From: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
Date: Tue, 21 May 2019 15:17:17 +0300
Subject: drm: Rename struct edp_vsc_psr to struct dp_sdp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VSC SDP Payload for PSR is one of data block type of SDP (Secondaray Data
Packet). In order to generalize SDP packet structure name, it renames
struct edp_vsc_psr to struct dp_sdp. And each SDP data blocks have
different usages, each SDP type has different reserved data blocks and
Video_Stream_Configuration Extension VESA SDP might use all of Data Blocks
as Extended INFORFRAME Data Byte. so it makes Data Block variables as
array type. And it adds comments of details of DB of VSC SDP Payload
for Pixel Encoding/Colorimetry Format. This comments follows DP 1.4a spec,
section 2.2.5.7.5, chapter "VSC SDP Payload for Pixel Encoding/Colorimetry
Format".

v7: Addressed review comments from Ville.

v9: Rename a member value name DB to db on struct dp_sdp [Laurent]

Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Acked-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190521121721.32010-3-gwan-gyeong.mun@intel.com
---
 drivers/gpu/drm/bridge/analogix/analogix_dp_core.c | 12 ++++----
 drivers/gpu/drm/bridge/analogix/analogix_dp_core.h |  2 +-
 drivers/gpu/drm/bridge/analogix/analogix_dp_reg.c  | 10 +++----
 drivers/gpu/drm/i915/intel_psr.c                   |  2 +-
 include/drm/drm_dp_helper.h                        | 33 +++++++++++++++-------
 5 files changed, 36 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
index 225f5e5dd69b..257d69b21d99 100644
--- a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
+++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
@@ -115,7 +115,7 @@ EXPORT_SYMBOL_GPL(analogix_dp_psr_enabled);
 
 int analogix_dp_enable_psr(struct analogix_dp_device *dp)
 {
-	struct edp_vsc_psr psr_vsc;
+	struct dp_sdp psr_vsc;
 
 	if (!dp->psr_enable)
 		return 0;
@@ -127,8 +127,8 @@ int analogix_dp_enable_psr(struct analogix_dp_device *dp)
 	psr_vsc.sdp_header.HB2 = 0x2;
 	psr_vsc.sdp_header.HB3 = 0x8;
 
-	psr_vsc.DB0 = 0;
-	psr_vsc.DB1 = EDP_VSC_PSR_STATE_ACTIVE | EDP_VSC_PSR_CRC_VALUES_VALID;
+	psr_vsc.db[0] = 0;
+	psr_vsc.db[1] = EDP_VSC_PSR_STATE_ACTIVE | EDP_VSC_PSR_CRC_VALUES_VALID;
 
 	return analogix_dp_send_psr_spd(dp, &psr_vsc, true);
 }
@@ -136,7 +136,7 @@ EXPORT_SYMBOL_GPL(analogix_dp_enable_psr);
 
 int analogix_dp_disable_psr(struct analogix_dp_device *dp)
 {
-	struct edp_vsc_psr psr_vsc;
+	struct dp_sdp psr_vsc;
 	int ret;
 
 	if (!dp->psr_enable)
@@ -149,8 +149,8 @@ int analogix_dp_disable_psr(struct analogix_dp_device *dp)
 	psr_vsc.sdp_header.HB2 = 0x2;
 	psr_vsc.sdp_header.HB3 = 0x8;
 
-	psr_vsc.DB0 = 0;
-	psr_vsc.DB1 = 0;
+	psr_vsc.db[0] = 0;
+	psr_vsc.db[1] = 0;
 
 	ret = drm_dp_dpcd_writeb(&dp->aux, DP_SET_POWER, DP_SET_POWER_D0);
 	if (ret != 1) {
diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h
index 769255dc6e99..3e5fe90edf71 100644
--- a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h
+++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h
@@ -254,7 +254,7 @@ void analogix_dp_enable_scrambling(struct analogix_dp_device *dp);
 void analogix_dp_disable_scrambling(struct analogix_dp_device *dp);
 void analogix_dp_enable_psr_crc(struct analogix_dp_device *dp);
 int analogix_dp_send_psr_spd(struct analogix_dp_device *dp,
-			     struct edp_vsc_psr *vsc, bool blocking);
+			     struct dp_sdp *vsc, bool blocking);
 ssize_t analogix_dp_transfer(struct analogix_dp_device *dp,
 			     struct drm_dp_aux_msg *msg);
 
diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_reg.c b/drivers/gpu/drm/bridge/analogix/analogix_dp_reg.c
index a5f2763d72e4..cf17e2e21b15 100644
--- a/drivers/gpu/drm/bridge/analogix/analogix_dp_reg.c
+++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_reg.c
@@ -1041,7 +1041,7 @@ static ssize_t analogix_dp_get_psr_status(struct analogix_dp_device *dp)
 }
 
 int analogix_dp_send_psr_spd(struct analogix_dp_device *dp,
-			     struct edp_vsc_psr *vsc, bool blocking)
+			     struct dp_sdp *vsc, bool blocking)
 {
 	unsigned int val;
 	int ret;
@@ -1069,8 +1069,8 @@ int analogix_dp_send_psr_spd(struct analogix_dp_device *dp,
 	writel(0x5D, dp->reg_base + ANALOGIX_DP_SPD_PB3);
 
 	/* configure DB0 / DB1 values */
-	writel(vsc->DB0, dp->reg_base + ANALOGIX_DP_VSC_SHADOW_DB0);
-	writel(vsc->DB1, dp->reg_base + ANALOGIX_DP_VSC_SHADOW_DB1);
+	writel(vsc->db[0], dp->reg_base + ANALOGIX_DP_VSC_SHADOW_DB0);
+	writel(vsc->db[1], dp->reg_base + ANALOGIX_DP_VSC_SHADOW_DB1);
 
 	/* set reuse spd inforframe */
 	val = readl(dp->reg_base + ANALOGIX_DP_VIDEO_CTL_3);
@@ -1092,8 +1092,8 @@ int analogix_dp_send_psr_spd(struct analogix_dp_device *dp,
 
 	ret = readx_poll_timeout(analogix_dp_get_psr_status, dp, psr_status,
 		psr_status >= 0 &&
-		((vsc->DB1 && psr_status == DP_PSR_SINK_ACTIVE_RFB) ||
-		(!vsc->DB1 && psr_status == DP_PSR_SINK_INACTIVE)), 1500,
+		((vsc->db[1] && psr_status == DP_PSR_SINK_ACTIVE_RFB) ||
+		(!vsc->db[1] && psr_status == DP_PSR_SINK_INACTIVE)), 1500,
 		DP_TIMEOUT_PSR_LOOP_MS * 1000);
 	if (ret) {
 		dev_warn(dp->dev, "Failed to apply PSR %d\n", ret);
diff --git a/drivers/gpu/drm/i915/intel_psr.c b/drivers/gpu/drm/i915/intel_psr.c
index 2a547a128a37..01ca502099df 100644
--- a/drivers/gpu/drm/i915/intel_psr.c
+++ b/drivers/gpu/drm/i915/intel_psr.c
@@ -342,7 +342,7 @@ static void intel_psr_setup_vsc(struct intel_dp *intel_dp,
 {
 	struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp);
 	struct drm_i915_private *dev_priv = dp_to_i915(intel_dp);
-	struct edp_vsc_psr psr_vsc;
+	struct dp_sdp psr_vsc;
 
 	if (dev_priv->psr.psr2_enabled) {
 		/* Prepare VSC Header for SU as per EDP 1.4 spec, Table 6.11 */
diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
index 97ce790a5b5a..3fc534ee8174 100644
--- a/include/drm/drm_dp_helper.h
+++ b/include/drm/drm_dp_helper.h
@@ -1083,17 +1083,30 @@ struct dp_sdp_header {
 #define EDP_SDP_HEADER_VALID_PAYLOAD_BYTES	0x1F
 #define DP_SDP_PPS_HEADER_PAYLOAD_BYTES_MINUS_1 0x7F
 
-struct edp_vsc_psr {
+/**
+ * struct dp_sdp - DP secondary data packet
+ * @sdp_header: DP secondary data packet header
+ * @db: DP secondaray data packet data blocks
+ * VSC SDP Payload for PSR
+ * db[0]: Stereo Interface
+ * db[1]: 0 - PSR State; 1 - Update RFB; 2 - CRC Valid
+ * db[2]: CRC value bits 7:0 of the R or Cr component
+ * db[3]: CRC value bits 15:8 of the R or Cr component
+ * db[4]: CRC value bits 7:0 of the G or Y component
+ * db[5]: CRC value bits 15:8 of the G or Y component
+ * db[6]: CRC value bits 7:0 of the B or Cb component
+ * db[7]: CRC value bits 15:8 of the B or Cb component
+ * db[8] - db[31]: Reserved
+ * VSC SDP Payload for Pixel Encoding/Colorimetry Format
+ * db[0] - db[15]: Reserved
+ * db[16]: Pixel Encoding and Colorimetry Formats
+ * db[17]: Dynamic Range and Component Bit Depth
+ * db[18]: Content Type
+ * db[19] - db[31]: Reserved
+ */
+struct dp_sdp {
 	struct dp_sdp_header sdp_header;
-	u8 DB0; /* Stereo Interface */
-	u8 DB1; /* 0 - PSR State; 1 - Update RFB; 2 - CRC Valid */
-	u8 DB2; /* CRC value bits 7:0 of the R or Cr component */
-	u8 DB3; /* CRC value bits 15:8 of the R or Cr component */
-	u8 DB4; /* CRC value bits 7:0 of the G or Y component */
-	u8 DB5; /* CRC value bits 15:8 of the G or Y component */
-	u8 DB6; /* CRC value bits 7:0 of the B or Cb component */
-	u8 DB7; /* CRC value bits 15:8 of the B or Cb component */
-	u8 DB8_31[24]; /* Reserved */
+	u8 db[32];
 } __packed;
 
 #define EDP_VSC_PSR_STATE_ACTIVE	(1<<0)
-- 
cgit v1.2.3


From c08e7e4c8a6f04e01d16117eb4a0077059ec2cd4 Mon Sep 17 00:00:00 2001
From: Guillaume La Roque <glaroque@baylibre.com>
Date: Tue, 14 May 2019 10:26:48 +0200
Subject: pinctrl: generic: add new 'drive-strength-microamp' property support

Add drive-strength-microamp property support to allow drive strength in uA

Signed-off-by: Guillaume La Roque <glaroque@baylibre.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/pinconf-generic.c       | 2 ++
 include/linux/pinctrl/pinconf-generic.h | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c
index b4f7f8a458ea..d0cbdb1ad76a 100644
--- a/drivers/pinctrl/pinconf-generic.c
+++ b/drivers/pinctrl/pinconf-generic.c
@@ -39,6 +39,7 @@ static const struct pin_config_item conf_items[] = {
 	PCONFDUMP(PIN_CONFIG_DRIVE_OPEN_SOURCE, "output drive open source", NULL, false),
 	PCONFDUMP(PIN_CONFIG_DRIVE_PUSH_PULL, "output drive push pull", NULL, false),
 	PCONFDUMP(PIN_CONFIG_DRIVE_STRENGTH, "output drive strength", "mA", true),
+	PCONFDUMP(PIN_CONFIG_DRIVE_STRENGTH_UA, "output drive strength", "uA", true),
 	PCONFDUMP(PIN_CONFIG_INPUT_DEBOUNCE, "input debounce", "usec", true),
 	PCONFDUMP(PIN_CONFIG_INPUT_ENABLE, "input enabled", NULL, false),
 	PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT, "input schmitt trigger", NULL, false),
@@ -167,6 +168,7 @@ static const struct pinconf_generic_params dt_params[] = {
 	{ "drive-open-source", PIN_CONFIG_DRIVE_OPEN_SOURCE, 0 },
 	{ "drive-push-pull", PIN_CONFIG_DRIVE_PUSH_PULL, 0 },
 	{ "drive-strength", PIN_CONFIG_DRIVE_STRENGTH, 0 },
+	{ "drive-strength-microamp", PIN_CONFIG_DRIVE_STRENGTH_UA, 0 },
 	{ "input-debounce", PIN_CONFIG_INPUT_DEBOUNCE, 0 },
 	{ "input-disable", PIN_CONFIG_INPUT_ENABLE, 0 },
 	{ "input-enable", PIN_CONFIG_INPUT_ENABLE, 1 },
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 6c0680641108..72d06d6a3099 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -55,6 +55,8 @@
  *	push-pull mode, the argument is ignored.
  * @PIN_CONFIG_DRIVE_STRENGTH: the pin will sink or source at most the current
  *	passed as argument. The argument is in mA.
+ * @PIN_CONFIG_DRIVE_STRENGTH_UA: the pin will sink or source at most the current
+ *	passed as argument. The argument is in uA.
  * @PIN_CONFIG_INPUT_DEBOUNCE: this will configure the pin to debounce mode,
  *	which means it will wait for signals to settle when reading inputs. The
  *	argument gives the debounce time in usecs. Setting the
@@ -112,6 +114,7 @@ enum pin_config_param {
 	PIN_CONFIG_DRIVE_OPEN_SOURCE,
 	PIN_CONFIG_DRIVE_PUSH_PULL,
 	PIN_CONFIG_DRIVE_STRENGTH,
+	PIN_CONFIG_DRIVE_STRENGTH_UA,
 	PIN_CONFIG_INPUT_DEBOUNCE,
 	PIN_CONFIG_INPUT_ENABLE,
 	PIN_CONFIG_INPUT_SCHMITT,
-- 
cgit v1.2.3


From 2b2ebb9acb89cf968bb08b488362383ae795e6b1 Mon Sep 17 00:00:00 2001
From: Anson Huang <anson.huang@nxp.com>
Date: Wed, 15 May 2019 01:29:53 +0000
Subject: dt-bindings: clock: imx8mm: Add SNVS clock

Add macro for the SNVS clock of the i.MX8MM.

Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
Reviewed-by: Leonard Crestez <leonard.crestez@nxp.com>
Signed-off-by: Shawn Guo <shawnguo@kernel.org>
---
 include/dt-bindings/clock/imx8mm-clock.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/imx8mm-clock.h b/include/dt-bindings/clock/imx8mm-clock.h
index fe47798f95df..83f667368d55 100644
--- a/include/dt-bindings/clock/imx8mm-clock.h
+++ b/include/dt-bindings/clock/imx8mm-clock.h
@@ -245,6 +245,8 @@
 #define IMX8MM_CLK_GPIO4_ROOT			226
 #define IMX8MM_CLK_GPIO5_ROOT			227
 
-#define IMX8MM_CLK_END				228
+#define IMX8MM_CLK_SNVS_ROOT			228
+
+#define IMX8MM_CLK_END				229
 
 #endif
-- 
cgit v1.2.3


From 036f394dd77f8117346874151793ec38967d843f Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@st.com>
Date: Wed, 22 May 2019 17:29:24 +0200
Subject: pinctrl: Enable device link creation for pin control

A pin controller may want to create a link between itself
and its clients to be sure of suspend/resume call ordering.

Introduce link_consumers field in pinctrl_desc structure to let
pinctrl core knows that controller expect to create a link.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@st.com>
[Renamed create_link to link_consumers]
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c          | 11 +++++++++++
 include/linux/pinctrl/pinctrl.h |  5 +++++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index c6ff4d5fa482..d757c51d7114 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -1216,6 +1216,15 @@ struct pinctrl_state *pinctrl_lookup_state(struct pinctrl *p,
 }
 EXPORT_SYMBOL_GPL(pinctrl_lookup_state);
 
+static void pinctrl_link_add(struct pinctrl_dev *pctldev,
+			     struct device *consumer)
+{
+	if (pctldev->desc->link_consumers)
+		device_link_add(consumer, pctldev->dev,
+				DL_FLAG_PM_RUNTIME |
+				DL_FLAG_AUTOREMOVE_CONSUMER);
+}
+
 /**
  * pinctrl_commit_state() - select/activate/program a pinctrl state to HW
  * @p: the pinctrl handle for the device that requests configuration
@@ -1261,6 +1270,8 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state)
 		if (ret < 0) {
 			goto unapply_new_state;
 		}
+
+		pinctrl_link_add(setting->pctldev, p->dev);
 	}
 
 	p->state = state;
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 8f5dbb84547a..2744113f1024 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -125,6 +125,10 @@ struct pinctrl_ops {
  *	the hardware description
  * @custom_conf_items: Information how to print @params in debugfs, must be
  *	the same size as the @custom_params, i.e. @num_custom_params
+ * @link_consumers: If true create a device link between pinctrl and its
+ *	consumers (i.e. the devices requesting pin control states). This is
+ *	sometimes necessary to ascertain the right suspend/resume order for
+ *	example.
  */
 struct pinctrl_desc {
 	const char *name;
@@ -139,6 +143,7 @@ struct pinctrl_desc {
 	const struct pinconf_generic_params *custom_params;
 	const struct pin_config_item *custom_conf_items;
 #endif
+	bool link_consumers;
 };
 
 /* External interface to pin controller */
-- 
cgit v1.2.3


From 87def8d0d5bfc32bf8a221b63addb8a051cbf017 Mon Sep 17 00:00:00 2001
From: Leonard Crestez <leonard.crestez@nxp.com>
Date: Wed, 22 May 2019 09:48:29 +0000
Subject: dt-bindings: clock: imx8m: Add GIC clock

This should be defined in the clock tree so that parents are not
shutdown by accident

Signed-off-by: Leonard Crestez <leonard.crestez@nxp.com>
Reviewed-by: Abel Vesa <abel.vesa@nxp.com>
Signed-off-by: Shawn Guo <shawnguo@kernel.org>
---
 include/dt-bindings/clock/imx8mm-clock.h | 3 ++-
 include/dt-bindings/clock/imx8mq-clock.h | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/imx8mm-clock.h b/include/dt-bindings/clock/imx8mm-clock.h
index 83f667368d55..07e6c686f3ef 100644
--- a/include/dt-bindings/clock/imx8mm-clock.h
+++ b/include/dt-bindings/clock/imx8mm-clock.h
@@ -246,7 +246,8 @@
 #define IMX8MM_CLK_GPIO5_ROOT			227
 
 #define IMX8MM_CLK_SNVS_ROOT			228
+#define IMX8MM_CLK_GIC				229
 
-#define IMX8MM_CLK_END				229
+#define IMX8MM_CLK_END				230
 
 #endif
diff --git a/include/dt-bindings/clock/imx8mq-clock.h b/include/dt-bindings/clock/imx8mq-clock.h
index 0233bb1e6bf8..65463673d25e 100644
--- a/include/dt-bindings/clock/imx8mq-clock.h
+++ b/include/dt-bindings/clock/imx8mq-clock.h
@@ -401,6 +401,7 @@
 #define IMX8MQ_CLK_GPIO5_ROOT			263
 
 #define IMX8MQ_CLK_SNVS_ROOT			264
+#define IMX8MQ_CLK_GIC				265
 
-#define IMX8MQ_CLK_END				265
+#define IMX8MQ_CLK_END				266
 #endif /* __DT_BINDINGS_CLOCK_IMX8MQ_H */
-- 
cgit v1.2.3


From 59fcdce425b7c947ccea03a16e393af9bb4d6262 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@kernel.org>
Date: Thu, 23 May 2019 17:05:59 -0700
Subject: clk: Remove ifdef for COMMON_CLK in clk-provider.h

This ifdef has been there since the beginning of this file, but it
doesn't really seem to serve any purpose besides obfuscating the struct
definitions and #defines here from compilation units that include it.
Let's always expose these function prototypes and struct definitions so
that code can inspect clk providers without needing to have
CONFIG_COMMON_CLK enabled.

Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index bb6118f79784..3bced2ec9f26 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -9,8 +9,6 @@
 #include <linux/of.h>
 #include <linux/of_clk.h>
 
-#ifdef CONFIG_COMMON_CLK
-
 /*
  * flags used across common struct clk.  these flags should only affect the
  * top-level framework.  custom flags for dealing with hardware specifics
@@ -1019,5 +1017,4 @@ static inline int of_clk_detect_critical(struct device_node *np, int index,
 
 void clk_gate_restore_context(struct clk_hw *hw);
 
-#endif /* CONFIG_COMMON_CLK */
 #endif /* CLK_PROVIDER_H */
-- 
cgit v1.2.3


From 30d5a945743cd05ec5c847f2e38c2fbda5e00944 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@kernel.org>
Date: Thu, 23 May 2019 17:11:57 -0700
Subject: clk: Unexport __clk_of_table

This symbol doesn't need to be exported to clk providers anymore.
Originally, it was hidden inside clk.c, but then OMAP needed to get
access to it in commit 819b4861c18d ("CLK: ti: add init support for
clock IP blocks"), but eventually that code also changed in commit
c08ee14cc663 ("clk: ti: change clock init to use generic of_clk_init")
and we were left with this exported. Move this back into clk.c so that
it isn't exposed anymore.

Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk.c            | 1 +
 include/linux/clk-provider.h | 4 ----
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index aa51756fd4d6..b34e84bb8167 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -4038,6 +4038,7 @@ struct of_clk_provider {
 	void *data;
 };
 
+extern struct of_device_id __clk_of_table;
 static const struct of_device_id __clk_of_table_sentinel
 	__used __section(__clk_of_table_end);
 
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 3bced2ec9f26..9ba000e3a50d 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -865,8 +865,6 @@ static inline long divider_ro_round_rate(struct clk_hw *hw, unsigned long rate,
  */
 unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate);
 
-struct of_device_id;
-
 struct clk_onecell_data {
 	struct clk **clks;
 	unsigned int clk_num;
@@ -877,8 +875,6 @@ struct clk_hw_onecell_data {
 	struct clk_hw *hws[];
 };
 
-extern struct of_device_id __clk_of_table;
-
 #define CLK_OF_DECLARE(name, compat, fn) OF_DECLARE_1(clk, name, compat, fn)
 
 /*
-- 
cgit v1.2.3


From ef98682a4e1257ff45405bb1372939e8dfe774bb Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Sat, 11 May 2019 00:15:22 +0530
Subject: dt-bindings: reset: Add devicetree binding for BM1880 reset
 controller

Add devicetree binding for Bitmain BM1880 SoC reset controller.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 .../bindings/reset/bitmain,bm1880-reset.txt        | 18 ++++++++
 include/dt-bindings/reset/bitmain,bm1880-reset.h   | 51 ++++++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt
 create mode 100644 include/dt-bindings/reset/bitmain,bm1880-reset.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt b/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt
new file mode 100644
index 000000000000..a6f8455ae6c4
--- /dev/null
+++ b/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt
@@ -0,0 +1,18 @@
+Bitmain BM1880 SoC Reset Controller
+===================================
+
+Please also refer to reset.txt in this directory for common reset
+controller binding usage.
+
+Required properties:
+- compatible:   Should be "bitmain,bm1880-reset"
+- reg:          Offset and length of reset controller space in SCTRL.
+- #reset-cells: Must be 1.
+
+Example:
+
+        rst: reset-controller@c00 {
+                compatible = "bitmain,bm1880-reset";
+                reg = <0xc00 0x8>;
+                #reset-cells = <1>;
+        };
diff --git a/include/dt-bindings/reset/bitmain,bm1880-reset.h b/include/dt-bindings/reset/bitmain,bm1880-reset.h
new file mode 100644
index 000000000000..4c0de5223773
--- /dev/null
+++ b/include/dt-bindings/reset/bitmain,bm1880-reset.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2018 Bitmain Ltd.
+ * Copyright (c) 2019 Linaro Ltd.
+ */
+
+#ifndef _DT_BINDINGS_BM1880_RESET_H
+#define _DT_BINDINGS_BM1880_RESET_H
+
+#define BM1880_RST_MAIN_AP		0
+#define BM1880_RST_SECOND_AP		1
+#define BM1880_RST_DDR			2
+#define BM1880_RST_VIDEO		3
+#define BM1880_RST_JPEG			4
+#define BM1880_RST_VPP			5
+#define BM1880_RST_GDMA			6
+#define BM1880_RST_AXI_SRAM		7
+#define BM1880_RST_TPU			8
+#define BM1880_RST_USB			9
+#define BM1880_RST_ETH0			10
+#define BM1880_RST_ETH1			11
+#define BM1880_RST_NAND			12
+#define BM1880_RST_EMMC			13
+#define BM1880_RST_SD			14
+#define BM1880_RST_SDMA			15
+#define BM1880_RST_I2S0			16
+#define BM1880_RST_I2S1			17
+#define BM1880_RST_UART0_1_CLK		18
+#define BM1880_RST_UART0_1_ACLK		19
+#define BM1880_RST_UART2_3_CLK		20
+#define BM1880_RST_UART2_3_ACLK		21
+#define BM1880_RST_MINER		22
+#define BM1880_RST_I2C0			23
+#define BM1880_RST_I2C1			24
+#define BM1880_RST_I2C2			25
+#define BM1880_RST_I2C3			26
+#define BM1880_RST_I2C4			27
+#define BM1880_RST_PWM0			28
+#define BM1880_RST_PWM1			29
+#define BM1880_RST_PWM2			30
+#define BM1880_RST_PWM3			31
+#define BM1880_RST_SPI			32
+#define BM1880_RST_GPIO0		33
+#define BM1880_RST_GPIO1		34
+#define BM1880_RST_GPIO2		35
+#define BM1880_RST_EFUSE		36
+#define BM1880_RST_WDT			37
+#define BM1880_RST_AHB_ROM		38
+#define BM1880_RST_SPIC			39
+
+#endif /* _DT_BINDINGS_BM1880_RESET_H */
-- 
cgit v1.2.3


From 88807dc8d573c0f718d0d26f592f212c5a487cf0 Mon Sep 17 00:00:00 2001
From: Oak Zeng <Oak.Zeng@amd.com>
Date: Thu, 4 Apr 2019 15:47:34 -0500
Subject: drm/amdgpu: Remap hdp coherency registers

Remap HDP_MEM_COHERENCY_FLUSH_CNTL and HDP_REG_COHERENCY_FLUSH_CNTL
to an empty page in mmio space. We will later map this page to process
space so application can flush hdp. This can't be done properly at
those registers' original location because it will expose more than
desired registers to process space.

v2: Use explicit register hole location
v3: Moved remapped hdp registers into adev struct
v4: Use more generic name for remapped page
    Expose register offset in kfd_ioctl.h
v5: Move hdp register remap function to nbio ip function
v6: Fixed operator precedence issue and other bugs

Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  7 +++++++
 drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 15 ++++++++++++---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 15 ++++++++++++---
 drivers/gpu/drm/amd/amdgpu/soc15.c     | 11 +++++++++++
 include/uapi/linux/kfd_ioctl.h         |  7 +++++++
 5 files changed, 49 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 14398f55f602..23c3375623d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -639,6 +639,11 @@ struct nbio_hdp_flush_reg {
 	u32 ref_and_mask_sdma1;
 };
 
+struct amdgpu_mmio_remap {
+	u32 reg_offset;
+	resource_size_t bus_addr;
+};
+
 struct amdgpu_nbio_funcs {
 	const struct nbio_hdp_flush_reg *hdp_flush_reg;
 	u32 (*get_hdp_flush_req_offset)(struct amdgpu_device *adev);
@@ -666,6 +671,7 @@ struct amdgpu_nbio_funcs {
 	void (*ih_control)(struct amdgpu_device *adev);
 	void (*init_registers)(struct amdgpu_device *adev);
 	void (*detect_hw_virt)(struct amdgpu_device *adev);
+	void (*remap_hdp_registers)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_df_funcs {
@@ -764,6 +770,7 @@ struct amdgpu_device {
 	void __iomem			*rmmio;
 	/* protects concurrent MM_INDEX/DATA based register access */
 	spinlock_t mmio_idx_lock;
+	struct amdgpu_mmio_remap        rmmio_remap;
 	/* protects concurrent SMC based register access */
 	spinlock_t smc_idx_lock;
 	amdgpu_rreg_t			smc_rreg;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
index 1cdb98ad2db3..73419fa38159 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
@@ -29,9 +29,18 @@
 #include "nbio/nbio_7_0_sh_mask.h"
 #include "nbio/nbio_7_0_smn.h"
 #include "vega10_enum.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 #define smnNBIF_MGCG_CTRL_LCLK	0x1013a05c
 
+static void nbio_v7_0_remap_hdp_registers(struct amdgpu_device *adev)
+{
+	WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL,
+		adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL);
+	WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL,
+		adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL);
+}
+
 static u32 nbio_v7_0_get_rev_id(struct amdgpu_device *adev)
 {
         u32 tmp = RREG32_SOC15(NBIO, 0, mmRCC_DEV0_EPF0_STRAP0);
@@ -55,10 +64,9 @@ static void nbio_v7_0_hdp_flush(struct amdgpu_device *adev,
 				struct amdgpu_ring *ring)
 {
 	if (!ring || !ring->funcs->emit_wreg)
-		WREG32_SOC15_NO_KIQ(NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL, 0);
+		WREG32_NO_KIQ((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0);
 	else
-		amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET(
-			NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL), 0);
+		amdgpu_ring_emit_wreg(ring, (adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0);
 }
 
 static u32 nbio_v7_0_get_memsize(struct amdgpu_device *adev)
@@ -283,4 +291,5 @@ const struct amdgpu_nbio_funcs nbio_v7_0_funcs = {
 	.ih_control = nbio_v7_0_ih_control,
 	.init_registers = nbio_v7_0_init_registers,
 	.detect_hw_virt = nbio_v7_0_detect_hw_virt,
+	.remap_hdp_registers = nbio_v7_0_remap_hdp_registers,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index c69d51598cfe..bfaaa327ae3c 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -27,9 +27,18 @@
 #include "nbio/nbio_7_4_offset.h"
 #include "nbio/nbio_7_4_sh_mask.h"
 #include "nbio/nbio_7_4_0_smn.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 #define smnNBIF_MGCG_CTRL_LCLK	0x1013a21c
 
+static void nbio_v7_4_remap_hdp_registers(struct amdgpu_device *adev)
+{
+	WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL,
+		adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL);
+	WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL,
+		adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL);
+}
+
 static u32 nbio_v7_4_get_rev_id(struct amdgpu_device *adev)
 {
 	u32 tmp = RREG32_SOC15(NBIO, 0, mmRCC_DEV0_EPF0_STRAP0);
@@ -53,10 +62,9 @@ static void nbio_v7_4_hdp_flush(struct amdgpu_device *adev,
 				struct amdgpu_ring *ring)
 {
 	if (!ring || !ring->funcs->emit_wreg)
-		WREG32_SOC15_NO_KIQ(NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL, 0);
+		WREG32_NO_KIQ((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0);
 	else
-		amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET(
-			NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL), 0);
+		amdgpu_ring_emit_wreg(ring, (adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0);
 }
 
 static u32 nbio_v7_4_get_memsize(struct amdgpu_device *adev)
@@ -262,4 +270,5 @@ const struct amdgpu_nbio_funcs nbio_v7_4_funcs = {
 	.ih_control = nbio_v7_4_ih_control,
 	.init_registers = nbio_v7_4_init_registers,
 	.detect_hw_virt = nbio_v7_4_detect_hw_virt,
+	.remap_hdp_registers = nbio_v7_4_remap_hdp_registers,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 4900e4958dec..78bd00a0142f 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -44,6 +44,7 @@
 #include "smuio/smuio_9_0_offset.h"
 #include "smuio/smuio_9_0_sh_mask.h"
 #include "nbio/nbio_7_0_default.h"
+#include "nbio/nbio_7_0_offset.h"
 #include "nbio/nbio_7_0_sh_mask.h"
 #include "nbio/nbio_7_0_smn.h"
 #include "mp/mp_9_0_offset.h"
@@ -64,6 +65,7 @@
 #include "dce_virtual.h"
 #include "mxgpu_ai.h"
 #include "amdgpu_smu.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 #define mmMP0_MISC_CGTT_CTRL0                                                                   0x01b9
 #define mmMP0_MISC_CGTT_CTRL0_BASE_IDX                                                          0
@@ -783,8 +785,11 @@ static const struct amdgpu_asic_funcs vega20_asic_funcs =
 
 static int soc15_common_early_init(void *handle)
 {
+#define MMIO_REG_HOLE_OFFSET (0x80000 - PAGE_SIZE)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	adev->rmmio_remap.reg_offset = MMIO_REG_HOLE_OFFSET;
+	adev->rmmio_remap.bus_addr = adev->rmmio_base + MMIO_REG_HOLE_OFFSET;
 	adev->smc_rreg = NULL;
 	adev->smc_wreg = NULL;
 	adev->pcie_rreg = &soc15_pcie_rreg;
@@ -1014,6 +1019,12 @@ static int soc15_common_hw_init(void *handle)
 	soc15_program_aspm(adev);
 	/* setup nbio registers */
 	adev->nbio_funcs->init_registers(adev);
+	/* remap HDP registers to a hole in mmio space,
+	 * for the purpose of expose those registers
+	 * to process space
+	 */
+	if (adev->nbio_funcs->remap_hdp_registers)
+		adev->nbio_funcs->remap_hdp_registers(adev);
 	/* enable the doorbell aperture */
 	soc15_enable_doorbell_aperture(adev, true);
 	/* HW doorbell routing policy: doorbell writing not
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index dc067ed0b72d..bb1b4280f53d 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -426,6 +426,13 @@ struct kfd_ioctl_import_dmabuf_args {
 	__u32 dmabuf_fd;	/* to KFD */
 };
 
+/* Register offset inside the remapped mmio page
+ */
+enum kfd_mmio_remap {
+	KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL = 0,
+	KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL = 4,
+};
+
 #define AMDKFD_IOCTL_BASE 'K'
 #define AMDKFD_IO(nr)			_IO(AMDKFD_IOCTL_BASE, nr)
 #define AMDKFD_IOR(nr, type)		_IOR(AMDKFD_IOCTL_BASE, nr, type)
-- 
cgit v1.2.3


From d8e408a82704c86ba87c3d58cfe69dcdb758aa07 Mon Sep 17 00:00:00 2001
From: Oak Zeng <Oak.Zeng@amd.com>
Date: Thu, 11 Apr 2019 14:43:39 -0500
Subject: drm/amdkfd: Expose HDP registers to user space

Introduce a new memory type (KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) and
expose mmio page of HDP registers to user space through this new
memory type.

v2: moved remapped hdp regs to adev struct
v3: rename the new memory type to ALLOC_MEM_FLAGS_MMIO_REMAP
v4: use more generic function name
v5: Fail remapped mmio allocation for asics before gfx9

Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c       | 7 +++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h       | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 7 ++++---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         | 6 ++++++
 drivers/gpu/drm/amd/include/kgd_kfd_interface.h  | 1 +
 include/uapi/linux/kfd_ioctl.h                   | 1 +
 6 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index aeead072fa79..401edb605fdd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -519,6 +519,13 @@ uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
 	return adev->gmc.xgmi.hive_id;
 }
 
+uint64_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct kgd_dev *kgd)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+
+	return adev->rmmio_remap.bus_addr;
+}
+
 int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
 				uint32_t vmid, uint64_t gpu_addr,
 				uint32_t *ib_cmd, uint32_t ib_len)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4e37fa7e85b1..ea1f141db3ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -169,6 +169,7 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
 				  uint32_t *flags);
 uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
 uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd);
+uint64_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct kgd_dev *kgd);
 
 #define read_user_wptr(mmptr, wptr, dst)				\
 	({								\
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index a6e5184d436c..00e013581a70 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1109,7 +1109,8 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
 		if (!offset || !*offset)
 			return -EINVAL;
 		user_addr = *offset;
-	} else if (flags & ALLOC_MEM_FLAGS_DOORBELL) {
+	} else if (flags & (ALLOC_MEM_FLAGS_DOORBELL |
+			ALLOC_MEM_FLAGS_MMIO_REMAP)) {
 		domain = AMDGPU_GEM_DOMAIN_GTT;
 		alloc_domain = AMDGPU_GEM_DOMAIN_CPU;
 		bo_type = ttm_bo_type_sg;
@@ -1294,8 +1295,8 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
 	/* Free the sync object */
 	amdgpu_sync_free(&mem->sync);
 
-	/* If the SG is not NULL, it's one we created for a doorbell
-	 * BO. We need to free it.
+	/* If the SG is not NULL, it's one we created for a doorbell or mmio
+	 * remap BO. We need to free it.
 	 */
 	if (mem->bo->tbo.sg) {
 		sg_free_table(mem->bo->tbo.sg);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 083bd8114db1..d795e5018270 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1272,6 +1272,12 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
 		if (args->size != kfd_doorbell_process_slice(dev))
 			return -EINVAL;
 		offset = kfd_get_process_doorbells(dev, p);
+	} else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+		if (args->size != PAGE_SIZE)
+			return -EINVAL;
+		offset = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
+		if (!offset)
+			return -ENOMEM;
 	}
 
 	mutex_lock(&p->mutex);
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index b897aca9b4c9..98b9533e672b 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -174,6 +174,7 @@ struct tile_config {
 #define ALLOC_MEM_FLAGS_GTT		(1 << 1)
 #define ALLOC_MEM_FLAGS_USERPTR		(1 << 2)
 #define ALLOC_MEM_FLAGS_DOORBELL	(1 << 3)
+#define ALLOC_MEM_FLAGS_MMIO_REMAP	(1 << 4)
 
 /*
  * Allocation flags attributes/access options.
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index bb1b4280f53d..1e7d5f3376b0 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -338,6 +338,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_GTT		(1 << 1)
 #define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR		(1 << 2)
 #define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL	(1 << 3)
+#define KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP	(1 << 4)
 /* Allocation flags: attributes/access options */
 #define KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE	(1 << 31)
 #define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE	(1 << 30)
-- 
cgit v1.2.3


From 1b4670f6983156526c286723465fdf805070b45d Mon Sep 17 00:00:00 2001
From: Oak Zeng <Oak.Zeng@amd.com>
Date: Thu, 7 Feb 2019 14:02:27 -0600
Subject: drm/amdkfd: Introduce XGMI SDMA queue type

Existing QUEUE_TYPE_SDMA means PCIe optimized SDMA queues.
Introduce a new QUEUE_TYPE_SDMA_XGMI, which is optimized
for non-PCIe transfer such as XGMI.

Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c           |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            |  15 +++
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 123 +++++++++++++++------
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h  |   3 +
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c   |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c   |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c    |   3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |   4 +-
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  10 +-
 include/uapi/linux/kfd_ioctl.h                     |   7 +-
 10 files changed, 132 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 3ccaa38779ea..38ae53fe8182 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -213,6 +213,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 		q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
 	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA)
 		q_properties->type = KFD_QUEUE_TYPE_SDMA;
+	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
+		q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
 	else
 		return -ENOTSUPP;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8202a5db3a35..1368b41cb92b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -54,6 +54,7 @@ static const struct kfd_device_info kaveri_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -71,6 +72,7 @@ static const struct kfd_device_info carrizo_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -87,6 +89,7 @@ static const struct kfd_device_info raven_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 1,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 #endif
@@ -105,6 +108,7 @@ static const struct kfd_device_info hawaii_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -121,6 +125,7 @@ static const struct kfd_device_info tonga_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -137,6 +142,7 @@ static const struct kfd_device_info fiji_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -153,6 +159,7 @@ static const struct kfd_device_info fiji_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -170,6 +177,7 @@ static const struct kfd_device_info polaris10_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -186,6 +194,7 @@ static const struct kfd_device_info polaris10_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -202,6 +211,7 @@ static const struct kfd_device_info polaris11_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -218,6 +228,7 @@ static const struct kfd_device_info polaris12_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -234,6 +245,7 @@ static const struct kfd_device_info vega10_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -250,6 +262,7 @@ static const struct kfd_device_info vega10_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -266,6 +279,7 @@ static const struct kfd_device_info vega12_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
@@ -282,6 +296,7 @@ static const struct kfd_device_info vega20_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 8,
 };
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index d41045d3fc3a..1562590d837e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -60,14 +60,14 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd);
 
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
-				unsigned int sdma_queue_id);
+				struct queue *q);
 
 static void kfd_process_hw_exception(struct work_struct *work);
 
 static inline
 enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
 {
-	if (type == KFD_QUEUE_TYPE_SDMA)
+	if (type == KFD_QUEUE_TYPE_SDMA || type == KFD_QUEUE_TYPE_SDMA_XGMI)
 		return KFD_MQD_TYPE_SDMA;
 	return KFD_MQD_TYPE_CP;
 }
@@ -107,12 +107,23 @@ static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
 	return dqm->dev->device_info->num_sdma_engines;
 }
 
+static unsigned int get_num_xgmi_sdma_engines(struct device_queue_manager *dqm)
+{
+	return dqm->dev->device_info->num_xgmi_sdma_engines;
+}
+
 unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
 {
 	return dqm->dev->device_info->num_sdma_engines
 			* dqm->dev->device_info->num_sdma_queues_per_engine;
 }
 
+unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm)
+{
+	return dqm->dev->device_info->num_xgmi_sdma_engines
+			* dqm->dev->device_info->num_sdma_queues_per_engine;
+}
+
 void program_sh_mem_settings(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
@@ -133,7 +144,8 @@ static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q)
 		 * preserve the user mode ABI.
 		 */
 		q->doorbell_id = q->properties.queue_id;
-	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+			q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		/* For SDMA queues on SOC15 with 8-byte doorbell, use static
 		 * doorbell assignments based on the engine and queue id.
 		 * The doobell index distance between RLC (2*i) and (2*i+1)
@@ -174,7 +186,8 @@ static void deallocate_doorbell(struct qcm_process_device *qpd,
 	struct kfd_dev *dev = qpd->dqm->dev;
 
 	if (!KFD_IS_SOC15(dev->device_info->asic_family) ||
-	    q->properties.type == KFD_QUEUE_TYPE_SDMA)
+	    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+	    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
 		return;
 
 	old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap);
@@ -289,7 +302,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 
 	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
 		retval = create_compute_queue_nocpsch(dqm, q, qpd);
-	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
+	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+			q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
 		retval = create_sdma_queue_nocpsch(dqm, q, qpd);
 	else
 		retval = -EINVAL;
@@ -307,6 +321,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
 		dqm->sdma_queue_count++;
+	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+		dqm->xgmi_sdma_queue_count++;
 
 	/*
 	 * Unconditionally increment this counter, regardless of the queue's
@@ -430,7 +446,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 		deallocate_hqd(dqm, q);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		dqm->sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q->sdma_id);
+		deallocate_sdma_queue(dqm, q);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		dqm->xgmi_sdma_queue_count--;
+		deallocate_sdma_queue(dqm, q);
 	} else {
 		pr_debug("q->properties.type %d is invalid\n",
 				q->properties.type);
@@ -521,7 +540,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 		}
 	} else if (prev_active &&
 		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
-		    q->properties.type == KFD_QUEUE_TYPE_SDMA)) {
+		    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
 		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
 				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
@@ -548,7 +568,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 		retval = map_queues_cpsch(dqm);
 	else if (q->properties.is_active &&
 		 (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
-		  q->properties.type == KFD_QUEUE_TYPE_SDMA)) {
+		  q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		  q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
 		if (WARN(q->process->mm != current->mm,
 			 "should only run in user thread"))
 			retval = -EFAULT;
@@ -840,6 +861,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
 	INIT_LIST_HEAD(&dqm->queues);
 	dqm->queue_count = dqm->next_pipe_to_allocate = 0;
 	dqm->sdma_queue_count = 0;
+	dqm->xgmi_sdma_queue_count = 0;
 
 	for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) {
 		int pipe_offset = pipe * get_queues_per_pipe(dqm);
@@ -852,6 +874,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
 
 	dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
 	dqm->sdma_bitmap = (1ULL << get_num_sdma_queues(dqm)) - 1;
+	dqm->xgmi_sdma_bitmap = (1ULL << get_num_xgmi_sdma_queues(dqm)) - 1;
 
 	return 0;
 }
@@ -886,17 +909,34 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 {
 	int bit;
 
-	if (dqm->sdma_bitmap == 0)
-		return -ENOMEM;
-
-	bit = __ffs64(dqm->sdma_bitmap);
-	dqm->sdma_bitmap &= ~(1ULL << bit);
-	q->sdma_id = bit;
-
-	q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm);
-	q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm);
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+		if (dqm->sdma_bitmap == 0)
+			return -ENOMEM;
+		bit = __ffs64(dqm->sdma_bitmap);
+		dqm->sdma_bitmap &= ~(1ULL << bit);
+		q->sdma_id = bit;
+		q->properties.sdma_engine_id = q->sdma_id %
+				get_num_sdma_engines(dqm);
+		q->properties.sdma_queue_id = q->sdma_id /
+				get_num_sdma_engines(dqm);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		if (dqm->xgmi_sdma_bitmap == 0)
+			return -ENOMEM;
+		bit = __ffs64(dqm->xgmi_sdma_bitmap);
+		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
+		q->sdma_id = bit;
+		/* sdma_engine_id is sdma id including
+		 * both PCIe-optimized SDMAs and XGMI-
+		 * optimized SDMAs. The calculation below
+		 * assumes the first N engines are always
+		 * PCIe-optimized ones
+		 */
+		q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
+				q->sdma_id % get_num_xgmi_sdma_engines(dqm);
+		q->properties.sdma_queue_id = q->sdma_id /
+				get_num_xgmi_sdma_engines(dqm);
+	}
 
-	pr_debug("SDMA id is:    %d\n", q->sdma_id);
 	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
 	pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
 
@@ -904,11 +944,17 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 }
 
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
-				unsigned int sdma_id)
+				struct queue *q)
 {
-	if (sdma_id >= get_num_sdma_queues(dqm))
-		return;
-	dqm->sdma_bitmap |= (1ULL << sdma_id);
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+		if (q->sdma_id >= get_num_sdma_queues(dqm))
+			return;
+		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
+			return;
+		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
+	}
 }
 
 static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
@@ -946,7 +992,7 @@ out_uninit_mqd:
 out_deallocate_doorbell:
 	deallocate_doorbell(qpd, q);
 out_deallocate_sdma_queue:
-	deallocate_sdma_queue(dqm, q->sdma_id);
+	deallocate_sdma_queue(dqm, q);
 
 	return retval;
 }
@@ -1004,8 +1050,10 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 	INIT_LIST_HEAD(&dqm->queues);
 	dqm->queue_count = dqm->processes_count = 0;
 	dqm->sdma_queue_count = 0;
+	dqm->xgmi_sdma_queue_count = 0;
 	dqm->active_runlist = false;
 	dqm->sdma_bitmap = (1ULL << get_num_sdma_queues(dqm)) - 1;
+	dqm->xgmi_sdma_bitmap = (1ULL << get_num_xgmi_sdma_queues(dqm)) - 1;
 
 	INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
 
@@ -1127,7 +1175,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 		goto out;
 	}
 
-	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		retval = allocate_sdma_queue(dqm, q);
 		if (retval)
 			goto out;
@@ -1167,6 +1216,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
 		dqm->sdma_queue_count++;
+	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+		dqm->xgmi_sdma_queue_count++;
 	/*
 	 * Unconditionally increment this counter, regardless of the queue's
 	 * type or whether the queue is active.
@@ -1182,8 +1233,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 out_deallocate_doorbell:
 	deallocate_doorbell(qpd, q);
 out_deallocate_sdma_queue:
-	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
-		deallocate_sdma_queue(dqm, q->sdma_id);
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+		deallocate_sdma_queue(dqm, q);
 out:
 	return retval;
 }
@@ -1216,7 +1268,8 @@ static int unmap_sdma_queues(struct device_queue_manager *dqm)
 {
 	int i, retval = 0;
 
-	for (i = 0; i < dqm->dev->device_info->num_sdma_engines; i++) {
+	for (i = 0; i < dqm->dev->device_info->num_sdma_engines +
+		dqm->dev->device_info->num_xgmi_sdma_engines; i++) {
 		retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
 			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, i);
 		if (retval)
@@ -1258,10 +1311,10 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	if (!dqm->active_runlist)
 		return retval;
 
-	pr_debug("Before destroying queues, sdma queue count is : %u\n",
-		dqm->sdma_queue_count);
+	pr_debug("Before destroying queues, sdma queue count is : %u, xgmi sdma queue count is : %u\n",
+		dqm->sdma_queue_count, dqm->xgmi_sdma_queue_count);
 
-	if (dqm->sdma_queue_count > 0)
+	if (dqm->sdma_queue_count > 0 || dqm->xgmi_sdma_queue_count)
 		unmap_sdma_queues(dqm);
 
 	retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE,
@@ -1333,7 +1386,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		dqm->sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q->sdma_id);
+		deallocate_sdma_queue(dqm, q);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		dqm->xgmi_sdma_queue_count--;
+		deallocate_sdma_queue(dqm, q);
 	}
 
 	list_del(&q->list);
@@ -1550,7 +1606,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	list_for_each_entry(q, &qpd->queues_list, list) {
 		if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 			dqm->sdma_queue_count--;
-			deallocate_sdma_queue(dqm, q->sdma_id);
+			deallocate_sdma_queue(dqm, q);
+		} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+			dqm->xgmi_sdma_queue_count--;
+			deallocate_sdma_queue(dqm, q);
 		}
 
 		if (q->properties.is_active)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 3742fd340ec3..88b4c007696e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -181,10 +181,12 @@ struct device_queue_manager {
 	unsigned int		processes_count;
 	unsigned int		queue_count;
 	unsigned int		sdma_queue_count;
+	unsigned int		xgmi_sdma_queue_count;
 	unsigned int		total_queue_count;
 	unsigned int		next_pipe_to_allocate;
 	unsigned int		*allocated_queues;
 	uint64_t		sdma_bitmap;
+	uint64_t		xgmi_sdma_bitmap;
 	unsigned int		vmid_bitmap;
 	uint64_t		pipelines_addr;
 	struct kfd_mem_obj	*pipeline_mem;
@@ -216,6 +218,7 @@ unsigned int get_queues_num(struct device_queue_manager *dqm);
 unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
 unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
 unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
+unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
 
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
index 33830b1a5a54..604570bea6bd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
@@ -175,6 +175,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 			queue_type__mes_map_queues__debug_interface_queue_vi;
 		break;
 	case KFD_QUEUE_TYPE_SDMA:
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
 		packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
 				engine_sel__mes_map_queues__sdma0_vi;
 		use_static = false; /* no static queues under SDMA */
@@ -221,6 +222,7 @@ static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 			engine_sel__mes_unmap_queues__compute;
 		break;
 	case KFD_QUEUE_TYPE_SDMA:
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
 		packet->bitfields2.engine_sel =
 			engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
 		break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
index bf20c6d32ef3..3cdb19826927 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
@@ -212,6 +212,7 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
 			queue_type__mes_map_queues__debug_interface_queue_vi;
 		break;
 	case KFD_QUEUE_TYPE_SDMA:
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
 		packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
 				engine_sel__mes_map_queues__sdma0_vi;
 		use_static = false; /* no static queues under SDMA */
@@ -258,6 +259,7 @@ static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
 			engine_sel__mes_unmap_queues__compute;
 		break;
 	case KFD_QUEUE_TYPE_SDMA:
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
 		packet->bitfields2.engine_sel =
 			engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
 		break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 045a229436a0..077c47fd4fee 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -48,7 +48,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
 
 	process_count = pm->dqm->processes_count;
 	queue_count = pm->dqm->queue_count;
-	compute_queue_count = queue_count - pm->dqm->sdma_queue_count;
+	compute_queue_count = queue_count - pm->dqm->sdma_queue_count -
+				pm->dqm->xgmi_sdma_queue_count;
 
 	/* check if there is over subscription
 	 * Note: the arbitration between the number of VMIDs and
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 10bd1abe1646..8f02d7817162 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -188,6 +188,7 @@ struct kfd_device_info {
 	bool needs_iommu_device;
 	bool needs_pci_atomics;
 	unsigned int num_sdma_engines;
+	unsigned int num_xgmi_sdma_engines;
 	unsigned int num_sdma_queues_per_engine;
 };
 
@@ -329,7 +330,8 @@ enum kfd_queue_type  {
 	KFD_QUEUE_TYPE_COMPUTE,
 	KFD_QUEUE_TYPE_SDMA,
 	KFD_QUEUE_TYPE_HIQ,
-	KFD_QUEUE_TYPE_DIQ
+	KFD_QUEUE_TYPE_DIQ,
+	KFD_QUEUE_TYPE_SDMA_XGMI
 };
 
 enum kfd_queue_format {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index f18d9cdf9aac..e652e25ede75 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -186,8 +186,13 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 
 	switch (type) {
 	case KFD_QUEUE_TYPE_SDMA:
-		if (dev->dqm->queue_count >= get_num_sdma_queues(dev->dqm)) {
-			pr_err("Over-subscription is not allowed for SDMA.\n");
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
+		if ((type == KFD_QUEUE_TYPE_SDMA && dev->dqm->sdma_queue_count
+			>= get_num_sdma_queues(dev->dqm)) ||
+			(type == KFD_QUEUE_TYPE_SDMA_XGMI &&
+			dev->dqm->xgmi_sdma_queue_count
+			>= get_num_xgmi_sdma_queues(dev->dqm))) {
+			pr_debug("Over-subscription is not allowed for SDMA.\n");
 			retval = -EPERM;
 			goto err_create_queue;
 		}
@@ -446,6 +451,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data)
 			q = pqn->q;
 			switch (q->properties.type) {
 			case KFD_QUEUE_TYPE_SDMA:
+			case KFD_QUEUE_TYPE_SDMA_XGMI:
 				seq_printf(m, "  SDMA queue on device %x\n",
 					   q->device->id);
 				mqd_type = KFD_MQD_TYPE_SDMA;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 1e7d5f3376b0..20917c59f39c 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -35,9 +35,10 @@ struct kfd_ioctl_get_version_args {
 };
 
 /* For kfd_ioctl_create_queue_args.queue_type. */
-#define KFD_IOC_QUEUE_TYPE_COMPUTE	0
-#define KFD_IOC_QUEUE_TYPE_SDMA		1
-#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL	2
+#define KFD_IOC_QUEUE_TYPE_COMPUTE		0x0
+#define KFD_IOC_QUEUE_TYPE_SDMA			0x1
+#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL		0x2
+#define KFD_IOC_QUEUE_TYPE_SDMA_XGMI		0x3
 
 #define KFD_MAX_QUEUE_PERCENTAGE	100
 #define KFD_MAX_QUEUE_PRIORITY		15
-- 
cgit v1.2.3


From 1f58bb18f6f28d1df0b7144d90bc90ee5672416d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 May 2019 13:44:57 +0100
Subject: mount_pseudo(): drop 'name' argument, switch to d_make_root()

Once upon a time we used to set ->d_name of e.g. pipefs root
so that d_path() on pipes would work.  These days it's
completely pointless - dentries of pipes are not even connected
to pipefs root.  However, mount_pseudo() had set the root
dentry name (passed as the second argument) and callers
kept inventing names to pass to it.  Including those that
didn't *have* any non-root dentries to start with...

All of that had been pointless for about 8 years now; it's
time to get rid of that cargo-culting...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/kernel/perfmon.c      |  2 +-
 drivers/dax/super.c             |  2 +-
 drivers/gpu/drm/drm_drv.c       |  6 +-----
 drivers/misc/cxl/api.c          |  3 +--
 drivers/scsi/cxlflash/ocxl_hw.c |  3 +--
 drivers/virtio/virtio_balloon.c |  3 +--
 fs/aio.c                        |  3 +--
 fs/anon_inodes.c                |  4 ++--
 fs/block_dev.c                  |  2 +-
 fs/btrfs/tests/btrfs-tests.c    |  2 +-
 fs/libfs.c                      | 12 +++---------
 fs/nsfs.c                       |  2 +-
 fs/pipe.c                       |  2 +-
 include/linux/fs.h              |  6 +++---
 mm/z3fold.c                     |  2 +-
 mm/zsmalloc.c                   |  2 +-
 net/socket.c                    |  2 +-
 17 files changed, 22 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 7a969f4c3534..a30da6f2c28e 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -602,7 +602,7 @@ static const struct dentry_operations pfmfs_dentry_operations;
 static struct dentry *
 pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "pfm:", NULL, &pfmfs_dentry_operations,
+	return mount_pseudo(fs_type, NULL, &pfmfs_dentry_operations,
 			PFMFS_MAGIC);
 }
 
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 35f051efaf35..f83814eea5ad 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -440,7 +440,7 @@ static const struct super_operations dax_sops = {
 static struct dentry *dax_mount(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
+	return mount_pseudo(fs_type, &dax_sops, NULL, DAXFS_MAGIC);
 }
 
 static struct file_system_type dax_fs_type = {
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 8b44ac9a92ae..48365c62a190 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -535,11 +535,7 @@ static struct vfsmount *drm_fs_mnt;
 static struct dentry *drm_fs_mount(struct file_system_type *fs_type, int flags,
 				   const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type,
-			    "drm:",
-			    NULL,
-			    NULL,
-			    0x010203ff);
+	return mount_pseudo(fs_type, NULL, NULL, 0x010203ff);
 }
 
 static struct file_system_type drm_fs_type = {
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index a59c7af79873..1f2b0535a8cf 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -40,8 +40,7 @@ static struct vfsmount *cxl_vfs_mount;
 static struct dentry *cxl_fs_mount(struct file_system_type *fs_type, int flags,
 				const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "cxl:", NULL, NULL,
-			CXL_PSEUDO_FS_MAGIC);
+	return mount_pseudo(fs_type, NULL, NULL, CXL_PSEUDO_FS_MAGIC);
 }
 
 static struct file_system_type cxl_fs_type = {
diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c
index 31cfdf2c8c30..38e1fbd2b406 100644
--- a/drivers/scsi/cxlflash/ocxl_hw.c
+++ b/drivers/scsi/cxlflash/ocxl_hw.c
@@ -48,8 +48,7 @@ static struct dentry *ocxlflash_fs_mount(struct file_system_type *fs_type,
 					 int flags, const char *dev_name,
 					 void *data)
 {
-	return mount_pseudo(fs_type, "ocxlflash:", NULL, NULL,
-			    OCXLFLASH_FS_MAGIC);
+	return mount_pseudo(fs_type, NULL, NULL, OCXLFLASH_FS_MAGIC);
 }
 
 static struct file_system_type ocxlflash_fs_type = {
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 554d1a98d193..62bafc4f2662 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -761,8 +761,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
 static struct dentry *balloon_mount(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "balloon-kvm:", NULL, NULL,
-				BALLOON_KVM_MAGIC);
+	return mount_pseudo(fs_type, NULL, NULL, BALLOON_KVM_MAGIC);
 }
 
 static struct file_system_type balloon_fs = {
diff --git a/fs/aio.c b/fs/aio.c
index 3490d1fa0e16..09bc35fa6810 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -252,8 +252,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 static struct dentry *aio_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
-	struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, NULL,
-					   AIO_RING_MAGIC);
+	struct dentry *root = mount_pseudo(fs_type, NULL, NULL, AIO_RING_MAGIC);
 
 	if (!IS_ERR(root))
 		root->d_sb->s_iflags |= SB_I_NOEXEC;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 91262c34b797..644d0837aafe 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -41,8 +41,8 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
 static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "anon_inode:", NULL,
-			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
+	return mount_pseudo(fs_type, NULL, &anon_inodefs_dentry_operations,
+			    ANON_INODE_FS_MAGIC);
 }
 
 static struct file_system_type anon_inode_fs_type = {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0f7552a87d54..3143da7b0998 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -837,7 +837,7 @@ static struct dentry *bd_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
 	struct dentry *dent;
-	dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
+	dent = mount_pseudo(fs_type, &bdev_sops, NULL, BDEVFS_MAGIC);
 	if (!IS_ERR(dent))
 		dent->d_sb->s_iflags |= SB_I_CGROUPWB;
 	return dent;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 9238fd4f1734..6da54323eaf8 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -36,7 +36,7 @@ static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
 				       int flags, const char *dev_name,
 				       void *data)
 {
-	return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
+	return mount_pseudo(fs_type, &btrfs_test_super_ops,
 			    NULL, BTRFS_TEST_MAGIC);
 }
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 4b59b1816efb..030e545f586e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -239,14 +239,12 @@ static const struct super_operations simple_super_operations = {
  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
  * will never be mountable)
  */
-struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
+struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type,
 	const struct super_operations *ops, const struct xattr_handler **xattr,
 	const struct dentry_operations *dops, unsigned long magic)
 {
 	struct super_block *s;
-	struct dentry *dentry;
 	struct inode *root;
-	struct qstr d_name = QSTR_INIT(name, strlen(name));
 
 	s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER,
 			&init_user_ns, NULL);
@@ -271,13 +269,9 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
 	root->i_ino = 1;
 	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
 	root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
-	dentry = __d_alloc(s, &d_name);
-	if (!dentry) {
-		iput(root);
+	s->s_root = d_make_root(root);
+	if (!s->s_root)
 		goto Enomem;
-	}
-	d_instantiate(dentry, root);
-	s->s_root = dentry;
 	s->s_d_op = dops;
 	s->s_flags |= SB_ACTIVE;
 	return dget(s->s_root);
diff --git a/fs/nsfs.c b/fs/nsfs.c
index e3bf08c5af41..b3c49ddc0f85 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -261,7 +261,7 @@ static const struct super_operations nsfs_ops = {
 static struct dentry *nsfs_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
+	return mount_pseudo(fs_type, &nsfs_ops,
 			&ns_dentry_operations, NSFS_MAGIC);
 }
 static struct file_system_type nsfs = {
diff --git a/fs/pipe.c b/fs/pipe.c
index 41065901106b..99a023730e6f 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1185,7 +1185,7 @@ static const struct super_operations pipefs_ops = {
 static struct dentry *pipefs_mount(struct file_system_type *fs_type,
 			 int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
+	return mount_pseudo(fs_type, &pipefs_ops,
 			&pipefs_dentry_operations, PIPEFS_MAGIC);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7fdfe93e25d..b06251dd429f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2257,18 +2257,18 @@ struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
 			int flags, void *data);
-extern struct dentry *mount_pseudo_xattr(struct file_system_type *, char *,
+extern struct dentry *mount_pseudo_xattr(struct file_system_type *,
 					 const struct super_operations *ops,
 					 const struct xattr_handler **xattr,
 					 const struct dentry_operations *dops,
 					 unsigned long);
 
 static inline struct dentry *
-mount_pseudo(struct file_system_type *fs_type, char *name,
+mount_pseudo(struct file_system_type *fs_type,
 	     const struct super_operations *ops,
 	     const struct dentry_operations *dops, unsigned long magic)
 {
-	return mount_pseudo_xattr(fs_type, name, ops, NULL, dops, magic);
+	return mount_pseudo_xattr(fs_type, ops, NULL, dops, magic);
 }
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 0b14daf930a8..abeb5bcbea57 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -242,7 +242,7 @@ static inline void free_handle(unsigned long handle)
 static struct dentry *z3fold_do_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "z3fold:", NULL, NULL, 0x33);
+	return mount_pseudo(fs_type, NULL, NULL, 0x33);
 }
 
 static struct file_system_type z3fold_fs = {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d9f831f63625..ef230be8c03e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1817,7 +1817,7 @@ static void lock_zspage(struct zspage *zspage)
 static struct dentry *zs_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "zsmalloc:", NULL, NULL, ZSMALLOC_MAGIC);
+	return mount_pseudo(fs_type, NULL, NULL, ZSMALLOC_MAGIC);
 }
 
 static struct file_system_type zsmalloc_fs = {
diff --git a/net/socket.c b/net/socket.c
index 472fbefa5d9b..c86679584eed 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -362,7 +362,7 @@ static const struct xattr_handler *sockfs_xattr_handlers[] = {
 static struct dentry *sockfs_mount(struct file_system_type *fs_type,
 			 int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo_xattr(fs_type, "socket:", &sockfs_ops,
+	return mount_pseudo_xattr(fs_type, &sockfs_ops,
 				  sockfs_xattr_handlers,
 				  &sockfs_dentry_operations, SOCKFS_MAGIC);
 }
-- 
cgit v1.2.3


From bb7b6b2bbdb827e68cd506c8f5e3ba13215cccb2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Mar 2019 16:38:28 +0000
Subject: vfs: Kill mount_ns()

Kill mount_ns() as it has been replaced by vfs_get_super() in the new mount
API.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c         | 38 --------------------------------------
 include/linux/fs.h |  3 ---
 2 files changed, 41 deletions(-)

(limited to 'include')

diff --git a/fs/super.c b/fs/super.c
index 3ba91d70c2a8..6919f5c728f0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1143,44 +1143,6 @@ void kill_litter_super(struct super_block *sb)
 }
 EXPORT_SYMBOL(kill_litter_super);
 
-static int ns_test_super(struct super_block *sb, void *data)
-{
-	return sb->s_fs_info == data;
-}
-
-static int ns_set_super(struct super_block *sb, void *data)
-{
-	sb->s_fs_info = data;
-	return set_anon_super(sb, NULL);
-}
-
-struct dentry *mount_ns(struct file_system_type *fs_type,
-	int flags, void *data, void *ns, struct user_namespace *user_ns,
-	int (*fill_super)(struct super_block *, void *, int))
-{
-	struct super_block *sb;
-
-	sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
-			 user_ns, ns);
-	if (IS_ERR(sb))
-		return ERR_CAST(sb);
-
-	if (!sb->s_root) {
-		int err;
-		err = fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
-		if (err) {
-			deactivate_locked_super(sb);
-			return ERR_PTR(err);
-		}
-
-		sb->s_flags |= SB_ACTIVE;
-	}
-
-	return dget(sb->s_root);
-}
-
-EXPORT_SYMBOL(mount_ns);
-
 int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
 {
 	return set_anon_super(sb, NULL);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b06251dd429f..790342cf4df9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2206,9 +2206,6 @@ struct file_system_type {
 
 #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
 
-extern struct dentry *mount_ns(struct file_system_type *fs_type,
-	int flags, void *data, void *ns, struct user_namespace *user_ns,
-	int (*fill_super)(struct super_block *, void *, int));
 #ifdef CONFIG_BLOCK
 extern struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-- 
cgit v1.2.3


From c80fa7c8301c10ad10d997b9e86b4aeac5923b3e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Mar 2019 16:38:23 +0000
Subject: vfs: Provide sb->s_iflags settings in fs_context struct

Provide a field in the fs_context struct through which bits in the
sb->s_iflags superblock field can be set.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-fsdevel@vger.kernel.org
---
 fs/super.c                 | 1 +
 include/linux/fs_context.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/fs/super.c b/fs/super.c
index 72b4a5afcfd6..f836b67abffe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -540,6 +540,7 @@ retry:
 	}
 	fc->s_fs_info = NULL;
 	s->s_type = fc->fs_type;
+	s->s_iflags |= fc->s_iflags;
 	strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
 	list_add_tail(&s->s_list, &super_blocks);
 	hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 1f966670c8dc..c995b852ba40 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -103,6 +103,7 @@ struct fs_context {
 	void			*s_fs_info;	/* Proposed s_fs_info */
 	unsigned int		sb_flags;	/* Proposed superblock flags (SB_*) */
 	unsigned int		sb_flags_mask;	/* Superblock flags that were changed */
+	unsigned int		s_iflags;	/* OR'd with sb->s_iflags */
 	unsigned int		lsm_flags;	/* Information flags from the fs to the LSM */
 	enum fs_context_purpose	purpose:8;
 	enum fs_context_phase	phase:8;	/* The phase the context is in */
-- 
cgit v1.2.3


From 31d6d5ce53400d6dc58e29ddd8dc184b3ba89d66 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Mar 2019 16:38:23 +0000
Subject: vfs: Provide a mount_pseudo-replacement for the new mount API

Provide a function, init_pseudo(), that provides a common
infrastructure for converting pseudo-filesystems that can never be
mountable.

[AV: once all users of mount_pseudo_xattr() get converted, it will be folded
into pseudo_fs_get_tree()]

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-fsdevel@vger.kernel.org
---
 fs/libfs.c                | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pseudo_fs.h | 16 ++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 include/linux/pseudo_fs.h

(limited to 'include')

diff --git a/fs/libfs.c b/fs/libfs.c
index 030e545f586e..edef70d35438 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -16,6 +16,8 @@
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h> /* sync_mapping_buffers */
+#include <linux/fs_context.h>
+#include <linux/pseudo_fs.h>
 
 #include <linux/uaccess.h>
 
@@ -235,6 +237,50 @@ static const struct super_operations simple_super_operations = {
 	.statfs		= simple_statfs,
 };
 
+static int pseudo_fs_get_tree(struct fs_context *fc)
+{
+	struct pseudo_fs_context *ctx = fc->fs_private;
+	struct dentry *root;
+
+	root = mount_pseudo_xattr(fc->fs_type,
+				  ctx->ops, ctx->xattr,
+			          ctx->dops, ctx->magic);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	fc->root = root;
+	return 0;
+}
+
+static void pseudo_fs_free(struct fs_context *fc)
+{
+	kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations pseudo_fs_context_ops = {
+	.free		= pseudo_fs_free,
+	.get_tree	= pseudo_fs_get_tree,
+};
+
+/*
+ * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
+ * will never be mountable)
+ */
+struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
+					unsigned long magic)
+{
+	struct pseudo_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL);
+	if (likely(ctx)) {
+		ctx->magic = magic;
+		fc->fs_private = ctx;
+		fc->ops = &pseudo_fs_context_ops;
+	}
+	return ctx;
+}
+EXPORT_SYMBOL(init_pseudo);
+
 /*
  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
  * will never be mountable)
diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
new file mode 100644
index 000000000000..eceda1d1407a
--- /dev/null
+++ b/include/linux/pseudo_fs.h
@@ -0,0 +1,16 @@
+#ifndef __LINUX_PSEUDO_FS__
+#define __LINUX_PSEUDO_FS__
+
+#include <linux/fs_context.h>
+
+struct pseudo_fs_context {
+	const struct super_operations *ops;
+	const struct xattr_handler **xattr;
+	const struct dentry_operations *dops;
+	unsigned long magic;
+};
+
+struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
+				      unsigned long magic);
+
+#endif
-- 
cgit v1.2.3


From ea8157ab2ae5e914dd427e5cfab533b6da3819cd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 21 May 2019 07:55:45 +0100
Subject: zsfold: Convert zsfold to use the new mount API

Convert the zsfold filesystem to the new internal mount API as the old one
will be obsoleted and removed.  This allows greater flexibility in
communication of mount parameters between userspace, the VFS and the
filesystem.

See Documentation/filesystems/mount_api.txt for more information.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/uapi/linux/magic.h |  1 +
 mm/z3fold.c                | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index f8c00045d537..85c1119d0b0b 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -91,5 +91,6 @@
 #define UDF_SUPER_MAGIC		0x15013346
 #define BALLOON_KVM_MAGIC	0x13661366
 #define ZSMALLOC_MAGIC		0x58295829
+#define Z3FOLD_MAGIC		0x33
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/mm/z3fold.c b/mm/z3fold.c
index abeb5bcbea57..a43e8bfcaaea 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -25,7 +25,6 @@
 #include <linux/atomic.h>
 #include <linux/sched.h>
 #include <linux/cpumask.h>
-#include <linux/dcache.h>
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -35,12 +34,14 @@
 #include <linux/compaction.h>
 #include <linux/percpu.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 #include <linux/fs.h>
 #include <linux/preempt.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/zpool.h>
+#include <linux/magic.h>
 
 /*
  * NCHUNKS_ORDER determines the internal allocation granularity, effectively
@@ -239,15 +240,14 @@ static inline void free_handle(unsigned long handle)
 	}
 }
 
-static struct dentry *z3fold_do_mount(struct file_system_type *fs_type,
-				int flags, const char *dev_name, void *data)
+static int z3fold_init_fs_context(struct fs_context *fc)
 {
-	return mount_pseudo(fs_type, NULL, NULL, 0x33);
+	return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
 }
 
 static struct file_system_type z3fold_fs = {
 	.name		= "z3fold",
-	.mount		= z3fold_do_mount,
+	.init_fs_context = z3fold_init_fs_context,
 	.kill_sb	= kill_anon_super,
 };
 
-- 
cgit v1.2.3


From 8d9e46d80777b484f8f0945c317ad618224d7811 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 11 May 2019 11:43:59 -0400
Subject: fold mount_pseudo_xattr() into pseudo_fs_get_tree()

... now that all other callers are gone

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c         | 88 +++++++++++++++++++++---------------------------------
 include/linux/fs.h | 13 --------
 2 files changed, 34 insertions(+), 67 deletions(-)

(limited to 'include')

diff --git a/fs/libfs.c b/fs/libfs.c
index edef70d35438..7df3c9a85f6b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -240,16 +240,43 @@ static const struct super_operations simple_super_operations = {
 static int pseudo_fs_get_tree(struct fs_context *fc)
 {
 	struct pseudo_fs_context *ctx = fc->fs_private;
-	struct dentry *root;
+	struct super_block *s;
+	struct inode *root;
 
-	root = mount_pseudo_xattr(fc->fs_type,
-				  ctx->ops, ctx->xattr,
-			          ctx->dops, ctx->magic);
-	if (IS_ERR(root))
-		return PTR_ERR(root);
+	s = sget_userns(fc->fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER,
+			&init_user_ns, NULL);
+	if (IS_ERR(s))
+		return PTR_ERR(s);
 
-	fc->root = root;
+	s->s_maxbytes = MAX_LFS_FILESIZE;
+	s->s_blocksize = PAGE_SIZE;
+	s->s_blocksize_bits = PAGE_SHIFT;
+	s->s_magic = ctx->magic;
+	s->s_op = ctx->ops ?: &simple_super_operations;
+	s->s_xattr = ctx->xattr;
+	s->s_time_gran = 1;
+	root = new_inode(s);
+	if (!root)
+		goto Enomem;
+	/*
+	 * since this is the first inode, make it number 1. New inodes created
+	 * after this must take care not to collide with it (by passing
+	 * max_reserved of 1 to iunique).
+	 */
+	root->i_ino = 1;
+	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
+	root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
+	s->s_root = d_make_root(root);
+	if (!s->s_root)
+		goto Enomem;
+	s->s_d_op = ctx->dops;
+	s->s_flags |= SB_ACTIVE;
+	fc->root = dget(s->s_root);
 	return 0;
+
+Enomem:
+	deactivate_locked_super(s);
+	return -ENOMEM;
 }
 
 static void pseudo_fs_free(struct fs_context *fc)
@@ -281,53 +308,6 @@ struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
 }
 EXPORT_SYMBOL(init_pseudo);
 
-/*
- * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
- * will never be mountable)
- */
-struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type,
-	const struct super_operations *ops, const struct xattr_handler **xattr,
-	const struct dentry_operations *dops, unsigned long magic)
-{
-	struct super_block *s;
-	struct inode *root;
-
-	s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER,
-			&init_user_ns, NULL);
-	if (IS_ERR(s))
-		return ERR_CAST(s);
-
-	s->s_maxbytes = MAX_LFS_FILESIZE;
-	s->s_blocksize = PAGE_SIZE;
-	s->s_blocksize_bits = PAGE_SHIFT;
-	s->s_magic = magic;
-	s->s_op = ops ? ops : &simple_super_operations;
-	s->s_xattr = xattr;
-	s->s_time_gran = 1;
-	root = new_inode(s);
-	if (!root)
-		goto Enomem;
-	/*
-	 * since this is the first inode, make it number 1. New inodes created
-	 * after this must take care not to collide with it (by passing
-	 * max_reserved of 1 to iunique).
-	 */
-	root->i_ino = 1;
-	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
-	root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
-	s->s_root = d_make_root(root);
-	if (!s->s_root)
-		goto Enomem;
-	s->s_d_op = dops;
-	s->s_flags |= SB_ACTIVE;
-	return dget(s->s_root);
-
-Enomem:
-	deactivate_locked_super(s);
-	return ERR_PTR(-ENOMEM);
-}
-EXPORT_SYMBOL(mount_pseudo_xattr);
-
 int simple_open(struct inode *inode, struct file *file)
 {
 	if (inode->i_private)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 790342cf4df9..d625acabbfcf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2254,19 +2254,6 @@ struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
 			int flags, void *data);
-extern struct dentry *mount_pseudo_xattr(struct file_system_type *,
-					 const struct super_operations *ops,
-					 const struct xattr_handler **xattr,
-					 const struct dentry_operations *dops,
-					 unsigned long);
-
-static inline struct dentry *
-mount_pseudo(struct file_system_type *fs_type,
-	     const struct super_operations *ops,
-	     const struct dentry_operations *dops, unsigned long magic)
-{
-	return mount_pseudo_xattr(fs_type, ops, NULL, dops, magic);
-}
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
 #define fops_get(fops) \
-- 
cgit v1.2.3


From 023d066a0d0a87696c04b0de2ceae53063d0b655 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Mar 2019 16:38:28 +0000
Subject: vfs: Kill sget_userns()

Kill sget_userns(), folding it into sget() as that's the only remaining
user.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-fsdevel@vger.kernel.org
---
 fs/super.c         | 54 ++++++++++++++++--------------------------------------
 include/linux/fs.h |  5 -----
 2 files changed, 16 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/fs/super.c b/fs/super.c
index f836b67abffe..ca2302501d32 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -563,24 +563,31 @@ share_extant_sb:
 EXPORT_SYMBOL(sget_fc);
 
 /**
- *	sget_userns -	find or create a superblock
- *	@type:	filesystem type superblock should belong to
- *	@test:	comparison callback
- *	@set:	setup callback
- *	@flags:	mount flags
- *	@user_ns: User namespace for the super_block
- *	@data:	argument to each of them
+ *	sget	-	find or create a superblock
+ *	@type:	  filesystem type superblock should belong to
+ *	@test:	  comparison callback
+ *	@set:	  setup callback
+ *	@flags:	  mount flags
+ *	@data:	  argument to each of them
  */
-struct super_block *sget_userns(struct file_system_type *type,
+struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
-			int flags, struct user_namespace *user_ns,
+			int flags,
 			void *data)
 {
+	struct user_namespace *user_ns = current_user_ns();
 	struct super_block *s = NULL;
 	struct super_block *old;
 	int err;
 
+	/* We don't yet pass the user namespace of the parent
+	 * mount through to here so always use &init_user_ns
+	 * until that changes.
+	 */
+	if (flags & SB_SUBMOUNT)
+		user_ns = &init_user_ns;
+
 retry:
 	spin_lock(&sb_lock);
 	if (test) {
@@ -621,35 +628,6 @@ retry:
 	register_shrinker_prepared(&s->s_shrink);
 	return s;
 }
-
-EXPORT_SYMBOL(sget_userns);
-
-/**
- *	sget	-	find or create a superblock
- *	@type:	  filesystem type superblock should belong to
- *	@test:	  comparison callback
- *	@set:	  setup callback
- *	@flags:	  mount flags
- *	@data:	  argument to each of them
- */
-struct super_block *sget(struct file_system_type *type,
-			int (*test)(struct super_block *,void *),
-			int (*set)(struct super_block *,void *),
-			int flags,
-			void *data)
-{
-	struct user_namespace *user_ns = current_user_ns();
-
-	/* We don't yet pass the user namespace of the parent
-	 * mount through to here so always use &init_user_ns
-	 * until that changes.
-	 */
-	if (flags & SB_SUBMOUNT)
-		user_ns = &init_user_ns;
-
-	return sget_userns(type, test, set, flags, user_ns, data);
-}
-
 EXPORT_SYMBOL(sget);
 
 void drop_super(struct super_block *sb)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d625acabbfcf..71421856ff2c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2245,11 +2245,6 @@ void free_anon_bdev(dev_t);
 struct super_block *sget_fc(struct fs_context *fc,
 			    int (*test)(struct super_block *, struct fs_context *),
 			    int (*set)(struct super_block *, struct fs_context *));
-struct super_block *sget_userns(struct file_system_type *type,
-			int (*test)(struct super_block *,void *),
-			int (*set)(struct super_block *,void *),
-			int flags, struct user_namespace *user_ns,
-			void *data);
 struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
-- 
cgit v1.2.3


From 7375dca1647fa978310f2d706ddbff537f72110b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 20 May 2019 09:26:24 -0400
Subject: ftrace: Make enable and update parameters bool when applicable

The code modification functions have "enable" and "update" variables that
are sometimes "int" but used as "bool". Remove the ambiguity and make them
"bool" when they are only used for true or false values.

Link: http://lkml.kernel.org/r/e1429923d9eda92a3cf5ee9e33c7eacce539781d.1558115654.git.naveen.n.rao@linux.vnet.ibm.com

Reported-by: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  4 ++--
 kernel/trace/ftrace.c  | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 25e2995d4a4c..8a8cb3c401b2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -427,8 +427,8 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter);
 	     iter = ftrace_rec_iter_next(iter))
 
 
-int ftrace_update_record(struct dyn_ftrace *rec, int enable);
-int ftrace_test_record(struct dyn_ftrace *rec, int enable);
+int ftrace_update_record(struct dyn_ftrace *rec, bool enable);
+int ftrace_test_record(struct dyn_ftrace *rec, bool enable);
 void ftrace_run_stop_machine(int command);
 unsigned long ftrace_location(unsigned long ip);
 unsigned long ftrace_location_range(unsigned long start, unsigned long end);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a12aff849c04..4f2c26bebe2a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1768,7 +1768,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
 		count++;
 
 		/* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */
-		update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE;
+		update |= ftrace_test_record(rec, true) != FTRACE_UPDATE_IGNORE;
 
 		/* Shortcut, if we handled all records, we are done. */
 		if (!all && count == hash->count)
@@ -2047,7 +2047,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
 	}
 }
 
-static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
+static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
 {
 	unsigned long flag = 0UL;
 
@@ -2146,28 +2146,28 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
 /**
  * ftrace_update_record, set a record that now is tracing or not
  * @rec: the record to update
- * @enable: set to 1 if the record is tracing, zero to force disable
+ * @enable: set to true if the record is tracing, false to force disable
  *
  * The records that represent all functions that can be traced need
  * to be updated when tracing has been enabled.
  */
-int ftrace_update_record(struct dyn_ftrace *rec, int enable)
+int ftrace_update_record(struct dyn_ftrace *rec, bool enable)
 {
-	return ftrace_check_record(rec, enable, 1);
+	return ftrace_check_record(rec, enable, true);
 }
 
 /**
  * ftrace_test_record, check if the record has been enabled or not
  * @rec: the record to test
- * @enable: set to 1 to check if enabled, 0 if it is disabled
+ * @enable: set to true to check if enabled, false if it is disabled
  *
  * The arch code may need to test if a record is already set to
  * tracing to determine how to modify the function code that it
  * represents.
  */
-int ftrace_test_record(struct dyn_ftrace *rec, int enable)
+int ftrace_test_record(struct dyn_ftrace *rec, bool enable)
 {
-	return ftrace_check_record(rec, enable, 0);
+	return ftrace_check_record(rec, enable, false);
 }
 
 static struct ftrace_ops *
@@ -2356,7 +2356,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
 }
 
 static int
-__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+__ftrace_replace_code(struct dyn_ftrace *rec, bool enable)
 {
 	unsigned long ftrace_old_addr;
 	unsigned long ftrace_addr;
@@ -2395,7 +2395,7 @@ void __weak ftrace_replace_code(int mod_flags)
 {
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
-	int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
+	bool enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
 	int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL;
 	int failed;
 
-- 
cgit v1.2.3


From 2d8d8fac3b4eab035dcd0068e1f5a746a697fbb3 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 15 May 2019 14:38:06 +0900
Subject: x86/uaccess: Allow access_ok() in irq context if pagefault_disabled

WARN_ON_IN_IRQ() assumes that the access_ok() and following
user memory access can sleep. But this assumption is not
always correct; when the pagefault is disabled, following
memory access will just returns -EFAULT and never sleep.

Add pagefault_disabled() check in WARN_ON_ONCE() so that
it can ignore the case we call it with disabling pagefault.
For this purpose, this modified pagefault_disabled() as
an inline function.

Link: http://lkml.kernel.org/r/155789868664.26965.7932665824135793317.stgit@devnote2

Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/x86/include/asm/uaccess.h | 4 +++-
 include/linux/uaccess.h        | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index c82abd6e4ca3..9c4435307ff8 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -66,7 +66,9 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
 })
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-# define WARN_ON_IN_IRQ()	WARN_ON_ONCE(!in_task())
+static inline bool pagefault_disabled(void);
+# define WARN_ON_IN_IRQ()	\
+	WARN_ON_ONCE(!in_task() && !pagefault_disabled())
 #else
 # define WARN_ON_IN_IRQ()
 #endif
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 2b70130af585..5a43ef7db492 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -203,7 +203,10 @@ static inline void pagefault_enable(void)
 /*
  * Is the pagefault handler disabled? If so, user access methods will not sleep.
  */
-#define pagefault_disabled() (current->pagefault_disabled != 0)
+static inline bool pagefault_disabled(void)
+{
+	return current->pagefault_disabled != 0;
+}
 
 /*
  * The pagefault handler is in general disabled by pagefault_disable() or
-- 
cgit v1.2.3


From 3d7081822f7f9eab867d9bcc8fd635208ec438e0 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 15 May 2019 14:38:18 +0900
Subject: uaccess: Add non-pagefault user-space read functions

Add probe_user_read(), strncpy_from_unsafe_user() and
strnlen_unsafe_user() which allows caller to access user-space
in IRQ context.

Current probe_kernel_read() and strncpy_from_unsafe() are
not available for user-space memory, because it sets
KERNEL_DS while accessing data. On some arch, user address
space and kernel address space can be co-exist, but others
can not. In that case, setting KERNEL_DS means given
address is treated as a kernel address space.
Also strnlen_user() is only available from user context since
it can sleep if pagefault is enabled.

To access user-space memory without pagefault, we need
these new functions which sets USER_DS while accessing
the data.

Link: http://lkml.kernel.org/r/155789869802.26965.4940338412595759063.stgit@devnote2

Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/uaccess.h |  14 ++++++
 mm/maccess.c            | 122 +++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 130 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 5a43ef7db492..9c435c3f2105 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -242,6 +242,17 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
 extern long probe_kernel_read(void *dst, const void *src, size_t size);
 extern long __probe_kernel_read(void *dst, const void *src, size_t size);
 
+/*
+ * probe_user_read(): safely attempt to read from a location in user space
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst.  If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+extern long probe_user_read(void *dst, const void __user *src, size_t size);
+
 /*
  * probe_kernel_write(): safely attempt to write to a location
  * @dst: address to write to
@@ -255,6 +266,9 @@ extern long notrace probe_kernel_write(void *dst, const void *src, size_t size);
 extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size);
 
 extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
+extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
+				     long count);
+extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count);
 
 /**
  * probe_kernel_address(): safely attempt to read from a location
diff --git a/mm/maccess.c b/mm/maccess.c
index ec00be51a24f..19c8c3dc14df 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -5,8 +5,20 @@
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 
+static __always_inline long
+probe_read_common(void *dst, const void __user *src, size_t size)
+{
+	long ret;
+
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(dst, src, size);
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+
 /**
- * probe_kernel_read(): safely attempt to read from a location
+ * probe_kernel_read(): safely attempt to read from a kernel-space location
  * @dst: pointer to the buffer that shall take the data
  * @src: address to read from
  * @size: size of the data chunk
@@ -29,16 +41,40 @@ long __probe_kernel_read(void *dst, const void *src, size_t size)
 	mm_segment_t old_fs = get_fs();
 
 	set_fs(KERNEL_DS);
-	pagefault_disable();
-	ret = __copy_from_user_inatomic(dst,
-			(__force const void __user *)src, size);
-	pagefault_enable();
+	ret = probe_read_common(dst, (__force const void __user *)src, size);
 	set_fs(old_fs);
 
-	return ret ? -EFAULT : 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(probe_kernel_read);
 
+/**
+ * probe_user_read(): safely attempt to read from a user-space location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from. This must be a user address.
+ * @size: size of the data chunk
+ *
+ * Safely read from user address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_user_read(void *dst, const void __user *src, size_t size)
+    __attribute__((alias("__probe_user_read")));
+
+long __probe_user_read(void *dst, const void __user *src, size_t size)
+{
+	long ret = -EFAULT;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(USER_DS);
+	if (access_ok(src, size))
+		ret = probe_read_common(dst, src, size);
+	set_fs(old_fs);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(probe_user_read);
+
 /**
  * probe_kernel_write(): safely attempt to write to a location
  * @dst: address to write to
@@ -66,6 +102,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
 }
 EXPORT_SYMBOL_GPL(probe_kernel_write);
 
+
 /**
  * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
  * @dst:   Destination address, in kernel space.  This buffer must be at
@@ -105,3 +142,76 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
 
 	return ret ? -EFAULT : src - unsafe_addr;
 }
+
+/**
+ * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user
+ *				address.
+ * @dst:   Destination address, in kernel space.  This buffer must be at
+ *         least @count bytes long.
+ * @unsafe_addr: Unsafe user address.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from unsafe user address to kernel buffer.
+ *
+ * On success, returns the length of the string INCLUDING the trailing NUL.
+ *
+ * If access fails, returns -EFAULT (some data may have been copied
+ * and the trailing NUL added).
+ *
+ * If @count is smaller than the length of the string, copies @count-1 bytes,
+ * sets the last byte of @dst buffer to NUL and returns @count.
+ */
+long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
+			      long count)
+{
+	mm_segment_t old_fs = get_fs();
+	long ret;
+
+	if (unlikely(count <= 0))
+		return 0;
+
+	set_fs(USER_DS);
+	pagefault_disable();
+	ret = strncpy_from_user(dst, unsafe_addr, count);
+	pagefault_enable();
+	set_fs(old_fs);
+
+	if (ret >= count) {
+		ret = count;
+		dst[ret - 1] = '\0';
+	} else if (ret > 0) {
+		ret++;
+	}
+
+	return ret;
+}
+
+/**
+ * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL.
+ * @unsafe_addr: The string to measure.
+ * @count: Maximum count (including NUL)
+ *
+ * Get the size of a NUL-terminated string in user space without pagefault.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ *
+ * If the string is too long, returns a number larger than @count. User
+ * has to check the return value against "> count".
+ * On exception (or invalid count), returns 0.
+ *
+ * Unlike strnlen_user, this can be used from IRQ handler etc. because
+ * it disables pagefaults.
+ */
+long strnlen_unsafe_user(const void __user *unsafe_addr, long count)
+{
+	mm_segment_t old_fs = get_fs();
+	int ret;
+
+	set_fs(USER_DS);
+	pagefault_disable();
+	ret = strnlen_user(unsafe_addr, count);
+	pagefault_enable();
+	set_fs(old_fs);
+
+	return ret;
+}
-- 
cgit v1.2.3


From 87a90956eeab260a469a51897bfda27b28adf67d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 22 May 2019 17:27:44 +0900
Subject: uaccess: Add a prototype of non-static __probe_user_read()

Declare a prototype of non-static __probe_user_read() as
same as __probe_kernel_read() at uaccess.h.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/uaccess.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 9c435c3f2105..34a038563d97 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -252,6 +252,7 @@ extern long __probe_kernel_read(void *dst, const void *src, size_t size);
  * happens, handle that and return -EFAULT.
  */
 extern long probe_user_read(void *dst, const void __user *src, size_t size);
+extern long __probe_user_read(void *dst, const void __user *src, size_t size);
 
 /*
  * probe_kernel_write(): safely attempt to write to a location
-- 
cgit v1.2.3


From 54e6a745aad30ba82bbea5b0e9869e42a39cbbc3 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Sun, 19 May 2019 20:36:35 +0200
Subject: drm/bridge: make dw_mipi_dsi.h self-contained

To allow users to include dw_mipi_dsi.h without pulling in dependencies
make dw_mipi_dsi.h self-contained.
Use forward declarations when possible.

v2:
- Drop forward declarations of local structs (Laurent)
- Add include of drm_modes.h (Laurent)

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Laurent Pinchart <Laurent.pinchart@ideasonboard.com>
Cc: Andrzej Hajda <a.hajda@samsung.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Peter Senna Tschudin <peter.senna@gmail.com>
Cc: Martin Donnelly <martin.donnelly@ge.com>
Cc: Martyn Welch <martyn.welch@collabora.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20190519183636.19588-2-sam@ravnborg.org
---
 include/drm/bridge/dw_mipi_dsi.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/drm/bridge/dw_mipi_dsi.h b/include/drm/bridge/dw_mipi_dsi.h
index 7d3dd69a5caa..92cf9d1e1655 100644
--- a/include/drm/bridge/dw_mipi_dsi.h
+++ b/include/drm/bridge/dw_mipi_dsi.h
@@ -10,7 +10,15 @@
 #ifndef __DW_MIPI_DSI__
 #define __DW_MIPI_DSI__
 
+#include <linux/types.h>
+
+#include <drm/drm_modes.h>
+
+struct drm_display_mode;
+struct drm_encoder;
 struct dw_mipi_dsi;
+struct mipi_dsi_device;
+struct platform_device;
 
 struct dw_mipi_dsi_phy_ops {
 	int (*init)(void *priv_data);
-- 
cgit v1.2.3


From f5151311c3f37f6edc85b2253ccf6d3e2a4c4c26 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Mon, 20 May 2019 19:32:14 +0800
Subject: dmaengine: Add matching device node validation in
 __dma_request_channel()

When user try to request one DMA channel by __dma_request_channel(), it won't
validate if it is the correct DMA device to request, that will lead each DMA
engine driver to validate the correct device node in their filter function
if it is necessary.

Thus we can add the matching device node validation in the DMA engine core,
to remove all of device node validation in the drivers.

Tested-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dmaengine.c   | 10 ++++++++--
 drivers/dma/of-dma.c      |  4 ++--
 include/linux/dmaengine.h | 12 ++++++++----
 3 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 3a11b1092e80..610080c629bb 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -641,11 +641,13 @@ EXPORT_SYMBOL_GPL(dma_get_any_slave_channel);
  * @mask: capabilities that the channel must satisfy
  * @fn: optional callback to disposition available channels
  * @fn_param: opaque parameter to pass to dma_filter_fn
+ * @np: device node to look for DMA channels
  *
  * Returns pointer to appropriate DMA channel on success or NULL.
  */
 struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
-				       dma_filter_fn fn, void *fn_param)
+				       dma_filter_fn fn, void *fn_param,
+				       struct device_node *np)
 {
 	struct dma_device *device, *_d;
 	struct dma_chan *chan = NULL;
@@ -653,6 +655,10 @@ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
 	/* Find a channel */
 	mutex_lock(&dma_list_mutex);
 	list_for_each_entry_safe(device, _d, &dma_device_list, global_node) {
+		/* Finds a DMA controller with matching device node */
+		if (np && device->dev->of_node && np != device->dev->of_node)
+			continue;
+
 		chan = find_candidate(device, mask, fn, fn_param);
 		if (!IS_ERR(chan))
 			break;
@@ -769,7 +775,7 @@ struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask)
 	if (!mask)
 		return ERR_PTR(-ENODEV);
 
-	chan = __dma_request_channel(mask, NULL, NULL);
+	chan = __dma_request_channel(mask, NULL, NULL, NULL);
 	if (!chan) {
 		mutex_lock(&dma_list_mutex);
 		if (list_empty(&dma_device_list))
diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c
index 91fd395c90c4..6b43d04da05d 100644
--- a/drivers/dma/of-dma.c
+++ b/drivers/dma/of-dma.c
@@ -316,8 +316,8 @@ struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec,
 	if (count != 1)
 		return NULL;
 
-	return dma_request_channel(info->dma_cap, info->filter_fn,
-			&dma_spec->args[0]);
+	return __dma_request_channel(&info->dma_cap, info->filter_fn,
+				     &dma_spec->args[0], dma_spec->np);
 }
 EXPORT_SYMBOL_GPL(of_dma_simple_xlate);
 
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index d49ec5c31944..504085b2bf21 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -1314,7 +1314,8 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie);
 enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx);
 void dma_issue_pending_all(void);
 struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
-					dma_filter_fn fn, void *fn_param);
+				       dma_filter_fn fn, void *fn_param,
+				       struct device_node *np);
 struct dma_chan *dma_request_slave_channel(struct device *dev, const char *name);
 
 struct dma_chan *dma_request_chan(struct device *dev, const char *name);
@@ -1339,7 +1340,9 @@ static inline void dma_issue_pending_all(void)
 {
 }
 static inline struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
-					      dma_filter_fn fn, void *fn_param)
+						     dma_filter_fn fn,
+						     void *fn_param,
+						     struct device_node *np)
 {
 	return NULL;
 }
@@ -1411,7 +1414,8 @@ void dma_async_device_unregister(struct dma_device *device);
 void dma_run_dependencies(struct dma_async_tx_descriptor *tx);
 struct dma_chan *dma_get_slave_channel(struct dma_chan *chan);
 struct dma_chan *dma_get_any_slave_channel(struct dma_device *device);
-#define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y)
+#define dma_request_channel(mask, x, y) \
+	__dma_request_channel(&(mask), x, y, NULL)
 #define dma_request_slave_channel_compat(mask, x, y, dev, name) \
 	__dma_request_slave_channel_compat(&(mask), x, y, dev, name)
 
@@ -1429,6 +1433,6 @@ static inline struct dma_chan
 	if (!fn || !fn_param)
 		return NULL;
 
-	return __dma_request_channel(mask, fn, fn_param);
+	return __dma_request_channel(mask, fn, fn_param, NULL);
 }
 #endif /* DMAENGINE_H */
-- 
cgit v1.2.3


From 990c0b53bf6599a9c9c7df1529dde681dee6cf64 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Mon, 20 May 2019 19:32:16 +0800
Subject: dmaengine: imx-sdma: Let the core do the device node validation

Let the DMA engine core do the device node validation instead of drivers.

Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/imx-sdma.c                | 9 ++-------
 include/linux/platform_data/dma-imx.h | 1 -
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 99d9f431ae2c..ca296f0849ef 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -1934,16 +1934,11 @@ disable_clk_ipg:
 static bool sdma_filter_fn(struct dma_chan *chan, void *fn_param)
 {
 	struct sdma_channel *sdmac = to_sdma_chan(chan);
-	struct sdma_engine *sdma = sdmac->sdma;
 	struct imx_dma_data *data = fn_param;
 
 	if (!imx_dma_is_general_purpose(chan))
 		return false;
 
-	/* return false if it's not the right device */
-	if (sdma->dev->of_node != data->of_node)
-		return false;
-
 	sdmac->data = *data;
 	chan->private = &sdmac->data;
 
@@ -1971,9 +1966,9 @@ static struct dma_chan *sdma_xlate(struct of_phandle_args *dma_spec,
 	 * be set to sdmac->event_id1.
 	 */
 	data.dma_request2 = 0;
-	data.of_node = ofdma->of_node;
 
-	return dma_request_channel(mask, sdma_filter_fn, &data);
+	return __dma_request_channel(&mask, sdma_filter_fn, &data,
+				     ofdma->of_node);
 }
 
 static int sdma_probe(struct platform_device *pdev)
diff --git a/include/linux/platform_data/dma-imx.h b/include/linux/platform_data/dma-imx.h
index 9daea8d42a10..7d964e787299 100644
--- a/include/linux/platform_data/dma-imx.h
+++ b/include/linux/platform_data/dma-imx.h
@@ -55,7 +55,6 @@ struct imx_dma_data {
 	int dma_request2; /* secondary DMA request line */
 	enum sdma_peripheral_type peripheral_type;
 	int priority;
-	struct device_node *of_node;
 };
 
 static inline int imx_dma_is_ipu(struct dma_chan *chan)
-- 
cgit v1.2.3


From d27ac2e02bf256d4e824e7c1e1e1afa2b96cefcc Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 27 May 2019 09:55:16 +0300
Subject: include: fpga: adi-axi-common.h: add common regs & defs header

The AXI HDL cores provided for Analog Devices reference designs all share
some common base registers (e.g. version register at address 0x00).

To reduce duplication for this, a common header is added to define these
registers as well as bitfields & macros to work with these registers.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/fpga/adi-axi-common.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 include/linux/fpga/adi-axi-common.h

(limited to 'include')

diff --git a/include/linux/fpga/adi-axi-common.h b/include/linux/fpga/adi-axi-common.h
new file mode 100644
index 000000000000..7fc95d5c95bb
--- /dev/null
+++ b/include/linux/fpga/adi-axi-common.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Analog Devices AXI common registers & definitions
+ *
+ * Copyright 2019 Analog Devices Inc.
+ *
+ * https://wiki.analog.com/resources/fpga/docs/axi_ip
+ * https://wiki.analog.com/resources/fpga/docs/hdl/regmap
+ */
+
+#ifndef ADI_AXI_COMMON_H_
+#define ADI_AXI_COMMON_H_
+
+#define	ADI_AXI_REG_VERSION			0x0000
+
+#define ADI_AXI_PCORE_VER(major, minor, patch)	\
+	(((major) << 16) | ((minor) << 8) | (patch))
+
+#endif /* ADI_AXI_COMMON_H_ */
-- 
cgit v1.2.3


From 2197f55f462dc776312fada027d45d52bd12749c Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Sun, 26 May 2019 19:35:29 +0200
Subject: drm: make drm/drm_auth.h self contained

Do not require users of include/drm/drm_auth.h to include
other files just to let it build.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Acked-by: Daniel Vetter <daniel@ffwll.ch>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <maxime.ripard@bootlin.com>
Cc: Sean Paul <sean@poorly.run>
Cc: David Airlie <airlied@linux.ie>
Link: https://patchwork.freedesktop.org/patch/msgid/20190526173535.32701-2-sam@ravnborg.org
---
 include/drm/drm_auth.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_auth.h b/include/drm/drm_auth.h
index 871008118bab..6bf8b2b78991 100644
--- a/include/drm/drm_auth.h
+++ b/include/drm/drm_auth.h
@@ -1,3 +1,6 @@
+#ifndef _DRM_AUTH_H_
+#define _DRM_AUTH_H_
+
 /*
  * Internal Header for the Direct Rendering Manager
  *
@@ -25,8 +28,12 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#ifndef _DRM_AUTH_H_
-#define _DRM_AUTH_H_
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/wait.h>
+
+struct drm_file;
+struct drm_hw_lock;
 
 /*
  * Legacy DRI1 locking data structure. Only here instead of in drm_legacy.h for
-- 
cgit v1.2.3


From cbe932a38dc67f4c5c2fa6ffc4e7f4efa27c06f7 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Sun, 26 May 2019 19:35:30 +0200
Subject: drm: make drm/drm_legacy.h self-contained

Do not require users of include/drm/drm_legacy.h to
include other files just to let it build.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Acked-by: Daniel Vetter <daniel@ffwll.ch>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <maxime.ripard@bootlin.com>
Cc: Sean Paul <sean@poorly.run>
Cc: David Airlie <airlied@linux.ie>
Link: https://patchwork.freedesktop.org/patch/msgid/20190526173535.32701-3-sam@ravnborg.org
---
 include/drm/drm_legacy.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_legacy.h b/include/drm/drm_legacy.h
index 2182a56ac421..58dc0c04bf99 100644
--- a/include/drm/drm_legacy.h
+++ b/include/drm/drm_legacy.h
@@ -1,11 +1,5 @@
 #ifndef __DRM_DRM_LEGACY_H__
 #define __DRM_DRM_LEGACY_H__
-
-#include <drm/drm_auth.h>
-#include <drm/drm_hashtab.h>
-
-struct drm_device;
-
 /*
  * Legacy driver interfaces for the Direct Rendering Manager
  *
@@ -39,6 +33,12 @@ struct drm_device;
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include <drm/drm.h>
+#include <drm/drm_auth.h>
+#include <drm/drm_hashtab.h>
+
+struct drm_device;
+struct file;
 
 /*
  * Legacy Support for palateontologic DRM drivers
-- 
cgit v1.2.3


From 153969fd952d81ab8f57574f9be1a90b0a0fa791 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 21 May 2019 03:38:25 +0200
Subject: ARM: versatile: Drop CLCD platform data

The Versatile family no longer makes any use of the CLCD
platform data, we have moved over all users to the DRM
driver that has built-in handling of the displays. Delete
the old auxdata.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 arch/arm/mach-versatile/versatile_dt.c             | 157 ---------------------
 include/linux/platform_data/video-clcd-versatile.h |  28 ----
 2 files changed, 185 deletions(-)
 delete mode 100644 include/linux/platform_data/video-clcd-versatile.h

(limited to 'include')

diff --git a/arch/arm/mach-versatile/versatile_dt.c b/arch/arm/mach-versatile/versatile_dt.c
index 028463af726d..b5ff1ea5a944 100644
--- a/arch/arm/mach-versatile/versatile_dt.c
+++ b/arch/arm/mach-versatile/versatile_dt.c
@@ -29,8 +29,6 @@
 #include <linux/of_platform.h>
 #include <linux/slab.h>
 #include <linux/amba/bus.h>
-#include <linux/amba/clcd.h>
-#include <linux/platform_data/video-clcd-versatile.h>
 #include <linux/amba/mmci.h>
 #include <asm/mach-types.h>
 #include <asm/mach/arch.h>
@@ -47,14 +45,12 @@
  */
 #define VERSATILE_SYS_PCICTL_OFFSET           0x44
 #define VERSATILE_SYS_MCI_OFFSET              0x48
-#define VERSATILE_SYS_CLCD_OFFSET             0x50
 
 /*
  * VERSATILE peripheral addresses
  */
 #define VERSATILE_MMCI0_BASE           0x10005000	/* MMC interface */
 #define VERSATILE_MMCI1_BASE           0x1000B000	/* MMC Interface */
-#define VERSATILE_CLCD_BASE            0x10120000	/* CLCD */
 #define VERSATILE_SCTL_BASE            0x101E0000	/* System controller */
 #define VERSATILE_IB2_BASE             0x24000000	/* IB2 module */
 #define VERSATILE_IB2_CTL_BASE		(VERSATILE_IB2_BASE + 0x03000000)
@@ -96,158 +92,6 @@ static struct mmci_platform_data mmc1_plat_data = {
 	.status		= mmc_status,
 };
 
-/*
- * CLCD support.
- */
-#define SYS_CLCD_MODE_MASK	(3 << 0)
-#define SYS_CLCD_MODE_888	(0 << 0)
-#define SYS_CLCD_MODE_5551	(1 << 0)
-#define SYS_CLCD_MODE_565_RLSB	(2 << 0)
-#define SYS_CLCD_MODE_565_BLSB	(3 << 0)
-#define SYS_CLCD_NLCDIOON	(1 << 2)
-#define SYS_CLCD_VDDPOSSWITCH	(1 << 3)
-#define SYS_CLCD_PWR3V5SWITCH	(1 << 4)
-#define SYS_CLCD_ID_MASK	(0x1f << 8)
-#define SYS_CLCD_ID_SANYO_3_8	(0x00 << 8)
-#define SYS_CLCD_ID_UNKNOWN_8_4	(0x01 << 8)
-#define SYS_CLCD_ID_EPSON_2_2	(0x02 << 8)
-#define SYS_CLCD_ID_SANYO_2_5	(0x07 << 8)
-#define SYS_CLCD_ID_VGA		(0x1f << 8)
-
-static bool is_sanyo_2_5_lcd;
-
-/*
- * Disable all display connectors on the interface module.
- */
-static void versatile_clcd_disable(struct clcd_fb *fb)
-{
-	void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET;
-	u32 val;
-
-	val = readl(sys_clcd);
-	val &= ~SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH;
-	writel(val, sys_clcd);
-
-	/*
-	 * If the LCD is Sanyo 2x5 in on the IB2 board, turn the back-light off
-	 */
-	if (of_machine_is_compatible("arm,versatile-ab") && is_sanyo_2_5_lcd) {
-		unsigned long ctrl;
-
-		ctrl = readl(versatile_ib2_ctrl);
-		ctrl &= ~0x01;
-		writel(ctrl, versatile_ib2_ctrl);
-	}
-}
-
-/*
- * Enable the relevant connector on the interface module.
- */
-static void versatile_clcd_enable(struct clcd_fb *fb)
-{
-	struct fb_var_screeninfo *var = &fb->fb.var;
-	void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET;
-	u32 val;
-
-	val = readl(sys_clcd);
-	val &= ~SYS_CLCD_MODE_MASK;
-
-	switch (var->green.length) {
-	case 5:
-		val |= SYS_CLCD_MODE_5551;
-		break;
-	case 6:
-		if (var->red.offset == 0)
-			val |= SYS_CLCD_MODE_565_RLSB;
-		else
-			val |= SYS_CLCD_MODE_565_BLSB;
-		break;
-	case 8:
-		val |= SYS_CLCD_MODE_888;
-		break;
-	}
-
-	/*
-	 * Set the MUX
-	 */
-	writel(val, sys_clcd);
-
-	/*
-	 * And now enable the PSUs
-	 */
-	val |= SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH;
-	writel(val, sys_clcd);
-
-	/*
-	 * If the LCD is Sanyo 2x5 in on the IB2 board, turn the back-light on
-	 */
-	if (of_machine_is_compatible("arm,versatile-ab") && is_sanyo_2_5_lcd) {
-		unsigned long ctrl;
-
-		ctrl = readl(versatile_ib2_ctrl);
-		ctrl |= 0x01;
-		writel(ctrl, versatile_ib2_ctrl);
-	}
-}
-
-/*
- * Detect which LCD panel is connected, and return the appropriate
- * clcd_panel structure.  Note: we do not have any information on
- * the required timings for the 8.4in panel, so we presently assume
- * VGA timings.
- */
-static int versatile_clcd_setup(struct clcd_fb *fb)
-{
-	void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET;
-	const char *panel_name;
-	u32 val;
-
-	is_sanyo_2_5_lcd = false;
-
-	val = readl(sys_clcd) & SYS_CLCD_ID_MASK;
-	if (val == SYS_CLCD_ID_SANYO_3_8)
-		panel_name = "Sanyo TM38QV67A02A";
-	else if (val == SYS_CLCD_ID_SANYO_2_5) {
-		panel_name = "Sanyo QVGA Portrait";
-		is_sanyo_2_5_lcd = true;
-	} else if (val == SYS_CLCD_ID_EPSON_2_2)
-		panel_name = "Epson L2F50113T00";
-	else if (val == SYS_CLCD_ID_VGA)
-		panel_name = "VGA";
-	else {
-		printk(KERN_ERR "CLCD: unknown LCD panel ID 0x%08x, using VGA\n",
-			val);
-		panel_name = "VGA";
-	}
-
-	fb->panel = versatile_clcd_get_panel(panel_name);
-	if (!fb->panel)
-		return -EINVAL;
-
-	return versatile_clcd_setup_dma(fb, SZ_1M);
-}
-
-static void versatile_clcd_decode(struct clcd_fb *fb, struct clcd_regs *regs)
-{
-	clcdfb_decode(fb, regs);
-
-	/* Always clear BGR for RGB565: we do the routing externally */
-	if (fb->fb.var.green.length == 6)
-		regs->cntl &= ~CNTL_BGR;
-}
-
-static struct clcd_board clcd_plat_data = {
-	.name		= "Versatile",
-	.caps		= CLCD_CAP_5551 | CLCD_CAP_565 | CLCD_CAP_888,
-	.check		= clcdfb_check,
-	.decode		= versatile_clcd_decode,
-	.disable	= versatile_clcd_disable,
-	.enable		= versatile_clcd_enable,
-	.setup		= versatile_clcd_setup,
-	.mmap		= versatile_clcd_mmap_dma,
-	.remove		= versatile_clcd_remove_dma,
-};
-
 /*
  * Lookup table for attaching a specific name and platform_data pointer to
  * devices as they get created by of_platform_populate().  Ideally this table
@@ -257,7 +101,6 @@ static struct clcd_board clcd_plat_data = {
 struct of_dev_auxdata versatile_auxdata_lookup[] __initdata = {
 	OF_DEV_AUXDATA("arm,primecell", VERSATILE_MMCI0_BASE, "fpga:05", &mmc0_plat_data),
 	OF_DEV_AUXDATA("arm,primecell", VERSATILE_MMCI1_BASE, "fpga:0b", &mmc1_plat_data),
-	OF_DEV_AUXDATA("arm,primecell", VERSATILE_CLCD_BASE, "dev:20", &clcd_plat_data),
 	{}
 };
 
diff --git a/include/linux/platform_data/video-clcd-versatile.h b/include/linux/platform_data/video-clcd-versatile.h
deleted file mode 100644
index 305ebaec3afd..000000000000
--- a/include/linux/platform_data/video-clcd-versatile.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef PLAT_CLCD_H
-#define PLAT_CLCD_H
-
-#ifdef CONFIG_PLAT_VERSATILE_CLCD
-struct clcd_panel *versatile_clcd_get_panel(const char *);
-int versatile_clcd_setup_dma(struct clcd_fb *, unsigned long);
-int versatile_clcd_mmap_dma(struct clcd_fb *, struct vm_area_struct *);
-void versatile_clcd_remove_dma(struct clcd_fb *);
-#else
-static inline struct clcd_panel *versatile_clcd_get_panel(const char *s)
-{
-	return NULL;
-}
-static inline int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize)
-{
-	return -ENODEV;
-}
-static inline int versatile_clcd_mmap_dma(struct clcd_fb *fb, struct vm_area_struct *vm)
-{
-	return -ENODEV;
-}
-static inline void versatile_clcd_remove_dma(struct clcd_fb *fb)
-{
-}
-#endif
-
-#endif
-- 
cgit v1.2.3


From 2b2f7def058a5386838ef4dba70a860285f79e66 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 27 May 2019 04:51:53 -0700
Subject: bus: ti-sysc: Add support for missing clockdomain handling

We need to let ti-sysc driver manage clockdomain autoidle for the
duration of of reset, enable and idle. And we need to do it before we
enable the clock and after we disable it. Currently we are still
relying on platform callbacks indirectly managing clockdomain autoidle.
But I noticed that for device tree only probed drivers it now happens
only after we enabling the clocks and before we disable the clocks,
while it should be the other way around. So far I have not noticed
any issues with this though.

Let's add new ti_sysc_clkdm_deny_idle() and ti_sysc_clkdm_allow_idle()
functions for ti-sysc driver to use to manage clockdomains directly via
platform data callbacks. Note that we can implement the clockdomain
functions in pdata-quirks.c as for probing devices without "ti,hwmods"
custom property we don't need to use the other platform data callbacks.

Let's do this in one patch as there's is still an unlikely chance we
may need to apply this as a fix for v5.2 for dropping legacy platform
data for some devices. We also do have the option of adding back the
platform data if needed in case of trouble.

Tested-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 arch/arm/mach-omap2/omap_hwmod.c      |  39 ++---------
 arch/arm/mach-omap2/pdata-quirks.c    |  60 ++++++++++++++++
 drivers/bus/ti-sysc.c                 | 127 +++++++++++++++++++++++++++-------
 include/linux/platform_data/ti-sysc.h |   8 +++
 4 files changed, 174 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c
index 405ac24def05..932ba221e8e7 100644
--- a/arch/arm/mach-omap2/omap_hwmod.c
+++ b/arch/arm/mach-omap2/omap_hwmod.c
@@ -3445,6 +3445,7 @@ static int omap_hwmod_check_module(struct device *dev,
  * @dev: struct device
  * @oh: module
  * @sysc_fields: sysc register bits
+ * @clockdomain: clockdomain
  * @rev_offs: revision register offset
  * @sysc_offs: sysconfig register offset
  * @syss_offs: sysstatus register offset
@@ -3456,6 +3457,7 @@ static int omap_hwmod_check_module(struct device *dev,
 static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh,
 				      const struct ti_sysc_module_data *data,
 				      struct sysc_regbits *sysc_fields,
+				      struct clockdomain *clkdm,
 				      s32 rev_offs, s32 sysc_offs,
 				      s32 syss_offs, u32 sysc_flags,
 				      u32 idlemodes)
@@ -3463,8 +3465,6 @@ static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh,
 	struct omap_hwmod_class_sysconfig *sysc;
 	struct omap_hwmod_class *class = NULL;
 	struct omap_hwmod_ocp_if *oi = NULL;
-	struct clockdomain *clkdm = NULL;
-	struct clk *clk = NULL;
 	void __iomem *regs = NULL;
 	unsigned long flags;
 
@@ -3511,36 +3511,6 @@ static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh,
 		oi->user = OCP_USER_MPU | OCP_USER_SDMA;
 	}
 
-	if (!oh->_clk) {
-		struct clk_hw_omap *hwclk;
-
-		clk = of_clk_get_by_name(dev->of_node, "fck");
-		if (!IS_ERR(clk))
-			clk_prepare(clk);
-		else
-			clk = NULL;
-
-		/*
-		 * Populate clockdomain based on dts clock. It is needed for
-		 * clkdm_deny_idle() and clkdm_allow_idle() until we have have
-		 * interconnect driver and reset driver capable of blocking
-		 * clockdomain idle during reset, enable and idle.
-		 */
-		if (clk) {
-			hwclk = to_clk_hw_omap(__clk_get_hw(clk));
-			if (hwclk && hwclk->clkdm_name)
-				clkdm = clkdm_lookup(hwclk->clkdm_name);
-		}
-
-		/*
-		 * Note that we assume interconnect driver manages the clocks
-		 * and do not need to populate oh->_clk for dynamically
-		 * allocated modules.
-		 */
-		clk_unprepare(clk);
-		clk_put(clk);
-	}
-
 	spin_lock_irqsave(&oh->_lock, flags);
 	if (regs)
 		oh->_mpu_rt_va = regs;
@@ -3626,7 +3596,7 @@ int omap_hwmod_init_module(struct device *dev,
 	u32 sysc_flags, idlemodes;
 	int error;
 
-	if (!dev || !data)
+	if (!dev || !data || !data->name || !cookie)
 		return -EINVAL;
 
 	oh = _lookup(data->name);
@@ -3697,7 +3667,8 @@ int omap_hwmod_init_module(struct device *dev,
 		return error;
 
 	return omap_hwmod_allocate_module(dev, oh, data, sysc_fields,
-					  rev_offs, sysc_offs, syss_offs,
+					  cookie->clkdm, rev_offs,
+					  sysc_offs, syss_offs,
 					  sysc_flags, idlemodes);
 }
 
diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c
index a2ecc5e69abb..b09cc4e8d240 100644
--- a/arch/arm/mach-omap2/pdata-quirks.c
+++ b/arch/arm/mach-omap2/pdata-quirks.c
@@ -29,6 +29,7 @@
 #include <linux/platform_data/wkup_m3.h>
 #include <linux/platform_data/asoc-ti-mcbsp.h>
 
+#include "clockdomain.h"
 #include "common.h"
 #include "common-board-devices.h"
 #include "control.h"
@@ -463,6 +464,62 @@ static void __init dra7x_evm_mmc_quirk(void)
 }
 #endif
 
+static struct clockdomain *ti_sysc_find_one_clockdomain(struct clk *clk)
+{
+	struct clockdomain *clkdm = NULL;
+	struct clk_hw_omap *hwclk;
+
+	hwclk = to_clk_hw_omap(__clk_get_hw(clk));
+	if (hwclk && hwclk->clkdm_name)
+		clkdm = clkdm_lookup(hwclk->clkdm_name);
+
+	return clkdm;
+}
+
+/**
+ * ti_sysc_clkdm_init - find clockdomain based on clock
+ * @fck: device functional clock
+ * @ick: device interface clock
+ * @dev: struct device
+ *
+ * Populate clockdomain based on clock. It is needed for
+ * clkdm_deny_idle() and clkdm_allow_idle() for blocking clockdomain
+ * clockdomain idle during reset, enable and idle.
+ *
+ * Note that we assume interconnect driver manages the clocks
+ * and do not need to populate oh->_clk for dynamically
+ * allocated modules.
+ */
+static int ti_sysc_clkdm_init(struct device *dev,
+			      struct clk *fck, struct clk *ick,
+			      struct ti_sysc_cookie *cookie)
+{
+	if (fck)
+		cookie->clkdm = ti_sysc_find_one_clockdomain(fck);
+	if (cookie->clkdm)
+		return 0;
+	if (ick)
+		cookie->clkdm = ti_sysc_find_one_clockdomain(ick);
+	if (cookie->clkdm)
+		return 0;
+
+	return -ENODEV;
+}
+
+static void ti_sysc_clkdm_deny_idle(struct device *dev,
+				    const struct ti_sysc_cookie *cookie)
+{
+	if (cookie->clkdm)
+		clkdm_deny_idle(cookie->clkdm);
+}
+
+static void ti_sysc_clkdm_allow_idle(struct device *dev,
+				     const struct ti_sysc_cookie *cookie)
+{
+	if (cookie->clkdm)
+		clkdm_allow_idle(cookie->clkdm);
+}
+
 static int ti_sysc_enable_module(struct device *dev,
 				 const struct ti_sysc_cookie *cookie)
 {
@@ -494,6 +551,9 @@ static struct of_dev_auxdata omap_auxdata_lookup[];
 
 static struct ti_sysc_platform_data ti_sysc_pdata = {
 	.auxdata = omap_auxdata_lookup,
+	.init_clockdomain = ti_sysc_clkdm_init,
+	.clkdm_deny_idle = ti_sysc_clkdm_deny_idle,
+	.clkdm_allow_idle = ti_sysc_clkdm_allow_idle,
 	.init_module = omap_hwmod_init_module,
 	.enable_module = ti_sysc_enable_module,
 	.idle_module = ti_sysc_idle_module,
diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c
index b72741668c92..e86f7850206a 100644
--- a/drivers/bus/ti-sysc.c
+++ b/drivers/bus/ti-sysc.c
@@ -422,6 +422,30 @@ static void sysc_disable_opt_clocks(struct sysc *ddata)
 	}
 }
 
+static void sysc_clkdm_deny_idle(struct sysc *ddata)
+{
+	struct ti_sysc_platform_data *pdata;
+
+	if (ddata->legacy_mode)
+		return;
+
+	pdata = dev_get_platdata(ddata->dev);
+	if (pdata && pdata->clkdm_deny_idle)
+		pdata->clkdm_deny_idle(ddata->dev, &ddata->cookie);
+}
+
+static void sysc_clkdm_allow_idle(struct sysc *ddata)
+{
+	struct ti_sysc_platform_data *pdata;
+
+	if (ddata->legacy_mode)
+		return;
+
+	pdata = dev_get_platdata(ddata->dev);
+	if (pdata && pdata->clkdm_allow_idle)
+		pdata->clkdm_allow_idle(ddata->dev, &ddata->cookie);
+}
+
 /**
  * sysc_init_resets - init rstctrl reset line if configured
  * @ddata: device driver data
@@ -795,6 +819,7 @@ static void sysc_show_registers(struct sysc *ddata)
 
 #define SYSC_IDLE_MASK	(SYSC_NR_IDLEMODES - 1)
 
+/* Caller needs to manage sysc_clkdm_deny_idle() and sysc_clkdm_allow_idle() */
 static int sysc_enable_module(struct device *dev)
 {
 	struct sysc *ddata;
@@ -805,11 +830,6 @@ static int sysc_enable_module(struct device *dev)
 	if (ddata->offsets[SYSC_SYSCONFIG] == -ENODEV)
 		return 0;
 
-	/*
-	 * TODO: Need to prevent clockdomain autoidle?
-	 * See clkdm_deny_idle() in arch/mach-omap2/omap_hwmod.c
-	 */
-
 	regbits = ddata->cap->regbits;
 	reg = sysc_read(ddata, ddata->offsets[SYSC_SYSCONFIG]);
 
@@ -861,6 +881,7 @@ static int sysc_best_idle_mode(u32 idlemodes, u32 *best_mode)
 	return 0;
 }
 
+/* Caller needs to manage sysc_clkdm_deny_idle() and sysc_clkdm_allow_idle() */
 static int sysc_disable_module(struct device *dev)
 {
 	struct sysc *ddata;
@@ -872,11 +893,6 @@ static int sysc_disable_module(struct device *dev)
 	if (ddata->offsets[SYSC_SYSCONFIG] == -ENODEV)
 		return 0;
 
-	/*
-	 * TODO: Need to prevent clockdomain autoidle?
-	 * See clkdm_deny_idle() in arch/mach-omap2/omap_hwmod.c
-	 */
-
 	regbits = ddata->cap->regbits;
 	reg = sysc_read(ddata, ddata->offsets[SYSC_SYSCONFIG]);
 
@@ -966,14 +982,16 @@ static int __maybe_unused sysc_runtime_suspend(struct device *dev)
 	if (!ddata->enabled)
 		return 0;
 
+	sysc_clkdm_deny_idle(ddata);
+
 	if (ddata->legacy_mode) {
 		error = sysc_runtime_suspend_legacy(dev, ddata);
 		if (error)
-			return error;
+			goto err_allow_idle;
 	} else {
 		error = sysc_disable_module(dev);
 		if (error)
-			return error;
+			goto err_allow_idle;
 	}
 
 	sysc_disable_main_clocks(ddata);
@@ -983,6 +1001,9 @@ static int __maybe_unused sysc_runtime_suspend(struct device *dev)
 
 	ddata->enabled = false;
 
+err_allow_idle:
+	sysc_clkdm_allow_idle(ddata);
+
 	return error;
 }
 
@@ -996,10 +1017,12 @@ static int __maybe_unused sysc_runtime_resume(struct device *dev)
 	if (ddata->enabled)
 		return 0;
 
+	sysc_clkdm_deny_idle(ddata);
+
 	if (sysc_opt_clks_needed(ddata)) {
 		error = sysc_enable_opt_clocks(ddata);
 		if (error)
-			return error;
+			goto err_allow_idle;
 	}
 
 	error = sysc_enable_main_clocks(ddata);
@@ -1018,6 +1041,8 @@ static int __maybe_unused sysc_runtime_resume(struct device *dev)
 
 	ddata->enabled = true;
 
+	sysc_clkdm_allow_idle(ddata);
+
 	return 0;
 
 err_main_clocks:
@@ -1025,6 +1050,8 @@ err_main_clocks:
 err_opt_clocks:
 	if (sysc_opt_clks_needed(ddata))
 		sysc_disable_opt_clocks(ddata);
+err_allow_idle:
+	sysc_clkdm_allow_idle(ddata);
 
 	return error;
 }
@@ -1245,6 +1272,33 @@ static void sysc_init_revision_quirks(struct sysc *ddata)
 	}
 }
 
+static int sysc_clockdomain_init(struct sysc *ddata)
+{
+	struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev);
+	struct clk *fck = NULL, *ick = NULL;
+	int error;
+
+	if (!pdata || !pdata->init_clockdomain)
+		return 0;
+
+	switch (ddata->nr_clocks) {
+	case 2:
+		ick = ddata->clocks[SYSC_ICK];
+		/* fallthrough */
+	case 1:
+		fck = ddata->clocks[SYSC_FCK];
+		break;
+	case 0:
+		return 0;
+	}
+
+	error = pdata->init_clockdomain(ddata->dev, fck, ick, &ddata->cookie);
+	if (!error || error == -ENODEV)
+		return 0;
+
+	return error;
+}
+
 /*
  * Note that pdata->init_module() typically does a reset first. After
  * pdata->init_module() is done, PM runtime can be used for the interconnect
@@ -1255,7 +1309,7 @@ static int sysc_legacy_init(struct sysc *ddata)
 	struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev);
 	int error;
 
-	if (!ddata->legacy_mode || !pdata || !pdata->init_module)
+	if (!pdata || !pdata->init_module)
 		return 0;
 
 	error = pdata->init_module(ddata->dev, ddata->mdata, &ddata->cookie);
@@ -1347,7 +1401,13 @@ static int sysc_init_module(struct sysc *ddata)
 	    (SYSC_QUIRK_NO_IDLE | SYSC_QUIRK_NO_IDLE_ON_INIT))
 		manage_clocks = false;
 
+	error = sysc_clockdomain_init(ddata);
+	if (error)
+		return error;
+
 	if (manage_clocks) {
+		sysc_clkdm_deny_idle(ddata);
+
 		error = sysc_enable_opt_clocks(ddata);
 		if (error)
 			return error;
@@ -1360,20 +1420,33 @@ static int sysc_init_module(struct sysc *ddata)
 	ddata->revision = sysc_read_revision(ddata);
 	sysc_init_revision_quirks(ddata);
 
-	error = sysc_legacy_init(ddata);
-	if (error)
-		goto err_main_clocks;
+	if (ddata->legacy_mode) {
+		error = sysc_legacy_init(ddata);
+		if (error)
+			goto err_main_clocks;
+	}
+
+	if (!ddata->legacy_mode && manage_clocks) {
+		error = sysc_enable_module(ddata->dev);
+		if (error)
+			goto err_main_clocks;
+	}
 
 	error = sysc_reset(ddata);
 	if (error)
 		dev_err(ddata->dev, "Reset failed with %d\n", error);
 
+	if (!ddata->legacy_mode && manage_clocks)
+		sysc_disable_module(ddata->dev);
+
 err_main_clocks:
 	if (manage_clocks)
 		sysc_disable_main_clocks(ddata);
 err_opt_clocks:
-	if (manage_clocks)
+	if (manage_clocks) {
 		sysc_disable_opt_clocks(ddata);
+		sysc_clkdm_allow_idle(ddata);
+	}
 
 	return error;
 }
@@ -2012,20 +2085,22 @@ static int sysc_init_pdata(struct sysc *ddata)
 	struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev);
 	struct ti_sysc_module_data *mdata;
 
-	if (!pdata || !ddata->legacy_mode)
+	if (!pdata)
 		return 0;
 
 	mdata = devm_kzalloc(ddata->dev, sizeof(*mdata), GFP_KERNEL);
 	if (!mdata)
 		return -ENOMEM;
 
-	mdata->name = ddata->legacy_mode;
-	mdata->module_pa = ddata->module_pa;
-	mdata->module_size = ddata->module_size;
-	mdata->offsets = ddata->offsets;
-	mdata->nr_offsets = SYSC_MAX_REGS;
-	mdata->cap = ddata->cap;
-	mdata->cfg = &ddata->cfg;
+	if (ddata->legacy_mode) {
+		mdata->name = ddata->legacy_mode;
+		mdata->module_pa = ddata->module_pa;
+		mdata->module_size = ddata->module_size;
+		mdata->offsets = ddata->offsets;
+		mdata->nr_offsets = SYSC_MAX_REGS;
+		mdata->cap = ddata->cap;
+		mdata->cfg = &ddata->cfg;
+	}
 
 	ddata->mdata = mdata;
 
diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h
index 9256c0305968..6626fd31e309 100644
--- a/include/linux/platform_data/ti-sysc.h
+++ b/include/linux/platform_data/ti-sysc.h
@@ -19,6 +19,7 @@ enum ti_sysc_module_type {
 
 struct ti_sysc_cookie {
 	void *data;
+	void *clkdm;
 };
 
 /**
@@ -125,9 +126,16 @@ struct ti_sysc_module_data {
 };
 
 struct device;
+struct clk;
 
 struct ti_sysc_platform_data {
 	struct of_dev_auxdata *auxdata;
+	int (*init_clockdomain)(struct device *dev, struct clk *fck,
+				struct clk *ick, struct ti_sysc_cookie *cookie);
+	void (*clkdm_deny_idle)(struct device *dev,
+				const struct ti_sysc_cookie *cookie);
+	void (*clkdm_allow_idle)(struct device *dev,
+				 const struct ti_sysc_cookie *cookie);
 	int (*init_module)(struct device *dev,
 			   const struct ti_sysc_module_data *data,
 			   struct ti_sysc_cookie *cookie);
-- 
cgit v1.2.3


From e0db94fe87dacd72be0699adcc29e321db7f1689 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 27 May 2019 04:51:53 -0700
Subject: bus: ti-sysc: Make OCP reset work for sysstatus and sysconfig reset
 bits

We've had minimal OCP softreset support in ti-sysc interconnect target
module driver only used for MCAN driver so far. But it turns out that
MCAN has the sysstatus register resetdone bit inverted compared to most
other modules.

Let's make OCP softreset work for other typical cases with reset status
in sysstatus or sysconfig register so we can use the new functions for
sysc_enable_module() and sysc_disable_module() without "ti,hwmods"
property in the following patches.

Tested-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 drivers/bus/ti-sysc.c                 | 72 ++++++++++++++++++++++++++---------
 include/linux/platform_data/ti-sysc.h |  1 +
 2 files changed, 55 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c
index f00997eea207..f4a048430cd1 100644
--- a/drivers/bus/ti-sysc.c
+++ b/drivers/bus/ti-sysc.c
@@ -153,6 +153,26 @@ static u32 sysc_read_revision(struct sysc *ddata)
 	return sysc_read(ddata, offset);
 }
 
+static u32 sysc_read_sysconfig(struct sysc *ddata)
+{
+	int offset = ddata->offsets[SYSC_SYSCONFIG];
+
+	if (offset < 0)
+		return 0;
+
+	return sysc_read(ddata, offset);
+}
+
+static u32 sysc_read_sysstatus(struct sysc *ddata)
+{
+	int offset = ddata->offsets[SYSC_SYSSTATUS];
+
+	if (offset < 0)
+		return 0;
+
+	return sysc_read(ddata, offset);
+}
+
 static int sysc_add_named_clock_from_child(struct sysc *ddata,
 					   const char *name,
 					   const char *optfck_name)
@@ -1369,34 +1389,49 @@ static int sysc_rstctrl_reset_deassert(struct sysc *ddata, bool reset)
 	return reset_control_deassert(ddata->rsts);
 }
 
+/*
+ * Note that the caller must ensure the interconnect target module is enabled
+ * before calling reset. Otherwise reset will not complete.
+ */
 static int sysc_reset(struct sysc *ddata)
 {
-	int offset = ddata->offsets[SYSC_SYSCONFIG];
-	int val;
+	int sysc_offset, syss_offset, sysc_val, rstval, quirks, error = 0;
+	u32 sysc_mask, syss_done;
+
+	sysc_offset = ddata->offsets[SYSC_SYSCONFIG];
+	syss_offset = ddata->offsets[SYSC_SYSSTATUS];
+	quirks = ddata->cfg.quirks;
 
-	if (ddata->legacy_mode || offset < 0 ||
+	if (ddata->legacy_mode || sysc_offset < 0 ||
+	    ddata->cap->regbits->srst_shift < 0 ||
 	    ddata->cfg.quirks & SYSC_QUIRK_NO_RESET_ON_INIT)
 		return 0;
 
-	/*
-	 * Currently only support reset status in sysstatus.
-	 * Warn and return error in all other cases
-	 */
-	if (!ddata->cfg.syss_mask) {
-		dev_err(ddata->dev, "No ti,syss-mask. Reset failed\n");
-		return -EINVAL;
-	}
+	sysc_mask = BIT(ddata->cap->regbits->srst_shift);
 
-	val = sysc_read(ddata, offset);
-	val |= (0x1 << ddata->cap->regbits->srst_shift);
-	sysc_write(ddata, offset, val);
+	if (ddata->cfg.quirks & SYSS_QUIRK_RESETDONE_INVERTED)
+		syss_done = 0;
+	else
+		syss_done = ddata->cfg.syss_mask;
+
+	sysc_val = sysc_read_sysconfig(ddata);
+	sysc_val |= sysc_mask;
+	sysc_write(ddata, sysc_offset, sysc_val);
 
 	/* Poll on reset status */
-	offset = ddata->offsets[SYSC_SYSSTATUS];
+	if (syss_offset >= 0) {
+		error = readx_poll_timeout(sysc_read_sysstatus, ddata, rstval,
+					   (rstval & ddata->cfg.syss_mask) ==
+					   syss_done,
+					   100, MAX_MODULE_SOFTRESET_WAIT);
+
+	} else if (ddata->cfg.quirks & SYSC_QUIRK_RESET_STATUS) {
+		error = readx_poll_timeout(sysc_read_sysconfig, ddata, rstval,
+					   !(rstval & sysc_mask),
+					   100, MAX_MODULE_SOFTRESET_WAIT);
+	}
 
-	return readl_poll_timeout(ddata->module_va + offset, val,
-				  (val & ddata->cfg.syss_mask) == 0x0,
-				  100, MAX_MODULE_SOFTRESET_WAIT);
+	return error;
 }
 
 /*
@@ -2099,6 +2134,7 @@ static const struct sysc_capabilities sysc_dra7_mcan = {
 	.type = TI_SYSC_DRA7_MCAN,
 	.sysc_mask = SYSC_DRA7_MCAN_ENAWAKEUP | SYSC_OMAP4_SOFTRESET,
 	.regbits = &sysc_regbits_dra7_mcan,
+	.mod_quirks = SYSS_QUIRK_RESETDONE_INVERTED,
 };
 
 static int sysc_init_pdata(struct sysc *ddata)
diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h
index 6626fd31e309..8822e99ff813 100644
--- a/include/linux/platform_data/ti-sysc.h
+++ b/include/linux/platform_data/ti-sysc.h
@@ -47,6 +47,7 @@ struct sysc_regbits {
 	s8 emufree_shift;
 };
 
+#define SYSS_QUIRK_RESETDONE_INVERTED	BIT(14)
 #define SYSC_QUIRK_SWSUP_MSTANDBY	BIT(13)
 #define SYSC_QUIRK_SWSUP_SIDLE_ACT	BIT(12)
 #define SYSC_QUIRK_SWSUP_SIDLE		BIT(11)
-- 
cgit v1.2.3


From 1a058c3376765ee31d65e28cbbb9d4ff15120056 Mon Sep 17 00:00:00 2001
From: Oak Zeng <Oak.Zeng@amd.com>
Date: Mon, 6 May 2019 22:11:14 -0500
Subject: drm/amdkfd: New IOCTL to allocate queue GWS

Add a new kfd ioctl to allocate queue GWS. Queue
GWS is released on queue destroy.

Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 27 +++++++++++++++++++++++++++
 include/uapi/linux/kfd_ioctl.h           | 20 +++++++++++++++++++-
 2 files changed, 46 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index c92e931ceb27..aab2aa6c1dee 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1567,6 +1567,31 @@ copy_from_user_failed:
 	return err;
 }
 
+static int kfd_ioctl_alloc_queue_gws(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	int retval;
+	struct kfd_ioctl_alloc_queue_gws_args *args = data;
+	struct kfd_dev *dev = NULL;
+
+	if (!hws_gws_support ||
+		dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
+		return -EINVAL;
+
+	dev = kfd_device_by_id(args->gpu_id);
+	if (!dev) {
+		pr_debug("Could not find gpu id 0x%x\n", args->gpu_id);
+		return -EINVAL;
+	}
+
+	mutex_lock(&p->mutex);
+	retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
+	mutex_unlock(&p->mutex);
+
+	args->first_gws = 0;
+	return retval;
+}
+
 static int kfd_ioctl_get_dmabuf_info(struct file *filep,
 		struct kfd_process *p, void *data)
 {
@@ -1769,6 +1794,8 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF,
 				kfd_ioctl_import_dmabuf, 0),
 
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
+			kfd_ioctl_alloc_queue_gws, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 20917c59f39c..070d1bc7e725 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -410,6 +410,21 @@ struct kfd_ioctl_unmap_memory_from_gpu_args {
 	__u32 n_success;		/* to/from KFD */
 };
 
+/* Allocate GWS for specific queue
+ *
+ * @gpu_id:      device identifier
+ * @queue_id:    queue's id that GWS is allocated for
+ * @num_gws:     how many GWS to allocate
+ * @first_gws:   index of the first GWS allocated.
+ *               only support contiguous GWS allocation
+ */
+struct kfd_ioctl_alloc_queue_gws_args {
+	__u32 gpu_id;		/* to KFD */
+	__u32 queue_id;		/* to KFD */
+	__u32 num_gws;		/* to KFD */
+	__u32 first_gws;	/* from KFD */
+};
+
 struct kfd_ioctl_get_dmabuf_info_args {
 	__u64 size;		/* from KFD */
 	__u64 metadata_ptr;	/* to KFD */
@@ -529,7 +544,10 @@ enum kfd_mmio_remap {
 #define AMDKFD_IOC_IMPORT_DMABUF		\
 		AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args)
 
+#define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
+		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x1E
+#define AMDKFD_COMMAND_END		0x1F
 
 #endif
-- 
cgit v1.2.3


From 0cb93b1503c19f19966402c3ed841626ac9db266 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Mon, 13 May 2019 15:50:10 +0530
Subject: dt-bindings: power: Add rpm power domain bindings for qcs404

Add RPM power domain bindings for the qcs404 family of SoC

Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Jeffrey Hugo <jhugo@codeaurora.org>
Reviewed-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
[sibis: Add supported rpmpd states for qcs404]
Signed-off-by: Sibi Sankar <sibis@codeaurora.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>

Signed-off-by: Andy Gross <agross@kernel.org>
---
 .../devicetree/bindings/power/qcom,rpmpd.txt       |  1 +
 include/dt-bindings/power/qcom-rpmpd.h             | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
index 980e5413d18f..172ccf940c5c 100644
--- a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
+++ b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
@@ -6,6 +6,7 @@ which then translates it into a corresponding voltage on a rail
 Required Properties:
  - compatible: Should be one of the following
 	* qcom,msm8996-rpmpd: RPM Power domain for the msm8996 family of SoC
+	* qcom,qcs404-rpmpd: RPM Power domain for the qcs404 family of SoC
 	* qcom,sdm845-rpmhpd: RPMh Power domain for the sdm845 family of SoC
  - #power-domain-cells: number of cells in Power domain specifier
 	must be 1.
diff --git a/include/dt-bindings/power/qcom-rpmpd.h b/include/dt-bindings/power/qcom-rpmpd.h
index 87d9c6611682..450378662944 100644
--- a/include/dt-bindings/power/qcom-rpmpd.h
+++ b/include/dt-bindings/power/qcom-rpmpd.h
@@ -36,4 +36,26 @@
 #define MSM8996_VDDSSCX		5
 #define MSM8996_VDDSSCX_VFC	6
 
+/* QCS404 Power Domains */
+#define QCS404_VDDMX		0
+#define QCS404_VDDMX_AO		1
+#define QCS404_VDDMX_VFL	2
+#define QCS404_LPICX		3
+#define QCS404_LPICX_VFL	4
+#define QCS404_LPIMX		5
+#define QCS404_LPIMX_VFL	6
+
+/* RPM SMD Power Domain performance levels */
+#define RPM_SMD_LEVEL_RETENTION       16
+#define RPM_SMD_LEVEL_RETENTION_PLUS  32
+#define RPM_SMD_LEVEL_MIN_SVS         48
+#define RPM_SMD_LEVEL_LOW_SVS         64
+#define RPM_SMD_LEVEL_SVS             128
+#define RPM_SMD_LEVEL_SVS_PLUS        192
+#define RPM_SMD_LEVEL_NOM             256
+#define RPM_SMD_LEVEL_NOM_PLUS        320
+#define RPM_SMD_LEVEL_TURBO           384
+#define RPM_SMD_LEVEL_TURBO_NO_CPR    416
+#define RPM_SMD_LEVEL_BINNING         512
+
 #endif
-- 
cgit v1.2.3


From dec9a05a147e4988e1f743c0d0a1389a0552e322 Mon Sep 17 00:00:00 2001
From: Sibi Sankar <sibis@codeaurora.org>
Date: Mon, 13 May 2019 15:50:13 +0530
Subject: dt-bindings: power: Add rpm power domain bindings for msm8998

Add RPM power domain bindings for the msm8998 family of SoC

Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Jeffrey Hugo <jhugo@codeaurora.org>
Reviewed-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Sibi Sankar <sibis@codeaurora.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Andy Gross <agross@kernel.org>
---
 Documentation/devicetree/bindings/power/qcom,rpmpd.txt |  1 +
 include/dt-bindings/power/qcom-rpmpd.h                 | 12 ++++++++++++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
index 172ccf940c5c..eb35b22f9e23 100644
--- a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
+++ b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
@@ -6,6 +6,7 @@ which then translates it into a corresponding voltage on a rail
 Required Properties:
  - compatible: Should be one of the following
 	* qcom,msm8996-rpmpd: RPM Power domain for the msm8996 family of SoC
+	* qcom,msm8998-rpmpd: RPM Power domain for the msm8998 family of SoC
 	* qcom,qcs404-rpmpd: RPM Power domain for the qcs404 family of SoC
 	* qcom,sdm845-rpmhpd: RPMh Power domain for the sdm845 family of SoC
  - #power-domain-cells: number of cells in Power domain specifier
diff --git a/include/dt-bindings/power/qcom-rpmpd.h b/include/dt-bindings/power/qcom-rpmpd.h
index 450378662944..93e36d011527 100644
--- a/include/dt-bindings/power/qcom-rpmpd.h
+++ b/include/dt-bindings/power/qcom-rpmpd.h
@@ -36,6 +36,18 @@
 #define MSM8996_VDDSSCX		5
 #define MSM8996_VDDSSCX_VFC	6
 
+/* MSM8998 Power Domain Indexes */
+#define MSM8998_VDDCX		0
+#define MSM8998_VDDCX_AO	1
+#define MSM8998_VDDCX_VFL	2
+#define MSM8998_VDDMX		3
+#define MSM8998_VDDMX_AO	4
+#define MSM8998_VDDMX_VFL	5
+#define MSM8998_SSCCX		6
+#define MSM8998_SSCCX_VFL	7
+#define MSM8998_SSCMX		8
+#define MSM8998_SSCMX_VFL	9
+
 /* QCS404 Power Domains */
 #define QCS404_VDDMX		0
 #define QCS404_VDDMX_AO		1
-- 
cgit v1.2.3


From 229b4e0728e0a6ddca2645e73696d5b104fbbbfb Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@gmail.com>
Date: Tue, 14 May 2019 22:47:24 +0800
Subject: Documentation: PCI: convert pci.txt to reST

Convert plain text documentation to reStructuredText format and add it to
Sphinx TOC tree.  No essential content change.

Move the description of struct pci_driver and struct pci_device_id into
in-source comments.

Signed-off-by: Changbin Du <changbin.du@gmail.com>
[bhelgaas: fix kernel-doc warnings related to moving descriptions to
linux/pci.h, fix "space tab" whitespace errors in mod_devicetable.h]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/PCI/index.rst     |   2 +
 Documentation/PCI/pci.rst       | 578 ++++++++++++++++++++++++++++++++++++
 Documentation/PCI/pci.txt       | 636 ----------------------------------------
 include/linux/mod_devicetable.h |  29 +-
 include/linux/pci.h             |  48 ++-
 5 files changed, 651 insertions(+), 642 deletions(-)
 create mode 100644 Documentation/PCI/pci.rst
 delete mode 100644 Documentation/PCI/pci.txt

(limited to 'include')

diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst
index c2f8728d11cf..7babf43709b0 100644
--- a/Documentation/PCI/index.rst
+++ b/Documentation/PCI/index.rst
@@ -7,3 +7,5 @@ Linux PCI Bus Subsystem
 .. toctree::
    :maxdepth: 2
    :numbered:
+
+   pci
diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst
new file mode 100644
index 000000000000..6864f9a70f5f
--- /dev/null
+++ b/Documentation/PCI/pci.rst
@@ -0,0 +1,578 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+How To Write Linux PCI Drivers
+==============================
+
+:Authors: - Martin Mares <mj@ucw.cz>
+          - Grant Grundler <grundler@parisc-linux.org>
+
+The world of PCI is vast and full of (mostly unpleasant) surprises.
+Since each CPU architecture implements different chip-sets and PCI devices
+have different requirements (erm, "features"), the result is the PCI support
+in the Linux kernel is not as trivial as one would wish. This short paper
+tries to introduce all potential driver authors to Linux APIs for
+PCI device drivers.
+
+A more complete resource is the third edition of "Linux Device Drivers"
+by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman.
+LDD3 is available for free (under Creative Commons License) from:
+http://lwn.net/Kernel/LDD3/.
+
+However, keep in mind that all documents are subject to "bit rot".
+Refer to the source code if things are not working as described here.
+
+Please send questions/comments/patches about Linux PCI API to the
+"Linux PCI" <linux-pci@atrey.karlin.mff.cuni.cz> mailing list.
+
+
+Structure of PCI drivers
+========================
+PCI drivers "discover" PCI devices in a system via pci_register_driver().
+Actually, it's the other way around. When the PCI generic code discovers
+a new device, the driver with a matching "description" will be notified.
+Details on this below.
+
+pci_register_driver() leaves most of the probing for devices to
+the PCI layer and supports online insertion/removal of devices [thus
+supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver].
+pci_register_driver() call requires passing in a table of function
+pointers and thus dictates the high level structure of a driver.
+
+Once the driver knows about a PCI device and takes ownership, the
+driver generally needs to perform the following initialization:
+
+  - Enable the device
+  - Request MMIO/IOP resources
+  - Set the DMA mask size (for both coherent and streaming DMA)
+  - Allocate and initialize shared control data (pci_allocate_coherent())
+  - Access device configuration space (if needed)
+  - Register IRQ handler (request_irq())
+  - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+  - Enable DMA/processing engines
+
+When done using the device, and perhaps the module needs to be unloaded,
+the driver needs to take the follow steps:
+
+  - Disable the device from generating IRQs
+  - Release the IRQ (free_irq())
+  - Stop all DMA activity
+  - Release DMA buffers (both streaming and coherent)
+  - Unregister from other subsystems (e.g. scsi or netdev)
+  - Release MMIO/IOP resources
+  - Disable the device
+
+Most of these topics are covered in the following sections.
+For the rest look at LDD3 or <linux/pci.h> .
+
+If the PCI subsystem is not configured (CONFIG_PCI is not set), most of
+the PCI functions described below are defined as inline functions either
+completely empty or just returning an appropriate error codes to avoid
+lots of ifdefs in the drivers.
+
+
+pci_register_driver() call
+==========================
+
+PCI device drivers call ``pci_register_driver()`` during their
+initialization with a pointer to a structure describing the driver
+(``struct pci_driver``):
+
+.. kernel-doc:: include/linux/pci.h
+   :functions: pci_driver
+
+The ID table is an array of ``struct pci_device_id`` entries ending with an
+all-zero entry.  Definitions with static const are generally preferred.
+
+.. kernel-doc:: include/linux/mod_devicetable.h
+   :functions: pci_device_id
+
+Most drivers only need ``PCI_DEVICE()`` or ``PCI_DEVICE_CLASS()`` to set up
+a pci_device_id table.
+
+New PCI IDs may be added to a device driver pci_ids table at runtime
+as shown below::
+
+  echo "vendor device subvendor subdevice class class_mask driver_data" > \
+  /sys/bus/pci/drivers/{driver}/new_id
+
+All fields are passed in as hexadecimal values (no leading 0x).
+The vendor and device fields are mandatory, the others are optional. Users
+need pass only as many optional fields as necessary:
+
+  - subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
+  - class and classmask fields default to 0
+  - driver_data defaults to 0UL.
+
+Note that driver_data must match the value used by any of the pci_device_id
+entries defined in the driver. This makes the driver_data field mandatory
+if all the pci_device_id entries have a non-zero driver_data value.
+
+Once added, the driver probe routine will be invoked for any unclaimed
+PCI devices listed in its (newly updated) pci_ids list.
+
+When the driver exits, it just calls pci_unregister_driver() and the PCI layer
+automatically calls the remove hook for all devices handled by the driver.
+
+
+"Attributes" for driver functions/data
+--------------------------------------
+
+Please mark the initialization and cleanup functions where appropriate
+(the corresponding macros are defined in <linux/init.h>):
+
+	======		=================================================
+	__init		Initialization code. Thrown away after the driver
+			initializes.
+	__exit		Exit code. Ignored for non-modular drivers.
+	======		=================================================
+
+Tips on when/where to use the above attributes:
+	- The module_init()/module_exit() functions (and all
+	  initialization functions called _only_ from these)
+	  should be marked __init/__exit.
+
+	- Do not mark the struct pci_driver.
+
+	- Do NOT mark a function if you are not sure which mark to use.
+	  Better to not mark the function than mark the function wrong.
+
+
+How to find PCI devices manually
+================================
+
+PCI drivers should have a really good reason for not using the
+pci_register_driver() interface to search for PCI devices.
+The main reason PCI devices are controlled by multiple drivers
+is because one PCI device implements several different HW services.
+E.g. combined serial/parallel port/floppy controller.
+
+A manual search may be performed using the following constructs:
+
+Searching by vendor and device ID::
+
+	struct pci_dev *dev = NULL;
+	while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev))
+		configure_device(dev);
+
+Searching by class ID (iterate in a similar way)::
+
+	pci_get_class(CLASS_ID, dev)
+
+Searching by both vendor/device and subsystem vendor/device ID::
+
+	pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev).
+
+You can use the constant PCI_ANY_ID as a wildcard replacement for
+VENDOR_ID or DEVICE_ID.  This allows searching for any device from a
+specific vendor, for example.
+
+These functions are hotplug-safe. They increment the reference count on
+the pci_dev that they return. You must eventually (possibly at module unload)
+decrement the reference count on these devices by calling pci_dev_put().
+
+
+Device Initialization Steps
+===========================
+
+As noted in the introduction, most PCI drivers need the following steps
+for device initialization:
+
+  - Enable the device
+  - Request MMIO/IOP resources
+  - Set the DMA mask size (for both coherent and streaming DMA)
+  - Allocate and initialize shared control data (pci_allocate_coherent())
+  - Access device configuration space (if needed)
+  - Register IRQ handler (request_irq())
+  - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+  - Enable DMA/processing engines.
+
+The driver can access PCI config space registers at any time.
+(Well, almost. When running BIST, config space can go away...but
+that will just result in a PCI Bus Master Abort and config reads
+will return garbage).
+
+
+Enable the PCI device
+---------------------
+Before touching any device registers, the driver needs to enable
+the PCI device by calling pci_enable_device(). This will:
+
+  - wake up the device if it was in suspended state,
+  - allocate I/O and memory regions of the device (if BIOS did not),
+  - allocate an IRQ (if BIOS did not).
+
+.. note::
+   pci_enable_device() can fail! Check the return value.
+
+.. warning::
+   OS BUG: we don't check resource allocations before enabling those
+   resources. The sequence would make more sense if we called
+   pci_request_resources() before calling pci_enable_device().
+   Currently, the device drivers can't detect the bug when when two
+   devices have been allocated the same range. This is not a common
+   problem and unlikely to get fixed soon.
+
+   This has been discussed before but not changed as of 2.6.19:
+   http://lkml.org/lkml/2006/3/2/194
+
+
+pci_set_master() will enable DMA by setting the bus master bit
+in the PCI_COMMAND register. It also fixes the latency timer value if
+it's set to something bogus by the BIOS.  pci_clear_master() will
+disable DMA by clearing the bus master bit.
+
+If the PCI device can use the PCI Memory-Write-Invalidate transaction,
+call pci_set_mwi().  This enables the PCI_COMMAND bit for Mem-Wr-Inval
+and also ensures that the cache line size register is set correctly.
+Check the return value of pci_set_mwi() as not all architectures
+or chip-sets may support Memory-Write-Invalidate.  Alternatively,
+if Mem-Wr-Inval would be nice to have but is not required, call
+pci_try_set_mwi() to have the system do its best effort at enabling
+Mem-Wr-Inval.
+
+
+Request MMIO/IOP resources
+--------------------------
+Memory (MMIO), and I/O port addresses should NOT be read directly
+from the PCI device config space. Use the values in the pci_dev structure
+as the PCI "bus address" might have been remapped to a "host physical"
+address by the arch/chip-set specific kernel support.
+
+See Documentation/io-mapping.txt for how to access device registers
+or device memory.
+
+The device driver needs to call pci_request_region() to verify
+no other device is already using the same address resource.
+Conversely, drivers should call pci_release_region() AFTER
+calling pci_disable_device().
+The idea is to prevent two devices colliding on the same address range.
+
+.. tip::
+   See OS BUG comment above. Currently (2.6.19), The driver can only
+   determine MMIO and IO Port resource availability _after_ calling
+   pci_enable_device().
+
+Generic flavors of pci_request_region() are request_mem_region()
+(for MMIO ranges) and request_region() (for IO Port ranges).
+Use these for address resources that are not described by "normal" PCI
+BARs.
+
+Also see pci_request_selected_regions() below.
+
+
+Set the DMA mask size
+---------------------
+.. note::
+   If anything below doesn't make sense, please refer to
+   Documentation/DMA-API.txt. This section is just a reminder that
+   drivers need to indicate DMA capabilities of the device and is not
+   an authoritative source for DMA interfaces.
+
+While all drivers should explicitly indicate the DMA capability
+(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
+32-bit bus master capability for streaming data need the driver
+to "register" this capability by calling pci_set_dma_mask() with
+appropriate parameters.  In general this allows more efficient DMA
+on systems where System RAM exists above 4G _physical_ address.
+
+Drivers for all PCI-X and PCIe compliant devices must call
+pci_set_dma_mask() as they are 64-bit DMA devices.
+
+Similarly, drivers must also "register" this capability if the device
+can directly address "consistent memory" in System RAM above 4G physical
+address by calling pci_set_consistent_dma_mask().
+Again, this includes drivers for all PCI-X and PCIe compliant devices.
+Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
+64-bit DMA capable for payload ("streaming") data but not control
+("consistent") data.
+
+
+Setup shared control data
+-------------------------
+Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
+memory.  See Documentation/DMA-API.txt for a full description of
+the DMA APIs. This section is just a reminder that it needs to be done
+before enabling DMA on the device.
+
+
+Initialize device registers
+---------------------------
+Some drivers will need specific "capability" fields programmed
+or other "vendor specific" register initialized or reset.
+E.g. clearing pending interrupts.
+
+
+Register IRQ handler
+--------------------
+While calling request_irq() is the last step described here,
+this is often just another intermediate step to initialize a device.
+This step can often be deferred until the device is opened for use.
+
+All interrupt handlers for IRQ lines should be registered with IRQF_SHARED
+and use the devid to map IRQs to devices (remember that all PCI IRQ lines
+can be shared).
+
+request_irq() will associate an interrupt handler and device handle
+with an interrupt number. Historically interrupt numbers represent
+IRQ lines which run from the PCI device to the Interrupt controller.
+With MSI and MSI-X (more below) the interrupt number is a CPU "vector".
+
+request_irq() also enables the interrupt. Make sure the device is
+quiesced and does not have any interrupts pending before registering
+the interrupt handler.
+
+MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts"
+which deliver interrupts to the CPU via a DMA write to a Local APIC.
+The fundamental difference between MSI and MSI-X is how multiple
+"vectors" get allocated. MSI requires contiguous blocks of vectors
+while MSI-X can allocate several individual ones.
+
+MSI capability can be enabled by calling pci_alloc_irq_vectors() with the
+PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This
+causes the PCI support to program CPU vector data into the PCI device
+capability registers. Many architectures, chip-sets, or BIOSes do NOT
+support MSI or MSI-X and a call to pci_alloc_irq_vectors with just
+the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always
+specify PCI_IRQ_LEGACY as well.
+
+Drivers that have different interrupt handlers for MSI/MSI-X and
+legacy INTx should chose the right one based on the msi_enabled
+and msix_enabled flags in the pci_dev structure after calling
+pci_alloc_irq_vectors.
+
+There are (at least) two really good reasons for using MSI:
+
+1) MSI is an exclusive interrupt vector by definition.
+   This means the interrupt handler doesn't have to verify
+   its device caused the interrupt.
+
+2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed
+   to be visible to the host CPU(s) when the MSI is delivered. This
+   is important for both data coherency and avoiding stale control data.
+   This guarantee allows the driver to omit MMIO reads to flush
+   the DMA stream.
+
+See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples
+of MSI/MSI-X usage.
+
+
+PCI device shutdown
+===================
+
+When a PCI device driver is being unloaded, most of the following
+steps need to be performed:
+
+  - Disable the device from generating IRQs
+  - Release the IRQ (free_irq())
+  - Stop all DMA activity
+  - Release DMA buffers (both streaming and consistent)
+  - Unregister from other subsystems (e.g. scsi or netdev)
+  - Disable device from responding to MMIO/IO Port addresses
+  - Release MMIO/IO Port resource(s)
+
+
+Stop IRQs on the device
+-----------------------
+How to do this is chip/device specific. If it's not done, it opens
+the possibility of a "screaming interrupt" if (and only if)
+the IRQ is shared with another device.
+
+When the shared IRQ handler is "unhooked", the remaining devices
+using the same IRQ line will still need the IRQ enabled. Thus if the
+"unhooked" device asserts IRQ line, the system will respond assuming
+it was one of the remaining devices asserted the IRQ line. Since none
+of the other devices will handle the IRQ, the system will "hang" until
+it decides the IRQ isn't going to get handled and masks the IRQ (100,000
+iterations later). Once the shared IRQ is masked, the remaining devices
+will stop functioning properly. Not a nice situation.
+
+This is another reason to use MSI or MSI-X if it's available.
+MSI and MSI-X are defined to be exclusive interrupts and thus
+are not susceptible to the "screaming interrupt" problem.
+
+
+Release the IRQ
+---------------
+Once the device is quiesced (no more IRQs), one can call free_irq().
+This function will return control once any pending IRQs are handled,
+"unhook" the drivers IRQ handler from that IRQ, and finally release
+the IRQ if no one else is using it.
+
+
+Stop all DMA activity
+---------------------
+It's extremely important to stop all DMA operations BEFORE attempting
+to deallocate DMA control data. Failure to do so can result in memory
+corruption, hangs, and on some chip-sets a hard crash.
+
+Stopping DMA after stopping the IRQs can avoid races where the
+IRQ handler might restart DMA engines.
+
+While this step sounds obvious and trivial, several "mature" drivers
+didn't get this step right in the past.
+
+
+Release DMA buffers
+-------------------
+Once DMA is stopped, clean up streaming DMA first.
+I.e. unmap data buffers and return buffers to "upstream"
+owners if there is one.
+
+Then clean up "consistent" buffers which contain the control data.
+
+See Documentation/DMA-API.txt for details on unmapping interfaces.
+
+
+Unregister from other subsystems
+--------------------------------
+Most low level PCI device drivers support some other subsystem
+like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
+driver isn't losing resources from that other subsystem.
+If this happens, typically the symptom is an Oops (panic) when
+the subsystem attempts to call into a driver that has been unloaded.
+
+
+Disable Device from responding to MMIO/IO Port addresses
+--------------------------------------------------------
+io_unmap() MMIO or IO Port resources and then call pci_disable_device().
+This is the symmetric opposite of pci_enable_device().
+Do not access device registers after calling pci_disable_device().
+
+
+Release MMIO/IO Port Resource(s)
+--------------------------------
+Call pci_release_region() to mark the MMIO or IO Port range as available.
+Failure to do so usually results in the inability to reload the driver.
+
+
+How to access PCI config space
+==============================
+
+You can use `pci_(read|write)_config_(byte|word|dword)` to access the config
+space of a device represented by `struct pci_dev *`. All these functions return
+0 when successful or an error code (`PCIBIOS_...`) which can be translated to a
+text string by pcibios_strerror. Most drivers expect that accesses to valid PCI
+devices don't fail.
+
+If you don't have a struct pci_dev available, you can call
+`pci_bus_(read|write)_config_(byte|word|dword)` to access a given device
+and function on that bus.
+
+If you access fields in the standard portion of the config header, please
+use symbolic names of locations and bits declared in <linux/pci.h>.
+
+If you need to access Extended PCI Capability registers, just call
+pci_find_capability() for the particular capability and it will find the
+corresponding register block for you.
+
+
+Other interesting functions
+===========================
+
+=============================	================================================
+pci_get_domain_bus_and_slot()	Find pci_dev corresponding to given domain,
+				bus and slot and number. If the device is
+				found, its reference count is increased.
+pci_set_power_state()		Set PCI Power Management state (0=D0 ... 3=D3)
+pci_find_capability()		Find specified capability in device's capability
+				list.
+pci_resource_start()		Returns bus start address for a given PCI region
+pci_resource_end()		Returns bus end address for a given PCI region
+pci_resource_len()		Returns the byte length of a PCI region
+pci_set_drvdata()		Set private driver data pointer for a pci_dev
+pci_get_drvdata()		Return private driver data pointer for a pci_dev
+pci_set_mwi()			Enable Memory-Write-Invalidate transactions.
+pci_clear_mwi()			Disable Memory-Write-Invalidate transactions.
+=============================	================================================
+
+
+Miscellaneous hints
+===================
+
+When displaying PCI device names to the user (for example when a driver wants
+to tell the user what card has it found), please use pci_name(pci_dev).
+
+Always refer to the PCI devices by a pointer to the pci_dev structure.
+All PCI layer functions use this identification and it's the only
+reasonable one. Don't use bus/slot/function numbers except for very
+special purposes -- on systems with multiple primary buses their semantics
+can be pretty complex.
+
+Don't try to turn on Fast Back to Back writes in your driver.  All devices
+on the bus need to be capable of doing it, so this is something which needs
+to be handled by platform and generic code, not individual drivers.
+
+
+Vendor and device identifications
+=================================
+
+Do not add new device or vendor IDs to include/linux/pci_ids.h unless they
+are shared across multiple drivers.  You can add private definitions in
+your driver if they're helpful, or just use plain hex constants.
+
+The device IDs are arbitrary hex numbers (vendor controlled) and normally used
+only in a single location, the pci_device_id table.
+
+Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/.
+There are mirrors of the pci.ids file at http://pciids.sourceforge.net/
+and https://github.com/pciutils/pciids.
+
+
+Obsolete functions
+==================
+
+There are several functions which you might come across when trying to
+port an old driver to the new PCI interface.  They are no longer present
+in the kernel as they aren't compatible with hotplug or PCI domains or
+having sane locking.
+
+=================	===========================================
+pci_find_device()	Superseded by pci_get_device()
+pci_find_subsys()	Superseded by pci_get_subsys()
+pci_find_slot()		Superseded by pci_get_domain_bus_and_slot()
+pci_get_slot()		Superseded by pci_get_domain_bus_and_slot()
+=================	===========================================
+
+The alternative is the traditional PCI device driver that walks PCI
+device lists. This is still possible but discouraged.
+
+
+MMIO Space and "Write Posting"
+==============================
+
+Converting a driver from using I/O Port space to using MMIO space
+often requires some additional changes. Specifically, "write posting"
+needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2)
+already do this. I/O Port space guarantees write transactions reach the PCI
+device before the CPU can continue. Writes to MMIO space allow the CPU
+to continue before the transaction reaches the PCI device. HW weenies
+call this "Write Posting" because the write completion is "posted" to
+the CPU before the transaction has reached its destination.
+
+Thus, timing sensitive code should add readl() where the CPU is
+expected to wait before doing other work.  The classic "bit banging"
+sequence works fine for I/O Port space::
+
+       for (i = 8; --i; val >>= 1) {
+               outb(val & 1, ioport_reg);      /* write bit */
+               udelay(10);
+       }
+
+The same sequence for MMIO space should be::
+
+       for (i = 8; --i; val >>= 1) {
+               writeb(val & 1, mmio_reg);      /* write bit */
+               readb(safe_mmio_reg);           /* flush posted write */
+               udelay(10);
+       }
+
+It is important that "safe_mmio_reg" not have any side effects that
+interferes with the correct operation of the device.
+
+Another case to watch out for is when resetting a PCI device. Use PCI
+Configuration space reads to flush the writel(). This will gracefully
+handle the PCI master abort on all platforms if the PCI device is
+expected to not respond to a readl().  Most x86 platforms will allow
+MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage
+(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail").
diff --git a/Documentation/PCI/pci.txt b/Documentation/PCI/pci.txt
deleted file mode 100644
index badb26ac33dc..000000000000
--- a/Documentation/PCI/pci.txt
+++ /dev/null
@@ -1,636 +0,0 @@
-
-			How To Write Linux PCI Drivers
-
-		by Martin Mares <mj@ucw.cz> on 07-Feb-2000
-	updated by Grant Grundler <grundler@parisc-linux.org> on 23-Dec-2006
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The world of PCI is vast and full of (mostly unpleasant) surprises.
-Since each CPU architecture implements different chip-sets and PCI devices
-have different requirements (erm, "features"), the result is the PCI support
-in the Linux kernel is not as trivial as one would wish. This short paper
-tries to introduce all potential driver authors to Linux APIs for
-PCI device drivers.
-
-A more complete resource is the third edition of "Linux Device Drivers"
-by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman.
-LDD3 is available for free (under Creative Commons License) from:
-
-	http://lwn.net/Kernel/LDD3/
-
-However, keep in mind that all documents are subject to "bit rot".
-Refer to the source code if things are not working as described here.
-
-Please send questions/comments/patches about Linux PCI API to the
-"Linux PCI" <linux-pci@atrey.karlin.mff.cuni.cz> mailing list.
-
-
-
-0. Structure of PCI drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-PCI drivers "discover" PCI devices in a system via pci_register_driver().
-Actually, it's the other way around. When the PCI generic code discovers
-a new device, the driver with a matching "description" will be notified.
-Details on this below.
-
-pci_register_driver() leaves most of the probing for devices to
-the PCI layer and supports online insertion/removal of devices [thus
-supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver].
-pci_register_driver() call requires passing in a table of function
-pointers and thus dictates the high level structure of a driver.
-
-Once the driver knows about a PCI device and takes ownership, the
-driver generally needs to perform the following initialization:
-
-	Enable the device
-	Request MMIO/IOP resources
-	Set the DMA mask size (for both coherent and streaming DMA)
-	Allocate and initialize shared control data (pci_allocate_coherent())
-	Access device configuration space (if needed)
-	Register IRQ handler (request_irq())
-	Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
-	Enable DMA/processing engines
-
-When done using the device, and perhaps the module needs to be unloaded,
-the driver needs to take the follow steps:
-	Disable the device from generating IRQs
-	Release the IRQ (free_irq())
-	Stop all DMA activity
-	Release DMA buffers (both streaming and coherent)
-	Unregister from other subsystems (e.g. scsi or netdev)
-	Release MMIO/IOP resources
-	Disable the device
-
-Most of these topics are covered in the following sections.
-For the rest look at LDD3 or <linux/pci.h> .
-
-If the PCI subsystem is not configured (CONFIG_PCI is not set), most of
-the PCI functions described below are defined as inline functions either
-completely empty or just returning an appropriate error codes to avoid
-lots of ifdefs in the drivers.
-
-
-
-1. pci_register_driver() call
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PCI device drivers call pci_register_driver() during their
-initialization with a pointer to a structure describing the driver
-(struct pci_driver):
-
-	field name	Description
-	----------	------------------------------------------------------
-	id_table	Pointer to table of device ID's the driver is
-			interested in.  Most drivers should export this
-			table using MODULE_DEVICE_TABLE(pci,...).
-
-	probe		This probing function gets called (during execution
-			of pci_register_driver() for already existing
-			devices or later if a new device gets inserted) for
-			all PCI devices which match the ID table and are not
-			"owned" by the other drivers yet. This function gets
-			passed a "struct pci_dev *" for each device whose
-			entry in the ID table matches the device. The probe
-			function returns zero when the driver chooses to
-			take "ownership" of the device or an error code
-			(negative number) otherwise.
-			The probe function always gets called from process
-			context, so it can sleep.
-
-	remove		The remove() function gets called whenever a device
-			being handled by this driver is removed (either during
-			deregistration of the driver or when it's manually
-			pulled out of a hot-pluggable slot).
-			The remove function always gets called from process
-			context, so it can sleep.
-
-	suspend		Put device into low power state.
-	suspend_late	Put device into low power state.
-
-	resume_early	Wake device from low power state.
-	resume		Wake device from low power state.
-
-		(Please see Documentation/power/pci.txt for descriptions
-		of PCI Power Management and the related functions.)
-
-	shutdown	Hook into reboot_notifier_list (kernel/sys.c).
-			Intended to stop any idling DMA operations.
-			Useful for enabling wake-on-lan (NIC) or changing
-			the power state of a device before reboot.
-			e.g. drivers/net/e100.c.
-
-	err_handler	See Documentation/PCI/pci-error-recovery.txt
-
-
-The ID table is an array of struct pci_device_id entries ending with an
-all-zero entry.  Definitions with static const are generally preferred.
-
-Each entry consists of:
-
-	vendor,device	Vendor and device ID to match (or PCI_ANY_ID)
-
-	subvendor,	Subsystem vendor and device ID to match (or PCI_ANY_ID)
-	subdevice,
-
-	class		Device class, subclass, and "interface" to match.
-			See Appendix D of the PCI Local Bus Spec or
-			include/linux/pci_ids.h for a full list of classes.
-			Most drivers do not need to specify class/class_mask
-			as vendor/device is normally sufficient.
-
-	class_mask	limit which sub-fields of the class field are compared.
-			See drivers/scsi/sym53c8xx_2/ for example of usage.
-
-	driver_data	Data private to the driver.
-			Most drivers don't need to use driver_data field.
-			Best practice is to use driver_data as an index
-			into a static list of equivalent device types,
-			instead of using it as a pointer.
-
-
-Most drivers only need PCI_DEVICE() or PCI_DEVICE_CLASS() to set up
-a pci_device_id table.
-
-New PCI IDs may be added to a device driver pci_ids table at runtime
-as shown below:
-
-echo "vendor device subvendor subdevice class class_mask driver_data" > \
-/sys/bus/pci/drivers/{driver}/new_id
-
-All fields are passed in as hexadecimal values (no leading 0x).
-The vendor and device fields are mandatory, the others are optional. Users
-need pass only as many optional fields as necessary:
-	o subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
-	o class and classmask fields default to 0
-	o driver_data defaults to 0UL.
-
-Note that driver_data must match the value used by any of the pci_device_id
-entries defined in the driver. This makes the driver_data field mandatory
-if all the pci_device_id entries have a non-zero driver_data value.
-
-Once added, the driver probe routine will be invoked for any unclaimed
-PCI devices listed in its (newly updated) pci_ids list.
-
-When the driver exits, it just calls pci_unregister_driver() and the PCI layer
-automatically calls the remove hook for all devices handled by the driver.
-
-
-1.1 "Attributes" for driver functions/data
-
-Please mark the initialization and cleanup functions where appropriate
-(the corresponding macros are defined in <linux/init.h>):
-
-	__init		Initialization code. Thrown away after the driver
-			initializes.
-	__exit		Exit code. Ignored for non-modular drivers.
-
-Tips on when/where to use the above attributes:
-	o The module_init()/module_exit() functions (and all
-	  initialization functions called _only_ from these)
-	  should be marked __init/__exit.
-
-	o Do not mark the struct pci_driver.
-
-	o Do NOT mark a function if you are not sure which mark to use.
-	  Better to not mark the function than mark the function wrong.
-
-
-
-2. How to find PCI devices manually
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PCI drivers should have a really good reason for not using the
-pci_register_driver() interface to search for PCI devices.
-The main reason PCI devices are controlled by multiple drivers
-is because one PCI device implements several different HW services.
-E.g. combined serial/parallel port/floppy controller.
-
-A manual search may be performed using the following constructs:
-
-Searching by vendor and device ID:
-
-	struct pci_dev *dev = NULL;
-	while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev))
-		configure_device(dev);
-
-Searching by class ID (iterate in a similar way):
-
-	pci_get_class(CLASS_ID, dev)
-
-Searching by both vendor/device and subsystem vendor/device ID:
-
-	pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev).
-
-You can use the constant PCI_ANY_ID as a wildcard replacement for
-VENDOR_ID or DEVICE_ID.  This allows searching for any device from a
-specific vendor, for example.
-
-These functions are hotplug-safe. They increment the reference count on
-the pci_dev that they return. You must eventually (possibly at module unload)
-decrement the reference count on these devices by calling pci_dev_put().
-
-
-
-3. Device Initialization Steps
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-As noted in the introduction, most PCI drivers need the following steps
-for device initialization:
-
-	Enable the device
-	Request MMIO/IOP resources
-	Set the DMA mask size (for both coherent and streaming DMA)
-	Allocate and initialize shared control data (pci_allocate_coherent())
-	Access device configuration space (if needed)
-	Register IRQ handler (request_irq())
-	Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
-	Enable DMA/processing engines.
-
-The driver can access PCI config space registers at any time.
-(Well, almost. When running BIST, config space can go away...but
-that will just result in a PCI Bus Master Abort and config reads
-will return garbage).
-
-
-3.1 Enable the PCI device
-~~~~~~~~~~~~~~~~~~~~~~~~~
-Before touching any device registers, the driver needs to enable
-the PCI device by calling pci_enable_device(). This will:
-	o wake up the device if it was in suspended state,
-	o allocate I/O and memory regions of the device (if BIOS did not),
-	o allocate an IRQ (if BIOS did not).
-
-NOTE: pci_enable_device() can fail! Check the return value.
-
-[ OS BUG: we don't check resource allocations before enabling those
-  resources. The sequence would make more sense if we called
-  pci_request_resources() before calling pci_enable_device().
-  Currently, the device drivers can't detect the bug when when two
-  devices have been allocated the same range. This is not a common
-  problem and unlikely to get fixed soon.
-
-  This has been discussed before but not changed as of 2.6.19:
-	http://lkml.org/lkml/2006/3/2/194
-]
-
-pci_set_master() will enable DMA by setting the bus master bit
-in the PCI_COMMAND register. It also fixes the latency timer value if
-it's set to something bogus by the BIOS.  pci_clear_master() will
-disable DMA by clearing the bus master bit.
-
-If the PCI device can use the PCI Memory-Write-Invalidate transaction,
-call pci_set_mwi().  This enables the PCI_COMMAND bit for Mem-Wr-Inval
-and also ensures that the cache line size register is set correctly.
-Check the return value of pci_set_mwi() as not all architectures
-or chip-sets may support Memory-Write-Invalidate.  Alternatively,
-if Mem-Wr-Inval would be nice to have but is not required, call
-pci_try_set_mwi() to have the system do its best effort at enabling
-Mem-Wr-Inval.
-
-
-3.2 Request MMIO/IOP resources
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Memory (MMIO), and I/O port addresses should NOT be read directly
-from the PCI device config space. Use the values in the pci_dev structure
-as the PCI "bus address" might have been remapped to a "host physical"
-address by the arch/chip-set specific kernel support.
-
-See Documentation/io-mapping.txt for how to access device registers
-or device memory.
-
-The device driver needs to call pci_request_region() to verify
-no other device is already using the same address resource.
-Conversely, drivers should call pci_release_region() AFTER
-calling pci_disable_device().
-The idea is to prevent two devices colliding on the same address range.
-
-[ See OS BUG comment above. Currently (2.6.19), The driver can only
-  determine MMIO and IO Port resource availability _after_ calling
-  pci_enable_device(). ]
-
-Generic flavors of pci_request_region() are request_mem_region()
-(for MMIO ranges) and request_region() (for IO Port ranges).
-Use these for address resources that are not described by "normal" PCI
-BARs.
-
-Also see pci_request_selected_regions() below.
-
-
-3.3 Set the DMA mask size
-~~~~~~~~~~~~~~~~~~~~~~~~~
-[ If anything below doesn't make sense, please refer to
-  Documentation/DMA-API.txt. This section is just a reminder that
-  drivers need to indicate DMA capabilities of the device and is not
-  an authoritative source for DMA interfaces. ]
-
-While all drivers should explicitly indicate the DMA capability
-(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
-32-bit bus master capability for streaming data need the driver
-to "register" this capability by calling pci_set_dma_mask() with
-appropriate parameters.  In general this allows more efficient DMA
-on systems where System RAM exists above 4G _physical_ address.
-
-Drivers for all PCI-X and PCIe compliant devices must call
-pci_set_dma_mask() as they are 64-bit DMA devices.
-
-Similarly, drivers must also "register" this capability if the device
-can directly address "consistent memory" in System RAM above 4G physical
-address by calling pci_set_consistent_dma_mask().
-Again, this includes drivers for all PCI-X and PCIe compliant devices.
-Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
-64-bit DMA capable for payload ("streaming") data but not control
-("consistent") data.
-
-
-3.4 Setup shared control data
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
-memory.  See Documentation/DMA-API.txt for a full description of
-the DMA APIs. This section is just a reminder that it needs to be done
-before enabling DMA on the device.
-
-
-3.5 Initialize device registers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Some drivers will need specific "capability" fields programmed
-or other "vendor specific" register initialized or reset.
-E.g. clearing pending interrupts.
-
-
-3.6 Register IRQ handler
-~~~~~~~~~~~~~~~~~~~~~~~~
-While calling request_irq() is the last step described here,
-this is often just another intermediate step to initialize a device.
-This step can often be deferred until the device is opened for use.
-
-All interrupt handlers for IRQ lines should be registered with IRQF_SHARED
-and use the devid to map IRQs to devices (remember that all PCI IRQ lines
-can be shared).
-
-request_irq() will associate an interrupt handler and device handle
-with an interrupt number. Historically interrupt numbers represent
-IRQ lines which run from the PCI device to the Interrupt controller.
-With MSI and MSI-X (more below) the interrupt number is a CPU "vector".
-
-request_irq() also enables the interrupt. Make sure the device is
-quiesced and does not have any interrupts pending before registering
-the interrupt handler.
-
-MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts"
-which deliver interrupts to the CPU via a DMA write to a Local APIC.
-The fundamental difference between MSI and MSI-X is how multiple
-"vectors" get allocated. MSI requires contiguous blocks of vectors
-while MSI-X can allocate several individual ones.
-
-MSI capability can be enabled by calling pci_alloc_irq_vectors() with the
-PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This
-causes the PCI support to program CPU vector data into the PCI device
-capability registers. Many architectures, chip-sets, or BIOSes do NOT
-support MSI or MSI-X and a call to pci_alloc_irq_vectors with just
-the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always
-specify PCI_IRQ_LEGACY as well.
-
-Drivers that have different interrupt handlers for MSI/MSI-X and
-legacy INTx should chose the right one based on the msi_enabled
-and msix_enabled flags in the pci_dev structure after calling
-pci_alloc_irq_vectors.
-
-There are (at least) two really good reasons for using MSI:
-1) MSI is an exclusive interrupt vector by definition.
-   This means the interrupt handler doesn't have to verify
-   its device caused the interrupt.
-
-2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed
-   to be visible to the host CPU(s) when the MSI is delivered. This
-   is important for both data coherency and avoiding stale control data.
-   This guarantee allows the driver to omit MMIO reads to flush
-   the DMA stream.
-
-See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples
-of MSI/MSI-X usage.
-
-
-
-4. PCI device shutdown
-~~~~~~~~~~~~~~~~~~~~~~~
-
-When a PCI device driver is being unloaded, most of the following
-steps need to be performed:
-
-	Disable the device from generating IRQs
-	Release the IRQ (free_irq())
-	Stop all DMA activity
-	Release DMA buffers (both streaming and consistent)
-	Unregister from other subsystems (e.g. scsi or netdev)
-	Disable device from responding to MMIO/IO Port addresses
-	Release MMIO/IO Port resource(s)
-
-
-4.1 Stop IRQs on the device
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-How to do this is chip/device specific. If it's not done, it opens
-the possibility of a "screaming interrupt" if (and only if)
-the IRQ is shared with another device.
-
-When the shared IRQ handler is "unhooked", the remaining devices
-using the same IRQ line will still need the IRQ enabled. Thus if the
-"unhooked" device asserts IRQ line, the system will respond assuming
-it was one of the remaining devices asserted the IRQ line. Since none
-of the other devices will handle the IRQ, the system will "hang" until
-it decides the IRQ isn't going to get handled and masks the IRQ (100,000
-iterations later). Once the shared IRQ is masked, the remaining devices
-will stop functioning properly. Not a nice situation.
-
-This is another reason to use MSI or MSI-X if it's available.
-MSI and MSI-X are defined to be exclusive interrupts and thus
-are not susceptible to the "screaming interrupt" problem.
-
-
-4.2 Release the IRQ
-~~~~~~~~~~~~~~~~~~~
-Once the device is quiesced (no more IRQs), one can call free_irq().
-This function will return control once any pending IRQs are handled,
-"unhook" the drivers IRQ handler from that IRQ, and finally release
-the IRQ if no one else is using it.
-
-
-4.3 Stop all DMA activity
-~~~~~~~~~~~~~~~~~~~~~~~~~
-It's extremely important to stop all DMA operations BEFORE attempting
-to deallocate DMA control data. Failure to do so can result in memory
-corruption, hangs, and on some chip-sets a hard crash.
-
-Stopping DMA after stopping the IRQs can avoid races where the
-IRQ handler might restart DMA engines.
-
-While this step sounds obvious and trivial, several "mature" drivers
-didn't get this step right in the past.
-
-
-4.4 Release DMA buffers
-~~~~~~~~~~~~~~~~~~~~~~~
-Once DMA is stopped, clean up streaming DMA first.
-I.e. unmap data buffers and return buffers to "upstream"
-owners if there is one.
-
-Then clean up "consistent" buffers which contain the control data.
-
-See Documentation/DMA-API.txt for details on unmapping interfaces.
-
-
-4.5 Unregister from other subsystems
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Most low level PCI device drivers support some other subsystem
-like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
-driver isn't losing resources from that other subsystem.
-If this happens, typically the symptom is an Oops (panic) when
-the subsystem attempts to call into a driver that has been unloaded.
-
-
-4.6 Disable Device from responding to MMIO/IO Port addresses
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-io_unmap() MMIO or IO Port resources and then call pci_disable_device().
-This is the symmetric opposite of pci_enable_device().
-Do not access device registers after calling pci_disable_device().
-
-
-4.7 Release MMIO/IO Port Resource(s)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Call pci_release_region() to mark the MMIO or IO Port range as available.
-Failure to do so usually results in the inability to reload the driver.
-
-
-
-5. How to access PCI config space
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-You can use pci_(read|write)_config_(byte|word|dword) to access the config
-space of a device represented by struct pci_dev *. All these functions return 0
-when successful or an error code (PCIBIOS_...) which can be translated to a text
-string by pcibios_strerror. Most drivers expect that accesses to valid PCI
-devices don't fail.
-
-If you don't have a struct pci_dev available, you can call
-pci_bus_(read|write)_config_(byte|word|dword) to access a given device
-and function on that bus.
-
-If you access fields in the standard portion of the config header, please
-use symbolic names of locations and bits declared in <linux/pci.h>.
-
-If you need to access Extended PCI Capability registers, just call
-pci_find_capability() for the particular capability and it will find the
-corresponding register block for you.
-
-
-
-6. Other interesting functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-pci_get_domain_bus_and_slot()	Find pci_dev corresponding to given domain,
-				bus and slot and number. If the device is
-				found, its reference count is increased.
-pci_set_power_state()		Set PCI Power Management state (0=D0 ... 3=D3)
-pci_find_capability()		Find specified capability in device's capability
-				list.
-pci_resource_start()		Returns bus start address for a given PCI region
-pci_resource_end()		Returns bus end address for a given PCI region
-pci_resource_len()		Returns the byte length of a PCI region
-pci_set_drvdata()		Set private driver data pointer for a pci_dev
-pci_get_drvdata()		Return private driver data pointer for a pci_dev
-pci_set_mwi()			Enable Memory-Write-Invalidate transactions.
-pci_clear_mwi()			Disable Memory-Write-Invalidate transactions.
-
-
-
-7. Miscellaneous hints
-~~~~~~~~~~~~~~~~~~~~~~
-
-When displaying PCI device names to the user (for example when a driver wants
-to tell the user what card has it found), please use pci_name(pci_dev).
-
-Always refer to the PCI devices by a pointer to the pci_dev structure.
-All PCI layer functions use this identification and it's the only
-reasonable one. Don't use bus/slot/function numbers except for very
-special purposes -- on systems with multiple primary buses their semantics
-can be pretty complex.
-
-Don't try to turn on Fast Back to Back writes in your driver.  All devices
-on the bus need to be capable of doing it, so this is something which needs
-to be handled by platform and generic code, not individual drivers.
-
-
-
-8. Vendor and device identifications
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Do not add new device or vendor IDs to include/linux/pci_ids.h unless they
-are shared across multiple drivers.  You can add private definitions in
-your driver if they're helpful, or just use plain hex constants.
-
-The device IDs are arbitrary hex numbers (vendor controlled) and normally used
-only in a single location, the pci_device_id table.
-
-Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/.
-There are mirrors of the pci.ids file at http://pciids.sourceforge.net/
-and https://github.com/pciutils/pciids.
-
-
-
-9. Obsolete functions
-~~~~~~~~~~~~~~~~~~~~~
-
-There are several functions which you might come across when trying to
-port an old driver to the new PCI interface.  They are no longer present
-in the kernel as they aren't compatible with hotplug or PCI domains or
-having sane locking.
-
-pci_find_device()	Superseded by pci_get_device()
-pci_find_subsys()	Superseded by pci_get_subsys()
-pci_find_slot()		Superseded by pci_get_domain_bus_and_slot()
-pci_get_slot()		Superseded by pci_get_domain_bus_and_slot()
-
-
-The alternative is the traditional PCI device driver that walks PCI
-device lists. This is still possible but discouraged.
-
-
-
-10. MMIO Space and "Write Posting"
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Converting a driver from using I/O Port space to using MMIO space
-often requires some additional changes. Specifically, "write posting"
-needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2)
-already do this. I/O Port space guarantees write transactions reach the PCI
-device before the CPU can continue. Writes to MMIO space allow the CPU
-to continue before the transaction reaches the PCI device. HW weenies
-call this "Write Posting" because the write completion is "posted" to
-the CPU before the transaction has reached its destination.
-
-Thus, timing sensitive code should add readl() where the CPU is
-expected to wait before doing other work.  The classic "bit banging"
-sequence works fine for I/O Port space:
-
-       for (i = 8; --i; val >>= 1) {
-               outb(val & 1, ioport_reg);      /* write bit */
-               udelay(10);
-       }
-
-The same sequence for MMIO space should be:
-
-       for (i = 8; --i; val >>= 1) {
-               writeb(val & 1, mmio_reg);      /* write bit */
-               readb(safe_mmio_reg);           /* flush posted write */
-               udelay(10);
-       }
-
-It is important that "safe_mmio_reg" not have any side effects that
-interferes with the correct operation of the device.
-
-Another case to watch out for is when resetting a PCI device. Use PCI
-Configuration space reads to flush the writel(). This will gracefully
-handle the PCI master abort on all platforms if the PCI device is
-expected to not respond to a readl().  Most x86 platforms will allow
-MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage
-(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail").
-
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 448621c32e4d..664c0fb1d53d 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -16,6 +16,25 @@ typedef unsigned long kernel_ulong_t;
 
 #define PCI_ANY_ID (~0)
 
+/**
+ * struct pci_device_id - PCI device ID structure
+ * @vendor:		Vendor ID to match (or PCI_ANY_ID)
+ * @device:		Device ID to match (or PCI_ANY_ID)
+ * @subvendor:		Subsystem vendor ID to match (or PCI_ANY_ID)
+ * @subdevice:		Subsystem device ID to match (or PCI_ANY_ID)
+ * @class:		Device class, subclass, and "interface" to match.
+ *			See Appendix D of the PCI Local Bus Spec or
+ *			include/linux/pci_ids.h for a full list of classes.
+ *			Most drivers do not need to specify class/class_mask
+ *			as vendor/device is normally sufficient.
+ * @class_mask:		Limit which sub-fields of the class field are compared.
+ *			See drivers/scsi/sym53c8xx_2/ for example of usage.
+ * @driver_data:	Data private to the driver.
+ *			Most drivers don't need to use driver_data field.
+ *			Best practice is to use driver_data as an index
+ *			into a static list of equivalent device types,
+ *			instead of using it as a pointer.
+ */
 struct pci_device_id {
 	__u32 vendor, device;		/* Vendor and device ID or PCI_ANY_ID*/
 	__u32 subvendor, subdevice;	/* Subsystem ID's or PCI_ANY_ID */
@@ -257,17 +276,17 @@ struct pcmcia_device_id {
 	__u16		match_flags;
 
 	__u16		manf_id;
-	__u16 		card_id;
+	__u16		card_id;
 
-	__u8  		func_id;
+	__u8		func_id;
 
 	/* for real multi-function devices */
-	__u8  		function;
+	__u8		function;
 
 	/* for pseudo multi-function devices */
-	__u8  		device_no;
+	__u8		device_no;
 
-	__u32 		prod_id_hash[4];
+	__u32		prod_id_hash[4];
 
 	/* not matched against in kernelspace */
 	const char *	prod_id[4];
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4a5a84d7bdd4..b74b2a4e6df2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -151,6 +151,8 @@ static inline const char *pci_power_name(pci_power_t state)
 #define PCI_PM_BUS_WAIT		50
 
 /**
+ * typedef pci_channel_state_t
+ *
  * The pci_channel state describes connectivity between the CPU and
  * the PCI device.  If some PCI bus between here and the PCI device
  * has crashed or locked up, this info is reflected here.
@@ -775,6 +777,50 @@ struct pci_error_handlers {
 
 
 struct module;
+
+/**
+ * struct pci_driver - PCI driver structure
+ * @node:	List of driver structures.
+ * @name:	Driver name.
+ * @id_table:	Pointer to table of device IDs the driver is
+ *		interested in.  Most drivers should export this
+ *		table using MODULE_DEVICE_TABLE(pci,...).
+ * @probe:	This probing function gets called (during execution
+ *		of pci_register_driver() for already existing
+ *		devices or later if a new device gets inserted) for
+ *		all PCI devices which match the ID table and are not
+ *		"owned" by the other drivers yet. This function gets
+ *		passed a "struct pci_dev \*" for each device whose
+ *		entry in the ID table matches the device. The probe
+ *		function returns zero when the driver chooses to
+ *		take "ownership" of the device or an error code
+ *		(negative number) otherwise.
+ *		The probe function always gets called from process
+ *		context, so it can sleep.
+ * @remove:	The remove() function gets called whenever a device
+ *		being handled by this driver is removed (either during
+ *		deregistration of the driver or when it's manually
+ *		pulled out of a hot-pluggable slot).
+ *		The remove function always gets called from process
+ *		context, so it can sleep.
+ * @suspend:	Put device into low power state.
+ * @suspend_late: Put device into low power state.
+ * @resume_early: Wake device from low power state.
+ * @resume:	Wake device from low power state.
+ *		(Please see Documentation/power/pci.txt for descriptions
+ *		of PCI Power Management and the related functions.)
+ * @shutdown:	Hook into reboot_notifier_list (kernel/sys.c).
+ *		Intended to stop any idling DMA operations.
+ *		Useful for enabling wake-on-lan (NIC) or changing
+ *		the power state of a device before reboot.
+ *		e.g. drivers/net/e100.c.
+ * @sriov_configure: Optional driver callback to allow configuration of
+ *		number of VFs to enable via sysfs "sriov_numvfs" file.
+ * @err_handler: See Documentation/PCI/pci-error-recovery.rst
+ * @groups:	Sysfs attribute groups.
+ * @driver:	Driver model structure.
+ * @dynids:	List of dynamically added device IDs.
+ */
 struct pci_driver {
 	struct list_head	node;
 	const char		*name;
@@ -2206,7 +2252,7 @@ static inline u8 pci_vpd_srdt_tag(const u8 *srdt)
 
 /**
  * pci_vpd_info_field_size - Extracts the information field length
- * @lrdt: Pointer to the beginning of an information field header
+ * @info_field: Pointer to the beginning of an information field header
  *
  * Returns the extracted information field length.
  */
-- 
cgit v1.2.3


From 6e58ab7ac7fac61acd7705a8abf1632462c1512a Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Fri, 10 May 2019 14:15:08 +0200
Subject: drm/ttm: Make LRU removal optional v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We are already doing this for DMA-buf imports and also for
amdgpu VM BOs for quite a while now.

If this doesn't run into any problems we are probably going
to stop removing BOs from the LRU altogether.

v2: drop BUG_ON from ttm_bo_add_to_lru

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
Tested-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  9 +++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c           |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c          |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c          |  4 ++--
 drivers/gpu/drm/qxl/qxl_release.c                |  2 +-
 drivers/gpu/drm/radeon/radeon_gem.c              |  2 +-
 drivers/gpu/drm/radeon/radeon_object.c           |  2 +-
 drivers/gpu/drm/ttm/ttm_bo.c                     | 23 ++++++++++++-----------
 drivers/gpu/drm/ttm/ttm_execbuf_util.c           | 20 ++++++++++++--------
 drivers/gpu/drm/virtio/virtgpu_ioctl.c           |  2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_resource.c         |  3 ++-
 drivers/gpu/drm/vmwgfx/vmwgfx_validation.h       |  2 +-
 include/drm/ttm/ttm_bo_driver.h                  |  5 ++++-
 include/drm/ttm/ttm_execbuf_util.h               |  3 ++-
 14 files changed, 46 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index e30427161295..81e0e758cc54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -585,7 +585,7 @@ static int reserve_bo_and_vm(struct kgd_mem *mem,
 	amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]);
 
 	ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
-				     false, &ctx->duplicates);
+				     false, &ctx->duplicates, true);
 	if (!ret)
 		ctx->reserved = true;
 	else {
@@ -658,7 +658,7 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
 	}
 
 	ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
-				     false, &ctx->duplicates);
+				     false, &ctx->duplicates, true);
 	if (!ret)
 		ctx->reserved = true;
 	else
@@ -1808,7 +1808,8 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
 	}
 
 	/* Reserve all BOs and page tables for validation */
-	ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates);
+	ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates,
+				     true);
 	WARN(!list_empty(&duplicates), "Duplicates should be empty");
 	if (ret)
 		goto out_free;
@@ -2014,7 +2015,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
 	}
 
 	ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list,
-				     false, &duplicate_save);
+				     false, &duplicate_save, true);
 	if (ret) {
 		pr_debug("Memory eviction: TTM Reserve Failed. Try again\n");
 		goto ttm_reserve_fail;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index d72cc583ebd1..fff558cf385b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -648,7 +648,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	}
 
 	r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
-				   &duplicates);
+				   &duplicates, true);
 	if (unlikely(r != 0)) {
 		if (r != -ERESTARTSYS)
 			DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
index 54dd02a898b9..06f83cac0d3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
@@ -79,7 +79,7 @@ int amdgpu_map_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	list_add(&csa_tv.head, &list);
 	amdgpu_vm_get_pd_bo(vm, &list, &pd);
 
-	r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL);
+	r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
 	if (r) {
 		DRM_ERROR("failed to reserve CSA,PD BOs: err=%d\n", r);
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 7b840367004c..d513a5ad03dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -171,7 +171,7 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
 
 	amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);
 
-	r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates);
+	r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates, true);
 	if (r) {
 		dev_err(adev->dev, "leaking bo va because "
 			"we fail to reserve bo (%d)\n", r);
@@ -608,7 +608,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
 
 	amdgpu_vm_get_pd_bo(&fpriv->vm, &list, &vm_pd);
 
-	r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates);
+	r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates, true);
 	if (r)
 		goto error_unref;
 
diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c
index 30f85f0130cb..49f9a9385393 100644
--- a/drivers/gpu/drm/qxl/qxl_release.c
+++ b/drivers/gpu/drm/qxl/qxl_release.c
@@ -256,7 +256,7 @@ int qxl_release_reserve_list(struct qxl_release *release, bool no_intr)
 		return 0;
 
 	ret = ttm_eu_reserve_buffers(&release->ticket, &release->bos,
-				     !no_intr, NULL);
+				     !no_intr, NULL, true);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index 44617dec8183..7411e69e2712 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -559,7 +559,7 @@ static void radeon_gem_va_update_vm(struct radeon_device *rdev,
 	if (!vm_bos)
 		return;
 
-	r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL);
+	r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
 	if (r)
 		goto error_free;
 
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index 833e909706a9..36683de0300b 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -539,7 +539,7 @@ int radeon_bo_list_validate(struct radeon_device *rdev,
 	u64 bytes_moved_threshold = radeon_bo_get_threshold_for_moves(rdev);
 
 	INIT_LIST_HEAD(&duplicates);
-	r = ttm_eu_reserve_buffers(ticket, head, true, &duplicates);
+	r = ttm_eu_reserve_buffers(ticket, head, true, &duplicates, true);
 	if (unlikely(r != 0)) {
 		return r;
 	}
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 2845fceb2fbd..06bbcd2679b2 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -173,19 +173,20 @@ void ttm_bo_add_to_lru(struct ttm_buffer_object *bo)
 
 	reservation_object_assert_held(bo->resv);
 
-	if (!(bo->mem.placement & TTM_PL_FLAG_NO_EVICT)) {
-		BUG_ON(!list_empty(&bo->lru));
+	if (!list_empty(&bo->lru))
+		return;
 
-		man = &bdev->man[bo->mem.mem_type];
-		list_add_tail(&bo->lru, &man->lru[bo->priority]);
-		kref_get(&bo->list_kref);
+	if (bo->mem.placement & TTM_PL_FLAG_NO_EVICT)
+		return;
 
-		if (bo->ttm && !(bo->ttm->page_flags &
-				 (TTM_PAGE_FLAG_SG | TTM_PAGE_FLAG_SWAPPED))) {
-			list_add_tail(&bo->swap,
-				      &bdev->glob->swap_lru[bo->priority]);
-			kref_get(&bo->list_kref);
-		}
+	man = &bdev->man[bo->mem.mem_type];
+	list_add_tail(&bo->lru, &man->lru[bo->priority]);
+	kref_get(&bo->list_kref);
+
+	if (bo->ttm && !(bo->ttm->page_flags &
+			 (TTM_PAGE_FLAG_SG | TTM_PAGE_FLAG_SWAPPED))) {
+		list_add_tail(&bo->swap, &bdev->glob->swap_lru[bo->priority]);
+		kref_get(&bo->list_kref);
 	}
 }
 EXPORT_SYMBOL(ttm_bo_add_to_lru);
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index 0075eb9a0b52..957ec375a4ba 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -69,7 +69,8 @@ void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
 	list_for_each_entry(entry, list, head) {
 		struct ttm_buffer_object *bo = entry->bo;
 
-		ttm_bo_add_to_lru(bo);
+		if (list_empty(&bo->lru))
+			ttm_bo_add_to_lru(bo);
 		reservation_object_unlock(bo->resv);
 	}
 	spin_unlock(&glob->lru_lock);
@@ -93,7 +94,7 @@ EXPORT_SYMBOL(ttm_eu_backoff_reservation);
 
 int ttm_eu_reserve_buffers(struct ww_acquire_ctx *ticket,
 			   struct list_head *list, bool intr,
-			   struct list_head *dups)
+			   struct list_head *dups, bool del_lru)
 {
 	struct ttm_bo_global *glob;
 	struct ttm_validate_buffer *entry;
@@ -172,11 +173,11 @@ int ttm_eu_reserve_buffers(struct ww_acquire_ctx *ticket,
 		list_add(&entry->head, list);
 	}
 
-	if (ticket)
-		ww_acquire_done(ticket);
-	spin_lock(&glob->lru_lock);
-	ttm_eu_del_from_lru_locked(list);
-	spin_unlock(&glob->lru_lock);
+	if (del_lru) {
+		spin_lock(&glob->lru_lock);
+		ttm_eu_del_from_lru_locked(list);
+		spin_unlock(&glob->lru_lock);
+	}
 	return 0;
 }
 EXPORT_SYMBOL(ttm_eu_reserve_buffers);
@@ -203,7 +204,10 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket,
 			reservation_object_add_shared_fence(bo->resv, fence);
 		else
 			reservation_object_add_excl_fence(bo->resv, fence);
-		ttm_bo_add_to_lru(bo);
+		if (list_empty(&bo->lru))
+			ttm_bo_add_to_lru(bo);
+		else
+			ttm_bo_move_to_lru_tail(bo, NULL);
 		reservation_object_unlock(bo->resv);
 	}
 	spin_unlock(&glob->lru_lock);
diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
index b7f9dfe61d1c..fe0bd6274db6 100644
--- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c
+++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
@@ -63,7 +63,7 @@ int virtio_gpu_object_list_validate(struct ww_acquire_ctx *ticket,
 	struct virtio_gpu_object *qobj;
 	int ret;
 
-	ret = ttm_eu_reserve_buffers(ticket, head, true, NULL);
+	ret = ttm_eu_reserve_buffers(ticket, head, true, NULL, true);
 	if (ret != 0)
 		return ret;
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
index 711f8fd0dd45..1d38a8b2f2ec 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
@@ -464,7 +464,8 @@ vmw_resource_check_buffer(struct ww_acquire_ctx *ticket,
 	val_buf->bo = &res->backup->base;
 	val_buf->num_shared = 0;
 	list_add_tail(&val_buf->head, &val_list);
-	ret = ttm_eu_reserve_buffers(ticket, &val_list, interruptible, NULL);
+	ret = ttm_eu_reserve_buffers(ticket, &val_list, interruptible, NULL,
+				     true);
 	if (unlikely(ret != 0))
 		goto out_no_reserve;
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
index 523f6ac5c335..1d2322ad6fd5 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
@@ -169,7 +169,7 @@ vmw_validation_bo_reserve(struct vmw_validation_context *ctx,
 			  bool intr)
 {
 	return ttm_eu_reserve_buffers(&ctx->ticket, &ctx->bo_list, intr,
-				      NULL);
+				      NULL, true);
 }
 
 /**
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 129dabbc002d..9f54cf9c60df 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -769,7 +769,10 @@ static inline void ttm_bo_unreserve(struct ttm_buffer_object *bo)
 {
 	if (!(bo->mem.placement & TTM_PL_FLAG_NO_EVICT)) {
 		spin_lock(&bo->bdev->glob->lru_lock);
-		ttm_bo_add_to_lru(bo);
+		if (list_empty(&bo->lru))
+			ttm_bo_add_to_lru(bo);
+		else
+			ttm_bo_move_to_lru_tail(bo, NULL);
 		spin_unlock(&bo->bdev->glob->lru_lock);
 	}
 	reservation_object_unlock(bo->resv);
diff --git a/include/drm/ttm/ttm_execbuf_util.h b/include/drm/ttm/ttm_execbuf_util.h
index 621615fa7728..7e46cc678e7e 100644
--- a/include/drm/ttm/ttm_execbuf_util.h
+++ b/include/drm/ttm/ttm_execbuf_util.h
@@ -70,6 +70,7 @@ extern void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
  * @list:    thread private list of ttm_validate_buffer structs.
  * @intr:    should the wait be interruptible
  * @dups:    [out] optional list of duplicates.
+ * @del_lru: true if BOs should be removed from the LRU.
  *
  * Tries to reserve bos pointed to by the list entries for validation.
  * If the function returns 0, all buffers are marked as "unfenced",
@@ -98,7 +99,7 @@ extern void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
 
 extern int ttm_eu_reserve_buffers(struct ww_acquire_ctx *ticket,
 				  struct list_head *list, bool intr,
-				  struct list_head *dups);
+				  struct list_head *dups, bool del_lru);
 
 /**
  * function ttm_eu_fence_buffer_objects.
-- 
cgit v1.2.3


From 87e5e6dab6c2a21fab2620f37786276d202e2ce0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 14 May 2019 16:02:22 -0600
Subject: uio: make import_iovec()/compat_import_iovec() return bytes on
 success

Currently these functions return < 0 on error, and 0 for success.
Change that so that we return < 0 on error, but number of bytes
for success.

Some callers already treat the return value that way, others need a
slight tweak.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/aio.c            |  9 +++++----
 fs/io_uring.c       | 16 ++++++++--------
 fs/splice.c         |  8 ++++----
 include/linux/uio.h |  4 ++--
 lib/iov_iter.c      | 15 ++++++++-------
 net/compat.c        |  3 ++-
 net/socket.c        |  3 ++-
 7 files changed, 31 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/fs/aio.c b/fs/aio.c
index 3490d1fa0e16..41824c710b36 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1479,8 +1479,9 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	return 0;
 }
 
-static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
-		bool vectored, bool compat, struct iov_iter *iter)
+static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
+		struct iovec **iovec, bool vectored, bool compat,
+		struct iov_iter *iter)
 {
 	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
 	size_t len = iocb->aio_nbytes;
@@ -1537,7 +1538,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
 		return -EINVAL;
 
 	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret)
@@ -1565,7 +1566,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
 		return -EINVAL;
 
 	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0fbb486a320e..23e08c10f486 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1003,9 +1003,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	return 0;
 }
 
-static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
-			   const struct sqe_submit *s, struct iovec **iovec,
-			   struct iov_iter *iter)
+static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
+			       const struct sqe_submit *s, struct iovec **iovec,
+			       struct iov_iter *iter)
 {
 	const struct io_uring_sqe *sqe = s->sqe;
 	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -1023,7 +1023,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	opcode = READ_ONCE(sqe->opcode);
 	if (opcode == IORING_OP_READ_FIXED ||
 	    opcode == IORING_OP_WRITE_FIXED) {
-		int ret = io_import_fixed(ctx, rw, sqe, iter);
+		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
 		*iovec = NULL;
 		return ret;
 	}
@@ -1089,7 +1089,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	int ret;
+	ssize_t ret;
 
 	ret = io_prep_rw(req, s, force_nonblock);
 	if (ret)
@@ -1102,7 +1102,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 
 	iov_count = iov_iter_count(&iter);
@@ -1136,7 +1136,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	int ret;
+	ssize_t ret;
 
 	ret = io_prep_rw(req, s, force_nonblock);
 	if (ret)
@@ -1149,7 +1149,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 
 	iov_count = iov_iter_count(&iter);
diff --git a/fs/splice.c b/fs/splice.c
index 14cb602d9a2f..98412721f056 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1356,7 +1356,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1367,7 +1367,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 
 	error = import_iovec(type, uiov, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
@@ -1382,7 +1382,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1393,7 +1393,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 
 	error = compat_import_iovec(type, iov32, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 2d0131ad4604..a61ceb6575ab 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -279,13 +279,13 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 		struct iov_iter *i);
 
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 
 #ifdef CONFIG_COMPAT
 struct compat_iovec;
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
+ssize_t compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 #endif
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f99c41d4eb54..f1e0569b4539 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1634,9 +1634,9 @@ EXPORT_SYMBOL(dup_iter);
  * on-stack array was used or not (and regardless of whether this function
  * returns an error or not).
  *
- * Return: 0 on success or negative error code on error.
+ * Return: Negative error code on error, bytes imported on success
  */
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i)
 {
@@ -1652,16 +1652,17 @@ int import_iovec(int type, const struct iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 EXPORT_SYMBOL(import_iovec);
 
 #ifdef CONFIG_COMPAT
 #include <linux/compat.h>
 
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
-		 unsigned nr_segs, unsigned fast_segs,
-		 struct iovec **iov, struct iov_iter *i)
+ssize_t compat_import_iovec(int type,
+		const struct compat_iovec __user * uvector,
+		unsigned nr_segs, unsigned fast_segs,
+		struct iovec **iov, struct iov_iter *i)
 {
 	ssize_t n;
 	struct iovec *p;
@@ -1675,7 +1676,7 @@ int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 #endif
 
diff --git a/net/compat.c b/net/compat.c
index 3f9ce609397f..0f7ded26059e 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -80,9 +80,10 @@ int get_compat_msghdr(struct msghdr *kmsg,
 
 	kmsg->msg_iocb = NULL;
 
-	return compat_import_iovec(save_addr ? READ : WRITE,
+	err = compat_import_iovec(save_addr ? READ : WRITE,
 				   compat_ptr(msg.msg_iov), msg.msg_iovlen,
 				   UIO_FASTIOV, iov, &kmsg->msg_iter);
+	return err < 0 ? err : 0;
 }
 
 /* Bleech... */
diff --git a/net/socket.c b/net/socket.c
index 72372dc5dd70..bffec466b4f1 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2208,9 +2208,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
 
 	kmsg->msg_iocb = NULL;
 
-	return import_iovec(save_addr ? READ : WRITE,
+	err = import_iovec(save_addr ? READ : WRITE,
 			    msg.msg_iov, msg.msg_iovlen,
 			    UIO_FASTIOV, iov, &kmsg->msg_iter);
+	return err < 0 ? err : 0;
 }
 
 static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
-- 
cgit v1.2.3


From 06c789a784c09b89422d93b439624de21bb7dc33 Mon Sep 17 00:00:00 2001
From: Neil Armstrong <narmstrong@baylibre.com>
Date: Mon, 20 May 2019 16:41:05 +0200
Subject: dt-bindings: gpio: meson-gxbb-gpio: update with SPDX Licence
 identifier

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Reviewed-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/dt-bindings/gpio/meson-gxbb-gpio.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/gpio/meson-gxbb-gpio.h b/include/dt-bindings/gpio/meson-gxbb-gpio.h
index 43a68a1110f0..c93274d7e108 100644
--- a/include/dt-bindings/gpio/meson-gxbb-gpio.h
+++ b/include/dt-bindings/gpio/meson-gxbb-gpio.h
@@ -1,15 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * GPIO definitions for Amlogic Meson GXBB SoCs
  *
  * Copyright (C) 2016 Endless Mobile, Inc.
  * Author: Carlo Caione <carlo@endlessm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _DT_BINDINGS_MESON_GXBB_GPIO_H
-- 
cgit v1.2.3


From eb5790db6a2818c58dd378afeb77db9407546655 Mon Sep 17 00:00:00 2001
From: Neil Armstrong <narmstrong@baylibre.com>
Date: Mon, 20 May 2019 16:41:06 +0200
Subject: dt-bindings: gpio: meson-gxl-gpio: update with SPDX Licence
 identifier

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Reviewed-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/dt-bindings/gpio/meson-gxl-gpio.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/gpio/meson-gxl-gpio.h b/include/dt-bindings/gpio/meson-gxl-gpio.h
index 01f2a2abd35e..62417358f55b 100644
--- a/include/dt-bindings/gpio/meson-gxl-gpio.h
+++ b/include/dt-bindings/gpio/meson-gxl-gpio.h
@@ -1,15 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * GPIO definitions for Amlogic Meson GXL SoCs
  *
  * Copyright (C) 2016 Endless Mobile, Inc.
  * Author: Carlo Caione <carlo@endlessm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _DT_BINDINGS_MESON_GXL_GPIO_H
-- 
cgit v1.2.3


From 03b30dff6d539a973fe10a863df4be00c9ca26f2 Mon Sep 17 00:00:00 2001
From: Neil Armstrong <narmstrong@baylibre.com>
Date: Mon, 20 May 2019 16:41:07 +0200
Subject: dt-bindings: gpio: meson8-gpio: update with SPDX Licence identifier

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Reviewed-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/dt-bindings/gpio/meson8-gpio.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/gpio/meson8-gpio.h b/include/dt-bindings/gpio/meson8-gpio.h
index fdaeb5cbf5e1..e1387a8520e5 100644
--- a/include/dt-bindings/gpio/meson8-gpio.h
+++ b/include/dt-bindings/gpio/meson8-gpio.h
@@ -1,14 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * GPIO definitions for Amlogic Meson8 SoCs
  *
  * Copyright (C) 2014 Beniamino Galvani <b.galvani@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _DT_BINDINGS_MESON8_GPIO_H
-- 
cgit v1.2.3


From fcae009397cd7c3c144e58e98027c97746559656 Mon Sep 17 00:00:00 2001
From: Neil Armstrong <narmstrong@baylibre.com>
Date: Mon, 20 May 2019 16:41:08 +0200
Subject: dt-bindings: gpio: meson8b-gpio: update with SPDX Licence identifier

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Reviewed-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/dt-bindings/gpio/meson8b-gpio.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/gpio/meson8b-gpio.h b/include/dt-bindings/gpio/meson8b-gpio.h
index bf0d76fa0e7b..e75d09b6087b 100644
--- a/include/dt-bindings/gpio/meson8b-gpio.h
+++ b/include/dt-bindings/gpio/meson8b-gpio.h
@@ -1,15 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * GPIO definitions for Amlogic Meson8b SoCs
  *
  * Copyright (C) 2015 Endless Mobile, Inc.
  * Author: Carlo Caione <carlo@endlessm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _DT_BINDINGS_MESON8B_GPIO_H
-- 
cgit v1.2.3


From 5213d7efc8ec26ed8938dce75427eff9275a62d9 Mon Sep 17 00:00:00 2001
From: Ruslan Babayev <ruslan@babayev.com>
Date: Tue, 28 May 2019 16:02:32 -0700
Subject: i2c: acpi: export i2c_acpi_find_adapter_by_handle

This allows drivers to lookup i2c adapters on ACPI based systems similar to
of_get_i2c_adapter_by_node() with DT based systems.

Signed-off-by: Ruslan Babayev <ruslan@babayev.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core-acpi.c | 3 ++-
 include/linux/i2c.h         | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c
index 272800692088..964687534754 100644
--- a/drivers/i2c/i2c-core-acpi.c
+++ b/drivers/i2c/i2c-core-acpi.c
@@ -337,7 +337,7 @@ static int i2c_acpi_find_match_device(struct device *dev, void *data)
 	return ACPI_COMPANION(dev) == data;
 }
 
-static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
+struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
 {
 	struct device *dev;
 
@@ -345,6 +345,7 @@ static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
 			      i2c_acpi_find_match_adapter);
 	return dev ? i2c_verify_adapter(dev) : NULL;
 }
+EXPORT_SYMBOL_GPL(i2c_acpi_find_adapter_by_handle);
 
 static struct i2c_client *i2c_acpi_find_client_by_adev(struct acpi_device *adev)
 {
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 1308126fc384..e982b8913b73 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -14,6 +14,7 @@
 #ifndef _LINUX_I2C_H
 #define _LINUX_I2C_H
 
+#include <linux/acpi.h>		/* for acpi_handle */
 #include <linux/mod_devicetable.h>
 #include <linux/device.h>	/* for struct device */
 #include <linux/sched.h>	/* for completion */
@@ -981,6 +982,7 @@ bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 u32 i2c_acpi_find_bus_speed(struct device *dev);
 struct i2c_client *i2c_acpi_new_device(struct device *dev, int index,
 				       struct i2c_board_info *info);
+struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle);
 #else
 static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 					     struct acpi_resource_i2c_serialbus **i2c)
@@ -996,6 +998,10 @@ static inline struct i2c_client *i2c_acpi_new_device(struct device *dev,
 {
 	return NULL;
 }
+static inline struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
+{
+	return NULL;
+}
 #endif /* CONFIG_ACPI */
 
 #endif /* _LINUX_I2C_H */
-- 
cgit v1.2.3


From 848d56dddad8a3a052a983d2f47b739d0492b8b1 Mon Sep 17 00:00:00 2001
From: Uma Shankar <uma.shankar@intel.com>
Date: Thu, 30 May 2019 01:29:01 +0530
Subject: drm: Drop a redundant unused variable

Drop a redundant and unused variable "hdr_output_metadata" from
drm_connector.

Suggested-by: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Uma Shankar <uma.shankar@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/1559159944-21103-2-git-send-email-uma.shankar@intel.com
---
 include/drm/drm_connector.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index f8f4003e2758..547656173c74 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -1244,8 +1244,6 @@ struct drm_connector {
 	 */
 	struct llist_node free_node;
 
-	/* HDR metdata */
-	struct hdr_output_metadata hdr_output_metadata;
 	struct hdr_sink_metadata hdr_sink_metadata;
 };
 
-- 
cgit v1.2.3


From cfc1ce7e5212b19423d38ed2035318a35f983309 Mon Sep 17 00:00:00 2001
From: Uma Shankar <uma.shankar@intel.com>
Date: Thu, 30 May 2019 01:29:03 +0530
Subject: drm: Fixed doc warnings in drm uapi header

Fixed doc warnings in drm uapi header. All the UAPI
structures are now documented in kernel doc.

Signed-off-by: Uma Shankar <uma.shankar@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/1559159944-21103-4-git-send-email-uma.shankar@intel.com
---
 include/uapi/drm/drm_mode.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index 997a7e05c0c6..19b5cf368cff 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@ -826,6 +826,10 @@ struct drm_format_modifier {
 };
 
 /**
+ * struct drm_mode_create_blob - Create New block property
+ * @data: Pointer to data to copy.
+ * @length: Length of data to copy.
+ * @blob_id: new property ID.
  * Create a new 'blob' data property, copying length bytes from data pointer,
  * and returning new blob ID.
  */
@@ -839,6 +843,8 @@ struct drm_mode_create_blob {
 };
 
 /**
+ * struct drm_mode_destroy_blob - Destroy user blob
+ * @blob_id: blob_id to destroy
  * Destroy a user-created blob property.
  */
 struct drm_mode_destroy_blob {
@@ -846,6 +852,12 @@ struct drm_mode_destroy_blob {
 };
 
 /**
+ * struct drm_mode_create_lease - Create lease
+ * @object_ids: Pointer to array of object ids.
+ * @object_count: Number of object ids.
+ * @flags: flags for new FD.
+ * @lessee_id: unique identifier for lessee.
+ * @fd: file descriptor to new drm_master file.
  * Lease mode resources, creating another drm_master.
  */
 struct drm_mode_create_lease {
@@ -863,6 +875,10 @@ struct drm_mode_create_lease {
 };
 
 /**
+ * struct drm_mode_list_lessees - List lessees
+ * @count_lessees: Number of lessees.
+ * @pad: pad.
+ * @lessees_ptr: Pointer to lessess.
  * List lesses from a drm_master
  */
 struct drm_mode_list_lessees {
@@ -883,6 +899,10 @@ struct drm_mode_list_lessees {
 };
 
 /**
+ * struct drm_mode_get_lease - Get Lease
+ * @count_objects: Number of leased objects.
+ * @pad: pad.
+ * @objects_ptr: Pointer to objects.
  * Get leased objects
  */
 struct drm_mode_get_lease {
@@ -903,6 +923,8 @@ struct drm_mode_get_lease {
 };
 
 /**
+ * struct drm_mode_revoke_lease - Revoke lease
+ * @lessee_id: Unique ID of lessee.
  * Revoke lease
  */
 struct drm_mode_revoke_lease {
-- 
cgit v1.2.3


From 1b94f47793b1b1c4149d8192fa7b859fffd0e7ea Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Mon, 3 Jun 2019 16:28:48 +0200
Subject: drm/docs: More links for implicit/explicit fencing.

drm_atomic_set_fence_for_plane() contains the main discussion from a
driver pov, link to that from more places.

Cc: Pekka Paalanen <pekka.paalanen@collabora.com>
Reviewed-by: Pekka Paalanen <pekka.paalanen@collabora.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <maxime.ripard@bootlin.com>
Cc: Sean Paul <sean@poorly.run>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20190603142848.26487-1-daniel.vetter@ffwll.ch
---
 drivers/gpu/drm/drm_gem_framebuffer_helper.c | 6 ++++++
 include/drm/drm_plane.h                      | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_framebuffer_helper.c b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
index 6fd48efe288c..6791245963c3 100644
--- a/drivers/gpu/drm/drm_gem_framebuffer_helper.c
+++ b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
@@ -284,6 +284,9 @@ EXPORT_SYMBOL_GPL(drm_gem_fb_create_with_dirty);
  * There is no need for &drm_plane_helper_funcs.cleanup_fb hook for simple
  * gem based framebuffer drivers which have their buffers always pinned in
  * memory.
+ *
+ * See drm_atomic_set_fence_for_plane() for a discussion of implicit and
+ * explicit fencing in atomic modeset updates.
  */
 int drm_gem_fb_prepare_fb(struct drm_plane *plane,
 			  struct drm_plane_state *state)
@@ -314,6 +317,9 @@ EXPORT_SYMBOL_GPL(drm_gem_fb_prepare_fb);
  * &dma_buf attached, extracts the exclusive fence and attaches it to plane
  * state for the atomic helper to wait on. Drivers can use this as their
  * &drm_simple_display_pipe_funcs.prepare_fb callback.
+ *
+ * See drm_atomic_set_fence_for_plane() for a discussion of implicit and
+ * explicit fencing in atomic modeset updates.
  */
 int drm_gem_fb_simple_display_pipe_prepare_fb(struct drm_simple_display_pipe *pipe,
 					      struct drm_plane_state *plane_state)
diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
index 6078c700d9ba..cd5903ad33f7 100644
--- a/include/drm/drm_plane.h
+++ b/include/drm/drm_plane.h
@@ -69,7 +69,7 @@ struct drm_plane_state {
 	 *
 	 * Optional fence to wait for before scanning out @fb. The core atomic
 	 * code will set this when userspace is using explicit fencing. Do not
-	 * write this directly for a driver's implicit fence, use
+	 * write this field directly for a driver's implicit fence, use
 	 * drm_atomic_set_fence_for_plane() to ensure that an explicit fence is
 	 * preserved.
 	 *
-- 
cgit v1.2.3


From d81294afeecdacc8d84804ba0bcb3d39e64d0f27 Mon Sep 17 00:00:00 2001
From: Noralf Trønnes <noralf@tronnes.org>
Date: Fri, 31 May 2019 16:01:11 +0200
Subject: drm/fb-helper: Remove drm_fb_helper_crtc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

struct drm_fb_helper_crtc is now just a wrapper around drm_mode_set so
use that directly instead and attach it as a modeset array onto
drm_client_dev. drm_fb_helper will use this array to store its modesets
which means it will always initialize a drm_client, but it will not
register the client (callbacks) unless it's the generic fbdev emulation.

Code will later be moved to drm_client, so add code there in a new file
drm_client_modeset.c with MIT license to match drm_fb_helper.c.

The modeset connector array size is hardcoded for the cloned case to avoid
having to pass in a value from the driver. A value of 8 is chosen to err
on the safe side. This means that the max connector argument for
drm_fb_helper_init() and drm_fb_helper_fbdev_setup() isn't used anymore,
a todo entry for this is added.

In pan_display_atomic() restore_fbdev_mode_force() is used instead of
restore_fbdev_mode_atomic() because that one will later become internal
to drm_client_modeset.

Locking order:
1. drm_fb_helper->lock
2. drm_master_internal_acquire
3. drm_client_dev->modeset_mutex

v6: Improve commit message (Sam Ravnborg)

v3:
- Use full drm_client_init/release for the modesets (Daniel Vetter)
- drm_client_for_each_modeset: use lockdep_assert_held (Daniel Vetter)
- Hook up to Documentation/gpu/drm-client.rst (Daniel Vetter)

v2:
- Add modesets array to drm_client (Daniel Vetter)
- Use a new file for the modeset code (Daniel Vetter)
- File has to be MIT licensed (Emmanuel Vadot)
- Add copyrights from drm_fb_helper.c

Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190531140117.37751-3-noralf@tronnes.org
---
 Documentation/gpu/drm-client.rst     |   3 +
 Documentation/gpu/todo.rst           |   3 +
 drivers/gpu/drm/Makefile             |   2 +-
 drivers/gpu/drm/drm_client.c         |  10 +-
 drivers/gpu/drm/drm_client_modeset.c | 104 ++++++++++++
 drivers/gpu/drm/drm_fb_helper.c      | 299 +++++++++++++++--------------------
 include/drm/drm_client.h             |  30 ++++
 include/drm/drm_fb_helper.h          |   8 -
 8 files changed, 273 insertions(+), 186 deletions(-)
 create mode 100644 drivers/gpu/drm/drm_client_modeset.c

(limited to 'include')

diff --git a/Documentation/gpu/drm-client.rst b/Documentation/gpu/drm-client.rst
index 7e672063e7eb..58b5a1d1219d 100644
--- a/Documentation/gpu/drm-client.rst
+++ b/Documentation/gpu/drm-client.rst
@@ -10,3 +10,6 @@ Kernel clients
 
 .. kernel-doc:: drivers/gpu/drm/drm_client.c
    :export:
+
+.. kernel-doc:: drivers/gpu/drm/drm_client_modeset.c
+   :export:
diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst
index 66f05f4e469f..9d4038c50013 100644
--- a/Documentation/gpu/todo.rst
+++ b/Documentation/gpu/todo.rst
@@ -289,6 +289,9 @@ drm_fb_helper tasks
   these igt tests need to be fixed: kms_fbcon_fbt@psr and
   kms_fbcon_fbt@psr-suspend.
 
+- The max connector argument for drm_fb_helper_init() and
+  drm_fb_helper_fbdev_setup() isn't used anymore and can be removed.
+
 Core refactorings
 =================
 
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index 4a63c1fcf389..d36feb4a6233 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -17,7 +17,7 @@ drm-y       :=	drm_auth.o drm_cache.o \
 		drm_plane.o drm_color_mgmt.o drm_print.o \
 		drm_dumb_buffers.o drm_mode_config.o drm_vblank.o \
 		drm_syncobj.o drm_lease.o drm_writeback.o drm_client.o \
-		drm_atomic_uapi.o drm_hdcp.o
+		drm_client_modeset.o drm_atomic_uapi.o drm_hdcp.o
 
 drm-$(CONFIG_DRM_LEGACY) += drm_legacy_misc.o drm_bufs.o drm_context.o drm_dma.o drm_scatter.o drm_lock.o
 drm-$(CONFIG_DRM_LIB_RANDOM) += lib/drm_random.o
diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c
index e7870a54f498..410572f14257 100644
--- a/drivers/gpu/drm/drm_client.c
+++ b/drivers/gpu/drm/drm_client.c
@@ -27,7 +27,6 @@
  * DOC: overview
  *
  * This library provides support for clients running in the kernel like fbdev and bootsplash.
- * Currently it's only partially implemented, just enough to support fbdev.
  *
  * GEM drivers which provide a GEM based dumb buffer with a virtual address are supported.
  */
@@ -92,14 +91,20 @@ int drm_client_init(struct drm_device *dev, struct drm_client_dev *client,
 	client->name = name;
 	client->funcs = funcs;
 
-	ret = drm_client_open(client);
+	ret = drm_client_modeset_create(client);
 	if (ret)
 		goto err_put_module;
 
+	ret = drm_client_open(client);
+	if (ret)
+		goto err_free;
+
 	drm_dev_get(dev);
 
 	return 0;
 
+err_free:
+	drm_client_modeset_free(client);
 err_put_module:
 	if (funcs)
 		module_put(funcs->owner);
@@ -148,6 +153,7 @@ void drm_client_release(struct drm_client_dev *client)
 
 	DRM_DEV_DEBUG_KMS(dev->dev, "%s\n", client->name);
 
+	drm_client_modeset_free(client);
 	drm_client_close(client);
 	drm_dev_put(dev);
 	if (client->funcs)
diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c
new file mode 100644
index 000000000000..66770ed3299e
--- /dev/null
+++ b/drivers/gpu/drm/drm_client_modeset.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2018 Noralf Trønnes
+ * Copyright (c) 2006-2009 Red Hat Inc.
+ * Copyright (c) 2006-2008 Intel Corporation
+ *   Jesse Barnes <jesse.barnes@intel.com>
+ * Copyright (c) 2007 Dave Airlie <airlied@linux.ie>
+ */
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include <drm/drm_client.h>
+#include <drm/drm_crtc.h>
+#include <drm/drm_device.h>
+
+int drm_client_modeset_create(struct drm_client_dev *client)
+{
+	struct drm_device *dev = client->dev;
+	unsigned int num_crtc = dev->mode_config.num_crtc;
+	unsigned int max_connector_count = 1;
+	struct drm_mode_set *modeset;
+	struct drm_crtc *crtc;
+	unsigned int i = 0;
+
+	/* Add terminating zero entry to enable index less iteration */
+	client->modesets = kcalloc(num_crtc + 1, sizeof(*client->modesets), GFP_KERNEL);
+	if (!client->modesets)
+		return -ENOMEM;
+
+	mutex_init(&client->modeset_mutex);
+
+	drm_for_each_crtc(crtc, dev)
+		client->modesets[i++].crtc = crtc;
+
+	/* Cloning is only supported in the single crtc case. */
+	if (num_crtc == 1)
+		max_connector_count = DRM_CLIENT_MAX_CLONED_CONNECTORS;
+
+	for (modeset = client->modesets; modeset->crtc; modeset++) {
+		modeset->connectors = kcalloc(max_connector_count,
+					      sizeof(*modeset->connectors), GFP_KERNEL);
+		if (!modeset->connectors)
+			goto err_free;
+	}
+
+	return 0;
+
+err_free:
+	drm_client_modeset_free(client);
+
+	return -ENOMEM;
+}
+
+void drm_client_modeset_release(struct drm_client_dev *client)
+{
+	struct drm_mode_set *modeset;
+	unsigned int i;
+
+	drm_client_for_each_modeset(modeset, client) {
+		drm_mode_destroy(client->dev, modeset->mode);
+		modeset->mode = NULL;
+		modeset->fb = NULL;
+
+		for (i = 0; i < modeset->num_connectors; i++) {
+			drm_connector_put(modeset->connectors[i]);
+			modeset->connectors[i] = NULL;
+		}
+		modeset->num_connectors = 0;
+	}
+}
+/* TODO: Remove export when modeset code has been moved over */
+EXPORT_SYMBOL(drm_client_modeset_release);
+
+void drm_client_modeset_free(struct drm_client_dev *client)
+{
+	struct drm_mode_set *modeset;
+
+	mutex_lock(&client->modeset_mutex);
+
+	drm_client_modeset_release(client);
+
+	drm_client_for_each_modeset(modeset, client)
+		kfree(modeset->connectors);
+
+	mutex_unlock(&client->modeset_mutex);
+
+	mutex_destroy(&client->modeset_mutex);
+	kfree(client->modesets);
+}
+
+struct drm_mode_set *drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc)
+{
+	struct drm_mode_set *modeset;
+
+	drm_client_for_each_modeset(modeset, client)
+		if (modeset->crtc == crtc)
+			return modeset;
+
+	return NULL;
+}
+/* TODO: Remove export when modeset code has been moved over */
+EXPORT_SYMBOL(drm_client_find_modeset);
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 3f4cb66c7d65..b9b7c06cbc4f 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -322,13 +322,11 @@ int drm_fb_helper_debug_enter(struct fb_info *info)
 {
 	struct drm_fb_helper *helper = info->par;
 	const struct drm_crtc_helper_funcs *funcs;
-	int i;
+	struct drm_mode_set *mode_set;
 
 	list_for_each_entry(helper, &kernel_fb_helper_list, kernel_fb_list) {
-		for (i = 0; i < helper->crtc_count; i++) {
-			struct drm_mode_set *mode_set =
-				&helper->crtc_info[i].mode_set;
-
+		mutex_lock(&helper->client.modeset_mutex);
+		drm_client_for_each_modeset(mode_set, &helper->client) {
 			if (!mode_set->crtc->enabled)
 				continue;
 
@@ -345,6 +343,7 @@ int drm_fb_helper_debug_enter(struct fb_info *info)
 						    mode_set->y,
 						    ENTER_ATOMIC_MODE_SET);
 		}
+		mutex_unlock(&helper->client.modeset_mutex);
 	}
 
 	return 0;
@@ -358,14 +357,14 @@ EXPORT_SYMBOL(drm_fb_helper_debug_enter);
 int drm_fb_helper_debug_leave(struct fb_info *info)
 {
 	struct drm_fb_helper *helper = info->par;
+	struct drm_client_dev *client = &helper->client;
 	struct drm_crtc *crtc;
 	const struct drm_crtc_helper_funcs *funcs;
+	struct drm_mode_set *mode_set;
 	struct drm_framebuffer *fb;
-	int i;
-
-	for (i = 0; i < helper->crtc_count; i++) {
-		struct drm_mode_set *mode_set = &helper->crtc_info[i].mode_set;
 
+	mutex_lock(&client->modeset_mutex);
+	drm_client_for_each_modeset(mode_set, client) {
 		crtc = mode_set->crtc;
 		if (drm_drv_uses_atomic_modeset(crtc->dev))
 			continue;
@@ -388,6 +387,7 @@ int drm_fb_helper_debug_leave(struct fb_info *info)
 		funcs->mode_set_base_atomic(mode_set->crtc, fb, crtc->x,
 					    crtc->y, LEAVE_ATOMIC_MODE_SET);
 	}
+	mutex_unlock(&client->modeset_mutex);
 
 	return 0;
 }
@@ -438,12 +438,14 @@ static bool drm_fb_helper_panel_rotation(struct drm_mode_set *modeset,
 
 static int restore_fbdev_mode_atomic(struct drm_fb_helper *fb_helper, bool active)
 {
+	struct drm_client_dev *client = &fb_helper->client;
 	struct drm_device *dev = fb_helper->dev;
 	struct drm_plane_state *plane_state;
 	struct drm_plane *plane;
 	struct drm_atomic_state *state;
-	int i, ret;
 	struct drm_modeset_acquire_ctx ctx;
+	struct drm_mode_set *mode_set;
+	int ret;
 
 	drm_modeset_acquire_init(&ctx, 0);
 
@@ -473,8 +475,7 @@ retry:
 			goto out_state;
 	}
 
-	for (i = 0; i < fb_helper->crtc_count; i++) {
-		struct drm_mode_set *mode_set = &fb_helper->crtc_info[i].mode_set;
+	drm_client_for_each_modeset(mode_set, client) {
 		struct drm_plane *primary = mode_set->crtc->primary;
 		unsigned int rotation;
 
@@ -522,9 +523,11 @@ backoff:
 
 static int restore_fbdev_mode_legacy(struct drm_fb_helper *fb_helper)
 {
+	struct drm_client_dev *client = &fb_helper->client;
 	struct drm_device *dev = fb_helper->dev;
+	struct drm_mode_set *mode_set;
 	struct drm_plane *plane;
-	int i, ret = 0;
+	int ret = 0;
 
 	drm_modeset_lock_all(fb_helper->dev);
 	drm_for_each_plane(plane, dev) {
@@ -537,8 +540,7 @@ static int restore_fbdev_mode_legacy(struct drm_fb_helper *fb_helper)
 						    DRM_MODE_ROTATE_0);
 	}
 
-	for (i = 0; i < fb_helper->crtc_count; i++) {
-		struct drm_mode_set *mode_set = &fb_helper->crtc_info[i].mode_set;
+	drm_client_for_each_modeset(mode_set, client) {
 		struct drm_crtc *crtc = mode_set->crtc;
 
 		if (crtc->funcs->cursor_set2) {
@@ -564,11 +566,16 @@ out:
 static int restore_fbdev_mode_force(struct drm_fb_helper *fb_helper)
 {
 	struct drm_device *dev = fb_helper->dev;
+	int ret;
 
+	mutex_lock(&fb_helper->client.modeset_mutex);
 	if (drm_drv_uses_atomic_modeset(dev))
-		return restore_fbdev_mode_atomic(fb_helper, true);
+		ret = restore_fbdev_mode_atomic(fb_helper, true);
 	else
-		return restore_fbdev_mode_legacy(fb_helper);
+		ret = restore_fbdev_mode_legacy(fb_helper);
+	mutex_unlock(&fb_helper->client.modeset_mutex);
+
+	return ret;
 }
 
 static int restore_fbdev_mode(struct drm_fb_helper *fb_helper)
@@ -687,15 +694,14 @@ static struct sysrq_key_op sysrq_drm_fb_helper_restore_op = { };
 
 static void dpms_legacy(struct drm_fb_helper *fb_helper, int dpms_mode)
 {
+	struct drm_client_dev *client = &fb_helper->client;
 	struct drm_device *dev = fb_helper->dev;
 	struct drm_connector *connector;
 	struct drm_mode_set *modeset;
-	int i, j;
+	int j;
 
 	drm_modeset_lock_all(dev);
-	for (i = 0; i < fb_helper->crtc_count; i++) {
-		modeset = &fb_helper->crtc_info[i].mode_set;
-
+	drm_client_for_each_modeset(modeset, client) {
 		if (!modeset->crtc->enabled)
 			continue;
 
@@ -712,6 +718,7 @@ static void dpms_legacy(struct drm_fb_helper *fb_helper, int dpms_mode)
 static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode)
 {
 	struct drm_fb_helper *fb_helper = info->par;
+	struct drm_client_dev *client = &fb_helper->client;
 	struct drm_device *dev = fb_helper->dev;
 
 	/*
@@ -721,10 +728,12 @@ static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode)
 	if (!drm_master_internal_acquire(dev))
 		goto unlock;
 
+	mutex_lock(&client->modeset_mutex);
 	if (drm_drv_uses_atomic_modeset(dev))
 		restore_fbdev_mode_atomic(fb_helper, dpms_mode == DRM_MODE_DPMS_ON);
 	else
 		dpms_legacy(fb_helper, dpms_mode);
+	mutex_unlock(&client->modeset_mutex);
 
 	drm_master_internal_release(dev);
 unlock:
@@ -767,43 +776,6 @@ int drm_fb_helper_blank(int blank, struct fb_info *info)
 }
 EXPORT_SYMBOL(drm_fb_helper_blank);
 
-static void drm_fb_helper_modeset_release(struct drm_fb_helper *helper,
-					  struct drm_mode_set *modeset)
-{
-	int i;
-
-	for (i = 0; i < modeset->num_connectors; i++) {
-		drm_connector_put(modeset->connectors[i]);
-		modeset->connectors[i] = NULL;
-	}
-	modeset->num_connectors = 0;
-
-	drm_mode_destroy(helper->dev, modeset->mode);
-	modeset->mode = NULL;
-
-	/* FIXME should hold a ref? */
-	modeset->fb = NULL;
-}
-
-static void drm_fb_helper_crtc_free(struct drm_fb_helper *helper)
-{
-	int i;
-
-	for (i = 0; i < helper->connector_count; i++) {
-		drm_connector_put(helper->connector_info[i]->connector);
-		kfree(helper->connector_info[i]);
-	}
-	kfree(helper->connector_info);
-
-	for (i = 0; i < helper->crtc_count; i++) {
-		struct drm_mode_set *modeset = &helper->crtc_info[i].mode_set;
-
-		drm_fb_helper_modeset_release(helper, modeset);
-		kfree(modeset->connectors);
-	}
-	kfree(helper->crtc_info);
-}
-
 static void drm_fb_helper_resume_worker(struct work_struct *work)
 {
 	struct drm_fb_helper *helper = container_of(work, struct drm_fb_helper,
@@ -882,7 +854,7 @@ EXPORT_SYMBOL(drm_fb_helper_prepare);
  * drm_fb_helper_init - initialize a &struct drm_fb_helper
  * @dev: drm device
  * @fb_helper: driver-allocated fbdev helper structure to initialize
- * @max_conn_count: max connector count
+ * @max_conn_count: max connector count (not used)
  *
  * This allocates the structures for the fbdev helper with the given limits.
  * Note that this won't yet touch the hardware (through the driver interfaces)
@@ -898,53 +870,36 @@ int drm_fb_helper_init(struct drm_device *dev,
 		       struct drm_fb_helper *fb_helper,
 		       int max_conn_count)
 {
-	struct drm_crtc *crtc;
-	struct drm_mode_config *config = &dev->mode_config;
-	int i;
+	int ret;
 
 	if (!drm_fbdev_emulation) {
 		dev->fb_helper = fb_helper;
 		return 0;
 	}
 
-	if (!max_conn_count)
-		return -EINVAL;
-
-	fb_helper->crtc_info = kcalloc(config->num_crtc, sizeof(struct drm_fb_helper_crtc), GFP_KERNEL);
-	if (!fb_helper->crtc_info)
-		return -ENOMEM;
+	/*
+	 * If this is not the generic fbdev client, initialize a drm_client
+	 * without callbacks so we can use the modesets.
+	 */
+	if (!fb_helper->client.funcs) {
+		ret = drm_client_init(dev, &fb_helper->client, "drm_fb_helper", NULL);
+		if (ret)
+			return ret;
+	}
 
-	fb_helper->crtc_count = config->num_crtc;
 	fb_helper->connector_info = kcalloc(dev->mode_config.num_connector, sizeof(struct drm_fb_helper_connector *), GFP_KERNEL);
-	if (!fb_helper->connector_info) {
-		kfree(fb_helper->crtc_info);
-		return -ENOMEM;
-	}
+	if (!fb_helper->connector_info)
+		goto out_free;
+
 	fb_helper->connector_info_alloc_count = dev->mode_config.num_connector;
 	fb_helper->connector_count = 0;
 
-	for (i = 0; i < fb_helper->crtc_count; i++) {
-		fb_helper->crtc_info[i].mode_set.connectors =
-			kcalloc(max_conn_count,
-				sizeof(struct drm_connector *),
-				GFP_KERNEL);
-
-		if (!fb_helper->crtc_info[i].mode_set.connectors)
-			goto out_free;
-		fb_helper->crtc_info[i].mode_set.num_connectors = 0;
-	}
-
-	i = 0;
-	drm_for_each_crtc(crtc, dev) {
-		fb_helper->crtc_info[i].mode_set.crtc = crtc;
-		i++;
-	}
-
 	dev->fb_helper = fb_helper;
 
 	return 0;
 out_free:
-	drm_fb_helper_crtc_free(fb_helper);
+	drm_client_release(&fb_helper->client);
+
 	return -ENOMEM;
 }
 EXPORT_SYMBOL(drm_fb_helper_init);
@@ -1020,6 +975,7 @@ EXPORT_SYMBOL(drm_fb_helper_unregister_fbi);
 void drm_fb_helper_fini(struct drm_fb_helper *fb_helper)
 {
 	struct fb_info *info;
+	int i;
 
 	if (!fb_helper)
 		return;
@@ -1049,8 +1005,15 @@ void drm_fb_helper_fini(struct drm_fb_helper *fb_helper)
 	mutex_unlock(&kernel_fb_helper_lock);
 
 	mutex_destroy(&fb_helper->lock);
-	drm_fb_helper_crtc_free(fb_helper);
 
+	if (!fb_helper->client.funcs)
+		drm_client_release(&fb_helper->client);
+
+	for (i = 0; i < fb_helper->connector_count; i++) {
+		drm_connector_put(fb_helper->connector_info[i]->connector);
+		kfree(fb_helper->connector_info[i]);
+	}
+	kfree(fb_helper->connector_info);
 }
 EXPORT_SYMBOL(drm_fb_helper_fini);
 
@@ -1395,13 +1358,14 @@ static int setcmap_pseudo_palette(struct fb_cmap *cmap, struct fb_info *info)
 static int setcmap_legacy(struct fb_cmap *cmap, struct fb_info *info)
 {
 	struct drm_fb_helper *fb_helper = info->par;
+	struct drm_mode_set *modeset;
 	struct drm_crtc *crtc;
 	u16 *r, *g, *b;
-	int i, ret = 0;
+	int ret = 0;
 
 	drm_modeset_lock_all(fb_helper->dev);
-	for (i = 0; i < fb_helper->crtc_count; i++) {
-		crtc = fb_helper->crtc_info[i].mode_set.crtc;
+	drm_client_for_each_modeset(modeset, &fb_helper->client) {
+		crtc = modeset->crtc;
 		if (!crtc->funcs->gamma_set || !crtc->gamma_size)
 			return -EINVAL;
 
@@ -1477,10 +1441,11 @@ static int setcmap_atomic(struct fb_cmap *cmap, struct fb_info *info)
 	struct drm_modeset_acquire_ctx ctx;
 	struct drm_crtc_state *crtc_state;
 	struct drm_atomic_state *state;
+	struct drm_mode_set *modeset;
 	struct drm_crtc *crtc;
 	u16 *r, *g, *b;
-	int i, ret = 0;
 	bool replaced;
+	int ret = 0;
 
 	drm_modeset_acquire_init(&ctx, 0);
 
@@ -1492,8 +1457,8 @@ static int setcmap_atomic(struct fb_cmap *cmap, struct fb_info *info)
 
 	state->acquire_ctx = &ctx;
 retry:
-	for (i = 0; i < fb_helper->crtc_count; i++) {
-		crtc = fb_helper->crtc_info[i].mode_set.crtc;
+	drm_client_for_each_modeset(modeset, &fb_helper->client) {
+		crtc = modeset->crtc;
 
 		if (!gamma_lut)
 			gamma_lut = setcmap_new_gamma_lut(crtc, cmap);
@@ -1521,8 +1486,8 @@ retry:
 	if (ret)
 		goto out_state;
 
-	for (i = 0; i < fb_helper->crtc_count; i++) {
-		crtc = fb_helper->crtc_info[i].mode_set.crtc;
+	drm_client_for_each_modeset(modeset, &fb_helper->client) {
+		crtc = modeset->crtc;
 
 		r = crtc->gamma_store;
 		g = r + crtc->gamma_size;
@@ -1572,12 +1537,14 @@ int drm_fb_helper_setcmap(struct fb_cmap *cmap, struct fb_info *info)
 		goto unlock;
 	}
 
+	mutex_lock(&fb_helper->client.modeset_mutex);
 	if (info->fix.visual == FB_VISUAL_TRUECOLOR)
 		ret = setcmap_pseudo_palette(cmap, info);
 	else if (drm_drv_uses_atomic_modeset(fb_helper->dev))
 		ret = setcmap_atomic(cmap, info);
 	else
 		ret = setcmap_legacy(cmap, info);
+	mutex_unlock(&fb_helper->client.modeset_mutex);
 
 	drm_master_internal_release(dev);
 unlock:
@@ -1601,7 +1568,6 @@ int drm_fb_helper_ioctl(struct fb_info *info, unsigned int cmd,
 {
 	struct drm_fb_helper *fb_helper = info->par;
 	struct drm_device *dev = fb_helper->dev;
-	struct drm_mode_set *mode_set;
 	struct drm_crtc *crtc;
 	int ret = 0;
 
@@ -1629,8 +1595,7 @@ int drm_fb_helper_ioctl(struct fb_info *info, unsigned int cmd,
 		 * make. If we're not smart enough here, one should
 		 * just consider switch the userspace to KMS.
 		 */
-		mode_set = &fb_helper->crtc_info[0].mode_set;
-		crtc = mode_set->crtc;
+		crtc = fb_helper->client.modesets[0].crtc;
 
 		/*
 		 * Only wait for a vblank event if the CRTC is
@@ -1827,16 +1792,14 @@ EXPORT_SYMBOL(drm_fb_helper_set_par);
 
 static void pan_set(struct drm_fb_helper *fb_helper, int x, int y)
 {
-	int i;
-
-	for (i = 0; i < fb_helper->crtc_count; i++) {
-		struct drm_mode_set *mode_set;
-
-		mode_set = &fb_helper->crtc_info[i].mode_set;
+	struct drm_mode_set *mode_set;
 
+	mutex_lock(&fb_helper->client.modeset_mutex);
+	drm_client_for_each_modeset(mode_set, &fb_helper->client) {
 		mode_set->x = x;
 		mode_set->y = y;
 	}
+	mutex_unlock(&fb_helper->client.modeset_mutex);
 }
 
 static int pan_display_atomic(struct fb_var_screeninfo *var,
@@ -1847,7 +1810,7 @@ static int pan_display_atomic(struct fb_var_screeninfo *var,
 
 	pan_set(fb_helper, var->xoffset, var->yoffset);
 
-	ret = restore_fbdev_mode_atomic(fb_helper, true);
+	ret = restore_fbdev_mode_force(fb_helper);
 	if (!ret) {
 		info->var.xoffset = var->xoffset;
 		info->var.yoffset = var->yoffset;
@@ -1861,14 +1824,13 @@ static int pan_display_legacy(struct fb_var_screeninfo *var,
 			      struct fb_info *info)
 {
 	struct drm_fb_helper *fb_helper = info->par;
+	struct drm_client_dev *client = &fb_helper->client;
 	struct drm_mode_set *modeset;
 	int ret = 0;
-	int i;
 
 	drm_modeset_lock_all(fb_helper->dev);
-	for (i = 0; i < fb_helper->crtc_count; i++) {
-		modeset = &fb_helper->crtc_info[i].mode_set;
-
+	mutex_lock(&client->modeset_mutex);
+	drm_client_for_each_modeset(modeset, client) {
 		modeset->x = var->xoffset;
 		modeset->y = var->yoffset;
 
@@ -1880,6 +1842,7 @@ static int pan_display_legacy(struct fb_var_screeninfo *var,
 			}
 		}
 	}
+	mutex_unlock(&client->modeset_mutex);
 	drm_modeset_unlock_all(fb_helper->dev);
 
 	return ret;
@@ -1926,10 +1889,12 @@ EXPORT_SYMBOL(drm_fb_helper_pan_display);
 static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper,
 					 int preferred_bpp)
 {
+	struct drm_client_dev *client = &fb_helper->client;
 	int ret = 0;
 	int crtc_count = 0;
 	int i;
 	struct drm_fb_helper_surface_size sizes;
+	struct drm_mode_set *mode_set;
 	int best_depth = 0;
 
 	memset(&sizes, 0, sizeof(struct drm_fb_helper_surface_size));
@@ -1980,13 +1945,13 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper,
 	 * supports RGBA5551 (16 bpp, depth 15) but not RGB565 (16 bpp, depth
 	 * 16) we need to scale down the depth of the sizes we request.
 	 */
-	for (i = 0; i < fb_helper->crtc_count; i++) {
-		struct drm_mode_set *mode_set = &fb_helper->crtc_info[i].mode_set;
+	mutex_lock(&client->modeset_mutex);
+	drm_client_for_each_modeset(mode_set, client) {
 		struct drm_crtc *crtc = mode_set->crtc;
 		struct drm_plane *plane = crtc->primary;
 		int j;
 
-		DRM_DEBUG("test CRTC %d primary plane\n", i);
+		DRM_DEBUG("test CRTC %u primary plane\n", drm_crtc_index(crtc));
 
 		for (j = 0; j < plane->format_count; j++) {
 			const struct drm_format_info *fmt;
@@ -2026,9 +1991,8 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper,
 
 	/* first up get a count of crtcs now in use and new min/maxes width/heights */
 	crtc_count = 0;
-	for (i = 0; i < fb_helper->crtc_count; i++) {
+	drm_client_for_each_modeset(mode_set, client) {
 		struct drm_display_mode *desired_mode;
-		struct drm_mode_set *mode_set;
 		int x, y, j;
 		/* in case of tile group, are we the last tile vert or horiz?
 		 * If no tile group you are always the last one both vertically
@@ -2036,7 +2000,6 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper,
 		 */
 		bool lastv = true, lasth = true;
 
-		mode_set = &fb_helper->crtc_info[i].mode_set;
 		desired_mode = mode_set->mode;
 
 		if (!desired_mode)
@@ -2066,6 +2029,7 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper,
 		if (lastv)
 			sizes.fb_height = min_t(u32, desired_mode->vdisplay + y, sizes.fb_height);
 	}
+	mutex_unlock(&client->modeset_mutex);
 
 	if (crtc_count == 0 || sizes.fb_width == -1 || sizes.fb_height == -1) {
 		DRM_INFO("Cannot find any crtc or sizes\n");
@@ -2297,7 +2261,7 @@ static bool drm_target_cloned(struct drm_fb_helper *fb_helper,
 	struct drm_display_mode *dmt_mode, *mode;
 
 	/* only contemplate cloning in the single crtc case */
-	if (fb_helper->crtc_count > 1)
+	if (fb_helper->dev->mode_config.num_crtc > 1)
 		return false;
 
 	count = 0;
@@ -2486,15 +2450,17 @@ static bool connector_has_possible_crtc(struct drm_connector *connector,
 }
 
 static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
-			  struct drm_fb_helper_crtc **best_crtcs,
+			  struct drm_crtc **best_crtcs,
 			  struct drm_display_mode **modes,
 			  int n, int width, int height)
 {
-	int c, o;
+	struct drm_client_dev *client = &fb_helper->client;
 	struct drm_connector *connector;
 	int my_score, best_score, score;
-	struct drm_fb_helper_crtc **crtcs, *crtc;
+	struct drm_crtc **crtcs, *crtc;
+	struct drm_mode_set *modeset;
 	struct drm_fb_helper_connector *fb_helper_conn;
+	int o;
 
 	if (n == fb_helper->connector_count)
 		return 0;
@@ -2507,8 +2473,7 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
 	if (modes[n] == NULL)
 		return best_score;
 
-	crtcs = kcalloc(fb_helper->connector_count,
-			sizeof(struct drm_fb_helper_crtc *), GFP_KERNEL);
+	crtcs = kcalloc(fb_helper->connector_count, sizeof(*crtcs), GFP_KERNEL);
 	if (!crtcs)
 		return best_score;
 
@@ -2524,11 +2489,10 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
 	 * select a crtc for this connector and then attempt to configure
 	 * remaining connectors
 	 */
-	for (c = 0; c < fb_helper->crtc_count; c++) {
-		crtc = &fb_helper->crtc_info[c];
+	drm_client_for_each_modeset(modeset, client) {
+		crtc = modeset->crtc;
 
-		if (!connector_has_possible_crtc(connector,
-						 crtc->mode_set.crtc))
+		if (!connector_has_possible_crtc(connector, crtc))
 			continue;
 
 		for (o = 0; o < n; o++)
@@ -2537,7 +2501,7 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
 
 		if (o < n) {
 			/* ignore cloning unless only a single crtc */
-			if (fb_helper->crtc_count > 1)
+			if (fb_helper->dev->mode_config.num_crtc > 1)
 				continue;
 
 			if (!drm_mode_equal(modes[o], modes[n]))
@@ -2545,14 +2509,13 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
 		}
 
 		crtcs[n] = crtc;
-		memcpy(crtcs, best_crtcs, n * sizeof(struct drm_fb_helper_crtc *));
+		memcpy(crtcs, best_crtcs, n * sizeof(*crtcs));
 		score = my_score + drm_pick_crtcs(fb_helper, crtcs, modes, n + 1,
 						  width, height);
 		if (score > best_score) {
 			best_score = score;
 			memcpy(best_crtcs, crtcs,
-			       fb_helper->connector_count *
-			       sizeof(struct drm_fb_helper_crtc *));
+			       fb_helper->connector_count * sizeof(*crtcs));
 		}
 	}
 
@@ -2560,21 +2523,9 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
 	return best_score;
 }
 
-static struct drm_fb_helper_crtc *
-drm_fb_helper_crtc(struct drm_fb_helper *fb_helper, struct drm_crtc *crtc)
-{
-	int i;
-
-	for (i = 0; i < fb_helper->crtc_count; i++)
-		if (fb_helper->crtc_info[i].mode_set.crtc == crtc)
-			return &fb_helper->crtc_info[i];
-
-	return NULL;
-}
-
 /* Try to read the BIOS display configuration and use it for the initial config */
 static bool drm_fb_helper_firmware_config(struct drm_fb_helper *fb_helper,
-					  struct drm_fb_helper_crtc **crtcs,
+					  struct drm_crtc **crtcs,
 					  struct drm_display_mode **modes,
 					  struct drm_fb_offset *offsets,
 					  bool *enabled, int width, int height)
@@ -2610,7 +2561,7 @@ retry:
 		struct drm_fb_helper_connector *fb_conn;
 		struct drm_connector *connector;
 		struct drm_encoder *encoder;
-		struct drm_fb_helper_crtc *new_crtc;
+		struct drm_crtc *new_crtc;
 
 		fb_conn = fb_helper->connector_info[i];
 		connector = fb_conn->connector;
@@ -2652,7 +2603,7 @@ retry:
 
 		num_connectors_enabled++;
 
-		new_crtc = drm_fb_helper_crtc(fb_helper, connector->state->crtc);
+		new_crtc = connector->state->crtc;
 
 		/*
 		 * Make sure we're not trying to drive multiple connectors
@@ -2752,10 +2703,11 @@ bail:
 static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 			    u32 width, u32 height)
 {
+	struct drm_client_dev *client = &fb_helper->client;
 	struct drm_device *dev = fb_helper->dev;
-	struct drm_fb_helper_crtc **crtcs;
 	struct drm_display_mode **modes;
 	struct drm_fb_offset *offsets;
+	struct drm_crtc **crtcs;
 	bool *enabled;
 	int i;
 
@@ -2763,8 +2715,7 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 	/* prevent concurrent modification of connector_count by hotplug */
 	lockdep_assert_held(&fb_helper->lock);
 
-	crtcs = kcalloc(fb_helper->connector_count,
-			sizeof(struct drm_fb_helper_crtc *), GFP_KERNEL);
+	crtcs = kcalloc(fb_helper->connector_count, sizeof(*crtcs), GFP_KERNEL);
 	modes = kcalloc(fb_helper->connector_count,
 			sizeof(struct drm_display_mode *), GFP_KERNEL);
 	offsets = kcalloc(fb_helper->connector_count,
@@ -2776,6 +2727,8 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 		goto out;
 	}
 
+	mutex_lock(&client->modeset_mutex);
+
 	mutex_lock(&fb_helper->dev->mode_config.mutex);
 	if (drm_fb_helper_probe_connector_modes(fb_helper, width, height) == 0)
 		DRM_DEBUG_KMS("No connectors reported connected with modes\n");
@@ -2800,24 +2753,24 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 	}
 	mutex_unlock(&fb_helper->dev->mode_config.mutex);
 
-	/* need to set the modesets up here for use later */
-	/* fill out the connector<->crtc mappings into the modesets */
-	for (i = 0; i < fb_helper->crtc_count; i++)
-		drm_fb_helper_modeset_release(fb_helper,
-					      &fb_helper->crtc_info[i].mode_set);
+	drm_client_modeset_release(client);
 
 	drm_fb_helper_for_each_connector(fb_helper, i) {
 		struct drm_display_mode *mode = modes[i];
-		struct drm_fb_helper_crtc *fb_crtc = crtcs[i];
+		struct drm_crtc *crtc = crtcs[i];
 		struct drm_fb_offset *offset = &offsets[i];
 
-		if (mode && fb_crtc) {
-			struct drm_mode_set *modeset = &fb_crtc->mode_set;
+		if (mode && crtc) {
+			struct drm_mode_set *modeset = drm_client_find_modeset(client, crtc);
 			struct drm_connector *connector =
 				fb_helper->connector_info[i]->connector;
 
 			DRM_DEBUG_KMS("desired mode %s set on crtc %d (%d,%d)\n",
-				      mode->name, fb_crtc->mode_set.crtc->base.id, offset->x, offset->y);
+				      mode->name, crtc->base.id, offset->x, offset->y);
+
+			if (WARN_ON_ONCE(modeset->num_connectors == DRM_CLIENT_MAX_CLONED_CONNECTORS ||
+					 (dev->mode_config.num_crtc > 1 && modeset->num_connectors == 1)))
+				break;
 
 			modeset->mode = drm_mode_duplicate(dev, mode);
 			drm_connector_get(connector);
@@ -2826,6 +2779,8 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 			modeset->y = offset->y;
 		}
 	}
+
+	mutex_unlock(&client->modeset_mutex);
 out:
 	kfree(crtcs);
 	kfree(modes);
@@ -2842,13 +2797,14 @@ out:
  */
 static void drm_setup_crtcs_fb(struct drm_fb_helper *fb_helper)
 {
+	struct drm_client_dev *client = &fb_helper->client;
 	struct fb_info *info = fb_helper->fbdev;
 	unsigned int rotation, sw_rotations = 0;
+	struct drm_mode_set *modeset;
 	int i;
 
-	for (i = 0; i < fb_helper->crtc_count; i++) {
-		struct drm_mode_set *modeset = &fb_helper->crtc_info[i].mode_set;
-
+	mutex_lock(&client->modeset_mutex);
+	drm_client_for_each_modeset(modeset, client) {
 		if (!modeset->num_connectors)
 			continue;
 
@@ -2860,6 +2816,7 @@ static void drm_setup_crtcs_fb(struct drm_fb_helper *fb_helper)
 		else
 			sw_rotations |= rotation;
 	}
+	mutex_unlock(&client->modeset_mutex);
 
 	mutex_lock(&fb_helper->dev->mode_config.mutex);
 	drm_fb_helper_for_each_connector(fb_helper, i) {
@@ -3075,8 +3032,7 @@ EXPORT_SYMBOL(drm_fb_helper_hotplug_event);
  * @funcs: fbdev helper functions
  * @preferred_bpp: Preferred bits per pixel for the device.
  *                 @dev->mode_config.preferred_depth is used if this is zero.
- * @max_conn_count: Maximum number of connectors.
- *                  @dev->mode_config.num_connector is used if this is zero.
+ * @max_conn_count: Maximum number of connectors (not used)
  *
  * This function sets up fbdev emulation and registers fbdev for access by
  * userspace. If all connectors are disconnected, setup is deferred to the next
@@ -3104,16 +3060,9 @@ int drm_fb_helper_fbdev_setup(struct drm_device *dev,
 	if (!preferred_bpp)
 		preferred_bpp = 32;
 
-	if (!max_conn_count)
-		max_conn_count = dev->mode_config.num_connector;
-	if (!max_conn_count) {
-		DRM_DEV_ERROR(dev->dev, "fbdev: No connectors\n");
-		return -EINVAL;
-	}
-
 	drm_fb_helper_prepare(dev, fb_helper, funcs);
 
-	ret = drm_fb_helper_init(dev, fb_helper, max_conn_count);
+	ret = drm_fb_helper_init(dev, fb_helper, 0);
 	if (ret < 0) {
 		DRM_DEV_ERROR(dev->dev, "fbdev: Failed to initialize (ret=%d)\n", ret);
 		return ret;
@@ -3426,7 +3375,7 @@ static int drm_fbdev_client_hotplug(struct drm_client_dev *client)
 
 	drm_fb_helper_prepare(dev, fb_helper, &drm_fb_helper_generic_funcs);
 
-	ret = drm_fb_helper_init(dev, fb_helper, dev->mode_config.num_connector);
+	ret = drm_fb_helper_init(dev, fb_helper, 0);
 	if (ret)
 		goto err;
 
diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h
index 268b2cf0052a..87be9aeb1fe0 100644
--- a/include/drm/drm_client.h
+++ b/include/drm/drm_client.h
@@ -3,8 +3,12 @@
 #ifndef _DRM_CLIENT_H_
 #define _DRM_CLIENT_H_
 
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
 #include <linux/types.h>
 
+#include <drm/drm_crtc.h>
+
 struct drm_client_dev;
 struct drm_device;
 struct drm_file;
@@ -13,6 +17,8 @@ struct drm_gem_object;
 struct drm_minor;
 struct module;
 
+#define DRM_CLIENT_MAX_CLONED_CONNECTORS	8
+
 /**
  * struct drm_client_funcs - DRM client callbacks
  */
@@ -85,6 +91,16 @@ struct drm_client_dev {
 	 * @file: DRM file
 	 */
 	struct drm_file *file;
+
+	/**
+	 * @modeset_mutex: Protects @modesets.
+	 */
+	struct mutex modeset_mutex;
+
+	/**
+	 * @modesets: CRTC configurations
+	 */
+	struct drm_mode_set *modesets;
 };
 
 int drm_client_init(struct drm_device *dev, struct drm_client_dev *client,
@@ -135,6 +151,20 @@ struct drm_client_buffer *
 drm_client_framebuffer_create(struct drm_client_dev *client, u32 width, u32 height, u32 format);
 void drm_client_framebuffer_delete(struct drm_client_buffer *buffer);
 
+int drm_client_modeset_create(struct drm_client_dev *client);
+void drm_client_modeset_free(struct drm_client_dev *client);
+void drm_client_modeset_release(struct drm_client_dev *client);
+struct drm_mode_set *drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc);
+
+/**
+ * drm_client_for_each_modeset() - Iterate over client modesets
+ * @modeset: &drm_mode_set loop cursor
+ * @client: DRM client
+ */
+#define drm_client_for_each_modeset(modeset, client) \
+	for (({ lockdep_assert_held(&(client)->modeset_mutex); }), \
+	     modeset = (client)->modesets; modeset->crtc; modeset++)
+
 int drm_client_debugfs_init(struct drm_minor *minor);
 
 #endif
diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h
index 2af1c6d3e147..6b334f4d8a22 100644
--- a/include/drm/drm_fb_helper.h
+++ b/include/drm/drm_fb_helper.h
@@ -47,10 +47,6 @@ struct drm_fb_offset {
 	int x, y;
 };
 
-struct drm_fb_helper_crtc {
-	struct drm_mode_set mode_set;
-};
-
 /**
  * struct drm_fb_helper_surface_size - describes fbdev size and scanout surface size
  * @fb_width: fbdev width
@@ -109,8 +105,6 @@ struct drm_fb_helper_connector {
  * struct drm_fb_helper - main structure to emulate fbdev on top of KMS
  * @fb: Scanout framebuffer object
  * @dev: DRM device
- * @crtc_count: number of possible CRTCs
- * @crtc_info: per-CRTC helper state (mode, x/y offset, etc)
  * @connector_count: number of connected connectors
  * @connector_info_alloc_count: size of connector_info
  * @funcs: driver callbacks for fb helper
@@ -144,8 +138,6 @@ struct drm_fb_helper {
 
 	struct drm_framebuffer *fb;
 	struct drm_device *dev;
-	int crtc_count;
-	struct drm_fb_helper_crtc *crtc_info;
 	int connector_count;
 	int connector_info_alloc_count;
 	/**
-- 
cgit v1.2.3


From a09db883e5d938b525a86a4630fc04f98ff1063d Mon Sep 17 00:00:00 2001
From: Uma Shankar <uma.shankar@intel.com>
Date: Tue, 4 Jun 2019 16:47:02 +0530
Subject: drm: Fix docbook warnings in hdr metadata helper structures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes the following warnings:
./include/drm/drm_mode_config.h:841: warning: Incorrect use of
kernel-doc format:          * hdr_output_metadata_property: Connector
property containing hdr
./include/drm/drm_mode_config.h:918: warning: Function parameter or member 'hdr_output_metadata_property' not described in 'drm_mode_config'
./include/drm/drm_connector.h:1251: warning: Function parameter or member 'hdr_output_metadata' not described in 'drm_connector'
./include/drm/drm_connector.h:1251: warning: Function parameter or member 'hdr_sink_metadata' not described in 'drm_connector'

Also adds some property documentation for HDR Metadata Connector
Property in connector property create function.

v2: Fixed Sean Paul's review comments.

v3: Fixed Daniel Vetter's review comments, added the UAPI structure
definition section in kernel docs.

v4: Fixed Daniel Vetter's review comments.

v5: Added structure member references as per Daniel's suggestion.

Cc: Shashank Sharma <shashank.sharma@intel.com>
Cc: Ville Syrjä <ville.syrjala@linux.intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <maxime.ripard@bootlin.com>
Cc: Sean Paul <sean@poorly.run>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: "Ville Syrjä" <ville.syrjala@linux.intel.com>
Cc: Hans Verkuil <hansverk@cisco.com>
Cc: dri-devel@lists.freedesktop.org
Cc: linux-fbdev@vger.kernel.org
Reviewed-by: Sean Paul <sean@poorly.run> (v1)
Signed-off-by: Uma Shankar <uma.shankar@intel.com>
[danvet: Fix up markup: () for functions, & for structs. Style guide
also recommends to prepend struct for structures.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/1559647022-7336-1-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_connector.c | 40 ++++++++++++++++++++++
 include/drm/drm_connector.h     |  1 +
 include/drm/drm_mode_config.h   |  4 +--
 include/linux/hdmi.h            | 12 +++++++
 include/uapi/drm/drm_mode.h     | 74 ++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 128 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index c9ac8b9e83ea..e17586aaa80f 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -956,6 +956,46 @@ static const struct drm_prop_enum_list hdmi_colorspaces[] = {
  *	  is no longer protected and userspace should take appropriate action
  *	  (whatever that might be).
  *
+ * HDR_OUTPUT_METADATA:
+ *	Connector property to enable userspace to send HDR Metadata to
+ *	driver. This metadata is based on the composition and blending
+ *	policies decided by user, taking into account the hardware and
+ *	sink capabilities. The driver gets this metadata and creates a
+ *	Dynamic Range and Mastering Infoframe (DRM) in case of HDMI,
+ *	SDP packet (Non-audio INFOFRAME SDP v1.3) for DP. This is then
+ *	sent to sink. This notifies the sink of the upcoming frame's Color
+ *	Encoding and Luminance parameters.
+ *
+ *	Userspace first need to detect the HDR capabilities of sink by
+ *	reading and parsing the EDID. Details of HDR metadata for HDMI
+ *	are added in CTA 861.G spec. For DP , its defined in VESA DP
+ *	Standard v1.4. It needs to then get the metadata information
+ *	of the video/game/app content which are encoded in HDR (basically
+ *	using HDR transfer functions). With this information it needs to
+ *	decide on a blending policy and compose the relevant
+ *	layers/overlays into a common format. Once this blending is done,
+ *	userspace will be aware of the metadata of the composed frame to
+ *	be send to sink. It then uses this property to communicate this
+ *	metadata to driver which then make a Infoframe packet and sends
+ *	to sink based on the type of encoder connected.
+ *
+ *	Userspace will be responsible to do Tone mapping operation in case:
+ *		- Some layers are HDR and others are SDR
+ *		- HDR layers luminance is not same as sink
+ *	It will even need to do colorspace conversion and get all layers
+ *	to one common colorspace for blending. It can use either GL, Media
+ *	or display engine to get this done based on the capabilties of the
+ *	associated hardware.
+ *
+ *	Driver expects metadata to be put in &struct hdr_output_metadata
+ *	structure from userspace. This is received as blob and stored in
+ *	&drm_connector_state.hdr_output_metadata. It parses EDID and saves the
+ *	sink metadata in &struct hdr_sink_metadata, as
+ *	&drm_connector.hdr_sink_metadata.  Driver uses
+ *	drm_hdmi_infoframe_set_hdr_metadata() helper to set the HDR metadata,
+ *	hdmi_drm_infoframe_pack() to pack the infoframe as per spec, in case of
+ *	HDMI encoder.
+ *
  * max bpc:
  *	This range property is used by userspace to limit the bit depth. When
  *	used the driver would limit the bpc in accordance with the valid range
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index 547656173c74..47e749b74e5f 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -1244,6 +1244,7 @@ struct drm_connector {
 	 */
 	struct llist_node free_node;
 
+	/** @hdr_sink_metadata: HDR Metadata Information read from sink */
 	struct hdr_sink_metadata hdr_sink_metadata;
 };
 
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 4f88cc972407..759d462d028b 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -837,8 +837,8 @@ struct drm_mode_config {
 	struct drm_property *writeback_out_fence_ptr_property;
 
 	/**
-	 * hdr_output_metadata_property: Connector property containing hdr
-	 * metatda. This will be provided by userspace compositors based
+	 * @hdr_output_metadata_property: Connector property containing hdr
+	 * metatada. This will be provided by userspace compositors based
 	 * on HDR content
 	 */
 	struct drm_property *hdr_output_metadata_property;
diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h
index ee55ba589cdc..9918a6c910c5 100644
--- a/include/linux/hdmi.h
+++ b/include/linux/hdmi.h
@@ -367,8 +367,19 @@ struct hdr_static_metadata {
 	__u16 min_cll;
 };
 
+/**
+ * struct hdr_sink_metadata - HDR sink metadata
+ *
+ * Metadata Information read from Sink's EDID
+ */
 struct hdr_sink_metadata {
+	/**
+	 * @metadata_type: Static_Metadata_Descriptor_ID.
+	 */
 	__u32 metadata_type;
+	/**
+	 * @hdmi_type1: HDR Metadata Infoframe.
+	 */
 	union {
 		struct hdr_static_metadata hdmi_type1;
 	};
@@ -398,6 +409,7 @@ union hdmi_vendor_any_infoframe {
  * @spd: spd infoframe
  * @vendor: union of all vendor infoframes
  * @audio: audio infoframe
+ * @drm: Dynamic Range and Mastering infoframe
  *
  * This is used by the generic pack function. This works since all infoframes
  * have the same header which also indicates which type of infoframe should be
diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index 19b5cf368cff..5ab331e5dc23 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@ -33,6 +33,15 @@
 extern "C" {
 #endif
 
+/**
+ * DOC: overview
+ *
+ * DRM exposes many UAPI and structure definition to have a consistent
+ * and standardized interface with user.
+ * Userspace can refer to these structure definitions and UAPI formats
+ * to communicate to driver
+ */
+
 #define DRM_CONNECTOR_NAME_LEN	32
 #define DRM_DISPLAY_MODE_LEN	32
 #define DRM_PROP_NAME_LEN	32
@@ -630,24 +639,87 @@ struct drm_color_lut {
 	__u16 reserved;
 };
 
-/* HDR Metadata Infoframe as per 861.G spec */
+/**
+ * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data.
+ *
+ * HDR Metadata Infoframe as per CTA 861.G spec. This is expected
+ * to match exactly with the spec.
+ *
+ * Userspace is expected to pass the metadata information as per
+ * the format described in this structure.
+ */
 struct hdr_metadata_infoframe {
+	/**
+	 * @eotf: Electro-Optical Transfer Function (EOTF)
+	 * used in the stream.
+	 */
 	__u8 eotf;
+	/**
+	 * @metadata_type: Static_Metadata_Descriptor_ID.
+	 */
 	__u8 metadata_type;
+	/**
+	 * @display_primaries: Color Primaries of the Data.
+	 * These are coded as unsigned 16-bit values in units of
+	 * 0.00002, where 0x0000 represents zero and 0xC350
+	 * represents 1.0000.
+	 * @display_primaries.x: X cordinate of color primary.
+	 * @display_primaries.y: Y cordinate of color primary.
+	 */
 	struct {
 		__u16 x, y;
 		} display_primaries[3];
+	/**
+	 * @white_point: White Point of Colorspace Data.
+	 * These are coded as unsigned 16-bit values in units of
+	 * 0.00002, where 0x0000 represents zero and 0xC350
+	 * represents 1.0000.
+	 * @white_point.x: X cordinate of whitepoint of color primary.
+	 * @white_point.y: Y cordinate of whitepoint of color primary.
+	 */
 	struct {
 		__u16 x, y;
 		} white_point;
+	/**
+	 * @max_display_mastering_luminance: Max Mastering Display Luminance.
+	 * This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
+	 * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
+	 */
 	__u16 max_display_mastering_luminance;
+	/**
+	 * @min_display_mastering_luminance: Min Mastering Display Luminance.
+	 * This value is coded as an unsigned 16-bit value in units of
+	 * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF
+	 * represents 6.5535 cd/m2.
+	 */
 	__u16 min_display_mastering_luminance;
+	/**
+	 * @max_cll: Max Content Light Level.
+	 * This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
+	 * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
+	 */
 	__u16 max_cll;
+	/**
+	 * @max_fall: Max Frame Average Light Level.
+	 * This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
+	 * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
+	 */
 	__u16 max_fall;
 };
 
+/**
+ * struct hdr_output_metadata - HDR output metadata
+ *
+ * Metadata Information to be passed from userspace
+ */
 struct hdr_output_metadata {
+	/**
+	 * @metadata_type: Static_Metadata_Descriptor_ID.
+	 */
 	__u32 metadata_type;
+	/**
+	 * @hdmi_metadata_type1: HDR Metadata Infoframe.
+	 */
 	union {
 		struct hdr_metadata_infoframe hdmi_metadata_type1;
 	};
-- 
cgit v1.2.3


From 1e390478cfb527e34c9ab89ba57212cb05c33c51 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 5 Jun 2019 10:46:05 +0200
Subject: gpu: host1x: Increase maximum DMA segment size

Recent versions of the DMA API debug code have started to warn about
violations of the maximum DMA segment size. This is because the segment
size defaults to 64 KiB, which can easily be exceeded in large buffer
allocations such as used in DRM/KMS for framebuffers.

Technically the Tegra SMMU and ARM SMMU don't have a maximum segment
size (they map individual pages irrespective of whether they are
contiguous or not), so the choice of 4 MiB is a bit arbitrary here. The
maximum segment size is a 32-bit unsigned integer, though, so we can't
set it to the correct maximum size, which would be the size of the
aperture.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/bus.c | 3 +++
 include/linux/host1x.h   | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c
index 103fffc1904b..c9a637d9417e 100644
--- a/drivers/gpu/host1x/bus.c
+++ b/drivers/gpu/host1x/bus.c
@@ -425,6 +425,9 @@ static int host1x_device_add(struct host1x *host1x,
 
 	of_dma_configure(&device->dev, host1x->dev->of_node, true);
 
+	device->dev.dma_parms = &device->dma_parms;
+	dma_set_max_seg_size(&device->dev, SZ_4M);
+
 	err = host1x_device_parse_dt(device, driver);
 	if (err < 0) {
 		kfree(device);
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 89110d896d72..aef6e2f73802 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -310,6 +310,8 @@ struct host1x_device {
 	struct list_head clients;
 
 	bool registered;
+
+	struct device_dma_parameters dma_parms;
 };
 
 static inline struct host1x_device *to_host1x_device(struct device *dev)
-- 
cgit v1.2.3


From ae96e8d7b6f437a0cece42c2e834d64c8f22234c Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Tue, 4 Jun 2019 17:07:10 +0200
Subject: drm/ttm: fix ttm_bo_unreserve
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since we now keep BOs on the LRU we need to explicitely remove
them from the LRU now after they are pinned.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/drm/ttm/ttm_bo_driver.h | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 9f54cf9c60df..c9b8ba492f24 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -767,14 +767,12 @@ static inline int ttm_bo_reserve_slowpath(struct ttm_buffer_object *bo,
  */
 static inline void ttm_bo_unreserve(struct ttm_buffer_object *bo)
 {
-	if (!(bo->mem.placement & TTM_PL_FLAG_NO_EVICT)) {
-		spin_lock(&bo->bdev->glob->lru_lock);
-		if (list_empty(&bo->lru))
-			ttm_bo_add_to_lru(bo);
-		else
-			ttm_bo_move_to_lru_tail(bo, NULL);
-		spin_unlock(&bo->bdev->glob->lru_lock);
-	}
+	spin_lock(&bo->bdev->glob->lru_lock);
+	if (list_empty(&bo->lru))
+		ttm_bo_add_to_lru(bo);
+	else
+		ttm_bo_move_to_lru_tail(bo, NULL);
+	spin_unlock(&bo->bdev->glob->lru_lock);
 	reservation_object_unlock(bo->resv);
 }
 
-- 
cgit v1.2.3


From 382d2af64e7d7083eefcc8c7e6c780634174ce87 Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Wed, 29 May 2019 14:02:03 +0300
Subject: drm/edid: Clean up DRM_EDID_DIGITAL_* flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Give the "DFP 1.x" bit a proper name, and clean up the rest
of the bits defines as well.

Cc: Mario Kleiner <mario.kleiner.de@gmail.com>
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190529110204.2384-1-ville.syrjala@linux.intel.com
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Mario Kleiner <mario.kleiner.de@gmail.com>
---
 drivers/gpu/drm/drm_edid.c |  2 +-
 include/drm/drm_edid.h     | 32 +++++++++++++++++---------------
 2 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index d87f574feeca..dd601ed6a30e 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -4570,7 +4570,7 @@ u32 drm_add_display_info(struct drm_connector *connector, const struct edid *edi
 	 * extensions which tell otherwise.
 	 */
 	if ((info->bpc == 0) && (edid->revision < 4) &&
-	    (edid->input & DRM_EDID_DIGITAL_TYPE_DVI)) {
+	    (edid->input & DRM_EDID_DIGITAL_DFP_1_X)) {
 		info->bpc = 8;
 		DRM_DEBUG("%s: Assigning DFP sink color depth as %d bpc.\n",
 			  connector->name, info->bpc);
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 0e21e91c4314..88b63801f9db 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -177,21 +177,23 @@ struct detailed_timing {
 #define DRM_EDID_INPUT_BLANK_TO_BLACK  (1 << 4)
 #define DRM_EDID_INPUT_VIDEO_LEVEL     (3 << 5)
 #define DRM_EDID_INPUT_DIGITAL         (1 << 7)
-#define DRM_EDID_DIGITAL_DEPTH_MASK    (7 << 4)
-#define DRM_EDID_DIGITAL_DEPTH_UNDEF   (0 << 4)
-#define DRM_EDID_DIGITAL_DEPTH_6       (1 << 4)
-#define DRM_EDID_DIGITAL_DEPTH_8       (2 << 4)
-#define DRM_EDID_DIGITAL_DEPTH_10      (3 << 4)
-#define DRM_EDID_DIGITAL_DEPTH_12      (4 << 4)
-#define DRM_EDID_DIGITAL_DEPTH_14      (5 << 4)
-#define DRM_EDID_DIGITAL_DEPTH_16      (6 << 4)
-#define DRM_EDID_DIGITAL_DEPTH_RSVD    (7 << 4)
-#define DRM_EDID_DIGITAL_TYPE_UNDEF    (0)
-#define DRM_EDID_DIGITAL_TYPE_DVI      (1)
-#define DRM_EDID_DIGITAL_TYPE_HDMI_A   (2)
-#define DRM_EDID_DIGITAL_TYPE_HDMI_B   (3)
-#define DRM_EDID_DIGITAL_TYPE_MDDI     (4)
-#define DRM_EDID_DIGITAL_TYPE_DP       (5)
+#define DRM_EDID_DIGITAL_DEPTH_MASK    (7 << 4) /* 1.4 */
+#define DRM_EDID_DIGITAL_DEPTH_UNDEF   (0 << 4) /* 1.4 */
+#define DRM_EDID_DIGITAL_DEPTH_6       (1 << 4) /* 1.4 */
+#define DRM_EDID_DIGITAL_DEPTH_8       (2 << 4) /* 1.4 */
+#define DRM_EDID_DIGITAL_DEPTH_10      (3 << 4) /* 1.4 */
+#define DRM_EDID_DIGITAL_DEPTH_12      (4 << 4) /* 1.4 */
+#define DRM_EDID_DIGITAL_DEPTH_14      (5 << 4) /* 1.4 */
+#define DRM_EDID_DIGITAL_DEPTH_16      (6 << 4) /* 1.4 */
+#define DRM_EDID_DIGITAL_DEPTH_RSVD    (7 << 4) /* 1.4 */
+#define DRM_EDID_DIGITAL_TYPE_MASK     (7 << 0) /* 1.4 */
+#define DRM_EDID_DIGITAL_TYPE_UNDEF    (0 << 0) /* 1.4 */
+#define DRM_EDID_DIGITAL_TYPE_DVI      (1 << 0) /* 1.4 */
+#define DRM_EDID_DIGITAL_TYPE_HDMI_A   (2 << 0) /* 1.4 */
+#define DRM_EDID_DIGITAL_TYPE_HDMI_B   (3 << 0) /* 1.4 */
+#define DRM_EDID_DIGITAL_TYPE_MDDI     (4 << 0) /* 1.4 */
+#define DRM_EDID_DIGITAL_TYPE_DP       (5 << 0) /* 1.4 */
+#define DRM_EDID_DIGITAL_DFP_1_X       (1 << 0) /* 1.3 */
 
 #define DRM_EDID_FEATURE_DEFAULT_GTF      (1 << 0)
 #define DRM_EDID_FEATURE_PREFERRED_TIMING (1 << 1)
-- 
cgit v1.2.3


From cc9bdecf4b8d20b3d3d0f8a6cb3e577548b5539f Mon Sep 17 00:00:00 2001
From: Lukasz Luba <l.luba@partner.samsung.com>
Date: Wed, 5 Jun 2019 18:53:58 +0200
Subject: clk: samsung: add needed IDs for DMC clocks in Exynos5420

Define new IDs for clocks used by Dynamic Memory Controller in
Exynos5422 SoC.

Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Acked-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Lukasz Luba <l.luba@partner.samsung.com>
Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
---
 include/dt-bindings/clock/exynos5420.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/exynos5420.h b/include/dt-bindings/clock/exynos5420.h
index 355f469943f1..02d5ac469a3d 100644
--- a/include/dt-bindings/clock/exynos5420.h
+++ b/include/dt-bindings/clock/exynos5420.h
@@ -60,6 +60,7 @@
 #define CLK_MAU_EPLL		159
 #define CLK_SCLK_HSIC_12M	160
 #define CLK_SCLK_MPHY_IXTAL24	161
+#define CLK_SCLK_BPLL		162
 
 /* gate clocks */
 #define CLK_UART0		257
@@ -195,6 +196,16 @@
 #define CLK_ACLK432_CAM		518
 #define CLK_ACLK_FL1550_CAM	519
 #define CLK_ACLK550_CAM		520
+#define CLK_CLKM_PHY0		521
+#define CLK_CLKM_PHY1		522
+#define CLK_ACLK_PPMU_DREX0_0	523
+#define CLK_ACLK_PPMU_DREX0_1	524
+#define CLK_ACLK_PPMU_DREX1_0	525
+#define CLK_ACLK_PPMU_DREX1_1	526
+#define CLK_PCLK_PPMU_DREX0_0	527
+#define CLK_PCLK_PPMU_DREX0_1	528
+#define CLK_PCLK_PPMU_DREX1_0	529
+#define CLK_PCLK_PPMU_DREX1_1	530
 
 /* mux clocks */
 #define CLK_MOUT_HDMI		640
@@ -217,6 +228,8 @@
 #define CLK_MOUT_EPLL		657
 #define CLK_MOUT_MAU_EPLL	658
 #define CLK_MOUT_USER_MAU_EPLL	659
+#define CLK_MOUT_SCLK_SPLL	660
+#define CLK_MOUT_MX_MSPLL_CCORE_PHY	661
 
 /* divider clocks */
 #define CLK_DOUT_PIXEL		768
@@ -248,8 +261,11 @@
 #define CLK_DOUT_CCLK_DREX0	794
 #define CLK_DOUT_CLK2X_PHY0	795
 #define CLK_DOUT_PCLK_CORE_MEM	796
+#define CLK_FF_DOUT_SPLL2	797
+#define CLK_DOUT_PCLK_DREX0	798
+#define CLK_DOUT_PCLK_DREX1	799
 
 /* must be greater than maximal clock id */
-#define CLK_NR_CLKS		797
+#define CLK_NR_CLKS		800
 
 #endif /* _DT_BINDINGS_CLOCK_EXYNOS_5420_H */
-- 
cgit v1.2.3


From 2076e5c0451ca943ff8ecc6def7239c84c77e070 Mon Sep 17 00:00:00 2001
From: Ralph Campbell <rcampbell@nvidia.com>
Date: Mon, 6 May 2019 16:29:38 -0700
Subject: mm/hmm: update HMM documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update the HMM documentation to reflect the latest API and make a few
minor wording changes.

Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 Documentation/vm/hmm.rst | 141 +++++++++++++++++++++++++----------------------
 include/linux/hmm.h      |   7 ++-
 2 files changed, 78 insertions(+), 70 deletions(-)

(limited to 'include')

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 7cdf7282e022..7b6eeda5a7c0 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -10,7 +10,7 @@ of this being specialized struct page for such memory (see sections 5 to 7 of
 this document).
 
 HMM also provides optional helpers for SVM (Share Virtual Memory), i.e.,
-allowing a device to transparently access program address coherently with
+allowing a device to transparently access program addresses coherently with
 the CPU meaning that any valid pointer on the CPU is also a valid pointer
 for the device. This is becoming mandatory to simplify the use of advanced
 heterogeneous computing where GPU, DSP, or FPGA are used to perform various
@@ -22,8 +22,8 @@ expose the hardware limitations that are inherent to many platforms. The third
 section gives an overview of the HMM design. The fourth section explains how
 CPU page-table mirroring works and the purpose of HMM in this context. The
 fifth section deals with how device memory is represented inside the kernel.
-Finally, the last section presents a new migration helper that allows lever-
-aging the device DMA engine.
+Finally, the last section presents a new migration helper that allows
+leveraging the device DMA engine.
 
 .. contents:: :local:
 
@@ -39,20 +39,20 @@ address space. I use shared address space to refer to the opposite situation:
 i.e., one in which any application memory region can be used by a device
 transparently.
 
-Split address space happens because device can only access memory allocated
-through device specific API. This implies that all memory objects in a program
+Split address space happens because devices can only access memory allocated
+through a device specific API. This implies that all memory objects in a program
 are not equal from the device point of view which complicates large programs
 that rely on a wide set of libraries.
 
-Concretely this means that code that wants to leverage devices like GPUs needs
-to copy object between generically allocated memory (malloc, mmap private, mmap
+Concretely, this means that code that wants to leverage devices like GPUs needs
+to copy objects between generically allocated memory (malloc, mmap private, mmap
 share) and memory allocated through the device driver API (this still ends up
 with an mmap but of the device file).
 
 For flat data sets (array, grid, image, ...) this isn't too hard to achieve but
-complex data sets (list, tree, ...) are hard to get right. Duplicating a
+for complex data sets (list, tree, ...) it's hard to get right. Duplicating a
 complex data set needs to re-map all the pointer relations between each of its
-elements. This is error prone and program gets harder to debug because of the
+elements. This is error prone and programs get harder to debug because of the
 duplicate data set and addresses.
 
 Split address space also means that libraries cannot transparently use data
@@ -77,12 +77,12 @@ I/O bus, device memory characteristics
 
 I/O buses cripple shared address spaces due to a few limitations. Most I/O
 buses only allow basic memory access from device to main memory; even cache
-coherency is often optional. Access to device memory from CPU is even more
+coherency is often optional. Access to device memory from a CPU is even more
 limited. More often than not, it is not cache coherent.
 
 If we only consider the PCIE bus, then a device can access main memory (often
 through an IOMMU) and be cache coherent with the CPUs. However, it only allows
-a limited set of atomic operations from device on main memory. This is worse
+a limited set of atomic operations from the device on main memory. This is worse
 in the other direction: the CPU can only access a limited range of the device
 memory and cannot perform atomic operations on it. Thus device memory cannot
 be considered the same as regular memory from the kernel point of view.
@@ -93,20 +93,20 @@ The final limitation is latency. Access to main memory from the device has an
 order of magnitude higher latency than when the device accesses its own memory.
 
 Some platforms are developing new I/O buses or additions/modifications to PCIE
-to address some of these limitations (OpenCAPI, CCIX). They mainly allow two-
-way cache coherency between CPU and device and allow all atomic operations the
+to address some of these limitations (OpenCAPI, CCIX). They mainly allow
+two-way cache coherency between CPU and device and allow all atomic operations the
 architecture supports. Sadly, not all platforms are following this trend and
 some major architectures are left without hardware solutions to these problems.
 
 So for shared address space to make sense, not only must we allow devices to
 access any memory but we must also permit any memory to be migrated to device
-memory while device is using it (blocking CPU access while it happens).
+memory while the device is using it (blocking CPU access while it happens).
 
 
 Shared address space and migration
 ==================================
 
-HMM intends to provide two main features. First one is to share the address
+HMM intends to provide two main features. The first one is to share the address
 space by duplicating the CPU page table in the device page table so the same
 address points to the same physical memory for any valid main memory address in
 the process address space.
@@ -121,14 +121,14 @@ why HMM provides helpers to factor out everything that can be while leaving the
 hardware specific details to the device driver.
 
 The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
-allows allocating a struct page for each page of the device memory. Those pages
+allows allocating a struct page for each page of device memory. Those pages
 are special because the CPU cannot map them. However, they allow migrating
 main memory to device memory using existing migration mechanisms and everything
-looks like a page is swapped out to disk from the CPU point of view. Using a
-struct page gives the easiest and cleanest integration with existing mm mech-
-anisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
+looks like a page that is swapped out to disk from the CPU point of view. Using a
+struct page gives the easiest and cleanest integration with existing mm
+mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
 memory for the device memory and second to perform migration. Policy decisions
-of what and when to migrate things is left to the device driver.
+of what and when to migrate is left to the device driver.
 
 Note that any CPU access to a device page triggers a page fault and a migration
 back to main memory. For example, when a page backing a given CPU address A is
@@ -136,8 +136,8 @@ migrated from a main memory page to a device page, then any CPU access to
 address A triggers a page fault and initiates a migration back to main memory.
 
 With these two features, HMM not only allows a device to mirror process address
-space and keeping both CPU and device page table synchronized, but also lever-
-ages device memory by migrating the part of the data set that is actively being
+space and keeps both CPU and device page tables synchronized, but also
+leverages device memory by migrating the part of the data set that is actively being
 used by the device.
 
 
@@ -151,21 +151,28 @@ registration of an hmm_mirror struct::
 
  int hmm_mirror_register(struct hmm_mirror *mirror,
                          struct mm_struct *mm);
- int hmm_mirror_register_locked(struct hmm_mirror *mirror,
-                                struct mm_struct *mm);
 
-
-The locked variant is to be used when the driver is already holding mmap_sem
-of the mm in write mode. The mirror struct has a set of callbacks that are used
+The mirror struct has a set of callbacks that are used
 to propagate CPU page tables::
 
  struct hmm_mirror_ops {
+     /* release() - release hmm_mirror
+      *
+      * @mirror: pointer to struct hmm_mirror
+      *
+      * This is called when the mm_struct is being released.  The callback
+      * must ensure that all access to any pages obtained from this mirror
+      * is halted before the callback returns. All future access should
+      * fault.
+      */
+     void (*release)(struct hmm_mirror *mirror);
+
      /* sync_cpu_device_pagetables() - synchronize page tables
       *
       * @mirror: pointer to struct hmm_mirror
-      * @update_type: type of update that occurred to the CPU page table
-      * @start: virtual start address of the range to update
-      * @end: virtual end address of the range to update
+      * @update: update information (see struct mmu_notifier_range)
+      * Return: -EAGAIN if update.blockable false and callback need to
+      *         block, 0 otherwise.
       *
       * This callback ultimately originates from mmu_notifiers when the CPU
       * page table is updated. The device driver must update its page table
@@ -176,14 +183,12 @@ to propagate CPU page tables::
       * page tables are completely updated (TLBs flushed, etc); this is a
       * synchronous call.
       */
-      void (*update)(struct hmm_mirror *mirror,
-                     enum hmm_update action,
-                     unsigned long start,
-                     unsigned long end);
+     int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
+                                       const struct hmm_update *update);
  };
 
 The device driver must perform the update action to the range (mark range
-read only, or fully unmap, ...). The device must be done with the update before
+read only, or fully unmap, etc.). The device must complete the update before
 the driver callback returns.
 
 When the device driver wants to populate a range of virtual addresses, it can
@@ -194,17 +199,18 @@ use either::
 
 The first one (hmm_range_snapshot()) will only fetch present CPU page table
 entries and will not trigger a page fault on missing or non-present entries.
-The second one does trigger a page fault on missing or read-only entry if the
-write parameter is true. Page faults use the generic mm page fault code path
-just like a CPU page fault.
+The second one does trigger a page fault on missing or read-only entries if
+write access is requested (see below). Page faults use the generic mm page
+fault code path just like a CPU page fault.
 
 Both functions copy CPU page table entries into their pfns array argument. Each
 entry in that array corresponds to an address in the virtual range. HMM
 provides a set of flags to help the driver identify special CPU page table
 entries.
 
-Locking with the update() callback is the most important aspect the driver must
-respect in order to keep things properly synchronized. The usage pattern is::
+Locking within the sync_cpu_device_pagetables() callback is the most important
+aspect the driver must respect in order to keep things properly synchronized.
+The usage pattern is::
 
  int driver_populate_range(...)
  {
@@ -239,11 +245,11 @@ respect in order to keep things properly synchronized. The usage pattern is::
             hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
             goto again;
           }
-          hmm_mirror_unregister(&range);
+          hmm_range_unregister(&range);
           return ret;
       }
       take_lock(driver->update);
-      if (!range.valid) {
+      if (!hmm_range_valid(&range)) {
           release_lock(driver->update);
           up_read(&mm->mmap_sem);
           goto again;
@@ -251,15 +257,15 @@ respect in order to keep things properly synchronized. The usage pattern is::
 
       // Use pfns array content to update device page table
 
-      hmm_mirror_unregister(&range);
+      hmm_range_unregister(&range);
       release_lock(driver->update);
       up_read(&mm->mmap_sem);
       return 0;
  }
 
 The driver->update lock is the same lock that the driver takes inside its
-update() callback. That lock must be held before checking the range.valid
-field to avoid any race with a concurrent CPU page table update.
+sync_cpu_device_pagetables() callback. That lock must be held before calling
+hmm_range_valid() to avoid any race with a concurrent CPU page table update.
 
 HMM implements all this on top of the mmu_notifier API because we wanted a
 simpler API and also to be able to perform optimizations latter on like doing
@@ -279,46 +285,47 @@ concurrently).
 Leverage default_flags and pfn_flags_mask
 =========================================
 
-The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows
-to set fault or snapshot policy for a whole range instead of having to set them
-for each entries in the range.
+The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify
+fault or snapshot policy for the whole range instead of having to set them
+for each entry in the pfns array.
+
+For instance, if the device flags for range.flags are::
 
-For instance if the device flags for device entries are:
-    VALID (1 << 63)
-    WRITE (1 << 62)
+    range.flags[HMM_PFN_VALID] = (1 << 63);
+    range.flags[HMM_PFN_WRITE] = (1 << 62);
 
-Now let say that device driver wants to fault with at least read a range then
-it does set::
+and the device driver wants pages for a range with at least read permission,
+it sets::
 
     range->default_flags = (1 << 63);
     range->pfn_flags_mask = 0;
 
-and calls hmm_range_fault() as described above. This will fill fault all page
+and calls hmm_range_fault() as described above. This will fill fault all pages
 in the range with at least read permission.
 
-Now let say driver wants to do the same except for one page in the range for
-which its want to have write. Now driver set::
+Now let's say the driver wants to do the same except for one page in the range for
+which it wants to have write permission. Now driver set::
 
     range->default_flags = (1 << 63);
     range->pfn_flags_mask = (1 << 62);
     range->pfns[index_of_write] = (1 << 62);
 
-With this HMM will fault in all page with at least read (ie valid) and for the
+With this, HMM will fault in all pages with at least read (i.e., valid) and for the
 address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
-write permission ie if the CPU pte does not have write permission set then HMM
+write permission i.e., if the CPU pte does not have write permission set then HMM
 will call handle_mm_fault().
 
-Note that HMM will populate the pfns array with write permission for any entry
-that have write permission within the CPU pte no matter what are the values set
+Note that HMM will populate the pfns array with write permission for any page
+that is mapped with CPU write permission no matter what values are set
 in default_flags or pfn_flags_mask.
 
 
 Represent and manage device memory from core kernel point of view
 =================================================================
 
-Several different designs were tried to support device memory. First one used
-a device specific data structure to keep information about migrated memory and
-HMM hooked itself in various places of mm code to handle any access to
+Several different designs were tried to support device memory. The first one
+used a device specific data structure to keep information about migrated memory
+and HMM hooked itself in various places of mm code to handle any access to
 addresses that were backed by device memory. It turns out that this ended up
 replicating most of the fields of struct page and also needed many kernel code
 paths to be updated to understand this new kind of memory.
@@ -341,7 +348,7 @@ The hmm_devmem_ops is where most of the important things are::
 
  struct hmm_devmem_ops {
      void (*free)(struct hmm_devmem *devmem, struct page *page);
-     int (*fault)(struct hmm_devmem *devmem,
+     vm_fault_t (*fault)(struct hmm_devmem *devmem,
                   struct vm_area_struct *vma,
                   unsigned long addr,
                   struct page *page,
@@ -417,9 +424,9 @@ willing to pay to keep all the code simpler.
 Memory cgroup (memcg) and rss accounting
 ========================================
 
-For now device memory is accounted as any regular page in rss counters (either
+For now, device memory is accounted as any regular page in rss counters (either
 anonymous if device page is used for anonymous, file if device page is used for
-file backed page or shmem if device page is used for shared memory). This is a
+file backed page, or shmem if device page is used for shared memory). This is a
 deliberate choice to keep existing applications, that might start using device
 memory without knowing about it, running unimpacted.
 
@@ -439,6 +446,6 @@ get more experience in how device memory is used and its impact on memory
 resource control.
 
 
-Note that device memory can never be pinned by device driver nor through GUP
+Note that device memory can never be pinned by a device driver nor through GUP
 and thus such memory is always free upon process exit. Or when last reference
 is dropped in case of shared memory or file backed memory.
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 044a36d7c3f8..740bb00853f5 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -418,9 +418,10 @@ struct hmm_mirror_ops {
 	 *
 	 * @mirror: pointer to struct hmm_mirror
 	 *
-	 * This is called when the mm_struct is being released.
-	 * The callback should make sure no references to the mirror occur
-	 * after the callback returns.
+	 * This is called when the mm_struct is being released.  The callback
+	 * must ensure that all access to any pages obtained from this mirror
+	 * is halted before the callback returns. All future access should
+	 * fault.
 	 */
 	void (*release)(struct hmm_mirror *mirror);
 
-- 
cgit v1.2.3


From 085ea25064a9169eba5f2ed6484c111ab0f3ee79 Mon Sep 17 00:00:00 2001
From: Ralph Campbell <rcampbell@nvidia.com>
Date: Mon, 6 May 2019 16:29:39 -0700
Subject: mm/hmm: clean up some coding style and comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no functional changes, just some coding style clean ups and
minor comment changes.

Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/hmm.h | 71 +++++++++++++++++++++++++++--------------------------
 mm/hmm.c            | 62 ++++++++++++++++++++++++----------------------
 2 files changed, 68 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 740bb00853f5..7007123842ba 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -21,8 +21,8 @@
  *
  * HMM address space mirroring API:
  *
- * Use HMM address space mirroring if you want to mirror range of the CPU page
- * table of a process into a device page table. Here, "mirror" means "keep
+ * Use HMM address space mirroring if you want to mirror a range of the CPU
+ * page tables of a process into a device page table. Here, "mirror" means "keep
  * synchronized". Prerequisites: the device must provide the ability to write-
  * protect its page tables (at PAGE_SIZE granularity), and must be able to
  * recover from the resulting potential page faults.
@@ -105,10 +105,11 @@ struct hmm {
  * HMM_PFN_WRITE: CPU page table has write permission set
  * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE)
  *
- * The driver provide a flags array, if driver valid bit for an entry is bit
- * 3 ie (entry & (1 << 3)) is true if entry is valid then driver must provide
+ * The driver provides a flags array for mapping page protections to device
+ * PTE bits. If the driver valid bit for an entry is bit 3,
+ * i.e., (entry & (1 << 3)), then the driver must provide
  * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3.
- * Same logic apply to all flags. This is same idea as vm_page_prot in vma
+ * Same logic apply to all flags. This is the same idea as vm_page_prot in vma
  * except that this is per device driver rather than per architecture.
  */
 enum hmm_pfn_flag_e {
@@ -129,13 +130,13 @@ enum hmm_pfn_flag_e {
  *      be mirrored by a device, because the entry will never have HMM_PFN_VALID
  *      set and the pfn value is undefined.
  *
- * Driver provide entry value for none entry, error entry and special entry,
- * driver can alias (ie use same value for error and special for instance). It
- * should not alias none and error or special.
+ * Driver provides values for none entry, error entry, and special entry.
+ * Driver can alias (i.e., use same value) error and special, but
+ * it should not alias none with error or special.
  *
  * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be:
  * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous,
- * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table
+ * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry,
  * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one
  */
 enum hmm_pfn_value_e {
@@ -158,6 +159,7 @@ enum hmm_pfn_value_e {
  * @values: pfn value for some special case (none, special, error, ...)
  * @default_flags: default flags for the range (write, read, ... see hmm doc)
  * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
+ * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT)
  * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
  * @valid: pfns array did not change since it has been fill by an HMM function
  */
@@ -180,7 +182,7 @@ struct hmm_range {
 /*
  * hmm_range_page_shift() - return the page shift for the range
  * @range: range being queried
- * Returns: page shift (page size = 1 << page shift) for the range
+ * Return: page shift (page size = 1 << page shift) for the range
  */
 static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
 {
@@ -190,7 +192,7 @@ static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
 /*
  * hmm_range_page_size() - return the page size for the range
  * @range: range being queried
- * Returns: page size for the range in bytes
+ * Return: page size for the range in bytes
  */
 static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
 {
@@ -201,7 +203,7 @@ static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
  * hmm_range_wait_until_valid() - wait for range to be valid
  * @range: range affected by invalidation to wait on
  * @timeout: time out for wait in ms (ie abort wait after that period of time)
- * Returns: true if the range is valid, false otherwise.
+ * Return: true if the range is valid, false otherwise.
  */
 static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
 					      unsigned long timeout)
@@ -222,7 +224,7 @@ static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
 /*
  * hmm_range_valid() - test if a range is valid or not
  * @range: range
- * Returns: true if the range is valid, false otherwise.
+ * Return: true if the range is valid, false otherwise.
  */
 static inline bool hmm_range_valid(struct hmm_range *range)
 {
@@ -233,7 +235,7 @@ static inline bool hmm_range_valid(struct hmm_range *range)
  * hmm_device_entry_to_page() - return struct page pointed to by a device entry
  * @range: range use to decode device entry value
  * @entry: device entry value to get corresponding struct page from
- * Returns: struct page pointer if entry is a valid, NULL otherwise
+ * Return: struct page pointer if entry is a valid, NULL otherwise
  *
  * If the device entry is valid (ie valid flag set) then return the struct page
  * matching the entry value. Otherwise return NULL.
@@ -256,7 +258,7 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang
  * hmm_device_entry_to_pfn() - return pfn value store in a device entry
  * @range: range use to decode device entry value
  * @entry: device entry to extract pfn from
- * Returns: pfn value if device entry is valid, -1UL otherwise
+ * Return: pfn value if device entry is valid, -1UL otherwise
  */
 static inline unsigned long
 hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
@@ -276,7 +278,7 @@ hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
  * hmm_device_entry_from_page() - create a valid device entry for a page
  * @range: range use to encode HMM pfn value
  * @page: page for which to create the device entry
- * Returns: valid device entry for the page
+ * Return: valid device entry for the page
  */
 static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
 						  struct page *page)
@@ -289,7 +291,7 @@ static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
  * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
  * @range: range use to encode HMM pfn value
  * @pfn: pfn value for which to create the device entry
- * Returns: valid device entry for the pfn
+ * Return: valid device entry for the pfn
  */
 static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
 						 unsigned long pfn)
@@ -394,7 +396,7 @@ enum hmm_update_event {
 };
 
 /*
- * struct hmm_update - HMM update informations for callback
+ * struct hmm_update - HMM update information for callback
  *
  * @start: virtual start address of the range to update
  * @end: virtual end address of the range to update
@@ -428,8 +430,8 @@ struct hmm_mirror_ops {
 	/* sync_cpu_device_pagetables() - synchronize page tables
 	 *
 	 * @mirror: pointer to struct hmm_mirror
-	 * @update: update informations (see struct hmm_update)
-	 * Returns: -EAGAIN if update.blockable false and callback need to
+	 * @update: update information (see struct hmm_update)
+	 * Return: -EAGAIN if update.blockable false and callback need to
 	 *          block, 0 otherwise.
 	 *
 	 * This callback ultimately originates from mmu_notifiers when the CPU
@@ -468,13 +470,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
 /*
  * hmm_mirror_mm_is_alive() - test if mm is still alive
  * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
- * Returns: false if the mm is dead, true otherwise
+ * Return: false if the mm is dead, true otherwise
  *
- * This is an optimization it will not accurately always return -EINVAL if the
- * mm is dead ie there can be false negative (process is being kill but HMM is
- * not yet inform of that). It is only intented to be use to optimize out case
- * where driver is about to do something time consuming and it would be better
- * to skip it if the mm is dead.
+ * This is an optimization, it will not always accurately return false if the
+ * mm is dead; i.e., there can be false negatives (process is being killed but
+ * HMM is not yet informed of that). It is only intended to be used to optimize
+ * out cases where the driver is about to do something time consuming and it
+ * would be better to skip it if the mm is dead.
  */
 static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
 {
@@ -489,7 +491,6 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
 	return true;
 }
 
-
 /*
  * Please see Documentation/vm/hmm.rst for how to use the range API.
  */
@@ -562,7 +563,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block)
 	ret = hmm_range_fault(range, block);
 	if (ret <= 0) {
 		if (ret == -EBUSY || !ret) {
-			/* Same as above  drop mmap_sem to match old API. */
+			/* Same as above, drop mmap_sem to match old API. */
 			up_read(&range->vma->vm_mm->mmap_sem);
 			ret = -EBUSY;
 		} else if (ret == -EAGAIN)
@@ -629,7 +630,7 @@ struct hmm_devmem_ops {
 	 * @page: pointer to struct page backing virtual address (unreliable)
 	 * @flags: FAULT_FLAG_* (see include/linux/mm.h)
 	 * @pmdp: page middle directory
-	 * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
+	 * Return: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
 	 *   on error
 	 *
 	 * The callback occurs whenever there is a CPU page fault or GUP on a
@@ -637,14 +638,14 @@ struct hmm_devmem_ops {
 	 * page back to regular memory (CPU accessible).
 	 *
 	 * The device driver is free to migrate more than one page from the
-	 * fault() callback as an optimization. However if device decide to
-	 * migrate more than one page it must always priotirize the faulting
+	 * fault() callback as an optimization. However if the device decides
+	 * to migrate more than one page it must always priotirize the faulting
 	 * address over the others.
 	 *
-	 * The struct page pointer is only given as an hint to allow quick
+	 * The struct page pointer is only given as a hint to allow quick
 	 * lookup of internal device driver data. A concurrent migration
-	 * might have already free that page and the virtual address might
-	 * not longer be back by it. So it should not be modified by the
+	 * might have already freed that page and the virtual address might
+	 * no longer be backed by it. So it should not be modified by the
 	 * callback.
 	 *
 	 * Note that mmap semaphore is held in read mode at least when this
@@ -671,7 +672,7 @@ struct hmm_devmem_ops {
  * @ref: per CPU refcount
  * @page_fault: callback when CPU fault on an unaddressable device page
  *
- * This an helper structure for device drivers that do not wish to implement
+ * This is a helper structure for device drivers that do not wish to implement
  * the gory details related to hotplugging new memoy and allocating struct
  * pages.
  *
diff --git a/mm/hmm.c b/mm/hmm.c
index c62ae414a3a2..4db5dcf110ba 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -153,9 +153,8 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 
 	/* Wake-up everyone waiting on any range. */
 	mutex_lock(&hmm->lock);
-	list_for_each_entry(range, &hmm->ranges, list) {
+	list_for_each_entry(range, &hmm->ranges, list)
 		range->valid = false;
-	}
 	wake_up_all(&hmm->wq);
 	mutex_unlock(&hmm->lock);
 
@@ -166,9 +165,10 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 		list_del_init(&mirror->list);
 		if (mirror->ops->release) {
 			/*
-			 * Drop mirrors_sem so callback can wait on any pending
-			 * work that might itself trigger mmu_notifier callback
-			 * and thus would deadlock with us.
+			 * Drop mirrors_sem so the release callback can wait
+			 * on any pending work that might itself trigger a
+			 * mmu_notifier callback and thus would deadlock with
+			 * us.
 			 */
 			up_write(&hmm->mirrors_sem);
 			mirror->ops->release(mirror);
@@ -223,11 +223,8 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 		int ret;
 
 		ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
-		if (!update.blockable && ret == -EAGAIN) {
-			up_read(&hmm->mirrors_sem);
-			ret = -EAGAIN;
-			goto out;
-		}
+		if (!update.blockable && ret == -EAGAIN)
+			break;
 	}
 	up_read(&hmm->mirrors_sem);
 
@@ -271,6 +268,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
  *
  * @mirror: new mirror struct to register
  * @mm: mm to register against
+ * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments
  *
  * To start mirroring a process address space, the device driver must register
  * an HMM mirror struct.
@@ -298,7 +296,7 @@ EXPORT_SYMBOL(hmm_mirror_register);
 /*
  * hmm_mirror_unregister() - unregister a mirror
  *
- * @mirror: new mirror struct to register
+ * @mirror: mirror struct to unregister
  *
  * Stop mirroring a process address space, and cleanup.
  */
@@ -372,7 +370,7 @@ static int hmm_pfns_bad(unsigned long addr,
  * @fault: should we fault or not ?
  * @write_fault: write fault ?
  * @walk: mm_walk structure
- * Returns: 0 on success, -EBUSY after page fault, or page fault error
+ * Return: 0 on success, -EBUSY after page fault, or page fault error
  *
  * This function will be called whenever pmd_none() or pte_none() returns true,
  * or whenever there is no page directory covering the virtual address range.
@@ -911,6 +909,7 @@ int hmm_range_register(struct hmm_range *range,
 		       unsigned page_shift)
 {
 	unsigned long mask = ((1UL << page_shift) - 1UL);
+	struct hmm *hmm;
 
 	range->valid = false;
 	range->hmm = NULL;
@@ -924,28 +923,29 @@ int hmm_range_register(struct hmm_range *range,
 	range->start = start;
 	range->end = end;
 
-	range->hmm = hmm_get_or_create(mm);
-	if (!range->hmm)
+	hmm = hmm_get_or_create(mm);
+	if (!hmm)
 		return -EFAULT;
 
 	/* Check if hmm_mm_destroy() was call. */
-	if (range->hmm->mm == NULL || range->hmm->dead) {
-		hmm_put(range->hmm);
+	if (hmm->mm == NULL || hmm->dead) {
+		hmm_put(hmm);
 		return -EFAULT;
 	}
 
-	/* Initialize range to track CPU page table update */
-	mutex_lock(&range->hmm->lock);
+	/* Initialize range to track CPU page table updates. */
+	mutex_lock(&hmm->lock);
 
-	list_add_rcu(&range->list, &range->hmm->ranges);
+	range->hmm = hmm;
+	list_add_rcu(&range->list, &hmm->ranges);
 
 	/*
 	 * If there are any concurrent notifiers we have to wait for them for
 	 * the range to be valid (see hmm_range_wait_until_valid()).
 	 */
-	if (!range->hmm->notifiers)
+	if (!hmm->notifiers)
 		range->valid = true;
-	mutex_unlock(&range->hmm->lock);
+	mutex_unlock(&hmm->lock);
 
 	return 0;
 }
@@ -960,17 +960,19 @@ EXPORT_SYMBOL(hmm_range_register);
  */
 void hmm_range_unregister(struct hmm_range *range)
 {
+	struct hmm *hmm = range->hmm;
+
 	/* Sanity check this really should not happen. */
-	if (range->hmm == NULL || range->end <= range->start)
+	if (hmm == NULL || range->end <= range->start)
 		return;
 
-	mutex_lock(&range->hmm->lock);
+	mutex_lock(&hmm->lock);
 	list_del_rcu(&range->list);
-	mutex_unlock(&range->hmm->lock);
+	mutex_unlock(&hmm->lock);
 
 	/* Drop reference taken by hmm_range_register() */
 	range->valid = false;
-	hmm_put(range->hmm);
+	hmm_put(hmm);
 	range->hmm = NULL;
 }
 EXPORT_SYMBOL(hmm_range_unregister);
@@ -978,7 +980,7 @@ EXPORT_SYMBOL(hmm_range_unregister);
 /*
  * hmm_range_snapshot() - snapshot CPU page table for a range
  * @range: range
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
+ * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
  *          permission (for instance asking for write and range is read only),
  *          -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid
  *          vma or it is illegal to access that range), number of valid pages
@@ -1061,7 +1063,7 @@ EXPORT_SYMBOL(hmm_range_snapshot);
  * hmm_range_fault() - try to fault some address in a virtual address range
  * @range: range being faulted
  * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: number of valid pages in range->pfns[] (from range start
+ * Return: number of valid pages in range->pfns[] (from range start
  *          address). This may be zero. If the return value is negative,
  *          then one of the following values may be returned:
  *
@@ -1179,7 +1181,7 @@ EXPORT_SYMBOL(hmm_range_fault);
  * @device: device against to dma map page to
  * @daddrs: dma address of mapped pages
  * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been
+ * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been
  *          drop and you need to try again, some other error value otherwise
  *
  * Note same usage pattern as hmm_range_fault().
@@ -1267,7 +1269,7 @@ EXPORT_SYMBOL(hmm_range_dma_map);
  * @device: device against which dma map was done
  * @daddrs: dma address of mapped pages
  * @dirty: dirty page if it had the write flag set
- * Returns: number of page unmapped on success, -EINVAL otherwise
+ * Return: number of page unmapped on success, -EINVAL otherwise
  *
  * Note that caller MUST abide by mmu notifier or use HMM mirror and abide
  * to the sync_cpu_device_pagetables() callback so that it is safe here to
@@ -1390,7 +1392,7 @@ static void hmm_devmem_free(struct page *page, void *data)
  * @ops: memory event device driver callback (see struct hmm_devmem_ops)
  * @device: device struct to bind the resource too
  * @size: size in bytes of the device memory to add
- * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise
+ * Return: pointer to new hmm_devmem struct ERR_PTR otherwise
  *
  * This function first finds an empty range of physical address big enough to
  * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
-- 
cgit v1.2.3


From edcd69ab9a323b7ac7a86e1c44b6c9c46598391f Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Date: Tue, 15 Jan 2019 12:19:57 +0000
Subject: iommu: Add virtio-iommu driver

The virtio IOMMU is a para-virtualized device, allowing to send IOMMU
requests such as map/unmap over virtio transport without emulating page
tables. This implementation handles ATTACH, DETACH, MAP and UNMAP
requests.

The bulk of the code transforms calls coming from the IOMMU API into
corresponding virtio requests. Mappings are kept in an interval tree
instead of page tables. A little more work is required for modular and x86
support, so for the moment the driver depends on CONFIG_VIRTIO=y and
CONFIG_ARM64.

Tested-by: Bharat Bhushan <bharat.bhushan@nxp.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 MAINTAINERS                       |   7 +
 drivers/iommu/Kconfig             |  11 +
 drivers/iommu/Makefile            |   1 +
 drivers/iommu/virtio-iommu.c      | 916 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/virtio_ids.h   |   1 +
 include/uapi/linux/virtio_iommu.h | 106 +++++
 6 files changed, 1042 insertions(+)
 create mode 100644 drivers/iommu/virtio-iommu.c
 create mode 100644 include/uapi/linux/virtio_iommu.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 429c6c624861..62bd1834d95a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16807,6 +16807,13 @@ S:	Maintained
 F:	drivers/virtio/virtio_input.c
 F:	include/uapi/linux/virtio_input.h
 
+VIRTIO IOMMU DRIVER
+M:	Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
+L:	virtualization@lists.linux-foundation.org
+S:	Maintained
+F:	drivers/iommu/virtio-iommu.c
+F:	include/uapi/linux/virtio_iommu.h
+
 VIRTUAL BOX GUEST DEVICE DRIVER
 M:	Hans de Goede <hdegoede@redhat.com>
 M:	Arnd Bergmann <arnd@arndb.de>
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 83664db5221d..e15cdcd8cb3c 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -473,4 +473,15 @@ config HYPERV_IOMMU
 	  Stub IOMMU driver to handle IRQs as to allow Hyper-V Linux
 	  guests to run with x2APIC mode enabled.
 
+config VIRTIO_IOMMU
+	bool "Virtio IOMMU driver"
+	depends on VIRTIO=y
+	depends on ARM64
+	select IOMMU_API
+	select INTERVAL_TREE
+	help
+	  Para-virtualised IOMMU driver with virtio.
+
+	  Say Y here if you intend to run this kernel as a guest.
+
 endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 8c71a15e986b..f13f36ae1af6 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -33,3 +33,4 @@ obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
 obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
 obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
 obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
+obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
new file mode 100644
index 000000000000..6fa012cd727e
--- /dev/null
+++ b/drivers/iommu/virtio-iommu.c
@@ -0,0 +1,916 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Virtio driver for the paravirtualized IOMMU
+ *
+ * Copyright (C) 2018 Arm Limited
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/amba/bus.h>
+#include <linux/delay.h>
+#include <linux/dma-iommu.h>
+#include <linux/freezer.h>
+#include <linux/interval_tree.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/of_iommu.h>
+#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ids.h>
+#include <linux/wait.h>
+
+#include <uapi/linux/virtio_iommu.h>
+
+#define MSI_IOVA_BASE			0x8000000
+#define MSI_IOVA_LENGTH			0x100000
+
+#define VIOMMU_REQUEST_VQ		0
+#define VIOMMU_NR_VQS			1
+
+struct viommu_dev {
+	struct iommu_device		iommu;
+	struct device			*dev;
+	struct virtio_device		*vdev;
+
+	struct ida			domain_ids;
+
+	struct virtqueue		*vqs[VIOMMU_NR_VQS];
+	spinlock_t			request_lock;
+	struct list_head		requests;
+
+	/* Device configuration */
+	struct iommu_domain_geometry	geometry;
+	u64				pgsize_bitmap;
+	u8				domain_bits;
+};
+
+struct viommu_mapping {
+	phys_addr_t			paddr;
+	struct interval_tree_node	iova;
+	u32				flags;
+};
+
+struct viommu_domain {
+	struct iommu_domain		domain;
+	struct viommu_dev		*viommu;
+	struct mutex			mutex; /* protects viommu pointer */
+	unsigned int			id;
+
+	spinlock_t			mappings_lock;
+	struct rb_root_cached		mappings;
+
+	unsigned long			nr_endpoints;
+};
+
+struct viommu_endpoint {
+	struct viommu_dev		*viommu;
+	struct viommu_domain		*vdomain;
+};
+
+struct viommu_request {
+	struct list_head		list;
+	void				*writeback;
+	unsigned int			write_offset;
+	unsigned int			len;
+	char				buf[];
+};
+
+#define to_viommu_domain(domain)	\
+	container_of(domain, struct viommu_domain, domain)
+
+static int viommu_get_req_errno(void *buf, size_t len)
+{
+	struct virtio_iommu_req_tail *tail = buf + len - sizeof(*tail);
+
+	switch (tail->status) {
+	case VIRTIO_IOMMU_S_OK:
+		return 0;
+	case VIRTIO_IOMMU_S_UNSUPP:
+		return -ENOSYS;
+	case VIRTIO_IOMMU_S_INVAL:
+		return -EINVAL;
+	case VIRTIO_IOMMU_S_RANGE:
+		return -ERANGE;
+	case VIRTIO_IOMMU_S_NOENT:
+		return -ENOENT;
+	case VIRTIO_IOMMU_S_FAULT:
+		return -EFAULT;
+	case VIRTIO_IOMMU_S_IOERR:
+	case VIRTIO_IOMMU_S_DEVERR:
+	default:
+		return -EIO;
+	}
+}
+
+static void viommu_set_req_status(void *buf, size_t len, int status)
+{
+	struct virtio_iommu_req_tail *tail = buf + len - sizeof(*tail);
+
+	tail->status = status;
+}
+
+static off_t viommu_get_write_desc_offset(struct viommu_dev *viommu,
+					  struct virtio_iommu_req_head *req,
+					  size_t len)
+{
+	size_t tail_size = sizeof(struct virtio_iommu_req_tail);
+
+	return len - tail_size;
+}
+
+/*
+ * __viommu_sync_req - Complete all in-flight requests
+ *
+ * Wait for all added requests to complete. When this function returns, all
+ * requests that were in-flight at the time of the call have completed.
+ */
+static int __viommu_sync_req(struct viommu_dev *viommu)
+{
+	int ret = 0;
+	unsigned int len;
+	size_t write_len;
+	struct viommu_request *req;
+	struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ];
+
+	assert_spin_locked(&viommu->request_lock);
+
+	virtqueue_kick(vq);
+
+	while (!list_empty(&viommu->requests)) {
+		len = 0;
+		req = virtqueue_get_buf(vq, &len);
+		if (!req)
+			continue;
+
+		if (!len)
+			viommu_set_req_status(req->buf, req->len,
+					      VIRTIO_IOMMU_S_IOERR);
+
+		write_len = req->len - req->write_offset;
+		if (req->writeback && len == write_len)
+			memcpy(req->writeback, req->buf + req->write_offset,
+			       write_len);
+
+		list_del(&req->list);
+		kfree(req);
+	}
+
+	return ret;
+}
+
+static int viommu_sync_req(struct viommu_dev *viommu)
+{
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&viommu->request_lock, flags);
+	ret = __viommu_sync_req(viommu);
+	if (ret)
+		dev_dbg(viommu->dev, "could not sync requests (%d)\n", ret);
+	spin_unlock_irqrestore(&viommu->request_lock, flags);
+
+	return ret;
+}
+
+/*
+ * __viommu_add_request - Add one request to the queue
+ * @buf: pointer to the request buffer
+ * @len: length of the request buffer
+ * @writeback: copy data back to the buffer when the request completes.
+ *
+ * Add a request to the queue. Only synchronize the queue if it's already full.
+ * Otherwise don't kick the queue nor wait for requests to complete.
+ *
+ * When @writeback is true, data written by the device, including the request
+ * status, is copied into @buf after the request completes. This is unsafe if
+ * the caller allocates @buf on stack and drops the lock between add_req() and
+ * sync_req().
+ *
+ * Return 0 if the request was successfully added to the queue.
+ */
+static int __viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len,
+			    bool writeback)
+{
+	int ret;
+	off_t write_offset;
+	struct viommu_request *req;
+	struct scatterlist top_sg, bottom_sg;
+	struct scatterlist *sg[2] = { &top_sg, &bottom_sg };
+	struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ];
+
+	assert_spin_locked(&viommu->request_lock);
+
+	write_offset = viommu_get_write_desc_offset(viommu, buf, len);
+	if (write_offset <= 0)
+		return -EINVAL;
+
+	req = kzalloc(sizeof(*req) + len, GFP_ATOMIC);
+	if (!req)
+		return -ENOMEM;
+
+	req->len = len;
+	if (writeback) {
+		req->writeback = buf + write_offset;
+		req->write_offset = write_offset;
+	}
+	memcpy(&req->buf, buf, write_offset);
+
+	sg_init_one(&top_sg, req->buf, write_offset);
+	sg_init_one(&bottom_sg, req->buf + write_offset, len - write_offset);
+
+	ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC);
+	if (ret == -ENOSPC) {
+		/* If the queue is full, sync and retry */
+		if (!__viommu_sync_req(viommu))
+			ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC);
+	}
+	if (ret)
+		goto err_free;
+
+	list_add_tail(&req->list, &viommu->requests);
+	return 0;
+
+err_free:
+	kfree(req);
+	return ret;
+}
+
+static int viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len)
+{
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&viommu->request_lock, flags);
+	ret = __viommu_add_req(viommu, buf, len, false);
+	if (ret)
+		dev_dbg(viommu->dev, "could not add request: %d\n", ret);
+	spin_unlock_irqrestore(&viommu->request_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Send a request and wait for it to complete. Return the request status (as an
+ * errno)
+ */
+static int viommu_send_req_sync(struct viommu_dev *viommu, void *buf,
+				size_t len)
+{
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&viommu->request_lock, flags);
+
+	ret = __viommu_add_req(viommu, buf, len, true);
+	if (ret) {
+		dev_dbg(viommu->dev, "could not add request (%d)\n", ret);
+		goto out_unlock;
+	}
+
+	ret = __viommu_sync_req(viommu);
+	if (ret) {
+		dev_dbg(viommu->dev, "could not sync requests (%d)\n", ret);
+		/* Fall-through (get the actual request status) */
+	}
+
+	ret = viommu_get_req_errno(buf, len);
+out_unlock:
+	spin_unlock_irqrestore(&viommu->request_lock, flags);
+	return ret;
+}
+
+/*
+ * viommu_add_mapping - add a mapping to the internal tree
+ *
+ * On success, return the new mapping. Otherwise return NULL.
+ */
+static int viommu_add_mapping(struct viommu_domain *vdomain, unsigned long iova,
+			      phys_addr_t paddr, size_t size, u32 flags)
+{
+	unsigned long irqflags;
+	struct viommu_mapping *mapping;
+
+	mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC);
+	if (!mapping)
+		return -ENOMEM;
+
+	mapping->paddr		= paddr;
+	mapping->iova.start	= iova;
+	mapping->iova.last	= iova + size - 1;
+	mapping->flags		= flags;
+
+	spin_lock_irqsave(&vdomain->mappings_lock, irqflags);
+	interval_tree_insert(&mapping->iova, &vdomain->mappings);
+	spin_unlock_irqrestore(&vdomain->mappings_lock, irqflags);
+
+	return 0;
+}
+
+/*
+ * viommu_del_mappings - remove mappings from the internal tree
+ *
+ * @vdomain: the domain
+ * @iova: start of the range
+ * @size: size of the range. A size of 0 corresponds to the entire address
+ *	space.
+ *
+ * On success, returns the number of unmapped bytes (>= size)
+ */
+static size_t viommu_del_mappings(struct viommu_domain *vdomain,
+				  unsigned long iova, size_t size)
+{
+	size_t unmapped = 0;
+	unsigned long flags;
+	unsigned long last = iova + size - 1;
+	struct viommu_mapping *mapping = NULL;
+	struct interval_tree_node *node, *next;
+
+	spin_lock_irqsave(&vdomain->mappings_lock, flags);
+	next = interval_tree_iter_first(&vdomain->mappings, iova, last);
+	while (next) {
+		node = next;
+		mapping = container_of(node, struct viommu_mapping, iova);
+		next = interval_tree_iter_next(node, iova, last);
+
+		/* Trying to split a mapping? */
+		if (mapping->iova.start < iova)
+			break;
+
+		/*
+		 * Virtio-iommu doesn't allow UNMAP to split a mapping created
+		 * with a single MAP request, so remove the full mapping.
+		 */
+		unmapped += mapping->iova.last - mapping->iova.start + 1;
+
+		interval_tree_remove(node, &vdomain->mappings);
+		kfree(mapping);
+	}
+	spin_unlock_irqrestore(&vdomain->mappings_lock, flags);
+
+	return unmapped;
+}
+
+/*
+ * viommu_replay_mappings - re-send MAP requests
+ *
+ * When reattaching a domain that was previously detached from all endpoints,
+ * mappings were deleted from the device. Re-create the mappings available in
+ * the internal tree.
+ */
+static int viommu_replay_mappings(struct viommu_domain *vdomain)
+{
+	int ret = 0;
+	unsigned long flags;
+	struct viommu_mapping *mapping;
+	struct interval_tree_node *node;
+	struct virtio_iommu_req_map map;
+
+	spin_lock_irqsave(&vdomain->mappings_lock, flags);
+	node = interval_tree_iter_first(&vdomain->mappings, 0, -1UL);
+	while (node) {
+		mapping = container_of(node, struct viommu_mapping, iova);
+		map = (struct virtio_iommu_req_map) {
+			.head.type	= VIRTIO_IOMMU_T_MAP,
+			.domain		= cpu_to_le32(vdomain->id),
+			.virt_start	= cpu_to_le64(mapping->iova.start),
+			.virt_end	= cpu_to_le64(mapping->iova.last),
+			.phys_start	= cpu_to_le64(mapping->paddr),
+			.flags		= cpu_to_le32(mapping->flags),
+		};
+
+		ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map));
+		if (ret)
+			break;
+
+		node = interval_tree_iter_next(node, 0, -1UL);
+	}
+	spin_unlock_irqrestore(&vdomain->mappings_lock, flags);
+
+	return ret;
+}
+
+/* IOMMU API */
+
+static struct iommu_domain *viommu_domain_alloc(unsigned type)
+{
+	struct viommu_domain *vdomain;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
+		return NULL;
+
+	vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL);
+	if (!vdomain)
+		return NULL;
+
+	mutex_init(&vdomain->mutex);
+	spin_lock_init(&vdomain->mappings_lock);
+	vdomain->mappings = RB_ROOT_CACHED;
+
+	if (type == IOMMU_DOMAIN_DMA &&
+	    iommu_get_dma_cookie(&vdomain->domain)) {
+		kfree(vdomain);
+		return NULL;
+	}
+
+	return &vdomain->domain;
+}
+
+static int viommu_domain_finalise(struct viommu_dev *viommu,
+				  struct iommu_domain *domain)
+{
+	int ret;
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+	unsigned int max_domain = viommu->domain_bits > 31 ? ~0 :
+				  (1U << viommu->domain_bits) - 1;
+
+	vdomain->viommu		= viommu;
+
+	domain->pgsize_bitmap	= viommu->pgsize_bitmap;
+	domain->geometry	= viommu->geometry;
+
+	ret = ida_alloc_max(&viommu->domain_ids, max_domain, GFP_KERNEL);
+	if (ret >= 0)
+		vdomain->id = (unsigned int)ret;
+
+	return ret > 0 ? 0 : ret;
+}
+
+static void viommu_domain_free(struct iommu_domain *domain)
+{
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+	iommu_put_dma_cookie(domain);
+
+	/* Free all remaining mappings (size 2^64) */
+	viommu_del_mappings(vdomain, 0, 0);
+
+	if (vdomain->viommu)
+		ida_free(&vdomain->viommu->domain_ids, vdomain->id);
+
+	kfree(vdomain);
+}
+
+static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	int i;
+	int ret = 0;
+	struct virtio_iommu_req_attach req;
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct viommu_endpoint *vdev = fwspec->iommu_priv;
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+	mutex_lock(&vdomain->mutex);
+	if (!vdomain->viommu) {
+		/*
+		 * Properly initialize the domain now that we know which viommu
+		 * owns it.
+		 */
+		ret = viommu_domain_finalise(vdev->viommu, domain);
+	} else if (vdomain->viommu != vdev->viommu) {
+		dev_err(dev, "cannot attach to foreign vIOMMU\n");
+		ret = -EXDEV;
+	}
+	mutex_unlock(&vdomain->mutex);
+
+	if (ret)
+		return ret;
+
+	/*
+	 * In the virtio-iommu device, when attaching the endpoint to a new
+	 * domain, it is detached from the old one and, if as as a result the
+	 * old domain isn't attached to any endpoint, all mappings are removed
+	 * from the old domain and it is freed.
+	 *
+	 * In the driver the old domain still exists, and its mappings will be
+	 * recreated if it gets reattached to an endpoint. Otherwise it will be
+	 * freed explicitly.
+	 *
+	 * vdev->vdomain is protected by group->mutex
+	 */
+	if (vdev->vdomain)
+		vdev->vdomain->nr_endpoints--;
+
+	req = (struct virtio_iommu_req_attach) {
+		.head.type	= VIRTIO_IOMMU_T_ATTACH,
+		.domain		= cpu_to_le32(vdomain->id),
+	};
+
+	for (i = 0; i < fwspec->num_ids; i++) {
+		req.endpoint = cpu_to_le32(fwspec->ids[i]);
+
+		ret = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req));
+		if (ret)
+			return ret;
+	}
+
+	if (!vdomain->nr_endpoints) {
+		/*
+		 * This endpoint is the first to be attached to the domain.
+		 * Replay existing mappings (e.g. SW MSI).
+		 */
+		ret = viommu_replay_mappings(vdomain);
+		if (ret)
+			return ret;
+	}
+
+	vdomain->nr_endpoints++;
+	vdev->vdomain = vdomain;
+
+	return 0;
+}
+
+static int viommu_map(struct iommu_domain *domain, unsigned long iova,
+		      phys_addr_t paddr, size_t size, int prot)
+{
+	int ret;
+	int flags;
+	struct virtio_iommu_req_map map;
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+	flags = (prot & IOMMU_READ ? VIRTIO_IOMMU_MAP_F_READ : 0) |
+		(prot & IOMMU_WRITE ? VIRTIO_IOMMU_MAP_F_WRITE : 0) |
+		(prot & IOMMU_MMIO ? VIRTIO_IOMMU_MAP_F_MMIO : 0);
+
+	ret = viommu_add_mapping(vdomain, iova, paddr, size, flags);
+	if (ret)
+		return ret;
+
+	map = (struct virtio_iommu_req_map) {
+		.head.type	= VIRTIO_IOMMU_T_MAP,
+		.domain		= cpu_to_le32(vdomain->id),
+		.virt_start	= cpu_to_le64(iova),
+		.phys_start	= cpu_to_le64(paddr),
+		.virt_end	= cpu_to_le64(iova + size - 1),
+		.flags		= cpu_to_le32(flags),
+	};
+
+	if (!vdomain->nr_endpoints)
+		return 0;
+
+	ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map));
+	if (ret)
+		viommu_del_mappings(vdomain, iova, size);
+
+	return ret;
+}
+
+static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova,
+			   size_t size)
+{
+	int ret = 0;
+	size_t unmapped;
+	struct virtio_iommu_req_unmap unmap;
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+	unmapped = viommu_del_mappings(vdomain, iova, size);
+	if (unmapped < size)
+		return 0;
+
+	/* Device already removed all mappings after detach. */
+	if (!vdomain->nr_endpoints)
+		return unmapped;
+
+	unmap = (struct virtio_iommu_req_unmap) {
+		.head.type	= VIRTIO_IOMMU_T_UNMAP,
+		.domain		= cpu_to_le32(vdomain->id),
+		.virt_start	= cpu_to_le64(iova),
+		.virt_end	= cpu_to_le64(iova + unmapped - 1),
+	};
+
+	ret = viommu_add_req(vdomain->viommu, &unmap, sizeof(unmap));
+	return ret ? 0 : unmapped;
+}
+
+static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain,
+				       dma_addr_t iova)
+{
+	u64 paddr = 0;
+	unsigned long flags;
+	struct viommu_mapping *mapping;
+	struct interval_tree_node *node;
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+	spin_lock_irqsave(&vdomain->mappings_lock, flags);
+	node = interval_tree_iter_first(&vdomain->mappings, iova, iova);
+	if (node) {
+		mapping = container_of(node, struct viommu_mapping, iova);
+		paddr = mapping->paddr + (iova - mapping->iova.start);
+	}
+	spin_unlock_irqrestore(&vdomain->mappings_lock, flags);
+
+	return paddr;
+}
+
+static void viommu_iotlb_sync(struct iommu_domain *domain)
+{
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+	viommu_sync_req(vdomain->viommu);
+}
+
+static void viommu_get_resv_regions(struct device *dev, struct list_head *head)
+{
+	struct iommu_resv_region *region;
+	int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+	region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, prot,
+					 IOMMU_RESV_SW_MSI);
+	if (!region)
+		return;
+
+	list_add_tail(&region->list, head);
+	iommu_dma_get_resv_regions(dev, head);
+}
+
+static void viommu_put_resv_regions(struct device *dev, struct list_head *head)
+{
+	struct iommu_resv_region *entry, *next;
+
+	list_for_each_entry_safe(entry, next, head, list)
+		kfree(entry);
+}
+
+static struct iommu_ops viommu_ops;
+static struct virtio_driver virtio_iommu_drv;
+
+static int viommu_match_node(struct device *dev, void *data)
+{
+	return dev->parent->fwnode == data;
+}
+
+static struct viommu_dev *viommu_get_by_fwnode(struct fwnode_handle *fwnode)
+{
+	struct device *dev = driver_find_device(&virtio_iommu_drv.driver, NULL,
+						fwnode, viommu_match_node);
+	put_device(dev);
+
+	return dev ? dev_to_virtio(dev)->priv : NULL;
+}
+
+static int viommu_add_device(struct device *dev)
+{
+	int ret;
+	struct iommu_group *group;
+	struct viommu_endpoint *vdev;
+	struct viommu_dev *viommu = NULL;
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+	if (!fwspec || fwspec->ops != &viommu_ops)
+		return -ENODEV;
+
+	viommu = viommu_get_by_fwnode(fwspec->iommu_fwnode);
+	if (!viommu)
+		return -ENODEV;
+
+	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+	if (!vdev)
+		return -ENOMEM;
+
+	vdev->viommu = viommu;
+	fwspec->iommu_priv = vdev;
+
+	ret = iommu_device_link(&viommu->iommu, dev);
+	if (ret)
+		goto err_free_dev;
+
+	/*
+	 * Last step creates a default domain and attaches to it. Everything
+	 * must be ready.
+	 */
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group)) {
+		ret = PTR_ERR(group);
+		goto err_unlink_dev;
+	}
+
+	iommu_group_put(group);
+
+	return PTR_ERR_OR_ZERO(group);
+
+err_unlink_dev:
+	iommu_device_unlink(&viommu->iommu, dev);
+err_free_dev:
+	kfree(vdev);
+
+	return ret;
+}
+
+static void viommu_remove_device(struct device *dev)
+{
+	struct viommu_endpoint *vdev;
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+	if (!fwspec || fwspec->ops != &viommu_ops)
+		return;
+
+	vdev = fwspec->iommu_priv;
+
+	iommu_group_remove_device(dev);
+	iommu_device_unlink(&vdev->viommu->iommu, dev);
+	kfree(vdev);
+}
+
+static struct iommu_group *viommu_device_group(struct device *dev)
+{
+	if (dev_is_pci(dev))
+		return pci_device_group(dev);
+	else
+		return generic_device_group(dev);
+}
+
+static int viommu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+	return iommu_fwspec_add_ids(dev, args->args, 1);
+}
+
+static struct iommu_ops viommu_ops = {
+	.domain_alloc		= viommu_domain_alloc,
+	.domain_free		= viommu_domain_free,
+	.attach_dev		= viommu_attach_dev,
+	.map			= viommu_map,
+	.unmap			= viommu_unmap,
+	.iova_to_phys		= viommu_iova_to_phys,
+	.iotlb_sync		= viommu_iotlb_sync,
+	.add_device		= viommu_add_device,
+	.remove_device		= viommu_remove_device,
+	.device_group		= viommu_device_group,
+	.get_resv_regions	= viommu_get_resv_regions,
+	.put_resv_regions	= viommu_put_resv_regions,
+	.of_xlate		= viommu_of_xlate,
+};
+
+static int viommu_init_vqs(struct viommu_dev *viommu)
+{
+	struct virtio_device *vdev = dev_to_virtio(viommu->dev);
+	const char *name = "request";
+	void *ret;
+
+	ret = virtio_find_single_vq(vdev, NULL, name);
+	if (IS_ERR(ret)) {
+		dev_err(viommu->dev, "cannot find VQ\n");
+		return PTR_ERR(ret);
+	}
+
+	viommu->vqs[VIOMMU_REQUEST_VQ] = ret;
+
+	return 0;
+}
+
+static int viommu_probe(struct virtio_device *vdev)
+{
+	struct device *parent_dev = vdev->dev.parent;
+	struct viommu_dev *viommu = NULL;
+	struct device *dev = &vdev->dev;
+	u64 input_start = 0;
+	u64 input_end = -1UL;
+	int ret;
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1) ||
+	    !virtio_has_feature(vdev, VIRTIO_IOMMU_F_MAP_UNMAP))
+		return -ENODEV;
+
+	viommu = devm_kzalloc(dev, sizeof(*viommu), GFP_KERNEL);
+	if (!viommu)
+		return -ENOMEM;
+
+	spin_lock_init(&viommu->request_lock);
+	ida_init(&viommu->domain_ids);
+	viommu->dev = dev;
+	viommu->vdev = vdev;
+	INIT_LIST_HEAD(&viommu->requests);
+
+	ret = viommu_init_vqs(viommu);
+	if (ret)
+		return ret;
+
+	virtio_cread(vdev, struct virtio_iommu_config, page_size_mask,
+		     &viommu->pgsize_bitmap);
+
+	if (!viommu->pgsize_bitmap) {
+		ret = -EINVAL;
+		goto err_free_vqs;
+	}
+
+	viommu->domain_bits = 32;
+
+	/* Optional features */
+	virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE,
+			     struct virtio_iommu_config, input_range.start,
+			     &input_start);
+
+	virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE,
+			     struct virtio_iommu_config, input_range.end,
+			     &input_end);
+
+	virtio_cread_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_BITS,
+			     struct virtio_iommu_config, domain_bits,
+			     &viommu->domain_bits);
+
+	viommu->geometry = (struct iommu_domain_geometry) {
+		.aperture_start	= input_start,
+		.aperture_end	= input_end,
+		.force_aperture	= true,
+	};
+
+	viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap;
+
+	virtio_device_ready(vdev);
+
+	ret = iommu_device_sysfs_add(&viommu->iommu, dev, NULL, "%s",
+				     virtio_bus_name(vdev));
+	if (ret)
+		goto err_free_vqs;
+
+	iommu_device_set_ops(&viommu->iommu, &viommu_ops);
+	iommu_device_set_fwnode(&viommu->iommu, parent_dev->fwnode);
+
+	iommu_device_register(&viommu->iommu);
+
+#ifdef CONFIG_PCI
+	if (pci_bus_type.iommu_ops != &viommu_ops) {
+		pci_request_acs();
+		ret = bus_set_iommu(&pci_bus_type, &viommu_ops);
+		if (ret)
+			goto err_unregister;
+	}
+#endif
+#ifdef CONFIG_ARM_AMBA
+	if (amba_bustype.iommu_ops != &viommu_ops) {
+		ret = bus_set_iommu(&amba_bustype, &viommu_ops);
+		if (ret)
+			goto err_unregister;
+	}
+#endif
+	if (platform_bus_type.iommu_ops != &viommu_ops) {
+		ret = bus_set_iommu(&platform_bus_type, &viommu_ops);
+		if (ret)
+			goto err_unregister;
+	}
+
+	vdev->priv = viommu;
+
+	dev_info(dev, "input address: %u bits\n",
+		 order_base_2(viommu->geometry.aperture_end));
+	dev_info(dev, "page mask: %#llx\n", viommu->pgsize_bitmap);
+
+	return 0;
+
+err_unregister:
+	iommu_device_sysfs_remove(&viommu->iommu);
+	iommu_device_unregister(&viommu->iommu);
+err_free_vqs:
+	vdev->config->del_vqs(vdev);
+
+	return ret;
+}
+
+static void viommu_remove(struct virtio_device *vdev)
+{
+	struct viommu_dev *viommu = vdev->priv;
+
+	iommu_device_sysfs_remove(&viommu->iommu);
+	iommu_device_unregister(&viommu->iommu);
+
+	/* Stop all virtqueues */
+	vdev->config->reset(vdev);
+	vdev->config->del_vqs(vdev);
+
+	dev_info(&vdev->dev, "device removed\n");
+}
+
+static void viommu_config_changed(struct virtio_device *vdev)
+{
+	dev_warn(&vdev->dev, "config changed\n");
+}
+
+static unsigned int features[] = {
+	VIRTIO_IOMMU_F_MAP_UNMAP,
+	VIRTIO_IOMMU_F_DOMAIN_BITS,
+	VIRTIO_IOMMU_F_INPUT_RANGE,
+};
+
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_IOMMU, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+static struct virtio_driver virtio_iommu_drv = {
+	.driver.name		= KBUILD_MODNAME,
+	.driver.owner		= THIS_MODULE,
+	.id_table		= id_table,
+	.feature_table		= features,
+	.feature_table_size	= ARRAY_SIZE(features),
+	.probe			= viommu_probe,
+	.remove			= viommu_remove,
+	.config_changed		= viommu_config_changed,
+};
+
+module_virtio_driver(virtio_iommu_drv);
+
+MODULE_DESCRIPTION("Virtio IOMMU driver");
+MODULE_AUTHOR("Jean-Philippe Brucker <jean-philippe.brucker@arm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index 6d5c3b2d4f4d..cfe47c5d9a56 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -43,5 +43,6 @@
 #define VIRTIO_ID_INPUT        18 /* virtio input */
 #define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
 #define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
+#define VIRTIO_ID_IOMMU        23 /* virtio IOMMU */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h
new file mode 100644
index 000000000000..5e5fd62689fb
--- /dev/null
+++ b/include/uapi/linux/virtio_iommu.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/*
+ * Virtio-iommu definition v0.9
+ *
+ * Copyright (C) 2018 Arm Ltd.
+ */
+#ifndef _UAPI_LINUX_VIRTIO_IOMMU_H
+#define _UAPI_LINUX_VIRTIO_IOMMU_H
+
+#include <linux/types.h>
+
+/* Feature bits */
+#define VIRTIO_IOMMU_F_INPUT_RANGE		0
+#define VIRTIO_IOMMU_F_DOMAIN_BITS		1
+#define VIRTIO_IOMMU_F_MAP_UNMAP		2
+#define VIRTIO_IOMMU_F_BYPASS			3
+
+struct virtio_iommu_range {
+	__u64					start;
+	__u64					end;
+};
+
+struct virtio_iommu_config {
+	/* Supported page sizes */
+	__u64					page_size_mask;
+	/* Supported IOVA range */
+	struct virtio_iommu_range		input_range;
+	/* Max domain ID size */
+	__u8					domain_bits;
+	__u8					padding[3];
+	/* Probe buffer size */
+	__u32					probe_size;
+};
+
+/* Request types */
+#define VIRTIO_IOMMU_T_ATTACH			0x01
+#define VIRTIO_IOMMU_T_DETACH			0x02
+#define VIRTIO_IOMMU_T_MAP			0x03
+#define VIRTIO_IOMMU_T_UNMAP			0x04
+
+/* Status types */
+#define VIRTIO_IOMMU_S_OK			0x00
+#define VIRTIO_IOMMU_S_IOERR			0x01
+#define VIRTIO_IOMMU_S_UNSUPP			0x02
+#define VIRTIO_IOMMU_S_DEVERR			0x03
+#define VIRTIO_IOMMU_S_INVAL			0x04
+#define VIRTIO_IOMMU_S_RANGE			0x05
+#define VIRTIO_IOMMU_S_NOENT			0x06
+#define VIRTIO_IOMMU_S_FAULT			0x07
+
+struct virtio_iommu_req_head {
+	__u8					type;
+	__u8					reserved[3];
+};
+
+struct virtio_iommu_req_tail {
+	__u8					status;
+	__u8					reserved[3];
+};
+
+struct virtio_iommu_req_attach {
+	struct virtio_iommu_req_head		head;
+	__le32					domain;
+	__le32					endpoint;
+	__u8					reserved[8];
+	struct virtio_iommu_req_tail		tail;
+};
+
+struct virtio_iommu_req_detach {
+	struct virtio_iommu_req_head		head;
+	__le32					domain;
+	__le32					endpoint;
+	__u8					reserved[8];
+	struct virtio_iommu_req_tail		tail;
+};
+
+#define VIRTIO_IOMMU_MAP_F_READ			(1 << 0)
+#define VIRTIO_IOMMU_MAP_F_WRITE		(1 << 1)
+#define VIRTIO_IOMMU_MAP_F_EXEC			(1 << 2)
+#define VIRTIO_IOMMU_MAP_F_MMIO			(1 << 3)
+
+#define VIRTIO_IOMMU_MAP_F_MASK			(VIRTIO_IOMMU_MAP_F_READ |	\
+						 VIRTIO_IOMMU_MAP_F_WRITE |	\
+						 VIRTIO_IOMMU_MAP_F_EXEC |	\
+						 VIRTIO_IOMMU_MAP_F_MMIO)
+
+struct virtio_iommu_req_map {
+	struct virtio_iommu_req_head		head;
+	__le32					domain;
+	__le64					virt_start;
+	__le64					virt_end;
+	__le64					phys_start;
+	__le32					flags;
+	struct virtio_iommu_req_tail		tail;
+};
+
+struct virtio_iommu_req_unmap {
+	struct virtio_iommu_req_head		head;
+	__le32					domain;
+	__le64					virt_start;
+	__le64					virt_end;
+	__u8					reserved[4];
+	struct virtio_iommu_req_tail		tail;
+};
+
+#endif
-- 
cgit v1.2.3


From 2a5a314874450decec244923209ce6ba97e3ed93 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Date: Tue, 15 Jan 2019 12:19:58 +0000
Subject: iommu/virtio: Add probe request

When the device offers the probe feature, send a probe request for each
device managed by the IOMMU. Extract RESV_MEM information. When we
encounter a MSI doorbell region, set it up as a IOMMU_RESV_MSI region.
This will tell other subsystems that there is no need to map the MSI
doorbell in the virtio-iommu, because MSIs bypass it.

Tested-by: Bharat Bhushan <bharat.bhushan@nxp.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/iommu/virtio-iommu.c      | 157 ++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/virtio_iommu.h |  36 +++++++++
 2 files changed, 187 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 6fa012cd727e..5e194493a531 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -46,6 +46,7 @@ struct viommu_dev {
 	struct iommu_domain_geometry	geometry;
 	u64				pgsize_bitmap;
 	u8				domain_bits;
+	u32				probe_size;
 };
 
 struct viommu_mapping {
@@ -67,8 +68,10 @@ struct viommu_domain {
 };
 
 struct viommu_endpoint {
+	struct device			*dev;
 	struct viommu_dev		*viommu;
 	struct viommu_domain		*vdomain;
+	struct list_head		resv_regions;
 };
 
 struct viommu_request {
@@ -119,6 +122,9 @@ static off_t viommu_get_write_desc_offset(struct viommu_dev *viommu,
 {
 	size_t tail_size = sizeof(struct virtio_iommu_req_tail);
 
+	if (req->type == VIRTIO_IOMMU_T_PROBE)
+		return len - viommu->probe_size - tail_size;
+
 	return len - tail_size;
 }
 
@@ -393,6 +399,110 @@ static int viommu_replay_mappings(struct viommu_domain *vdomain)
 	return ret;
 }
 
+static int viommu_add_resv_mem(struct viommu_endpoint *vdev,
+			       struct virtio_iommu_probe_resv_mem *mem,
+			       size_t len)
+{
+	size_t size;
+	u64 start64, end64;
+	phys_addr_t start, end;
+	struct iommu_resv_region *region = NULL;
+	unsigned long prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+	start = start64 = le64_to_cpu(mem->start);
+	end = end64 = le64_to_cpu(mem->end);
+	size = end64 - start64 + 1;
+
+	/* Catch any overflow, including the unlikely end64 - start64 + 1 = 0 */
+	if (start != start64 || end != end64 || size < end64 - start64)
+		return -EOVERFLOW;
+
+	if (len < sizeof(*mem))
+		return -EINVAL;
+
+	switch (mem->subtype) {
+	default:
+		dev_warn(vdev->dev, "unknown resv mem subtype 0x%x\n",
+			 mem->subtype);
+		/* Fall-through */
+	case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
+		region = iommu_alloc_resv_region(start, size, 0,
+						 IOMMU_RESV_RESERVED);
+		break;
+	case VIRTIO_IOMMU_RESV_MEM_T_MSI:
+		region = iommu_alloc_resv_region(start, size, prot,
+						 IOMMU_RESV_MSI);
+		break;
+	}
+	if (!region)
+		return -ENOMEM;
+
+	list_add(&vdev->resv_regions, &region->list);
+	return 0;
+}
+
+static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev)
+{
+	int ret;
+	u16 type, len;
+	size_t cur = 0;
+	size_t probe_len;
+	struct virtio_iommu_req_probe *probe;
+	struct virtio_iommu_probe_property *prop;
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct viommu_endpoint *vdev = fwspec->iommu_priv;
+
+	if (!fwspec->num_ids)
+		return -EINVAL;
+
+	probe_len = sizeof(*probe) + viommu->probe_size +
+		    sizeof(struct virtio_iommu_req_tail);
+	probe = kzalloc(probe_len, GFP_KERNEL);
+	if (!probe)
+		return -ENOMEM;
+
+	probe->head.type = VIRTIO_IOMMU_T_PROBE;
+	/*
+	 * For now, assume that properties of an endpoint that outputs multiple
+	 * IDs are consistent. Only probe the first one.
+	 */
+	probe->endpoint = cpu_to_le32(fwspec->ids[0]);
+
+	ret = viommu_send_req_sync(viommu, probe, probe_len);
+	if (ret)
+		goto out_free;
+
+	prop = (void *)probe->properties;
+	type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK;
+
+	while (type != VIRTIO_IOMMU_PROBE_T_NONE &&
+	       cur < viommu->probe_size) {
+		len = le16_to_cpu(prop->length) + sizeof(*prop);
+
+		switch (type) {
+		case VIRTIO_IOMMU_PROBE_T_RESV_MEM:
+			ret = viommu_add_resv_mem(vdev, (void *)prop, len);
+			break;
+		default:
+			dev_err(dev, "unknown viommu prop 0x%x\n", type);
+		}
+
+		if (ret)
+			dev_err(dev, "failed to parse viommu prop 0x%x\n", type);
+
+		cur += len;
+		if (cur >= viommu->probe_size)
+			break;
+
+		prop = (void *)probe->properties + cur;
+		type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK;
+	}
+
+out_free:
+	kfree(probe);
+	return ret;
+}
+
 /* IOMMU API */
 
 static struct iommu_domain *viommu_domain_alloc(unsigned type)
@@ -614,15 +724,34 @@ static void viommu_iotlb_sync(struct iommu_domain *domain)
 
 static void viommu_get_resv_regions(struct device *dev, struct list_head *head)
 {
-	struct iommu_resv_region *region;
+	struct iommu_resv_region *entry, *new_entry, *msi = NULL;
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct viommu_endpoint *vdev = fwspec->iommu_priv;
 	int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
 
-	region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, prot,
-					 IOMMU_RESV_SW_MSI);
-	if (!region)
-		return;
+	list_for_each_entry(entry, &vdev->resv_regions, list) {
+		if (entry->type == IOMMU_RESV_MSI)
+			msi = entry;
+
+		new_entry = kmemdup(entry, sizeof(*entry), GFP_KERNEL);
+		if (!new_entry)
+			return;
+		list_add_tail(&new_entry->list, head);
+	}
+
+	/*
+	 * If the device didn't register any bypass MSI window, add a
+	 * software-mapped region.
+	 */
+	if (!msi) {
+		msi = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
+					      prot, IOMMU_RESV_SW_MSI);
+		if (!msi)
+			return;
+
+		list_add_tail(&msi->list, head);
+	}
 
-	list_add_tail(&region->list, head);
 	iommu_dma_get_resv_regions(dev, head);
 }
 
@@ -670,9 +799,18 @@ static int viommu_add_device(struct device *dev)
 	if (!vdev)
 		return -ENOMEM;
 
+	vdev->dev = dev;
 	vdev->viommu = viommu;
+	INIT_LIST_HEAD(&vdev->resv_regions);
 	fwspec->iommu_priv = vdev;
 
+	if (viommu->probe_size) {
+		/* Get additional information for this endpoint */
+		ret = viommu_probe_endpoint(viommu, dev);
+		if (ret)
+			goto err_free_dev;
+	}
+
 	ret = iommu_device_link(&viommu->iommu, dev);
 	if (ret)
 		goto err_free_dev;
@@ -694,6 +832,7 @@ static int viommu_add_device(struct device *dev)
 err_unlink_dev:
 	iommu_device_unlink(&viommu->iommu, dev);
 err_free_dev:
+	viommu_put_resv_regions(dev, &vdev->resv_regions);
 	kfree(vdev);
 
 	return ret;
@@ -711,6 +850,7 @@ static void viommu_remove_device(struct device *dev)
 
 	iommu_group_remove_device(dev);
 	iommu_device_unlink(&vdev->viommu->iommu, dev);
+	viommu_put_resv_regions(dev, &vdev->resv_regions);
 	kfree(vdev);
 }
 
@@ -810,6 +950,10 @@ static int viommu_probe(struct virtio_device *vdev)
 			     struct virtio_iommu_config, domain_bits,
 			     &viommu->domain_bits);
 
+	virtio_cread_feature(vdev, VIRTIO_IOMMU_F_PROBE,
+			     struct virtio_iommu_config, probe_size,
+			     &viommu->probe_size);
+
 	viommu->geometry = (struct iommu_domain_geometry) {
 		.aperture_start	= input_start,
 		.aperture_end	= input_end,
@@ -891,6 +1035,7 @@ static unsigned int features[] = {
 	VIRTIO_IOMMU_F_MAP_UNMAP,
 	VIRTIO_IOMMU_F_DOMAIN_BITS,
 	VIRTIO_IOMMU_F_INPUT_RANGE,
+	VIRTIO_IOMMU_F_PROBE,
 };
 
 static struct virtio_device_id id_table[] = {
diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h
index 5e5fd62689fb..ae6145cf5928 100644
--- a/include/uapi/linux/virtio_iommu.h
+++ b/include/uapi/linux/virtio_iommu.h
@@ -14,6 +14,7 @@
 #define VIRTIO_IOMMU_F_DOMAIN_BITS		1
 #define VIRTIO_IOMMU_F_MAP_UNMAP		2
 #define VIRTIO_IOMMU_F_BYPASS			3
+#define VIRTIO_IOMMU_F_PROBE			4
 
 struct virtio_iommu_range {
 	__u64					start;
@@ -37,6 +38,7 @@ struct virtio_iommu_config {
 #define VIRTIO_IOMMU_T_DETACH			0x02
 #define VIRTIO_IOMMU_T_MAP			0x03
 #define VIRTIO_IOMMU_T_UNMAP			0x04
+#define VIRTIO_IOMMU_T_PROBE			0x05
 
 /* Status types */
 #define VIRTIO_IOMMU_S_OK			0x00
@@ -103,4 +105,38 @@ struct virtio_iommu_req_unmap {
 	struct virtio_iommu_req_tail		tail;
 };
 
+#define VIRTIO_IOMMU_PROBE_T_NONE		0
+#define VIRTIO_IOMMU_PROBE_T_RESV_MEM		1
+
+#define VIRTIO_IOMMU_PROBE_T_MASK		0xfff
+
+struct virtio_iommu_probe_property {
+	__le16					type;
+	__le16					length;
+};
+
+#define VIRTIO_IOMMU_RESV_MEM_T_RESERVED	0
+#define VIRTIO_IOMMU_RESV_MEM_T_MSI		1
+
+struct virtio_iommu_probe_resv_mem {
+	struct virtio_iommu_probe_property	head;
+	__u8					subtype;
+	__u8					reserved[3];
+	__le64					start;
+	__le64					end;
+};
+
+struct virtio_iommu_req_probe {
+	struct virtio_iommu_req_head		head;
+	__le32					endpoint;
+	__u8					reserved[64];
+
+	__u8					properties[];
+
+	/*
+	 * Tail follows the variable-length properties array. No padding,
+	 * property lengths are all aligned on 8 bytes.
+	 */
+};
+
 #endif
-- 
cgit v1.2.3


From 169a126c6e88a99578a309a9021f314b5d532c5f Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Date: Tue, 15 Jan 2019 12:19:59 +0000
Subject: iommu/virtio: Add event queue

The event queue offers a way for the device to report access faults from
endpoints. It is implemented on virtqueue #1. Whenever the host needs to
signal a fault, it fills one of the buffers offered by the guest and
interrupts it.

Tested-by: Bharat Bhushan <bharat.bhushan@nxp.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/iommu/virtio-iommu.c      | 115 +++++++++++++++++++++++++++++++++++---
 include/uapi/linux/virtio_iommu.h |  19 +++++++
 2 files changed, 125 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 5e194493a531..4620dd221ffd 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -29,7 +29,8 @@
 #define MSI_IOVA_LENGTH			0x100000
 
 #define VIOMMU_REQUEST_VQ		0
-#define VIOMMU_NR_VQS			1
+#define VIOMMU_EVENT_VQ			1
+#define VIOMMU_NR_VQS			2
 
 struct viommu_dev {
 	struct iommu_device		iommu;
@@ -41,6 +42,7 @@ struct viommu_dev {
 	struct virtqueue		*vqs[VIOMMU_NR_VQS];
 	spinlock_t			request_lock;
 	struct list_head		requests;
+	void				*evts;
 
 	/* Device configuration */
 	struct iommu_domain_geometry	geometry;
@@ -82,6 +84,15 @@ struct viommu_request {
 	char				buf[];
 };
 
+#define VIOMMU_FAULT_RESV_MASK		0xffffff00
+
+struct viommu_event {
+	union {
+		u32			head;
+		struct virtio_iommu_fault fault;
+	};
+};
+
 #define to_viommu_domain(domain)	\
 	container_of(domain, struct viommu_domain, domain)
 
@@ -503,6 +514,68 @@ out_free:
 	return ret;
 }
 
+static int viommu_fault_handler(struct viommu_dev *viommu,
+				struct virtio_iommu_fault *fault)
+{
+	char *reason_str;
+
+	u8 reason	= fault->reason;
+	u32 flags	= le32_to_cpu(fault->flags);
+	u32 endpoint	= le32_to_cpu(fault->endpoint);
+	u64 address	= le64_to_cpu(fault->address);
+
+	switch (reason) {
+	case VIRTIO_IOMMU_FAULT_R_DOMAIN:
+		reason_str = "domain";
+		break;
+	case VIRTIO_IOMMU_FAULT_R_MAPPING:
+		reason_str = "page";
+		break;
+	case VIRTIO_IOMMU_FAULT_R_UNKNOWN:
+	default:
+		reason_str = "unknown";
+		break;
+	}
+
+	/* TODO: find EP by ID and report_iommu_fault */
+	if (flags & VIRTIO_IOMMU_FAULT_F_ADDRESS)
+		dev_err_ratelimited(viommu->dev, "%s fault from EP %u at %#llx [%s%s%s]\n",
+				    reason_str, endpoint, address,
+				    flags & VIRTIO_IOMMU_FAULT_F_READ ? "R" : "",
+				    flags & VIRTIO_IOMMU_FAULT_F_WRITE ? "W" : "",
+				    flags & VIRTIO_IOMMU_FAULT_F_EXEC ? "X" : "");
+	else
+		dev_err_ratelimited(viommu->dev, "%s fault from EP %u\n",
+				    reason_str, endpoint);
+	return 0;
+}
+
+static void viommu_event_handler(struct virtqueue *vq)
+{
+	int ret;
+	unsigned int len;
+	struct scatterlist sg[1];
+	struct viommu_event *evt;
+	struct viommu_dev *viommu = vq->vdev->priv;
+
+	while ((evt = virtqueue_get_buf(vq, &len)) != NULL) {
+		if (len > sizeof(*evt)) {
+			dev_err(viommu->dev,
+				"invalid event buffer (len %u != %zu)\n",
+				len, sizeof(*evt));
+		} else if (!(evt->head & VIOMMU_FAULT_RESV_MASK)) {
+			viommu_fault_handler(viommu, &evt->fault);
+		}
+
+		sg_init_one(sg, evt, sizeof(*evt));
+		ret = virtqueue_add_inbuf(vq, sg, 1, evt, GFP_ATOMIC);
+		if (ret)
+			dev_err(viommu->dev, "could not add event buffer\n");
+	}
+
+	virtqueue_kick(vq);
+}
+
 /* IOMMU API */
 
 static struct iommu_domain *viommu_domain_alloc(unsigned type)
@@ -886,16 +959,35 @@ static struct iommu_ops viommu_ops = {
 static int viommu_init_vqs(struct viommu_dev *viommu)
 {
 	struct virtio_device *vdev = dev_to_virtio(viommu->dev);
-	const char *name = "request";
-	void *ret;
+	const char *names[] = { "request", "event" };
+	vq_callback_t *callbacks[] = {
+		NULL, /* No async requests */
+		viommu_event_handler,
+	};
 
-	ret = virtio_find_single_vq(vdev, NULL, name);
-	if (IS_ERR(ret)) {
-		dev_err(viommu->dev, "cannot find VQ\n");
-		return PTR_ERR(ret);
-	}
+	return virtio_find_vqs(vdev, VIOMMU_NR_VQS, viommu->vqs, callbacks,
+			       names, NULL);
+}
 
-	viommu->vqs[VIOMMU_REQUEST_VQ] = ret;
+static int viommu_fill_evtq(struct viommu_dev *viommu)
+{
+	int i, ret;
+	struct scatterlist sg[1];
+	struct viommu_event *evts;
+	struct virtqueue *vq = viommu->vqs[VIOMMU_EVENT_VQ];
+	size_t nr_evts = vq->num_free;
+
+	viommu->evts = evts = devm_kmalloc_array(viommu->dev, nr_evts,
+						 sizeof(*evts), GFP_KERNEL);
+	if (!evts)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_evts; i++) {
+		sg_init_one(sg, &evts[i], sizeof(*evts));
+		ret = virtqueue_add_inbuf(vq, sg, 1, &evts[i], GFP_KERNEL);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
@@ -964,6 +1056,11 @@ static int viommu_probe(struct virtio_device *vdev)
 
 	virtio_device_ready(vdev);
 
+	/* Populate the event queue with buffers */
+	ret = viommu_fill_evtq(viommu);
+	if (ret)
+		goto err_free_vqs;
+
 	ret = iommu_device_sysfs_add(&viommu->iommu, dev, NULL, "%s",
 				     virtio_bus_name(vdev));
 	if (ret)
diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h
index ae6145cf5928..ba1b460c9944 100644
--- a/include/uapi/linux/virtio_iommu.h
+++ b/include/uapi/linux/virtio_iommu.h
@@ -139,4 +139,23 @@ struct virtio_iommu_req_probe {
 	 */
 };
 
+/* Fault types */
+#define VIRTIO_IOMMU_FAULT_R_UNKNOWN		0
+#define VIRTIO_IOMMU_FAULT_R_DOMAIN		1
+#define VIRTIO_IOMMU_FAULT_R_MAPPING		2
+
+#define VIRTIO_IOMMU_FAULT_F_READ		(1 << 0)
+#define VIRTIO_IOMMU_FAULT_F_WRITE		(1 << 1)
+#define VIRTIO_IOMMU_FAULT_F_EXEC		(1 << 2)
+#define VIRTIO_IOMMU_FAULT_F_ADDRESS		(1 << 8)
+
+struct virtio_iommu_fault {
+	__u8					reason;
+	__u8					reserved[3];
+	__le32					flags;
+	__le32					endpoint;
+	__u8					reserved2[4];
+	__le64					address;
+};
+
 #endif
-- 
cgit v1.2.3


From 3d8b6e9c774ff57ad2860f79abd9cdb8f29ecaf8 Mon Sep 17 00:00:00 2001
From: Fabien Parent <fparent@baylibre.com>
Date: Thu, 2 May 2019 14:18:42 +0200
Subject: dt-bindings: mediatek: audsys: add support for MT8516

Add AUDSYS device tree bindings documentation for MediaTek MT8516 SoC.

Signed-off-by: Fabien Parent <fparent@baylibre.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 .../bindings/arm/mediatek/mediatek,audsys.txt           |  1 +
 include/dt-bindings/clock/mt8516-clk.h                  | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt
index f3cef1a6d95c..07c9d813465c 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt
@@ -10,6 +10,7 @@ Required Properties:
 	- "mediatek,mt7622-audsys", "syscon"
 	- "mediatek,mt7623-audsys", "mediatek,mt2701-audsys", "syscon"
 	- "mediatek,mt8183-audiosys", "syscon"
+	- "mediatek,mt8516-audsys", "syscon"
 - #clock-cells: Must be 1
 
 The AUDSYS controller uses the common clk binding from
diff --git a/include/dt-bindings/clock/mt8516-clk.h b/include/dt-bindings/clock/mt8516-clk.h
index 9cfca53cd78d..816447b98edd 100644
--- a/include/dt-bindings/clock/mt8516-clk.h
+++ b/include/dt-bindings/clock/mt8516-clk.h
@@ -208,4 +208,21 @@
 #define CLK_TOP_MSDC2_INFRA		176
 #define CLK_TOP_NR_CLK			177
 
+/* AUDSYS */
+
+#define CLK_AUD_AFE			0
+#define CLK_AUD_I2S			1
+#define CLK_AUD_22M			2
+#define CLK_AUD_24M			3
+#define CLK_AUD_INTDIR			4
+#define CLK_AUD_APLL2_TUNER		5
+#define CLK_AUD_APLL_TUNER		6
+#define CLK_AUD_HDMI			7
+#define CLK_AUD_SPDF			8
+#define CLK_AUD_ADC			9
+#define CLK_AUD_DAC			10
+#define CLK_AUD_DAC_PREDIS		11
+#define CLK_AUD_TML			12
+#define CLK_AUD_NR_CLK			13
+
 #endif /* _DT_BINDINGS_CLK_MT8516_H */
-- 
cgit v1.2.3


From 072a551fd5cff2329ad151c7c25b7958afc0f149 Mon Sep 17 00:00:00 2001
From: Jeffrey Hugo <jeffrey.l.hugo@gmail.com>
Date: Tue, 28 May 2019 09:47:40 -0700
Subject: dt-bindings: clock: Document gpucc for msm8998

The GPU for msm8998 has its own clock controller.  Document it.

Signed-off-by: Jeffrey Hugo <jeffrey.l.hugo@gmail.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 .../devicetree/bindings/clock/qcom,gpucc.txt       |  4 ++-
 include/dt-bindings/clock/qcom,gpucc-msm8998.h     | 29 ++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 include/dt-bindings/clock/qcom,gpucc-msm8998.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,gpucc.txt b/Documentation/devicetree/bindings/clock/qcom,gpucc.txt
index 4e5215ef1acd..269afe8a757e 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gpucc.txt
+++ b/Documentation/devicetree/bindings/clock/qcom,gpucc.txt
@@ -2,13 +2,15 @@ Qualcomm Graphics Clock & Reset Controller Binding
 --------------------------------------------------
 
 Required properties :
-- compatible : shall contain "qcom,sdm845-gpucc"
+- compatible : shall contain "qcom,sdm845-gpucc" or "qcom,msm8998-gpucc"
 - reg : shall contain base register location and length
 - #clock-cells : from common clock binding, shall contain 1
 - #reset-cells : from common reset binding, shall contain 1
 - #power-domain-cells : from generic power domain binding, shall contain 1
 - clocks : shall contain the XO clock
+	   shall contain the gpll0 out main clock (msm8998)
 - clock-names : shall be "xo"
+		shall be "gpll0" (msm8998)
 
 Example:
 	gpucc: clock-controller@5090000 {
diff --git a/include/dt-bindings/clock/qcom,gpucc-msm8998.h b/include/dt-bindings/clock/qcom,gpucc-msm8998.h
new file mode 100644
index 000000000000..2623570ee974
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,gpucc-msm8998.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2019, Jeffrey Hugo
+ */
+
+#ifndef _DT_BINDINGS_CLK_MSM_GPUCC_8998_H
+#define _DT_BINDINGS_CLK_MSM_GPUCC_8998_H
+
+#define GPUPLL0						0
+#define GPUPLL0_OUT_EVEN				1
+#define RBCPR_CLK_SRC					2
+#define GFX3D_CLK_SRC					3
+#define RBBMTIMER_CLK_SRC				4
+#define GFX3D_ISENSE_CLK_SRC				5
+#define RBCPR_CLK					6
+#define GFX3D_CLK					7
+#define RBBMTIMER_CLK					8
+#define GFX3D_ISENSE_CLK				9
+#define GPUCC_CXO_CLK					10
+
+#define GPU_CX_BCR					0
+#define RBCPR_BCR					1
+#define GPU_GX_BCR					2
+#define GPU_ISENSE_BCR					3
+
+#define GPU_CX_GDSC					1
+#define GPU_GX_GDSC					2
+
+#endif
-- 
cgit v1.2.3


From 6d7c3cde93c1d9ac0b37f78ec3f2ff052159a242 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 22 May 2019 16:52:52 -0300
Subject: mm/hmm: fix use after free with struct hmm in the mmu notifiers

mmu_notifier_unregister_no_release() is not a fence and the mmu_notifier
system will continue to reference hmm->mn until the srcu grace period
expires.

Resulting in use after free races like this:

         CPU0                                     CPU1
                                               __mmu_notifier_invalidate_range_start()
                                                 srcu_read_lock
                                                 hlist_for_each ()
                                                   // mn == hmm->mn
hmm_mirror_unregister()
  hmm_put()
    hmm_free()
      mmu_notifier_unregister_no_release()
         hlist_del_init_rcu(hmm-mn->list)
			                           mn->ops->invalidate_range_start(mn, range);
					             mm_get_hmm()
      mm->hmm = NULL;
      kfree(hmm)
                                                     mutex_lock(&hmm->lock);

Use SRCU to kfree the hmm memory so that the notifiers can rely on hmm
existing. Get the now-safe hmm struct through container_of and directly
check kref_get_unless_zero to lock it against free.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 include/linux/hmm.h |  1 +
 mm/hmm.c            | 23 +++++++++++++++++------
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 7007123842ba..cb01cf1fa3c0 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -93,6 +93,7 @@ struct hmm {
 	struct mmu_notifier	mmu_notifier;
 	struct rw_semaphore	mirrors_sem;
 	wait_queue_head_t	wq;
+	struct rcu_head		rcu;
 	long			notifiers;
 	bool			dead;
 };
diff --git a/mm/hmm.c b/mm/hmm.c
index 826816ab2377..f6956d78e3cb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -104,6 +104,11 @@ error:
 	return NULL;
 }
 
+static void hmm_free_rcu(struct rcu_head *rcu)
+{
+	kfree(container_of(rcu, struct hmm, rcu));
+}
+
 static void hmm_free(struct kref *kref)
 {
 	struct hmm *hmm = container_of(kref, struct hmm, kref);
@@ -116,7 +121,7 @@ static void hmm_free(struct kref *kref)
 		mm->hmm = NULL;
 	spin_unlock(&mm->page_table_lock);
 
-	kfree(hmm);
+	mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
 }
 
 static inline void hmm_put(struct hmm *hmm)
@@ -144,10 +149,14 @@ void hmm_mm_destroy(struct mm_struct *mm)
 
 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
-	struct hmm *hmm = mm_get_hmm(mm);
+	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 	struct hmm_mirror *mirror;
 	struct hmm_range *range;
 
+	/* Bail out if hmm is in the process of being freed */
+	if (!kref_get_unless_zero(&hmm->kref))
+		return;
+
 	/* Report this HMM as dying. */
 	hmm->dead = true;
 
@@ -185,13 +194,14 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 			const struct mmu_notifier_range *nrange)
 {
-	struct hmm *hmm = mm_get_hmm(nrange->mm);
+	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 	struct hmm_mirror *mirror;
 	struct hmm_update update;
 	struct hmm_range *range;
 	int ret = 0;
 
-	VM_BUG_ON(!hmm);
+	if (!kref_get_unless_zero(&hmm->kref))
+		return 0;
 
 	update.start = nrange->start;
 	update.end = nrange->end;
@@ -236,9 +246,10 @@ out:
 static void hmm_invalidate_range_end(struct mmu_notifier *mn,
 			const struct mmu_notifier_range *nrange)
 {
-	struct hmm *hmm = mm_get_hmm(nrange->mm);
+	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 
-	VM_BUG_ON(!hmm);
+	if (!kref_get_unless_zero(&hmm->kref))
+		return;
 
 	mutex_lock(&hmm->lock);
 	hmm->notifiers--;
-- 
cgit v1.2.3


From e5bbbff5b7d7e76ccfe922a014ba628c558eff2f Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Wed, 8 May 2019 15:39:22 -0700
Subject: clk: gcc-qcs404: Add PCIe resets

Enabling PCIe requires several of the PCIe related resets from GCC, so
add them all.

Reviewed-by: Niklas Cassel <niklas.cassel@linaro.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Reviewed-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/qcom/gcc-qcs404.c               | 7 +++++++
 include/dt-bindings/clock/qcom,gcc-qcs404.h | 7 +++++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/drivers/clk/qcom/gcc-qcs404.c b/drivers/clk/qcom/gcc-qcs404.c
index a54807eb3b28..29cf464dd2c8 100644
--- a/drivers/clk/qcom/gcc-qcs404.c
+++ b/drivers/clk/qcom/gcc-qcs404.c
@@ -2766,6 +2766,13 @@ static const struct qcom_reset_map gcc_qcs404_resets[] = {
 	[GCC_PCIE_0_PHY_BCR] = { 0x3e004 },
 	[GCC_PCIE_0_LINK_DOWN_BCR] = { 0x3e038 },
 	[GCC_PCIEPHY_0_PHY_BCR] = { 0x3e03c },
+	[GCC_PCIE_0_AXI_MASTER_STICKY_ARES] = { 0x3e040, 6},
+	[GCC_PCIE_0_AHB_ARES] = { 0x3e040, 5 },
+	[GCC_PCIE_0_AXI_SLAVE_ARES] = { 0x3e040, 4 },
+	[GCC_PCIE_0_AXI_MASTER_ARES] = { 0x3e040, 3 },
+	[GCC_PCIE_0_CORE_STICKY_ARES] = { 0x3e040, 2 },
+	[GCC_PCIE_0_SLEEP_ARES] = { 0x3e040, 1 },
+	[GCC_PCIE_0_PIPE_ARES] = { 0x3e040, 0 },
 	[GCC_EMAC_BCR] = { 0x4e000 },
 };
 
diff --git a/include/dt-bindings/clock/qcom,gcc-qcs404.h b/include/dt-bindings/clock/qcom,gcc-qcs404.h
index 454b3f43f538..2cd62c98561f 100644
--- a/include/dt-bindings/clock/qcom,gcc-qcs404.h
+++ b/include/dt-bindings/clock/qcom,gcc-qcs404.h
@@ -166,5 +166,12 @@
 #define GCC_PCIEPHY_0_PHY_BCR				12
 #define GCC_EMAC_BCR					13
 #define GCC_CDSP_RESTART				14
+#define GCC_PCIE_0_AXI_MASTER_STICKY_ARES		15
+#define GCC_PCIE_0_AHB_ARES				16
+#define GCC_PCIE_0_AXI_SLAVE_ARES			17
+#define GCC_PCIE_0_AXI_MASTER_ARES			18
+#define GCC_PCIE_0_CORE_STICKY_ARES			19
+#define GCC_PCIE_0_SLEEP_ARES				20
+#define GCC_PCIE_0_PIPE_ARES				21
 
 #endif
-- 
cgit v1.2.3


From b0a6b94027c85857f768a586cbcf1e96ee1d04ae Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Mon, 4 Mar 2019 23:05:34 +0200
Subject: drm: bridge: Add dual_link field to the drm_bridge_timings structure

Extend the drm_bridge_timings structure with a new dual_link field to
indicate that the bridge's input bus carries data on two separate
physical links. The first use case is LVDS dual-link mode where even-
and odd-numbered pixels are transferred on separate LVDS links.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Kieran Bingham <kieran.bingham+renesas@ideasonboard.com>
Tested-by: Jacopo Mondi <jacopo+renesas@jmondi.org>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
---
 include/drm/drm_bridge.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h
index d4428913a4e1..aea1fcfd92a7 100644
--- a/include/drm/drm_bridge.h
+++ b/include/drm/drm_bridge.h
@@ -265,6 +265,14 @@ struct drm_bridge_timings {
 	 * input signal after the clock edge.
 	 */
 	u32 hold_time_ps;
+	/**
+	 * @dual_link:
+	 *
+	 * True if the bus operates in dual-link mode. The exact meaning is
+	 * dependent on the bus type. For LVDS buses, this indicates that even-
+	 * and odd-numbered pixels are received on separate links.
+	 */
+	bool dual_link;
 };
 
 /**
-- 
cgit v1.2.3


From df73789514554761ebdd87f2426938696a2442a2 Mon Sep 17 00:00:00 2001
From: Noralf Trønnes <noralf@tronnes.org>
Date: Fri, 31 May 2019 16:01:10 +0200
Subject: drm/atomic: Move __drm_atomic_helper_disable_plane/set_config()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prepare for moving drm_fb_helper modesetting code to drm_client.
drm_client will be linked to drm.ko, so move
__drm_atomic_helper_disable_plane() and __drm_atomic_helper_set_config()
out of drm_kms_helper.ko.

While at it, fix two checkpatch complaints:
- WARNING: Block comments use a trailing */ on a separate line
- CHECK: Alignment should match open parenthesis

v7: Declare drm_mode_set and drm_plane_state

Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190531140117.37751-2-noralf@tronnes.org
---
 drivers/gpu/drm/drm_atomic.c        | 168 ++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/drm_atomic_helper.c | 164 -----------------------------------
 drivers/gpu/drm/drm_crtc_internal.h |   7 ++
 include/drm/drm_atomic_helper.h     |   4 -
 4 files changed, 175 insertions(+), 168 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c
index b640e77184e5..169f06d7c5ce 100644
--- a/drivers/gpu/drm/drm_atomic.c
+++ b/drivers/gpu/drm/drm_atomic.c
@@ -1179,6 +1179,174 @@ int drm_atomic_nonblocking_commit(struct drm_atomic_state *state)
 }
 EXPORT_SYMBOL(drm_atomic_nonblocking_commit);
 
+/* just used from drm-client and atomic-helper: */
+int __drm_atomic_helper_disable_plane(struct drm_plane *plane,
+				      struct drm_plane_state *plane_state)
+{
+	int ret;
+
+	ret = drm_atomic_set_crtc_for_plane(plane_state, NULL);
+	if (ret != 0)
+		return ret;
+
+	drm_atomic_set_fb_for_plane(plane_state, NULL);
+	plane_state->crtc_x = 0;
+	plane_state->crtc_y = 0;
+	plane_state->crtc_w = 0;
+	plane_state->crtc_h = 0;
+	plane_state->src_x = 0;
+	plane_state->src_y = 0;
+	plane_state->src_w = 0;
+	plane_state->src_h = 0;
+
+	return 0;
+}
+EXPORT_SYMBOL(__drm_atomic_helper_disable_plane);
+
+static int update_output_state(struct drm_atomic_state *state,
+			       struct drm_mode_set *set)
+{
+	struct drm_device *dev = set->crtc->dev;
+	struct drm_crtc *crtc;
+	struct drm_crtc_state *new_crtc_state;
+	struct drm_connector *connector;
+	struct drm_connector_state *new_conn_state;
+	int ret, i;
+
+	ret = drm_modeset_lock(&dev->mode_config.connection_mutex,
+			       state->acquire_ctx);
+	if (ret)
+		return ret;
+
+	/* First disable all connectors on the target crtc. */
+	ret = drm_atomic_add_affected_connectors(state, set->crtc);
+	if (ret)
+		return ret;
+
+	for_each_new_connector_in_state(state, connector, new_conn_state, i) {
+		if (new_conn_state->crtc == set->crtc) {
+			ret = drm_atomic_set_crtc_for_connector(new_conn_state,
+								NULL);
+			if (ret)
+				return ret;
+
+			/* Make sure legacy setCrtc always re-trains */
+			new_conn_state->link_status = DRM_LINK_STATUS_GOOD;
+		}
+	}
+
+	/* Then set all connectors from set->connectors on the target crtc */
+	for (i = 0; i < set->num_connectors; i++) {
+		new_conn_state = drm_atomic_get_connector_state(state,
+								set->connectors[i]);
+		if (IS_ERR(new_conn_state))
+			return PTR_ERR(new_conn_state);
+
+		ret = drm_atomic_set_crtc_for_connector(new_conn_state,
+							set->crtc);
+		if (ret)
+			return ret;
+	}
+
+	for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) {
+		/*
+		 * Don't update ->enable for the CRTC in the set_config request,
+		 * since a mismatch would indicate a bug in the upper layers.
+		 * The actual modeset code later on will catch any
+		 * inconsistencies here.
+		 */
+		if (crtc == set->crtc)
+			continue;
+
+		if (!new_crtc_state->connector_mask) {
+			ret = drm_atomic_set_mode_prop_for_crtc(new_crtc_state,
+								NULL);
+			if (ret < 0)
+				return ret;
+
+			new_crtc_state->active = false;
+		}
+	}
+
+	return 0;
+}
+
+/* just used from drm-client and atomic-helper: */
+int __drm_atomic_helper_set_config(struct drm_mode_set *set,
+				   struct drm_atomic_state *state)
+{
+	struct drm_crtc_state *crtc_state;
+	struct drm_plane_state *primary_state;
+	struct drm_crtc *crtc = set->crtc;
+	int hdisplay, vdisplay;
+	int ret;
+
+	crtc_state = drm_atomic_get_crtc_state(state, crtc);
+	if (IS_ERR(crtc_state))
+		return PTR_ERR(crtc_state);
+
+	primary_state = drm_atomic_get_plane_state(state, crtc->primary);
+	if (IS_ERR(primary_state))
+		return PTR_ERR(primary_state);
+
+	if (!set->mode) {
+		WARN_ON(set->fb);
+		WARN_ON(set->num_connectors);
+
+		ret = drm_atomic_set_mode_for_crtc(crtc_state, NULL);
+		if (ret != 0)
+			return ret;
+
+		crtc_state->active = false;
+
+		ret = drm_atomic_set_crtc_for_plane(primary_state, NULL);
+		if (ret != 0)
+			return ret;
+
+		drm_atomic_set_fb_for_plane(primary_state, NULL);
+
+		goto commit;
+	}
+
+	WARN_ON(!set->fb);
+	WARN_ON(!set->num_connectors);
+
+	ret = drm_atomic_set_mode_for_crtc(crtc_state, set->mode);
+	if (ret != 0)
+		return ret;
+
+	crtc_state->active = true;
+
+	ret = drm_atomic_set_crtc_for_plane(primary_state, crtc);
+	if (ret != 0)
+		return ret;
+
+	drm_mode_get_hv_timing(set->mode, &hdisplay, &vdisplay);
+
+	drm_atomic_set_fb_for_plane(primary_state, set->fb);
+	primary_state->crtc_x = 0;
+	primary_state->crtc_y = 0;
+	primary_state->crtc_w = hdisplay;
+	primary_state->crtc_h = vdisplay;
+	primary_state->src_x = set->x << 16;
+	primary_state->src_y = set->y << 16;
+	if (drm_rotation_90_or_270(primary_state->rotation)) {
+		primary_state->src_w = vdisplay << 16;
+		primary_state->src_h = hdisplay << 16;
+	} else {
+		primary_state->src_w = hdisplay << 16;
+		primary_state->src_h = vdisplay << 16;
+	}
+
+commit:
+	ret = update_output_state(state, set);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(__drm_atomic_helper_set_config);
+
 void drm_atomic_print_state(const struct drm_atomic_state *state)
 {
 	struct drm_printer p = drm_info_printer(state->dev->dev);
diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index acf993cb8e52..0fc63d682245 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -2844,95 +2844,6 @@ fail:
 }
 EXPORT_SYMBOL(drm_atomic_helper_disable_plane);
 
-/* just used from fb-helper and atomic-helper: */
-int __drm_atomic_helper_disable_plane(struct drm_plane *plane,
-		struct drm_plane_state *plane_state)
-{
-	int ret;
-
-	ret = drm_atomic_set_crtc_for_plane(plane_state, NULL);
-	if (ret != 0)
-		return ret;
-
-	drm_atomic_set_fb_for_plane(plane_state, NULL);
-	plane_state->crtc_x = 0;
-	plane_state->crtc_y = 0;
-	plane_state->crtc_w = 0;
-	plane_state->crtc_h = 0;
-	plane_state->src_x = 0;
-	plane_state->src_y = 0;
-	plane_state->src_w = 0;
-	plane_state->src_h = 0;
-
-	return 0;
-}
-
-static int update_output_state(struct drm_atomic_state *state,
-			       struct drm_mode_set *set)
-{
-	struct drm_device *dev = set->crtc->dev;
-	struct drm_crtc *crtc;
-	struct drm_crtc_state *new_crtc_state;
-	struct drm_connector *connector;
-	struct drm_connector_state *new_conn_state;
-	int ret, i;
-
-	ret = drm_modeset_lock(&dev->mode_config.connection_mutex,
-			       state->acquire_ctx);
-	if (ret)
-		return ret;
-
-	/* First disable all connectors on the target crtc. */
-	ret = drm_atomic_add_affected_connectors(state, set->crtc);
-	if (ret)
-		return ret;
-
-	for_each_new_connector_in_state(state, connector, new_conn_state, i) {
-		if (new_conn_state->crtc == set->crtc) {
-			ret = drm_atomic_set_crtc_for_connector(new_conn_state,
-								NULL);
-			if (ret)
-				return ret;
-
-			/* Make sure legacy setCrtc always re-trains */
-			new_conn_state->link_status = DRM_LINK_STATUS_GOOD;
-		}
-	}
-
-	/* Then set all connectors from set->connectors on the target crtc */
-	for (i = 0; i < set->num_connectors; i++) {
-		new_conn_state = drm_atomic_get_connector_state(state,
-							    set->connectors[i]);
-		if (IS_ERR(new_conn_state))
-			return PTR_ERR(new_conn_state);
-
-		ret = drm_atomic_set_crtc_for_connector(new_conn_state,
-							set->crtc);
-		if (ret)
-			return ret;
-	}
-
-	for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) {
-		/* Don't update ->enable for the CRTC in the set_config request,
-		 * since a mismatch would indicate a bug in the upper layers.
-		 * The actual modeset code later on will catch any
-		 * inconsistencies here. */
-		if (crtc == set->crtc)
-			continue;
-
-		if (!new_crtc_state->connector_mask) {
-			ret = drm_atomic_set_mode_prop_for_crtc(new_crtc_state,
-								NULL);
-			if (ret < 0)
-				return ret;
-
-			new_crtc_state->active = false;
-		}
-	}
-
-	return 0;
-}
-
 /**
  * drm_atomic_helper_set_config - set a new config from userspace
  * @set: mode set configuration
@@ -2977,81 +2888,6 @@ fail:
 }
 EXPORT_SYMBOL(drm_atomic_helper_set_config);
 
-/* just used from fb-helper and atomic-helper: */
-int __drm_atomic_helper_set_config(struct drm_mode_set *set,
-		struct drm_atomic_state *state)
-{
-	struct drm_crtc_state *crtc_state;
-	struct drm_plane_state *primary_state;
-	struct drm_crtc *crtc = set->crtc;
-	int hdisplay, vdisplay;
-	int ret;
-
-	crtc_state = drm_atomic_get_crtc_state(state, crtc);
-	if (IS_ERR(crtc_state))
-		return PTR_ERR(crtc_state);
-
-	primary_state = drm_atomic_get_plane_state(state, crtc->primary);
-	if (IS_ERR(primary_state))
-		return PTR_ERR(primary_state);
-
-	if (!set->mode) {
-		WARN_ON(set->fb);
-		WARN_ON(set->num_connectors);
-
-		ret = drm_atomic_set_mode_for_crtc(crtc_state, NULL);
-		if (ret != 0)
-			return ret;
-
-		crtc_state->active = false;
-
-		ret = drm_atomic_set_crtc_for_plane(primary_state, NULL);
-		if (ret != 0)
-			return ret;
-
-		drm_atomic_set_fb_for_plane(primary_state, NULL);
-
-		goto commit;
-	}
-
-	WARN_ON(!set->fb);
-	WARN_ON(!set->num_connectors);
-
-	ret = drm_atomic_set_mode_for_crtc(crtc_state, set->mode);
-	if (ret != 0)
-		return ret;
-
-	crtc_state->active = true;
-
-	ret = drm_atomic_set_crtc_for_plane(primary_state, crtc);
-	if (ret != 0)
-		return ret;
-
-	drm_mode_get_hv_timing(set->mode, &hdisplay, &vdisplay);
-
-	drm_atomic_set_fb_for_plane(primary_state, set->fb);
-	primary_state->crtc_x = 0;
-	primary_state->crtc_y = 0;
-	primary_state->crtc_w = hdisplay;
-	primary_state->crtc_h = vdisplay;
-	primary_state->src_x = set->x << 16;
-	primary_state->src_y = set->y << 16;
-	if (drm_rotation_90_or_270(primary_state->rotation)) {
-		primary_state->src_w = vdisplay << 16;
-		primary_state->src_h = hdisplay << 16;
-	} else {
-		primary_state->src_w = hdisplay << 16;
-		primary_state->src_h = vdisplay << 16;
-	}
-
-commit:
-	ret = update_output_state(state, set);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
 /**
  * drm_atomic_helper_disable_all - disable all currently active outputs
  * @dev: DRM device
diff --git a/drivers/gpu/drm/drm_crtc_internal.h b/drivers/gpu/drm/drm_crtc_internal.h
index c78a44fad13d..c7d5e4c21423 100644
--- a/drivers/gpu/drm/drm_crtc_internal.h
+++ b/drivers/gpu/drm/drm_crtc_internal.h
@@ -50,7 +50,9 @@ struct drm_mode_create_dumb;
 struct drm_mode_fb_cmd2;
 struct drm_mode_fb_cmd;
 struct drm_mode_object;
+struct drm_mode_set;
 struct drm_plane;
+struct drm_plane_state;
 struct drm_property;
 struct edid;
 struct kref;
@@ -223,6 +225,11 @@ struct drm_minor;
 int drm_atomic_debugfs_init(struct drm_minor *minor);
 #endif
 
+int __drm_atomic_helper_disable_plane(struct drm_plane *plane,
+				      struct drm_plane_state *plane_state);
+int __drm_atomic_helper_set_config(struct drm_mode_set *set,
+				   struct drm_atomic_state *state);
+
 void drm_atomic_print_state(const struct drm_atomic_state *state);
 
 /* drm_atomic_uapi.c */
diff --git a/include/drm/drm_atomic_helper.h b/include/drm/drm_atomic_helper.h
index 58214be3bf3d..bf4e07141d81 100644
--- a/include/drm/drm_atomic_helper.h
+++ b/include/drm/drm_atomic_helper.h
@@ -117,12 +117,8 @@ int drm_atomic_helper_update_plane(struct drm_plane *plane,
 				   struct drm_modeset_acquire_ctx *ctx);
 int drm_atomic_helper_disable_plane(struct drm_plane *plane,
 				    struct drm_modeset_acquire_ctx *ctx);
-int __drm_atomic_helper_disable_plane(struct drm_plane *plane,
-		struct drm_plane_state *plane_state);
 int drm_atomic_helper_set_config(struct drm_mode_set *set,
 				 struct drm_modeset_acquire_ctx *ctx);
-int __drm_atomic_helper_set_config(struct drm_mode_set *set,
-		struct drm_atomic_state *state);
 
 int drm_atomic_helper_disable_all(struct drm_device *dev,
 				  struct drm_modeset_acquire_ctx *ctx);
-- 
cgit v1.2.3


From aec3925f093d3ac880c53ec03ea28ef8608b0a52 Mon Sep 17 00:00:00 2001
From: Noralf Trønnes <noralf@tronnes.org>
Date: Fri, 31 May 2019 16:01:13 +0200
Subject: drm/fb-helper: Move out commit code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the modeset commit code to drm_client_modeset.
No changes except exporting API.

v7: Export drm_client_panel_rotation() (Gerd Hoffmann)
v2: Move to drm_client_modeset.c instead of drm_client.c

Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190531140117.37751-5-noralf@tronnes.org
---
 drivers/gpu/drm/drm_client_modeset.c | 288 +++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/drm_fb_helper.c      | 282 ----------------------------------
 include/drm/drm_client.h             |   4 +
 3 files changed, 292 insertions(+), 282 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c
index 66770ed3299e..b1984f901a6d 100644
--- a/drivers/gpu/drm/drm_client_modeset.c
+++ b/drivers/gpu/drm/drm_client_modeset.c
@@ -11,9 +11,14 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 
+#include <drm/drm_atomic.h>
 #include <drm/drm_client.h>
 #include <drm/drm_crtc.h>
 #include <drm/drm_device.h>
+#include <drm/drm_drv.h>
+
+#include "drm_crtc_internal.h"
+#include "drm_internal.h"
 
 int drm_client_modeset_create(struct drm_client_dev *client)
 {
@@ -102,3 +107,286 @@ struct drm_mode_set *drm_client_find_modeset(struct drm_client_dev *client, stru
 }
 /* TODO: Remove export when modeset code has been moved over */
 EXPORT_SYMBOL(drm_client_find_modeset);
+
+/**
+ * drm_client_panel_rotation() - Check panel orientation
+ * @modeset: DRM modeset
+ * @rotation: Returned rotation value
+ *
+ * This function checks if the primary plane in @modeset can hw rotate to match
+ * the panel orientation on its connector.
+ *
+ * Note: Currently only 0 and 180 degrees are supported.
+ *
+ * Return:
+ * True if the plane can do the rotation, false otherwise.
+ */
+bool drm_client_panel_rotation(struct drm_mode_set *modeset, unsigned int *rotation)
+{
+	struct drm_connector *connector = modeset->connectors[0];
+	struct drm_plane *plane = modeset->crtc->primary;
+	u64 valid_mask = 0;
+	unsigned int i;
+
+	if (!modeset->num_connectors)
+		return false;
+
+	switch (connector->display_info.panel_orientation) {
+	case DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP:
+		*rotation = DRM_MODE_ROTATE_180;
+		break;
+	case DRM_MODE_PANEL_ORIENTATION_LEFT_UP:
+		*rotation = DRM_MODE_ROTATE_90;
+		break;
+	case DRM_MODE_PANEL_ORIENTATION_RIGHT_UP:
+		*rotation = DRM_MODE_ROTATE_270;
+		break;
+	default:
+		*rotation = DRM_MODE_ROTATE_0;
+	}
+
+	/*
+	 * TODO: support 90 / 270 degree hardware rotation,
+	 * depending on the hardware this may require the framebuffer
+	 * to be in a specific tiling format.
+	 */
+	if (*rotation != DRM_MODE_ROTATE_180 || !plane->rotation_property)
+		return false;
+
+	for (i = 0; i < plane->rotation_property->num_values; i++)
+		valid_mask |= (1ULL << plane->rotation_property->values[i]);
+
+	if (!(*rotation & valid_mask))
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL(drm_client_panel_rotation);
+
+static int drm_client_modeset_commit_atomic(struct drm_client_dev *client, bool active)
+{
+	struct drm_device *dev = client->dev;
+	struct drm_plane_state *plane_state;
+	struct drm_plane *plane;
+	struct drm_atomic_state *state;
+	struct drm_modeset_acquire_ctx ctx;
+	struct drm_mode_set *mode_set;
+	int ret;
+
+	drm_modeset_acquire_init(&ctx, 0);
+
+	state = drm_atomic_state_alloc(dev);
+	if (!state) {
+		ret = -ENOMEM;
+		goto out_ctx;
+	}
+
+	state->acquire_ctx = &ctx;
+retry:
+	drm_for_each_plane(plane, dev) {
+		plane_state = drm_atomic_get_plane_state(state, plane);
+		if (IS_ERR(plane_state)) {
+			ret = PTR_ERR(plane_state);
+			goto out_state;
+		}
+
+		plane_state->rotation = DRM_MODE_ROTATE_0;
+
+		/* disable non-primary: */
+		if (plane->type == DRM_PLANE_TYPE_PRIMARY)
+			continue;
+
+		ret = __drm_atomic_helper_disable_plane(plane, plane_state);
+		if (ret != 0)
+			goto out_state;
+	}
+
+	drm_client_for_each_modeset(mode_set, client) {
+		struct drm_plane *primary = mode_set->crtc->primary;
+		unsigned int rotation;
+
+		if (drm_client_panel_rotation(mode_set, &rotation)) {
+			/* Cannot fail as we've already gotten the plane state above */
+			plane_state = drm_atomic_get_new_plane_state(state, primary);
+			plane_state->rotation = rotation;
+		}
+
+		ret = __drm_atomic_helper_set_config(mode_set, state);
+		if (ret != 0)
+			goto out_state;
+
+		/*
+		 * __drm_atomic_helper_set_config() sets active when a
+		 * mode is set, unconditionally clear it if we force DPMS off
+		 */
+		if (!active) {
+			struct drm_crtc *crtc = mode_set->crtc;
+			struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc);
+
+			crtc_state->active = false;
+		}
+	}
+
+	ret = drm_atomic_commit(state);
+
+out_state:
+	if (ret == -EDEADLK)
+		goto backoff;
+
+	drm_atomic_state_put(state);
+out_ctx:
+	drm_modeset_drop_locks(&ctx);
+	drm_modeset_acquire_fini(&ctx);
+
+	return ret;
+
+backoff:
+	drm_atomic_state_clear(state);
+	drm_modeset_backoff(&ctx);
+
+	goto retry;
+}
+
+static int drm_client_modeset_commit_legacy(struct drm_client_dev *client)
+{
+	struct drm_device *dev = client->dev;
+	struct drm_mode_set *mode_set;
+	struct drm_plane *plane;
+	int ret = 0;
+
+	drm_modeset_lock_all(dev);
+	drm_for_each_plane(plane, dev) {
+		if (plane->type != DRM_PLANE_TYPE_PRIMARY)
+			drm_plane_force_disable(plane);
+
+		if (plane->rotation_property)
+			drm_mode_plane_set_obj_prop(plane,
+						    plane->rotation_property,
+						    DRM_MODE_ROTATE_0);
+	}
+
+	drm_client_for_each_modeset(mode_set, client) {
+		struct drm_crtc *crtc = mode_set->crtc;
+
+		if (crtc->funcs->cursor_set2) {
+			ret = crtc->funcs->cursor_set2(crtc, NULL, 0, 0, 0, 0, 0);
+			if (ret)
+				goto out;
+		} else if (crtc->funcs->cursor_set) {
+			ret = crtc->funcs->cursor_set(crtc, NULL, 0, 0, 0);
+			if (ret)
+				goto out;
+		}
+
+		ret = drm_mode_set_config_internal(mode_set);
+		if (ret)
+			goto out;
+	}
+out:
+	drm_modeset_unlock_all(dev);
+
+	return ret;
+}
+
+/**
+ * drm_client_modeset_commit_force() - Force commit CRTC configuration
+ * @client: DRM client
+ *
+ * Commit modeset configuration to crtcs without checking if there is a DRM master.
+ *
+ * Returns:
+ * Zero on success or negative error code on failure.
+ */
+int drm_client_modeset_commit_force(struct drm_client_dev *client)
+{
+	struct drm_device *dev = client->dev;
+	int ret;
+
+	mutex_lock(&client->modeset_mutex);
+	if (drm_drv_uses_atomic_modeset(dev))
+		ret = drm_client_modeset_commit_atomic(client, true);
+	else
+		ret = drm_client_modeset_commit_legacy(client);
+	mutex_unlock(&client->modeset_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL(drm_client_modeset_commit_force);
+
+/**
+ * drm_client_modeset_commit() - Commit CRTC configuration
+ * @client: DRM client
+ *
+ * Commit modeset configuration to crtcs.
+ *
+ * Returns:
+ * Zero on success or negative error code on failure.
+ */
+int drm_client_modeset_commit(struct drm_client_dev *client)
+{
+	struct drm_device *dev = client->dev;
+	int ret;
+
+	if (!drm_master_internal_acquire(dev))
+		return -EBUSY;
+
+	ret = drm_client_modeset_commit_force(client);
+
+	drm_master_internal_release(dev);
+
+	return ret;
+}
+EXPORT_SYMBOL(drm_client_modeset_commit);
+
+static void drm_client_modeset_dpms_legacy(struct drm_client_dev *client, int dpms_mode)
+{
+	struct drm_device *dev = client->dev;
+	struct drm_connector *connector;
+	struct drm_mode_set *modeset;
+	int j;
+
+	drm_modeset_lock_all(dev);
+	drm_client_for_each_modeset(modeset, client) {
+		if (!modeset->crtc->enabled)
+			continue;
+
+		for (j = 0; j < modeset->num_connectors; j++) {
+			connector = modeset->connectors[j];
+			connector->funcs->dpms(connector, dpms_mode);
+			drm_object_property_set_value(&connector->base,
+				dev->mode_config.dpms_property, dpms_mode);
+		}
+	}
+	drm_modeset_unlock_all(dev);
+}
+
+/**
+ * drm_client_modeset_dpms() - Set DPMS mode
+ * @client: DRM client
+ * @mode: DPMS mode
+ *
+ * Note: For atomic drivers @mode is reduced to on/off.
+ *
+ * Returns:
+ * Zero on success or negative error code on failure.
+ */
+int drm_client_modeset_dpms(struct drm_client_dev *client, int mode)
+{
+	struct drm_device *dev = client->dev;
+	int ret = 0;
+
+	if (!drm_master_internal_acquire(dev))
+		return -EBUSY;
+
+	mutex_lock(&client->modeset_mutex);
+	if (drm_drv_uses_atomic_modeset(dev))
+		ret = drm_client_modeset_commit_atomic(client, mode == DRM_MODE_DPMS_ON);
+	else
+		drm_client_modeset_dpms_legacy(client, mode);
+	mutex_unlock(&client->modeset_mutex);
+
+	drm_master_internal_release(dev);
+
+	return ret;
+}
+EXPORT_SYMBOL(drm_client_modeset_dpms);
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 64800d43837d..7b388674a456 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -38,7 +38,6 @@
 #include <linux/vmalloc.h>
 
 #include <drm/drm_atomic.h>
-#include <drm/drm_atomic_helper.h>
 #include <drm/drm_crtc.h>
 #include <drm/drm_crtc_helper.h>
 #include <drm/drm_drv.h>
@@ -47,8 +46,6 @@
 #include <drm/drm_print.h>
 #include <drm/drm_vblank.h>
 
-#include "drm_crtc_helper_internal.h"
-#include "drm_crtc_internal.h"
 #include "drm_internal.h"
 
 static bool drm_fbdev_emulation = true;
@@ -393,233 +390,6 @@ int drm_fb_helper_debug_leave(struct fb_info *info)
 }
 EXPORT_SYMBOL(drm_fb_helper_debug_leave);
 
-/**
- * drm_client_panel_rotation() - Check panel orientation
- * @modeset: DRM modeset
- * @rotation: Returned rotation value
- *
- * This function checks if the primary plane in @modeset can hw rotate to match
- * the panel orientation on its connector.
- *
- * Note: Currently only 0 and 180 degrees are supported.
- *
- * Return:
- * True if the plane can do the rotation, false otherwise.
- */
-bool drm_client_panel_rotation(struct drm_mode_set *modeset, unsigned int *rotation)
-{
-	struct drm_connector *connector = modeset->connectors[0];
-	struct drm_plane *plane = modeset->crtc->primary;
-	u64 valid_mask = 0;
-	unsigned int i;
-
-	if (!modeset->num_connectors)
-		return false;
-
-	switch (connector->display_info.panel_orientation) {
-	case DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP:
-		*rotation = DRM_MODE_ROTATE_180;
-		break;
-	case DRM_MODE_PANEL_ORIENTATION_LEFT_UP:
-		*rotation = DRM_MODE_ROTATE_90;
-		break;
-	case DRM_MODE_PANEL_ORIENTATION_RIGHT_UP:
-		*rotation = DRM_MODE_ROTATE_270;
-		break;
-	default:
-		*rotation = DRM_MODE_ROTATE_0;
-	}
-
-	/*
-	 * TODO: support 90 / 270 degree hardware rotation,
-	 * depending on the hardware this may require the framebuffer
-	 * to be in a specific tiling format.
-	 */
-	if (*rotation != DRM_MODE_ROTATE_180 || !plane->rotation_property)
-		return false;
-
-	for (i = 0; i < plane->rotation_property->num_values; i++)
-		valid_mask |= (1ULL << plane->rotation_property->values[i]);
-
-	if (!(*rotation & valid_mask))
-		return false;
-
-	return true;
-}
-
-static int drm_client_modeset_commit_atomic(struct drm_client_dev *client, bool active)
-{
-	struct drm_device *dev = client->dev;
-	struct drm_plane_state *plane_state;
-	struct drm_plane *plane;
-	struct drm_atomic_state *state;
-	struct drm_modeset_acquire_ctx ctx;
-	struct drm_mode_set *mode_set;
-	int ret;
-
-	drm_modeset_acquire_init(&ctx, 0);
-
-	state = drm_atomic_state_alloc(dev);
-	if (!state) {
-		ret = -ENOMEM;
-		goto out_ctx;
-	}
-
-	state->acquire_ctx = &ctx;
-retry:
-	drm_for_each_plane(plane, dev) {
-		plane_state = drm_atomic_get_plane_state(state, plane);
-		if (IS_ERR(plane_state)) {
-			ret = PTR_ERR(plane_state);
-			goto out_state;
-		}
-
-		plane_state->rotation = DRM_MODE_ROTATE_0;
-
-		/* disable non-primary: */
-		if (plane->type == DRM_PLANE_TYPE_PRIMARY)
-			continue;
-
-		ret = __drm_atomic_helper_disable_plane(plane, plane_state);
-		if (ret != 0)
-			goto out_state;
-	}
-
-	drm_client_for_each_modeset(mode_set, client) {
-		struct drm_plane *primary = mode_set->crtc->primary;
-		unsigned int rotation;
-
-		if (drm_client_panel_rotation(mode_set, &rotation)) {
-			/* Cannot fail as we've already gotten the plane state above */
-			plane_state = drm_atomic_get_new_plane_state(state, primary);
-			plane_state->rotation = rotation;
-		}
-
-		ret = __drm_atomic_helper_set_config(mode_set, state);
-		if (ret != 0)
-			goto out_state;
-
-		/*
-		 * __drm_atomic_helper_set_config() sets active when a
-		 * mode is set, unconditionally clear it if we force DPMS off
-		 */
-		if (!active) {
-			struct drm_crtc *crtc = mode_set->crtc;
-			struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc);
-
-			crtc_state->active = false;
-		}
-	}
-
-	ret = drm_atomic_commit(state);
-
-out_state:
-	if (ret == -EDEADLK)
-		goto backoff;
-
-	drm_atomic_state_put(state);
-out_ctx:
-	drm_modeset_drop_locks(&ctx);
-	drm_modeset_acquire_fini(&ctx);
-
-	return ret;
-
-backoff:
-	drm_atomic_state_clear(state);
-	drm_modeset_backoff(&ctx);
-
-	goto retry;
-}
-
-static int drm_client_modeset_commit_legacy(struct drm_client_dev *client)
-{
-	struct drm_device *dev = client->dev;
-	struct drm_mode_set *mode_set;
-	struct drm_plane *plane;
-	int ret = 0;
-
-	drm_modeset_lock_all(dev);
-	drm_for_each_plane(plane, dev) {
-		if (plane->type != DRM_PLANE_TYPE_PRIMARY)
-			drm_plane_force_disable(plane);
-
-		if (plane->rotation_property)
-			drm_mode_plane_set_obj_prop(plane,
-						    plane->rotation_property,
-						    DRM_MODE_ROTATE_0);
-	}
-
-	drm_client_for_each_modeset(mode_set, client) {
-		struct drm_crtc *crtc = mode_set->crtc;
-
-		if (crtc->funcs->cursor_set2) {
-			ret = crtc->funcs->cursor_set2(crtc, NULL, 0, 0, 0, 0, 0);
-			if (ret)
-				goto out;
-		} else if (crtc->funcs->cursor_set) {
-			ret = crtc->funcs->cursor_set(crtc, NULL, 0, 0, 0);
-			if (ret)
-				goto out;
-		}
-
-		ret = drm_mode_set_config_internal(mode_set);
-		if (ret)
-			goto out;
-	}
-out:
-	drm_modeset_unlock_all(dev);
-
-	return ret;
-}
-
-/**
- * drm_client_modeset_commit_force() - Force commit CRTC configuration
- * @client: DRM client
- *
- * Commit modeset configuration to crtcs without checking if there is a DRM master.
- *
- * Returns:
- * Zero on success or negative error code on failure.
- */
-int drm_client_modeset_commit_force(struct drm_client_dev *client)
-{
-	struct drm_device *dev = client->dev;
-	int ret;
-
-	mutex_lock(&client->modeset_mutex);
-	if (drm_drv_uses_atomic_modeset(dev))
-		ret = drm_client_modeset_commit_atomic(client, true);
-	else
-		ret = drm_client_modeset_commit_legacy(client);
-	mutex_unlock(&client->modeset_mutex);
-
-	return ret;
-}
-
-/**
- * drm_client_modeset_commit() - Commit CRTC configuration
- * @client: DRM client
- *
- * Commit modeset configuration to crtcs.
- *
- * Returns:
- * Zero on success or negative error code on failure.
- */
-int drm_client_modeset_commit(struct drm_client_dev *client)
-{
-	struct drm_device *dev = client->dev;
-	int ret;
-
-	if (!drm_master_internal_acquire(dev))
-		return -EBUSY;
-
-	ret = drm_client_modeset_commit_force(client);
-
-	drm_master_internal_release(dev);
-
-	return ret;
-}
-
 /**
  * drm_fb_helper_restore_fbdev_mode_unlocked - restore fbdev configuration
  * @fb_helper: driver-allocated fbdev helper, can be NULL
@@ -719,58 +489,6 @@ static struct sysrq_key_op sysrq_drm_fb_helper_restore_op = {
 static struct sysrq_key_op sysrq_drm_fb_helper_restore_op = { };
 #endif
 
-static void drm_client_modeset_dpms_legacy(struct drm_client_dev *client, int dpms_mode)
-{
-	struct drm_device *dev = client->dev;
-	struct drm_connector *connector;
-	struct drm_mode_set *modeset;
-	int j;
-
-	drm_modeset_lock_all(dev);
-	drm_client_for_each_modeset(modeset, client) {
-		if (!modeset->crtc->enabled)
-			continue;
-
-		for (j = 0; j < modeset->num_connectors; j++) {
-			connector = modeset->connectors[j];
-			connector->funcs->dpms(connector, dpms_mode);
-			drm_object_property_set_value(&connector->base,
-				dev->mode_config.dpms_property, dpms_mode);
-		}
-	}
-	drm_modeset_unlock_all(dev);
-}
-
-/**
- * drm_client_modeset_dpms() - Set DPMS mode
- * @client: DRM client
- * @mode: DPMS mode
- *
- * Note: For atomic drivers @mode is reduced to on/off.
- *
- * Returns:
- * Zero on success or negative error code on failure.
- */
-int drm_client_modeset_dpms(struct drm_client_dev *client, int mode)
-{
-	struct drm_device *dev = client->dev;
-	int ret = 0;
-
-	if (!drm_master_internal_acquire(dev))
-		return -EBUSY;
-
-	mutex_lock(&client->modeset_mutex);
-	if (drm_drv_uses_atomic_modeset(dev))
-		ret = drm_client_modeset_commit_atomic(client, mode == DRM_MODE_DPMS_ON);
-	else
-		drm_client_modeset_dpms_legacy(client, mode);
-	mutex_unlock(&client->modeset_mutex);
-
-	drm_master_internal_release(dev);
-
-	return ret;
-}
-
 static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode)
 {
 	struct drm_fb_helper *fb_helper = info->par;
diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h
index 87be9aeb1fe0..6cf48419f77f 100644
--- a/include/drm/drm_client.h
+++ b/include/drm/drm_client.h
@@ -155,6 +155,10 @@ int drm_client_modeset_create(struct drm_client_dev *client);
 void drm_client_modeset_free(struct drm_client_dev *client);
 void drm_client_modeset_release(struct drm_client_dev *client);
 struct drm_mode_set *drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc);
+bool drm_client_panel_rotation(struct drm_mode_set *modeset, unsigned int *rotation);
+int drm_client_modeset_commit_force(struct drm_client_dev *client);
+int drm_client_modeset_commit(struct drm_client_dev *client);
+int drm_client_modeset_dpms(struct drm_client_dev *client, int mode);
 
 /**
  * drm_client_for_each_modeset() - Iterate over client modesets
-- 
cgit v1.2.3


From f652e66fcca07e59f207bcca27c5566193feabd5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Sun, 9 Jun 2019 23:43:13 +0900
Subject: pinctrl: add include guard to pinctrl-state.h

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinctrl-state.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/pinctrl/pinctrl-state.h b/include/linux/pinctrl/pinctrl-state.h
index a0e785815a64..635d97e9285e 100644
--- a/include/linux/pinctrl/pinctrl-state.h
+++ b/include/linux/pinctrl/pinctrl-state.h
@@ -3,6 +3,9 @@
  * Standard pin control state definitions
  */
 
+#ifndef __LINUX_PINCTRL_PINCTRL_STATE_H
+#define __LINUX_PINCTRL_PINCTRL_STATE_H
+
 /**
  * @PINCTRL_STATE_DEFAULT: the state the pinctrl handle shall be put
  *	into as default, usually this means the pins are up and ready to
@@ -31,3 +34,5 @@
 #define PINCTRL_STATE_INIT "init"
 #define PINCTRL_STATE_IDLE "idle"
 #define PINCTRL_STATE_SLEEP "sleep"
+
+#endif /* __LINUX_PINCTRL_PINCTRL_STATE_H */
-- 
cgit v1.2.3


From e63d79d1ffcd2201a2dbff1d7a1184b8f3ec74cf Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <Gustavo.Pimentel@synopsys.com>
Date: Tue, 4 Jun 2019 15:29:22 +0200
Subject: dmaengine: Add Synopsys eDMA IP core driver

Add Synopsys PCIe Endpoint eDMA IP core driver to kernel.

This IP is generally distributed with Synopsys PCIe Endpoint IP (depends
of the use and licensing agreement).

This core driver, initializes and configures the eDMA IP using vma-helpers
functions and dma-engine subsystem.

This driver can be compile as built-in or external module in kernel.

To enable this driver just select DW_EDMA option in kernel configuration,
however it requires and selects automatically DMA_ENGINE and
DMA_VIRTUAL_CHANNELS option too.

In order to transfer data from point A to B as fast as possible this IP
requires a dedicated memory space containing linked list of elements.

All elements of this linked list are continuous and each one describes a
data transfer (source and destination addresses, length and a control
variable).

For the sake of simplicity, lets assume a memory space for channel write
0 which allows about 42 elements.

+---------+
| Desc #0 |-+
+---------+ |
            V
       +----------+
       | Chunk #0 |-+
       |  CB = 1  | |  +----------+  +-----+  +-----------+  +-----+
       +----------+ +->| Burst #0 |->| ... |->| Burst #41 |->| llp |
            |          +----------+  +-----+  +-----------+  +-----+
            V
       +----------+
       | Chunk #1 |-+
       |  CB = 0  | |  +-----------+  +-----+  +-----------+  +-----+
       +----------+ +->| Burst #42 |->| ... |->| Burst #83 |->| llp |
            |          +-----------+  +-----+  +-----------+  +-----+
            V
       +----------+
       | Chunk #2 |-+
       |  CB = 1  | |  +-----------+  +-----+  +------------+  +-----+
       +----------+ +->| Burst #84 |->| ... |->| Burst #125 |->| llp |
            |          +-----------+  +-----+  +------------+  +-----+
            V
       +----------+
       | Chunk #3 |-+
       |  CB = 0  | |  +------------+  +-----+  +------------+  +-----+
       +----------+ +->| Burst #126 |->| ... |->| Burst #129 |->| llp |
                       +------------+  +-----+  +------------+  +-----+

Legend:
 - Linked list, also know as Chunk
 - Linked list element*, also know as Burst *CB*, also know as Change Bit,
it's a control bit (and typically is toggled) that allows to easily
identify and differentiate between the current linked list and the
previous or the next one.
 - LLP, is a special element that indicates the end of the linked list
element stream also informs that the next CB should be toggle

On every last Burst of the Chunk (Burst #41, Burst #83, Burst #125 or
even Burst #129) is set some flags on their control variable (RIE and
LIE bits) that will trigger the send of "done" interruption.

On the interruptions callback, is decided whether to recycle the linked
list memory space by writing a new set of Bursts elements (if still
exists Chunks to transfer) or is considered completed (if there is no
Chunks available to transfer).

On scatter-gather transfer mode, the client will submit a scatter-gather
list of n (on this case 130) elements, that will be divide in multiple
Chunks, each Chunk will have (on this case 42) a limited number of
Bursts and after transferring all Bursts, an interrupt will be
triggered, which will allow to recycle the all linked list dedicated
memory again with the new information relative to the next Chunk and
respective Burst associated and repeat the whole cycle again.

On cyclic transfer mode, the client will submit a buffer pointer, length
of it and number of repetitions, in this case each burst will correspond
directly to each repetition.

Each Burst can describes a data transfer from point A(source) to point
B(destination) with a length that can be from 1 byte up to 4 GB. Since
dedicated the memory space where the linked list will reside is limited,
the whole n burst elements will be organized in several Chunks, that
will be used later to recycle the dedicated memory space to initiate a
new sequence of data transfers.

The whole transfer is considered has completed when it was transferred
all bursts.

Currently this IP has a set well-known register map, which includes
support for legacy and unroll modes. Legacy mode is version of this
register map that has multiplexer register that allows to switch
registers between all write and read channels and the unroll modes
repeats all write and read channels registers with an offset between
them. This register map is called v0.

The IP team is creating a new register map more suitable to the latest
PCIe features, that very likely will change the map register, which this
version will be called v1. As soon as this new version is released by
the IP team the support for this version in be included on this driver.

According to the logic, patches 1, 2 and 3 should be squashed into 1
unique patch, but for the sake of simplicity of review, it was divided
in this 3 patches files.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Cc: Vinod Koul <vkoul@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Joao Pinto <jpinto@synopsys.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/Kconfig                |   2 +
 drivers/dma/Makefile               |   1 +
 drivers/dma/dw-edma/Kconfig        |   9 +
 drivers/dma/dw-edma/Makefile       |   4 +
 drivers/dma/dw-edma/dw-edma-core.c | 936 +++++++++++++++++++++++++++++++++++++
 drivers/dma/dw-edma/dw-edma-core.h | 165 +++++++
 include/linux/dma/edma.h           |  47 ++
 7 files changed, 1164 insertions(+)
 create mode 100644 drivers/dma/dw-edma/Kconfig
 create mode 100644 drivers/dma/dw-edma/Makefile
 create mode 100644 drivers/dma/dw-edma/dw-edma-core.c
 create mode 100644 drivers/dma/dw-edma/dw-edma-core.h
 create mode 100644 include/linux/dma/edma.h

(limited to 'include')

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index eaf78f4e07ce..76859aa2688c 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -665,6 +665,8 @@ source "drivers/dma/qcom/Kconfig"
 
 source "drivers/dma/dw/Kconfig"
 
+source "drivers/dma/dw-edma/Kconfig"
+
 source "drivers/dma/hsu/Kconfig"
 
 source "drivers/dma/sh/Kconfig"
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 6126e1c3a875..5bddf6f8790f 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_DMA_SUN4I) += sun4i-dma.o
 obj-$(CONFIG_DMA_SUN6I) += sun6i-dma.o
 obj-$(CONFIG_DW_AXI_DMAC) += dw-axi-dmac/
 obj-$(CONFIG_DW_DMAC_CORE) += dw/
+obj-$(CONFIG_DW_EDMA) += dw-edma/
 obj-$(CONFIG_EP93XX_DMA) += ep93xx_dma.o
 obj-$(CONFIG_FSL_DMA) += fsldma.o
 obj-$(CONFIG_FSL_EDMA) += fsl-edma.o fsl-edma-common.o
diff --git a/drivers/dma/dw-edma/Kconfig b/drivers/dma/dw-edma/Kconfig
new file mode 100644
index 000000000000..3016bed63589
--- /dev/null
+++ b/drivers/dma/dw-edma/Kconfig
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config DW_EDMA
+	tristate "Synopsys DesignWare eDMA controller driver"
+	select DMA_ENGINE
+	select DMA_VIRTUAL_CHANNELS
+	help
+	  Support the Synopsys DesignWare eDMA controller, normally
+	  implemented on endpoints SoCs.
diff --git a/drivers/dma/dw-edma/Makefile b/drivers/dma/dw-edma/Makefile
new file mode 100644
index 000000000000..322401089891
--- /dev/null
+++ b/drivers/dma/dw-edma/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_DW_EDMA)		+= dw-edma.o
+dw-edma-objs			:= dw-edma-core.o
diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
new file mode 100644
index 000000000000..c9d032f49dc3
--- /dev/null
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -0,0 +1,936 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates.
+ * Synopsys DesignWare eDMA core driver
+ *
+ * Author: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/pm_runtime.h>
+#include <linux/dmaengine.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/dma/edma.h>
+#include <linux/pci.h>
+
+#include "dw-edma-core.h"
+#include "../dmaengine.h"
+#include "../virt-dma.h"
+
+static inline
+struct device *dchan2dev(struct dma_chan *dchan)
+{
+	return &dchan->dev->device;
+}
+
+static inline
+struct device *chan2dev(struct dw_edma_chan *chan)
+{
+	return &chan->vc.chan.dev->device;
+}
+
+static inline
+struct dw_edma_desc *vd2dw_edma_desc(struct virt_dma_desc *vd)
+{
+	return container_of(vd, struct dw_edma_desc, vd);
+}
+
+static struct dw_edma_burst *dw_edma_alloc_burst(struct dw_edma_chunk *chunk)
+{
+	struct dw_edma_burst *burst;
+
+	burst = kzalloc(sizeof(*burst), GFP_NOWAIT);
+	if (unlikely(!burst))
+		return NULL;
+
+	INIT_LIST_HEAD(&burst->list);
+	if (chunk->burst) {
+		/* Create and add new element into the linked list */
+		chunk->bursts_alloc++;
+		list_add_tail(&burst->list, &chunk->burst->list);
+	} else {
+		/* List head */
+		chunk->bursts_alloc = 0;
+		chunk->burst = burst;
+	}
+
+	return burst;
+}
+
+static struct dw_edma_chunk *dw_edma_alloc_chunk(struct dw_edma_desc *desc)
+{
+	struct dw_edma_chan *chan = desc->chan;
+	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma_chunk *chunk;
+
+	chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
+	if (unlikely(!chunk))
+		return NULL;
+
+	INIT_LIST_HEAD(&chunk->list);
+	chunk->chan = chan;
+	/* Toggling change bit (CB) in each chunk, this is a mechanism to
+	 * inform the eDMA HW block that this is a new linked list ready
+	 * to be consumed.
+	 *  - Odd chunks originate CB equal to 0
+	 *  - Even chunks originate CB equal to 1
+	 */
+	chunk->cb = !(desc->chunks_alloc % 2);
+	chunk->ll_region.paddr = dw->ll_region.paddr + chan->ll_off;
+	chunk->ll_region.vaddr = dw->ll_region.vaddr + chan->ll_off;
+
+	if (desc->chunk) {
+		/* Create and add new element into the linked list */
+		desc->chunks_alloc++;
+		list_add_tail(&chunk->list, &desc->chunk->list);
+		if (!dw_edma_alloc_burst(chunk)) {
+			kfree(chunk);
+			return NULL;
+		}
+	} else {
+		/* List head */
+		chunk->burst = NULL;
+		desc->chunks_alloc = 0;
+		desc->chunk = chunk;
+	}
+
+	return chunk;
+}
+
+static struct dw_edma_desc *dw_edma_alloc_desc(struct dw_edma_chan *chan)
+{
+	struct dw_edma_desc *desc;
+
+	desc = kzalloc(sizeof(*desc), GFP_NOWAIT);
+	if (unlikely(!desc))
+		return NULL;
+
+	desc->chan = chan;
+	if (!dw_edma_alloc_chunk(desc)) {
+		kfree(desc);
+		return NULL;
+	}
+
+	return desc;
+}
+
+static void dw_edma_free_burst(struct dw_edma_chunk *chunk)
+{
+	struct dw_edma_burst *child, *_next;
+
+	/* Remove all the list elements */
+	list_for_each_entry_safe(child, _next, &chunk->burst->list, list) {
+		list_del(&child->list);
+		kfree(child);
+		chunk->bursts_alloc--;
+	}
+
+	/* Remove the list head */
+	kfree(child);
+	chunk->burst = NULL;
+}
+
+static void dw_edma_free_chunk(struct dw_edma_desc *desc)
+{
+	struct dw_edma_chunk *child, *_next;
+
+	if (!desc->chunk)
+		return;
+
+	/* Remove all the list elements */
+	list_for_each_entry_safe(child, _next, &desc->chunk->list, list) {
+		dw_edma_free_burst(child);
+		list_del(&child->list);
+		kfree(child);
+		desc->chunks_alloc--;
+	}
+
+	/* Remove the list head */
+	kfree(child);
+	desc->chunk = NULL;
+}
+
+static void dw_edma_free_desc(struct dw_edma_desc *desc)
+{
+	dw_edma_free_chunk(desc);
+	kfree(desc);
+}
+
+static void vchan_free_desc(struct virt_dma_desc *vdesc)
+{
+	dw_edma_free_desc(vd2dw_edma_desc(vdesc));
+}
+
+static void dw_edma_start_transfer(struct dw_edma_chan *chan)
+{
+	struct dw_edma_chunk *child;
+	struct dw_edma_desc *desc;
+	struct virt_dma_desc *vd;
+
+	vd = vchan_next_desc(&chan->vc);
+	if (!vd)
+		return;
+
+	desc = vd2dw_edma_desc(vd);
+	if (!desc)
+		return;
+
+	child = list_first_entry_or_null(&desc->chunk->list,
+					 struct dw_edma_chunk, list);
+	if (!child)
+		return;
+
+	dw_edma_v0_core_start(child, !desc->xfer_sz);
+	desc->xfer_sz += child->ll_region.sz;
+	dw_edma_free_burst(child);
+	list_del(&child->list);
+	kfree(child);
+	desc->chunks_alloc--;
+}
+
+static int dw_edma_device_config(struct dma_chan *dchan,
+				 struct dma_slave_config *config)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+
+	memcpy(&chan->config, config, sizeof(*config));
+	chan->configured = true;
+
+	return 0;
+}
+
+static int dw_edma_device_pause(struct dma_chan *dchan)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+	int err = 0;
+
+	if (!chan->configured)
+		err = -EPERM;
+	else if (chan->status != EDMA_ST_BUSY)
+		err = -EPERM;
+	else if (chan->request != EDMA_REQ_NONE)
+		err = -EPERM;
+	else
+		chan->request = EDMA_REQ_PAUSE;
+
+	return err;
+}
+
+static int dw_edma_device_resume(struct dma_chan *dchan)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+	int err = 0;
+
+	if (!chan->configured) {
+		err = -EPERM;
+	} else if (chan->status != EDMA_ST_PAUSE) {
+		err = -EPERM;
+	} else if (chan->request != EDMA_REQ_NONE) {
+		err = -EPERM;
+	} else {
+		chan->status = EDMA_ST_BUSY;
+		dw_edma_start_transfer(chan);
+	}
+
+	return err;
+}
+
+static int dw_edma_device_terminate_all(struct dma_chan *dchan)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+	int err = 0;
+	LIST_HEAD(head);
+
+	if (!chan->configured) {
+		/* Do nothing */
+	} else if (chan->status == EDMA_ST_PAUSE) {
+		chan->status = EDMA_ST_IDLE;
+		chan->configured = false;
+	} else if (chan->status == EDMA_ST_IDLE) {
+		chan->configured = false;
+	} else if (dw_edma_v0_core_ch_status(chan) == DMA_COMPLETE) {
+		/*
+		 * The channel is in a false BUSY state, probably didn't
+		 * receive or lost an interrupt
+		 */
+		chan->status = EDMA_ST_IDLE;
+		chan->configured = false;
+	} else if (chan->request > EDMA_REQ_PAUSE) {
+		err = -EPERM;
+	} else {
+		chan->request = EDMA_REQ_STOP;
+	}
+
+	return err;
+}
+
+static void dw_edma_device_issue_pending(struct dma_chan *dchan)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+	unsigned long flags;
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	if (chan->configured && chan->request == EDMA_REQ_NONE &&
+	    chan->status == EDMA_ST_IDLE && vchan_issue_pending(&chan->vc)) {
+		chan->status = EDMA_ST_BUSY;
+		dw_edma_start_transfer(chan);
+	}
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+}
+
+static enum dma_status
+dw_edma_device_tx_status(struct dma_chan *dchan, dma_cookie_t cookie,
+			 struct dma_tx_state *txstate)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+	struct dw_edma_desc *desc;
+	struct virt_dma_desc *vd;
+	unsigned long flags;
+	enum dma_status ret;
+	u32 residue = 0;
+
+	ret = dma_cookie_status(dchan, cookie, txstate);
+	if (ret == DMA_COMPLETE)
+		return ret;
+
+	if (ret == DMA_IN_PROGRESS && chan->status == EDMA_ST_PAUSE)
+		ret = DMA_PAUSED;
+
+	if (!txstate)
+		goto ret_residue;
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	vd = vchan_find_desc(&chan->vc, cookie);
+	if (vd) {
+		desc = vd2dw_edma_desc(vd);
+		if (desc)
+			residue = desc->alloc_sz - desc->xfer_sz;
+	}
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+ret_residue:
+	dma_set_residue(txstate, residue);
+
+	return ret;
+}
+
+static struct dma_async_tx_descriptor *
+dw_edma_device_transfer(struct dw_edma_transfer *xfer)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(xfer->dchan);
+	enum dma_transfer_direction direction = xfer->direction;
+	phys_addr_t src_addr, dst_addr;
+	struct scatterlist *sg = NULL;
+	struct dw_edma_chunk *chunk;
+	struct dw_edma_burst *burst;
+	struct dw_edma_desc *desc;
+	u32 cnt;
+	int i;
+
+	if ((direction == DMA_MEM_TO_DEV && chan->dir == EDMA_DIR_WRITE) ||
+	    (direction == DMA_DEV_TO_MEM && chan->dir == EDMA_DIR_READ))
+		return NULL;
+
+	if (xfer->cyclic) {
+		if (!xfer->xfer.cyclic.len || !xfer->xfer.cyclic.cnt)
+			return NULL;
+	} else {
+		if (xfer->xfer.sg.len < 1)
+			return NULL;
+	}
+
+	if (!chan->configured)
+		return NULL;
+
+	desc = dw_edma_alloc_desc(chan);
+	if (unlikely(!desc))
+		goto err_alloc;
+
+	chunk = dw_edma_alloc_chunk(desc);
+	if (unlikely(!chunk))
+		goto err_alloc;
+
+	src_addr = chan->config.src_addr;
+	dst_addr = chan->config.dst_addr;
+
+	if (xfer->cyclic) {
+		cnt = xfer->xfer.cyclic.cnt;
+	} else {
+		cnt = xfer->xfer.sg.len;
+		sg = xfer->xfer.sg.sgl;
+	}
+
+	for (i = 0; i < cnt; i++) {
+		if (!xfer->cyclic && !sg)
+			break;
+
+		if (chunk->bursts_alloc == chan->ll_max) {
+			chunk = dw_edma_alloc_chunk(desc);
+			if (unlikely(!chunk))
+				goto err_alloc;
+		}
+
+		burst = dw_edma_alloc_burst(chunk);
+		if (unlikely(!burst))
+			goto err_alloc;
+
+		if (xfer->cyclic)
+			burst->sz = xfer->xfer.cyclic.len;
+		else
+			burst->sz = sg_dma_len(sg);
+
+		chunk->ll_region.sz += burst->sz;
+		desc->alloc_sz += burst->sz;
+
+		if (direction == DMA_DEV_TO_MEM) {
+			burst->sar = src_addr;
+			if (xfer->cyclic) {
+				burst->dar = xfer->xfer.cyclic.paddr;
+			} else {
+				burst->dar = sg_dma_address(sg);
+				/* Unlike the typical assumption by other
+				 * drivers/IPs the peripheral memory isn't
+				 * a FIFO memory, in this case, it's a
+				 * linear memory and that why the source
+				 * and destination addresses are increased
+				 * by the same portion (data length)
+				 */
+				src_addr += sg_dma_len(sg);
+			}
+		} else {
+			burst->dar = dst_addr;
+			if (xfer->cyclic) {
+				burst->sar = xfer->xfer.cyclic.paddr;
+			} else {
+				burst->sar = sg_dma_address(sg);
+				/* Unlike the typical assumption by other
+				 * drivers/IPs the peripheral memory isn't
+				 * a FIFO memory, in this case, it's a
+				 * linear memory and that why the source
+				 * and destination addresses are increased
+				 * by the same portion (data length)
+				 */
+				dst_addr += sg_dma_len(sg);
+			}
+		}
+
+		if (!xfer->cyclic)
+			sg = sg_next(sg);
+	}
+
+	return vchan_tx_prep(&chan->vc, &desc->vd, xfer->flags);
+
+err_alloc:
+	if (desc)
+		dw_edma_free_desc(desc);
+
+	return NULL;
+}
+
+static struct dma_async_tx_descriptor *
+dw_edma_device_prep_slave_sg(struct dma_chan *dchan, struct scatterlist *sgl,
+			     unsigned int len,
+			     enum dma_transfer_direction direction,
+			     unsigned long flags, void *context)
+{
+	struct dw_edma_transfer xfer;
+
+	xfer.dchan = dchan;
+	xfer.direction = direction;
+	xfer.xfer.sg.sgl = sgl;
+	xfer.xfer.sg.len = len;
+	xfer.flags = flags;
+	xfer.cyclic = false;
+
+	return dw_edma_device_transfer(&xfer);
+}
+
+static struct dma_async_tx_descriptor *
+dw_edma_device_prep_dma_cyclic(struct dma_chan *dchan, dma_addr_t paddr,
+			       size_t len, size_t count,
+			       enum dma_transfer_direction direction,
+			       unsigned long flags)
+{
+	struct dw_edma_transfer xfer;
+
+	xfer.dchan = dchan;
+	xfer.direction = direction;
+	xfer.xfer.cyclic.paddr = paddr;
+	xfer.xfer.cyclic.len = len;
+	xfer.xfer.cyclic.cnt = count;
+	xfer.flags = flags;
+	xfer.cyclic = true;
+
+	return dw_edma_device_transfer(&xfer);
+}
+
+static void dw_edma_done_interrupt(struct dw_edma_chan *chan)
+{
+	struct dw_edma_desc *desc;
+	struct virt_dma_desc *vd;
+	unsigned long flags;
+
+	dw_edma_v0_core_clear_done_int(chan);
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	vd = vchan_next_desc(&chan->vc);
+	if (vd) {
+		switch (chan->request) {
+		case EDMA_REQ_NONE:
+			desc = vd2dw_edma_desc(vd);
+			if (desc->chunks_alloc) {
+				chan->status = EDMA_ST_BUSY;
+				dw_edma_start_transfer(chan);
+			} else {
+				list_del(&vd->node);
+				vchan_cookie_complete(vd);
+				chan->status = EDMA_ST_IDLE;
+			}
+			break;
+
+		case EDMA_REQ_STOP:
+			list_del(&vd->node);
+			vchan_cookie_complete(vd);
+			chan->request = EDMA_REQ_NONE;
+			chan->status = EDMA_ST_IDLE;
+			break;
+
+		case EDMA_REQ_PAUSE:
+			chan->request = EDMA_REQ_NONE;
+			chan->status = EDMA_ST_PAUSE;
+			break;
+
+		default:
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+}
+
+static void dw_edma_abort_interrupt(struct dw_edma_chan *chan)
+{
+	struct virt_dma_desc *vd;
+	unsigned long flags;
+
+	dw_edma_v0_core_clear_abort_int(chan);
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	vd = vchan_next_desc(&chan->vc);
+	if (vd) {
+		list_del(&vd->node);
+		vchan_cookie_complete(vd);
+	}
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+	chan->request = EDMA_REQ_NONE;
+	chan->status = EDMA_ST_IDLE;
+}
+
+static irqreturn_t dw_edma_interrupt(int irq, void *data, bool write)
+{
+	struct dw_edma_irq *dw_irq = data;
+	struct dw_edma *dw = dw_irq->dw;
+	unsigned long total, pos, val;
+	unsigned long off;
+	u32 mask;
+
+	if (write) {
+		total = dw->wr_ch_cnt;
+		off = 0;
+		mask = dw_irq->wr_mask;
+	} else {
+		total = dw->rd_ch_cnt;
+		off = dw->wr_ch_cnt;
+		mask = dw_irq->rd_mask;
+	}
+
+	val = dw_edma_v0_core_status_done_int(dw, write ?
+							  EDMA_DIR_WRITE :
+							  EDMA_DIR_READ);
+	val &= mask;
+	for_each_set_bit(pos, &val, total) {
+		struct dw_edma_chan *chan = &dw->chan[pos + off];
+
+		dw_edma_done_interrupt(chan);
+	}
+
+	val = dw_edma_v0_core_status_abort_int(dw, write ?
+							   EDMA_DIR_WRITE :
+							   EDMA_DIR_READ);
+	val &= mask;
+	for_each_set_bit(pos, &val, total) {
+		struct dw_edma_chan *chan = &dw->chan[pos + off];
+
+		dw_edma_abort_interrupt(chan);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static inline irqreturn_t dw_edma_interrupt_write(int irq, void *data)
+{
+	return dw_edma_interrupt(irq, data, true);
+}
+
+static inline irqreturn_t dw_edma_interrupt_read(int irq, void *data)
+{
+	return dw_edma_interrupt(irq, data, false);
+}
+
+static irqreturn_t dw_edma_interrupt_common(int irq, void *data)
+{
+	dw_edma_interrupt(irq, data, true);
+	dw_edma_interrupt(irq, data, false);
+
+	return IRQ_HANDLED;
+}
+
+static int dw_edma_alloc_chan_resources(struct dma_chan *dchan)
+{
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+
+	if (chan->status != EDMA_ST_IDLE)
+		return -EBUSY;
+
+	pm_runtime_get(chan->chip->dev);
+
+	return 0;
+}
+
+static void dw_edma_free_chan_resources(struct dma_chan *dchan)
+{
+	unsigned long timeout = jiffies + msecs_to_jiffies(5000);
+	struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan);
+	int ret;
+
+	while (time_before(jiffies, timeout)) {
+		ret = dw_edma_device_terminate_all(dchan);
+		if (!ret)
+			break;
+
+		if (time_after_eq(jiffies, timeout))
+			return;
+
+		cpu_relax();
+	};
+
+	pm_runtime_put(chan->chip->dev);
+}
+
+static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
+				 u32 wr_alloc, u32 rd_alloc)
+{
+	struct dw_edma_region *dt_region;
+	struct device *dev = chip->dev;
+	struct dw_edma *dw = chip->dw;
+	struct dw_edma_chan *chan;
+	size_t ll_chunk, dt_chunk;
+	struct dw_edma_irq *irq;
+	struct dma_device *dma;
+	u32 i, j, cnt, ch_cnt;
+	u32 alloc, off_alloc;
+	int err = 0;
+	u32 pos;
+
+	ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt;
+	ll_chunk = dw->ll_region.sz;
+	dt_chunk = dw->dt_region.sz;
+
+	/* Calculate linked list chunk for each channel */
+	ll_chunk /= roundup_pow_of_two(ch_cnt);
+
+	/* Calculate linked list chunk for each channel */
+	dt_chunk /= roundup_pow_of_two(ch_cnt);
+
+	if (write) {
+		i = 0;
+		cnt = dw->wr_ch_cnt;
+		dma = &dw->wr_edma;
+		alloc = wr_alloc;
+		off_alloc = 0;
+	} else {
+		i = dw->wr_ch_cnt;
+		cnt = dw->rd_ch_cnt;
+		dma = &dw->rd_edma;
+		alloc = rd_alloc;
+		off_alloc = wr_alloc;
+	}
+
+	INIT_LIST_HEAD(&dma->channels);
+	for (j = 0; (alloc || dw->nr_irqs == 1) && j < cnt; j++, i++) {
+		chan = &dw->chan[i];
+
+		dt_region = devm_kzalloc(dev, sizeof(*dt_region), GFP_KERNEL);
+		if (!dt_region)
+			return -ENOMEM;
+
+		chan->vc.chan.private = dt_region;
+
+		chan->chip = chip;
+		chan->id = j;
+		chan->dir = write ? EDMA_DIR_WRITE : EDMA_DIR_READ;
+		chan->configured = false;
+		chan->request = EDMA_REQ_NONE;
+		chan->status = EDMA_ST_IDLE;
+
+		chan->ll_off = (ll_chunk * i);
+		chan->ll_max = (ll_chunk / EDMA_LL_SZ) - 1;
+
+		chan->dt_off = (dt_chunk * i);
+
+		dev_vdbg(dev, "L. List:\tChannel %s[%u] off=0x%.8lx, max_cnt=%u\n",
+			 write ? "write" : "read", j,
+			 chan->ll_off, chan->ll_max);
+
+		if (dw->nr_irqs == 1)
+			pos = 0;
+		else
+			pos = off_alloc + (j % alloc);
+
+		irq = &dw->irq[pos];
+
+		if (write)
+			irq->wr_mask |= BIT(j);
+		else
+			irq->rd_mask |= BIT(j);
+
+		irq->dw = dw;
+		memcpy(&chan->msi, &irq->msi, sizeof(chan->msi));
+
+		dev_vdbg(dev, "MSI:\t\tChannel %s[%u] addr=0x%.8x%.8x, data=0x%.8x\n",
+			 write ? "write" : "read", j,
+			 chan->msi.address_hi, chan->msi.address_lo,
+			 chan->msi.data);
+
+		chan->vc.desc_free = vchan_free_desc;
+		vchan_init(&chan->vc, dma);
+
+		dt_region->paddr = dw->dt_region.paddr + chan->dt_off;
+		dt_region->vaddr = dw->dt_region.vaddr + chan->dt_off;
+		dt_region->sz = dt_chunk;
+
+		dev_vdbg(dev, "Data:\tChannel %s[%u] off=0x%.8lx\n",
+			 write ? "write" : "read", j, chan->dt_off);
+
+		dw_edma_v0_core_device_config(chan);
+	}
+
+	/* Set DMA channel capabilities */
+	dma_cap_zero(dma->cap_mask);
+	dma_cap_set(DMA_SLAVE, dma->cap_mask);
+	dma_cap_set(DMA_CYCLIC, dma->cap_mask);
+	dma_cap_set(DMA_PRIVATE, dma->cap_mask);
+	dma->directions = BIT(write ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV);
+	dma->src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES);
+	dma->dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES);
+	dma->residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR;
+	dma->chancnt = cnt;
+
+	/* Set DMA channel callbacks */
+	dma->dev = chip->dev;
+	dma->device_alloc_chan_resources = dw_edma_alloc_chan_resources;
+	dma->device_free_chan_resources = dw_edma_free_chan_resources;
+	dma->device_config = dw_edma_device_config;
+	dma->device_pause = dw_edma_device_pause;
+	dma->device_resume = dw_edma_device_resume;
+	dma->device_terminate_all = dw_edma_device_terminate_all;
+	dma->device_issue_pending = dw_edma_device_issue_pending;
+	dma->device_tx_status = dw_edma_device_tx_status;
+	dma->device_prep_slave_sg = dw_edma_device_prep_slave_sg;
+	dma->device_prep_dma_cyclic = dw_edma_device_prep_dma_cyclic;
+
+	dma_set_max_seg_size(dma->dev, U32_MAX);
+
+	/* Register DMA device */
+	err = dma_async_device_register(dma);
+
+	return err;
+}
+
+static inline void dw_edma_dec_irq_alloc(int *nr_irqs, u32 *alloc, u16 cnt)
+{
+	if (*nr_irqs && *alloc < cnt) {
+		(*alloc)++;
+		(*nr_irqs)--;
+	}
+}
+
+static inline void dw_edma_add_irq_mask(u32 *mask, u32 alloc, u16 cnt)
+{
+	while (*mask * alloc < cnt)
+		(*mask)++;
+}
+
+static int dw_edma_irq_request(struct dw_edma_chip *chip,
+			       u32 *wr_alloc, u32 *rd_alloc)
+{
+	struct device *dev = chip->dev;
+	struct dw_edma *dw = chip->dw;
+	u32 wr_mask = 1;
+	u32 rd_mask = 1;
+	int i, err = 0;
+	u32 ch_cnt;
+
+	ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt;
+
+	if (dw->nr_irqs < 1)
+		return -EINVAL;
+
+	if (dw->nr_irqs == 1) {
+		/* Common IRQ shared among all channels */
+		err = request_irq(pci_irq_vector(to_pci_dev(dev), 0),
+				  dw_edma_interrupt_common,
+				  IRQF_SHARED, dw->name, &dw->irq[0]);
+		if (err) {
+			dw->nr_irqs = 0;
+			return err;
+		}
+
+		get_cached_msi_msg(pci_irq_vector(to_pci_dev(dev), 0),
+				   &dw->irq[0].msi);
+	} else {
+		/* Distribute IRQs equally among all channels */
+		int tmp = dw->nr_irqs;
+
+		while (tmp && (*wr_alloc + *rd_alloc) < ch_cnt) {
+			dw_edma_dec_irq_alloc(&tmp, wr_alloc, dw->wr_ch_cnt);
+			dw_edma_dec_irq_alloc(&tmp, rd_alloc, dw->rd_ch_cnt);
+		}
+
+		dw_edma_add_irq_mask(&wr_mask, *wr_alloc, dw->wr_ch_cnt);
+		dw_edma_add_irq_mask(&rd_mask, *rd_alloc, dw->rd_ch_cnt);
+
+		for (i = 0; i < (*wr_alloc + *rd_alloc); i++) {
+			err = request_irq(pci_irq_vector(to_pci_dev(dev), i),
+					  i < *wr_alloc ?
+						dw_edma_interrupt_write :
+						dw_edma_interrupt_read,
+					  IRQF_SHARED, dw->name,
+					  &dw->irq[i]);
+			if (err) {
+				dw->nr_irqs = i;
+				return err;
+			}
+
+			get_cached_msi_msg(pci_irq_vector(to_pci_dev(dev), i),
+					   &dw->irq[i].msi);
+		}
+
+		dw->nr_irqs = i;
+	}
+
+	return err;
+}
+
+int dw_edma_probe(struct dw_edma_chip *chip)
+{
+	struct device *dev = chip->dev;
+	struct dw_edma *dw = chip->dw;
+	u32 wr_alloc = 0;
+	u32 rd_alloc = 0;
+	int i, err;
+
+	raw_spin_lock_init(&dw->lock);
+
+	/* Find out how many write channels are supported by hardware */
+	dw->wr_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE);
+	if (!dw->wr_ch_cnt)
+		return -EINVAL;
+
+	/* Find out how many read channels are supported by hardware */
+	dw->rd_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ);
+	if (!dw->rd_ch_cnt)
+		return -EINVAL;
+
+	dev_vdbg(dev, "Channels:\twrite=%d, read=%d\n",
+		 dw->wr_ch_cnt, dw->rd_ch_cnt);
+
+	/* Allocate channels */
+	dw->chan = devm_kcalloc(dev, dw->wr_ch_cnt + dw->rd_ch_cnt,
+				sizeof(*dw->chan), GFP_KERNEL);
+	if (!dw->chan)
+		return -ENOMEM;
+
+	snprintf(dw->name, sizeof(dw->name), "dw-edma-core:%d", chip->id);
+
+	/* Disable eDMA, only to establish the ideal initial conditions */
+	dw_edma_v0_core_off(dw);
+
+	/* Request IRQs */
+	err = dw_edma_irq_request(chip, &wr_alloc, &rd_alloc);
+	if (err)
+		return err;
+
+	/* Setup write channels */
+	err = dw_edma_channel_setup(chip, true, wr_alloc, rd_alloc);
+	if (err)
+		goto err_irq_free;
+
+	/* Setup read channels */
+	err = dw_edma_channel_setup(chip, false, wr_alloc, rd_alloc);
+	if (err)
+		goto err_irq_free;
+
+	/* Power management */
+	pm_runtime_enable(dev);
+
+	/* Turn debugfs on */
+	dw_edma_v0_core_debugfs_on(chip);
+
+	return 0;
+
+err_irq_free:
+	for (i = (dw->nr_irqs - 1); i >= 0; i--)
+		free_irq(pci_irq_vector(to_pci_dev(dev), i), &dw->irq[i]);
+
+	dw->nr_irqs = 0;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(dw_edma_probe);
+
+int dw_edma_remove(struct dw_edma_chip *chip)
+{
+	struct dw_edma_chan *chan, *_chan;
+	struct device *dev = chip->dev;
+	struct dw_edma *dw = chip->dw;
+	int i;
+
+	/* Disable eDMA */
+	dw_edma_v0_core_off(dw);
+
+	/* Free irqs */
+	for (i = (dw->nr_irqs - 1); i >= 0; i--)
+		free_irq(pci_irq_vector(to_pci_dev(dev), i), &dw->irq[i]);
+
+	/* Power management */
+	pm_runtime_disable(dev);
+
+	list_for_each_entry_safe(chan, _chan, &dw->wr_edma.channels,
+				 vc.chan.device_node) {
+		list_del(&chan->vc.chan.device_node);
+		tasklet_kill(&chan->vc.task);
+	}
+
+	list_for_each_entry_safe(chan, _chan, &dw->rd_edma.channels,
+				 vc.chan.device_node) {
+		list_del(&chan->vc.chan.device_node);
+		tasklet_kill(&chan->vc.task);
+	}
+
+	/* Deregister eDMA device */
+	dma_async_device_unregister(&dw->wr_edma);
+	dma_async_device_unregister(&dw->rd_edma);
+
+	/* Turn debugfs off */
+	dw_edma_v0_core_debugfs_off();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dw_edma_remove);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Synopsys DesignWare eDMA controller core driver");
+MODULE_AUTHOR("Gustavo Pimentel <gustavo.pimentel@synopsys.com>");
diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h
new file mode 100644
index 000000000000..b6cc90cbc9dc
--- /dev/null
+++ b/drivers/dma/dw-edma/dw-edma-core.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates.
+ * Synopsys DesignWare eDMA core driver
+ *
+ * Author: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
+ */
+
+#ifndef _DW_EDMA_CORE_H
+#define _DW_EDMA_CORE_H
+
+#include <linux/msi.h>
+#include <linux/dma/edma.h>
+
+#include "../virt-dma.h"
+
+#define EDMA_LL_SZ					24
+
+enum dw_edma_dir {
+	EDMA_DIR_WRITE = 0,
+	EDMA_DIR_READ
+};
+
+enum dw_edma_mode {
+	EDMA_MODE_LEGACY = 0,
+	EDMA_MODE_UNROLL
+};
+
+enum dw_edma_request {
+	EDMA_REQ_NONE = 0,
+	EDMA_REQ_STOP,
+	EDMA_REQ_PAUSE
+};
+
+enum dw_edma_status {
+	EDMA_ST_IDLE = 0,
+	EDMA_ST_PAUSE,
+	EDMA_ST_BUSY
+};
+
+struct dw_edma_chan;
+struct dw_edma_chunk;
+
+struct dw_edma_burst {
+	struct list_head		list;
+	u64				sar;
+	u64				dar;
+	u32				sz;
+};
+
+struct dw_edma_region {
+	phys_addr_t			paddr;
+	dma_addr_t			vaddr;
+	size_t				sz;
+};
+
+struct dw_edma_chunk {
+	struct list_head		list;
+	struct dw_edma_chan		*chan;
+	struct dw_edma_burst		*burst;
+
+	u32				bursts_alloc;
+
+	u8				cb;
+	struct dw_edma_region		ll_region;	/* Linked list */
+};
+
+struct dw_edma_desc {
+	struct virt_dma_desc		vd;
+	struct dw_edma_chan		*chan;
+	struct dw_edma_chunk		*chunk;
+
+	u32				chunks_alloc;
+
+	u32				alloc_sz;
+	u32				xfer_sz;
+};
+
+struct dw_edma_chan {
+	struct virt_dma_chan		vc;
+	struct dw_edma_chip		*chip;
+	int				id;
+	enum dw_edma_dir		dir;
+
+	off_t				ll_off;
+	u32				ll_max;
+
+	off_t				dt_off;
+
+	struct msi_msg			msi;
+
+	enum dw_edma_request		request;
+	enum dw_edma_status		status;
+	u8				configured;
+
+	struct dma_slave_config		config;
+};
+
+struct dw_edma_irq {
+	struct msi_msg                  msi;
+	u32				wr_mask;
+	u32				rd_mask;
+	struct dw_edma			*dw;
+};
+
+struct dw_edma {
+	char				name[20];
+
+	struct dma_device		wr_edma;
+	u16				wr_ch_cnt;
+
+	struct dma_device		rd_edma;
+	u16				rd_ch_cnt;
+
+	struct dw_edma_region		rg_region;	/* Registers */
+	struct dw_edma_region		ll_region;	/* Linked list */
+	struct dw_edma_region		dt_region;	/* Data */
+
+	struct dw_edma_irq		*irq;
+	int				nr_irqs;
+
+	u32				version;
+	enum dw_edma_mode		mode;
+
+	struct dw_edma_chan		*chan;
+	const struct dw_edma_core_ops	*ops;
+
+	raw_spinlock_t			lock;		/* Only for legacy */
+};
+
+struct dw_edma_sg {
+	struct scatterlist		*sgl;
+	unsigned int			len;
+};
+
+struct dw_edma_cyclic {
+	dma_addr_t			paddr;
+	size_t				len;
+	size_t				cnt;
+};
+
+struct dw_edma_transfer {
+	struct dma_chan			*dchan;
+	union dw_edma_xfer {
+		struct dw_edma_sg	sg;
+		struct dw_edma_cyclic	cyclic;
+	} xfer;
+	enum dma_transfer_direction	direction;
+	unsigned long			flags;
+	bool				cyclic;
+};
+
+static inline
+struct dw_edma_chan *vc2dw_edma_chan(struct virt_dma_chan *vc)
+{
+	return container_of(vc, struct dw_edma_chan, vc);
+}
+
+static inline
+struct dw_edma_chan *dchan2dw_edma_chan(struct dma_chan *dchan)
+{
+	return vc2dw_edma_chan(to_virt_chan(dchan));
+}
+
+#endif /* _DW_EDMA_CORE_H */
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
new file mode 100644
index 000000000000..cab6e18773da
--- /dev/null
+++ b/include/linux/dma/edma.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates.
+ * Synopsys DesignWare eDMA core driver
+ *
+ * Author: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
+ */
+
+#ifndef _DW_EDMA_H
+#define _DW_EDMA_H
+
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+
+struct dw_edma;
+
+/**
+ * struct dw_edma_chip - representation of DesignWare eDMA controller hardware
+ * @dev:		 struct device of the eDMA controller
+ * @id:			 instance ID
+ * @irq:		 irq line
+ * @dw:			 struct dw_edma that is filed by dw_edma_probe()
+ */
+struct dw_edma_chip {
+	struct device		*dev;
+	int			id;
+	int			irq;
+	struct dw_edma		*dw;
+};
+
+/* Export to the platform drivers */
+#if IS_ENABLED(CONFIG_DW_EDMA)
+int dw_edma_probe(struct dw_edma_chip *chip);
+int dw_edma_remove(struct dw_edma_chip *chip);
+#else
+static inline int dw_edma_probe(struct dw_edma_chip *chip)
+{
+	return -ENODEV;
+}
+
+static inline int dw_edma_remove(struct dw_edma_chip *chip)
+{
+	return 0;
+}
+#endif /* CONFIG_DW_EDMA */
+
+#endif /* _DW_EDMA_H */
-- 
cgit v1.2.3


From 1f418f46503d72594bbe6407d97fd2ae1ce15ee6 Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <Gustavo.Pimentel@synopsys.com>
Date: Tue, 4 Jun 2019 15:29:25 +0200
Subject: PCI: Add Synopsys endpoint EDDA Device ID

Create and add Synopsys Endpoint EDDA Device ID to PCI ID list, since
this ID is now being use on two different drivers (pci_endpoint_test.ko
and dw-edma-pcie.ko).

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Kishon Vijay Abraham I <kishon@ti.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Joao Pinto <jpinto@synopsys.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/misc/pci_endpoint_test.c | 2 +-
 include/linux/pci_ids.h          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index 7b015f2a1c6f..1f531c1b4f74 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -804,7 +804,7 @@ static const struct pci_device_id pci_endpoint_test_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA74x) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA72x) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, 0x81c0) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_SYNOPSYS, 0xedda) },
+	{ PCI_DEVICE_DATA(SYNOPSYS, EDDA, NULL) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_AM654),
 	  .driver_data = (kernel_ulong_t)&am654_data
 	},
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 70e86148cb1e..4aad69fc4d6b 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2366,6 +2366,7 @@
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB3		0xabcd
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB3_AXI	0xabce
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB31	0xabcf
+#define PCI_DEVICE_ID_SYNOPSYS_EDDA	0xedda
 
 #define PCI_VENDOR_ID_USR		0x16ec
 
-- 
cgit v1.2.3


From 4e23be473e3063a9d3bc06bb0aee89885fffab0e Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 10 Jun 2019 04:48:05 -0700
Subject: bus: ti-sysc: Add support for module specific reset quirks

Some older interconnect target modules need module internal clock
toggling quirks to reset properly. We've been doing this in the
platform code earlier, but need to be able to it directly in the
ti-sysc driver when we no longer rely on on the platform code.

Let's add reset handling for 1-wire, i2c and watchdog. Later on
we can add more modules like msdi and dss as they get tested.
For dra7 pcie, we should be able to just use the rstctrl reset
driver when available.

Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 drivers/bus/ti-sysc.c                 | 129 ++++++++++++++++++++++++++++++++--
 include/linux/platform_data/ti-sysc.h |   3 +
 2 files changed, 127 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c
index a366ae548ec9..e6deabd8305d 100644
--- a/drivers/bus/ti-sysc.c
+++ b/drivers/bus/ti-sysc.c
@@ -71,6 +71,9 @@ static const char * const clock_names[SYSC_MAX_CLOCKS] = {
  * @name: name if available
  * @revision: interconnect target module revision
  * @needs_resume: runtime resume needed on resume from suspend
+ * @clk_enable_quirk: module specific clock enable quirk
+ * @clk_disable_quirk: module specific clock disable quirk
+ * @reset_done_quirk: module specific reset done quirk
  */
 struct sysc {
 	struct device *dev;
@@ -94,6 +97,9 @@ struct sysc {
 	unsigned int child_needs_resume:1;
 	unsigned int disable_on_idle:1;
 	struct delayed_work idle_work;
+	void (*clk_enable_quirk)(struct sysc *sysc);
+	void (*clk_disable_quirk)(struct sysc *sysc);
+	void (*reset_done_quirk)(struct sysc *sysc);
 };
 
 static void sysc_parse_dts_quirks(struct sysc *ddata, struct device_node *np,
@@ -760,8 +766,11 @@ static int sysc_ioremap(struct sysc *ddata)
 			    ddata->offsets[SYSC_SYSCONFIG],
 			    ddata->offsets[SYSC_SYSSTATUS]);
 
+		if (size < SZ_1K)
+			size = SZ_1K;
+
 		if ((size + sizeof(u32)) > ddata->module_size)
-			return -EINVAL;
+			size = ddata->module_size;
 	}
 
 	ddata->module_va = devm_ioremap(ddata->dev,
@@ -1234,6 +1243,22 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = {
 		   SYSC_QUIRK_EXT_OPT_CLOCK | SYSC_QUIRK_NO_RESET_ON_INIT |
 		   SYSC_QUIRK_SWSUP_SIDLE),
 
+	/* Quirks that need to be set based on detected module */
+	SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x00000006, 0xffffffff,
+		   SYSC_MODULE_QUIRK_HDQ1W),
+	SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x0000000a, 0xffffffff,
+		   SYSC_MODULE_QUIRK_HDQ1W),
+	SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x00000036, 0x000000ff,
+		   SYSC_MODULE_QUIRK_I2C),
+	SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x0000003c, 0x000000ff,
+		   SYSC_MODULE_QUIRK_I2C),
+	SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x00000040, 0x000000ff,
+		   SYSC_MODULE_QUIRK_I2C),
+	SYSC_QUIRK("i2c", 0, 0, 0x10, 0x90, 0x5040000a, 0xfffff0f0,
+		   SYSC_MODULE_QUIRK_I2C),
+	SYSC_QUIRK("wdt", 0, 0, 0x10, 0x14, 0x502a0500, 0xfffff0f0,
+		   SYSC_MODULE_QUIRK_WDT),
+
 #ifdef DEBUG
 	SYSC_QUIRK("adc", 0, 0, 0x10, -1, 0x47300001, 0xffffffff, 0),
 	SYSC_QUIRK("atl", 0, 0, -1, -1, 0x0a070100, 0xffffffff, 0),
@@ -1247,11 +1272,8 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = {
 	SYSC_QUIRK("dwc3", 0, 0, 0x10, -1, 0x500a0200, 0xffffffff, 0),
 	SYSC_QUIRK("epwmss", 0, 0, 0x4, -1, 0x47400001, 0xffffffff, 0),
 	SYSC_QUIRK("gpu", 0, 0x1fc00, 0x1fc10, -1, 0, 0, 0),
-	SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x00000006, 0xffffffff, 0),
-	SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x0000000a, 0xffffffff, 0),
 	SYSC_QUIRK("hsi", 0, 0, 0x10, 0x14, 0x50043101, 0xffffffff, 0),
 	SYSC_QUIRK("iss", 0, 0, 0x10, -1, 0x40000101, 0xffffffff, 0),
-	SYSC_QUIRK("i2c", 0, 0, 0x10, 0x90, 0x5040000a, 0xfffff0f0, 0),
 	SYSC_QUIRK("lcdc", 0, 0, 0x54, -1, 0x4f201000, 0xffffffff, 0),
 	SYSC_QUIRK("mcasp", 0, 0, 0x4, -1, 0x44306302, 0xffffffff, 0),
 	SYSC_QUIRK("mcasp", 0, 0, 0x4, -1, 0x44307b02, 0xffffffff, 0),
@@ -1287,7 +1309,6 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = {
 	SYSC_QUIRK("usb_host_hs", 0, 0, 0x10, -1, 0x50700101, 0xffffffff, 0),
 	SYSC_QUIRK("usb_otg_hs", 0, 0x400, 0x404, 0x408, 0x00000050,
 		   0xffffffff, 0),
-	SYSC_QUIRK("wdt", 0, 0, 0x10, 0x14, 0x502a0500, 0xfffff0f0, 0),
 	SYSC_QUIRK("vfpe", 0, 0, 0x104, -1, 0x4d001200, 0xffffffff, 0),
 #endif
 };
@@ -1360,6 +1381,94 @@ static void sysc_init_revision_quirks(struct sysc *ddata)
 	}
 }
 
+/* 1-wire needs module's internal clocks enabled for reset */
+static void sysc_clk_enable_quirk_hdq1w(struct sysc *ddata)
+{
+	int offset = 0x0c;	/* HDQ_CTRL_STATUS */
+	u16 val;
+
+	val = sysc_read(ddata, offset);
+	val |= BIT(5);
+	sysc_write(ddata, offset, val);
+}
+
+/* I2C needs extra enable bit toggling for reset */
+static void sysc_clk_quirk_i2c(struct sysc *ddata, bool enable)
+{
+	int offset;
+	u16 val;
+
+	/* I2C_CON, omap2/3 is different from omap4 and later */
+	if ((ddata->revision & 0xffffff00) == 0x001f0000)
+		offset = 0x24;
+	else
+		offset = 0xa4;
+
+	/* I2C_EN */
+	val = sysc_read(ddata, offset);
+	if (enable)
+		val |= BIT(15);
+	else
+		val &= ~BIT(15);
+	sysc_write(ddata, offset, val);
+}
+
+static void sysc_clk_enable_quirk_i2c(struct sysc *ddata)
+{
+	sysc_clk_quirk_i2c(ddata, true);
+}
+
+static void sysc_clk_disable_quirk_i2c(struct sysc *ddata)
+{
+	sysc_clk_quirk_i2c(ddata, false);
+}
+
+/* Watchdog timer needs a disable sequence after reset */
+static void sysc_reset_done_quirk_wdt(struct sysc *ddata)
+{
+	int wps, spr, error;
+	u32 val;
+
+	wps = 0x34;
+	spr = 0x48;
+
+	sysc_write(ddata, spr, 0xaaaa);
+	error = readl_poll_timeout(ddata->module_va + wps, val,
+				   !(val & 0x10), 100,
+				   MAX_MODULE_SOFTRESET_WAIT);
+	if (error)
+		dev_warn(ddata->dev, "wdt disable spr failed\n");
+
+	sysc_write(ddata, wps, 0x5555);
+	error = readl_poll_timeout(ddata->module_va + wps, val,
+				   !(val & 0x10), 100,
+				   MAX_MODULE_SOFTRESET_WAIT);
+	if (error)
+		dev_warn(ddata->dev, "wdt disable wps failed\n");
+}
+
+static void sysc_init_module_quirks(struct sysc *ddata)
+{
+	if (ddata->legacy_mode || !ddata->name)
+		return;
+
+	if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_HDQ1W) {
+		ddata->clk_enable_quirk = sysc_clk_enable_quirk_hdq1w;
+
+		return;
+	}
+
+	if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_I2C) {
+		ddata->clk_enable_quirk = sysc_clk_enable_quirk_i2c;
+		ddata->clk_disable_quirk = sysc_clk_disable_quirk_i2c;
+
+		return;
+	}
+
+	if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_WDT)
+		ddata->reset_done_quirk = sysc_reset_done_quirk_wdt;
+}
+
 static int sysc_clockdomain_init(struct sysc *ddata)
 {
 	struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev);
@@ -1468,10 +1577,16 @@ static int sysc_reset(struct sysc *ddata)
 	else
 		syss_done = ddata->cfg.syss_mask;
 
+	if (ddata->clk_disable_quirk)
+		ddata->clk_disable_quirk(ddata);
+
 	sysc_val = sysc_read_sysconfig(ddata);
 	sysc_val |= sysc_mask;
 	sysc_write(ddata, sysc_offset, sysc_val);
 
+	if (ddata->clk_enable_quirk)
+		ddata->clk_enable_quirk(ddata);
+
 	/* Poll on reset status */
 	if (syss_offset >= 0) {
 		error = readx_poll_timeout(sysc_read_sysstatus, ddata, rstval,
@@ -1485,6 +1600,9 @@ static int sysc_reset(struct sysc *ddata)
 					   100, MAX_MODULE_SOFTRESET_WAIT);
 	}
 
+	if (ddata->reset_done_quirk)
+		ddata->reset_done_quirk(ddata);
+
 	return error;
 }
 
@@ -1531,6 +1649,7 @@ static int sysc_init_module(struct sysc *ddata)
 
 	ddata->revision = sysc_read_revision(ddata);
 	sysc_init_revision_quirks(ddata);
+	sysc_init_module_quirks(ddata);
 
 	if (ddata->legacy_mode) {
 		error = sysc_legacy_init(ddata);
diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h
index 8822e99ff813..0c587d4fc718 100644
--- a/include/linux/platform_data/ti-sysc.h
+++ b/include/linux/platform_data/ti-sysc.h
@@ -47,6 +47,9 @@ struct sysc_regbits {
 	s8 emufree_shift;
 };
 
+#define SYSC_MODULE_QUIRK_HDQ1W		BIT(17)
+#define SYSC_MODULE_QUIRK_I2C		BIT(16)
+#define SYSC_MODULE_QUIRK_WDT		BIT(15)
 #define SYSS_QUIRK_RESETDONE_INVERTED	BIT(14)
 #define SYSC_QUIRK_SWSUP_MSTANDBY	BIT(13)
 #define SYSC_QUIRK_SWSUP_SIDLE_ACT	BIT(12)
-- 
cgit v1.2.3


From e36acfe6c86d13eec62321e1e86a1ce287e52e7d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 23 May 2019 09:41:19 -0300
Subject: mm/hmm: Use hmm_mirror not mm as an argument for hmm_range_register

Ralph observes that hmm_range_register() can only be called by a driver
while a mirror is registered. Make this clear in the API by passing in the
mirror structure as a parameter.

This also simplifies understanding the lifetime model for struct hmm, as
the hmm pointer must be valid as part of a registered mirror so all we
need in hmm_register_range() is a simple kref_get.

Suggested-by: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/nouveau/nouveau_svm.c |  2 +-
 include/linux/hmm.h                   |  7 ++++---
 mm/hmm.c                              | 13 ++++---------
 3 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 93ed43c413f0..8c92374afcf2 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
 		range.values = nouveau_svm_pfn_values;
 		range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
 again:
-		ret = hmm_vma_fault(&range, true);
+		ret = hmm_vma_fault(&svmm->mirror, &range, true);
 		if (ret == 0) {
 			mutex_lock(&svmm->mutex);
 			if (!hmm_vma_range_done(&range)) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index cb01cf1fa3c0..1fba6979adf4 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -496,7 +496,7 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
  * Please see Documentation/vm/hmm.rst for how to use the range API.
  */
 int hmm_range_register(struct hmm_range *range,
-		       struct mm_struct *mm,
+		       struct hmm_mirror *mirror,
 		       unsigned long start,
 		       unsigned long end,
 		       unsigned page_shift);
@@ -532,7 +532,8 @@ static inline bool hmm_vma_range_done(struct hmm_range *range)
 }
 
 /* This is a temporary helper to avoid merge conflict between trees. */
-static inline int hmm_vma_fault(struct hmm_range *range, bool block)
+static inline int hmm_vma_fault(struct hmm_mirror *mirror,
+				struct hmm_range *range, bool block)
 {
 	long ret;
 
@@ -545,7 +546,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block)
 	range->default_flags = 0;
 	range->pfn_flags_mask = -1UL;
 
-	ret = hmm_range_register(range, range->vma->vm_mm,
+	ret = hmm_range_register(range, mirror,
 				 range->start, range->end,
 				 PAGE_SHIFT);
 	if (ret)
diff --git a/mm/hmm.c b/mm/hmm.c
index f6956d78e3cb..22a97ada108b 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -914,13 +914,13 @@ static void hmm_pfns_clear(struct hmm_range *range,
  * Track updates to the CPU page table see include/linux/hmm.h
  */
 int hmm_range_register(struct hmm_range *range,
-		       struct mm_struct *mm,
+		       struct hmm_mirror *mirror,
 		       unsigned long start,
 		       unsigned long end,
 		       unsigned page_shift)
 {
 	unsigned long mask = ((1UL << page_shift) - 1UL);
-	struct hmm *hmm;
+	struct hmm *hmm = mirror->hmm;
 
 	range->valid = false;
 	range->hmm = NULL;
@@ -934,20 +934,15 @@ int hmm_range_register(struct hmm_range *range,
 	range->start = start;
 	range->end = end;
 
-	hmm = hmm_get_or_create(mm);
-	if (!hmm)
-		return -EFAULT;
-
 	/* Check if hmm_mm_destroy() was call. */
-	if (hmm->mm == NULL || hmm->dead) {
-		hmm_put(hmm);
+	if (hmm->mm == NULL || hmm->dead)
 		return -EFAULT;
-	}
 
 	/* Initialize range to track CPU page table updates. */
 	mutex_lock(&hmm->lock);
 
 	range->hmm = hmm;
+	kref_get(&hmm->kref);
 	list_add_rcu(&range->list, &hmm->ranges);
 
 	/*
-- 
cgit v1.2.3


From c8a53b2db0aec40d8b217936e1b7f3d840c50390 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 23 May 2019 10:36:46 -0300
Subject: mm/hmm: Hold a mmgrab from hmm to mm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So long as a struct hmm pointer exists, so should the struct mm it is
linked too. Hold the mmgrab() as soon as a hmm is created, and mmdrop() it
once the hmm refcount goes to zero.

Since mmdrop() (ie a 0 kref on struct mm) is now impossible with a !NULL
mm->hmm delete the hmm_hmm_destroy().

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 include/linux/hmm.h |  3 ---
 kernel/fork.c       |  1 -
 mm/hmm.c            | 22 ++++------------------
 3 files changed, 4 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 1fba6979adf4..1d97b6d62c5b 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -577,14 +577,11 @@ static inline int hmm_vma_fault(struct hmm_mirror *mirror,
 }
 
 /* Below are for HMM internal use only! Not to be used by device driver! */
-void hmm_mm_destroy(struct mm_struct *mm);
-
 static inline void hmm_mm_init(struct mm_struct *mm)
 {
 	mm->hmm = NULL;
 }
 #else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-static inline void hmm_mm_destroy(struct mm_struct *mm) {}
 static inline void hmm_mm_init(struct mm_struct *mm) {}
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 75675b9bf6df..c704c3cedee7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -673,7 +673,6 @@ void __mmdrop(struct mm_struct *mm)
 	WARN_ON_ONCE(mm == current->active_mm);
 	mm_free_pgd(mm);
 	destroy_context(mm);
-	hmm_mm_destroy(mm);
 	mmu_notifier_mm_destroy(mm);
 	check_mm(mm);
 	put_user_ns(mm->user_ns);
diff --git a/mm/hmm.c b/mm/hmm.c
index 22a97ada108b..080b17a2e87e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -20,6 +20,7 @@
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
 #include <linux/memremap.h>
+#include <linux/sched/mm.h>
 #include <linux/jump_label.h>
 #include <linux/dma-mapping.h>
 #include <linux/mmu_notifier.h>
@@ -73,6 +74,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
 	hmm->notifiers = 0;
 	hmm->dead = false;
 	hmm->mm = mm;
+	mmgrab(hmm->mm);
 
 	spin_lock(&mm->page_table_lock);
 	if (!mm->hmm)
@@ -100,6 +102,7 @@ error_mm:
 		mm->hmm = NULL;
 	spin_unlock(&mm->page_table_lock);
 error:
+	mmdrop(hmm->mm);
 	kfree(hmm);
 	return NULL;
 }
@@ -121,6 +124,7 @@ static void hmm_free(struct kref *kref)
 		mm->hmm = NULL;
 	spin_unlock(&mm->page_table_lock);
 
+	mmdrop(hmm->mm);
 	mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
 }
 
@@ -129,24 +133,6 @@ static inline void hmm_put(struct hmm *hmm)
 	kref_put(&hmm->kref, hmm_free);
 }
 
-void hmm_mm_destroy(struct mm_struct *mm)
-{
-	struct hmm *hmm;
-
-	spin_lock(&mm->page_table_lock);
-	hmm = mm_get_hmm(mm);
-	mm->hmm = NULL;
-	if (hmm) {
-		hmm->mm = NULL;
-		hmm->dead = true;
-		spin_unlock(&mm->page_table_lock);
-		hmm_put(hmm);
-		return;
-	}
-
-	spin_unlock(&mm->page_table_lock);
-}
-
 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
 	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
-- 
cgit v1.2.3


From a1a8e4a85cf7daff8b26c7b8698442ef677b4f97 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Mon, 10 Jun 2019 15:02:01 -0300
Subject: rdma: Delete the ib_ucm module

This has been marked CONFIG_BROKEN for over a year now with no complaints.
Delete the whole thing for good.

The module provided the /dev/infiniband/ucmX interface.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/Kconfig       |   11 -
 drivers/infiniband/core/Makefile |    3 -
 drivers/infiniband/core/ucm.c    | 1350 --------------------------------------
 include/uapi/rdma/ib_user_cm.h   |  326 ---------
 4 files changed, 1690 deletions(-)
 delete mode 100644 drivers/infiniband/core/ucm.c
 delete mode 100644 include/uapi/rdma/ib_user_cm.h

(limited to 'include')

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index cbfbea49f126..cbaafa4e0302 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -35,17 +35,6 @@ config INFINIBAND_USER_ACCESS
 	  libibverbs, libibcm and a hardware driver library from
 	  rdma-core <https://github.com/linux-rdma/rdma-core>.
 
-config INFINIBAND_USER_ACCESS_UCM
-	tristate "Userspace CM (UCM, DEPRECATED)"
-	depends on BROKEN || COMPILE_TEST
-	depends on INFINIBAND_USER_ACCESS
-	help
-	  The UCM module has known security flaws, which no one is
-	  interested to fix. The user-space part of this code was
-	  dropped from the upstream a long time ago.
-
-	  This option is DEPRECATED and planned to be removed.
-
 config INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI
 	bool "Allow experimental legacy verbs in new ioctl uAPI  (EXPERIMENTAL)"
 	depends on INFINIBAND_USER_ACCESS
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 313f2349b518..42f1b2a4f746 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -6,7 +6,6 @@ obj-$(CONFIG_INFINIBAND) +=		ib_core.o ib_cm.o iw_cm.o \
 					$(infiniband-y)
 obj-$(CONFIG_INFINIBAND_USER_MAD) +=	ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y)
-obj-$(CONFIG_INFINIBAND_USER_ACCESS_UCM) += ib_ucm.o $(user_access-y)
 
 ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
@@ -29,8 +28,6 @@ rdma_ucm-y :=			ucma.o
 
 ib_umad-y :=			user_mad.o
 
-ib_ucm-y :=			ucm.o
-
 ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
 				rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
 				uverbs_std_types_cq.o \
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
deleted file mode 100644
index 8e7da2d41fd8..000000000000
--- a/drivers/infiniband/core/ucm.c
+++ /dev/null
@@ -1,1350 +0,0 @@
-/*
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *	copyright notice, this list of conditions and the following
- *	disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *	copyright notice, this list of conditions and the following
- *	disclaimer in the documentation and/or other materials
- *	provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/completion.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/poll.h>
-#include <linux/sched.h>
-#include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/cdev.h>
-#include <linux/xarray.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-
-#include <linux/nospec.h>
-
-#include <linux/uaccess.h>
-
-#include <rdma/ib.h>
-#include <rdma/ib_cm.h>
-#include <rdma/ib_user_cm.h>
-#include <rdma/ib_marshall.h>
-
-#include "core_priv.h"
-
-MODULE_AUTHOR("Libor Michalek");
-MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
-MODULE_LICENSE("Dual BSD/GPL");
-
-struct ib_ucm_device {
-	int			devnum;
-	struct cdev		cdev;
-	struct device		dev;
-	struct ib_device	*ib_dev;
-};
-
-struct ib_ucm_file {
-	struct mutex file_mutex;
-	struct file *filp;
-	struct ib_ucm_device *device;
-
-	struct list_head  ctxs;
-	struct list_head  events;
-	wait_queue_head_t poll_wait;
-};
-
-struct ib_ucm_context {
-	int                 id;
-	struct completion   comp;
-	atomic_t            ref;
-	int		    events_reported;
-
-	struct ib_ucm_file *file;
-	struct ib_cm_id    *cm_id;
-	__u64		   uid;
-
-	struct list_head    events;    /* list of pending events. */
-	struct list_head    file_list; /* member in file ctx list */
-};
-
-struct ib_ucm_event {
-	struct ib_ucm_context *ctx;
-	struct list_head file_list; /* member in file event list */
-	struct list_head ctx_list;  /* member in ctx event list */
-
-	struct ib_cm_id *cm_id;
-	struct ib_ucm_event_resp resp;
-	void *data;
-	void *info;
-	int data_len;
-	int info_len;
-};
-
-enum {
-	IB_UCM_MAJOR = 231,
-	IB_UCM_BASE_MINOR = 224,
-	IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS,
-	IB_UCM_NUM_FIXED_MINOR = 32,
-	IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR,
-};
-
-#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
-static dev_t dynamic_ucm_dev;
-
-static void ib_ucm_add_one(struct ib_device *device);
-static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
-
-static struct ib_client ucm_client = {
-	.name   = "ucm",
-	.add    = ib_ucm_add_one,
-	.remove = ib_ucm_remove_one
-};
-
-static DEFINE_XARRAY_ALLOC(ctx_id_table);
-static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
-
-static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
-{
-	struct ib_ucm_context *ctx;
-
-	xa_lock(&ctx_id_table);
-	ctx = xa_load(&ctx_id_table, id);
-	if (!ctx)
-		ctx = ERR_PTR(-ENOENT);
-	else if (ctx->file != file)
-		ctx = ERR_PTR(-EINVAL);
-	else
-		atomic_inc(&ctx->ref);
-	xa_unlock(&ctx_id_table);
-
-	return ctx;
-}
-
-static void ib_ucm_ctx_put(struct ib_ucm_context *ctx)
-{
-	if (atomic_dec_and_test(&ctx->ref))
-		complete(&ctx->comp);
-}
-
-static inline int ib_ucm_new_cm_id(int event)
-{
-	return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED;
-}
-
-static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
-{
-	struct ib_ucm_event *uevent;
-
-	mutex_lock(&ctx->file->file_mutex);
-	list_del(&ctx->file_list);
-	while (!list_empty(&ctx->events)) {
-
-		uevent = list_entry(ctx->events.next,
-				    struct ib_ucm_event, ctx_list);
-		list_del(&uevent->file_list);
-		list_del(&uevent->ctx_list);
-		mutex_unlock(&ctx->file->file_mutex);
-
-		/* clear incoming connections. */
-		if (ib_ucm_new_cm_id(uevent->resp.event))
-			ib_destroy_cm_id(uevent->cm_id);
-
-		kfree(uevent);
-		mutex_lock(&ctx->file->file_mutex);
-	}
-	mutex_unlock(&ctx->file->file_mutex);
-}
-
-static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
-{
-	struct ib_ucm_context *ctx;
-
-	ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
-	if (!ctx)
-		return NULL;
-
-	atomic_set(&ctx->ref, 1);
-	init_completion(&ctx->comp);
-	ctx->file = file;
-	INIT_LIST_HEAD(&ctx->events);
-
-	if (xa_alloc(&ctx_id_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL))
-		goto error;
-
-	list_add_tail(&ctx->file_list, &file->ctxs);
-	return ctx;
-
-error:
-	kfree(ctx);
-	return NULL;
-}
-
-static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq,
-				 const struct ib_cm_req_event_param *kreq)
-{
-	ureq->remote_ca_guid             = kreq->remote_ca_guid;
-	ureq->remote_qkey                = kreq->remote_qkey;
-	ureq->remote_qpn                 = kreq->remote_qpn;
-	ureq->qp_type                    = kreq->qp_type;
-	ureq->starting_psn               = kreq->starting_psn;
-	ureq->responder_resources        = kreq->responder_resources;
-	ureq->initiator_depth            = kreq->initiator_depth;
-	ureq->local_cm_response_timeout  = kreq->local_cm_response_timeout;
-	ureq->flow_control               = kreq->flow_control;
-	ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout;
-	ureq->retry_count                = kreq->retry_count;
-	ureq->rnr_retry_count            = kreq->rnr_retry_count;
-	ureq->srq                        = kreq->srq;
-	ureq->port			 = kreq->port;
-
-	ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path);
-	if (kreq->alternate_path)
-		ib_copy_path_rec_to_user(&ureq->alternate_path,
-					 kreq->alternate_path);
-}
-
-static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep,
-				 const struct ib_cm_rep_event_param *krep)
-{
-	urep->remote_ca_guid      = krep->remote_ca_guid;
-	urep->remote_qkey         = krep->remote_qkey;
-	urep->remote_qpn          = krep->remote_qpn;
-	urep->starting_psn        = krep->starting_psn;
-	urep->responder_resources = krep->responder_resources;
-	urep->initiator_depth     = krep->initiator_depth;
-	urep->target_ack_delay    = krep->target_ack_delay;
-	urep->failover_accepted   = krep->failover_accepted;
-	urep->flow_control        = krep->flow_control;
-	urep->rnr_retry_count     = krep->rnr_retry_count;
-	urep->srq                 = krep->srq;
-}
-
-static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep,
-				      const struct ib_cm_sidr_rep_event_param *krep)
-{
-	urep->status = krep->status;
-	urep->qkey   = krep->qkey;
-	urep->qpn    = krep->qpn;
-};
-
-static int ib_ucm_event_process(const struct ib_cm_event *evt,
-				struct ib_ucm_event *uvt)
-{
-	void *info = NULL;
-
-	switch (evt->event) {
-	case IB_CM_REQ_RECEIVED:
-		ib_ucm_event_req_get(&uvt->resp.u.req_resp,
-				     &evt->param.req_rcvd);
-		uvt->data_len      = IB_CM_REQ_PRIVATE_DATA_SIZE;
-		uvt->resp.present  = IB_UCM_PRES_PRIMARY;
-		uvt->resp.present |= (evt->param.req_rcvd.alternate_path ?
-				      IB_UCM_PRES_ALTERNATE : 0);
-		break;
-	case IB_CM_REP_RECEIVED:
-		ib_ucm_event_rep_get(&uvt->resp.u.rep_resp,
-				     &evt->param.rep_rcvd);
-		uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
-		break;
-	case IB_CM_RTU_RECEIVED:
-		uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE;
-		uvt->resp.u.send_status = evt->param.send_status;
-		break;
-	case IB_CM_DREQ_RECEIVED:
-		uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE;
-		uvt->resp.u.send_status = evt->param.send_status;
-		break;
-	case IB_CM_DREP_RECEIVED:
-		uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE;
-		uvt->resp.u.send_status = evt->param.send_status;
-		break;
-	case IB_CM_MRA_RECEIVED:
-		uvt->resp.u.mra_resp.timeout =
-					evt->param.mra_rcvd.service_timeout;
-		uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE;
-		break;
-	case IB_CM_REJ_RECEIVED:
-		uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason;
-		uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
-		uvt->info_len = evt->param.rej_rcvd.ari_length;
-		info	      = evt->param.rej_rcvd.ari;
-		break;
-	case IB_CM_LAP_RECEIVED:
-		ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path,
-					 evt->param.lap_rcvd.alternate_path);
-		uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE;
-		uvt->resp.present = IB_UCM_PRES_ALTERNATE;
-		break;
-	case IB_CM_APR_RECEIVED:
-		uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status;
-		uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE;
-		uvt->info_len = evt->param.apr_rcvd.info_len;
-		info	      = evt->param.apr_rcvd.apr_info;
-		break;
-	case IB_CM_SIDR_REQ_RECEIVED:
-		uvt->resp.u.sidr_req_resp.pkey =
-					evt->param.sidr_req_rcvd.pkey;
-		uvt->resp.u.sidr_req_resp.port =
-					evt->param.sidr_req_rcvd.port;
-		uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
-		break;
-	case IB_CM_SIDR_REP_RECEIVED:
-		ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp,
-					  &evt->param.sidr_rep_rcvd);
-		uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
-		uvt->info_len = evt->param.sidr_rep_rcvd.info_len;
-		info	      = evt->param.sidr_rep_rcvd.info;
-		break;
-	default:
-		uvt->resp.u.send_status = evt->param.send_status;
-		break;
-	}
-
-	if (uvt->data_len) {
-		uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL);
-		if (!uvt->data)
-			goto err1;
-
-		uvt->resp.present |= IB_UCM_PRES_DATA;
-	}
-
-	if (uvt->info_len) {
-		uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL);
-		if (!uvt->info)
-			goto err2;
-
-		uvt->resp.present |= IB_UCM_PRES_INFO;
-	}
-	return 0;
-
-err2:
-	kfree(uvt->data);
-err1:
-	return -ENOMEM;
-}
-
-static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
-				const struct ib_cm_event *event)
-{
-	struct ib_ucm_event *uevent;
-	struct ib_ucm_context *ctx;
-	int result = 0;
-
-	ctx = cm_id->context;
-
-	uevent = kzalloc(sizeof *uevent, GFP_KERNEL);
-	if (!uevent)
-		goto err1;
-
-	uevent->ctx = ctx;
-	uevent->cm_id = cm_id;
-	uevent->resp.uid = ctx->uid;
-	uevent->resp.id = ctx->id;
-	uevent->resp.event = event->event;
-
-	result = ib_ucm_event_process(event, uevent);
-	if (result)
-		goto err2;
-
-	mutex_lock(&ctx->file->file_mutex);
-	list_add_tail(&uevent->file_list, &ctx->file->events);
-	list_add_tail(&uevent->ctx_list, &ctx->events);
-	wake_up_interruptible(&ctx->file->poll_wait);
-	mutex_unlock(&ctx->file->file_mutex);
-	return 0;
-
-err2:
-	kfree(uevent);
-err1:
-	/* Destroy new cm_id's */
-	return ib_ucm_new_cm_id(event->event);
-}
-
-static ssize_t ib_ucm_event(struct ib_ucm_file *file,
-			    const char __user *inbuf,
-			    int in_len, int out_len)
-{
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_event_get cmd;
-	struct ib_ucm_event *uevent;
-	int result = 0;
-
-	if (out_len < sizeof(struct ib_ucm_event_resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	mutex_lock(&file->file_mutex);
-	while (list_empty(&file->events)) {
-		mutex_unlock(&file->file_mutex);
-
-		if (file->filp->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-
-		if (wait_event_interruptible(file->poll_wait,
-					     !list_empty(&file->events)))
-			return -ERESTARTSYS;
-
-		mutex_lock(&file->file_mutex);
-	}
-
-	uevent = list_entry(file->events.next, struct ib_ucm_event, file_list);
-
-	if (ib_ucm_new_cm_id(uevent->resp.event)) {
-		ctx = ib_ucm_ctx_alloc(file);
-		if (!ctx) {
-			result = -ENOMEM;
-			goto done;
-		}
-
-		ctx->cm_id = uevent->cm_id;
-		ctx->cm_id->context = ctx;
-		uevent->resp.id = ctx->id;
-	}
-
-	if (copy_to_user(u64_to_user_ptr(cmd.response),
-			 &uevent->resp, sizeof(uevent->resp))) {
-		result = -EFAULT;
-		goto done;
-	}
-
-	if (uevent->data) {
-		if (cmd.data_len < uevent->data_len) {
-			result = -ENOMEM;
-			goto done;
-		}
-		if (copy_to_user(u64_to_user_ptr(cmd.data),
-				 uevent->data, uevent->data_len)) {
-			result = -EFAULT;
-			goto done;
-		}
-	}
-
-	if (uevent->info) {
-		if (cmd.info_len < uevent->info_len) {
-			result = -ENOMEM;
-			goto done;
-		}
-		if (copy_to_user(u64_to_user_ptr(cmd.info),
-				 uevent->info, uevent->info_len)) {
-			result = -EFAULT;
-			goto done;
-		}
-	}
-
-	list_del(&uevent->file_list);
-	list_del(&uevent->ctx_list);
-	uevent->ctx->events_reported++;
-
-	kfree(uevent->data);
-	kfree(uevent->info);
-	kfree(uevent);
-done:
-	mutex_unlock(&file->file_mutex);
-	return result;
-}
-
-static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
-				const char __user *inbuf,
-				int in_len, int out_len)
-{
-	struct ib_ucm_create_id cmd;
-	struct ib_ucm_create_id_resp resp;
-	struct ib_ucm_context *ctx;
-	int result;
-
-	if (out_len < sizeof(resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	mutex_lock(&file->file_mutex);
-	ctx = ib_ucm_ctx_alloc(file);
-	mutex_unlock(&file->file_mutex);
-	if (!ctx)
-		return -ENOMEM;
-
-	ctx->uid = cmd.uid;
-	ctx->cm_id = ib_create_cm_id(file->device->ib_dev,
-				     ib_ucm_event_handler, ctx);
-	if (IS_ERR(ctx->cm_id)) {
-		result = PTR_ERR(ctx->cm_id);
-		goto err1;
-	}
-
-	resp.id = ctx->id;
-	if (copy_to_user(u64_to_user_ptr(cmd.response),
-			 &resp, sizeof(resp))) {
-		result = -EFAULT;
-		goto err2;
-	}
-	return 0;
-
-err2:
-	ib_destroy_cm_id(ctx->cm_id);
-err1:
-	xa_erase(&ctx_id_table, ctx->id);
-	kfree(ctx);
-	return result;
-}
-
-static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
-				 const char __user *inbuf,
-				 int in_len, int out_len)
-{
-	struct ib_ucm_destroy_id cmd;
-	struct ib_ucm_destroy_id_resp resp;
-	struct ib_ucm_context *ctx;
-	int result = 0;
-
-	if (out_len < sizeof(resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	xa_lock(&ctx_id_table);
-	ctx = xa_load(&ctx_id_table, cmd.id);
-	if (!ctx)
-		ctx = ERR_PTR(-ENOENT);
-	else if (ctx->file != file)
-		ctx = ERR_PTR(-EINVAL);
-	else
-		__xa_erase(&ctx_id_table, ctx->id);
-	xa_unlock(&ctx_id_table);
-
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	ib_ucm_ctx_put(ctx);
-	wait_for_completion(&ctx->comp);
-
-	/* No new events will be generated after destroying the cm_id. */
-	ib_destroy_cm_id(ctx->cm_id);
-	/* Cleanup events not yet reported to the user. */
-	ib_ucm_cleanup_events(ctx);
-
-	resp.events_reported = ctx->events_reported;
-	if (copy_to_user(u64_to_user_ptr(cmd.response),
-			 &resp, sizeof(resp)))
-		result = -EFAULT;
-
-	kfree(ctx);
-	return result;
-}
-
-static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file,
-			      const char __user *inbuf,
-			      int in_len, int out_len)
-{
-	struct ib_ucm_attr_id_resp resp;
-	struct ib_ucm_attr_id cmd;
-	struct ib_ucm_context *ctx;
-	int result = 0;
-
-	if (out_len < sizeof(resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	resp.service_id   = ctx->cm_id->service_id;
-	resp.service_mask = ctx->cm_id->service_mask;
-	resp.local_id     = ctx->cm_id->local_id;
-	resp.remote_id    = ctx->cm_id->remote_id;
-
-	if (copy_to_user(u64_to_user_ptr(cmd.response),
-			 &resp, sizeof(resp)))
-		result = -EFAULT;
-
-	ib_ucm_ctx_put(ctx);
-	return result;
-}
-
-static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file,
-				   const char __user *inbuf,
-				   int in_len, int out_len)
-{
-	struct ib_uverbs_qp_attr resp;
-	struct ib_ucm_init_qp_attr cmd;
-	struct ib_ucm_context *ctx;
-	struct ib_qp_attr qp_attr;
-	int result = 0;
-
-	if (out_len < sizeof(resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	resp.qp_attr_mask = 0;
-	memset(&qp_attr, 0, sizeof qp_attr);
-	qp_attr.qp_state = cmd.qp_state;
-	result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
-	if (result)
-		goto out;
-
-	ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr);
-
-	if (copy_to_user(u64_to_user_ptr(cmd.response),
-			 &resp, sizeof(resp)))
-		result = -EFAULT;
-
-out:
-	ib_ucm_ctx_put(ctx);
-	return result;
-}
-
-static int ucm_validate_listen(__be64 service_id, __be64 service_mask)
-{
-	service_id &= service_mask;
-
-	if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) ||
-	    ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID))
-		return -EINVAL;
-
-	return 0;
-}
-
-static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
-			     const char __user *inbuf,
-			     int in_len, int out_len)
-{
-	struct ib_ucm_listen cmd;
-	struct ib_ucm_context *ctx;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	result = ucm_validate_listen(cmd.service_id, cmd.service_mask);
-	if (result)
-		goto out;
-
-	result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
-out:
-	ib_ucm_ctx_put(ctx);
-	return result;
-}
-
-static ssize_t ib_ucm_notify(struct ib_ucm_file *file,
-			     const char __user *inbuf,
-			     int in_len, int out_len)
-{
-	struct ib_ucm_notify cmd;
-	struct ib_ucm_context *ctx;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
-	ib_ucm_ctx_put(ctx);
-	return result;
-}
-
-static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
-{
-	void *data;
-
-	*dest = NULL;
-
-	if (!len)
-		return 0;
-
-	data = memdup_user(u64_to_user_ptr(src), len);
-	if (IS_ERR(data))
-		return PTR_ERR(data);
-
-	*dest = data;
-	return 0;
-}
-
-static int ib_ucm_path_get(struct sa_path_rec **path, u64 src)
-{
-	struct ib_user_path_rec upath;
-	struct sa_path_rec  *sa_path;
-
-	*path = NULL;
-
-	if (!src)
-		return 0;
-
-	sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL);
-	if (!sa_path)
-		return -ENOMEM;
-
-	if (copy_from_user(&upath, u64_to_user_ptr(src),
-			   sizeof(upath))) {
-
-		kfree(sa_path);
-		return -EFAULT;
-	}
-
-	ib_copy_path_rec_from_user(sa_path, &upath);
-	*path = sa_path;
-	return 0;
-}
-
-static ssize_t ib_ucm_send_req(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	struct ib_cm_req_param param;
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_req cmd;
-	int result;
-
-	param.private_data   = NULL;
-	param.primary_path   = NULL;
-	param.alternate_path = NULL;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-	if (result)
-		goto done;
-
-	result = ib_ucm_path_get(&param.primary_path, cmd.primary_path);
-	if (result)
-		goto done;
-
-	result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path);
-	if (result)
-		goto done;
-
-	param.private_data_len           = cmd.len;
-	param.service_id                 = cmd.sid;
-	param.qp_num                     = cmd.qpn;
-	param.qp_type                    = cmd.qp_type;
-	param.starting_psn               = cmd.psn;
-	param.peer_to_peer               = cmd.peer_to_peer;
-	param.responder_resources        = cmd.responder_resources;
-	param.initiator_depth            = cmd.initiator_depth;
-	param.remote_cm_response_timeout = cmd.remote_cm_response_timeout;
-	param.flow_control               = cmd.flow_control;
-	param.local_cm_response_timeout  = cmd.local_cm_response_timeout;
-	param.retry_count                = cmd.retry_count;
-	param.rnr_retry_count            = cmd.rnr_retry_count;
-	param.max_cm_retries             = cmd.max_cm_retries;
-	param.srq                        = cmd.srq;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = ib_send_cm_req(ctx->cm_id, &param);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-done:
-	kfree(param.private_data);
-	kfree(param.primary_path);
-	kfree(param.alternate_path);
-	return result;
-}
-
-static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	struct ib_cm_rep_param param;
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_rep cmd;
-	int result;
-
-	param.private_data = NULL;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-	if (result)
-		return result;
-
-	param.qp_num              = cmd.qpn;
-	param.starting_psn        = cmd.psn;
-	param.private_data_len    = cmd.len;
-	param.responder_resources = cmd.responder_resources;
-	param.initiator_depth     = cmd.initiator_depth;
-	param.failover_accepted   = cmd.failover_accepted;
-	param.flow_control        = cmd.flow_control;
-	param.rnr_retry_count     = cmd.rnr_retry_count;
-	param.srq                 = cmd.srq;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		ctx->uid = cmd.uid;
-		result = ib_send_cm_rep(ctx->cm_id, &param);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-	kfree(param.private_data);
-	return result;
-}
-
-static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file,
-					const char __user *inbuf, int in_len,
-					int (*func)(struct ib_cm_id *cm_id,
-						    const void *private_data,
-						    u8 private_data_len))
-{
-	struct ib_ucm_private_data cmd;
-	struct ib_ucm_context *ctx;
-	const void *private_data = NULL;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len);
-	if (result)
-		return result;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = func(ctx->cm_id, private_data, cmd.len);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-	kfree(private_data);
-	return result;
-}
-
-static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu);
-}
-
-static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file,
-				const char __user *inbuf,
-				int in_len, int out_len)
-{
-	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq);
-}
-
-static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file,
-				const char __user *inbuf,
-				int in_len, int out_len)
-{
-	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep);
-}
-
-static ssize_t ib_ucm_send_info(struct ib_ucm_file *file,
-				const char __user *inbuf, int in_len,
-				int (*func)(struct ib_cm_id *cm_id,
-					    int status,
-					    const void *info,
-					    u8 info_len,
-					    const void *data,
-					    u8 data_len))
-{
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_info cmd;
-	const void *data = NULL;
-	const void *info = NULL;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len);
-	if (result)
-		goto done;
-
-	result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len);
-	if (result)
-		goto done;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = func(ctx->cm_id, cmd.status, info, cmd.info_len,
-			      data, cmd.data_len);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-done:
-	kfree(data);
-	kfree(info);
-	return result;
-}
-
-static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej);
-}
-
-static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr);
-}
-
-static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_mra cmd;
-	const void *data = NULL;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
-	if (result)
-		return result;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-	kfree(data);
-	return result;
-}
-
-static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	struct ib_ucm_context *ctx;
-	struct sa_path_rec *path = NULL;
-	struct ib_ucm_lap cmd;
-	const void *data = NULL;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
-	if (result)
-		goto done;
-
-	result = ib_ucm_path_get(&path, cmd.path);
-	if (result)
-		goto done;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-done:
-	kfree(data);
-	kfree(path);
-	return result;
-}
-
-static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file,
-				    const char __user *inbuf,
-				    int in_len, int out_len)
-{
-	struct ib_cm_sidr_req_param param = {};
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_sidr_req cmd;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-	if (result)
-		goto done;
-
-	result = ib_ucm_path_get(&param.path, cmd.path);
-	if (result)
-		goto done;
-
-	param.private_data_len = cmd.len;
-	param.service_id       = cmd.sid;
-	param.timeout_ms       = cmd.timeout;
-	param.max_cm_retries   = cmd.max_cm_retries;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = ib_send_cm_sidr_req(ctx->cm_id, &param);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-done:
-	kfree(param.private_data);
-	kfree(param.path);
-	return result;
-}
-
-static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file,
-				    const char __user *inbuf,
-				    int in_len, int out_len)
-{
-	struct ib_cm_sidr_rep_param param;
-	struct ib_ucm_sidr_rep cmd;
-	struct ib_ucm_context *ctx;
-	int result;
-
-	param.info = NULL;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&param.private_data,
-				   cmd.data, cmd.data_len);
-	if (result)
-		goto done;
-
-	result = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len);
-	if (result)
-		goto done;
-
-	param.qp_num		= cmd.qpn;
-	param.qkey		= cmd.qkey;
-	param.status		= cmd.status;
-	param.info_length	= cmd.info_len;
-	param.private_data_len	= cmd.data_len;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = ib_send_cm_sidr_rep(ctx->cm_id, &param);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-done:
-	kfree(param.private_data);
-	kfree(param.info);
-	return result;
-}
-
-static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file,
-				  const char __user *inbuf,
-				  int in_len, int out_len) = {
-	[IB_USER_CM_CMD_CREATE_ID]     = ib_ucm_create_id,
-	[IB_USER_CM_CMD_DESTROY_ID]    = ib_ucm_destroy_id,
-	[IB_USER_CM_CMD_ATTR_ID]       = ib_ucm_attr_id,
-	[IB_USER_CM_CMD_LISTEN]        = ib_ucm_listen,
-	[IB_USER_CM_CMD_NOTIFY]        = ib_ucm_notify,
-	[IB_USER_CM_CMD_SEND_REQ]      = ib_ucm_send_req,
-	[IB_USER_CM_CMD_SEND_REP]      = ib_ucm_send_rep,
-	[IB_USER_CM_CMD_SEND_RTU]      = ib_ucm_send_rtu,
-	[IB_USER_CM_CMD_SEND_DREQ]     = ib_ucm_send_dreq,
-	[IB_USER_CM_CMD_SEND_DREP]     = ib_ucm_send_drep,
-	[IB_USER_CM_CMD_SEND_REJ]      = ib_ucm_send_rej,
-	[IB_USER_CM_CMD_SEND_MRA]      = ib_ucm_send_mra,
-	[IB_USER_CM_CMD_SEND_LAP]      = ib_ucm_send_lap,
-	[IB_USER_CM_CMD_SEND_APR]      = ib_ucm_send_apr,
-	[IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req,
-	[IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep,
-	[IB_USER_CM_CMD_EVENT]	       = ib_ucm_event,
-	[IB_USER_CM_CMD_INIT_QP_ATTR]  = ib_ucm_init_qp_attr,
-};
-
-static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
-			    size_t len, loff_t *pos)
-{
-	struct ib_ucm_file *file = filp->private_data;
-	struct ib_ucm_cmd_hdr hdr;
-	ssize_t result;
-
-	if (!ib_safe_file_access(filp)) {
-		pr_err_once("ucm_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
-			    task_tgid_vnr(current), current->comm);
-		return -EACCES;
-	}
-
-	if (len < sizeof(hdr))
-		return -EINVAL;
-
-	if (copy_from_user(&hdr, buf, sizeof(hdr)))
-		return -EFAULT;
-
-	if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
-		return -EINVAL;
-	hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucm_cmd_table));
-
-	if (hdr.in + sizeof(hdr) > len)
-		return -EINVAL;
-
-	result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr),
-					hdr.in, hdr.out);
-	if (!result)
-		result = len;
-
-	return result;
-}
-
-static __poll_t ib_ucm_poll(struct file *filp,
-				struct poll_table_struct *wait)
-{
-	struct ib_ucm_file *file = filp->private_data;
-	__poll_t mask = 0;
-
-	poll_wait(filp, &file->poll_wait, wait);
-
-	if (!list_empty(&file->events))
-		mask = EPOLLIN | EPOLLRDNORM;
-
-	return mask;
-}
-
-/*
- * ib_ucm_open() does not need the BKL:
- *
- *  - no global state is referred to;
- *  - there is no ioctl method to race against;
- *  - no further module initialization is required for open to work
- *    after the device is registered.
- */
-static int ib_ucm_open(struct inode *inode, struct file *filp)
-{
-	struct ib_ucm_file *file;
-
-	file = kmalloc(sizeof(*file), GFP_KERNEL);
-	if (!file)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&file->events);
-	INIT_LIST_HEAD(&file->ctxs);
-	init_waitqueue_head(&file->poll_wait);
-
-	mutex_init(&file->file_mutex);
-
-	filp->private_data = file;
-	file->filp = filp;
-	file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev);
-
-	return stream_open(inode, filp);
-}
-
-static int ib_ucm_close(struct inode *inode, struct file *filp)
-{
-	struct ib_ucm_file *file = filp->private_data;
-	struct ib_ucm_context *ctx;
-
-	mutex_lock(&file->file_mutex);
-	while (!list_empty(&file->ctxs)) {
-		ctx = list_entry(file->ctxs.next,
-				 struct ib_ucm_context, file_list);
-		mutex_unlock(&file->file_mutex);
-
-		xa_erase(&ctx_id_table, ctx->id);
-		ib_destroy_cm_id(ctx->cm_id);
-		ib_ucm_cleanup_events(ctx);
-		kfree(ctx);
-
-		mutex_lock(&file->file_mutex);
-	}
-	mutex_unlock(&file->file_mutex);
-	kfree(file);
-	return 0;
-}
-
-static void ib_ucm_release_dev(struct device *dev)
-{
-	struct ib_ucm_device *ucm_dev;
-
-	ucm_dev = container_of(dev, struct ib_ucm_device, dev);
-	kfree(ucm_dev);
-}
-
-static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev)
-{
-	clear_bit(ucm_dev->devnum, dev_map);
-}
-
-static const struct file_operations ucm_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = ib_ucm_open,
-	.release = ib_ucm_close,
-	.write	 = ib_ucm_write,
-	.poll    = ib_ucm_poll,
-	.llseek	 = no_llseek,
-};
-
-static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
-			  char *buf)
-{
-	struct ib_ucm_device *ucm_dev;
-
-	ucm_dev = container_of(dev, struct ib_ucm_device, dev);
-	return sprintf(buf, "%s\n", ucm_dev->ib_dev->name);
-}
-static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
-
-static void ib_ucm_add_one(struct ib_device *device)
-{
-	int devnum;
-	dev_t base;
-	struct ib_ucm_device *ucm_dev;
-
-	if (!device->ops.alloc_ucontext || !rdma_cap_ib_cm(device, 1))
-		return;
-
-	ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
-	if (!ucm_dev)
-		return;
-
-	device_initialize(&ucm_dev->dev);
-	ucm_dev->ib_dev = device;
-	ucm_dev->dev.release = ib_ucm_release_dev;
-
-	devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
-	if (devnum >= IB_UCM_MAX_DEVICES)
-		goto err;
-	ucm_dev->devnum = devnum;
-	set_bit(devnum, dev_map);
-	if (devnum >= IB_UCM_NUM_FIXED_MINOR)
-		base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR;
-	else
-		base = IB_UCM_BASE_DEV + devnum;
-
-	cdev_init(&ucm_dev->cdev, &ucm_fops);
-	ucm_dev->cdev.owner = THIS_MODULE;
-	kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
-
-	ucm_dev->dev.class = &cm_class;
-	ucm_dev->dev.parent = device->dev.parent;
-	ucm_dev->dev.devt = base;
-
-	dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum);
-	if (cdev_device_add(&ucm_dev->cdev, &ucm_dev->dev))
-		goto err_devnum;
-
-	if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev))
-		goto err_dev;
-
-	ib_set_client_data(device, &ucm_client, ucm_dev);
-	return;
-
-err_dev:
-	cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev);
-err_devnum:
-	ib_ucm_free_dev(ucm_dev);
-err:
-	put_device(&ucm_dev->dev);
-	return;
-}
-
-static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
-{
-	struct ib_ucm_device *ucm_dev = client_data;
-
-	if (!ucm_dev)
-		return;
-
-	cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev);
-	ib_ucm_free_dev(ucm_dev);
-	put_device(&ucm_dev->dev);
-}
-
-static CLASS_ATTR_STRING(abi_version, S_IRUGO,
-			 __stringify(IB_USER_CM_ABI_VERSION));
-
-static int __init ib_ucm_init(void)
-{
-	int ret;
-
-	ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR,
-				     "infiniband_cm");
-	if (ret) {
-		pr_err("ucm: couldn't register device number\n");
-		goto error1;
-	}
-
-	ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR,
-				  "infiniband_cm");
-	if (ret) {
-		pr_err("ucm: couldn't register dynamic device number\n");
-		goto err_alloc;
-	}
-
-	ret = class_create_file(&cm_class, &class_attr_abi_version.attr);
-	if (ret) {
-		pr_err("ucm: couldn't create abi_version attribute\n");
-		goto error2;
-	}
-
-	ret = ib_register_client(&ucm_client);
-	if (ret) {
-		pr_err("ucm: couldn't register client\n");
-		goto error3;
-	}
-	return 0;
-
-error3:
-	class_remove_file(&cm_class, &class_attr_abi_version.attr);
-error2:
-	unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
-err_alloc:
-	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
-error1:
-	return ret;
-}
-
-static void __exit ib_ucm_cleanup(void)
-{
-	ib_unregister_client(&ucm_client);
-	class_remove_file(&cm_class, &class_attr_abi_version.attr);
-	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
-	unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
-	WARN_ON(!xa_empty(&ctx_id_table));
-}
-
-module_init(ib_ucm_init);
-module_exit(ib_ucm_cleanup);
diff --git a/include/uapi/rdma/ib_user_cm.h b/include/uapi/rdma/ib_user_cm.h
deleted file mode 100644
index e2709bb8cb18..000000000000
--- a/include/uapi/rdma/ib_user_cm.h
+++ /dev/null
@@ -1,326 +0,0 @@
-/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */
-/*
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IB_USER_CM_H
-#define IB_USER_CM_H
-
-#include <linux/types.h>
-#include <rdma/ib_user_sa.h>
-
-#define IB_USER_CM_ABI_VERSION 5
-
-enum {
-	IB_USER_CM_CMD_CREATE_ID,
-	IB_USER_CM_CMD_DESTROY_ID,
-	IB_USER_CM_CMD_ATTR_ID,
-
-	IB_USER_CM_CMD_LISTEN,
-	IB_USER_CM_CMD_NOTIFY,
-
-	IB_USER_CM_CMD_SEND_REQ,
-	IB_USER_CM_CMD_SEND_REP,
-	IB_USER_CM_CMD_SEND_RTU,
-	IB_USER_CM_CMD_SEND_DREQ,
-	IB_USER_CM_CMD_SEND_DREP,
-	IB_USER_CM_CMD_SEND_REJ,
-	IB_USER_CM_CMD_SEND_MRA,
-	IB_USER_CM_CMD_SEND_LAP,
-	IB_USER_CM_CMD_SEND_APR,
-	IB_USER_CM_CMD_SEND_SIDR_REQ,
-	IB_USER_CM_CMD_SEND_SIDR_REP,
-
-	IB_USER_CM_CMD_EVENT,
-	IB_USER_CM_CMD_INIT_QP_ATTR,
-};
-/*
- * command ABI structures.
- */
-struct ib_ucm_cmd_hdr {
-	__u32 cmd;
-	__u16 in;
-	__u16 out;
-};
-
-struct ib_ucm_create_id {
-	__aligned_u64 uid;
-	__aligned_u64 response;
-};
-
-struct ib_ucm_create_id_resp {
-	__u32 id;
-};
-
-struct ib_ucm_destroy_id {
-	__aligned_u64 response;
-	__u32 id;
-	__u32 reserved;
-};
-
-struct ib_ucm_destroy_id_resp {
-	__u32 events_reported;
-};
-
-struct ib_ucm_attr_id {
-	__aligned_u64 response;
-	__u32 id;
-	__u32 reserved;
-};
-
-struct ib_ucm_attr_id_resp {
-	__be64 service_id;
-	__be64 service_mask;
-	__be32 local_id;
-	__be32 remote_id;
-};
-
-struct ib_ucm_init_qp_attr {
-	__aligned_u64 response;
-	__u32 id;
-	__u32 qp_state;
-};
-
-struct ib_ucm_listen {
-	__be64 service_id;
-	__be64 service_mask;
-	__u32 id;
-	__u32 reserved;
-};
-
-struct ib_ucm_notify {
-	__u32 id;
-	__u32 event;
-};
-
-struct ib_ucm_private_data {
-	__aligned_u64 data;
-	__u32 id;
-	__u8  len;
-	__u8  reserved[3];
-};
-
-struct ib_ucm_req {
-	__u32 id;
-	__u32 qpn;
-	__u32 qp_type;
-	__u32 psn;
-	__be64 sid;
-	__aligned_u64 data;
-	__aligned_u64 primary_path;
-	__aligned_u64 alternate_path;
-	__u8  len;
-	__u8  peer_to_peer;
-	__u8  responder_resources;
-	__u8  initiator_depth;
-	__u8  remote_cm_response_timeout;
-	__u8  flow_control;
-	__u8  local_cm_response_timeout;
-	__u8  retry_count;
-	__u8  rnr_retry_count;
-	__u8  max_cm_retries;
-	__u8  srq;
-	__u8  reserved[5];
-};
-
-struct ib_ucm_rep {
-	__aligned_u64 uid;
-	__aligned_u64 data;
-	__u32 id;
-	__u32 qpn;
-	__u32 psn;
-	__u8  len;
-	__u8  responder_resources;
-	__u8  initiator_depth;
-	__u8  target_ack_delay;
-	__u8  failover_accepted;
-	__u8  flow_control;
-	__u8  rnr_retry_count;
-	__u8  srq;
-	__u8  reserved[4];
-};
-
-struct ib_ucm_info {
-	__u32 id;
-	__u32 status;
-	__aligned_u64 info;
-	__aligned_u64 data;
-	__u8  info_len;
-	__u8  data_len;
-	__u8  reserved[6];
-};
-
-struct ib_ucm_mra {
-	__aligned_u64 data;
-	__u32 id;
-	__u8  len;
-	__u8  timeout;
-	__u8  reserved[2];
-};
-
-struct ib_ucm_lap {
-	__aligned_u64 path;
-	__aligned_u64 data;
-	__u32 id;
-	__u8  len;
-	__u8  reserved[3];
-};
-
-struct ib_ucm_sidr_req {
-	__u32 id;
-	__u32 timeout;
-	__be64 sid;
-	__aligned_u64 data;
-	__aligned_u64 path;
-	__u16 reserved_pkey;
-	__u8  len;
-	__u8  max_cm_retries;
-	__u8  reserved[4];
-};
-
-struct ib_ucm_sidr_rep {
-	__u32 id;
-	__u32 qpn;
-	__u32 qkey;
-	__u32 status;
-	__aligned_u64 info;
-	__aligned_u64 data;
-	__u8  info_len;
-	__u8  data_len;
-	__u8  reserved[6];
-};
-/*
- * event notification ABI structures.
- */
-struct ib_ucm_event_get {
-	__aligned_u64 response;
-	__aligned_u64 data;
-	__aligned_u64 info;
-	__u8  data_len;
-	__u8  info_len;
-	__u8  reserved[6];
-};
-
-struct ib_ucm_req_event_resp {
-	struct ib_user_path_rec primary_path;
-	struct ib_user_path_rec alternate_path;
-	__be64                 remote_ca_guid;
-	__u32                  remote_qkey;
-	__u32                  remote_qpn;
-	__u32                  qp_type;
-	__u32                  starting_psn;
-	__u8  responder_resources;
-	__u8  initiator_depth;
-	__u8  local_cm_response_timeout;
-	__u8  flow_control;
-	__u8  remote_cm_response_timeout;
-	__u8  retry_count;
-	__u8  rnr_retry_count;
-	__u8  srq;
-	__u8  port;
-	__u8  reserved[7];
-};
-
-struct ib_ucm_rep_event_resp {
-	__be64 remote_ca_guid;
-	__u32 remote_qkey;
-	__u32 remote_qpn;
-	__u32 starting_psn;
-	__u8  responder_resources;
-	__u8  initiator_depth;
-	__u8  target_ack_delay;
-	__u8  failover_accepted;
-	__u8  flow_control;
-	__u8  rnr_retry_count;
-	__u8  srq;
-	__u8  reserved[5];
-};
-
-struct ib_ucm_rej_event_resp {
-	__u32 reason;
-	/* ari in ib_ucm_event_get info field. */
-};
-
-struct ib_ucm_mra_event_resp {
-	__u8  timeout;
-	__u8  reserved[3];
-};
-
-struct ib_ucm_lap_event_resp {
-	struct ib_user_path_rec path;
-};
-
-struct ib_ucm_apr_event_resp {
-	__u32 status;
-	/* apr info in ib_ucm_event_get info field. */
-};
-
-struct ib_ucm_sidr_req_event_resp {
-	__u16 pkey;
-	__u8  port;
-	__u8  reserved;
-};
-
-struct ib_ucm_sidr_rep_event_resp {
-	__u32 status;
-	__u32 qkey;
-	__u32 qpn;
-	/* info in ib_ucm_event_get info field. */
-};
-
-#define IB_UCM_PRES_DATA      0x01
-#define IB_UCM_PRES_INFO      0x02
-#define IB_UCM_PRES_PRIMARY   0x04
-#define IB_UCM_PRES_ALTERNATE 0x08
-
-struct ib_ucm_event_resp {
-	__aligned_u64 uid;
-	__u32 id;
-	__u32 event;
-	__u32 present;
-	__u32 reserved;
-	union {
-		struct ib_ucm_req_event_resp req_resp;
-		struct ib_ucm_rep_event_resp rep_resp;
-		struct ib_ucm_rej_event_resp rej_resp;
-		struct ib_ucm_mra_event_resp mra_resp;
-		struct ib_ucm_lap_event_resp lap_resp;
-		struct ib_ucm_apr_event_resp apr_resp;
-
-		struct ib_ucm_sidr_req_event_resp sidr_req_resp;
-		struct ib_ucm_sidr_rep_event_resp sidr_rep_resp;
-
-		__u32                             send_status;
-	} u;
-};
-
-#endif /* IB_USER_CM_H */
-- 
cgit v1.2.3


From b9560a419bfd498279333387817adcf5faef2825 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 5 Jun 2019 14:39:24 -0300
Subject: RDMA: Move driver_id into struct ib_device_ops

No reason for every driver to emit code to set this, just make it part of
the driver's existing static const ops structure.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c               | 12 +++++++++---
 drivers/infiniband/core/uverbs_uapi.c          |  2 +-
 drivers/infiniband/hw/bnxt_re/main.c           |  3 ++-
 drivers/infiniband/hw/cxgb3/iwch_provider.c    |  3 ++-
 drivers/infiniband/hw/cxgb4/provider.c         |  3 ++-
 drivers/infiniband/hw/efa/efa_main.c           |  3 ++-
 drivers/infiniband/hw/hfi1/verbs.c             |  4 +++-
 drivers/infiniband/hw/hns/hns_roce_main.c      |  3 ++-
 drivers/infiniband/hw/i40iw/i40iw_verbs.c      |  3 ++-
 drivers/infiniband/hw/mlx4/main.c              |  3 ++-
 drivers/infiniband/hw/mlx5/main.c              |  3 ++-
 drivers/infiniband/hw/mthca/mthca_provider.c   |  3 ++-
 drivers/infiniband/hw/nes/nes_verbs.c          |  3 ++-
 drivers/infiniband/hw/ocrdma/ocrdma_main.c     |  3 ++-
 drivers/infiniband/hw/qedr/main.c              |  3 ++-
 drivers/infiniband/hw/qib/qib_verbs.c          |  4 +++-
 drivers/infiniband/hw/usnic/usnic_ib_main.c    |  3 ++-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c |  3 ++-
 drivers/infiniband/sw/rdmavt/vt.c              |  3 +--
 drivers/infiniband/sw/rxe/rxe_verbs.c          |  3 ++-
 include/rdma/ib_verbs.h                        |  3 ++-
 include/rdma/rdma_vt.h                         |  2 +-
 22 files changed, 50 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index afb3f5946796..538d01f27bf8 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -373,7 +373,7 @@ struct ib_device *ib_device_get_by_name(const char *name,
 	down_read(&devices_rwsem);
 	device = __ib_device_get_by_name(name);
 	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
-	    device->driver_id != driver_id)
+	    device->ops.driver_id != driver_id)
 		device = NULL;
 
 	if (device) {
@@ -1456,7 +1456,7 @@ void ib_unregister_driver(enum rdma_driver_id driver_id)
 
 	down_read(&devices_rwsem);
 	xa_for_each (&devices, index, ib_dev) {
-		if (ib_dev->driver_id != driver_id)
+		if (ib_dev->ops.driver_id != driver_id)
 			continue;
 
 		get_device(&ib_dev->dev);
@@ -2013,7 +2013,7 @@ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
 				    (uintptr_t)ndev) {
 		if (rcu_access_pointer(cur->netdev) == ndev &&
 		    (driver_id == RDMA_DRIVER_UNKNOWN ||
-		     cur->ib_dev->driver_id == driver_id) &&
+		     cur->ib_dev->ops.driver_id == driver_id) &&
 		    ib_device_try_get(cur->ib_dev)) {
 			res = cur->ib_dev;
 			break;
@@ -2318,6 +2318,12 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 
 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
 
+	if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
+		WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
+			dev_ops->driver_id != ops->driver_id);
+		dev_ops->driver_id = ops->driver_id;
+	}
+
 	SET_DEVICE_OP(dev_ops, add_gid);
 	SET_DEVICE_OP(dev_ops, advise_mr);
 	SET_DEVICE_OP(dev_ops, alloc_dm);
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index ccc4be0a6566..00c547887132 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -647,7 +647,7 @@ struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL);
-	uapi->driver_id = ibdev->driver_id;
+	uapi->driver_id = ibdev->ops.driver_id;
 
 	rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false);
 	if (rc)
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 814f959c7db9..1ef5f83ec914 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -596,6 +596,8 @@ static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev)
 }
 
 static const struct ib_device_ops bnxt_re_dev_ops = {
+	.driver_id = RDMA_DRIVER_BNXT_RE,
+
 	.add_gid = bnxt_re_add_gid,
 	.alloc_hw_stats = bnxt_re_ib_alloc_hw_stats,
 	.alloc_mr = bnxt_re_alloc_mr,
@@ -691,7 +693,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 
 
 	rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group);
-	ibdev->driver_id = RDMA_DRIVER_BNXT_RE;
 	ib_set_device_ops(ibdev, &bnxt_re_dev_ops);
 	ret = ib_device_set_netdev(&rdev->ibdev, rdev->netdev, 1);
 	if (ret)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 4bfab739ec0d..9a9527fdcb9a 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1236,6 +1236,8 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str)
 }
 
 static const struct ib_device_ops iwch_dev_ops = {
+	.driver_id = RDMA_DRIVER_CXGB3,
+
 	.alloc_hw_stats	= iwch_alloc_stats,
 	.alloc_mr = iwch_alloc_mr,
 	.alloc_mw = iwch_alloc_mw,
@@ -1319,7 +1321,6 @@ int iwch_register_device(struct iwch_dev *dev)
 	memcpy(dev->ibdev.iw_ifname, dev->rdev.t3cdev_p->lldev->name,
 	       sizeof(dev->ibdev.iw_ifname));
 
-	dev->ibdev.driver_id = RDMA_DRIVER_CXGB3;
 	rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group);
 	ib_set_device_ops(&dev->ibdev, &iwch_dev_ops);
 	return ib_register_device(&dev->ibdev, "cxgb3_%d");
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 8ed75b141521..74644afe25ab 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -489,6 +489,8 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res)
 }
 
 static const struct ib_device_ops c4iw_dev_ops = {
+	.driver_id = RDMA_DRIVER_CXGB4,
+
 	.alloc_hw_stats = c4iw_alloc_stats,
 	.alloc_mr = c4iw_alloc_mr,
 	.alloc_mw = c4iw_alloc_mw,
@@ -599,7 +601,6 @@ void c4iw_register_device(struct work_struct *work)
 	       sizeof(dev->ibdev.iw_ifname));
 
 	rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group);
-	dev->ibdev.driver_id = RDMA_DRIVER_CXGB4;
 	ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops);
 	ret = set_netdevs(&dev->ibdev, &dev->rdev);
 	if (ret)
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index db974caf1eb1..3803dd4526b5 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -197,6 +197,8 @@ static void efa_stats_init(struct efa_dev *dev)
 }
 
 static const struct ib_device_ops efa_dev_ops = {
+	.driver_id = RDMA_DRIVER_EFA,
+
 	.alloc_pd = efa_alloc_pd,
 	.alloc_ucontext = efa_alloc_ucontext,
 	.create_ah = efa_create_ah,
@@ -287,7 +289,6 @@ static int efa_ib_device_add(struct efa_dev *dev)
 	dev->ibdev.uverbs_ex_cmd_mask =
 		(1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE);
 
-	dev->ibdev.driver_id = RDMA_DRIVER_EFA;
 	ib_set_device_ops(&dev->ibdev, &efa_dev_ops);
 
 	err = ib_register_device(&dev->ibdev, "efa_%d");
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 1eb4105b2d22..a97f4f9e5c6a 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -1779,6 +1779,8 @@ static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
 }
 
 static const struct ib_device_ops hfi1_dev_ops = {
+	.driver_id = RDMA_DRIVER_HFI1,
+
 	.alloc_hw_stats = alloc_hw_stats,
 	.alloc_rdma_netdev = hfi1_vnic_alloc_rn,
 	.get_dev_fw_str = hfi1_get_dev_fw_str,
@@ -1923,7 +1925,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev,
 				    &ib_hfi1_attr_group);
 
-	ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1);
+	ret = rvt_register_device(&dd->verbs_dev.rdi);
 	if (ret)
 		goto err_verbs_txreq;
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index a6c5c67d0b87..dd408f8afe72 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -414,6 +414,8 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
 }
 
 static const struct ib_device_ops hns_roce_dev_ops = {
+	.driver_id = RDMA_DRIVER_HNS,
+
 	.add_gid = hns_roce_add_gid,
 	.alloc_pd = hns_roce_alloc_pd,
 	.alloc_ucontext = hns_roce_alloc_ucontext,
@@ -536,7 +538,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 		ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_srq_ops);
 	}
 
-	ib_dev->driver_id = RDMA_DRIVER_HNS;
 	ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
 	ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
 	for (i = 0; i < hr_dev->caps.num_ports; i++) {
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index a10a30d44b32..1979cefdf90c 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -2650,6 +2650,8 @@ static int i40iw_query_pkey(struct ib_device *ibdev,
 }
 
 static const struct ib_device_ops i40iw_dev_ops = {
+	.driver_id = RDMA_DRIVER_I40IW,
+
 	.alloc_hw_stats = i40iw_alloc_hw_stats,
 	.alloc_mr = i40iw_alloc_mr,
 	.alloc_pd = i40iw_alloc_pd,
@@ -2787,7 +2789,6 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev)
 		return -ENOMEM;
 	iwibdev = iwdev->iwibdev;
 	rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group);
-	iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW;
 	ret = ib_register_device(&iwibdev->ibdev, "i40iw%d");
 	if (ret)
 		goto error;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 25d09d53b51c..03847e2f7835 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2509,6 +2509,8 @@ static void get_fw_ver_str(struct ib_device *device, char *str)
 }
 
 static const struct ib_device_ops mlx4_ib_dev_ops = {
+	.driver_id = RDMA_DRIVER_MLX4,
+
 	.add_gid = mlx4_ib_add_gid,
 	.alloc_mr = mlx4_ib_alloc_mr,
 	.alloc_pd = mlx4_ib_alloc_pd,
@@ -2839,7 +2841,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 		goto err_steer_free_bitmap;
 
 	rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group);
-	ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4;
 	if (ib_register_device(&ibdev->ib_dev, "mlx4_%d"))
 		goto err_diag_counters;
 
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index abac70ad5c7c..abd416a31e71 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6124,6 +6124,8 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
 }
 
 static const struct ib_device_ops mlx5_ib_dev_ops = {
+	.driver_id = RDMA_DRIVER_MLX5,
+
 	.add_gid = mlx5_ib_add_gid,
 	.alloc_mr = mlx5_ib_alloc_mr,
 	.alloc_pd = mlx5_ib_alloc_pd,
@@ -6290,7 +6292,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 	if (mlx5_accel_ipsec_device_caps(dev->mdev) &
 	    MLX5_ACCEL_IPSEC_CAP_DEVICE)
 		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops);
-	dev->ib_dev.driver_id = RDMA_DRIVER_MLX5;
 	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
 
 	if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 4f40dfedf920..d6467da39aab 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -1153,6 +1153,8 @@ static void get_dev_fw_str(struct ib_device *device, char *str)
 }
 
 static const struct ib_device_ops mthca_dev_ops = {
+	.driver_id = RDMA_DRIVER_MTHCA,
+
 	.alloc_pd = mthca_alloc_pd,
 	.alloc_ucontext = mthca_alloc_ucontext,
 	.attach_mcast = mthca_multicast_attach,
@@ -1303,7 +1305,6 @@ int mthca_register_device(struct mthca_dev *dev)
 	mutex_init(&dev->cap_mask_mutex);
 
 	rdma_set_device_sysfs_group(&dev->ib_dev, &mthca_attr_group);
-	dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA;
 	ret = ib_register_device(&dev->ib_dev, "mthca%d");
 	if (ret)
 		return ret;
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index fb2d0762c7c8..3c85e2ef5a08 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -3558,6 +3558,8 @@ static void get_dev_fw_str(struct ib_device *dev, char *str)
 }
 
 static const struct ib_device_ops nes_dev_ops = {
+	.driver_id = RDMA_DRIVER_NES,
+
 	.alloc_mr = nes_alloc_mr,
 	.alloc_mw = nes_alloc_mw,
 	.alloc_pd = nes_alloc_pd,
@@ -3722,7 +3724,6 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev)
 	int ret;
 
 	rdma_set_device_sysfs_group(&nesvnic->nesibdev->ibdev, &nes_attr_group);
-	nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES;
 	ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d");
 	if (ret) {
 		return ret;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index fc6c0962dea9..a9da4b857566 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -144,6 +144,8 @@ static const struct attribute_group ocrdma_attr_group = {
 };
 
 static const struct ib_device_ops ocrdma_dev_ops = {
+	.driver_id = RDMA_DRIVER_OCRDMA,
+
 	.alloc_mr = ocrdma_alloc_mr,
 	.alloc_pd = ocrdma_alloc_pd,
 	.alloc_ucontext = ocrdma_alloc_ucontext,
@@ -249,7 +251,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 		ib_set_device_ops(&dev->ibdev, &ocrdma_dev_srq_ops);
 	}
 	rdma_set_device_sysfs_group(&dev->ibdev, &ocrdma_attr_group);
-	dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA;
 	ret = ib_device_set_netdev(&dev->ibdev, dev->nic_info.netdev, 1);
 	if (ret)
 		return ret;
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 083c2c00a8e9..737745231f8f 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -183,6 +183,8 @@ static void qedr_roce_register_device(struct qedr_dev *dev)
 }
 
 static const struct ib_device_ops qedr_dev_ops = {
+	.driver_id = RDMA_DRIVER_QEDR,
+
 	.alloc_mr = qedr_alloc_mr,
 	.alloc_pd = qedr_alloc_pd,
 	.alloc_ucontext = qedr_alloc_ucontext,
@@ -274,7 +276,6 @@ static int qedr_register_device(struct qedr_dev *dev)
 	rdma_set_device_sysfs_group(&dev->ibdev, &qedr_attr_group);
 	ib_set_device_ops(&dev->ibdev, &qedr_dev_ops);
 
-	dev->ibdev.driver_id = RDMA_DRIVER_QEDR;
 	rc = ib_device_set_netdev(&dev->ibdev, dev->ndev, 1);
 	if (rc)
 		return rc;
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index 5ff32d32c61c..bbc331d7f49b 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -1482,6 +1482,8 @@ static void qib_fill_device_attr(struct qib_devdata *dd)
 }
 
 static const struct ib_device_ops qib_dev_ops = {
+	.driver_id = RDMA_DRIVER_QIB,
+
 	.init_port = qib_create_port_files,
 	.modify_device = qib_modify_device,
 	.process_mad = qib_process_mad,
@@ -1616,7 +1618,7 @@ int qib_register_ib_device(struct qib_devdata *dd)
 	rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, &qib_attr_group);
 
 	ib_set_device_ops(ibdev, &qib_dev_ops);
-	ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_QIB);
+	ret = rvt_register_device(&dd->verbs_dev.rdi);
 	if (ret)
 		goto err_tx;
 
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index d88d9f8a7f9a..47830334cb55 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -329,6 +329,8 @@ static void usnic_get_dev_fw_str(struct ib_device *device, char *str)
 }
 
 static const struct ib_device_ops usnic_dev_ops = {
+	.driver_id = RDMA_DRIVER_USNIC,
+
 	.alloc_pd = usnic_ib_alloc_pd,
 	.alloc_ucontext = usnic_ib_alloc_ucontext,
 	.create_cq = usnic_ib_create_cq,
@@ -412,7 +414,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
 
 	ib_set_device_ops(&us_ibdev->ib_dev, &usnic_dev_ops);
 
-	us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC;
 	rdma_set_device_sysfs_group(&us_ibdev->ib_dev, &usnic_attr_group);
 
 	ret = ib_device_set_netdev(&us_ibdev->ib_dev, us_ibdev->netdev, 1);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index 40182297f87f..54a0b6372629 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -144,6 +144,8 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num,
 }
 
 static const struct ib_device_ops pvrdma_dev_ops = {
+	.driver_id = RDMA_DRIVER_VMW_PVRDMA,
+
 	.add_gid = pvrdma_add_gid,
 	.alloc_mr = pvrdma_alloc_mr,
 	.alloc_pd = pvrdma_alloc_pd,
@@ -261,7 +263,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev)
 		if (!dev->srq_tbl)
 			goto err_qp_free;
 	}
-	dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA;
 	ret = ib_device_set_netdev(&dev->ib_dev, dev->netdev, 1);
 	if (ret)
 		return ret;
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 9546a837a8ac..60700e197e6c 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -530,7 +530,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
  *
  * Return: 0 on success otherwise an errno.
  */
-int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
+int rvt_register_device(struct rvt_dev_info *rdi)
 {
 	int ret = 0, i;
 
@@ -636,7 +636,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
 	if (!rdi->ibdev.num_comp_vectors)
 		rdi->ibdev.num_comp_vectors = 1;
 
-	rdi->ibdev.driver_id = driver_id;
 	/* We are now good to announce we exist */
 	ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev));
 	if (ret) {
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 8c3e2a18cfe4..3d3130dc6380 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1111,6 +1111,8 @@ static int rxe_enable_driver(struct ib_device *ib_dev)
 }
 
 static const struct ib_device_ops rxe_dev_ops = {
+	.driver_id = RDMA_DRIVER_RXE,
+
 	.alloc_hw_stats = rxe_ib_alloc_hw_stats,
 	.alloc_mr = rxe_alloc_mr,
 	.alloc_pd = rxe_alloc_pd,
@@ -1230,7 +1232,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
 	rxe->tfm = tfm;
 
 	rdma_set_device_sysfs_group(dev, &rxe_attr_group);
-	dev->driver_id = RDMA_DRIVER_RXE;
 	err = ib_register_device(dev, ibdev_name);
 	if (err)
 		pr_warn("%s failed with error %d\n", __func__, err);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index ec6446864b08..dacf2b5ad862 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2329,6 +2329,8 @@ struct iw_cm_conn_param;
  * need to define the supported operations, otherwise they will be set to null.
  */
 struct ib_device_ops {
+	enum rdma_driver_id driver_id;
+
 	int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr,
 			 const struct ib_send_wr **bad_send_wr);
 	int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
@@ -2672,7 +2674,6 @@ struct ib_device {
 	struct rdma_restrack_root *res;
 
 	const struct uapi_definition   *driver_def;
-	enum rdma_driver_id		driver_id;
 
 	/*
 	 * Positive refcount indicates that the device is currently
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
index b9cd06db1a71..997f42678806 100644
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -555,7 +555,7 @@ static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi,
 
 struct rvt_dev_info *rvt_alloc_device(size_t size, int nports);
 void rvt_dealloc_device(struct rvt_dev_info *rdi);
-int rvt_register_device(struct rvt_dev_info *rvd, u32 driver_id);
+int rvt_register_device(struct rvt_dev_info *rvd);
 void rvt_unregister_device(struct rvt_dev_info *rvd);
 int rvt_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr);
 int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port,
-- 
cgit v1.2.3


From 72c6ec18eb6161c8fc672ae96ec5c77df4d07405 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 5 Jun 2019 14:39:25 -0300
Subject: RDMA: Move uverbs_abi_ver into struct ib_device_ops

No reason for every driver to emit code to set this, just make it part of
the driver's existing static const ops structure.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c               |  2 ++
 drivers/infiniband/core/uverbs_main.c          |  2 +-
 drivers/infiniband/hw/bnxt_re/ib_verbs.c       |  6 +++---
 drivers/infiniband/hw/bnxt_re/main.c           |  2 +-
 drivers/infiniband/hw/cxgb3/iwch_provider.c    |  2 +-
 drivers/infiniband/hw/cxgb4/provider.c         |  2 +-
 drivers/infiniband/hw/efa/efa_main.c           |  2 +-
 drivers/infiniband/hw/hns/hns_roce_main.c      |  2 +-
 drivers/infiniband/hw/i40iw/i40iw_verbs.c      |  2 ++
 drivers/infiniband/hw/mlx4/main.c              | 15 ++++++++-------
 drivers/infiniband/hw/mlx5/main.c              |  2 +-
 drivers/infiniband/hw/mthca/mthca_provider.c   |  2 +-
 drivers/infiniband/hw/nes/nes_verbs.c          |  2 ++
 drivers/infiniband/hw/ocrdma/ocrdma_main.c     |  2 +-
 drivers/infiniband/hw/qedr/main.c              |  2 +-
 drivers/infiniband/hw/usnic/usnic_ib_main.c    |  2 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c |  2 +-
 drivers/infiniband/sw/rdmavt/vt.c              |  3 ++-
 drivers/infiniband/sw/rxe/rxe_verbs.c          |  2 +-
 include/rdma/ib_verbs.h                        |  2 +-
 20 files changed, 33 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 538d01f27bf8..a00b7fc360bf 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2323,6 +2323,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 			dev_ops->driver_id != ops->driver_id);
 		dev_ops->driver_id = ops->driver_id;
 	}
+	if (ops->uverbs_abi_ver)
+		dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
 
 	SET_DEVICE_OP(dev_ops, add_gid);
 	SET_DEVICE_OP(dev_ops, advise_mr);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 84a5e9a6d483..0f8a286a92d3 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -1186,7 +1186,7 @@ static ssize_t abi_version_show(struct device *device,
 	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
 	ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
 	if (ib_dev)
-		ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+		ret = sprintf(buf, "%u\n", ib_dev->ops.uverbs_abi_ver);
 	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
 	return ret;
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 2c3685faa57a..8af8e1472101 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -3630,10 +3630,10 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)
 	u32 chip_met_rev_num = 0;
 	int rc;
 
-	dev_dbg(rdev_to_dev(rdev), "ABI version requested %d",
-		ibdev->uverbs_abi_ver);
+	dev_dbg(rdev_to_dev(rdev), "ABI version requested %u",
+		ibdev->ops.uverbs_abi_ver);
 
-	if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) {
+	if (ibdev->ops.uverbs_abi_ver != BNXT_RE_ABI_VERSION) {
 		dev_dbg(rdev_to_dev(rdev), " is different from the device %d ",
 			BNXT_RE_ABI_VERSION);
 		return -EPERM;
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 1ef5f83ec914..a45cb9ee51ab 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -597,6 +597,7 @@ static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev)
 
 static const struct ib_device_ops bnxt_re_dev_ops = {
 	.driver_id = RDMA_DRIVER_BNXT_RE,
+	.uverbs_abi_ver = BNXT_RE_ABI_VERSION,
 
 	.add_gid = bnxt_re_add_gid,
 	.alloc_hw_stats = bnxt_re_ib_alloc_hw_stats,
@@ -663,7 +664,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 	ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY;
 
 	/* User space */
-	ibdev->uverbs_abi_ver = BNXT_RE_ABI_VERSION;
 	ibdev->uverbs_cmd_mask =
 			(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 			(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 9a9527fdcb9a..5b410605a3a1 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1237,6 +1237,7 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str)
 
 static const struct ib_device_ops iwch_dev_ops = {
 	.driver_id = RDMA_DRIVER_CXGB3,
+	.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION,
 
 	.alloc_hw_stats	= iwch_alloc_stats,
 	.alloc_mr = iwch_alloc_mr,
@@ -1316,7 +1317,6 @@ int iwch_register_device(struct iwch_dev *dev)
 	dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports;
 	dev->ibdev.num_comp_vectors = 1;
 	dev->ibdev.dev.parent = &dev->rdev.rnic_info.pdev->dev;
-	dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
 
 	memcpy(dev->ibdev.iw_ifname, dev->rdev.t3cdev_p->lldev->name,
 	       sizeof(dev->ibdev.iw_ifname));
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 74644afe25ab..c56cdfbd8a88 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -490,6 +490,7 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res)
 
 static const struct ib_device_ops c4iw_dev_ops = {
 	.driver_id = RDMA_DRIVER_CXGB4,
+	.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION,
 
 	.alloc_hw_stats = c4iw_alloc_stats,
 	.alloc_mr = c4iw_alloc_mr,
@@ -595,7 +596,6 @@ void c4iw_register_device(struct work_struct *work)
 	dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports;
 	dev->ibdev.num_comp_vectors =  dev->rdev.lldi.nciq;
 	dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev;
-	dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
 
 	memcpy(dev->ibdev.iw_ifname, dev->rdev.lldi.ports[0]->name,
 	       sizeof(dev->ibdev.iw_ifname));
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index 3803dd4526b5..b05c5a0b9bc0 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -198,6 +198,7 @@ static void efa_stats_init(struct efa_dev *dev)
 
 static const struct ib_device_ops efa_dev_ops = {
 	.driver_id = RDMA_DRIVER_EFA,
+	.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION,
 
 	.alloc_pd = efa_alloc_pd,
 	.alloc_ucontext = efa_alloc_ucontext,
@@ -266,7 +267,6 @@ static int efa_ib_device_add(struct efa_dev *dev)
 	dev->ibdev.phys_port_cnt = 1;
 	dev->ibdev.num_comp_vectors = 1;
 	dev->ibdev.dev.parent = &pdev->dev;
-	dev->ibdev.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION;
 
 	dev->ibdev.uverbs_cmd_mask =
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index dd408f8afe72..e496b0628e25 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -415,6 +415,7 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
 
 static const struct ib_device_ops hns_roce_dev_ops = {
 	.driver_id = RDMA_DRIVER_HNS,
+	.uverbs_abi_ver = 1,
 
 	.add_gid = hns_roce_add_gid,
 	.alloc_pd = hns_roce_alloc_pd,
@@ -489,7 +490,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 	ib_dev->phys_port_cnt		= hr_dev->caps.num_ports;
 	ib_dev->local_dma_lkey		= hr_dev->caps.reserved_lkey;
 	ib_dev->num_comp_vectors	= hr_dev->caps.num_comp_vectors;
-	ib_dev->uverbs_abi_ver		= 1;
 	ib_dev->uverbs_cmd_mask		=
 		(1ULL << IB_USER_VERBS_CMD_GET_CONTEXT) |
 		(1ULL << IB_USER_VERBS_CMD_QUERY_DEVICE) |
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 1979cefdf90c..4dc647c8556b 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -2651,6 +2651,8 @@ static int i40iw_query_pkey(struct ib_device *ibdev,
 
 static const struct ib_device_ops i40iw_dev_ops = {
 	.driver_id = RDMA_DRIVER_I40IW,
+	/* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */
+	.uverbs_abi_ver = I40IW_ABI_VER,
 
 	.alloc_hw_stats = i40iw_alloc_hw_stats,
 	.alloc_mr = i40iw_alloc_mr,
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 03847e2f7835..1f87221acbb0 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1089,7 +1089,8 @@ static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx,
 	if (!dev->ib_active)
 		return -EAGAIN;
 
-	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
+	if (ibdev->ops.uverbs_abi_ver ==
+	    MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
 		resp_v3.qp_tab_size      = dev->dev->caps.num_qps;
 		resp_v3.bf_reg_size      = dev->dev->caps.bf_reg_size;
 		resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
@@ -1111,7 +1112,7 @@ static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx,
 	INIT_LIST_HEAD(&context->wqn_ranges_list);
 	mutex_init(&context->wqn_ranges_mutex);
 
-	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
+	if (ibdev->ops.uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
 		err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
 	else
 		err = ib_copy_to_udata(udata, &resp, sizeof(resp));
@@ -2510,6 +2511,7 @@ static void get_fw_ver_str(struct ib_device *device, char *str)
 
 static const struct ib_device_ops mlx4_ib_dev_ops = {
 	.driver_id = RDMA_DRIVER_MLX4,
+	.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION,
 
 	.add_gid = mlx4_ib_add_gid,
 	.alloc_mr = mlx4_ib_alloc_mr,
@@ -2653,11 +2655,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.num_comp_vectors	= dev->caps.num_comp_vectors;
 	ibdev->ib_dev.dev.parent	= &dev->persist->pdev->dev;
 
-	if (dev->caps.userspace_caps)
-		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
-	else
-		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;
-
 	ibdev->ib_dev.uverbs_cmd_mask	=
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
@@ -2731,6 +2728,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 		ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fs_ops);
 	}
 
+	if (!dev->caps.userspace_caps)
+		ibdev->ib_dev.ops.uverbs_abi_ver =
+			MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;
+
 	mlx4_ib_alloc_eqs(dev, ibdev);
 
 	spin_lock_init(&iboe->lock);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index abd416a31e71..0c23eccb8855 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6125,6 +6125,7 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
 
 static const struct ib_device_ops mlx5_ib_dev_ops = {
 	.driver_id = RDMA_DRIVER_MLX5,
+	.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION,
 
 	.add_gid = mlx5_ib_add_gid,
 	.alloc_mr = mlx5_ib_alloc_mr,
@@ -6223,7 +6224,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 	struct mlx5_core_dev *mdev = dev->mdev;
 	int err;
 
-	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
 	dev->ib_dev.uverbs_cmd_mask	=
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index d6467da39aab..690c65accea4 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -1154,6 +1154,7 @@ static void get_dev_fw_str(struct ib_device *device, char *str)
 
 static const struct ib_device_ops mthca_dev_ops = {
 	.driver_id = RDMA_DRIVER_MTHCA,
+	.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION,
 
 	.alloc_pd = mthca_alloc_pd,
 	.alloc_ucontext = mthca_alloc_ucontext,
@@ -1247,7 +1248,6 @@ int mthca_register_device(struct mthca_dev *dev)
 
 	dev->ib_dev.owner                = THIS_MODULE;
 
-	dev->ib_dev.uverbs_abi_ver	 = MTHCA_UVERBS_ABI_VERSION;
 	dev->ib_dev.uverbs_cmd_mask	 =
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 3c85e2ef5a08..f1fdd6829a40 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -3559,6 +3559,8 @@ static void get_dev_fw_str(struct ib_device *dev, char *str)
 
 static const struct ib_device_ops nes_dev_ops = {
 	.driver_id = RDMA_DRIVER_NES,
+	/* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */
+	.uverbs_abi_ver = NES_ABI_USERSPACE_VER,
 
 	.alloc_mr = nes_alloc_mr,
 	.alloc_mw = nes_alloc_mw,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index a9da4b857566..ef823f1144b5 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -145,6 +145,7 @@ static const struct attribute_group ocrdma_attr_group = {
 
 static const struct ib_device_ops ocrdma_dev_ops = {
 	.driver_id = RDMA_DRIVER_OCRDMA,
+	.uverbs_abi_ver = OCRDMA_ABI_VERSION,
 
 	.alloc_mr = ocrdma_alloc_mr,
 	.alloc_pd = ocrdma_alloc_pd,
@@ -203,7 +204,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 	memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC,
 	       sizeof(OCRDMA_NODE_DESC));
 	dev->ibdev.owner = THIS_MODULE;
-	dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION;
 	dev->ibdev.uverbs_cmd_mask =
 	    OCRDMA_UVERBS(GET_CONTEXT) |
 	    OCRDMA_UVERBS(QUERY_DEVICE) |
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 737745231f8f..793e25776a7e 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -184,6 +184,7 @@ static void qedr_roce_register_device(struct qedr_dev *dev)
 
 static const struct ib_device_ops qedr_dev_ops = {
 	.driver_id = RDMA_DRIVER_QEDR,
+	.uverbs_abi_ver = QEDR_ABI_VERSION,
 
 	.alloc_mr = qedr_alloc_mr,
 	.alloc_pd = qedr_alloc_pd,
@@ -234,7 +235,6 @@ static int qedr_register_device(struct qedr_dev *dev)
 	dev->ibdev.node_guid = dev->attr.node_guid;
 	memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC));
 	dev->ibdev.owner = THIS_MODULE;
-	dev->ibdev.uverbs_abi_ver = QEDR_ABI_VERSION;
 
 	dev->ibdev.uverbs_cmd_mask = QEDR_UVERBS(GET_CONTEXT) |
 				     QEDR_UVERBS(QUERY_DEVICE) |
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index 47830334cb55..f61690816095 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -330,6 +330,7 @@ static void usnic_get_dev_fw_str(struct ib_device *device, char *str)
 
 static const struct ib_device_ops usnic_dev_ops = {
 	.driver_id = RDMA_DRIVER_USNIC,
+	.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION,
 
 	.alloc_pd = usnic_ib_alloc_pd,
 	.alloc_ucontext = usnic_ib_alloc_ucontext,
@@ -391,7 +392,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
 	us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT;
 	us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS;
 	us_ibdev->ib_dev.dev.parent = &dev->dev;
-	us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION;
 
 	us_ibdev->ib_dev.uverbs_cmd_mask =
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index 54a0b6372629..2a7eb2838453 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -145,6 +145,7 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num,
 
 static const struct ib_device_ops pvrdma_dev_ops = {
 	.driver_id = RDMA_DRIVER_VMW_PVRDMA,
+	.uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION,
 
 	.add_gid = pvrdma_add_gid,
 	.alloc_mr = pvrdma_alloc_mr,
@@ -203,7 +204,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev)
 	dev->ib_dev.owner = THIS_MODULE;
 	dev->ib_dev.num_comp_vectors = 1;
 	dev->ib_dev.dev.parent = &dev->pdev->dev;
-	dev->ib_dev.uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION;
 	dev->ib_dev.uverbs_cmd_mask =
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 60700e197e6c..639ef8ac5400 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -382,6 +382,8 @@ enum {
 };
 
 static const struct ib_device_ops rvt_dev_ops = {
+	.uverbs_abi_ver = RVT_UVERBS_ABI_VERSION,
+
 	.alloc_fmr = rvt_alloc_fmr,
 	.alloc_mr = rvt_alloc_mr,
 	.alloc_pd = rvt_alloc_pd,
@@ -600,7 +602,6 @@ int rvt_register_device(struct rvt_dev_info *rdi)
 	 * exactly which functions rdmavt supports, nor do they know the ABI
 	 * version, so we do all of this sort of stuff here.
 	 */
-	rdi->ibdev.uverbs_abi_ver = RVT_UVERBS_ABI_VERSION;
 	rdi->ibdev.uverbs_cmd_mask =
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 3d3130dc6380..9e87cdb82bec 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1112,6 +1112,7 @@ static int rxe_enable_driver(struct ib_device *ib_dev)
 
 static const struct ib_device_ops rxe_dev_ops = {
 	.driver_id = RDMA_DRIVER_RXE,
+	.uverbs_abi_ver = RXE_UVERBS_ABI_VERSION,
 
 	.alloc_hw_stats = rxe_ib_alloc_hw_stats,
 	.alloc_mr = rxe_alloc_mr,
@@ -1184,7 +1185,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
 	dma_coerce_mask_and_coherent(&dev->dev,
 				     dma_get_required_mask(&dev->dev));
 
-	dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION;
 	dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT)
 	    | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)
 	    | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index dacf2b5ad862..16405b9bca13 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2330,6 +2330,7 @@ struct iw_cm_conn_param;
  */
 struct ib_device_ops {
 	enum rdma_driver_id driver_id;
+	u32 uverbs_abi_ver;
 
 	int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr,
 			 const struct ib_send_wr **bad_send_wr);
@@ -2650,7 +2651,6 @@ struct ib_device {
 	 */
 	const struct attribute_group	*groups[3];
 
-	int			     uverbs_abi_ver;
 	u64			     uverbs_cmd_mask;
 	u64			     uverbs_ex_cmd_mask;
 
-- 
cgit v1.2.3


From 7a15414252ae4f1d450462d83f883b2d9d8036ee Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 5 Jun 2019 14:39:26 -0300
Subject: RDMA: Move owner into struct ib_device_ops

This more closely follows how other subsytems work, with owner being a
member of the structure containing the function pointers.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c               | 4 ++++
 drivers/infiniband/core/uverbs_main.c          | 6 +++---
 drivers/infiniband/hw/bnxt_re/main.c           | 2 +-
 drivers/infiniband/hw/cxgb3/iwch_provider.c    | 2 +-
 drivers/infiniband/hw/cxgb4/provider.c         | 2 +-
 drivers/infiniband/hw/efa/efa_main.c           | 2 +-
 drivers/infiniband/hw/hfi1/verbs.c             | 2 +-
 drivers/infiniband/hw/hns/hns_roce_main.c      | 2 +-
 drivers/infiniband/hw/i40iw/i40iw_verbs.c      | 2 +-
 drivers/infiniband/hw/mlx4/main.c              | 2 +-
 drivers/infiniband/hw/mlx5/main.c              | 2 +-
 drivers/infiniband/hw/mthca/mthca_provider.c   | 3 +--
 drivers/infiniband/hw/nes/nes_verbs.c          | 2 +-
 drivers/infiniband/hw/ocrdma/ocrdma_main.c     | 2 +-
 drivers/infiniband/hw/qedr/main.c              | 2 +-
 drivers/infiniband/hw/qib/qib_verbs.c          | 2 +-
 drivers/infiniband/hw/usnic/usnic_ib_main.c    | 2 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 2 +-
 drivers/infiniband/sw/rxe/rxe_verbs.c          | 2 +-
 include/rdma/ib_verbs.h                        | 2 +-
 20 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index a00b7fc360bf..357d74c8df2b 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2323,6 +2323,10 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 			dev_ops->driver_id != ops->driver_id);
 		dev_ops->driver_id = ops->driver_id;
 	}
+	if (ops->owner) {
+		WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
+		dev_ops->owner = ops->owner;
+	}
 	if (ops->uverbs_abi_ver)
 		dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
 
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 0f8a286a92d3..870b3dd35aac 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -198,7 +198,7 @@ void ib_uverbs_release_file(struct kref *ref)
 	ib_dev = srcu_dereference(file->device->ib_dev,
 				  &file->device->disassociate_srcu);
 	if (ib_dev && !ib_dev->ops.disassociate_ucontext)
-		module_put(ib_dev->owner);
+		module_put(ib_dev->ops.owner);
 	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
 
 	if (atomic_dec_and_test(&file->device->refcount))
@@ -1065,7 +1065,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	module_dependent = !(ib_dev->ops.disassociate_ucontext);
 
 	if (module_dependent) {
-		if (!try_module_get(ib_dev->owner)) {
+		if (!try_module_get(ib_dev->ops.owner)) {
 			ret = -ENODEV;
 			goto err;
 		}
@@ -1100,7 +1100,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	return stream_open(inode, filp);
 
 err_module:
-	module_put(ib_dev->owner);
+	module_put(ib_dev->ops.owner);
 
 err:
 	mutex_unlock(&dev->lists_mutex);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index a45cb9ee51ab..351c420248a0 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -596,6 +596,7 @@ static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev)
 }
 
 static const struct ib_device_ops bnxt_re_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_BNXT_RE,
 	.uverbs_abi_ver = BNXT_RE_ABI_VERSION,
 
@@ -651,7 +652,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 	int ret;
 
 	/* ib device init */
-	ibdev->owner = THIS_MODULE;
 	ibdev->node_type = RDMA_NODE_IB_CA;
 	strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA",
 		strlen(BNXT_RE_DESC) + 5);
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 5b410605a3a1..1b35941eee74 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1236,6 +1236,7 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str)
 }
 
 static const struct ib_device_ops iwch_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_CXGB3,
 	.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION,
 
@@ -1285,7 +1286,6 @@ int iwch_register_device(struct iwch_dev *dev)
 	pr_debug("%s iwch_dev %p\n", __func__, dev);
 	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
 	memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
-	dev->ibdev.owner = THIS_MODULE;
 	dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
 				IB_DEVICE_MEM_WINDOW |
 				IB_DEVICE_MEM_MGT_EXTENSIONS;
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index c56cdfbd8a88..2b1f2443b7da 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -489,6 +489,7 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res)
 }
 
 static const struct ib_device_ops c4iw_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_CXGB4,
 	.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION,
 
@@ -563,7 +564,6 @@ void c4iw_register_device(struct work_struct *work)
 	pr_debug("c4iw_dev %p\n", dev);
 	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
 	memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
-	dev->ibdev.owner = THIS_MODULE;
 	dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
 	if (fastreg_support)
 		dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index b05c5a0b9bc0..b891ee239a67 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -197,6 +197,7 @@ static void efa_stats_init(struct efa_dev *dev)
 }
 
 static const struct ib_device_ops efa_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_EFA,
 	.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION,
 
@@ -262,7 +263,6 @@ static int efa_ib_device_add(struct efa_dev *dev)
 	if (err)
 		goto err_release_doorbell_bar;
 
-	dev->ibdev.owner = THIS_MODULE;
 	dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED;
 	dev->ibdev.phys_port_cnt = 1;
 	dev->ibdev.num_comp_vectors = 1;
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index a97f4f9e5c6a..1f36db98240f 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -1779,6 +1779,7 @@ static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
 }
 
 static const struct ib_device_ops hfi1_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_HFI1,
 
 	.alloc_hw_stats = alloc_hw_stats,
@@ -1831,7 +1832,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	 */
 	if (!ib_hfi1_sys_image_guid)
 		ib_hfi1_sys_image_guid = ibdev->node_guid;
-	ibdev->owner = THIS_MODULE;
 	ibdev->phys_port_cnt = dd->num_pports;
 	ibdev->dev.parent = &dd->pcidev->dev;
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index e496b0628e25..f07b2ec86ec2 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -414,6 +414,7 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
 }
 
 static const struct ib_device_ops hns_roce_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_HNS,
 	.uverbs_abi_ver = 1,
 
@@ -483,7 +484,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 
 	ib_dev = &hr_dev->ib_dev;
 
-	ib_dev->owner			= THIS_MODULE;
 	ib_dev->node_type		= RDMA_NODE_IB_CA;
 	ib_dev->dev.parent		= dev;
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 4dc647c8556b..bfe16e6f04f4 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -2650,6 +2650,7 @@ static int i40iw_query_pkey(struct ib_device *ibdev,
 }
 
 static const struct ib_device_ops i40iw_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_I40IW,
 	/* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */
 	.uverbs_abi_ver = I40IW_ABI_VER,
@@ -2711,7 +2712,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
 		i40iw_pr_err("iwdev == NULL\n");
 		return NULL;
 	}
-	iwibdev->ibdev.owner = THIS_MODULE;
 	iwdev->iwibdev = iwibdev;
 	iwibdev->iwdev = iwdev;
 
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 1f87221acbb0..5d7a87842291 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2510,6 +2510,7 @@ static void get_fw_ver_str(struct ib_device *device, char *str)
 }
 
 static const struct ib_device_ops mlx4_ib_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_MLX4,
 	.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION,
 
@@ -2646,7 +2647,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->dev = dev;
 	ibdev->bond_next_port	= 0;
 
-	ibdev->ib_dev.owner		= THIS_MODULE;
 	ibdev->ib_dev.node_type		= RDMA_NODE_IB_CA;
 	ibdev->ib_dev.local_dma_lkey	= dev->caps.reserved_lkey;
 	ibdev->num_ports		= num_ports;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 0c23eccb8855..1e3d936ed809 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6044,7 +6044,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
 	if (mlx5_use_mad_ifc(dev))
 		get_ext_port_caps(dev);
 
-	dev->ib_dev.owner		= THIS_MODULE;
 	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
 	dev->ib_dev.local_dma_lkey	= 0 /* not supported for now */;
 	dev->ib_dev.phys_port_cnt	= dev->num_ports;
@@ -6124,6 +6123,7 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
 }
 
 static const struct ib_device_ops mlx5_ib_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_MLX5,
 	.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION,
 
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 690c65accea4..b128ff76f709 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -1153,6 +1153,7 @@ static void get_dev_fw_str(struct ib_device *device, char *str)
 }
 
 static const struct ib_device_ops mthca_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_MTHCA,
 	.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION,
 
@@ -1246,8 +1247,6 @@ int mthca_register_device(struct mthca_dev *dev)
 	if (ret)
 		return ret;
 
-	dev->ib_dev.owner                = THIS_MODULE;
-
 	dev->ib_dev.uverbs_cmd_mask	 =
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index f1fdd6829a40..db044b2eaead 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -3558,6 +3558,7 @@ static void get_dev_fw_str(struct ib_device *dev, char *str)
 }
 
 static const struct ib_device_ops nes_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_NES,
 	/* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */
 	.uverbs_abi_ver = NES_ABI_USERSPACE_VER,
@@ -3617,7 +3618,6 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
 	if (nesibdev == NULL) {
 		return NULL;
 	}
-	nesibdev->ibdev.owner = THIS_MODULE;
 
 	nesibdev->ibdev.node_type = RDMA_NODE_RNIC;
 	memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid));
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index ef823f1144b5..b326313d413f 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -144,6 +144,7 @@ static const struct attribute_group ocrdma_attr_group = {
 };
 
 static const struct ib_device_ops ocrdma_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_OCRDMA,
 	.uverbs_abi_ver = OCRDMA_ABI_VERSION,
 
@@ -203,7 +204,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 	BUILD_BUG_ON(sizeof(OCRDMA_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX);
 	memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC,
 	       sizeof(OCRDMA_NODE_DESC));
-	dev->ibdev.owner = THIS_MODULE;
 	dev->ibdev.uverbs_cmd_mask =
 	    OCRDMA_UVERBS(GET_CONTEXT) |
 	    OCRDMA_UVERBS(QUERY_DEVICE) |
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 793e25776a7e..a0bb07ba0f3c 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -183,6 +183,7 @@ static void qedr_roce_register_device(struct qedr_dev *dev)
 }
 
 static const struct ib_device_ops qedr_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_QEDR,
 	.uverbs_abi_ver = QEDR_ABI_VERSION,
 
@@ -234,7 +235,6 @@ static int qedr_register_device(struct qedr_dev *dev)
 
 	dev->ibdev.node_guid = dev->attr.node_guid;
 	memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC));
-	dev->ibdev.owner = THIS_MODULE;
 
 	dev->ibdev.uverbs_cmd_mask = QEDR_UVERBS(GET_CONTEXT) |
 				     QEDR_UVERBS(QUERY_DEVICE) |
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index bbc331d7f49b..54310fd6c7b6 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -1482,6 +1482,7 @@ static void qib_fill_device_attr(struct qib_devdata *dd)
 }
 
 static const struct ib_device_ops qib_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_QIB,
 
 	.init_port = qib_create_port_files,
@@ -1547,7 +1548,6 @@ int qib_register_ib_device(struct qib_devdata *dd)
 	if (!ib_qib_sys_image_guid)
 		ib_qib_sys_image_guid = ppd->guid;
 
-	ibdev->owner = THIS_MODULE;
 	ibdev->node_guid = ppd->guid;
 	ibdev->phys_port_cnt = dd->num_pports;
 	ibdev->dev.parent = &dd->pcidev->dev;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index f61690816095..e701322dc740 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -329,6 +329,7 @@ static void usnic_get_dev_fw_str(struct ib_device *device, char *str)
 }
 
 static const struct ib_device_ops usnic_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_USNIC,
 	.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION,
 
@@ -387,7 +388,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
 
 	us_ibdev->pdev = dev;
 	us_ibdev->netdev = pci_get_drvdata(dev);
-	us_ibdev->ib_dev.owner = THIS_MODULE;
 	us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP;
 	us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT;
 	us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS;
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index 2a7eb2838453..0c48464ffff1 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -144,6 +144,7 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num,
 }
 
 static const struct ib_device_ops pvrdma_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_VMW_PVRDMA,
 	.uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION,
 
@@ -201,7 +202,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev)
 	dev->ib_dev.node_guid = dev->dsr->caps.node_guid;
 	dev->sys_image_guid = dev->dsr->caps.sys_image_guid;
 	dev->flags = 0;
-	dev->ib_dev.owner = THIS_MODULE;
 	dev->ib_dev.num_comp_vectors = 1;
 	dev->ib_dev.dev.parent = &dev->pdev->dev;
 	dev->ib_dev.uverbs_cmd_mask =
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 9e87cdb82bec..046129393215 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1111,6 +1111,7 @@ static int rxe_enable_driver(struct ib_device *ib_dev)
 }
 
 static const struct ib_device_ops rxe_dev_ops = {
+	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_RXE,
 	.uverbs_abi_ver = RXE_UVERBS_ABI_VERSION,
 
@@ -1173,7 +1174,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
 
 	strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc));
 
-	dev->owner = THIS_MODULE;
 	dev->node_type = RDMA_NODE_IB_CA;
 	dev->phys_port_cnt = 1;
 	dev->num_comp_vectors = num_possible_cpus();
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 16405b9bca13..d1f16a6c4810 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2329,6 +2329,7 @@ struct iw_cm_conn_param;
  * need to define the supported operations, otherwise they will be set to null.
  */
 struct ib_device_ops {
+	struct module *owner;
 	enum rdma_driver_id driver_id;
 	u32 uverbs_abi_ver;
 
@@ -2639,7 +2640,6 @@ struct ib_device {
 
 	int			      num_comp_vectors;
 
-	struct module               *owner;
 	union {
 		struct device		dev;
 		struct ib_core_device	coredev;
-- 
cgit v1.2.3


From 75f3f70f0462f443283e66d8448bed9330073178 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Sat, 8 Jun 2019 10:02:35 +0200
Subject: drm: drm_crtc.h self-contained

While removing drmP.h from drm/radeon a few files ended
up including drm_crtc.h as the first file.
This failed build due to a missing dependency in drm_crtc.h.

Add the missing include file.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <maxime.ripard@bootlin.com>
Cc: Sean Paul <sean@poorly.run>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20190608080241.4958-2-sam@ravnborg.org
---
 include/drm/drm_crtc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h
index 58ad983d7cd6..dc42b9e35333 100644
--- a/include/drm/drm_crtc.h
+++ b/include/drm/drm_crtc.h
@@ -39,6 +39,7 @@
 #include <drm/drm_framebuffer.h>
 #include <drm/drm_modes.h>
 #include <drm/drm_connector.h>
+#include <drm/drm_device.h>
 #include <drm/drm_property.h>
 #include <drm/drm_bridge.h>
 #include <drm/drm_edid.h>
-- 
cgit v1.2.3


From a7d469cc99b16cfda51d1c8a5ea250119c3e220a Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Sat, 8 Jun 2019 10:02:36 +0200
Subject: drm: drm_debugfs.h self-contained

While removing drmP.h from drm/radeon a few files ended
up including drm_debugfs.h as the first file.
This failed build due to missing dependencies in drm_debugfs.h.

Add the missing include files.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <maxime.ripard@bootlin.com>
Cc: Sean Paul <sean@poorly.run>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20190608080241.4958-3-sam@ravnborg.org
---
 include/drm/drm_debugfs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/drm/drm_debugfs.h b/include/drm/drm_debugfs.h
index ac0f75df1ac9..7501e323d383 100644
--- a/include/drm/drm_debugfs.h
+++ b/include/drm/drm_debugfs.h
@@ -32,6 +32,8 @@
 #ifndef _DRM_DEBUGFS_H_
 #define _DRM_DEBUGFS_H_
 
+#include <linux/types.h>
+#include <linux/seq_file.h>
 /**
  * struct drm_info_list - debugfs info list entry
  *
-- 
cgit v1.2.3


From 656600efd67b9b185bc1c85ec68d9420c2bbb812 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Mon, 10 Jun 2019 00:07:48 +0200
Subject: drm: fix build errors with drm_print.h

drm_print.h requires <drm/drm.h> to fix build when macros are used.
Pull in the header file in drm_print.h so users do not have to do it.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <maxime.ripard@bootlin.com>
Cc: Sean Paul <sean@poorly.run>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20190609220757.10862-2-sam@ravnborg.org
---
 include/drm/drm_print.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
index 3a4247319e63..a5d6f2f3e430 100644
--- a/include/drm/drm_print.h
+++ b/include/drm/drm_print.h
@@ -32,6 +32,8 @@
 #include <linux/device.h>
 #include <linux/debugfs.h>
 
+#include <drm/drm.h>
+
 /**
  * DOC: print
  *
-- 
cgit v1.2.3


From a987be182c309f565a2d3545f2e9c9a4f8d07a4d Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Mon, 20 May 2019 22:03:16 +0200
Subject: dt-bindings: clock: meson8b: add the audio clocks

The audio controllers on Meson8, Meson8b and Meson8m2 use similar
(potentially the same) audio clocks as GXBB, GXL and GXM. Add the
CLKID_CTS_AMCLK, CLKID_CTS_MCLK_I958 and CLKID_CTS_I958 clock IDs so
they can be used for the audio controllers.

Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
---
 include/dt-bindings/clock/meson8b-clkc.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/meson8b-clkc.h b/include/dt-bindings/clock/meson8b-clkc.h
index 47556539f0ee..68862aaf977e 100644
--- a/include/dt-bindings/clock/meson8b-clkc.h
+++ b/include/dt-bindings/clock/meson8b-clkc.h
@@ -112,5 +112,8 @@
 #define CLKID_VDEC_HCODEC	199
 #define CLKID_VDEC_2		202
 #define CLKID_VDEC_HEVC		206
+#define CLKID_CTS_AMCLK		209
+#define CLKID_CTS_MCLK_I958	212
+#define CLKID_CTS_I958		213
 
 #endif /* __MESON8B_CLKC_H */
-- 
cgit v1.2.3


From 6e47ef34db571e3eebda46ddaddae00d369df5f9 Mon Sep 17 00:00:00 2001
From: Guillaume La Roque <glaroque@baylibre.com>
Date: Fri, 12 Apr 2019 12:02:20 +0200
Subject: dt-bindings: clk: g12a-clkc: add Temperature Sensor clock IDs

Add clock ids used by the temperature sensors of the G12A Socs

Reviewed-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Guillaume La Roque <glaroque@baylibre.com>
Signed-off-by: Jerome Brunet <jbrunet@baylibre.com> [fixed commit message]
---
 include/dt-bindings/clock/g12a-clkc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/g12a-clkc.h b/include/dt-bindings/clock/g12a-clkc.h
index 82c9e0c020b2..cfda463091b7 100644
--- a/include/dt-bindings/clock/g12a-clkc.h
+++ b/include/dt-bindings/clock/g12a-clkc.h
@@ -136,5 +136,6 @@
 #define CLKID_VDEC_1				204
 #define CLKID_VDEC_HEVC				207
 #define CLKID_VDEC_HEVCF			210
+#define CLKID_TS				212
 
 #endif /* __G12A_CLKC_H */
-- 
cgit v1.2.3


From e5852bee90d6cb7d9bd2c635c056e7746f137e06 Mon Sep 17 00:00:00 2001
From: Noralf Trønnes <noralf@tronnes.org>
Date: Sat, 8 Jun 2019 17:26:53 +0200
Subject: drm/fb-helper: Remove drm_fb_helper_connector
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All drivers add all their connectors so there's no need to keep around an
array of available connectors. Instead we just put the useable (not
writeback) connectors in a temporary array using
drm_client_for_each_connector_iter() everytime we probe the outputs.
Other places where it's necessary to look at the connectors, we just
iterate over them using the same iterator function.

Rename functions which signature is changed since they will be moved to
drm_client in a later patch.

v6: Improve commit message (Sam Ravnborg)

Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190608152657.36613-2-noralf@tronnes.org
---
 Documentation/gpu/todo.rst      |   4 +
 drivers/gpu/drm/drm_fb_helper.c | 506 +++++++++++++---------------------------
 include/drm/drm_client.h        |  15 ++
 include/drm/drm_fb_helper.h     |  80 ++-----
 4 files changed, 197 insertions(+), 408 deletions(-)

(limited to 'include')

diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst
index 9d4038c50013..ab96ba0600a9 100644
--- a/Documentation/gpu/todo.rst
+++ b/Documentation/gpu/todo.rst
@@ -292,6 +292,10 @@ drm_fb_helper tasks
 - The max connector argument for drm_fb_helper_init() and
   drm_fb_helper_fbdev_setup() isn't used anymore and can be removed.
 
+- The helper doesn't keep an array of connectors anymore so these can be
+  removed: drm_fb_helper_single_add_all_connectors(),
+  drm_fb_helper_add_one_connector() and drm_fb_helper_remove_one_connector().
+
 Core refactorings
 =================
 
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 7b388674a456..b936db6280ac 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -95,12 +95,6 @@ static DEFINE_MUTEX(kernel_fb_helper_lock);
  * Setup fbdev emulation by calling drm_fb_helper_fbdev_setup() and tear it
  * down by calling drm_fb_helper_fbdev_teardown().
  *
- * Drivers that need to handle connector hotplugging (e.g. dp mst) can't use
- * the setup helper and will need to do the whole four-step setup process with
- * drm_fb_helper_prepare(), drm_fb_helper_init(),
- * drm_fb_helper_single_add_all_connectors(), enable hotplugging and
- * drm_fb_helper_initial_config() to avoid a possible race window.
- *
  * At runtime drivers should restore the fbdev console by using
  * drm_fb_helper_lastclose() as their &drm_driver.lastclose callback.
  * They should also notify the fb helper code from updates to the output
@@ -123,8 +117,7 @@ static DEFINE_MUTEX(kernel_fb_helper_lock);
  * encoders and connectors. To finish up the fbdev helper initialization, the
  * drm_fb_helper_init() function is called. To probe for all attached displays
  * and set up an initial configuration using the detected hardware, drivers
- * should call drm_fb_helper_single_add_all_connectors() followed by
- * drm_fb_helper_initial_config().
+ * should call drm_fb_helper_initial_config().
  *
  * If &drm_framebuffer_funcs.dirty is set, the
  * drm_fb_helper_{cfb,sys}_{write,fillrect,copyarea,imageblit} functions will
@@ -137,165 +130,6 @@ static DEFINE_MUTEX(kernel_fb_helper_lock);
  * deferred I/O (coupled with drm_fb_helper_fbdev_teardown()).
  */
 
-#define drm_fb_helper_for_each_connector(fbh, i__) \
-	for (({ lockdep_assert_held(&(fbh)->lock); }), \
-	     i__ = 0; i__ < (fbh)->connector_count; i__++)
-
-static int __drm_fb_helper_add_one_connector(struct drm_fb_helper *fb_helper,
-					     struct drm_connector *connector)
-{
-	struct drm_fb_helper_connector *fb_conn;
-	struct drm_fb_helper_connector **temp;
-	unsigned int count;
-
-	if (!drm_fbdev_emulation)
-		return 0;
-
-	lockdep_assert_held(&fb_helper->lock);
-
-	count = fb_helper->connector_count + 1;
-
-	if (count > fb_helper->connector_info_alloc_count) {
-		size_t size = count * sizeof(fb_conn);
-
-		temp = krealloc(fb_helper->connector_info, size, GFP_KERNEL);
-		if (!temp)
-			return -ENOMEM;
-
-		fb_helper->connector_info_alloc_count = count;
-		fb_helper->connector_info = temp;
-	}
-
-	fb_conn = kzalloc(sizeof(*fb_conn), GFP_KERNEL);
-	if (!fb_conn)
-		return -ENOMEM;
-
-	drm_connector_get(connector);
-	fb_conn->connector = connector;
-	fb_helper->connector_info[fb_helper->connector_count++] = fb_conn;
-
-	return 0;
-}
-
-int drm_fb_helper_add_one_connector(struct drm_fb_helper *fb_helper,
-				    struct drm_connector *connector)
-{
-	int err;
-
-	if (!fb_helper)
-		return 0;
-
-	mutex_lock(&fb_helper->lock);
-	err = __drm_fb_helper_add_one_connector(fb_helper, connector);
-	mutex_unlock(&fb_helper->lock);
-
-	return err;
-}
-EXPORT_SYMBOL(drm_fb_helper_add_one_connector);
-
-/**
- * drm_fb_helper_single_add_all_connectors() - add all connectors to fbdev
- * 					       emulation helper
- * @fb_helper: fbdev initialized with drm_fb_helper_init, can be NULL
- *
- * This functions adds all the available connectors for use with the given
- * fb_helper. This is a separate step to allow drivers to freely assign
- * connectors to the fbdev, e.g. if some are reserved for special purposes or
- * not adequate to be used for the fbcon.
- *
- * This function is protected against concurrent connector hotadds/removals
- * using drm_fb_helper_add_one_connector() and
- * drm_fb_helper_remove_one_connector().
- */
-int drm_fb_helper_single_add_all_connectors(struct drm_fb_helper *fb_helper)
-{
-	struct drm_device *dev;
-	struct drm_connector *connector;
-	struct drm_connector_list_iter conn_iter;
-	int i, ret = 0;
-
-	if (!drm_fbdev_emulation || !fb_helper)
-		return 0;
-
-	dev = fb_helper->dev;
-
-	mutex_lock(&fb_helper->lock);
-	drm_connector_list_iter_begin(dev, &conn_iter);
-	drm_for_each_connector_iter(connector, &conn_iter) {
-		if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK)
-			continue;
-
-		ret = __drm_fb_helper_add_one_connector(fb_helper, connector);
-		if (ret)
-			goto fail;
-	}
-	goto out;
-
-fail:
-	drm_fb_helper_for_each_connector(fb_helper, i) {
-		struct drm_fb_helper_connector *fb_helper_connector =
-			fb_helper->connector_info[i];
-
-		drm_connector_put(fb_helper_connector->connector);
-
-		kfree(fb_helper_connector);
-		fb_helper->connector_info[i] = NULL;
-	}
-	fb_helper->connector_count = 0;
-out:
-	drm_connector_list_iter_end(&conn_iter);
-	mutex_unlock(&fb_helper->lock);
-
-	return ret;
-}
-EXPORT_SYMBOL(drm_fb_helper_single_add_all_connectors);
-
-static int __drm_fb_helper_remove_one_connector(struct drm_fb_helper *fb_helper,
-						struct drm_connector *connector)
-{
-	struct drm_fb_helper_connector *fb_helper_connector;
-	int i, j;
-
-	if (!drm_fbdev_emulation)
-		return 0;
-
-	lockdep_assert_held(&fb_helper->lock);
-
-	drm_fb_helper_for_each_connector(fb_helper, i) {
-		if (fb_helper->connector_info[i]->connector == connector)
-			break;
-	}
-
-	if (i == fb_helper->connector_count)
-		return -EINVAL;
-	fb_helper_connector = fb_helper->connector_info[i];
-	drm_connector_put(fb_helper_connector->connector);
-
-	for (j = i + 1; j < fb_helper->connector_count; j++)
-		fb_helper->connector_info[j - 1] = fb_helper->connector_info[j];
-
-	fb_helper->connector_count--;
-	kfree(fb_helper_connector);
-
-	return 0;
-}
-
-int drm_fb_helper_remove_one_connector(struct drm_fb_helper *fb_helper,
-				       struct drm_connector *connector)
-{
-	int err;
-
-	if (!fb_helper)
-		return 0;
-
-	mutex_lock(&fb_helper->lock);
-	err = __drm_fb_helper_remove_one_connector(fb_helper, connector);
-	mutex_unlock(&fb_helper->lock);
-
-	return err;
-}
-EXPORT_SYMBOL(drm_fb_helper_remove_one_connector);
-
 static void drm_fb_helper_restore_lut_atomic(struct drm_crtc *crtc)
 {
 	uint16_t *r_base, *g_base, *b_base;
@@ -645,20 +479,9 @@ int drm_fb_helper_init(struct drm_device *dev,
 			return ret;
 	}
 
-	fb_helper->connector_info = kcalloc(dev->mode_config.num_connector, sizeof(struct drm_fb_helper_connector *), GFP_KERNEL);
-	if (!fb_helper->connector_info)
-		goto out_free;
-
-	fb_helper->connector_info_alloc_count = dev->mode_config.num_connector;
-	fb_helper->connector_count = 0;
-
 	dev->fb_helper = fb_helper;
 
 	return 0;
-out_free:
-	drm_client_release(&fb_helper->client);
-
-	return -ENOMEM;
 }
 EXPORT_SYMBOL(drm_fb_helper_init);
 
@@ -733,7 +556,6 @@ EXPORT_SYMBOL(drm_fb_helper_unregister_fbi);
 void drm_fb_helper_fini(struct drm_fb_helper *fb_helper)
 {
 	struct fb_info *info;
-	int i;
 
 	if (!fb_helper)
 		return;
@@ -766,12 +588,6 @@ void drm_fb_helper_fini(struct drm_fb_helper *fb_helper)
 
 	if (!fb_helper->client.funcs)
 		drm_client_release(&fb_helper->client);
-
-	for (i = 0; i < fb_helper->connector_count; i++) {
-		drm_connector_put(fb_helper->connector_info[i]->connector);
-		kfree(fb_helper->connector_info[i]);
-	}
-	kfree(fb_helper->connector_info);
 }
 EXPORT_SYMBOL(drm_fb_helper_fini);
 
@@ -1650,8 +1466,9 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper,
 	struct drm_client_dev *client = &fb_helper->client;
 	int ret = 0;
 	int crtc_count = 0;
-	int i;
+	struct drm_connector_list_iter conn_iter;
 	struct drm_fb_helper_surface_size sizes;
+	struct drm_connector *connector;
 	struct drm_mode_set *mode_set;
 	int best_depth = 0;
 
@@ -1668,11 +1485,11 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper,
 	if (preferred_bpp != sizes.surface_bpp)
 		sizes.surface_depth = sizes.surface_bpp = preferred_bpp;
 
-	drm_fb_helper_for_each_connector(fb_helper, i) {
-		struct drm_fb_helper_connector *fb_helper_conn = fb_helper->connector_info[i];
+	drm_connector_list_iter_begin(fb_helper->dev, &conn_iter);
+	drm_client_for_each_connector_iter(connector, &conn_iter) {
 		struct drm_cmdline_mode *cmdline_mode;
 
-		cmdline_mode = &fb_helper_conn->connector->cmdline_mode;
+		cmdline_mode = &connector->cmdline_mode;
 
 		if (cmdline_mode->bpp_specified) {
 			switch (cmdline_mode->bpp) {
@@ -1697,6 +1514,7 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper,
 			break;
 		}
 	}
+	drm_connector_list_iter_end(&conn_iter);
 
 	/*
 	 * If we run into a situation where, for example, the primary plane
@@ -1881,26 +1699,12 @@ void drm_fb_helper_fill_info(struct fb_info *info,
 }
 EXPORT_SYMBOL(drm_fb_helper_fill_info);
 
-static int drm_fb_helper_probe_connector_modes(struct drm_fb_helper *fb_helper,
-						uint32_t maxX,
-						uint32_t maxY)
-{
-	struct drm_connector *connector;
-	int i, count = 0;
-
-	drm_fb_helper_for_each_connector(fb_helper, i) {
-		connector = fb_helper->connector_info[i]->connector;
-		count += connector->funcs->fill_modes(connector, maxX, maxY);
-	}
-
-	return count;
-}
-
-struct drm_display_mode *drm_has_preferred_mode(struct drm_fb_helper_connector *fb_connector, int width, int height)
+static struct drm_display_mode *
+drm_connector_has_preferred_mode(struct drm_connector *connector, int width, int height)
 {
 	struct drm_display_mode *mode;
 
-	list_for_each_entry(mode, &fb_connector->connector->modes, head) {
+	list_for_each_entry(mode, &connector->modes, head) {
 		if (mode->hdisplay > width ||
 		    mode->vdisplay > height)
 			continue;
@@ -1909,20 +1713,15 @@ struct drm_display_mode *drm_has_preferred_mode(struct drm_fb_helper_connector *
 	}
 	return NULL;
 }
-EXPORT_SYMBOL(drm_has_preferred_mode);
-
-static bool drm_has_cmdline_mode(struct drm_fb_helper_connector *fb_connector)
-{
-	return fb_connector->connector->cmdline_mode.specified;
-}
 
-struct drm_display_mode *drm_pick_cmdline_mode(struct drm_fb_helper_connector *fb_helper_conn)
+static struct drm_display_mode *
+drm_connector_pick_cmdline_mode(struct drm_connector *connector)
 {
 	struct drm_cmdline_mode *cmdline_mode;
 	struct drm_display_mode *mode;
 	bool prefer_non_interlace;
 
-	cmdline_mode = &fb_helper_conn->connector->cmdline_mode;
+	cmdline_mode = &connector->cmdline_mode;
 	if (cmdline_mode->specified == false)
 		return NULL;
 
@@ -1934,7 +1733,7 @@ struct drm_display_mode *drm_pick_cmdline_mode(struct drm_fb_helper_connector *f
 
 	prefer_non_interlace = !cmdline_mode->interlace;
 again:
-	list_for_each_entry(mode, &fb_helper_conn->connector->modes, head) {
+	list_for_each_entry(mode, &connector->modes, head) {
 		/* check width/height */
 		if (mode->hdisplay != cmdline_mode->xres ||
 		    mode->vdisplay != cmdline_mode->yres)
@@ -1961,12 +1760,11 @@ again:
 	}
 
 create_mode:
-	mode = drm_mode_create_from_cmdline_mode(fb_helper_conn->connector->dev,
-						 cmdline_mode);
-	list_add(&mode->head, &fb_helper_conn->connector->modes);
+	mode = drm_mode_create_from_cmdline_mode(connector->dev, cmdline_mode);
+	list_add(&mode->head, &connector->modes);
+
 	return mode;
 }
-EXPORT_SYMBOL(drm_pick_cmdline_mode);
 
 static bool drm_connector_enabled(struct drm_connector *connector, bool strict)
 {
@@ -1983,15 +1781,16 @@ static bool drm_connector_enabled(struct drm_connector *connector, bool strict)
 	return enable;
 }
 
-static void drm_enable_connectors(struct drm_fb_helper *fb_helper,
-				  bool *enabled)
+static void drm_client_connectors_enabled(struct drm_connector **connectors,
+					  unsigned int connector_count,
+					  bool *enabled)
 {
 	bool any_enabled = false;
 	struct drm_connector *connector;
 	int i = 0;
 
-	drm_fb_helper_for_each_connector(fb_helper, i) {
-		connector = fb_helper->connector_info[i]->connector;
+	for (i = 0; i < connector_count; i++) {
+		connector = connectors[i];
 		enabled[i] = drm_connector_enabled(connector, true);
 		DRM_DEBUG_KMS("connector %d enabled? %s\n", connector->base.id,
 			      connector->display_info.non_desktop ? "non desktop" : enabled[i] ? "yes" : "no");
@@ -2002,28 +1801,27 @@ static void drm_enable_connectors(struct drm_fb_helper *fb_helper,
 	if (any_enabled)
 		return;
 
-	drm_fb_helper_for_each_connector(fb_helper, i) {
-		connector = fb_helper->connector_info[i]->connector;
-		enabled[i] = drm_connector_enabled(connector, false);
-	}
+	for (i = 0; i < connector_count; i++)
+		enabled[i] = drm_connector_enabled(connectors[i], false);
 }
 
-static bool drm_target_cloned(struct drm_fb_helper *fb_helper,
-			      struct drm_display_mode **modes,
-			      struct drm_fb_offset *offsets,
-			      bool *enabled, int width, int height)
+static bool drm_client_target_cloned(struct drm_device *dev,
+				     struct drm_connector **connectors,
+				     unsigned int connector_count,
+				     struct drm_display_mode **modes,
+				     struct drm_fb_offset *offsets,
+				     bool *enabled, int width, int height)
 {
 	int count, i, j;
 	bool can_clone = false;
-	struct drm_fb_helper_connector *fb_helper_conn;
 	struct drm_display_mode *dmt_mode, *mode;
 
 	/* only contemplate cloning in the single crtc case */
-	if (fb_helper->dev->mode_config.num_crtc > 1)
+	if (dev->mode_config.num_crtc > 1)
 		return false;
 
 	count = 0;
-	drm_fb_helper_for_each_connector(fb_helper, i) {
+	for (i = 0; i < connector_count; i++) {
 		if (enabled[i])
 			count++;
 	}
@@ -2034,11 +1832,10 @@ static bool drm_target_cloned(struct drm_fb_helper *fb_helper,
 
 	/* check the command line or if nothing common pick 1024x768 */
 	can_clone = true;
-	drm_fb_helper_for_each_connector(fb_helper, i) {
+	for (i = 0; i < connector_count; i++) {
 		if (!enabled[i])
 			continue;
-		fb_helper_conn = fb_helper->connector_info[i];
-		modes[i] = drm_pick_cmdline_mode(fb_helper_conn);
+		modes[i] = drm_connector_pick_cmdline_mode(connectors[i]);
 		if (!modes[i]) {
 			can_clone = false;
 			break;
@@ -2062,14 +1859,13 @@ static bool drm_target_cloned(struct drm_fb_helper *fb_helper,
 
 	/* try and find a 1024x768 mode on each connector */
 	can_clone = true;
-	dmt_mode = drm_mode_find_dmt(fb_helper->dev, 1024, 768, 60, false);
+	dmt_mode = drm_mode_find_dmt(dev, 1024, 768, 60, false);
 
-	drm_fb_helper_for_each_connector(fb_helper, i) {
+	for (i = 0; i < connector_count; i++) {
 		if (!enabled[i])
 			continue;
 
-		fb_helper_conn = fb_helper->connector_info[i];
-		list_for_each_entry(mode, &fb_helper_conn->connector->modes, head) {
+		list_for_each_entry(mode, &connectors[i]->modes, head) {
 			if (drm_mode_match(mode, dmt_mode,
 					   DRM_MODE_MATCH_TIMINGS |
 					   DRM_MODE_MATCH_CLOCK |
@@ -2089,30 +1885,31 @@ static bool drm_target_cloned(struct drm_fb_helper *fb_helper,
 	return false;
 }
 
-static int drm_get_tile_offsets(struct drm_fb_helper *fb_helper,
-				struct drm_display_mode **modes,
-				struct drm_fb_offset *offsets,
-				int idx,
-				int h_idx, int v_idx)
+static int drm_client_get_tile_offsets(struct drm_connector **connectors,
+				       unsigned int connector_count,
+				       struct drm_display_mode **modes,
+				       struct drm_fb_offset *offsets,
+				       int idx,
+				       int h_idx, int v_idx)
 {
-	struct drm_fb_helper_connector *fb_helper_conn;
+	struct drm_connector *connector;
 	int i;
 	int hoffset = 0, voffset = 0;
 
-	drm_fb_helper_for_each_connector(fb_helper, i) {
-		fb_helper_conn = fb_helper->connector_info[i];
-		if (!fb_helper_conn->connector->has_tile)
+	for (i = 0; i < connector_count; i++) {
+		connector = connectors[i];
+		if (!connector->has_tile)
 			continue;
 
 		if (!modes[i] && (h_idx || v_idx)) {
 			DRM_DEBUG_KMS("no modes for connector tiled %d %d\n", i,
-				      fb_helper_conn->connector->base.id);
+				      connector->base.id);
 			continue;
 		}
-		if (fb_helper_conn->connector->tile_h_loc < h_idx)
+		if (connector->tile_h_loc < h_idx)
 			hoffset += modes[i]->hdisplay;
 
-		if (fb_helper_conn->connector->tile_v_loc < v_idx)
+		if (connector->tile_v_loc < v_idx)
 			voffset += modes[i]->vdisplay;
 	}
 	offsets[idx].x = hoffset;
@@ -2121,20 +1918,21 @@ static int drm_get_tile_offsets(struct drm_fb_helper *fb_helper,
 	return 0;
 }
 
-static bool drm_target_preferred(struct drm_fb_helper *fb_helper,
-				 struct drm_display_mode **modes,
-				 struct drm_fb_offset *offsets,
-				 bool *enabled, int width, int height)
+static bool drm_client_target_preferred(struct drm_connector **connectors,
+					unsigned int connector_count,
+					struct drm_display_mode **modes,
+					struct drm_fb_offset *offsets,
+					bool *enabled, int width, int height)
 {
-	struct drm_fb_helper_connector *fb_helper_conn;
-	const u64 mask = BIT_ULL(fb_helper->connector_count) - 1;
+	const u64 mask = BIT_ULL(connector_count) - 1;
+	struct drm_connector *connector;
 	u64 conn_configured = 0;
 	int tile_pass = 0;
 	int i;
 
 retry:
-	drm_fb_helper_for_each_connector(fb_helper, i) {
-		fb_helper_conn = fb_helper->connector_info[i];
+	for (i = 0; i < connector_count; i++) {
+		connector = connectors[i];
 
 		if (conn_configured & BIT_ULL(i))
 			continue;
@@ -2145,17 +1943,17 @@ retry:
 		}
 
 		/* first pass over all the untiled connectors */
-		if (tile_pass == 0 && fb_helper_conn->connector->has_tile)
+		if (tile_pass == 0 && connector->has_tile)
 			continue;
 
 		if (tile_pass == 1) {
-			if (fb_helper_conn->connector->tile_h_loc != 0 ||
-			    fb_helper_conn->connector->tile_v_loc != 0)
+			if (connector->tile_h_loc != 0 ||
+			    connector->tile_v_loc != 0)
 				continue;
 
 		} else {
-			if (fb_helper_conn->connector->tile_h_loc != tile_pass - 1 &&
-			    fb_helper_conn->connector->tile_v_loc != tile_pass - 1)
+			if (connector->tile_h_loc != tile_pass - 1 &&
+			    connector->tile_v_loc != tile_pass - 1)
 			/* if this tile_pass doesn't cover any of the tiles - keep going */
 				continue;
 
@@ -2163,22 +1961,22 @@ retry:
 			 * find the tile offsets for this pass - need to find
 			 * all tiles left and above
 			 */
-			drm_get_tile_offsets(fb_helper, modes, offsets,
-					     i, fb_helper_conn->connector->tile_h_loc, fb_helper_conn->connector->tile_v_loc);
+			drm_client_get_tile_offsets(connectors, connector_count, modes, offsets, i,
+						    connector->tile_h_loc, connector->tile_v_loc);
 		}
 		DRM_DEBUG_KMS("looking for cmdline mode on connector %d\n",
-			      fb_helper_conn->connector->base.id);
+			      connector->base.id);
 
 		/* got for command line mode first */
-		modes[i] = drm_pick_cmdline_mode(fb_helper_conn);
+		modes[i] = drm_connector_pick_cmdline_mode(connector);
 		if (!modes[i]) {
 			DRM_DEBUG_KMS("looking for preferred mode on connector %d %d\n",
-				      fb_helper_conn->connector->base.id, fb_helper_conn->connector->tile_group ? fb_helper_conn->connector->tile_group->id : 0);
-			modes[i] = drm_has_preferred_mode(fb_helper_conn, width, height);
+				      connector->base.id, connector->tile_group ? connector->tile_group->id : 0);
+			modes[i] = drm_connector_has_preferred_mode(connector, width, height);
 		}
 		/* No preferred modes, pick one off the list */
-		if (!modes[i] && !list_empty(&fb_helper_conn->connector->modes)) {
-			list_for_each_entry(modes[i], &fb_helper_conn->connector->modes, head)
+		if (!modes[i] && !list_empty(&connector->modes)) {
+			list_for_each_entry(modes[i], &connector->modes, head)
 				break;
 		}
 		DRM_DEBUG_KMS("found mode %s\n", modes[i] ? modes[i]->name :
@@ -2207,40 +2005,41 @@ static bool connector_has_possible_crtc(struct drm_connector *connector,
 	return false;
 }
 
-static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
-			  struct drm_crtc **best_crtcs,
-			  struct drm_display_mode **modes,
-			  int n, int width, int height)
+static int drm_client_pick_crtcs(struct drm_client_dev *client,
+				 struct drm_connector **connectors,
+				 unsigned int connector_count,
+				 struct drm_crtc **best_crtcs,
+				 struct drm_display_mode **modes,
+				 int n, int width, int height)
 {
-	struct drm_client_dev *client = &fb_helper->client;
+	struct drm_device *dev = client->dev;
 	struct drm_connector *connector;
 	int my_score, best_score, score;
 	struct drm_crtc **crtcs, *crtc;
 	struct drm_mode_set *modeset;
-	struct drm_fb_helper_connector *fb_helper_conn;
 	int o;
 
-	if (n == fb_helper->connector_count)
+	if (n == connector_count)
 		return 0;
 
-	fb_helper_conn = fb_helper->connector_info[n];
-	connector = fb_helper_conn->connector;
+	connector = connectors[n];
 
 	best_crtcs[n] = NULL;
-	best_score = drm_pick_crtcs(fb_helper, best_crtcs, modes, n+1, width, height);
+	best_score = drm_client_pick_crtcs(client, connectors, connector_count,
+					   best_crtcs, modes, n + 1, width, height);
 	if (modes[n] == NULL)
 		return best_score;
 
-	crtcs = kcalloc(fb_helper->connector_count, sizeof(*crtcs), GFP_KERNEL);
+	crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL);
 	if (!crtcs)
 		return best_score;
 
 	my_score = 1;
 	if (connector->status == connector_status_connected)
 		my_score++;
-	if (drm_has_cmdline_mode(fb_helper_conn))
+	if (connector->cmdline_mode.specified)
 		my_score++;
-	if (drm_has_preferred_mode(fb_helper_conn, width, height))
+	if (drm_connector_has_preferred_mode(connector, width, height))
 		my_score++;
 
 	/*
@@ -2259,7 +2058,7 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
 
 		if (o < n) {
 			/* ignore cloning unless only a single crtc */
-			if (fb_helper->dev->mode_config.num_crtc > 1)
+			if (dev->mode_config.num_crtc > 1)
 				continue;
 
 			if (!drm_mode_equal(modes[o], modes[n]))
@@ -2268,12 +2067,11 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
 
 		crtcs[n] = crtc;
 		memcpy(crtcs, best_crtcs, n * sizeof(*crtcs));
-		score = my_score + drm_pick_crtcs(fb_helper, crtcs, modes, n + 1,
-						  width, height);
+		score = my_score + drm_client_pick_crtcs(client, connectors, connector_count,
+							 crtcs, modes, n + 1, width, height);
 		if (score > best_score) {
 			best_score = score;
-			memcpy(best_crtcs, crtcs,
-			       fb_helper->connector_count * sizeof(*crtcs));
+			memcpy(best_crtcs, crtcs, connector_count * sizeof(*crtcs));
 		}
 	}
 
@@ -2282,15 +2080,17 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
 }
 
 /* Try to read the BIOS display configuration and use it for the initial config */
-static bool drm_fb_helper_firmware_config(struct drm_fb_helper *fb_helper,
-					  struct drm_crtc **crtcs,
-					  struct drm_display_mode **modes,
-					  struct drm_fb_offset *offsets,
-					  bool *enabled, int width, int height)
-{
-	struct drm_device *dev = fb_helper->dev;
-	unsigned int count = min(fb_helper->connector_count, BITS_PER_LONG);
+static bool drm_client_firmware_config(struct drm_client_dev *client,
+				       struct drm_connector **connectors,
+				       unsigned int connector_count,
+				       struct drm_crtc **crtcs,
+				       struct drm_display_mode **modes,
+				       struct drm_fb_offset *offsets,
+				       bool *enabled, int width, int height)
+{
+	unsigned int count = min_t(unsigned int, connector_count, BITS_PER_LONG);
 	unsigned long conn_configured, conn_seq, mask;
+	struct drm_device *dev = client->dev;
 	int i, j;
 	bool *save_enabled;
 	bool fallback = true, ret = true;
@@ -2316,13 +2116,11 @@ static bool drm_fb_helper_firmware_config(struct drm_fb_helper *fb_helper,
 retry:
 	conn_seq = conn_configured;
 	for (i = 0; i < count; i++) {
-		struct drm_fb_helper_connector *fb_conn;
 		struct drm_connector *connector;
 		struct drm_encoder *encoder;
 		struct drm_crtc *new_crtc;
 
-		fb_conn = fb_helper->connector_info[i];
-		connector = fb_conn->connector;
+		connector = connectors[i];
 
 		if (conn_configured & BIT(i))
 			continue;
@@ -2379,14 +2177,13 @@ retry:
 			      connector->name);
 
 		/* go for command line mode first */
-		modes[i] = drm_pick_cmdline_mode(fb_conn);
+		modes[i] = drm_connector_pick_cmdline_mode(connector);
 
 		/* try for preferred next */
 		if (!modes[i]) {
 			DRM_DEBUG_KMS("looking for preferred mode on connector %s %d\n",
 				      connector->name, connector->has_tile);
-			modes[i] = drm_has_preferred_mode(fb_conn, width,
-							  height);
+			modes[i] = drm_connector_has_preferred_mode(connector, width, height);
 		}
 
 		/* No preferred mode marked by the EDID? Are there any modes? */
@@ -2461,8 +2258,12 @@ bail:
 static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 			    u32 width, u32 height)
 {
+	struct drm_connector *connector, **connectors = NULL;
 	struct drm_client_dev *client = &fb_helper->client;
+	struct drm_connector_list_iter conn_iter;
 	struct drm_device *dev = fb_helper->dev;
+	unsigned int total_modes_count = 0;
+	unsigned int connector_count = 0;
 	struct drm_display_mode **modes;
 	struct drm_fb_offset *offsets;
 	struct drm_crtc **crtcs;
@@ -2470,16 +2271,28 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 	int i;
 
 	DRM_DEBUG_KMS("\n");
-	/* prevent concurrent modification of connector_count by hotplug */
-	lockdep_assert_held(&fb_helper->lock);
-
-	crtcs = kcalloc(fb_helper->connector_count, sizeof(*crtcs), GFP_KERNEL);
-	modes = kcalloc(fb_helper->connector_count,
-			sizeof(struct drm_display_mode *), GFP_KERNEL);
-	offsets = kcalloc(fb_helper->connector_count,
-			  sizeof(struct drm_fb_offset), GFP_KERNEL);
-	enabled = kcalloc(fb_helper->connector_count,
-			  sizeof(bool), GFP_KERNEL);
+
+	drm_connector_list_iter_begin(dev, &conn_iter);
+	drm_client_for_each_connector_iter(connector, &conn_iter) {
+		struct drm_connector **tmp;
+
+		tmp = krealloc(connectors, (connector_count + 1) * sizeof(*connectors), GFP_KERNEL);
+		if (!tmp)
+			goto free_connectors;
+
+		connectors = tmp;
+		drm_connector_get(connector);
+		connectors[connector_count++] = connector;
+	}
+	drm_connector_list_iter_end(&conn_iter);
+
+	if (!connector_count)
+		return;
+
+	crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL);
+	modes = kcalloc(connector_count, sizeof(*modes), GFP_KERNEL);
+	offsets = kcalloc(connector_count, sizeof(*offsets), GFP_KERNEL);
+	enabled = kcalloc(connector_count, sizeof(bool), GFP_KERNEL);
 	if (!crtcs || !modes || !enabled || !offsets) {
 		DRM_ERROR("Memory allocation failed\n");
 		goto out;
@@ -2488,40 +2301,42 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 	mutex_lock(&client->modeset_mutex);
 
 	mutex_lock(&fb_helper->dev->mode_config.mutex);
-	if (drm_fb_helper_probe_connector_modes(fb_helper, width, height) == 0)
+	for (i = 0; i < connector_count; i++)
+		total_modes_count += connectors[i]->funcs->fill_modes(connectors[i], width, height);
+	if (!total_modes_count)
 		DRM_DEBUG_KMS("No connectors reported connected with modes\n");
-	drm_enable_connectors(fb_helper, enabled);
-
-	if (!drm_fb_helper_firmware_config(fb_helper, crtcs, modes, offsets,
-					   enabled, width, height)) {
-		memset(modes, 0, fb_helper->connector_count*sizeof(modes[0]));
-		memset(crtcs, 0, fb_helper->connector_count*sizeof(crtcs[0]));
-		memset(offsets, 0, fb_helper->connector_count*sizeof(offsets[0]));
-
-		if (!drm_target_cloned(fb_helper, modes, offsets,
-				       enabled, width, height) &&
-		    !drm_target_preferred(fb_helper, modes, offsets,
-					  enabled, width, height))
+	drm_client_connectors_enabled(connectors, connector_count, enabled);
+
+	if (!drm_client_firmware_config(client, connectors, connector_count, crtcs,
+					modes, offsets, enabled, width, height)) {
+		memset(modes, 0, connector_count * sizeof(*modes));
+		memset(crtcs, 0, connector_count * sizeof(*crtcs));
+		memset(offsets, 0, connector_count * sizeof(*offsets));
+
+		if (!drm_client_target_cloned(dev, connectors, connector_count, modes,
+					      offsets, enabled, width, height) &&
+		    !drm_client_target_preferred(connectors, connector_count, modes,
+						 offsets, enabled, width, height))
 			DRM_ERROR("Unable to find initial modes\n");
 
 		DRM_DEBUG_KMS("picking CRTCs for %dx%d config\n",
 			      width, height);
 
-		drm_pick_crtcs(fb_helper, crtcs, modes, 0, width, height);
+		drm_client_pick_crtcs(client, connectors, connector_count,
+				      crtcs, modes, 0, width, height);
 	}
 	mutex_unlock(&fb_helper->dev->mode_config.mutex);
 
 	drm_client_modeset_release(client);
 
-	drm_fb_helper_for_each_connector(fb_helper, i) {
+	for (i = 0; i < connector_count; i++) {
 		struct drm_display_mode *mode = modes[i];
 		struct drm_crtc *crtc = crtcs[i];
 		struct drm_fb_offset *offset = &offsets[i];
 
 		if (mode && crtc) {
 			struct drm_mode_set *modeset = drm_client_find_modeset(client, crtc);
-			struct drm_connector *connector =
-				fb_helper->connector_info[i]->connector;
+			struct drm_connector *connector = connectors[i];
 
 			DRM_DEBUG_KMS("desired mode %s set on crtc %d (%d,%d)\n",
 				      mode->name, crtc->base.id, offset->x, offset->y);
@@ -2544,6 +2359,10 @@ out:
 	kfree(modes);
 	kfree(offsets);
 	kfree(enabled);
+free_connectors:
+	for (i = 0; i < connector_count; i++)
+		drm_connector_put(connectors[i]);
+	kfree(connectors);
 }
 
 /*
@@ -2556,10 +2375,11 @@ out:
 static void drm_setup_crtcs_fb(struct drm_fb_helper *fb_helper)
 {
 	struct drm_client_dev *client = &fb_helper->client;
+	struct drm_connector_list_iter conn_iter;
 	struct fb_info *info = fb_helper->fbdev;
 	unsigned int rotation, sw_rotations = 0;
+	struct drm_connector *connector;
 	struct drm_mode_set *modeset;
-	int i;
 
 	mutex_lock(&client->modeset_mutex);
 	drm_client_for_each_modeset(modeset, client) {
@@ -2576,10 +2396,8 @@ static void drm_setup_crtcs_fb(struct drm_fb_helper *fb_helper)
 	}
 	mutex_unlock(&client->modeset_mutex);
 
-	mutex_lock(&fb_helper->dev->mode_config.mutex);
-	drm_fb_helper_for_each_connector(fb_helper, i) {
-		struct drm_connector *connector =
-					fb_helper->connector_info[i]->connector;
+	drm_connector_list_iter_begin(fb_helper->dev, &conn_iter);
+	drm_client_for_each_connector_iter(connector, &conn_iter) {
 
 		/* use first connected connector for the physical dimensions */
 		if (connector->status == connector_status_connected) {
@@ -2588,7 +2406,7 @@ static void drm_setup_crtcs_fb(struct drm_fb_helper *fb_helper)
 			break;
 		}
 	}
-	mutex_unlock(&fb_helper->dev->mode_config.mutex);
+	drm_connector_list_iter_end(&conn_iter);
 
 	switch (sw_rotations) {
 	case DRM_MODE_ROTATE_0:
@@ -2826,12 +2644,6 @@ int drm_fb_helper_fbdev_setup(struct drm_device *dev,
 		return ret;
 	}
 
-	ret = drm_fb_helper_single_add_all_connectors(fb_helper);
-	if (ret < 0) {
-		DRM_DEV_ERROR(dev->dev, "fbdev: Failed to add connectors (ret=%d)\n", ret);
-		goto err_drm_fb_helper_fini;
-	}
-
 	if (!drm_drv_uses_atomic_modeset(dev))
 		drm_helper_disable_unused_functions(dev);
 
@@ -3137,10 +2949,6 @@ static int drm_fbdev_client_hotplug(struct drm_client_dev *client)
 	if (ret)
 		goto err;
 
-	ret = drm_fb_helper_single_add_all_connectors(fb_helper);
-	if (ret)
-		goto err_cleanup;
-
 	if (!drm_drv_uses_atomic_modeset(dev))
 		drm_helper_disable_unused_functions(dev);
 
diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h
index 6cf48419f77f..8d94880bbe25 100644
--- a/include/drm/drm_client.h
+++ b/include/drm/drm_client.h
@@ -7,6 +7,7 @@
 #include <linux/mutex.h>
 #include <linux/types.h>
 
+#include <drm/drm_connector.h>
 #include <drm/drm_crtc.h>
 
 struct drm_client_dev;
@@ -169,6 +170,20 @@ int drm_client_modeset_dpms(struct drm_client_dev *client, int mode);
 	for (({ lockdep_assert_held(&(client)->modeset_mutex); }), \
 	     modeset = (client)->modesets; modeset->crtc; modeset++)
 
+/**
+ * drm_client_for_each_connector_iter - connector_list iterator macro
+ * @connector: &struct drm_connector pointer used as cursor
+ * @iter: &struct drm_connector_list_iter
+ *
+ * This iterates the connectors that are useable for internal clients (excludes
+ * writeback connectors).
+ *
+ * For more info see drm_for_each_connector_iter().
+ */
+#define drm_client_for_each_connector_iter(connector, iter) \
+	drm_for_each_connector_iter(connector, iter) \
+		if (connector->connector_type != DRM_MODE_CONNECTOR_WRITEBACK)
+
 int drm_client_debugfs_init(struct drm_minor *minor);
 
 #endif
diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h
index 6b334f4d8a22..e14d52ef9638 100644
--- a/include/drm/drm_fb_helper.h
+++ b/include/drm/drm_fb_helper.h
@@ -97,16 +97,10 @@ struct drm_fb_helper_funcs {
 			struct drm_fb_helper_surface_size *sizes);
 };
 
-struct drm_fb_helper_connector {
-	struct drm_connector *connector;
-};
-
 /**
  * struct drm_fb_helper - main structure to emulate fbdev on top of KMS
  * @fb: Scanout framebuffer object
  * @dev: DRM device
- * @connector_count: number of connected connectors
- * @connector_info_alloc_count: size of connector_info
  * @funcs: driver callbacks for fb helper
  * @fbdev: emulated fbdev device info struct
  * @pseudo_palette: fake palette of 16 colors
@@ -138,15 +132,6 @@ struct drm_fb_helper {
 
 	struct drm_framebuffer *fb;
 	struct drm_device *dev;
-	int connector_count;
-	int connector_info_alloc_count;
-	/**
-	 * @connector_info:
-	 *
-	 * Array of per-connector information. Do not iterate directly, but use
-	 * drm_fb_helper_for_each_connector.
-	 */
-	struct drm_fb_helper_connector **connector_info;
 	const struct drm_fb_helper_funcs *funcs;
 	struct fb_info *fbdev;
 	u32 pseudo_palette[17];
@@ -286,18 +271,8 @@ int drm_fb_helper_ioctl(struct fb_info *info, unsigned int cmd,
 
 int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper);
 int drm_fb_helper_initial_config(struct drm_fb_helper *fb_helper, int bpp_sel);
-int drm_fb_helper_single_add_all_connectors(struct drm_fb_helper *fb_helper);
 int drm_fb_helper_debug_enter(struct fb_info *info);
 int drm_fb_helper_debug_leave(struct fb_info *info);
-struct drm_display_mode *
-drm_has_preferred_mode(struct drm_fb_helper_connector *fb_connector,
-			int width, int height);
-struct drm_display_mode *
-drm_pick_cmdline_mode(struct drm_fb_helper_connector *fb_helper_conn);
-
-int drm_fb_helper_add_one_connector(struct drm_fb_helper *fb_helper, struct drm_connector *connector);
-int drm_fb_helper_remove_one_connector(struct drm_fb_helper *fb_helper,
-				       struct drm_connector *connector);
 
 int drm_fb_helper_fbdev_setup(struct drm_device *dev,
 			      struct drm_fb_helper *fb_helper,
@@ -472,12 +447,6 @@ static inline int drm_fb_helper_initial_config(struct drm_fb_helper *fb_helper,
 	return 0;
 }
 
-static inline int
-drm_fb_helper_single_add_all_connectors(struct drm_fb_helper *fb_helper)
-{
-	return 0;
-}
-
 static inline int drm_fb_helper_debug_enter(struct fb_info *info)
 {
 	return 0;
@@ -488,34 +457,6 @@ static inline int drm_fb_helper_debug_leave(struct fb_info *info)
 	return 0;
 }
 
-static inline struct drm_display_mode *
-drm_has_preferred_mode(struct drm_fb_helper_connector *fb_connector,
-		       int width, int height)
-{
-	return NULL;
-}
-
-static inline struct drm_display_mode *
-drm_pick_cmdline_mode(struct drm_fb_helper_connector *fb_helper_conn,
-		      int width, int height)
-{
-	return NULL;
-}
-
-static inline int
-drm_fb_helper_add_one_connector(struct drm_fb_helper *fb_helper,
-				struct drm_connector *connector)
-{
-	return 0;
-}
-
-static inline int
-drm_fb_helper_remove_one_connector(struct drm_fb_helper *fb_helper,
-				   struct drm_connector *connector)
-{
-	return 0;
-}
-
 static inline int
 drm_fb_helper_fbdev_setup(struct drm_device *dev,
 			  struct drm_fb_helper *fb_helper,
@@ -557,6 +498,27 @@ drm_fbdev_generic_setup(struct drm_device *dev, unsigned int preferred_bpp)
 
 #endif
 
+/* TODO: There's a todo entry to remove these three */
+static inline int
+drm_fb_helper_single_add_all_connectors(struct drm_fb_helper *fb_helper)
+{
+	return 0;
+}
+
+static inline int
+drm_fb_helper_add_one_connector(struct drm_fb_helper *fb_helper,
+				struct drm_connector *connector)
+{
+	return 0;
+}
+
+static inline int
+drm_fb_helper_remove_one_connector(struct drm_fb_helper *fb_helper,
+				   struct drm_connector *connector)
+{
+	return 0;
+}
+
 /**
  * drm_fb_helper_remove_conflicting_framebuffers - remove firmware-configured framebuffers
  * @a: memory range, users of which are to be removed
-- 
cgit v1.2.3


From aafa9e0668720a663eb47f25d863a84160eead7b Mon Sep 17 00:00:00 2001
From: Noralf Trønnes <noralf@tronnes.org>
Date: Sat, 8 Jun 2019 17:26:54 +0200
Subject: drm/fb-helper: Prepare to move out modeset config code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This prepares the modeset code so it can be moved out as-is in the next
patch.

v3: Remove stray newline

Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
Reviewed-by: Maxime Ripard <maxime.ripard@bootlin.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190608152657.36613-3-noralf@tronnes.org
---
 drivers/gpu/drm/drm_fb_helper.c | 62 +++++++++++++++++++++++++++++------------
 include/drm/drm_fb_helper.h     |  4 ---
 2 files changed, 44 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index b936db6280ac..e7f41e5d924c 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -48,6 +48,10 @@
 
 #include "drm_internal.h"
 
+struct drm_client_offset {
+	int x, y;
+};
+
 static bool drm_fbdev_emulation = true;
 module_param_named(fbdev_emulation, drm_fbdev_emulation, bool, 0600);
 MODULE_PARM_DESC(fbdev_emulation,
@@ -1809,7 +1813,7 @@ static bool drm_client_target_cloned(struct drm_device *dev,
 				     struct drm_connector **connectors,
 				     unsigned int connector_count,
 				     struct drm_display_mode **modes,
-				     struct drm_fb_offset *offsets,
+				     struct drm_client_offset *offsets,
 				     bool *enabled, int width, int height)
 {
 	int count, i, j;
@@ -1888,7 +1892,7 @@ static bool drm_client_target_cloned(struct drm_device *dev,
 static int drm_client_get_tile_offsets(struct drm_connector **connectors,
 				       unsigned int connector_count,
 				       struct drm_display_mode **modes,
-				       struct drm_fb_offset *offsets,
+				       struct drm_client_offset *offsets,
 				       int idx,
 				       int h_idx, int v_idx)
 {
@@ -1921,7 +1925,7 @@ static int drm_client_get_tile_offsets(struct drm_connector **connectors,
 static bool drm_client_target_preferred(struct drm_connector **connectors,
 					unsigned int connector_count,
 					struct drm_display_mode **modes,
-					struct drm_fb_offset *offsets,
+					struct drm_client_offset *offsets,
 					bool *enabled, int width, int height)
 {
 	const u64 mask = BIT_ULL(connector_count) - 1;
@@ -2085,7 +2089,7 @@ static bool drm_client_firmware_config(struct drm_client_dev *client,
 				       unsigned int connector_count,
 				       struct drm_crtc **crtcs,
 				       struct drm_display_mode **modes,
-				       struct drm_fb_offset *offsets,
+				       struct drm_client_offset *offsets,
 				       bool *enabled, int width, int height)
 {
 	unsigned int count = min_t(unsigned int, connector_count, BITS_PER_LONG);
@@ -2255,30 +2259,47 @@ bail:
 	return ret;
 }
 
-static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
-			    u32 width, u32 height)
+/**
+ * drm_client_modeset_probe() - Probe for displays
+ * @client: DRM client
+ * @width: Maximum display mode width (optional)
+ * @height: Maximum display mode height (optional)
+ *
+ * This function sets up display pipelines for enabled connectors and stores the
+ * config in the client's modeset array.
+ *
+ * Returns:
+ * Zero on success or negative error code on failure.
+ */
+int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int height)
 {
 	struct drm_connector *connector, **connectors = NULL;
-	struct drm_client_dev *client = &fb_helper->client;
 	struct drm_connector_list_iter conn_iter;
-	struct drm_device *dev = fb_helper->dev;
+	struct drm_device *dev = client->dev;
 	unsigned int total_modes_count = 0;
+	struct drm_client_offset *offsets;
 	unsigned int connector_count = 0;
 	struct drm_display_mode **modes;
-	struct drm_fb_offset *offsets;
 	struct drm_crtc **crtcs;
+	int i, ret = 0;
 	bool *enabled;
-	int i;
 
 	DRM_DEBUG_KMS("\n");
 
+	if (!width)
+		width = dev->mode_config.max_width;
+	if (!height)
+		height = dev->mode_config.max_height;
+
 	drm_connector_list_iter_begin(dev, &conn_iter);
 	drm_client_for_each_connector_iter(connector, &conn_iter) {
 		struct drm_connector **tmp;
 
 		tmp = krealloc(connectors, (connector_count + 1) * sizeof(*connectors), GFP_KERNEL);
-		if (!tmp)
+		if (!tmp) {
+			ret = -ENOMEM;
 			goto free_connectors;
+		}
 
 		connectors = tmp;
 		drm_connector_get(connector);
@@ -2287,7 +2308,7 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 	drm_connector_list_iter_end(&conn_iter);
 
 	if (!connector_count)
-		return;
+		return 0;
 
 	crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL);
 	modes = kcalloc(connector_count, sizeof(*modes), GFP_KERNEL);
@@ -2295,12 +2316,13 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 	enabled = kcalloc(connector_count, sizeof(bool), GFP_KERNEL);
 	if (!crtcs || !modes || !enabled || !offsets) {
 		DRM_ERROR("Memory allocation failed\n");
+		ret = -ENOMEM;
 		goto out;
 	}
 
 	mutex_lock(&client->modeset_mutex);
 
-	mutex_lock(&fb_helper->dev->mode_config.mutex);
+	mutex_lock(&dev->mode_config.mutex);
 	for (i = 0; i < connector_count; i++)
 		total_modes_count += connectors[i]->funcs->fill_modes(connectors[i], width, height);
 	if (!total_modes_count)
@@ -2325,14 +2347,14 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 		drm_client_pick_crtcs(client, connectors, connector_count,
 				      crtcs, modes, 0, width, height);
 	}
-	mutex_unlock(&fb_helper->dev->mode_config.mutex);
+	mutex_unlock(&dev->mode_config.mutex);
 
 	drm_client_modeset_release(client);
 
 	for (i = 0; i < connector_count; i++) {
 		struct drm_display_mode *mode = modes[i];
 		struct drm_crtc *crtc = crtcs[i];
-		struct drm_fb_offset *offset = &offsets[i];
+		struct drm_client_offset *offset = &offsets[i];
 
 		if (mode && crtc) {
 			struct drm_mode_set *modeset = drm_client_find_modeset(client, crtc);
@@ -2342,8 +2364,10 @@ static void drm_setup_crtcs(struct drm_fb_helper *fb_helper,
 				      mode->name, crtc->base.id, offset->x, offset->y);
 
 			if (WARN_ON_ONCE(modeset->num_connectors == DRM_CLIENT_MAX_CLONED_CONNECTORS ||
-					 (dev->mode_config.num_crtc > 1 && modeset->num_connectors == 1)))
+					 (dev->mode_config.num_crtc > 1 && modeset->num_connectors == 1))) {
+				ret = -EINVAL;
 				break;
+			}
 
 			modeset->mode = drm_mode_duplicate(dev, mode);
 			drm_connector_get(connector);
@@ -2363,6 +2387,8 @@ free_connectors:
 	for (i = 0; i < connector_count; i++)
 		drm_connector_put(connectors[i]);
 	kfree(connectors);
+
+	return ret;
 }
 
 /*
@@ -2444,7 +2470,7 @@ __drm_fb_helper_initial_config_and_unlock(struct drm_fb_helper *fb_helper,
 	width = dev->mode_config.max_width;
 	height = dev->mode_config.max_height;
 
-	drm_setup_crtcs(fb_helper, width, height);
+	drm_client_modeset_probe(&fb_helper->client, width, height);
 	ret = drm_fb_helper_single_fb_probe(fb_helper, bpp_sel);
 	if (ret < 0) {
 		if (ret == -EAGAIN) {
@@ -2591,7 +2617,7 @@ int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper)
 
 	DRM_DEBUG_KMS("\n");
 
-	drm_setup_crtcs(fb_helper, fb_helper->fb->width, fb_helper->fb->height);
+	drm_client_modeset_probe(&fb_helper->client, fb_helper->fb->width, fb_helper->fb->height);
 	drm_setup_crtcs_fb(fb_helper);
 	mutex_unlock(&fb_helper->lock);
 
diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h
index e14d52ef9638..c8a8ae2a678a 100644
--- a/include/drm/drm_fb_helper.h
+++ b/include/drm/drm_fb_helper.h
@@ -43,10 +43,6 @@ enum mode_set_atomic {
 	ENTER_ATOMIC_MODE_SET,
 };
 
-struct drm_fb_offset {
-	int x, y;
-};
-
 /**
  * struct drm_fb_helper_surface_size - describes fbdev size and scanout surface size
  * @fb_width: fbdev width
-- 
cgit v1.2.3


From cf13909aee054f5aa667d4b9da0ac7df4f6c1327 Mon Sep 17 00:00:00 2001
From: Noralf Trønnes <noralf@tronnes.org>
Date: Sat, 8 Jun 2019 17:26:55 +0200
Subject: drm/fb-helper: Move out modeset config code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No functional changes, just moving code as-is and fixing includes.

Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
Reviewed-by: Maxime Ripard <maxime.ripard@bootlin.com>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190608152657.36613-4-noralf@tronnes.org
---
 drivers/gpu/drm/drm_client_modeset.c | 707 ++++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/drm_fb_helper.c      | 692 ----------------------------------
 include/drm/drm_client.h             |   5 +-
 3 files changed, 702 insertions(+), 702 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c
index b1984f901a6d..006bf7390e7d 100644
--- a/drivers/gpu/drm/drm_client_modeset.c
+++ b/drivers/gpu/drm/drm_client_modeset.c
@@ -13,13 +13,22 @@
 
 #include <drm/drm_atomic.h>
 #include <drm/drm_client.h>
+#include <drm/drm_connector.h>
 #include <drm/drm_crtc.h>
 #include <drm/drm_device.h>
 #include <drm/drm_drv.h>
+#include <drm/drm_encoder.h>
+#include <drm/drm_print.h>
 
 #include "drm_crtc_internal.h"
 #include "drm_internal.h"
 
+#define DRM_CLIENT_MAX_CLONED_CONNECTORS	8
+
+struct drm_client_offset {
+	int x, y;
+};
+
 int drm_client_modeset_create(struct drm_client_dev *client)
 {
 	struct drm_device *dev = client->dev;
@@ -58,7 +67,7 @@ err_free:
 	return -ENOMEM;
 }
 
-void drm_client_modeset_release(struct drm_client_dev *client)
+static void drm_client_modeset_release(struct drm_client_dev *client)
 {
 	struct drm_mode_set *modeset;
 	unsigned int i;
@@ -75,8 +84,6 @@ void drm_client_modeset_release(struct drm_client_dev *client)
 		modeset->num_connectors = 0;
 	}
 }
-/* TODO: Remove export when modeset code has been moved over */
-EXPORT_SYMBOL(drm_client_modeset_release);
 
 void drm_client_modeset_free(struct drm_client_dev *client)
 {
@@ -95,7 +102,8 @@ void drm_client_modeset_free(struct drm_client_dev *client)
 	kfree(client->modesets);
 }
 
-struct drm_mode_set *drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc)
+static struct drm_mode_set *
+drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc)
 {
 	struct drm_mode_set *modeset;
 
@@ -105,8 +113,695 @@ struct drm_mode_set *drm_client_find_modeset(struct drm_client_dev *client, stru
 
 	return NULL;
 }
-/* TODO: Remove export when modeset code has been moved over */
-EXPORT_SYMBOL(drm_client_find_modeset);
+
+static struct drm_display_mode *
+drm_connector_has_preferred_mode(struct drm_connector *connector, int width, int height)
+{
+	struct drm_display_mode *mode;
+
+	list_for_each_entry(mode, &connector->modes, head) {
+		if (mode->hdisplay > width ||
+		    mode->vdisplay > height)
+			continue;
+		if (mode->type & DRM_MODE_TYPE_PREFERRED)
+			return mode;
+	}
+	return NULL;
+}
+
+static struct drm_display_mode *
+drm_connector_pick_cmdline_mode(struct drm_connector *connector)
+{
+	struct drm_cmdline_mode *cmdline_mode;
+	struct drm_display_mode *mode;
+	bool prefer_non_interlace;
+
+	cmdline_mode = &connector->cmdline_mode;
+	if (cmdline_mode->specified == false)
+		return NULL;
+
+	/* attempt to find a matching mode in the list of modes
+	 *  we have gotten so far, if not add a CVT mode that conforms
+	 */
+	if (cmdline_mode->rb || cmdline_mode->margins)
+		goto create_mode;
+
+	prefer_non_interlace = !cmdline_mode->interlace;
+again:
+	list_for_each_entry(mode, &connector->modes, head) {
+		/* check width/height */
+		if (mode->hdisplay != cmdline_mode->xres ||
+		    mode->vdisplay != cmdline_mode->yres)
+			continue;
+
+		if (cmdline_mode->refresh_specified) {
+			if (mode->vrefresh != cmdline_mode->refresh)
+				continue;
+		}
+
+		if (cmdline_mode->interlace) {
+			if (!(mode->flags & DRM_MODE_FLAG_INTERLACE))
+				continue;
+		} else if (prefer_non_interlace) {
+			if (mode->flags & DRM_MODE_FLAG_INTERLACE)
+				continue;
+		}
+		return mode;
+	}
+
+	if (prefer_non_interlace) {
+		prefer_non_interlace = false;
+		goto again;
+	}
+
+create_mode:
+	mode = drm_mode_create_from_cmdline_mode(connector->dev, cmdline_mode);
+	list_add(&mode->head, &connector->modes);
+
+	return mode;
+}
+
+static bool drm_connector_enabled(struct drm_connector *connector, bool strict)
+{
+	bool enable;
+
+	if (connector->display_info.non_desktop)
+		return false;
+
+	if (strict)
+		enable = connector->status == connector_status_connected;
+	else
+		enable = connector->status != connector_status_disconnected;
+
+	return enable;
+}
+
+static void drm_client_connectors_enabled(struct drm_connector **connectors,
+					  unsigned int connector_count,
+					  bool *enabled)
+{
+	bool any_enabled = false;
+	struct drm_connector *connector;
+	int i = 0;
+
+	for (i = 0; i < connector_count; i++) {
+		connector = connectors[i];
+		enabled[i] = drm_connector_enabled(connector, true);
+		DRM_DEBUG_KMS("connector %d enabled? %s\n", connector->base.id,
+			      connector->display_info.non_desktop ? "non desktop" : enabled[i] ? "yes" : "no");
+
+		any_enabled |= enabled[i];
+	}
+
+	if (any_enabled)
+		return;
+
+	for (i = 0; i < connector_count; i++)
+		enabled[i] = drm_connector_enabled(connectors[i], false);
+}
+
+static bool drm_client_target_cloned(struct drm_device *dev,
+				     struct drm_connector **connectors,
+				     unsigned int connector_count,
+				     struct drm_display_mode **modes,
+				     struct drm_client_offset *offsets,
+				     bool *enabled, int width, int height)
+{
+	int count, i, j;
+	bool can_clone = false;
+	struct drm_display_mode *dmt_mode, *mode;
+
+	/* only contemplate cloning in the single crtc case */
+	if (dev->mode_config.num_crtc > 1)
+		return false;
+
+	count = 0;
+	for (i = 0; i < connector_count; i++) {
+		if (enabled[i])
+			count++;
+	}
+
+	/* only contemplate cloning if more than one connector is enabled */
+	if (count <= 1)
+		return false;
+
+	/* check the command line or if nothing common pick 1024x768 */
+	can_clone = true;
+	for (i = 0; i < connector_count; i++) {
+		if (!enabled[i])
+			continue;
+		modes[i] = drm_connector_pick_cmdline_mode(connectors[i]);
+		if (!modes[i]) {
+			can_clone = false;
+			break;
+		}
+		for (j = 0; j < i; j++) {
+			if (!enabled[j])
+				continue;
+			if (!drm_mode_match(modes[j], modes[i],
+					    DRM_MODE_MATCH_TIMINGS |
+					    DRM_MODE_MATCH_CLOCK |
+					    DRM_MODE_MATCH_FLAGS |
+					    DRM_MODE_MATCH_3D_FLAGS))
+				can_clone = false;
+		}
+	}
+
+	if (can_clone) {
+		DRM_DEBUG_KMS("can clone using command line\n");
+		return true;
+	}
+
+	/* try and find a 1024x768 mode on each connector */
+	can_clone = true;
+	dmt_mode = drm_mode_find_dmt(dev, 1024, 768, 60, false);
+
+	for (i = 0; i < connector_count; i++) {
+		if (!enabled[i])
+			continue;
+
+		list_for_each_entry(mode, &connectors[i]->modes, head) {
+			if (drm_mode_match(mode, dmt_mode,
+					   DRM_MODE_MATCH_TIMINGS |
+					   DRM_MODE_MATCH_CLOCK |
+					   DRM_MODE_MATCH_FLAGS |
+					   DRM_MODE_MATCH_3D_FLAGS))
+				modes[i] = mode;
+		}
+		if (!modes[i])
+			can_clone = false;
+	}
+
+	if (can_clone) {
+		DRM_DEBUG_KMS("can clone using 1024x768\n");
+		return true;
+	}
+	DRM_INFO("kms: can't enable cloning when we probably wanted to.\n");
+	return false;
+}
+
+static int drm_client_get_tile_offsets(struct drm_connector **connectors,
+				       unsigned int connector_count,
+				       struct drm_display_mode **modes,
+				       struct drm_client_offset *offsets,
+				       int idx,
+				       int h_idx, int v_idx)
+{
+	struct drm_connector *connector;
+	int i;
+	int hoffset = 0, voffset = 0;
+
+	for (i = 0; i < connector_count; i++) {
+		connector = connectors[i];
+		if (!connector->has_tile)
+			continue;
+
+		if (!modes[i] && (h_idx || v_idx)) {
+			DRM_DEBUG_KMS("no modes for connector tiled %d %d\n", i,
+				      connector->base.id);
+			continue;
+		}
+		if (connector->tile_h_loc < h_idx)
+			hoffset += modes[i]->hdisplay;
+
+		if (connector->tile_v_loc < v_idx)
+			voffset += modes[i]->vdisplay;
+	}
+	offsets[idx].x = hoffset;
+	offsets[idx].y = voffset;
+	DRM_DEBUG_KMS("returned %d %d for %d %d\n", hoffset, voffset, h_idx, v_idx);
+	return 0;
+}
+
+static bool drm_client_target_preferred(struct drm_connector **connectors,
+					unsigned int connector_count,
+					struct drm_display_mode **modes,
+					struct drm_client_offset *offsets,
+					bool *enabled, int width, int height)
+{
+	const u64 mask = BIT_ULL(connector_count) - 1;
+	struct drm_connector *connector;
+	u64 conn_configured = 0;
+	int tile_pass = 0;
+	int i;
+
+retry:
+	for (i = 0; i < connector_count; i++) {
+		connector = connectors[i];
+
+		if (conn_configured & BIT_ULL(i))
+			continue;
+
+		if (enabled[i] == false) {
+			conn_configured |= BIT_ULL(i);
+			continue;
+		}
+
+		/* first pass over all the untiled connectors */
+		if (tile_pass == 0 && connector->has_tile)
+			continue;
+
+		if (tile_pass == 1) {
+			if (connector->tile_h_loc != 0 ||
+			    connector->tile_v_loc != 0)
+				continue;
+
+		} else {
+			if (connector->tile_h_loc != tile_pass - 1 &&
+			    connector->tile_v_loc != tile_pass - 1)
+			/* if this tile_pass doesn't cover any of the tiles - keep going */
+				continue;
+
+			/*
+			 * find the tile offsets for this pass - need to find
+			 * all tiles left and above
+			 */
+			drm_client_get_tile_offsets(connectors, connector_count, modes, offsets, i,
+						    connector->tile_h_loc, connector->tile_v_loc);
+		}
+		DRM_DEBUG_KMS("looking for cmdline mode on connector %d\n",
+			      connector->base.id);
+
+		/* got for command line mode first */
+		modes[i] = drm_connector_pick_cmdline_mode(connector);
+		if (!modes[i]) {
+			DRM_DEBUG_KMS("looking for preferred mode on connector %d %d\n",
+				      connector->base.id, connector->tile_group ? connector->tile_group->id : 0);
+			modes[i] = drm_connector_has_preferred_mode(connector, width, height);
+		}
+		/* No preferred modes, pick one off the list */
+		if (!modes[i] && !list_empty(&connector->modes)) {
+			list_for_each_entry(modes[i], &connector->modes, head)
+				break;
+		}
+		DRM_DEBUG_KMS("found mode %s\n", modes[i] ? modes[i]->name :
+			  "none");
+		conn_configured |= BIT_ULL(i);
+	}
+
+	if ((conn_configured & mask) != mask) {
+		tile_pass++;
+		goto retry;
+	}
+	return true;
+}
+
+static bool connector_has_possible_crtc(struct drm_connector *connector,
+					struct drm_crtc *crtc)
+{
+	struct drm_encoder *encoder;
+	int i;
+
+	drm_connector_for_each_possible_encoder(connector, encoder, i) {
+		if (encoder->possible_crtcs & drm_crtc_mask(crtc))
+			return true;
+	}
+
+	return false;
+}
+
+static int drm_client_pick_crtcs(struct drm_client_dev *client,
+				 struct drm_connector **connectors,
+				 unsigned int connector_count,
+				 struct drm_crtc **best_crtcs,
+				 struct drm_display_mode **modes,
+				 int n, int width, int height)
+{
+	struct drm_device *dev = client->dev;
+	struct drm_connector *connector;
+	int my_score, best_score, score;
+	struct drm_crtc **crtcs, *crtc;
+	struct drm_mode_set *modeset;
+	int o;
+
+	if (n == connector_count)
+		return 0;
+
+	connector = connectors[n];
+
+	best_crtcs[n] = NULL;
+	best_score = drm_client_pick_crtcs(client, connectors, connector_count,
+					   best_crtcs, modes, n + 1, width, height);
+	if (modes[n] == NULL)
+		return best_score;
+
+	crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL);
+	if (!crtcs)
+		return best_score;
+
+	my_score = 1;
+	if (connector->status == connector_status_connected)
+		my_score++;
+	if (connector->cmdline_mode.specified)
+		my_score++;
+	if (drm_connector_has_preferred_mode(connector, width, height))
+		my_score++;
+
+	/*
+	 * select a crtc for this connector and then attempt to configure
+	 * remaining connectors
+	 */
+	drm_client_for_each_modeset(modeset, client) {
+		crtc = modeset->crtc;
+
+		if (!connector_has_possible_crtc(connector, crtc))
+			continue;
+
+		for (o = 0; o < n; o++)
+			if (best_crtcs[o] == crtc)
+				break;
+
+		if (o < n) {
+			/* ignore cloning unless only a single crtc */
+			if (dev->mode_config.num_crtc > 1)
+				continue;
+
+			if (!drm_mode_equal(modes[o], modes[n]))
+				continue;
+		}
+
+		crtcs[n] = crtc;
+		memcpy(crtcs, best_crtcs, n * sizeof(*crtcs));
+		score = my_score + drm_client_pick_crtcs(client, connectors, connector_count,
+							 crtcs, modes, n + 1, width, height);
+		if (score > best_score) {
+			best_score = score;
+			memcpy(best_crtcs, crtcs, connector_count * sizeof(*crtcs));
+		}
+	}
+
+	kfree(crtcs);
+	return best_score;
+}
+
+/* Try to read the BIOS display configuration and use it for the initial config */
+static bool drm_client_firmware_config(struct drm_client_dev *client,
+				       struct drm_connector **connectors,
+				       unsigned int connector_count,
+				       struct drm_crtc **crtcs,
+				       struct drm_display_mode **modes,
+				       struct drm_client_offset *offsets,
+				       bool *enabled, int width, int height)
+{
+	unsigned int count = min_t(unsigned int, connector_count, BITS_PER_LONG);
+	unsigned long conn_configured, conn_seq, mask;
+	struct drm_device *dev = client->dev;
+	int i, j;
+	bool *save_enabled;
+	bool fallback = true, ret = true;
+	int num_connectors_enabled = 0;
+	int num_connectors_detected = 0;
+	struct drm_modeset_acquire_ctx ctx;
+
+	if (!drm_drv_uses_atomic_modeset(dev))
+		return false;
+
+	save_enabled = kcalloc(count, sizeof(bool), GFP_KERNEL);
+	if (!save_enabled)
+		return false;
+
+	drm_modeset_acquire_init(&ctx, 0);
+
+	while (drm_modeset_lock_all_ctx(dev, &ctx) != 0)
+		drm_modeset_backoff(&ctx);
+
+	memcpy(save_enabled, enabled, count);
+	mask = GENMASK(count - 1, 0);
+	conn_configured = 0;
+retry:
+	conn_seq = conn_configured;
+	for (i = 0; i < count; i++) {
+		struct drm_connector *connector;
+		struct drm_encoder *encoder;
+		struct drm_crtc *new_crtc;
+
+		connector = connectors[i];
+
+		if (conn_configured & BIT(i))
+			continue;
+
+		if (conn_seq == 0 && !connector->has_tile)
+			continue;
+
+		if (connector->status == connector_status_connected)
+			num_connectors_detected++;
+
+		if (!enabled[i]) {
+			DRM_DEBUG_KMS("connector %s not enabled, skipping\n",
+				      connector->name);
+			conn_configured |= BIT(i);
+			continue;
+		}
+
+		if (connector->force == DRM_FORCE_OFF) {
+			DRM_DEBUG_KMS("connector %s is disabled by user, skipping\n",
+				      connector->name);
+			enabled[i] = false;
+			continue;
+		}
+
+		encoder = connector->state->best_encoder;
+		if (!encoder || WARN_ON(!connector->state->crtc)) {
+			if (connector->force > DRM_FORCE_OFF)
+				goto bail;
+
+			DRM_DEBUG_KMS("connector %s has no encoder or crtc, skipping\n",
+				      connector->name);
+			enabled[i] = false;
+			conn_configured |= BIT(i);
+			continue;
+		}
+
+		num_connectors_enabled++;
+
+		new_crtc = connector->state->crtc;
+
+		/*
+		 * Make sure we're not trying to drive multiple connectors
+		 * with a single CRTC, since our cloning support may not
+		 * match the BIOS.
+		 */
+		for (j = 0; j < count; j++) {
+			if (crtcs[j] == new_crtc) {
+				DRM_DEBUG_KMS("fallback: cloned configuration\n");
+				goto bail;
+			}
+		}
+
+		DRM_DEBUG_KMS("looking for cmdline mode on connector %s\n",
+			      connector->name);
+
+		/* go for command line mode first */
+		modes[i] = drm_connector_pick_cmdline_mode(connector);
+
+		/* try for preferred next */
+		if (!modes[i]) {
+			DRM_DEBUG_KMS("looking for preferred mode on connector %s %d\n",
+				      connector->name, connector->has_tile);
+			modes[i] = drm_connector_has_preferred_mode(connector, width, height);
+		}
+
+		/* No preferred mode marked by the EDID? Are there any modes? */
+		if (!modes[i] && !list_empty(&connector->modes)) {
+			DRM_DEBUG_KMS("using first mode listed on connector %s\n",
+				      connector->name);
+			modes[i] = list_first_entry(&connector->modes,
+						    struct drm_display_mode,
+						    head);
+		}
+
+		/* last resort: use current mode */
+		if (!modes[i]) {
+			/*
+			 * IMPORTANT: We want to use the adjusted mode (i.e.
+			 * after the panel fitter upscaling) as the initial
+			 * config, not the input mode, which is what crtc->mode
+			 * usually contains. But since our current
+			 * code puts a mode derived from the post-pfit timings
+			 * into crtc->mode this works out correctly.
+			 *
+			 * This is crtc->mode and not crtc->state->mode for the
+			 * fastboot check to work correctly.
+			 */
+			DRM_DEBUG_KMS("looking for current mode on connector %s\n",
+				      connector->name);
+			modes[i] = &connector->state->crtc->mode;
+		}
+		crtcs[i] = new_crtc;
+
+		DRM_DEBUG_KMS("connector %s on [CRTC:%d:%s]: %dx%d%s\n",
+			      connector->name,
+			      connector->state->crtc->base.id,
+			      connector->state->crtc->name,
+			      modes[i]->hdisplay, modes[i]->vdisplay,
+			      modes[i]->flags & DRM_MODE_FLAG_INTERLACE ? "i" : "");
+
+		fallback = false;
+		conn_configured |= BIT(i);
+	}
+
+	if ((conn_configured & mask) != mask && conn_configured != conn_seq)
+		goto retry;
+
+	/*
+	 * If the BIOS didn't enable everything it could, fall back to have the
+	 * same user experiencing of lighting up as much as possible like the
+	 * fbdev helper library.
+	 */
+	if (num_connectors_enabled != num_connectors_detected &&
+	    num_connectors_enabled < dev->mode_config.num_crtc) {
+		DRM_DEBUG_KMS("fallback: Not all outputs enabled\n");
+		DRM_DEBUG_KMS("Enabled: %i, detected: %i\n", num_connectors_enabled,
+			      num_connectors_detected);
+		fallback = true;
+	}
+
+	if (fallback) {
+bail:
+		DRM_DEBUG_KMS("Not using firmware configuration\n");
+		memcpy(enabled, save_enabled, count);
+		ret = false;
+	}
+
+	drm_modeset_drop_locks(&ctx);
+	drm_modeset_acquire_fini(&ctx);
+
+	kfree(save_enabled);
+	return ret;
+}
+
+/**
+ * drm_client_modeset_probe() - Probe for displays
+ * @client: DRM client
+ * @width: Maximum display mode width (optional)
+ * @height: Maximum display mode height (optional)
+ *
+ * This function sets up display pipelines for enabled connectors and stores the
+ * config in the client's modeset array.
+ *
+ * Returns:
+ * Zero on success or negative error code on failure.
+ */
+int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int height)
+{
+	struct drm_connector *connector, **connectors = NULL;
+	struct drm_connector_list_iter conn_iter;
+	struct drm_device *dev = client->dev;
+	unsigned int total_modes_count = 0;
+	struct drm_client_offset *offsets;
+	unsigned int connector_count = 0;
+	struct drm_display_mode **modes;
+	struct drm_crtc **crtcs;
+	int i, ret = 0;
+	bool *enabled;
+
+	DRM_DEBUG_KMS("\n");
+
+	if (!width)
+		width = dev->mode_config.max_width;
+	if (!height)
+		height = dev->mode_config.max_height;
+
+	drm_connector_list_iter_begin(dev, &conn_iter);
+	drm_client_for_each_connector_iter(connector, &conn_iter) {
+		struct drm_connector **tmp;
+
+		tmp = krealloc(connectors, (connector_count + 1) * sizeof(*connectors), GFP_KERNEL);
+		if (!tmp) {
+			ret = -ENOMEM;
+			goto free_connectors;
+		}
+
+		connectors = tmp;
+		drm_connector_get(connector);
+		connectors[connector_count++] = connector;
+	}
+	drm_connector_list_iter_end(&conn_iter);
+
+	if (!connector_count)
+		return 0;
+
+	crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL);
+	modes = kcalloc(connector_count, sizeof(*modes), GFP_KERNEL);
+	offsets = kcalloc(connector_count, sizeof(*offsets), GFP_KERNEL);
+	enabled = kcalloc(connector_count, sizeof(bool), GFP_KERNEL);
+	if (!crtcs || !modes || !enabled || !offsets) {
+		DRM_ERROR("Memory allocation failed\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&client->modeset_mutex);
+
+	mutex_lock(&dev->mode_config.mutex);
+	for (i = 0; i < connector_count; i++)
+		total_modes_count += connectors[i]->funcs->fill_modes(connectors[i], width, height);
+	if (!total_modes_count)
+		DRM_DEBUG_KMS("No connectors reported connected with modes\n");
+	drm_client_connectors_enabled(connectors, connector_count, enabled);
+
+	if (!drm_client_firmware_config(client, connectors, connector_count, crtcs,
+					modes, offsets, enabled, width, height)) {
+		memset(modes, 0, connector_count * sizeof(*modes));
+		memset(crtcs, 0, connector_count * sizeof(*crtcs));
+		memset(offsets, 0, connector_count * sizeof(*offsets));
+
+		if (!drm_client_target_cloned(dev, connectors, connector_count, modes,
+					      offsets, enabled, width, height) &&
+		    !drm_client_target_preferred(connectors, connector_count, modes,
+						 offsets, enabled, width, height))
+			DRM_ERROR("Unable to find initial modes\n");
+
+		DRM_DEBUG_KMS("picking CRTCs for %dx%d config\n",
+			      width, height);
+
+		drm_client_pick_crtcs(client, connectors, connector_count,
+				      crtcs, modes, 0, width, height);
+	}
+	mutex_unlock(&dev->mode_config.mutex);
+
+	drm_client_modeset_release(client);
+
+	for (i = 0; i < connector_count; i++) {
+		struct drm_display_mode *mode = modes[i];
+		struct drm_crtc *crtc = crtcs[i];
+		struct drm_client_offset *offset = &offsets[i];
+
+		if (mode && crtc) {
+			struct drm_mode_set *modeset = drm_client_find_modeset(client, crtc);
+			struct drm_connector *connector = connectors[i];
+
+			DRM_DEBUG_KMS("desired mode %s set on crtc %d (%d,%d)\n",
+				      mode->name, crtc->base.id, offset->x, offset->y);
+
+			if (WARN_ON_ONCE(modeset->num_connectors == DRM_CLIENT_MAX_CLONED_CONNECTORS ||
+					 (dev->mode_config.num_crtc > 1 && modeset->num_connectors == 1))) {
+				ret = -EINVAL;
+				break;
+			}
+
+			modeset->mode = drm_mode_duplicate(dev, mode);
+			drm_connector_get(connector);
+			modeset->connectors[modeset->num_connectors++] = connector;
+			modeset->x = offset->x;
+			modeset->y = offset->y;
+		}
+	}
+
+	mutex_unlock(&client->modeset_mutex);
+out:
+	kfree(crtcs);
+	kfree(modes);
+	kfree(offsets);
+	kfree(enabled);
+free_connectors:
+	for (i = 0; i < connector_count; i++)
+		drm_connector_put(connectors[i]);
+	kfree(connectors);
+
+	return ret;
+}
+EXPORT_SYMBOL(drm_client_modeset_probe);
 
 /**
  * drm_client_panel_rotation() - Check panel orientation
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index e7f41e5d924c..946cae196475 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -48,10 +48,6 @@
 
 #include "drm_internal.h"
 
-struct drm_client_offset {
-	int x, y;
-};
-
 static bool drm_fbdev_emulation = true;
 module_param_named(fbdev_emulation, drm_fbdev_emulation, bool, 0600);
 MODULE_PARM_DESC(fbdev_emulation,
@@ -1703,694 +1699,6 @@ void drm_fb_helper_fill_info(struct fb_info *info,
 }
 EXPORT_SYMBOL(drm_fb_helper_fill_info);
 
-static struct drm_display_mode *
-drm_connector_has_preferred_mode(struct drm_connector *connector, int width, int height)
-{
-	struct drm_display_mode *mode;
-
-	list_for_each_entry(mode, &connector->modes, head) {
-		if (mode->hdisplay > width ||
-		    mode->vdisplay > height)
-			continue;
-		if (mode->type & DRM_MODE_TYPE_PREFERRED)
-			return mode;
-	}
-	return NULL;
-}
-
-static struct drm_display_mode *
-drm_connector_pick_cmdline_mode(struct drm_connector *connector)
-{
-	struct drm_cmdline_mode *cmdline_mode;
-	struct drm_display_mode *mode;
-	bool prefer_non_interlace;
-
-	cmdline_mode = &connector->cmdline_mode;
-	if (cmdline_mode->specified == false)
-		return NULL;
-
-	/* attempt to find a matching mode in the list of modes
-	 *  we have gotten so far, if not add a CVT mode that conforms
-	 */
-	if (cmdline_mode->rb || cmdline_mode->margins)
-		goto create_mode;
-
-	prefer_non_interlace = !cmdline_mode->interlace;
-again:
-	list_for_each_entry(mode, &connector->modes, head) {
-		/* check width/height */
-		if (mode->hdisplay != cmdline_mode->xres ||
-		    mode->vdisplay != cmdline_mode->yres)
-			continue;
-
-		if (cmdline_mode->refresh_specified) {
-			if (mode->vrefresh != cmdline_mode->refresh)
-				continue;
-		}
-
-		if (cmdline_mode->interlace) {
-			if (!(mode->flags & DRM_MODE_FLAG_INTERLACE))
-				continue;
-		} else if (prefer_non_interlace) {
-			if (mode->flags & DRM_MODE_FLAG_INTERLACE)
-				continue;
-		}
-		return mode;
-	}
-
-	if (prefer_non_interlace) {
-		prefer_non_interlace = false;
-		goto again;
-	}
-
-create_mode:
-	mode = drm_mode_create_from_cmdline_mode(connector->dev, cmdline_mode);
-	list_add(&mode->head, &connector->modes);
-
-	return mode;
-}
-
-static bool drm_connector_enabled(struct drm_connector *connector, bool strict)
-{
-	bool enable;
-
-	if (connector->display_info.non_desktop)
-		return false;
-
-	if (strict)
-		enable = connector->status == connector_status_connected;
-	else
-		enable = connector->status != connector_status_disconnected;
-
-	return enable;
-}
-
-static void drm_client_connectors_enabled(struct drm_connector **connectors,
-					  unsigned int connector_count,
-					  bool *enabled)
-{
-	bool any_enabled = false;
-	struct drm_connector *connector;
-	int i = 0;
-
-	for (i = 0; i < connector_count; i++) {
-		connector = connectors[i];
-		enabled[i] = drm_connector_enabled(connector, true);
-		DRM_DEBUG_KMS("connector %d enabled? %s\n", connector->base.id,
-			      connector->display_info.non_desktop ? "non desktop" : enabled[i] ? "yes" : "no");
-
-		any_enabled |= enabled[i];
-	}
-
-	if (any_enabled)
-		return;
-
-	for (i = 0; i < connector_count; i++)
-		enabled[i] = drm_connector_enabled(connectors[i], false);
-}
-
-static bool drm_client_target_cloned(struct drm_device *dev,
-				     struct drm_connector **connectors,
-				     unsigned int connector_count,
-				     struct drm_display_mode **modes,
-				     struct drm_client_offset *offsets,
-				     bool *enabled, int width, int height)
-{
-	int count, i, j;
-	bool can_clone = false;
-	struct drm_display_mode *dmt_mode, *mode;
-
-	/* only contemplate cloning in the single crtc case */
-	if (dev->mode_config.num_crtc > 1)
-		return false;
-
-	count = 0;
-	for (i = 0; i < connector_count; i++) {
-		if (enabled[i])
-			count++;
-	}
-
-	/* only contemplate cloning if more than one connector is enabled */
-	if (count <= 1)
-		return false;
-
-	/* check the command line or if nothing common pick 1024x768 */
-	can_clone = true;
-	for (i = 0; i < connector_count; i++) {
-		if (!enabled[i])
-			continue;
-		modes[i] = drm_connector_pick_cmdline_mode(connectors[i]);
-		if (!modes[i]) {
-			can_clone = false;
-			break;
-		}
-		for (j = 0; j < i; j++) {
-			if (!enabled[j])
-				continue;
-			if (!drm_mode_match(modes[j], modes[i],
-					    DRM_MODE_MATCH_TIMINGS |
-					    DRM_MODE_MATCH_CLOCK |
-					    DRM_MODE_MATCH_FLAGS |
-					    DRM_MODE_MATCH_3D_FLAGS))
-				can_clone = false;
-		}
-	}
-
-	if (can_clone) {
-		DRM_DEBUG_KMS("can clone using command line\n");
-		return true;
-	}
-
-	/* try and find a 1024x768 mode on each connector */
-	can_clone = true;
-	dmt_mode = drm_mode_find_dmt(dev, 1024, 768, 60, false);
-
-	for (i = 0; i < connector_count; i++) {
-		if (!enabled[i])
-			continue;
-
-		list_for_each_entry(mode, &connectors[i]->modes, head) {
-			if (drm_mode_match(mode, dmt_mode,
-					   DRM_MODE_MATCH_TIMINGS |
-					   DRM_MODE_MATCH_CLOCK |
-					   DRM_MODE_MATCH_FLAGS |
-					   DRM_MODE_MATCH_3D_FLAGS))
-				modes[i] = mode;
-		}
-		if (!modes[i])
-			can_clone = false;
-	}
-
-	if (can_clone) {
-		DRM_DEBUG_KMS("can clone using 1024x768\n");
-		return true;
-	}
-	DRM_INFO("kms: can't enable cloning when we probably wanted to.\n");
-	return false;
-}
-
-static int drm_client_get_tile_offsets(struct drm_connector **connectors,
-				       unsigned int connector_count,
-				       struct drm_display_mode **modes,
-				       struct drm_client_offset *offsets,
-				       int idx,
-				       int h_idx, int v_idx)
-{
-	struct drm_connector *connector;
-	int i;
-	int hoffset = 0, voffset = 0;
-
-	for (i = 0; i < connector_count; i++) {
-		connector = connectors[i];
-		if (!connector->has_tile)
-			continue;
-
-		if (!modes[i] && (h_idx || v_idx)) {
-			DRM_DEBUG_KMS("no modes for connector tiled %d %d\n", i,
-				      connector->base.id);
-			continue;
-		}
-		if (connector->tile_h_loc < h_idx)
-			hoffset += modes[i]->hdisplay;
-
-		if (connector->tile_v_loc < v_idx)
-			voffset += modes[i]->vdisplay;
-	}
-	offsets[idx].x = hoffset;
-	offsets[idx].y = voffset;
-	DRM_DEBUG_KMS("returned %d %d for %d %d\n", hoffset, voffset, h_idx, v_idx);
-	return 0;
-}
-
-static bool drm_client_target_preferred(struct drm_connector **connectors,
-					unsigned int connector_count,
-					struct drm_display_mode **modes,
-					struct drm_client_offset *offsets,
-					bool *enabled, int width, int height)
-{
-	const u64 mask = BIT_ULL(connector_count) - 1;
-	struct drm_connector *connector;
-	u64 conn_configured = 0;
-	int tile_pass = 0;
-	int i;
-
-retry:
-	for (i = 0; i < connector_count; i++) {
-		connector = connectors[i];
-
-		if (conn_configured & BIT_ULL(i))
-			continue;
-
-		if (enabled[i] == false) {
-			conn_configured |= BIT_ULL(i);
-			continue;
-		}
-
-		/* first pass over all the untiled connectors */
-		if (tile_pass == 0 && connector->has_tile)
-			continue;
-
-		if (tile_pass == 1) {
-			if (connector->tile_h_loc != 0 ||
-			    connector->tile_v_loc != 0)
-				continue;
-
-		} else {
-			if (connector->tile_h_loc != tile_pass - 1 &&
-			    connector->tile_v_loc != tile_pass - 1)
-			/* if this tile_pass doesn't cover any of the tiles - keep going */
-				continue;
-
-			/*
-			 * find the tile offsets for this pass - need to find
-			 * all tiles left and above
-			 */
-			drm_client_get_tile_offsets(connectors, connector_count, modes, offsets, i,
-						    connector->tile_h_loc, connector->tile_v_loc);
-		}
-		DRM_DEBUG_KMS("looking for cmdline mode on connector %d\n",
-			      connector->base.id);
-
-		/* got for command line mode first */
-		modes[i] = drm_connector_pick_cmdline_mode(connector);
-		if (!modes[i]) {
-			DRM_DEBUG_KMS("looking for preferred mode on connector %d %d\n",
-				      connector->base.id, connector->tile_group ? connector->tile_group->id : 0);
-			modes[i] = drm_connector_has_preferred_mode(connector, width, height);
-		}
-		/* No preferred modes, pick one off the list */
-		if (!modes[i] && !list_empty(&connector->modes)) {
-			list_for_each_entry(modes[i], &connector->modes, head)
-				break;
-		}
-		DRM_DEBUG_KMS("found mode %s\n", modes[i] ? modes[i]->name :
-			  "none");
-		conn_configured |= BIT_ULL(i);
-	}
-
-	if ((conn_configured & mask) != mask) {
-		tile_pass++;
-		goto retry;
-	}
-	return true;
-}
-
-static bool connector_has_possible_crtc(struct drm_connector *connector,
-					struct drm_crtc *crtc)
-{
-	struct drm_encoder *encoder;
-	int i;
-
-	drm_connector_for_each_possible_encoder(connector, encoder, i) {
-		if (encoder->possible_crtcs & drm_crtc_mask(crtc))
-			return true;
-	}
-
-	return false;
-}
-
-static int drm_client_pick_crtcs(struct drm_client_dev *client,
-				 struct drm_connector **connectors,
-				 unsigned int connector_count,
-				 struct drm_crtc **best_crtcs,
-				 struct drm_display_mode **modes,
-				 int n, int width, int height)
-{
-	struct drm_device *dev = client->dev;
-	struct drm_connector *connector;
-	int my_score, best_score, score;
-	struct drm_crtc **crtcs, *crtc;
-	struct drm_mode_set *modeset;
-	int o;
-
-	if (n == connector_count)
-		return 0;
-
-	connector = connectors[n];
-
-	best_crtcs[n] = NULL;
-	best_score = drm_client_pick_crtcs(client, connectors, connector_count,
-					   best_crtcs, modes, n + 1, width, height);
-	if (modes[n] == NULL)
-		return best_score;
-
-	crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL);
-	if (!crtcs)
-		return best_score;
-
-	my_score = 1;
-	if (connector->status == connector_status_connected)
-		my_score++;
-	if (connector->cmdline_mode.specified)
-		my_score++;
-	if (drm_connector_has_preferred_mode(connector, width, height))
-		my_score++;
-
-	/*
-	 * select a crtc for this connector and then attempt to configure
-	 * remaining connectors
-	 */
-	drm_client_for_each_modeset(modeset, client) {
-		crtc = modeset->crtc;
-
-		if (!connector_has_possible_crtc(connector, crtc))
-			continue;
-
-		for (o = 0; o < n; o++)
-			if (best_crtcs[o] == crtc)
-				break;
-
-		if (o < n) {
-			/* ignore cloning unless only a single crtc */
-			if (dev->mode_config.num_crtc > 1)
-				continue;
-
-			if (!drm_mode_equal(modes[o], modes[n]))
-				continue;
-		}
-
-		crtcs[n] = crtc;
-		memcpy(crtcs, best_crtcs, n * sizeof(*crtcs));
-		score = my_score + drm_client_pick_crtcs(client, connectors, connector_count,
-							 crtcs, modes, n + 1, width, height);
-		if (score > best_score) {
-			best_score = score;
-			memcpy(best_crtcs, crtcs, connector_count * sizeof(*crtcs));
-		}
-	}
-
-	kfree(crtcs);
-	return best_score;
-}
-
-/* Try to read the BIOS display configuration and use it for the initial config */
-static bool drm_client_firmware_config(struct drm_client_dev *client,
-				       struct drm_connector **connectors,
-				       unsigned int connector_count,
-				       struct drm_crtc **crtcs,
-				       struct drm_display_mode **modes,
-				       struct drm_client_offset *offsets,
-				       bool *enabled, int width, int height)
-{
-	unsigned int count = min_t(unsigned int, connector_count, BITS_PER_LONG);
-	unsigned long conn_configured, conn_seq, mask;
-	struct drm_device *dev = client->dev;
-	int i, j;
-	bool *save_enabled;
-	bool fallback = true, ret = true;
-	int num_connectors_enabled = 0;
-	int num_connectors_detected = 0;
-	struct drm_modeset_acquire_ctx ctx;
-
-	if (!drm_drv_uses_atomic_modeset(dev))
-		return false;
-
-	save_enabled = kcalloc(count, sizeof(bool), GFP_KERNEL);
-	if (!save_enabled)
-		return false;
-
-	drm_modeset_acquire_init(&ctx, 0);
-
-	while (drm_modeset_lock_all_ctx(dev, &ctx) != 0)
-		drm_modeset_backoff(&ctx);
-
-	memcpy(save_enabled, enabled, count);
-	mask = GENMASK(count - 1, 0);
-	conn_configured = 0;
-retry:
-	conn_seq = conn_configured;
-	for (i = 0; i < count; i++) {
-		struct drm_connector *connector;
-		struct drm_encoder *encoder;
-		struct drm_crtc *new_crtc;
-
-		connector = connectors[i];
-
-		if (conn_configured & BIT(i))
-			continue;
-
-		if (conn_seq == 0 && !connector->has_tile)
-			continue;
-
-		if (connector->status == connector_status_connected)
-			num_connectors_detected++;
-
-		if (!enabled[i]) {
-			DRM_DEBUG_KMS("connector %s not enabled, skipping\n",
-				      connector->name);
-			conn_configured |= BIT(i);
-			continue;
-		}
-
-		if (connector->force == DRM_FORCE_OFF) {
-			DRM_DEBUG_KMS("connector %s is disabled by user, skipping\n",
-				      connector->name);
-			enabled[i] = false;
-			continue;
-		}
-
-		encoder = connector->state->best_encoder;
-		if (!encoder || WARN_ON(!connector->state->crtc)) {
-			if (connector->force > DRM_FORCE_OFF)
-				goto bail;
-
-			DRM_DEBUG_KMS("connector %s has no encoder or crtc, skipping\n",
-				      connector->name);
-			enabled[i] = false;
-			conn_configured |= BIT(i);
-			continue;
-		}
-
-		num_connectors_enabled++;
-
-		new_crtc = connector->state->crtc;
-
-		/*
-		 * Make sure we're not trying to drive multiple connectors
-		 * with a single CRTC, since our cloning support may not
-		 * match the BIOS.
-		 */
-		for (j = 0; j < count; j++) {
-			if (crtcs[j] == new_crtc) {
-				DRM_DEBUG_KMS("fallback: cloned configuration\n");
-				goto bail;
-			}
-		}
-
-		DRM_DEBUG_KMS("looking for cmdline mode on connector %s\n",
-			      connector->name);
-
-		/* go for command line mode first */
-		modes[i] = drm_connector_pick_cmdline_mode(connector);
-
-		/* try for preferred next */
-		if (!modes[i]) {
-			DRM_DEBUG_KMS("looking for preferred mode on connector %s %d\n",
-				      connector->name, connector->has_tile);
-			modes[i] = drm_connector_has_preferred_mode(connector, width, height);
-		}
-
-		/* No preferred mode marked by the EDID? Are there any modes? */
-		if (!modes[i] && !list_empty(&connector->modes)) {
-			DRM_DEBUG_KMS("using first mode listed on connector %s\n",
-				      connector->name);
-			modes[i] = list_first_entry(&connector->modes,
-						    struct drm_display_mode,
-						    head);
-		}
-
-		/* last resort: use current mode */
-		if (!modes[i]) {
-			/*
-			 * IMPORTANT: We want to use the adjusted mode (i.e.
-			 * after the panel fitter upscaling) as the initial
-			 * config, not the input mode, which is what crtc->mode
-			 * usually contains. But since our current
-			 * code puts a mode derived from the post-pfit timings
-			 * into crtc->mode this works out correctly.
-			 *
-			 * This is crtc->mode and not crtc->state->mode for the
-			 * fastboot check to work correctly.
-			 */
-			DRM_DEBUG_KMS("looking for current mode on connector %s\n",
-				      connector->name);
-			modes[i] = &connector->state->crtc->mode;
-		}
-		crtcs[i] = new_crtc;
-
-		DRM_DEBUG_KMS("connector %s on [CRTC:%d:%s]: %dx%d%s\n",
-			      connector->name,
-			      connector->state->crtc->base.id,
-			      connector->state->crtc->name,
-			      modes[i]->hdisplay, modes[i]->vdisplay,
-			      modes[i]->flags & DRM_MODE_FLAG_INTERLACE ? "i" : "");
-
-		fallback = false;
-		conn_configured |= BIT(i);
-	}
-
-	if ((conn_configured & mask) != mask && conn_configured != conn_seq)
-		goto retry;
-
-	/*
-	 * If the BIOS didn't enable everything it could, fall back to have the
-	 * same user experiencing of lighting up as much as possible like the
-	 * fbdev helper library.
-	 */
-	if (num_connectors_enabled != num_connectors_detected &&
-	    num_connectors_enabled < dev->mode_config.num_crtc) {
-		DRM_DEBUG_KMS("fallback: Not all outputs enabled\n");
-		DRM_DEBUG_KMS("Enabled: %i, detected: %i\n", num_connectors_enabled,
-			      num_connectors_detected);
-		fallback = true;
-	}
-
-	if (fallback) {
-bail:
-		DRM_DEBUG_KMS("Not using firmware configuration\n");
-		memcpy(enabled, save_enabled, count);
-		ret = false;
-	}
-
-	drm_modeset_drop_locks(&ctx);
-	drm_modeset_acquire_fini(&ctx);
-
-	kfree(save_enabled);
-	return ret;
-}
-
-/**
- * drm_client_modeset_probe() - Probe for displays
- * @client: DRM client
- * @width: Maximum display mode width (optional)
- * @height: Maximum display mode height (optional)
- *
- * This function sets up display pipelines for enabled connectors and stores the
- * config in the client's modeset array.
- *
- * Returns:
- * Zero on success or negative error code on failure.
- */
-int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int height)
-{
-	struct drm_connector *connector, **connectors = NULL;
-	struct drm_connector_list_iter conn_iter;
-	struct drm_device *dev = client->dev;
-	unsigned int total_modes_count = 0;
-	struct drm_client_offset *offsets;
-	unsigned int connector_count = 0;
-	struct drm_display_mode **modes;
-	struct drm_crtc **crtcs;
-	int i, ret = 0;
-	bool *enabled;
-
-	DRM_DEBUG_KMS("\n");
-
-	if (!width)
-		width = dev->mode_config.max_width;
-	if (!height)
-		height = dev->mode_config.max_height;
-
-	drm_connector_list_iter_begin(dev, &conn_iter);
-	drm_client_for_each_connector_iter(connector, &conn_iter) {
-		struct drm_connector **tmp;
-
-		tmp = krealloc(connectors, (connector_count + 1) * sizeof(*connectors), GFP_KERNEL);
-		if (!tmp) {
-			ret = -ENOMEM;
-			goto free_connectors;
-		}
-
-		connectors = tmp;
-		drm_connector_get(connector);
-		connectors[connector_count++] = connector;
-	}
-	drm_connector_list_iter_end(&conn_iter);
-
-	if (!connector_count)
-		return 0;
-
-	crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL);
-	modes = kcalloc(connector_count, sizeof(*modes), GFP_KERNEL);
-	offsets = kcalloc(connector_count, sizeof(*offsets), GFP_KERNEL);
-	enabled = kcalloc(connector_count, sizeof(bool), GFP_KERNEL);
-	if (!crtcs || !modes || !enabled || !offsets) {
-		DRM_ERROR("Memory allocation failed\n");
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	mutex_lock(&client->modeset_mutex);
-
-	mutex_lock(&dev->mode_config.mutex);
-	for (i = 0; i < connector_count; i++)
-		total_modes_count += connectors[i]->funcs->fill_modes(connectors[i], width, height);
-	if (!total_modes_count)
-		DRM_DEBUG_KMS("No connectors reported connected with modes\n");
-	drm_client_connectors_enabled(connectors, connector_count, enabled);
-
-	if (!drm_client_firmware_config(client, connectors, connector_count, crtcs,
-					modes, offsets, enabled, width, height)) {
-		memset(modes, 0, connector_count * sizeof(*modes));
-		memset(crtcs, 0, connector_count * sizeof(*crtcs));
-		memset(offsets, 0, connector_count * sizeof(*offsets));
-
-		if (!drm_client_target_cloned(dev, connectors, connector_count, modes,
-					      offsets, enabled, width, height) &&
-		    !drm_client_target_preferred(connectors, connector_count, modes,
-						 offsets, enabled, width, height))
-			DRM_ERROR("Unable to find initial modes\n");
-
-		DRM_DEBUG_KMS("picking CRTCs for %dx%d config\n",
-			      width, height);
-
-		drm_client_pick_crtcs(client, connectors, connector_count,
-				      crtcs, modes, 0, width, height);
-	}
-	mutex_unlock(&dev->mode_config.mutex);
-
-	drm_client_modeset_release(client);
-
-	for (i = 0; i < connector_count; i++) {
-		struct drm_display_mode *mode = modes[i];
-		struct drm_crtc *crtc = crtcs[i];
-		struct drm_client_offset *offset = &offsets[i];
-
-		if (mode && crtc) {
-			struct drm_mode_set *modeset = drm_client_find_modeset(client, crtc);
-			struct drm_connector *connector = connectors[i];
-
-			DRM_DEBUG_KMS("desired mode %s set on crtc %d (%d,%d)\n",
-				      mode->name, crtc->base.id, offset->x, offset->y);
-
-			if (WARN_ON_ONCE(modeset->num_connectors == DRM_CLIENT_MAX_CLONED_CONNECTORS ||
-					 (dev->mode_config.num_crtc > 1 && modeset->num_connectors == 1))) {
-				ret = -EINVAL;
-				break;
-			}
-
-			modeset->mode = drm_mode_duplicate(dev, mode);
-			drm_connector_get(connector);
-			modeset->connectors[modeset->num_connectors++] = connector;
-			modeset->x = offset->x;
-			modeset->y = offset->y;
-		}
-	}
-
-	mutex_unlock(&client->modeset_mutex);
-out:
-	kfree(crtcs);
-	kfree(modes);
-	kfree(offsets);
-	kfree(enabled);
-free_connectors:
-	for (i = 0; i < connector_count; i++)
-		drm_connector_put(connectors[i]);
-	kfree(connectors);
-
-	return ret;
-}
-
 /*
  * This is a continuation of drm_setup_crtcs() that sets up anything related
  * to the framebuffer. During initialization, drm_setup_crtcs() is called before
diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h
index 8d94880bbe25..f2d5ed745733 100644
--- a/include/drm/drm_client.h
+++ b/include/drm/drm_client.h
@@ -18,8 +18,6 @@ struct drm_gem_object;
 struct drm_minor;
 struct module;
 
-#define DRM_CLIENT_MAX_CLONED_CONNECTORS	8
-
 /**
  * struct drm_client_funcs - DRM client callbacks
  */
@@ -154,8 +152,7 @@ void drm_client_framebuffer_delete(struct drm_client_buffer *buffer);
 
 int drm_client_modeset_create(struct drm_client_dev *client);
 void drm_client_modeset_free(struct drm_client_dev *client);
-void drm_client_modeset_release(struct drm_client_dev *client);
-struct drm_mode_set *drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc);
+int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int height);
 bool drm_client_panel_rotation(struct drm_mode_set *modeset, unsigned int *rotation);
 int drm_client_modeset_commit_force(struct drm_client_dev *client);
 int drm_client_modeset_commit(struct drm_client_dev *client);
-- 
cgit v1.2.3


From ecf79e7ca1565e4d36482aa4d2950bc397313425 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 11 Jun 2019 13:28:59 +0200
Subject: drm/fb: document dirty helper better
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apparently little known fact that there's no need to hand-roll your own
anymore. Cc'ing a bunch of driver people who might want to know this
too.

v2: s/none/known/ (Chris Wilson)

Cc: Rob Clark <robdclark@gmail.com>
Cc: Sebastian Reichel <sebastian.reichel@collabora.com>
Cc: Tomi Valkeinen <tomi.valkeinen@ti.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <maxime.ripard@bootlin.com>
Cc: Sean Paul <sean@poorly.run>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: David Lechner <david@lechnology.com>
Cc: Noralf Trønnes <noralf@tronnes.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Noralf Trønnes <noralf@tronnes.org>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190611112859.16375-1-daniel.vetter@ffwll.ch
---
 include/drm/drm_framebuffer.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/drm/drm_framebuffer.h b/include/drm/drm_framebuffer.h
index c23016748e3f..c0e0256e3e98 100644
--- a/include/drm/drm_framebuffer.h
+++ b/include/drm/drm_framebuffer.h
@@ -87,6 +87,9 @@ struct drm_framebuffer_funcs {
 	 * for more information as all the semantics and arguments have a one to
 	 * one mapping on this function.
 	 *
+	 * Atomic drivers should use drm_atomic_helper_dirtyfb() to implement
+	 * this hook.
+	 *
 	 * RETURNS:
 	 *
 	 * 0 on success or a negative error code on failure.
-- 
cgit v1.2.3


From 99d02ed523dcaf5be0c6a778708df2779cfb3ed6 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Tue, 4 Jun 2019 13:42:06 -0700
Subject: drm: bridge: dw-hdmi: Add hook for resume

On Rockchip rk3288-based Chromebooks when you do a suspend/resume
cycle:

1. You lose the ability to detect an HDMI device being plugged in.

2. If you're using the i2c bus built in to dw_hdmi then it stops
working.

Let's add a hook to the core dw-hdmi driver so that we can call it in
dw_hdmi-rockchip in the next commit.

NOTE: the exact set of steps I've done here in resume come from
looking at the normal dw_hdmi init sequence in upstream Linux plus the
sequence that we did in downstream Chrome OS 3.14.  Testing show that
it seems to work, but if an extra step is needed or something here is
not needed we could improve it.

As part of this change we'll refactor the hardware init bits of
dw-hdmi to happen all in one function and all at the same time.  Since
we need to init the interrupt mutes before we request the IRQ, this
means moving the hardware init earlier in the function, but there
should be no problems with that.  Also as part of this we now
unconditionally init the "i2c" parts of dw-hdmi, but again that ought
to be fine.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Sean Paul <seanpaul@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190604204207.168085-1-dianders@chromium.org
---
 drivers/gpu/drm/bridge/synopsys/dw-hdmi.c | 48 +++++++++++++++++++------------
 include/drm/bridge/dw_hdmi.h              |  2 ++
 2 files changed, 31 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c
index 3a6320108575..f6c89007d3bb 100644
--- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c
+++ b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c
@@ -233,6 +233,13 @@ static void hdmi_mask_writeb(struct dw_hdmi *hdmi, u8 data, unsigned int reg,
 
 static void dw_hdmi_i2c_init(struct dw_hdmi *hdmi)
 {
+	hdmi_writeb(hdmi, HDMI_PHY_I2CM_INT_ADDR_DONE_POL,
+		    HDMI_PHY_I2CM_INT_ADDR);
+
+	hdmi_writeb(hdmi, HDMI_PHY_I2CM_CTLINT_ADDR_NAC_POL |
+		    HDMI_PHY_I2CM_CTLINT_ADDR_ARBITRATION_POL,
+		    HDMI_PHY_I2CM_CTLINT_ADDR);
+
 	/* Software reset */
 	hdmi_writeb(hdmi, 0x00, HDMI_I2CM_SOFTRSTZ);
 
@@ -1994,16 +2001,6 @@ static int dw_hdmi_setup(struct dw_hdmi *hdmi, struct drm_display_mode *mode)
 	return 0;
 }
 
-static void dw_hdmi_setup_i2c(struct dw_hdmi *hdmi)
-{
-	hdmi_writeb(hdmi, HDMI_PHY_I2CM_INT_ADDR_DONE_POL,
-		    HDMI_PHY_I2CM_INT_ADDR);
-
-	hdmi_writeb(hdmi, HDMI_PHY_I2CM_CTLINT_ADDR_NAC_POL |
-		    HDMI_PHY_I2CM_CTLINT_ADDR_ARBITRATION_POL,
-		    HDMI_PHY_I2CM_CTLINT_ADDR);
-}
-
 static void initialize_hdmi_ih_mutes(struct dw_hdmi *hdmi)
 {
 	u8 ih_mute;
@@ -2504,6 +2501,21 @@ static const struct regmap_config hdmi_regmap_32bit_config = {
 	.max_register	= HDMI_I2CM_FS_SCL_LCNT_0_ADDR << 2,
 };
 
+static void dw_hdmi_init_hw(struct dw_hdmi *hdmi)
+{
+	initialize_hdmi_ih_mutes(hdmi);
+
+	/*
+	 * Reset HDMI DDC I2C master controller and mute I2CM interrupts.
+	 * Even if we are using a separate i2c adapter doing this doesn't
+	 * hurt.
+	 */
+	dw_hdmi_i2c_init(hdmi);
+
+	if (hdmi->phy.ops->setup_hpd)
+		hdmi->phy.ops->setup_hpd(hdmi, hdmi->phy.data);
+}
+
 static struct dw_hdmi *
 __dw_hdmi_probe(struct platform_device *pdev,
 		const struct dw_hdmi_plat_data *plat_data)
@@ -2655,7 +2667,7 @@ __dw_hdmi_probe(struct platform_device *pdev,
 		 prod_id1 & HDMI_PRODUCT_ID1_HDCP ? "with" : "without",
 		 hdmi->phy.name);
 
-	initialize_hdmi_ih_mutes(hdmi);
+	dw_hdmi_init_hw(hdmi);
 
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
@@ -2712,10 +2724,6 @@ __dw_hdmi_probe(struct platform_device *pdev,
 	hdmi->bridge.of_node = pdev->dev.of_node;
 #endif
 
-	dw_hdmi_setup_i2c(hdmi);
-	if (hdmi->phy.ops->setup_hpd)
-		hdmi->phy.ops->setup_hpd(hdmi, hdmi->phy.data);
-
 	memset(&pdevinfo, 0, sizeof(pdevinfo));
 	pdevinfo.parent = dev;
 	pdevinfo.id = PLATFORM_DEVID_AUTO;
@@ -2768,10 +2776,6 @@ __dw_hdmi_probe(struct platform_device *pdev,
 		hdmi->cec = platform_device_register_full(&pdevinfo);
 	}
 
-	/* Reset HDMI DDC I2C master controller and mute I2CM interrupts */
-	if (hdmi->i2c)
-		dw_hdmi_i2c_init(hdmi);
-
 	return hdmi;
 
 err_iahb:
@@ -2875,6 +2879,12 @@ void dw_hdmi_unbind(struct dw_hdmi *hdmi)
 }
 EXPORT_SYMBOL_GPL(dw_hdmi_unbind);
 
+void dw_hdmi_resume(struct dw_hdmi *hdmi)
+{
+	dw_hdmi_init_hw(hdmi);
+}
+EXPORT_SYMBOL_GPL(dw_hdmi_resume);
+
 MODULE_AUTHOR("Sascha Hauer <s.hauer@pengutronix.de>");
 MODULE_AUTHOR("Andy Yan <andy.yan@rock-chips.com>");
 MODULE_AUTHOR("Yakir Yang <ykk@rock-chips.com>");
diff --git a/include/drm/bridge/dw_hdmi.h b/include/drm/bridge/dw_hdmi.h
index 66e70770cce5..601243b56b69 100644
--- a/include/drm/bridge/dw_hdmi.h
+++ b/include/drm/bridge/dw_hdmi.h
@@ -154,6 +154,8 @@ struct dw_hdmi *dw_hdmi_bind(struct platform_device *pdev,
 			     struct drm_encoder *encoder,
 			     const struct dw_hdmi_plat_data *plat_data);
 
+void dw_hdmi_resume(struct dw_hdmi *hdmi);
+
 void dw_hdmi_setup_rx_sense(struct dw_hdmi *hdmi, bool hpd, bool rx_sense);
 
 void dw_hdmi_set_sample_rate(struct dw_hdmi *hdmi, unsigned int rate);
-- 
cgit v1.2.3


From a52c8e2469c30cf7ac453d624aed9c168b23d1af Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 28 May 2019 14:37:28 +0300
Subject: RDMA: Clean destroy CQ in drivers do not return errors

Like all other destroy commands, .destroy_cq() call is not supposed
to fail. In all flows, the attempt to return earlier caused to memory
leaks.

This patch converts .destroy_cq() to do not return any errors.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Acked-by: Gal Pressman <galpress@amazon.com>
Acked-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/core/cq.c                    |  5 +--
 drivers/infiniband/core/verbs.c                 |  3 +-
 drivers/infiniband/hw/bnxt_re/ib_verbs.c        | 13 ++-----
 drivers/infiniband/hw/bnxt_re/ib_verbs.h        |  2 +-
 drivers/infiniband/hw/cxgb3/cxio_hal.c          |  6 ++--
 drivers/infiniband/hw/cxgb3/cxio_hal.h          |  2 +-
 drivers/infiniband/hw/cxgb3/iwch_provider.c     |  3 +-
 drivers/infiniband/hw/cxgb4/cq.c                | 13 +++----
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h          |  2 +-
 drivers/infiniband/hw/efa/efa.h                 |  2 +-
 drivers/infiniband/hw/efa/efa_verbs.c           |  9 ++---
 drivers/infiniband/hw/hns/hns_roce_cq.c         | 48 ++++++++++++-------------
 drivers/infiniband/hw/hns/hns_roce_device.h     |  4 +--
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c      | 14 ++------
 drivers/infiniband/hw/i40iw/i40iw_verbs.c       |  3 +-
 drivers/infiniband/hw/mlx4/cq.c                 |  4 +--
 drivers/infiniband/hw/mlx4/mlx4_ib.h            |  2 +-
 drivers/infiniband/hw/mlx5/cq.c                 |  4 +--
 drivers/infiniband/hw/mlx5/mlx5_ib.h            |  2 +-
 drivers/infiniband/hw/mthca/mthca_provider.c    |  4 +--
 drivers/infiniband/hw/nes/nes_utils.c           |  4 +--
 drivers/infiniband/hw/nes/nes_verbs.c           | 30 +++++-----------
 drivers/infiniband/hw/ocrdma/ocrdma_hw.c        |  8 ++---
 drivers/infiniband/hw/ocrdma/ocrdma_hw.h        |  2 +-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c     |  6 ++--
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h     |  2 +-
 drivers/infiniband/hw/qedr/verbs.c              | 20 ++---------
 drivers/infiniband/hw/qedr/verbs.h              |  2 +-
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c    |  4 +--
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h    |  2 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c    |  6 +---
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h |  2 +-
 drivers/infiniband/sw/rdmavt/cq.c               |  6 +---
 drivers/infiniband/sw/rdmavt/cq.h               |  2 +-
 drivers/infiniband/sw/rxe/rxe_verbs.c           |  3 +-
 include/rdma/ib_verbs.h                         |  2 +-
 36 files changed, 81 insertions(+), 165 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index cb72aa4985a4..6ee62600a812 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -207,8 +207,6 @@ EXPORT_SYMBOL(__ib_alloc_cq_user);
  */
 void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 {
-	int ret;
-
 	if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
 		return;
 
@@ -228,7 +226,6 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 
 	kfree(cq->wc);
 	rdma_restrack_del(&cq->res);
-	ret = cq->device->ops.destroy_cq(cq, udata);
-	WARN_ON_ONCE(ret);
+	cq->device->ops.destroy_cq(cq, udata);
 }
 EXPORT_SYMBOL(ib_free_cq_user);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 4fd5aad890d2..933bc35701ad 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1949,7 +1949,8 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 		return -EBUSY;
 
 	rdma_restrack_del(&cq->res);
-	return cq->device->ops.destroy_cq(cq, udata);
+	cq->device->ops.destroy_cq(cq, udata);
+	return 0;
 }
 EXPORT_SYMBOL(ib_destroy_cq_user);
 
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 8af8e1472101..0127af45dcd1 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -2517,9 +2517,8 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr,
 }
 
 /* Completion Queues */
-int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
-	int rc;
 	struct bnxt_re_cq *cq;
 	struct bnxt_qplib_nq *nq;
 	struct bnxt_re_dev *rdev;
@@ -2528,20 +2527,14 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 	rdev = cq->rdev;
 	nq = cq->qplib_cq.nq;
 
-	rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq);
-	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to destroy HW CQ");
-		return rc;
-	}
-	if (!IS_ERR_OR_NULL(cq->umem))
+	bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq);
+	if (!cq->umem)
 		ib_umem_release(cq->umem);
 
 	atomic_dec(&rdev->cq_count);
 	nq->budget--;
 	kfree(cq->cql);
 	kfree(cq);
-
-	return 0;
 }
 
 struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index 09a33049e42f..828403ee0104 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -193,7 +193,7 @@ int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
 struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
 				const struct ib_cq_init_attr *attr,
 				struct ib_udata *udata);
-int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+void bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
 int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
index d9c741fea0e9..37ee93824349 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -303,17 +303,15 @@ err1:
 	return -ENOMEM;
 }
 
-int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+void cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
 {
-	int err;
-	err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid);
+	cxio_hal_clear_cq_ctx(rdev_p, cq->cqid);
 	kfree(cq->sw_queue);
 	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
 			  (1UL << (cq->size_log2))
 			  * sizeof(struct t3_cqe) + 1, cq->queue,
 			  dma_unmap_addr(cq, mapping));
 	cxio_hal_put_cqid(rdev_p->rscp, cq->cqid);
-	return err;
 }
 
 int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq,
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
index 5fc26a4648d3..40c029ffa425 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h
@@ -158,7 +158,7 @@ void cxio_rdev_close(struct cxio_rdev *rdev);
 int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq,
 		   enum t3_cq_opcode op, u32 credit);
 int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel);
-int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+void cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
 void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
 void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
 int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq,
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 1b35941eee74..5bde4ae93681 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -88,7 +88,7 @@ static int iwch_alloc_ucontext(struct ib_ucontext *ucontext,
 	return 0;
 }
 
-static int iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+static void iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct iwch_cq *chp;
 
@@ -101,7 +101,6 @@ static int iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 
 	cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
 	kfree(chp);
-	return 0;
 }
 
 static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index 6557e7c5af66..f49e6d271c42 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -34,14 +34,13 @@
 
 #include "iw_cxgb4.h"
 
-static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
-		      struct c4iw_dev_ucontext *uctx, struct sk_buff *skb,
-		      struct c4iw_wr_wait *wr_waitp)
+static void destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
+		       struct c4iw_dev_ucontext *uctx, struct sk_buff *skb,
+		       struct c4iw_wr_wait *wr_waitp)
 {
 	struct fw_ri_res_wr *res_wr;
 	struct fw_ri_res *res;
 	int wr_len;
-	int ret;
 
 	wr_len = sizeof(*res_wr) + sizeof(*res);
 	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
@@ -59,14 +58,13 @@ static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
 	res->u.cq.iqid = cpu_to_be32(cq->cqid);
 
 	c4iw_init_wr_wait(wr_waitp);
-	ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__);
+	c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__);
 
 	kfree(cq->sw_queue);
 	dma_free_coherent(&(rdev->lldi.pdev->dev),
 			  cq->memsize, cq->queue,
 			  dma_unmap_addr(cq, mapping));
 	c4iw_put_cqid(rdev, cq->cqid, uctx);
-	return ret;
 }
 
 static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
@@ -970,7 +968,7 @@ int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 	return !err || err == -ENODATA ? npolled : err;
 }
 
-int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct c4iw_cq *chp;
 	struct c4iw_ucontext *ucontext;
@@ -989,7 +987,6 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 		   chp->destroy_skb, chp->wr_waitp);
 	c4iw_put_wr_wait(chp->wr_waitp);
 	kfree(chp);
-	return 0;
 }
 
 struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index cf7512b2c4c0..45e720288f0f 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -992,7 +992,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
 					   struct ib_udata *udata);
 struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
 int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
-int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
+void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
 struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
 			     const struct ib_cq_init_attr *attr,
 			     struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index 14a36546985b..52d894f0ad3e 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -134,7 +134,7 @@ int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 struct ib_qp *efa_create_qp(struct ib_pd *ibpd,
 			    struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata);
-int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 struct ib_cq *efa_create_cq(struct ib_device *ibdev,
 			    const struct ib_cq_init_attr *attr,
 			    struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 607aff869200..42865cf4f149 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -847,25 +847,20 @@ static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx)
 	return efa_com_destroy_cq(&dev->edev, &params);
 }
 
-int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct efa_dev *dev = to_edev(ibcq->device);
 	struct efa_cq *cq = to_ecq(ibcq);
-	int err;
 
 	ibdev_dbg(&dev->ibdev,
 		  "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n",
 		  cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr);
 
-	err = efa_destroy_cq_idx(dev, cq->cq_idx);
-	if (err)
-		return err;
-
+	efa_destroy_cq_idx(dev, cq->cq_idx);
 	dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size,
 			 DMA_FROM_DEVICE);
 
 	kfree(cq);
-	return 0;
 }
 
 static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 6e81ff3f1813..0eb7c16c007b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -443,40 +443,36 @@ err_cq:
 }
 EXPORT_SYMBOL_GPL(hns_roce_ib_create_cq);
 
-int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
 	struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
-	int ret = 0;
 
 	if (hr_dev->hw->destroy_cq) {
-		ret = hr_dev->hw->destroy_cq(ib_cq, udata);
-	} else {
-		hns_roce_free_cq(hr_dev, hr_cq);
-		hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-
-		if (udata) {
-			ib_umem_release(hr_cq->umem);
-
-			if (hr_cq->db_en == 1)
-				hns_roce_db_unmap_user(
-					rdma_udata_to_drv_context(
-						udata,
-						struct hns_roce_ucontext,
-						ibucontext),
-					&hr_cq->db);
-		} else {
-			/* Free the buff of stored cq */
-			hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf,
-						ib_cq->cqe);
-			if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
-				hns_roce_free_db(hr_dev, &hr_cq->db);
-		}
+		hr_dev->hw->destroy_cq(ib_cq, udata);
+		return;
+	}
+
+	hns_roce_free_cq(hr_dev, hr_cq);
+	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
 
-		kfree(hr_cq);
+	if (udata) {
+		ib_umem_release(hr_cq->umem);
+
+		if (hr_cq->db_en == 1)
+			hns_roce_db_unmap_user(rdma_udata_to_drv_context(
+						       udata,
+						       struct hns_roce_ucontext,
+						       ibucontext),
+					       &hr_cq->db);
+	} else {
+		/* Free the buff of stored cq */
+		hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, ib_cq->cqe);
+		if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
+			hns_roce_free_db(hr_dev, &hr_cq->db);
 	}
 
-	return ret;
+	kfree(hr_cq);
 }
 EXPORT_SYMBOL_GPL(hns_roce_ib_destroy_cq);
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index ce23338831eb..2f7d9644fd24 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -938,7 +938,7 @@ struct hns_roce_hw {
 	int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 	int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
 			struct ib_udata *udata);
-	int (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata);
+	void (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata);
 	int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 	int (*init_eq)(struct hns_roce_dev *hr_dev);
 	void (*cleanup_eq)(struct hns_roce_dev *hr_dev);
@@ -1209,7 +1209,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
 				    const struct ib_cq_init_attr *attr,
 				    struct ib_udata *udata);
 
-int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
+void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
 void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq);
 
 int hns_roce_db_map_user(struct hns_roce_ucontext *context,
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 878c8ae35630..aa7b67d283af 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -865,8 +865,7 @@ alloc_pd_failed:
 	kfree(pd);
 
 alloc_mem_failed:
-	if (hns_roce_ib_destroy_cq(cq, NULL))
-		dev_err(dev, "Destroy cq for create_lp_qp failed!\n");
+	hns_roce_ib_destroy_cq(cq, NULL);
 
 	return ret;
 }
@@ -894,10 +893,7 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev)
 				i, ret);
 	}
 
-	ret = hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL);
-	if (ret)
-		dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret);
-
+	hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL);
 	hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd, NULL);
 }
 
@@ -3654,7 +3650,7 @@ int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 	return 0;
 }
 
-static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device);
 	struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
@@ -3663,7 +3659,6 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 	u32 cqe_cnt_cur;
 	u32 cq_buf_size;
 	int wait_time = 0;
-	int ret = 0;
 
 	hns_roce_free_cq(hr_dev, hr_cq);
 
@@ -3685,7 +3680,6 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 		if (wait_time > HNS_ROCE_MAX_FREE_CQ_WAIT_CNT) {
 			dev_warn(dev, "Destroy cq 0x%lx timeout!\n",
 				hr_cq->cqn);
-			ret = -ETIMEDOUT;
 			break;
 		}
 		wait_time++;
@@ -3702,8 +3696,6 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 	}
 
 	kfree(hr_cq);
-
-	return ret;
 }
 
 static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not)
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index bfe16e6f04f4..205053cb5f97 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -1064,7 +1064,7 @@ void i40iw_cq_wq_destroy(struct i40iw_device *iwdev, struct i40iw_sc_cq *cq)
  * @ib_cq: cq pointer
  * @udata: user data or NULL for kernel object
  */
-static int i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+static void i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct i40iw_cq *iwcq;
 	struct i40iw_device *iwdev;
@@ -1077,7 +1077,6 @@ static int i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 	cq_free_resources(iwdev, iwcq);
 	kfree(iwcq);
 	i40iw_rem_devusecount(iwdev);
-	return 0;
 }
 
 /**
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 022a0b4ea452..8eb7490dabb8 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -486,7 +486,7 @@ out:
 	return err;
 }
 
-int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(cq->device);
 	struct mlx4_ib_cq *mcq = to_mcq(cq);
@@ -508,8 +508,6 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 	}
 
 	kfree(mcq);
-
-	return 0;
 }
 
 static void dump_cqe(void *cqe)
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 26897102057d..af5ee45a9f19 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -746,7 +746,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
 struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
 				const struct ib_cq_init_attr *attr,
 				struct ib_udata *udata);
-int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 2e2e65f00257..ebd01bd7f8f6 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -998,7 +998,7 @@ err_create:
 	return ERR_PTR(err);
 }
 
-int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(cq->device);
 	struct mlx5_ib_cq *mcq = to_mcq(cq);
@@ -1010,8 +1010,6 @@ int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 		destroy_cq_kernel(dev, mcq);
 
 	kfree(mcq);
-
-	return 0;
 }
 
 static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn)
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 40eb8be482e4..bf697c5b3e79 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1118,7 +1118,7 @@ int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
 struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 				const struct ib_cq_init_attr *attr,
 				struct ib_udata *udata);
-int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index b128ff76f709..81fc04e1c142 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -804,7 +804,7 @@ out:
 	return ret;
 }
 
-static int mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+static void mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
 	if (udata) {
 		struct mthca_ucontext *context =
@@ -824,8 +824,6 @@ static int mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 	}
 	mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
 	kfree(cq);
-
-	return 0;
 }
 
 static inline u32 convert_access(int acc)
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
index 90f28890246d..e976292fc6c0 100644
--- a/drivers/infiniband/hw/nes/nes_utils.c
+++ b/drivers/infiniband/hw/nes/nes_utils.c
@@ -588,9 +588,7 @@ struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev)
 		cqp_request->callback = 0;
 		nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n",
 				cqp_request);
-	} else
-		printk(KERN_ERR PFX "%s: Could not allocated a CQP request.\n",
-			   __func__);
+	}
 
 	return cqp_request;
 }
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 7677f1f734bb..cac3fa624c4d 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -1634,7 +1634,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 /**
  * nes_destroy_cq
  */
-static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+static void nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct nes_cq *nescq;
 	struct nes_device *nesdev;
@@ -1644,7 +1644,6 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 	struct nes_cqp_request cqp_request = {};
 	unsigned long flags;
 	u32 opcode = 0;
-	int ret;
 
 	nescq = to_nescq(ib_cq);
 	nesvnic = to_nesvnic(ib_cq->device);
@@ -1656,6 +1655,7 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 	/* Send DestroyCQ request to CQP */
 	INIT_LIST_HEAD(&cqp_request.list);
 	init_waitqueue_head(&cqp_request.waitq);
+
 	cqp_request.waiting = 1;
 	cqp_wqe = &cqp_request.cqp_wqe;
 	opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16);
@@ -1689,30 +1689,18 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 	/* Wait for CQP */
 	nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n",
 			nescq->hw_cq.cq_number);
-	ret = wait_event_timeout(cqp_request.waitq, cqp_request.request_done,
-				 NES_EVENT_TIMEOUT);
-	nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u,"
-			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-			nescq->hw_cq.cq_number, ret, cqp_request.major_code,
-			cqp_request.minor_code);
-	if (!ret) {
-		nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n",
-					nescq->hw_cq.cq_number);
-		ret = -ETIME;
-	} else if (cqp_request.major_code) {
-		nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n",
-					nescq->hw_cq.cq_number);
-		ret = -EIO;
-	} else {
-		ret = 0;
-	}
+	wait_event_timeout(cqp_request.waitq, cqp_request.request_done,
+			   NES_EVENT_TIMEOUT);
+	nes_debug(
+		NES_DBG_CQ,
+		"Destroy iWARP CQ%u completed CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+		nescq->hw_cq.cq_number, cqp_request.major_code,
+		cqp_request.minor_code);
 
 	if (nescq->cq_mem_size)
 		pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size,
 				    nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase);
 	kfree(nescq);
-
-	return ret;
 }
 
 /**
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
index 5127e2ea4bdd..b2dd4e0a4be2 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -1888,14 +1888,13 @@ mem_err:
 	return status;
 }
 
-int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq)
+void ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq)
 {
-	int status = -ENOMEM;
 	struct ocrdma_destroy_cq *cmd;
 
 	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_CQ, sizeof(*cmd));
 	if (!cmd)
-		return status;
+		return;
 	ocrdma_init_mch(&cmd->req, OCRDMA_CMD_DELETE_CQ,
 			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
 
@@ -1903,11 +1902,10 @@ int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq)
 	    (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) &
 	    OCRDMA_DESTROY_CQ_QID_MASK;
 
-	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
 	ocrdma_unbind_eq(dev, cq->eqn);
 	dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa);
 	kfree(cmd);
-	return status;
 }
 
 int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h
index 06ec59326a90..12c23a7652b9 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h
@@ -122,7 +122,7 @@ int ocrdma_reg_mr(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr,
 			u32 pd_id, int acc);
 int ocrdma_mbx_create_cq(struct ocrdma_dev *, struct ocrdma_cq *,
 				int entries, int dpp_cq, u16 pd_id);
-int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *);
+void ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq);
 
 int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs,
 			 u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 35ec87015792..94e4f7f9b1f7 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -1070,7 +1070,7 @@ static void ocrdma_flush_cq(struct ocrdma_cq *cq)
 	spin_unlock_irqrestore(&cq->cq_lock, flags);
 }
 
-int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
 	struct ocrdma_eq *eq = NULL;
@@ -1080,14 +1080,13 @@ int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 
 	dev->cq_tbl[cq->id] = NULL;
 	indx = ocrdma_get_eq_table_index(dev, cq->eqn);
-	BUG_ON(indx == -EINVAL);
 
 	eq = &dev->eq_tbl[indx];
 	irq = ocrdma_get_irq(dev, eq);
 	synchronize_irq(irq);
 	ocrdma_flush_cq(cq);
 
-	(void)ocrdma_mbx_destroy_cq(dev, cq);
+	ocrdma_mbx_destroy_cq(dev, cq);
 	if (cq->ucontext) {
 		pdid = cq->ucontext->cntxt_pd->id;
 		ocrdma_del_mmap(cq->ucontext, (u64) cq->pa,
@@ -1098,7 +1097,6 @@ int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 	}
 
 	kfree(cq);
-	return 0;
 }
 
 static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index d76aae7ed0d3..89cebe05669e 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -75,7 +75,7 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
 			       const struct ib_cq_init_attr *attr,
 			       struct ib_udata *udata);
 int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
-int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 
 struct ib_qp *ocrdma_create_qp(struct ib_pd *,
 			       struct ib_qp_init_attr *attrs,
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 3c0dba072071..be29bbbc4b14 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -955,14 +955,13 @@ int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata)
 #define QEDR_DESTROY_CQ_MAX_ITERATIONS		(10)
 #define QEDR_DESTROY_CQ_ITER_DURATION		(10)
 
-int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct qedr_dev *dev = get_qedr_dev(ibcq->device);
 	struct qed_rdma_destroy_cq_out_params oparams;
 	struct qed_rdma_destroy_cq_in_params iparams;
 	struct qedr_cq *cq = get_qedr_cq(ibcq);
 	int iter;
-	int rc;
 
 	DP_DEBUG(dev, QEDR_MSG_CQ, "destroy cq %p (icid=%d)\n", cq, cq->icid);
 
@@ -973,10 +972,7 @@ int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 		goto done;
 
 	iparams.icid = cq->icid;
-	rc = dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams);
-	if (rc)
-		return rc;
-
+	dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams);
 	dev->ops->common->chain_free(dev->cdev, &cq->pbl);
 
 	if (udata) {
@@ -1007,9 +1003,6 @@ int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 		iter--;
 	}
 
-	if (oparams.num_cq_notif != cq->cnq_notif)
-		goto err;
-
 	/* Note that we don't need to have explicit code to wait for the
 	 * completion of the event handler because it is invoked from the EQ.
 	 * Since the destroy CQ ramrod has also been received on the EQ we can
@@ -1019,15 +1012,6 @@ done:
 	cq->sig = ~cq->sig;
 
 	kfree(cq);
-
-	return 0;
-
-err:
-	DP_ERR(dev,
-	       "CQ %p (icid=%d) not freed, expecting %d ints but got %d ints\n",
-	       cq, cq->icid, oparams.num_cq_notif, cq->cnq_notif);
-
-	return -EINVAL;
 }
 
 static inline int get_gid_info_from_table(struct ib_qp *ibqp,
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 9328c80375ef..32d7ce77e339 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -54,7 +54,7 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
 			     const struct ib_cq_init_attr *attr,
 			     struct ib_udata *udata);
 int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
-int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 struct ib_qp *qedr_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs,
 			     struct ib_udata *);
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index e9352750e029..5686d14b86fe 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -604,11 +604,9 @@ struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev,
 	return cq;
 }
 
-int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
-	usnic_dbg("\n");
 	kfree(cq);
-	return 0;
 }
 
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
index 028f322f8e9b..0b9d993433a7 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -61,7 +61,7 @@ int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev,
 				 const struct ib_cq_init_attr *attr,
 				 struct ib_udata *udata);
-int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
 				u64 virt_addr, int access_flags,
 				struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
index d7deb19a2800..0682781f6555 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
@@ -246,10 +246,8 @@ static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq)
  * pvrdma_destroy_cq - destroy completion queue
  * @cq: the completion queue to destroy.
  * @udata: user data or null for kernel object
- *
- * @return: 0 for success.
  */
-int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
 	struct pvrdma_cq *vcq = to_vcq(cq);
 	union pvrdma_cmd_req req;
@@ -275,8 +273,6 @@ int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 
 	pvrdma_free_cq(dev, vcq);
 	atomic_dec(&dev->num_cqs);
-
-	return ret;
 }
 
 static inline struct pvrdma_cqe *get_cqe(struct pvrdma_cq *cq, int i)
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
index 9d7b021e1c59..f0dd6e4d058b 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
@@ -412,7 +412,7 @@ int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
 			       const struct ib_cq_init_attr *attr,
 			       struct ib_udata *udata);
-int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 int pvrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index a06e6da7a026..8e76036fad4a 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -300,10 +300,8 @@ done:
  * @udata: user data or NULL for kernel object
  *
  * Called by ib_destroy_cq() in the generic verbs code.
- *
- * Return: always 0
  */
-int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
 	struct rvt_dev_info *rdi = cq->rdi;
@@ -317,8 +315,6 @@ int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 	else
 		vfree(cq->queue);
 	kfree(cq);
-
-	return 0;
 }
 
 /**
diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h
index 3ad6faf18ecb..495d8c3e6580 100644
--- a/drivers/infiniband/sw/rdmavt/cq.h
+++ b/drivers/infiniband/sw/rdmavt/cq.h
@@ -54,7 +54,7 @@
 struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
 			    const struct ib_cq_init_attr *attr,
 			    struct ib_udata *udata);
-int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
 int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
 int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 046129393215..b14881decbee 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -819,14 +819,13 @@ err1:
 	return ERR_PTR(err);
 }
 
-static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+static void rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct rxe_cq *cq = to_rcq(ibcq);
 
 	rxe_cq_disable(cq);
 
 	rxe_drop_ref(cq);
-	return 0;
 }
 
 static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index d1f16a6c4810..bc1d94c9c9ba 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2462,7 +2462,7 @@ struct ib_device_ops {
 				   const struct ib_cq_init_attr *attr,
 				   struct ib_udata *udata);
 	int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
-	int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
+	void (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
 	int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata);
 	struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags);
 	struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length,
-- 
cgit v1.2.3


From e39afe3d6dbd908d8fd189571a3c1561088a86c2 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 28 May 2019 14:37:29 +0300
Subject: RDMA: Convert CQ allocations to be under core responsibility

Ensure that CQ is allocated and freed by IB/core and not by drivers.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Acked-by: Gal Pressman <galpress@amazon.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Tested-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/core/cq.c                    | 28 +++++++-----
 drivers/infiniband/core/device.c                |  1 +
 drivers/infiniband/core/uverbs_cmd.c            | 15 ++++---
 drivers/infiniband/core/uverbs_std_types_cq.c   | 19 +++++---
 drivers/infiniband/core/verbs.c                 | 32 ++++++++-----
 drivers/infiniband/hw/bnxt_re/ib_verbs.c        | 20 +++------
 drivers/infiniband/hw/bnxt_re/ib_verbs.h        |  7 ++-
 drivers/infiniband/hw/bnxt_re/main.c            |  1 +
 drivers/infiniband/hw/cxgb3/iwch_provider.c     | 43 ++++++++----------
 drivers/infiniband/hw/cxgb4/cq.c                | 27 +++++------
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h          |  5 +--
 drivers/infiniband/hw/cxgb4/provider.c          |  1 +
 drivers/infiniband/hw/efa/efa.h                 |  5 +--
 drivers/infiniband/hw/efa/efa_main.c            |  1 +
 drivers/infiniband/hw/efa/efa_verbs.c           | 48 ++++++--------------
 drivers/infiniband/hw/hns/hns_roce_cq.c         | 23 ++++------
 drivers/infiniband/hw/hns/hns_roce_device.h     |  6 +--
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c      | 21 +++++----
 drivers/infiniband/hw/hns/hns_roce_main.c       |  1 +
 drivers/infiniband/hw/i40iw/i40iw_verbs.c       | 33 ++++++--------
 drivers/infiniband/hw/mlx4/cq.c                 | 25 ++++-------
 drivers/infiniband/hw/mlx4/main.c               |  1 +
 drivers/infiniband/hw/mlx4/mlx4_ib.h            |  5 +--
 drivers/infiniband/hw/mlx5/cq.c                 | 32 +++++--------
 drivers/infiniband/hw/mlx5/main.c               | 21 +++++----
 drivers/infiniband/hw/mlx5/mlx5_ib.h            |  5 +--
 drivers/infiniband/hw/mthca/mthca_provider.c    | 36 +++++++--------
 drivers/infiniband/hw/nes/nes_verbs.c           | 60 +++++++++----------------
 drivers/infiniband/hw/ocrdma/ocrdma_main.c      |  1 +
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c     | 29 +++++-------
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h     |  5 +--
 drivers/infiniband/hw/qedr/main.c               |  1 +
 drivers/infiniband/hw/qedr/verbs.c              | 28 ++++--------
 drivers/infiniband/hw/qedr/verbs.h              |  5 +--
 drivers/infiniband/hw/usnic/usnic_ib.h          |  4 ++
 drivers/infiniband/hw/usnic/usnic_ib_main.c     |  1 +
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c    | 18 +++-----
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h    |  5 +--
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c    | 34 +++++---------
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c  |  1 +
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h |  5 +--
 drivers/infiniband/sw/rdmavt/cq.c               | 51 +++++++--------------
 drivers/infiniband/sw/rdmavt/cq.h               |  5 +--
 drivers/infiniband/sw/rdmavt/vt.c               |  1 +
 drivers/infiniband/sw/rxe/rxe_pool.c            |  1 +
 drivers/infiniband/sw/rxe/rxe_verbs.c           | 30 +++++--------
 drivers/infiniband/sw/rxe/rxe_verbs.h           |  2 +-
 include/rdma/ib_verbs.h                         |  6 +--
 48 files changed, 318 insertions(+), 437 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index 6ee62600a812..3b9412c69565 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -147,23 +147,26 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 	struct ib_cq *cq;
 	int ret = -ENOMEM;
 
-	cq = dev->ops.create_cq(dev, &cq_attr, NULL);
-	if (IS_ERR(cq))
-		return cq;
+	cq = rdma_zalloc_drv_obj(dev, ib_cq);
+	if (!cq)
+		return ERR_PTR(ret);
 
 	cq->device = dev;
-	cq->uobject = NULL;
-	cq->event_handler = NULL;
 	cq->cq_context = private;
 	cq->poll_ctx = poll_ctx;
 	atomic_set(&cq->usecnt, 0);
 
 	cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
 	if (!cq->wc)
-		goto out_destroy_cq;
+		goto out_free_cq;
 
 	cq->res.type = RDMA_RESTRACK_CQ;
 	rdma_restrack_set_task(&cq->res, caller);
+
+	ret = dev->ops.create_cq(cq, &cq_attr, NULL);
+	if (ret)
+		goto out_free_wc;
+
 	rdma_restrack_kadd(&cq->res);
 
 	switch (cq->poll_ctx) {
@@ -186,16 +189,18 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 		break;
 	default:
 		ret = -EINVAL;
-		goto out_free_wc;
+		goto out_destroy_cq;
 	}
 
 	return cq;
 
-out_free_wc:
-	kfree(cq->wc);
-	rdma_restrack_del(&cq->res);
 out_destroy_cq:
+	rdma_restrack_del(&cq->res);
 	cq->device->ops.destroy_cq(cq, udata);
+out_free_wc:
+	kfree(cq->wc);
+out_free_cq:
+	kfree(cq);
 	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL(__ib_alloc_cq_user);
@@ -224,8 +229,9 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 		WARN_ON_ONCE(1);
 	}
 
-	kfree(cq->wc);
 	rdma_restrack_del(&cq->res);
 	cq->device->ops.destroy_cq(cq, udata);
+	kfree(cq->wc);
+	kfree(cq);
 }
 EXPORT_SYMBOL(ib_free_cq_user);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 357d74c8df2b..abb169f31d0f 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2431,6 +2431,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, unmap_fmr);
 
 	SET_OBJ_SIZE(dev_ops, ib_ah);
+	SET_OBJ_SIZE(dev_ops, ib_cq);
 	SET_OBJ_SIZE(dev_ops, ib_pd);
 	SET_OBJ_SIZE(dev_ops, ib_srq);
 	SET_OBJ_SIZE(dev_ops, ib_ucontext);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 5a3a1780ceea..5c00d9a5698a 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1010,12 +1010,11 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
 	attr.comp_vector = cmd->comp_vector;
 	attr.flags = cmd->flags;
 
-	cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);
-	if (IS_ERR(cq)) {
-		ret = PTR_ERR(cq);
+	cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
+	if (!cq) {
+		ret = -ENOMEM;
 		goto err_file;
 	}
-
 	cq->device        = ib_dev;
 	cq->uobject       = &obj->uobject;
 	cq->comp_handler  = ib_uverbs_comp_handler;
@@ -1023,6 +1022,10 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
 	cq->cq_context    = ev_file ? &ev_file->ev_queue : NULL;
 	atomic_set(&cq->usecnt, 0);
 
+	ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+	if (ret)
+		goto err_free;
+
 	obj->uobject.object = cq;
 	memset(&resp, 0, sizeof resp);
 	resp.base.cq_handle = obj->uobject.id;
@@ -1043,7 +1046,9 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
 
 err_cb:
 	ib_destroy_cq(cq);
-
+	cq = NULL;
+err_free:
+	kfree(cq);
 err_file:
 	if (ev_file)
 		ib_uverbs_release_ucq(attrs->ufile, ev_file, obj);
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index db5c46a1bb2d..06b8c7d017b7 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -111,9 +111,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
 	INIT_LIST_HEAD(&obj->comp_list);
 	INIT_LIST_HEAD(&obj->async_list);
 
-	cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);
-	if (IS_ERR(cq)) {
-		ret = PTR_ERR(cq);
+	cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
+	if (!cq) {
+		ret = -ENOMEM;
 		goto err_event_file;
 	}
 
@@ -122,10 +122,15 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
 	cq->comp_handler  = ib_uverbs_comp_handler;
 	cq->event_handler = ib_uverbs_cq_event_handler;
 	cq->cq_context    = ev_file ? &ev_file->ev_queue : NULL;
-	obj->uobject.object = cq;
-	obj->uobject.user_handle = user_handle;
 	atomic_set(&cq->usecnt, 0);
 	cq->res.type = RDMA_RESTRACK_CQ;
+
+	ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+	if (ret)
+		goto err_free;
+
+	obj->uobject.object = cq;
+	obj->uobject.user_handle = user_handle;
 	rdma_restrack_uadd(&cq->res);
 
 	ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe,
@@ -136,7 +141,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
 	return 0;
 err_cq:
 	ib_destroy_cq(cq);
-
+	cq = NULL;
+err_free:
+	kfree(cq);
 err_event_file:
 	if (ev_file)
 		uverbs_uobject_put(ev_file_uobj);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 933bc35701ad..585e100706aa 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1916,21 +1916,28 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,
 			     const char *caller)
 {
 	struct ib_cq *cq;
+	int ret;
+
+	cq = rdma_zalloc_drv_obj(device, ib_cq);
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
 
-	cq = device->ops.create_cq(device, cq_attr, NULL);
-
-	if (!IS_ERR(cq)) {
-		cq->device        = device;
-		cq->uobject       = NULL;
-		cq->comp_handler  = comp_handler;
-		cq->event_handler = event_handler;
-		cq->cq_context    = cq_context;
-		atomic_set(&cq->usecnt, 0);
-		cq->res.type = RDMA_RESTRACK_CQ;
-		rdma_restrack_set_task(&cq->res, caller);
-		rdma_restrack_kadd(&cq->res);
+	cq->device = device;
+	cq->uobject = NULL;
+	cq->comp_handler = comp_handler;
+	cq->event_handler = event_handler;
+	cq->cq_context = cq_context;
+	atomic_set(&cq->usecnt, 0);
+	cq->res.type = RDMA_RESTRACK_CQ;
+	rdma_restrack_set_task(&cq->res, caller);
+
+	ret = device->ops.create_cq(cq, cq_attr, NULL);
+	if (ret) {
+		kfree(cq);
+		return ERR_PTR(ret);
 	}
 
+	rdma_restrack_kadd(&cq->res);
 	return cq;
 }
 EXPORT_SYMBOL(__ib_create_cq);
@@ -1950,6 +1957,7 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 
 	rdma_restrack_del(&cq->res);
 	cq->device->ops.destroy_cq(cq, udata);
+	kfree(cq);
 	return 0;
 }
 EXPORT_SYMBOL(ib_destroy_cq_user);
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 0127af45dcd1..44cc5f19df3b 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -2534,16 +2534,14 @@ void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 	atomic_dec(&rdev->cq_count);
 	nq->budget--;
 	kfree(cq->cql);
-	kfree(cq);
 }
 
-struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_udata *udata)
+int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata)
 {
-	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibcq->device, ibdev);
 	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
-	struct bnxt_re_cq *cq = NULL;
+	struct bnxt_re_cq *cq = container_of(ibcq, struct bnxt_re_cq, ib_cq);
 	int rc, entries;
 	int cqe = attr->cqe;
 	struct bnxt_qplib_nq *nq = NULL;
@@ -2552,11 +2550,8 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
 	/* Validate CQ fields */
 	if (cqe < 1 || cqe > dev_attr->max_cq_wqes) {
 		dev_err(rdev_to_dev(rdev), "Failed to create CQ -max exceeded");
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
 
 	cq->rdev = rdev;
 	cq->qplib_cq.cq_handle = (u64)(unsigned long)(&cq->qplib_cq);
@@ -2634,15 +2629,14 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
 		}
 	}
 
-	return &cq->ib_cq;
+	return 0;
 
 c2fail:
 	if (udata)
 		ib_umem_release(cq->umem);
 fail:
 	kfree(cq->cql);
-	kfree(cq);
-	return ERR_PTR(rc);
+	return rc;
 }
 
 static u8 __req_to_ib_wc_status(u8 qstatus)
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index 828403ee0104..31662b1ee35a 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -94,11 +94,11 @@ struct bnxt_re_qp {
 };
 
 struct bnxt_re_cq {
+	struct ib_cq		ib_cq;
 	struct bnxt_re_dev	*rdev;
 	spinlock_t              cq_lock;	/* protect cq */
 	u16			cq_count;
 	u16			cq_period;
-	struct ib_cq		ib_cq;
 	struct bnxt_qplib_cq	qplib_cq;
 	struct bnxt_qplib_cqe	*cql;
 #define MAX_CQL_PER_POLL	1024
@@ -190,9 +190,8 @@ int bnxt_re_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr,
 		      const struct ib_send_wr **bad_send_wr);
 int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
 		      const struct ib_recv_wr **bad_recv_wr);
-struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_udata *udata);
+int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata);
 void bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
 int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 351c420248a0..029babe713f3 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -641,6 +641,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = {
 	.reg_user_mr = bnxt_re_reg_user_mr,
 	.req_notify_cq = bnxt_re_req_notify_cq,
 	INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd),
 	INIT_RDMA_OBJ_SIZE(ib_srq, bnxt_re_srq, ib_srq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx),
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 5bde4ae93681..acba96f289cc 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -100,16 +100,16 @@ static void iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 	wait_event(chp->wait, !atomic_read(&chp->refcnt));
 
 	cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
-	kfree(chp);
 }
 
-static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
-				    const struct ib_cq_init_attr *attr,
-				    struct ib_udata *udata)
+static int iwch_create_cq(struct ib_cq *ibcq,
+			  const struct ib_cq_init_attr *attr,
+			  struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
-	struct iwch_dev *rhp;
-	struct iwch_cq *chp;
+	struct iwch_dev *rhp = to_iwch_dev(ibcq->device);
+	struct iwch_cq *chp = to_iwch_cq(ibcq);
 	struct iwch_create_cq_resp uresp;
 	struct iwch_create_cq_req ureq;
 	static int warned;
@@ -117,19 +117,13 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
 
 	pr_debug("%s ib_dev %p entries %d\n", __func__, ibdev, entries);
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
-
-	rhp = to_iwch_dev(ibdev);
-	chp = kzalloc(sizeof(*chp), GFP_KERNEL);
-	if (!chp)
-		return ERR_PTR(-ENOMEM);
+		return -EINVAL;
 
 	if (udata) {
 		if (!t3a_device(rhp)) {
-			if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
-				kfree(chp);
-				return ERR_PTR(-EFAULT);
-			}
+			if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
+				return  -EFAULT;
+
 			chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr;
 		}
 	}
@@ -150,10 +144,9 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
 	entries = roundup_pow_of_two(entries);
 	chp->cq.size_log2 = ilog2(entries);
 
-	if (cxio_create_cq(&rhp->rdev, &chp->cq, !udata)) {
-		kfree(chp);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (cxio_create_cq(&rhp->rdev, &chp->cq, !udata))
+		return -ENOMEM;
+
 	chp->rhp = rhp;
 	chp->ibcq.cqe = 1 << chp->cq.size_log2;
 	spin_lock_init(&chp->lock);
@@ -162,8 +155,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
 	init_waitqueue_head(&chp->wait);
 	if (xa_store_irq(&rhp->cqs, chp->cq.cqid, chp, GFP_KERNEL)) {
 		cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
-		kfree(chp);
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	}
 
 	if (udata) {
@@ -174,7 +166,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
 		mm = kmalloc(sizeof(*mm), GFP_KERNEL);
 		if (!mm) {
 			iwch_destroy_cq(&chp->ibcq, udata);
-			return ERR_PTR(-ENOMEM);
+			return -ENOMEM;
 		}
 		uresp.cqid = chp->cq.cqid;
 		uresp.size_log2 = chp->cq.size_log2;
@@ -200,14 +192,14 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
 		if (ib_copy_to_udata(udata, &uresp, resplen)) {
 			kfree(mm);
 			iwch_destroy_cq(&chp->ibcq, udata);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 		insert_mmap(ucontext, mm);
 	}
 	pr_debug("created cqid 0x%0x chp %p size 0x%0x, dma_addr %pad\n",
 		 chp->cq.cqid, chp, (1 << chp->cq.size_log2),
 		 &chp->cq.dma_addr);
-	return &chp->ibcq;
+	return 0;
 }
 
 static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
@@ -1277,6 +1269,7 @@ static const struct ib_device_ops iwch_dev_ops = {
 	.reg_user_mr = iwch_reg_user_mr,
 	.req_notify_cq = iwch_arm_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_cq, iwch_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext),
 };
 
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index f49e6d271c42..3cc4d3331a3f 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -986,17 +986,16 @@ void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 		   ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx,
 		   chp->destroy_skb, chp->wr_waitp);
 	c4iw_put_wr_wait(chp->wr_waitp);
-	kfree(chp);
 }
 
-struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
-			     const struct ib_cq_init_attr *attr,
-			     struct ib_udata *udata)
+int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		   struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
 	int vector = attr->comp_vector;
-	struct c4iw_dev *rhp;
-	struct c4iw_cq *chp;
+	struct c4iw_dev *rhp = to_c4iw_dev(ibcq->device);
+	struct c4iw_cq *chp = to_c4iw_cq(ibcq);
 	struct c4iw_create_cq ucmd;
 	struct c4iw_create_cq_resp uresp;
 	int ret, wr_len;
@@ -1007,22 +1006,16 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
 
 	pr_debug("ib_dev %p entries %d\n", ibdev, entries);
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
-
-	rhp = to_c4iw_dev(ibdev);
+		return -EINVAL;
 
 	if (vector >= rhp->rdev.lldi.nciq)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (udata) {
 		if (udata->inlen < sizeof(ucmd))
 			ucontext->is_32b_cqe = 1;
 	}
 
-	chp = kzalloc(sizeof(*chp), GFP_KERNEL);
-	if (!chp)
-		return ERR_PTR(-ENOMEM);
-
 	chp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL);
 	if (!chp->wr_waitp) {
 		ret = -ENOMEM;
@@ -1132,10 +1125,11 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
 		mm2->len = PAGE_SIZE;
 		insert_mmap(ucontext, mm2);
 	}
+
 	pr_debug("cqid 0x%0x chp %p size %u memsize %zu, dma_addr %pad\n",
 		 chp->cq.cqid, chp, chp->cq.size, chp->cq.memsize,
 		 &chp->cq.dma_addr);
-	return &chp->ibcq;
+	return 0;
 err_free_mm2:
 	kfree(mm2);
 err_free_mm:
@@ -1151,8 +1145,7 @@ err_free_skb:
 err_free_wr_wait:
 	c4iw_put_wr_wait(chp->wr_waitp);
 err_free_chp:
-	kfree(chp);
-	return ERR_PTR(ret);
+	return ret;
 }
 
 int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 45e720288f0f..7d06b0f8d49a 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -993,9 +993,8 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
 struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
 int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
 void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
-struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
-			     const struct ib_cq_init_attr *attr,
-			     struct ib_udata *udata);
+int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		   struct ib_udata *udata);
 int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr,
 		    enum ib_srq_attr_mask srq_attr_mask,
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 2b1f2443b7da..5e59c5708729 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -537,6 +537,7 @@ static const struct ib_device_ops c4iw_dev_ops = {
 	.reg_user_mr = c4iw_reg_user_mr,
 	.req_notify_cq = c4iw_arm_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_cq, c4iw_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_srq, c4iw_srq, ibsrq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext),
 };
diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index 52d894f0ad3e..119f8efec564 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -135,9 +135,8 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd,
 			    struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata);
 void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
-struct ib_cq *efa_create_cq(struct ib_device *ibdev,
-			    const struct ib_cq_init_attr *attr,
-			    struct ib_udata *udata);
+int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata);
 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 			 u64 virt_addr, int access_flags,
 			 struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index b891ee239a67..46861461dd2d 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -224,6 +224,7 @@ static const struct ib_device_ops efa_dev_ops = {
 	.reg_user_mr = efa_reg_mr,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext),
 };
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 42865cf4f149..a9372c9e4b30 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -859,8 +859,6 @@ void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 	efa_destroy_cq_idx(dev, cq->cq_idx);
 	dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size,
 			 DMA_FROM_DEVICE);
-
-	kfree(cq);
 }
 
 static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
@@ -876,17 +874,20 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
 	return 0;
 }
 
-static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries,
-				  int vector, struct ib_ucontext *ibucontext,
-				  struct ib_udata *udata)
+int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata)
 {
+	struct efa_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct efa_ucontext, ibucontext);
 	struct efa_ibv_create_cq_resp resp = {};
 	struct efa_com_create_cq_params params;
 	struct efa_com_create_cq_result result;
+	struct ib_device *ibdev = ibcq->device;
 	struct efa_dev *dev = to_edev(ibdev);
 	struct efa_ibv_create_cq cmd = {};
+	struct efa_cq *cq = to_ecq(ibcq);
 	bool cq_entry_inserted = false;
-	struct efa_cq *cq;
+	int entries = attr->cqe;
 	int err;
 
 	ibdev_dbg(ibdev, "create_cq entries %d\n", entries);
@@ -944,19 +945,13 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries,
 		goto err_out;
 	}
 
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-
-	cq->ucontext = to_eucontext(ibucontext);
+	cq->ucontext = ucontext;
 	cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs);
 	cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size,
 					 DMA_FROM_DEVICE);
 	if (!cq->cpu_addr) {
 		err = -ENOMEM;
-		goto err_free_cq;
+		goto err_out;
 	}
 
 	params.uarn = cq->ucontext->uarn;
@@ -975,8 +970,8 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries,
 
 	err = cq_mmap_entries_setup(dev, cq, &resp);
 	if (err) {
-		ibdev_dbg(ibdev,
-			  "Could not setup cq[%u] mmap entries\n", cq->cq_idx);
+		ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n",
+			  cq->cq_idx);
 		goto err_destroy_cq;
 	}
 
@@ -992,11 +987,10 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries,
 		}
 	}
 
-	ibdev_dbg(ibdev,
-		  "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n",
+	ibdev_dbg(ibdev, "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n",
 		  cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr);
 
-	return &cq->ibcq;
+	return 0;
 
 err_destroy_cq:
 	efa_destroy_cq_idx(dev, cq->cq_idx);
@@ -1005,23 +999,9 @@ err_free_mapped:
 			 DMA_FROM_DEVICE);
 	if (!cq_entry_inserted)
 		free_pages_exact(cq->cpu_addr, cq->size);
-err_free_cq:
-	kfree(cq);
 err_out:
 	atomic64_inc(&dev->stats.sw_stats.create_cq_err);
-	return ERR_PTR(err);
-}
-
-struct ib_cq *efa_create_cq(struct ib_device *ibdev,
-			    const struct ib_cq_init_attr *attr,
-			    struct ib_udata *udata)
-{
-	struct efa_ucontext *ucontext = rdma_udata_to_drv_context(udata,
-								  struct efa_ucontext,
-								  ibucontext);
-
-	return do_create_cq(ibdev, attr->cqe, attr->comp_vector,
-			    &ucontext->ibucontext, udata);
+	return err;
 }
 
 static int umem_to_page_list(struct efa_dev *dev,
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 0eb7c16c007b..7e198c9ffbfe 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -299,15 +299,15 @@ static void hns_roce_ib_free_cq_buf(struct hns_roce_dev *hr_dev,
 			  &buf->hr_buf);
 }
 
-struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
-				    const struct ib_cq_init_attr *attr,
-				    struct ib_udata *udata)
+int hns_roce_ib_create_cq(struct ib_cq *ib_cq,
+			  const struct ib_cq_init_attr *attr,
+			  struct ib_udata *udata)
 {
-	struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
+	struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_ib_create_cq ucmd;
 	struct hns_roce_ib_create_cq_resp resp = {};
-	struct hns_roce_cq *hr_cq = NULL;
+	struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
 	struct hns_roce_uar *uar = NULL;
 	int vector = attr->comp_vector;
 	int cq_entries = attr->cqe;
@@ -318,13 +318,9 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
 	if (cq_entries < 1 || cq_entries > hr_dev->caps.max_cqes) {
 		dev_err(dev, "Creat CQ failed. entries=%d, max=%d\n",
 			cq_entries, hr_dev->caps.max_cqes);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	hr_cq = kzalloc(sizeof(*hr_cq), GFP_KERNEL);
-	if (!hr_cq)
-		return ERR_PTR(-ENOMEM);
-
 	if (hr_dev->caps.min_cqes)
 		cq_entries = max(cq_entries, hr_dev->caps.min_cqes);
 
@@ -415,7 +411,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
 			goto err_cqc;
 	}
 
-	return &hr_cq->ib_cq;
+	return 0;
 
 err_cqc:
 	hns_roce_free_cq(hr_dev, hr_cq);
@@ -438,8 +434,7 @@ err_db:
 		hns_roce_free_db(hr_dev, &hr_cq->db);
 
 err_cq:
-	kfree(hr_cq);
-	return ERR_PTR(ret);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(hns_roce_ib_create_cq);
 
@@ -471,8 +466,6 @@ void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 		if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
 			hns_roce_free_db(hr_dev, &hr_cq->db);
 	}
-
-	kfree(hr_cq);
 }
 EXPORT_SYMBOL_GPL(hns_roce_ib_destroy_cq);
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 2f7d9644fd24..303ea7c614a8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -1205,9 +1205,9 @@ void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
 __be32 send_ieth(const struct ib_send_wr *wr);
 int to_hr_qp_type(int qp_type);
 
-struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
-				    const struct ib_cq_init_attr *attr,
-				    struct ib_udata *udata);
+int hns_roce_ib_create_cq(struct ib_cq *ib_cq,
+			  const struct ib_cq_init_attr *attr,
+			  struct ib_udata *udata);
 
 void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
 void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index aa7b67d283af..c899879da222 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -717,7 +717,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 	union ib_gid dgid;
 	u64 subnet_prefix;
 	int attr_mask = 0;
-	int ret = -ENOMEM;
+	int ret;
 	int i, j;
 	u8 queue_en[HNS_ROCE_V1_RESV_QP] = { 0 };
 	u8 phy_port;
@@ -730,10 +730,16 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 	/* Reserved cq for loop qp */
 	cq_init_attr.cqe		= HNS_ROCE_MIN_WQE_NUM * 2;
 	cq_init_attr.comp_vector	= 0;
-	cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL);
-	if (IS_ERR(cq)) {
-		dev_err(dev, "Create cq for reserved loop qp failed!");
+
+	ibdev = &hr_dev->ib_dev;
+	cq = rdma_zalloc_drv_obj(ibdev, ib_cq);
+	if (!cq)
 		return -ENOMEM;
+
+	ret = hns_roce_ib_create_cq(cq, &cq_init_attr, NULL);
+	if (ret) {
+		dev_err(dev, "Create cq for reserved loop qp failed!");
+		goto alloc_cq_failed;
 	}
 	free_mr->mr_free_cq = to_hr_cq(cq);
 	free_mr->mr_free_cq->ib_cq.device		= &hr_dev->ib_dev;
@@ -743,7 +749,6 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 	free_mr->mr_free_cq->ib_cq.cq_context		= NULL;
 	atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0);
 
-	ibdev = &hr_dev->ib_dev;
 	pd = rdma_zalloc_drv_obj(ibdev, ib_pd);
 	if (!pd)
 		goto alloc_mem_failed;
@@ -866,7 +871,8 @@ alloc_pd_failed:
 
 alloc_mem_failed:
 	hns_roce_ib_destroy_cq(cq, NULL);
-
+alloc_cq_failed:
+	kfree(cq);
 	return ret;
 }
 
@@ -894,6 +900,7 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev)
 	}
 
 	hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL);
+	kfree(&free_mr->mr_free_cq->ib_cq);
 	hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd, NULL);
 }
 
@@ -3694,8 +3701,6 @@ static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 		cq_buf_size = (ibcq->cqe + 1) * hr_dev->caps.cq_entry_sz;
 		hns_roce_buf_free(hr_dev, cq_buf_size, &hr_cq->hr_buf.hr_buf);
 	}
-
-	kfree(hr_cq);
 }
 
 static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index f07b2ec86ec2..3e45b119b0eb 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -446,6 +446,7 @@ static const struct ib_device_ops hns_roce_dev_ops = {
 	.reg_user_mr = hns_roce_reg_user_mr,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, hns_roce_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, hns_roce_cq, ib_cq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, hns_roce_ucontext, ibucontext),
 };
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 205053cb5f97..3100b0c31b0a 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -1075,27 +1075,27 @@ static void i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 	cq = &iwcq->sc_cq;
 	i40iw_cq_wq_destroy(iwdev, cq);
 	cq_free_resources(iwdev, iwcq);
-	kfree(iwcq);
 	i40iw_rem_devusecount(iwdev);
 }
 
 /**
  * i40iw_create_cq - create cq
- * @ibdev: device pointer from stack
+ * @ibcq: CQ allocated
  * @attr: attributes for cq
  * @udata: user data
  */
-static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev,
-				     const struct ib_cq_init_attr *attr,
-				     struct ib_udata *udata)
+static int i40iw_create_cq(struct ib_cq *ibcq,
+			   const struct ib_cq_init_attr *attr,
+			   struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	struct i40iw_device *iwdev = to_iwdev(ibdev);
-	struct i40iw_cq *iwcq;
+	struct i40iw_cq *iwcq = to_iwcq(ibcq);
 	struct i40iw_pbl *iwpbl;
 	u32 cq_num = 0;
 	struct i40iw_sc_cq *cq;
 	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_cq_init_info info;
+	struct i40iw_cq_init_info info = {};
 	enum i40iw_status_code status;
 	struct i40iw_cqp_request *cqp_request;
 	struct cqp_commands_info *cqp_info;
@@ -1105,22 +1105,16 @@ static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev,
 	int entries = attr->cqe;
 
 	if (iwdev->closing)
-		return ERR_PTR(-ENODEV);
+		return -ENODEV;
 
 	if (entries > iwdev->max_cqe)
-		return ERR_PTR(-EINVAL);
-
-	iwcq = kzalloc(sizeof(*iwcq), GFP_KERNEL);
-	if (!iwcq)
-		return ERR_PTR(-ENOMEM);
-
-	memset(&info, 0, sizeof(info));
+		return -EINVAL;
 
 	err_code = i40iw_alloc_resource(iwdev, iwdev->allocated_cqs,
 					iwdev->max_cq, &cq_num,
 					&iwdev->next_cq);
 	if (err_code)
-		goto error;
+		return err_code;
 
 	cq = &iwcq->sc_cq;
 	cq->back_cq = (void *)iwcq;
@@ -1227,15 +1221,13 @@ static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev,
 	}
 
 	i40iw_add_devusecount(iwdev);
-	return (struct ib_cq *)iwcq;
+	return 0;
 
 cq_destroy:
 	i40iw_cq_wq_destroy(iwdev, cq);
 cq_free_resources:
 	cq_free_resources(iwdev, iwcq);
-error:
-	kfree(iwcq);
-	return ERR_PTR(err_code);
+	return err_code;
 }
 
 /**
@@ -2693,6 +2685,7 @@ static const struct ib_device_ops i40iw_dev_ops = {
 	.reg_user_mr = i40iw_reg_user_mr,
 	.req_notify_cq = i40iw_req_notify_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_cq, i40iw_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, i40iw_ucontext, ibucontext),
 };
 
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 8eb7490dabb8..72f238ddafb5 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -172,14 +172,14 @@ err_buf:
 }
 
 #define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION
-struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_udata *udata)
+int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
 	int vector = attr->comp_vector;
 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
-	struct mlx4_ib_cq *cq;
+	struct mlx4_ib_cq *cq = to_mcq(ibcq);
 	struct mlx4_uar *uar;
 	void *buf_addr;
 	int err;
@@ -187,14 +187,10 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
 		udata, struct mlx4_ib_ucontext, ibucontext);
 
 	if (entries < 1 || entries > dev->dev->caps.max_cqes)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED)
-		return ERR_PTR(-EINVAL);
-
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
+		return -EINVAL;
 
 	entries      = roundup_pow_of_two(entries + 1);
 	cq->ibcq.cqe = entries - 1;
@@ -269,7 +265,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
 			goto err_cq_free;
 		}
 
-	return &cq->ibcq;
+	return 0;
 
 err_cq_free:
 	mlx4_cq_free(dev->dev, &cq->mcq);
@@ -289,11 +285,8 @@ err_mtt:
 err_db:
 	if (!udata)
 		mlx4_db_free(dev->dev, &cq->db);
-
 err_cq:
-	kfree(cq);
-
-	return ERR_PTR(err);
+	return err;
 }
 
 static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
@@ -506,8 +499,6 @@ void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 		mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe);
 		mlx4_db_free(dev->dev, &mcq->db);
 	}
-
-	kfree(mcq);
 }
 
 static void dump_cqe(void *cqe)
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 5d7a87842291..8790101facb7 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2565,6 +2565,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = {
 	.resize_cq = mlx4_ib_resize_cq,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, mlx4_ib_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, mlx4_ib_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_srq, mlx4_ib_srq, ibsrq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx4_ib_ucontext, ibucontext),
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index af5ee45a9f19..81b3d85e5167 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -743,9 +743,8 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		      unsigned int *sg_offset);
 int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
-struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_udata *udata);
+int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata);
 void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index ebd01bd7f8f6..07b73df0e1a3 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -884,14 +884,14 @@ static void notify_soft_wc_handler(struct work_struct *work)
 	cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
 }
 
-struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_udata *udata)
+int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
 	int vector = attr->comp_vector;
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	struct mlx5_ib_cq *cq;
+	struct mlx5_ib_cq *cq = to_mcq(ibcq);
 	int uninitialized_var(index);
 	int uninitialized_var(inlen);
 	u32 *cqb = NULL;
@@ -903,18 +903,14 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 
 	if (entries < 0 ||
 	    (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (check_cq_create_flags(attr->flags))
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	entries = roundup_pow_of_two(entries + 1);
 	if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))
-		return ERR_PTR(-EINVAL);
-
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
+		return -EINVAL;
 
 	cq->ibcq.cqe = entries - 1;
 	mutex_init(&cq->resize_mutex);
@@ -929,13 +925,13 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 		err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size,
 				     &index, &inlen);
 		if (err)
-			goto err_create;
+			return err;
 	} else {
 		cqe_size = cache_line_size() == 128 ? 128 : 64;
 		err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb,
 				       &index, &inlen);
 		if (err)
-			goto err_create;
+			return err;
 
 		INIT_WORK(&cq->notify_work, notify_soft_wc_handler);
 	}
@@ -980,7 +976,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 
 
 	kvfree(cqb);
-	return &cq->ibcq;
+	return 0;
 
 err_cmd:
 	mlx5_core_destroy_cq(dev->mdev, &cq->mcq);
@@ -991,11 +987,7 @@ err_cqb:
 		destroy_cq_user(cq, udata);
 	else
 		destroy_cq_kernel(dev, cq);
-
-err_create:
-	kfree(cq);
-
-	return ERR_PTR(err);
+	return err;
 }
 
 void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
@@ -1008,8 +1000,6 @@ void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 		destroy_cq_user(mcq, udata);
 	else
 		destroy_cq_kernel(dev, mcq);
-
-	kfree(mcq);
 }
 
 static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 1e3d936ed809..99eb4a8b0b0d 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4891,18 +4891,19 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
 	if (ret)
 		goto error0;
 
-	devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL);
-	if (IS_ERR(devr->c0)) {
-		ret = PTR_ERR(devr->c0);
+	devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq);
+	if (!devr->c0) {
+		ret = -ENOMEM;
 		goto error1;
 	}
-	devr->c0->device        = &dev->ib_dev;
-	devr->c0->uobject       = NULL;
-	devr->c0->comp_handler  = NULL;
-	devr->c0->event_handler = NULL;
-	devr->c0->cq_context    = NULL;
+
+	devr->c0->device = &dev->ib_dev;
 	atomic_set(&devr->c0->usecnt, 0);
 
+	ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL);
+	if (ret)
+		goto err_create_cq;
+
 	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
 	if (IS_ERR(devr->x0)) {
 		ret = PTR_ERR(devr->x0);
@@ -4994,6 +4995,8 @@ error3:
 	mlx5_ib_dealloc_xrcd(devr->x0, NULL);
 error2:
 	mlx5_ib_destroy_cq(devr->c0, NULL);
+err_create_cq:
+	kfree(devr->c0);
 error1:
 	mlx5_ib_dealloc_pd(devr->p0, NULL);
 error0:
@@ -5012,6 +5015,7 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr)
 	mlx5_ib_dealloc_xrcd(devr->x0, NULL);
 	mlx5_ib_dealloc_xrcd(devr->x1, NULL);
 	mlx5_ib_destroy_cq(devr->c0, NULL);
+	kfree(devr->c0);
 	mlx5_ib_dealloc_pd(devr->p0, NULL);
 	kfree(devr->p0);
 
@@ -6182,6 +6186,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
 	.resize_cq = mlx5_ib_resize_cq,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index bf697c5b3e79..f2ad0372d38d 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1115,9 +1115,8 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
 			     int buflen, size_t *bc);
 int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
 			      void *buffer, int buflen, size_t *bc);
-struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_udata *udata);
+int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata);
 void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 81fc04e1c142..efd4e3d13ae2 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -601,10 +601,11 @@ static int mthca_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 	return 0;
 }
 
-static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
-				     const struct ib_cq_init_attr *attr,
-				     struct ib_udata *udata)
+static int mthca_create_cq(struct ib_cq *ibcq,
+			   const struct ib_cq_init_attr *attr,
+			   struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
 	struct mthca_create_cq ucmd;
 	struct mthca_cq *cq;
@@ -614,20 +615,20 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
 		udata, struct mthca_ucontext, ibucontext);
 
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (udata) {
-		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
-			return ERR_PTR(-EFAULT);
+		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+			return -EFAULT;
 
 		err = mthca_map_user_db(to_mdev(ibdev), &context->uar,
 					context->db_tab, ucmd.set_db_index,
 					ucmd.set_db_page);
 		if (err)
-			return ERR_PTR(err);
+			return err;
 
 		err = mthca_map_user_db(to_mdev(ibdev), &context->uar,
 					context->db_tab, ucmd.arm_db_index,
@@ -636,11 +637,7 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
 			goto err_unmap_set;
 	}
 
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq) {
-		err = -ENOMEM;
-		goto err_unmap_arm;
-	}
+	cq = to_mcq(ibcq);
 
 	if (udata) {
 		cq->buf.mr.ibmr.lkey = ucmd.lkey;
@@ -655,20 +652,17 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
 			    udata ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num,
 			    cq);
 	if (err)
-		goto err_free;
+		goto err_unmap_arm;
 
 	if (udata && ib_copy_to_udata(udata, &cq->cqn, sizeof(__u32))) {
 		mthca_free_cq(to_mdev(ibdev), cq);
 		err = -EFAULT;
-		goto err_free;
+		goto err_unmap_arm;
 	}
 
 	cq->resize_buf = NULL;
 
-	return &cq->ibcq;
-
-err_free:
-	kfree(cq);
+	return 0;
 
 err_unmap_arm:
 	if (udata)
@@ -680,7 +674,7 @@ err_unmap_set:
 		mthca_unmap_user_db(to_mdev(ibdev), &context->uar,
 				    context->db_tab, ucmd.set_db_index);
 
-	return ERR_PTR(err);
+	return err;
 }
 
 static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq,
@@ -823,7 +817,6 @@ static void mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 				    to_mcq(cq)->set_ci_db_index);
 	}
 	mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
-	kfree(cq);
 }
 
 static inline u32 convert_access(int acc)
@@ -1187,6 +1180,7 @@ static const struct ib_device_ops mthca_dev_ops = {
 	.resize_cq = mthca_resize_cq,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, mthca_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, mthca_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, mthca_ucontext, ibucontext),
 };
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index cac3fa624c4d..0420203820f6 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -1374,16 +1374,17 @@ static int nes_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 /**
  * nes_create_cq
  */
-static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
+static int nes_create_cq(struct ib_cq *ibcq,
 				   const struct ib_cq_init_attr *attr,
 				   struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
 	u64 u64temp;
 	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
 	struct nes_device *nesdev = nesvnic->nesdev;
 	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_cq *nescq;
+	struct nes_cq *nescq = to_nescq(ibcq);
 	struct nes_ucontext *nes_ucontext = NULL;
 	struct nes_cqp_request *cqp_request;
 	void *mem = NULL;
@@ -1399,22 +1400,15 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 	int ret;
 
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (entries > nesadapter->max_cqe)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs,
 			nesadapter->max_cq, &cq_num, &nesadapter->next_cq, NES_RESOURCE_CQ);
-	if (err) {
-		return ERR_PTR(err);
-	}
-
-	nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL);
-	if (!nescq) {
-		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (err)
+		return err;
 
 	nescq->hw_cq.cq_size = max(entries + 1, 5);
 	nescq->hw_cq.cq_number = cq_num;
@@ -1424,10 +1418,10 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 		struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context(
 			udata, struct nes_ucontext, ibucontext);
 
-		if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) {
+		if (ib_copy_from_udata(&req, udata,
+				       sizeof(struct nes_create_cq_req))) {
 			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-			kfree(nescq);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 		nesvnic->mcrq_ucontext = nes_ucontext;
 		nes_ucontext->mcrqf = req.mcrqf;
@@ -1441,8 +1435,6 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 			nescq->mcrqf = nes_ucontext->mcrqf;
 			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
 		}
-		nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n",
-				(unsigned long)req.user_cq_buffer, entries);
 		err = 1;
 		list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) {
 			if (nespbl->user_base == (unsigned long )req.user_cq_buffer) {
@@ -1455,8 +1447,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 		}
 		if (err) {
 			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-			kfree(nescq);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 
 		pbl_entries = nespbl->pbl_size >> 3;
@@ -1472,15 +1463,11 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 		if (!mem) {
 			printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n");
 			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-			kfree(nescq);
-			return ERR_PTR(-ENOMEM);
+			return -ENOMEM;
 		}
 
 		nescq->hw_cq.cq_vbase = mem;
 		nescq->hw_cq.cq_head = 0;
-		nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n",
-				nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase,
-				(u32)nescq->hw_cq.cq_pbase);
 	}
 
 	nescq->hw_cq.ce_handler = nes_iwarp_ce_handler;
@@ -1500,8 +1487,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 		}
 
 		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-		kfree(nescq);
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	}
 	cqp_request->waiting = 1;
 	cqp_wqe = &cqp_request->cqp_wqe;
@@ -1528,8 +1514,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 					kfree(nespbl);
 				}
 				nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-				kfree(nescq);
-				return ERR_PTR(-ENOMEM);
+				return -ENOMEM;
 			} else {
 				opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK);
 				nescq->virtual_cq = 2;
@@ -1550,8 +1535,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 					kfree(nespbl);
 				}
 				nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-				kfree(nescq);
-				return ERR_PTR(-ENOMEM);
+				return -ENOMEM;
 			} else {
 				opcode |= NES_CQP_CQ_VIRT;
 				nescq->virtual_cq = 1;
@@ -1607,8 +1591,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 			kfree(nespbl);
 		}
 		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-		kfree(nescq);
-		return ERR_PTR(-EIO);
+		return -EIO;
 	}
 	nes_put_cqp_request(nesdev, cqp_request);
 
@@ -1620,17 +1603,16 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 		resp.cq_id = nescq->hw_cq.cq_number;
 		resp.cq_size = nescq->hw_cq.cq_size;
 		resp.mmap_db_index = 0;
-		if (ib_copy_to_udata(udata, &resp, sizeof resp - sizeof resp.reserved)) {
+		if (ib_copy_to_udata(udata, &resp,
+				     sizeof(resp) - sizeof(resp.reserved))) {
 			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-			kfree(nescq);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 	}
 
-	return &nescq->ibcq;
+	return 0;
 }
 
-
 /**
  * nes_destroy_cq
  */
@@ -1700,7 +1682,6 @@ static void nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 	if (nescq->cq_mem_size)
 		pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size,
 				    nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase);
-	kfree(nescq);
 }
 
 /**
@@ -3584,6 +3565,7 @@ static const struct ib_device_ops nes_dev_ops = {
 	.reg_user_mr = nes_reg_user_mr,
 	.req_notify_cq = nes_req_notify_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, nes_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_cq, nes_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, nes_ucontext, ibucontext),
 };
 
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index b326313d413f..c15cfc6cef81 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -182,6 +182,7 @@ static const struct ib_device_ops ocrdma_dev_ops = {
 	.resize_cq = ocrdma_resize_cq,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, ocrdma_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, ocrdma_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, ocrdma_ucontext, ibucontext),
 };
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 94e4f7f9b1f7..10b35edb286b 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -977,12 +977,12 @@ err:
 	return status;
 }
 
-struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
-			       const struct ib_cq_init_attr *attr,
-			       struct ib_udata *udata)
+int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		     struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
-	struct ocrdma_cq *cq;
+	struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
 	struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context(
 		udata, struct ocrdma_ucontext, ibucontext);
@@ -991,16 +991,13 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
 	struct ocrdma_create_cq_ureq ureq;
 
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (udata) {
 		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 	} else
 		ureq.dpp_cq = 0;
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&cq->cq_lock);
 	spin_lock_init(&cq->comp_handler_lock);
@@ -1011,10 +1008,9 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
 		pd_id = uctx->cntxt_pd->id;
 
 	status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id);
-	if (status) {
-		kfree(cq);
-		return ERR_PTR(status);
-	}
+	if (status)
+		return status;
+
 	if (udata) {
 		status = ocrdma_copy_cq_uresp(dev, cq, udata);
 		if (status)
@@ -1022,12 +1018,11 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
 	}
 	cq->phase = OCRDMA_CQE_VALID;
 	dev->cq_tbl[cq->id] = cq;
-	return &cq->ibcq;
+	return 0;
 
 ctx_err:
 	ocrdma_mbx_destroy_cq(dev, cq);
-	kfree(cq);
-	return ERR_PTR(status);
+	return status;
 }
 
 int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt,
@@ -1095,8 +1090,6 @@ void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 				ocrdma_get_db_addr(dev, pdid),
 				dev->nic_info.db_page_size);
 	}
-
-	kfree(cq);
 }
 
 static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index 89cebe05669e..32488da1b752 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -71,9 +71,8 @@ int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
 int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 void ocrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
-struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
-			       const struct ib_cq_init_attr *attr,
-			       struct ib_udata *udata);
+int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		     struct ib_udata *udata);
 int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
 void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index a0bb07ba0f3c..a0a7ba0a5af4 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -224,6 +224,7 @@ static const struct ib_device_ops qedr_dev_ops = {
 	.resize_cq = qedr_resize_cq,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, qedr_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, qedr_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_srq, qedr_srq, ibsrq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, qedr_ucontext, ibucontext),
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index be29bbbc4b14..3fc7a4e901c3 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -806,20 +806,20 @@ int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
 	return 0;
 }
 
-struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
-			     const struct ib_cq_init_attr *attr,
-			     struct ib_udata *udata)
+int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		   struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	struct qedr_ucontext *ctx = rdma_udata_to_drv_context(
 		udata, struct qedr_ucontext, ibucontext);
 	struct qed_rdma_destroy_cq_out_params destroy_oparams;
 	struct qed_rdma_destroy_cq_in_params destroy_iparams;
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
 	struct qed_rdma_create_cq_in_params params;
-	struct qedr_create_cq_ureq ureq;
+	struct qedr_create_cq_ureq ureq = {};
 	int vector = attr->comp_vector;
 	int entries = attr->cqe;
-	struct qedr_cq *cq;
+	struct qedr_cq *cq = get_qedr_cq(ibcq);
 	int chain_entries;
 	int page_cnt;
 	u64 pbl_ptr;
@@ -834,18 +834,13 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
 		DP_ERR(dev,
 		       "create cq: the number of entries %d is too high. Must be equal or below %d.\n",
 		       entries, QEDR_MAX_CQES);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	chain_entries = qedr_align_cq_entries(entries);
 	chain_entries = min_t(int, chain_entries, QEDR_MAX_CQES);
 
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
-
 	if (udata) {
-		memset(&ureq, 0, sizeof(ureq));
 		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
 			DP_ERR(dev,
 			       "create cq: problem copying data from user space\n");
@@ -923,7 +918,7 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
 		 "create cq: icid=0x%0x, addr=%p, size(entries)=0x%0x\n",
 		 cq->icid, cq, params.cq_size);
 
-	return &cq->ibcq;
+	return 0;
 
 err3:
 	destroy_iparams.icid = cq->icid;
@@ -938,8 +933,7 @@ err1:
 	if (udata)
 		ib_umem_release(cq->q.umem);
 err0:
-	kfree(cq);
-	return ERR_PTR(-EINVAL);
+	return -EINVAL;
 }
 
 int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata)
@@ -969,7 +963,7 @@ void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 
 	/* GSIs CQs are handled by driver, so they don't exist in the FW */
 	if (cq->cq_type == QEDR_CQ_TYPE_GSI)
-		goto done;
+		return;
 
 	iparams.icid = cq->icid;
 	dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams);
@@ -1008,10 +1002,6 @@ void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 	 * Since the destroy CQ ramrod has also been received on the EQ we can
 	 * be certain that there's no event handler in process.
 	 */
-done:
-	cq->sig = ~cq->sig;
-
-	kfree(cq);
 }
 
 static inline int get_gid_info_from_table(struct ib_qp *ibqp,
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 32d7ce77e339..9aaa90283d6e 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -50,9 +50,8 @@ int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
 int qedr_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 void qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
-struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
-			     const struct ib_cq_init_attr *attr,
-			     struct ib_udata *udata);
+int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		   struct ib_udata *udata);
 int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
 void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h
index 525bf272671e..84dd682d2334 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib.h
@@ -61,6 +61,10 @@ struct usnic_ib_pd {
 	struct usnic_uiom_pd		*umem_pd;
 };
 
+struct usnic_ib_cq {
+	struct ib_cq			ibcq;
+};
+
 struct usnic_ib_mr {
 	struct ib_mr			ibmr;
 	struct usnic_uiom_reg		*umem;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index e701322dc740..6ae5ce007fed 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -354,6 +354,7 @@ static const struct ib_device_ops usnic_dev_ops = {
 	.query_qp = usnic_ib_query_qp,
 	.reg_user_mr = usnic_ib_reg_mr,
 	INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_cq, usnic_ib_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, usnic_ib_ucontext, ibucontext),
 };
 
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index 5686d14b86fe..eeb07b245ef9 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -587,26 +587,18 @@ out_unlock:
 	return status;
 }
 
-struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev,
-				 const struct ib_cq_init_attr *attr,
-				 struct ib_udata *udata)
+int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		       struct ib_udata *udata)
 {
-	struct ib_cq *cq;
-
-	usnic_dbg("\n");
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
-
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-EBUSY);
+		return -EINVAL;
 
-	return cq;
+	return 0;
 }
 
 void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
-	kfree(cq);
+	return;
 }
 
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
index 0b9d993433a7..2aedf78c13cf 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -58,9 +58,8 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
 int usnic_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 				int attr_mask, struct ib_udata *udata);
-struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev,
-				 const struct ib_cq_init_attr *attr,
-				 struct ib_udata *udata);
+int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		       struct ib_udata *udata);
 void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
 				u64 virt_addr, int access_flags,
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
index 0682781f6555..38573fc0a9bf 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
@@ -92,20 +92,19 @@ int pvrdma_req_notify_cq(struct ib_cq *ibcq,
 
 /**
  * pvrdma_create_cq - create completion queue
- * @ibdev: the device
+ * @ibcq: Allocated CQ
  * @attr: completion queue attributes
  * @udata: user data
  *
- * @return: ib_cq completion queue pointer on success,
- *          otherwise returns negative errno.
+ * @return: 0 on success
  */
-struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
-			       const struct ib_cq_init_attr *attr,
-			       struct ib_udata *udata)
+int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		     struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
 	struct pvrdma_dev *dev = to_vdev(ibdev);
-	struct pvrdma_cq *cq;
+	struct pvrdma_cq *cq = to_vcq(ibcq);
 	int ret;
 	int npages;
 	unsigned long flags;
@@ -113,7 +112,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
 	union pvrdma_cmd_resp rsp;
 	struct pvrdma_cmd_create_cq *cmd = &req.create_cq;
 	struct pvrdma_cmd_create_cq_resp *resp = &rsp.create_cq_resp;
-	struct pvrdma_create_cq_resp cq_resp = {0};
+	struct pvrdma_create_cq_resp cq_resp = {};
 	struct pvrdma_create_cq ucmd;
 	struct pvrdma_ucontext *context = rdma_udata_to_drv_context(
 		udata, struct pvrdma_ucontext, ibucontext);
@@ -122,16 +121,10 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
 
 	entries = roundup_pow_of_two(entries);
 	if (entries < 1 || entries > dev->dsr->caps.max_cqe)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (!atomic_add_unless(&dev->num_cqs, 1, dev->dsr->caps.max_cq))
-		return ERR_PTR(-ENOMEM);
-
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq) {
-		atomic_dec(&dev->num_cqs);
-		return ERR_PTR(-ENOMEM);
-	}
+		return -ENOMEM;
 
 	cq->ibcq.cqe = entries;
 	cq->is_kernel = !udata;
@@ -211,11 +204,11 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
 			dev_warn(&dev->pdev->dev,
 				 "failed to copy back udata\n");
 			pvrdma_destroy_cq(&cq->ibcq, udata);
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 		}
 	}
 
-	return &cq->ibcq;
+	return 0;
 
 err_page_dir:
 	pvrdma_page_dir_cleanup(dev, &cq->pdir);
@@ -224,9 +217,7 @@ err_umem:
 		ib_umem_release(cq->umem);
 err_cq:
 	atomic_dec(&dev->num_cqs);
-	kfree(cq);
-
-	return ERR_PTR(ret);
+	return ret;
 }
 
 static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq)
@@ -239,7 +230,6 @@ static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq)
 		ib_umem_release(cq->umem);
 
 	pvrdma_page_dir_cleanup(dev, &cq->pdir);
-	kfree(cq);
 }
 
 /**
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index 0c48464ffff1..e580ae9cc55a 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -182,6 +182,7 @@ static const struct ib_device_ops pvrdma_dev_ops = {
 	.req_notify_cq = pvrdma_req_notify_cq,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, pvrdma_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, pvrdma_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, pvrdma_ucontext, ibucontext),
 };
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
index f0dd6e4d058b..e4a48f5c0c85 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
@@ -409,9 +409,8 @@ struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 			      u32 max_num_sg, struct ib_udata *udata);
 int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 		     int sg_nents, unsigned int *sg_offset);
-struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
-			       const struct ib_cq_init_attr *attr,
-			       struct ib_udata *udata);
+int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		     struct ib_udata *udata);
 void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index 8e76036fad4a..b46714a92b7a 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -166,43 +166,37 @@ static void send_complete(struct work_struct *work)
 
 /**
  * rvt_create_cq - create a completion queue
- * @ibdev: the device this completion queue is attached to
+ * @ibcq: Allocated CQ
  * @attr: creation attributes
  * @udata: user data for libibverbs.so
  *
  * Called by ib_create_cq() in the generic verbs code.
  *
- * Return: pointer to the completion queue or negative errno values
- * for failure.
+ * Return: 0 on success
  */
-struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
-			    const struct ib_cq_init_attr *attr,
-			    struct ib_udata *udata)
+int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
-	struct rvt_cq *cq;
+	struct rvt_cq *cq = container_of(ibcq, struct rvt_cq, ibcq);
 	struct rvt_cq_wc *wc;
-	struct ib_cq *ret;
 	u32 sz;
 	unsigned int entries = attr->cqe;
 	int comp_vector = attr->comp_vector;
+	int err;
 
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (entries < 1 || entries > rdi->dparms.props.max_cqe)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (comp_vector < 0)
 		comp_vector = 0;
 
 	comp_vector = comp_vector % rdi->ibdev.num_comp_vectors;
 
-	/* Allocate the completion queue structure. */
-	cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
-
 	/*
 	 * Allocate the completion queue entries and head/tail pointers.
 	 * This is allocated separately so that it can be resized and
@@ -218,36 +212,29 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
 	wc = udata ?
 		vmalloc_user(sz) :
 		vzalloc_node(sz, rdi->dparms.node);
-	if (!wc) {
-		ret = ERR_PTR(-ENOMEM);
-		goto bail_cq;
-	}
-
+	if (!wc)
+		return -ENOMEM;
 	/*
 	 * Return the address of the WC as the offset to mmap.
 	 * See rvt_mmap() for details.
 	 */
 	if (udata && udata->outlen >= sizeof(__u64)) {
-		int err;
-
 		cq->ip = rvt_create_mmap_info(rdi, sz, udata, wc);
 		if (!cq->ip) {
-			ret = ERR_PTR(-ENOMEM);
+			err = -ENOMEM;
 			goto bail_wc;
 		}
 
 		err = ib_copy_to_udata(udata, &cq->ip->offset,
 				       sizeof(cq->ip->offset));
-		if (err) {
-			ret = ERR_PTR(err);
+		if (err)
 			goto bail_ip;
-		}
 	}
 
 	spin_lock_irq(&rdi->n_cqs_lock);
 	if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) {
 		spin_unlock_irq(&rdi->n_cqs_lock);
-		ret = ERR_PTR(-ENOMEM);
+		err = -ENOMEM;
 		goto bail_ip;
 	}
 
@@ -279,19 +266,14 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
 	INIT_WORK(&cq->comptask, send_complete);
 	cq->queue = wc;
 
-	ret = &cq->ibcq;
-
 	trace_rvt_create_cq(cq, attr);
-	goto done;
+	return 0;
 
 bail_ip:
 	kfree(cq->ip);
 bail_wc:
 	vfree(wc);
-bail_cq:
-	kfree(cq);
-done:
-	return ret;
+	return err;
 }
 
 /**
@@ -314,7 +296,6 @@ void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 		kref_put(&cq->ip->ref, rvt_release_mmap_info);
 	else
 		vfree(cq->queue);
-	kfree(cq);
 }
 
 /**
diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h
index 495d8c3e6580..5e26a2eb19a4 100644
--- a/drivers/infiniband/sw/rdmavt/cq.h
+++ b/drivers/infiniband/sw/rdmavt/cq.h
@@ -51,9 +51,8 @@
 #include <rdma/rdma_vt.h>
 #include <rdma/rdmavt_cq.h>
 
-struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
-			    const struct ib_cq_init_attr *attr,
-			    struct ib_udata *udata);
+int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata);
 void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
 int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 639ef8ac5400..18da1e1ea979 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -429,6 +429,7 @@ static const struct ib_device_ops rvt_dev_ops = {
 	.unmap_fmr = rvt_unmap_fmr,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, rvt_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, rvt_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_srq, rvt_srq, ibsrq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext),
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 56cf18af016a..fbcbac52290b 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -72,6 +72,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
 	[RXE_TYPE_CQ] = {
 		.name		= "rxe-cq",
 		.size		= sizeof(struct rxe_cq),
+		.flags          = RXE_POOL_NO_ALLOC,
 		.cleanup	= rxe_cq_cleanup,
 	},
 	[RXE_TYPE_MR] = {
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index b14881decbee..4ebdfcf4d33e 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -778,45 +778,34 @@ err1:
 	return err;
 }
 
-static struct ib_cq *rxe_create_cq(struct ib_device *dev,
-				   const struct ib_cq_init_attr *attr,
-				   struct ib_udata *udata)
+static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+			 struct ib_udata *udata)
 {
 	int err;
+	struct ib_device *dev = ibcq->device;
 	struct rxe_dev *rxe = to_rdev(dev);
-	struct rxe_cq *cq;
+	struct rxe_cq *cq = to_rcq(ibcq);
 	struct rxe_create_cq_resp __user *uresp = NULL;
 
 	if (udata) {
 		if (udata->outlen < sizeof(*uresp))
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 		uresp = udata->outbuf;
 	}
 
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector);
 	if (err)
-		goto err1;
-
-	cq = rxe_alloc(&rxe->cq_pool);
-	if (!cq) {
-		err = -ENOMEM;
-		goto err1;
-	}
+		return err;
 
 	err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata,
 			       uresp);
 	if (err)
-		goto err2;
-
-	return &cq->ibcq;
+		return err;
 
-err2:
-	rxe_drop_ref(cq);
-err1:
-	return ERR_PTR(err);
+	return rxe_add_to_pool(&rxe->cq_pool, &cq->pelem);
 }
 
 static void rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
@@ -1160,6 +1149,7 @@ static const struct ib_device_ops rxe_dev_ops = {
 	.resize_cq = rxe_resize_cq,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index e8be7f44e3be..6c997d39a418 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -85,8 +85,8 @@ struct rxe_cqe {
 };
 
 struct rxe_cq {
-	struct rxe_pool_entry	pelem;
 	struct ib_cq		ibcq;
+	struct rxe_pool_entry	pelem;
 	struct rxe_queue	*queue;
 	spinlock_t		cq_lock;
 	u8			notify;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index bc1d94c9c9ba..f357e03a85a6 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2458,9 +2458,8 @@ struct ib_device_ops {
 	int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
 			int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
 	int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata);
-	struct ib_cq *(*create_cq)(struct ib_device *device,
-				   const struct ib_cq_init_attr *attr,
-				   struct ib_udata *udata);
+	int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr,
+			 struct ib_udata *udata);
 	int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 	void (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
 	int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata);
@@ -2601,6 +2600,7 @@ struct ib_device_ops {
 	int (*iw_destroy_listen)(struct iw_cm_id *cm_id);
 
 	DECLARE_RDMA_OBJ_SIZE(ib_ah);
+	DECLARE_RDMA_OBJ_SIZE(ib_cq);
 	DECLARE_RDMA_OBJ_SIZE(ib_pd);
 	DECLARE_RDMA_OBJ_SIZE(ib_srq);
 	DECLARE_RDMA_OBJ_SIZE(ib_ucontext);
-- 
cgit v1.2.3


From 8ad2b4b371bcfe7069ee28ad18d722e1240a0a97 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Tue, 11 Jun 2019 21:45:33 -0700
Subject: dt-bindings: soc: qcom: Add AOSS QMP binding

Add binding for the QMP based side-channel communication mechanism to
the AOSS, which is used to control resources not exposed through the
RPMh interface.

Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 .../devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt | 81 ++++++++++++++++++++++
 include/dt-bindings/power/qcom-aoss-qmp.h          | 14 ++++
 2 files changed, 95 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt
 create mode 100644 include/dt-bindings/power/qcom-aoss-qmp.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt
new file mode 100644
index 000000000000..954ffee0a9c4
--- /dev/null
+++ b/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt
@@ -0,0 +1,81 @@
+Qualcomm Always-On Subsystem side channel binding
+
+This binding describes the hardware component responsible for side channel
+requests to the always-on subsystem (AOSS), used for certain power management
+requests that is not handled by the standard RPMh interface. Each client in the
+SoC has it's own block of message RAM and IRQ for communication with the AOSS.
+The protocol used to communicate in the message RAM is known as Qualcomm
+Messaging Protocol (QMP)
+
+The AOSS side channel exposes control over a set of resources, used to control
+a set of debug related clocks and to affect the low power state of resources
+related to the secondary subsystems. These resources are exposed as a set of
+power-domains.
+
+- compatible:
+	Usage: required
+	Value type: <string>
+	Definition: must be "qcom,sdm845-aoss-qmp"
+
+- reg:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: the base address and size of the message RAM for this
+		    client's communication with the AOSS
+
+- interrupts:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: should specify the AOSS message IRQ for this client
+
+- mboxes:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: reference to the mailbox representing the outgoing doorbell
+		    in APCS for this client, as described in mailbox/mailbox.txt
+
+- #clock-cells:
+	Usage: optional
+	Value type: <u32>
+	Definition: must be 0
+		    The single clock represents the QDSS clock.
+
+- #power-domain-cells:
+	Usage: optional
+	Value type: <u32>
+	Definition: must be 1
+		    The provided power-domains are:
+		    CDSP state (0), LPASS state (1), modem state (2), SLPI
+		    state (3), SPSS state (4) and Venus state (5).
+
+= SUBNODES
+The AOSS side channel also provides the controls for three cooling devices,
+these are expressed as subnodes of the QMP node. The name of the node is used
+to identify the resource and must therefor be "cx", "mx" or "ebi".
+
+- #cooling-cells:
+	Usage: optional
+	Value type: <u32>
+	Definition: must be 2
+
+= EXAMPLE
+
+The following example represents the AOSS side-channel message RAM and the
+mechanism exposing the power-domains, as found in SDM845.
+
+  aoss_qmp: qmp@c300000 {
+	  compatible = "qcom,sdm845-aoss-qmp";
+	  reg = <0x0c300000 0x100000>;
+	  interrupts = <GIC_SPI 389 IRQ_TYPE_EDGE_RISING>;
+	  mboxes = <&apss_shared 0>;
+
+	  #power-domain-cells = <1>;
+
+	  cx_cdev: cx {
+		#cooling-cells = <2>;
+	  };
+
+	  mx_cdev: mx {
+		#cooling-cells = <2>;
+	  };
+  };
diff --git a/include/dt-bindings/power/qcom-aoss-qmp.h b/include/dt-bindings/power/qcom-aoss-qmp.h
new file mode 100644
index 000000000000..ec336d31dee4
--- /dev/null
+++ b/include/dt-bindings/power/qcom-aoss-qmp.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, Linaro Ltd. */
+
+#ifndef __DT_BINDINGS_POWER_QCOM_AOSS_QMP_H
+#define __DT_BINDINGS_POWER_QCOM_AOSS_QMP_H
+
+#define AOSS_QMP_LS_CDSP		0
+#define AOSS_QMP_LS_LPASS	1
+#define AOSS_QMP_LS_MODEM	2
+#define AOSS_QMP_LS_SLPI		3
+#define AOSS_QMP_LS_SPSS		4
+#define AOSS_QMP_LS_VENUS	5
+
+#endif
-- 
cgit v1.2.3


From a3e69b86cf04b7c0b624572c181678ea74e83400 Mon Sep 17 00:00:00 2001
From: Yannick Fertré <yannick.fertre@st.com>
Date: Mon, 27 May 2019 12:21:38 +0200
Subject: drm/bridge/synopsys: dsi: add power on/off optional phy ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add power on & off optional physical operation functions, helpful to
program specific registers of the DSI physical part.

Signed-off-by: Yannick Fertré <yannick.fertre@st.com>
Reviewed-by: Philippe Cornu <philippe.cornu@st.com>
Tested-by: Philippe Cornu <philippe.cornu@st.com>
Reviewed-by: Andrzej Hajda <a.hajda@samsung.com>
Signed-off-by: Andrzej Hajda <a.hajda@samsung.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1558952499-15418-2-git-send-email-yannick.fertre@st.com
---
 drivers/gpu/drm/bridge/synopsys/dw-mipi-dsi.c | 8 ++++++++
 include/drm/bridge/dw_mipi_dsi.h              | 2 ++
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/bridge/synopsys/dw-mipi-dsi.c b/drivers/gpu/drm/bridge/synopsys/dw-mipi-dsi.c
index a79c87bd0147..281c58bab1a1 100644
--- a/drivers/gpu/drm/bridge/synopsys/dw-mipi-dsi.c
+++ b/drivers/gpu/drm/bridge/synopsys/dw-mipi-dsi.c
@@ -778,6 +778,10 @@ static void dw_mipi_dsi_clear_err(struct dw_mipi_dsi *dsi)
 static void dw_mipi_dsi_bridge_post_disable(struct drm_bridge *bridge)
 {
 	struct dw_mipi_dsi *dsi = bridge_to_dsi(bridge);
+	const struct dw_mipi_dsi_phy_ops *phy_ops = dsi->plat_data->phy_ops;
+
+	if (phy_ops->power_off)
+		phy_ops->power_off(dsi->plat_data->priv_data);
 
 	/*
 	 * Switch to command mode before panel-bridge post_disable &
@@ -877,11 +881,15 @@ static void dw_mipi_dsi_bridge_mode_set(struct drm_bridge *bridge,
 static void dw_mipi_dsi_bridge_enable(struct drm_bridge *bridge)
 {
 	struct dw_mipi_dsi *dsi = bridge_to_dsi(bridge);
+	const struct dw_mipi_dsi_phy_ops *phy_ops = dsi->plat_data->phy_ops;
 
 	/* Switch to video mode for panel-bridge enable & panel enable */
 	dw_mipi_dsi_set_mode(dsi, MIPI_DSI_MODE_VIDEO);
 	if (dsi->slave)
 		dw_mipi_dsi_set_mode(dsi->slave, MIPI_DSI_MODE_VIDEO);
+
+	if (phy_ops->power_on)
+		phy_ops->power_on(dsi->plat_data->priv_data);
 }
 
 static enum drm_mode_status
diff --git a/include/drm/bridge/dw_mipi_dsi.h b/include/drm/bridge/dw_mipi_dsi.h
index 92cf9d1e1655..be8cdc2bf67d 100644
--- a/include/drm/bridge/dw_mipi_dsi.h
+++ b/include/drm/bridge/dw_mipi_dsi.h
@@ -22,6 +22,8 @@ struct platform_device;
 
 struct dw_mipi_dsi_phy_ops {
 	int (*init)(void *priv_data);
+	void (*power_on)(void *priv_data);
+	void (*power_off)(void *priv_data);
 	int (*get_lane_mbps)(void *priv_data,
 			     const struct drm_display_mode *mode,
 			     unsigned long mode_flags, u32 lanes, u32 format,
-- 
cgit v1.2.3


From 78b99577b3934e3e787fe0c52aa1b59442c8bbb5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 10 Jun 2019 00:09:53 +0900
Subject: pinctrl: remove unused pin_is_valid()

This function was used by pin_request() to pointlessly double-check
the pin validity, and it was the only user ever.

Since commit d2f6a1c6fb0e ("pinctrl: remove double pin validity
check."), no one has ever used it.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c          | 23 -----------------------
 include/linux/pinctrl/pinctrl.h | 10 ----------
 2 files changed, 33 deletions(-)

(limited to 'include')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 04787eefe2a2..e745788fa36f 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -178,29 +178,6 @@ const char *pin_get_name(struct pinctrl_dev *pctldev, const unsigned pin)
 	return desc->name;
 }
 
-/**
- * pin_is_valid() - check if pin exists on controller
- * @pctldev: the pin control device to check the pin on
- * @pin: pin to check, use the local pin controller index number
- *
- * This tells us whether a certain pin exist on a certain pin controller or
- * not. Pin lists may be sparse, so some pins may not exist.
- */
-bool pin_is_valid(struct pinctrl_dev *pctldev, int pin)
-{
-	struct pin_desc *pindesc;
-
-	if (pin < 0)
-		return false;
-
-	mutex_lock(&pctldev->mutex);
-	pindesc = pin_desc_get(pctldev, pin);
-	mutex_unlock(&pctldev->mutex);
-
-	return pindesc != NULL;
-}
-EXPORT_SYMBOL_GPL(pin_is_valid);
-
 /* Deletes a range of pin descriptors */
 static void pinctrl_free_pindescs(struct pinctrl_dev *pctldev,
 				  const struct pinctrl_pin_desc *pins,
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 2744113f1024..36a79fe7b84f 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -172,7 +172,6 @@ extern struct pinctrl_dev *devm_pinctrl_register(struct device *dev,
 extern void devm_pinctrl_unregister(struct device *dev,
 				struct pinctrl_dev *pctldev);
 
-extern bool pin_is_valid(struct pinctrl_dev *pctldev, int pin);
 extern void pinctrl_add_gpio_range(struct pinctrl_dev *pctldev,
 				struct pinctrl_gpio_range *range);
 extern void pinctrl_add_gpio_ranges(struct pinctrl_dev *pctldev,
@@ -203,15 +202,6 @@ struct pinctrl_dev *of_pinctrl_get(struct device_node *np)
 extern const char *pinctrl_dev_get_name(struct pinctrl_dev *pctldev);
 extern const char *pinctrl_dev_get_devname(struct pinctrl_dev *pctldev);
 extern void *pinctrl_dev_get_drvdata(struct pinctrl_dev *pctldev);
-#else
-
-struct pinctrl_dev;
-
-/* Sufficiently stupid default functions when pinctrl is not in use */
-static inline bool pin_is_valid(struct pinctrl_dev *pctldev, int pin)
-{
-	return pin >= 0;
-}
 
 #endif /* !CONFIG_PINCTRL */
 
-- 
cgit v1.2.3


From 0b673b6486998061b0489b09447ebe8452da0146 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 8 May 2019 11:46:34 -0700
Subject: firmware: arm_scmi: fetch and store sensor scale

In preparation for dealing with scales within the SCMI HWMON driver,
fetch and store the sensor unit scale into the scmi_sensor_info
structure. In order to simplify computations for upper layer, take care
of sign extending the scale to a full 8-bit signed value.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
[sudeep.holla: update bitfield values as per specification]
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/sensors.c | 6 ++++++
 include/linux/scmi_protocol.h       | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c
index c00287b5f2c2..0e94ab56f679 100644
--- a/drivers/firmware/arm_scmi/sensors.c
+++ b/drivers/firmware/arm_scmi/sensors.c
@@ -34,6 +34,8 @@ struct scmi_msg_resp_sensor_description {
 		__le32 attributes_high;
 #define SENSOR_TYPE(x)		((x) & 0xff)
 #define SENSOR_SCALE(x)		(((x) >> 11) & 0x1f)
+#define SENSOR_SCALE_SIGN	BIT(4)
+#define SENSOR_SCALE_EXTEND	GENMASK(7, 5)
 #define SENSOR_UPDATE_SCALE(x)	(((x) >> 22) & 0x1f)
 #define SENSOR_UPDATE_BASE(x)	(((x) >> 27) & 0x1f)
 		    u8 name[SCMI_MAX_STR_SIZE];
@@ -140,6 +142,10 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
 			s = &si->sensors[desc_index + cnt];
 			s->id = le32_to_cpu(buf->desc[cnt].id);
 			s->type = SENSOR_TYPE(attrh);
+			s->scale = SENSOR_SCALE(attrh);
+			/* Sign extend to a full s8 */
+			if (s->scale & SENSOR_SCALE_SIGN)
+				s->scale |= SENSOR_SCALE_EXTEND;
 			strlcpy(s->name, buf->desc[cnt].name, SCMI_MAX_STR_SIZE);
 		}
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 3105055c00a7..9ff2e9357e9a 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -144,6 +144,7 @@ struct scmi_power_ops {
 struct scmi_sensor_info {
 	u32 id;
 	u8 type;
+	s8 scale;
 	char name[SCMI_MAX_STR_SIZE];
 };
 
-- 
cgit v1.2.3


From 81f4458c9c6998fcd37c427d16d76d4dba65d015 Mon Sep 17 00:00:00 2001
From: Tero Kristo <t-kristo@ti.com>
Date: Tue, 28 May 2019 16:10:24 +0300
Subject: firmware: ti_sci: extend clock identifiers from u8 to u32

Future SoCs are going to have more than 255 device clocks in certain cases,
and thus the API must be extended to support this. The support is done in
backwards compatible extension, in which the new u32 clock identifier
fields are only used if the existing u8 size clock identifier is set as
255. In all the other cases, the existing u8 clock identifier is used. As
the size of the messages sent / received is not verified for existing
devices / old firmware, increasing the size of the messages from the end
is also fine. Due to this reason, depending on ABI version isn't necessary
either.

Acked-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 drivers/firmware/ti_sci.c              | 115 ++++++++++++++++++++++++---------
 drivers/firmware/ti_sci.h              |  63 ++++++++++++++----
 include/linux/soc/ti/ti_sci_protocol.h |  28 ++++----
 3 files changed, 150 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c
index ef93406ace1b..b417cef35769 100644
--- a/drivers/firmware/ti_sci.c
+++ b/drivers/firmware/ti_sci.c
@@ -916,7 +916,7 @@ static int ti_sci_cmd_get_device_resets(const struct ti_sci_handle *handle,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_set_clock_state(const struct ti_sci_handle *handle,
-				  u32 dev_id, u8 clk_id,
+				  u32 dev_id, u32 clk_id,
 				  u32 flags, u8 state)
 {
 	struct ti_sci_info *info;
@@ -944,7 +944,12 @@ static int ti_sci_set_clock_state(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_set_clock_state *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 	req->request_state = state;
 
 	ret = ti_sci_do_xfer(info, xfer);
@@ -976,7 +981,7 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_get_clock_state(const struct ti_sci_handle *handle,
-				      u32 dev_id, u8 clk_id,
+				      u32 dev_id, u32 clk_id,
 				      u8 *programmed_state, u8 *current_state)
 {
 	struct ti_sci_info *info;
@@ -1007,7 +1012,12 @@ static int ti_sci_cmd_get_clock_state(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_get_clock_state *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 
 	ret = ti_sci_do_xfer(info, xfer);
 	if (ret) {
@@ -1047,8 +1057,8 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_get_clock(const struct ti_sci_handle *handle, u32 dev_id,
-				u8 clk_id, bool needs_ssc, bool can_change_freq,
-				bool enable_input_term)
+				u32 clk_id, bool needs_ssc,
+				bool can_change_freq, bool enable_input_term)
 {
 	u32 flags = 0;
 
@@ -1073,7 +1083,7 @@ static int ti_sci_cmd_get_clock(const struct ti_sci_handle *handle, u32 dev_id,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_idle_clock(const struct ti_sci_handle *handle,
-				 u32 dev_id, u8 clk_id)
+				 u32 dev_id, u32 clk_id)
 {
 	return ti_sci_set_clock_state(handle, dev_id, clk_id, 0,
 				      MSG_CLOCK_SW_STATE_UNREQ);
@@ -1092,7 +1102,7 @@ static int ti_sci_cmd_idle_clock(const struct ti_sci_handle *handle,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_put_clock(const struct ti_sci_handle *handle,
-				u32 dev_id, u8 clk_id)
+				u32 dev_id, u32 clk_id)
 {
 	return ti_sci_set_clock_state(handle, dev_id, clk_id, 0,
 				      MSG_CLOCK_SW_STATE_AUTO);
@@ -1110,7 +1120,7 @@ static int ti_sci_cmd_put_clock(const struct ti_sci_handle *handle,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_is_auto(const struct ti_sci_handle *handle,
-				  u32 dev_id, u8 clk_id, bool *req_state)
+				  u32 dev_id, u32 clk_id, bool *req_state)
 {
 	u8 state = 0;
 	int ret;
@@ -1139,7 +1149,7 @@ static int ti_sci_cmd_clk_is_auto(const struct ti_sci_handle *handle,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_is_on(const struct ti_sci_handle *handle, u32 dev_id,
-				u8 clk_id, bool *req_state, bool *curr_state)
+				u32 clk_id, bool *req_state, bool *curr_state)
 {
 	u8 c_state = 0, r_state = 0;
 	int ret;
@@ -1172,7 +1182,7 @@ static int ti_sci_cmd_clk_is_on(const struct ti_sci_handle *handle, u32 dev_id,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_is_off(const struct ti_sci_handle *handle, u32 dev_id,
-				 u8 clk_id, bool *req_state, bool *curr_state)
+				 u32 clk_id, bool *req_state, bool *curr_state)
 {
 	u8 c_state = 0, r_state = 0;
 	int ret;
@@ -1204,7 +1214,7 @@ static int ti_sci_cmd_clk_is_off(const struct ti_sci_handle *handle, u32 dev_id,
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_set_parent(const struct ti_sci_handle *handle,
-				     u32 dev_id, u8 clk_id, u8 parent_id)
+				     u32 dev_id, u32 clk_id, u32 parent_id)
 {
 	struct ti_sci_info *info;
 	struct ti_sci_msg_req_set_clock_parent *req;
@@ -1231,8 +1241,18 @@ static int ti_sci_cmd_clk_set_parent(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_set_clock_parent *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
-	req->parent_id = parent_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
+	if (parent_id < 255) {
+		req->parent_id = parent_id;
+	} else {
+		req->parent_id = 255;
+		req->parent_id_32 = parent_id;
+	}
 
 	ret = ti_sci_do_xfer(info, xfer);
 	if (ret) {
@@ -1262,7 +1282,7 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle,
-				     u32 dev_id, u8 clk_id, u8 *parent_id)
+				     u32 dev_id, u32 clk_id, u32 *parent_id)
 {
 	struct ti_sci_info *info;
 	struct ti_sci_msg_req_get_clock_parent *req;
@@ -1289,7 +1309,12 @@ static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_get_clock_parent *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 
 	ret = ti_sci_do_xfer(info, xfer);
 	if (ret) {
@@ -1299,10 +1324,14 @@ static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle,
 
 	resp = (struct ti_sci_msg_resp_get_clock_parent *)xfer->xfer_buf;
 
-	if (!ti_sci_is_response_ack(resp))
+	if (!ti_sci_is_response_ack(resp)) {
 		ret = -ENODEV;
-	else
-		*parent_id = resp->parent_id;
+	} else {
+		if (resp->parent_id < 255)
+			*parent_id = resp->parent_id;
+		else
+			*parent_id = resp->parent_id_32;
+	}
 
 fail:
 	ti_sci_put_one_xfer(&info->minfo, xfer);
@@ -1322,8 +1351,8 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle,
-					  u32 dev_id, u8 clk_id,
-					  u8 *num_parents)
+					  u32 dev_id, u32 clk_id,
+					  u32 *num_parents)
 {
 	struct ti_sci_info *info;
 	struct ti_sci_msg_req_get_clock_num_parents *req;
@@ -1350,7 +1379,12 @@ static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_get_clock_num_parents *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 
 	ret = ti_sci_do_xfer(info, xfer);
 	if (ret) {
@@ -1360,10 +1394,14 @@ static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle,
 
 	resp = (struct ti_sci_msg_resp_get_clock_num_parents *)xfer->xfer_buf;
 
-	if (!ti_sci_is_response_ack(resp))
+	if (!ti_sci_is_response_ack(resp)) {
 		ret = -ENODEV;
-	else
-		*num_parents = resp->num_parents;
+	} else {
+		if (resp->num_parents < 255)
+			*num_parents = resp->num_parents;
+		else
+			*num_parents = resp->num_parents_32;
+	}
 
 fail:
 	ti_sci_put_one_xfer(&info->minfo, xfer);
@@ -1391,7 +1429,7 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_get_match_freq(const struct ti_sci_handle *handle,
-					 u32 dev_id, u8 clk_id, u64 min_freq,
+					 u32 dev_id, u32 clk_id, u64 min_freq,
 					 u64 target_freq, u64 max_freq,
 					 u64 *match_freq)
 {
@@ -1420,7 +1458,12 @@ static int ti_sci_cmd_clk_get_match_freq(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_query_clock_freq *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 	req->min_freq_hz = min_freq;
 	req->target_freq_hz = target_freq;
 	req->max_freq_hz = max_freq;
@@ -1463,7 +1506,7 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_set_freq(const struct ti_sci_handle *handle,
-				   u32 dev_id, u8 clk_id, u64 min_freq,
+				   u32 dev_id, u32 clk_id, u64 min_freq,
 				   u64 target_freq, u64 max_freq)
 {
 	struct ti_sci_info *info;
@@ -1491,7 +1534,12 @@ static int ti_sci_cmd_clk_set_freq(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_set_clock_freq *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 	req->min_freq_hz = min_freq;
 	req->target_freq_hz = target_freq;
 	req->max_freq_hz = max_freq;
@@ -1524,7 +1572,7 @@ fail:
  * Return: 0 if all went well, else returns appropriate error value.
  */
 static int ti_sci_cmd_clk_get_freq(const struct ti_sci_handle *handle,
-				   u32 dev_id, u8 clk_id, u64 *freq)
+				   u32 dev_id, u32 clk_id, u64 *freq)
 {
 	struct ti_sci_info *info;
 	struct ti_sci_msg_req_get_clock_freq *req;
@@ -1551,7 +1599,12 @@ static int ti_sci_cmd_clk_get_freq(const struct ti_sci_handle *handle,
 	}
 	req = (struct ti_sci_msg_req_get_clock_freq *)xfer->xfer_buf;
 	req->dev_id = dev_id;
-	req->clk_id = clk_id;
+	if (clk_id < 255) {
+		req->clk_id = clk_id;
+	} else {
+		req->clk_id = 255;
+		req->clk_id_32 = clk_id;
+	}
 
 	ret = ti_sci_do_xfer(info, xfer);
 	if (ret) {
diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h
index 4983827151bf..ad0b47981b87 100644
--- a/drivers/firmware/ti_sci.h
+++ b/drivers/firmware/ti_sci.h
@@ -202,7 +202,8 @@ struct ti_sci_msg_req_set_device_resets {
  * @dev_id:	Device identifier this request is for
  * @clk_id:	Clock identifier for the device for this request.
  *		Each device has it's own set of clock inputs. This indexes
- *		which clock input to modify.
+ *		which clock input to modify. Set to 255 if clock ID is
+ *		greater than or equal to 255.
  * @request_state: Request the state for the clock to be set to.
  *		MSG_CLOCK_SW_STATE_UNREQ: The IP does not require this clock,
  *		it can be disabled, regardless of the state of the device
@@ -213,6 +214,9 @@ struct ti_sci_msg_req_set_device_resets {
  *		being required by the device.(default)
  *		MSG_CLOCK_SW_STATE_REQ:  Configure the clock to be enabled,
  *		regardless of the state of the device.
+ * @clk_id_32:	Clock identifier for the device for this request.
+ *		Only to be used if the clock ID is greater than or equal to
+ *		255.
  *
  * Normally, all required clocks are managed by TISCI entity, this is used
  * only for specific control *IF* required. Auto managed state is
@@ -234,6 +238,7 @@ struct ti_sci_msg_req_set_clock_state {
 #define MSG_CLOCK_SW_STATE_AUTO		1
 #define MSG_CLOCK_SW_STATE_REQ		2
 	u8 request_state;
+	u32 clk_id_32;
 } __packed;
 
 /**
@@ -242,7 +247,11 @@ struct ti_sci_msg_req_set_clock_state {
  * @dev_id:	Device identifier this request is for
  * @clk_id:	Clock identifier for the device for this request.
  *		Each device has it's own set of clock inputs. This indexes
- *		which clock input to get state of.
+ *		which clock input to get state of. Set to 255 if the clock
+ *		ID is greater than or equal to 255.
+ * @clk_id_32:	Clock identifier for the device for the request.
+ *		Only to be used if the clock ID is greater than or equal to
+ *		255.
  *
  * Request type is TI_SCI_MSG_GET_CLOCK_STATE, response is state
  * of the clock
@@ -251,6 +260,7 @@ struct ti_sci_msg_req_get_clock_state {
 	struct ti_sci_msg_hdr hdr;
 	u32 dev_id;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
@@ -278,9 +288,13 @@ struct ti_sci_msg_resp_get_clock_state {
  * @dev_id:	Device identifier this request is for
  * @clk_id:	Clock identifier for the device for this request.
  *		Each device has it's own set of clock inputs. This indexes
- *		which clock input to modify.
+ *		which clock input to modify. Set to 255 if clock ID is
+ *		greater than or equal to 255.
  * @parent_id:	The new clock parent is selectable by an index via this
- *		parameter.
+ *		parameter. Set to 255 if clock ID is greater than or
+ *		equal to 255.
+ * @clk_id_32:	Clock identifier if @clk_id field is 255.
+ * @parent_id_32:	Parent identifier if @parent_id is 255.
  *
  * Request type is TI_SCI_MSG_SET_CLOCK_PARENT, response is generic
  * ACK / NACK message.
@@ -290,6 +304,8 @@ struct ti_sci_msg_req_set_clock_parent {
 	u32 dev_id;
 	u8 clk_id;
 	u8 parent_id;
+	u32 clk_id_32;
+	u32 parent_id_32;
 } __packed;
 
 /**
@@ -298,7 +314,10 @@ struct ti_sci_msg_req_set_clock_parent {
  * @dev_id:	Device identifier this request is for
  * @clk_id:	Clock identifier for the device for this request.
  *		Each device has it's own set of clock inputs. This indexes
- *		which clock input to get the parent for.
+ *		which clock input to get the parent for. If this field
+ *		contains 255, the actual clock identifier is stored in
+ *		@clk_id_32.
+ * @clk_id_32:	Clock identifier if the @clk_id field contains 255.
  *
  * Request type is TI_SCI_MSG_GET_CLOCK_PARENT, response is parent information
  */
@@ -306,25 +325,32 @@ struct ti_sci_msg_req_get_clock_parent {
 	struct ti_sci_msg_hdr hdr;
 	u32 dev_id;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
  * struct ti_sci_msg_resp_get_clock_parent - Response with clock parent
  * @hdr:	Generic Header
- * @parent_id:	The current clock parent
+ * @parent_id:	The current clock parent. If set to 255, the current parent
+ *		ID can be found from the @parent_id_32 field.
+ * @parent_id_32:	Current clock parent if @parent_id field is set to
+ *			255.
  *
  * Response to TI_SCI_MSG_GET_CLOCK_PARENT.
  */
 struct ti_sci_msg_resp_get_clock_parent {
 	struct ti_sci_msg_hdr hdr;
 	u8 parent_id;
+	u32 parent_id_32;
 } __packed;
 
 /**
  * struct ti_sci_msg_req_get_clock_num_parents - Request to get clock parents
  * @hdr:	Generic header
  * @dev_id:	Device identifier this request is for
- * @clk_id:	Clock identifier for the device for this request.
+ * @clk_id:	Clock identifier for the device for this request. Set to
+ *		255 if clock ID is greater than or equal to 255.
+ * @clk_id_32:	Clock identifier if the @clk_id field contains 255.
  *
  * This request provides information about how many clock parent options
  * are available for a given clock to a device. This is typically used
@@ -337,18 +363,24 @@ struct ti_sci_msg_req_get_clock_num_parents {
 	struct ti_sci_msg_hdr hdr;
 	u32 dev_id;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
  * struct ti_sci_msg_resp_get_clock_num_parents - Response for get clk parents
  * @hdr:		Generic header
- * @num_parents:	Number of clock parents
+ * @num_parents:	Number of clock parents. If set to 255, the actual
+ *			number of parents is stored into @num_parents_32
+ *			field instead.
+ * @num_parents_32:	Number of clock parents if @num_parents field is
+ *			set to 255.
  *
  * Response to TI_SCI_MSG_GET_NUM_CLOCK_PARENTS
  */
 struct ti_sci_msg_resp_get_clock_num_parents {
 	struct ti_sci_msg_hdr hdr;
 	u8 num_parents;
+	u32 num_parents_32;
 } __packed;
 
 /**
@@ -363,7 +395,9 @@ struct ti_sci_msg_resp_get_clock_num_parents {
  * @max_freq_hz: The maximum allowable frequency in Hz. This is the maximum
  *		allowable programmed frequency and does not account for clock
  *		tolerances and jitter.
- * @clk_id:	Clock identifier for the device for this request.
+ * @clk_id:	Clock identifier for the device for this request. Set to
+ *		255 if clock identifier is greater than or equal to 255.
+ * @clk_id_32:	Clock identifier if @clk_id is set to 255.
  *
  * NOTE: Normally clock frequency management is automatically done by TISCI
  * entity. In case of specific requests, TISCI evaluates capability to achieve
@@ -380,6 +414,7 @@ struct ti_sci_msg_req_query_clock_freq {
 	u64 target_freq_hz;
 	u64 max_freq_hz;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
@@ -407,7 +442,9 @@ struct ti_sci_msg_resp_query_clock_freq {
  * @max_freq_hz: The maximum allowable frequency in Hz. This is the maximum
  *		allowable programmed frequency and does not account for clock
  *		tolerances and jitter.
- * @clk_id:	Clock identifier for the device for this request.
+ * @clk_id:	Clock identifier for the device for this request. Set to
+ *		255 if clock ID is greater than or equal to 255.
+ * @clk_id_32:	Clock identifier if @clk_id field is set to 255.
  *
  * NOTE: Normally clock frequency management is automatically done by TISCI
  * entity. In case of specific requests, TISCI evaluates capability to achieve
@@ -436,13 +473,16 @@ struct ti_sci_msg_req_set_clock_freq {
 	u64 target_freq_hz;
 	u64 max_freq_hz;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
  * struct ti_sci_msg_req_get_clock_freq - Request to get the clock frequency
  * @hdr:	Generic Header
  * @dev_id:	Device identifier this request is for
- * @clk_id:	Clock identifier for the device for this request.
+ * @clk_id:	Clock identifier for the device for this request. Set to
+ *		255 if clock ID is greater than or equal to 255.
+ * @clk_id_32:	Clock identifier if @clk_id field is set to 255.
  *
  * NOTE: Normally clock frequency management is automatically done by TISCI
  * entity. In some cases, clock frequencies are configured by host.
@@ -454,6 +494,7 @@ struct ti_sci_msg_req_get_clock_freq {
 	struct ti_sci_msg_hdr hdr;
 	u32 dev_id;
 	u8 clk_id;
+	u32 clk_id_32;
 } __packed;
 
 /**
diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h
index 568722a041bf..406e6717d252 100644
--- a/include/linux/soc/ti/ti_sci_protocol.h
+++ b/include/linux/soc/ti/ti_sci_protocol.h
@@ -166,29 +166,29 @@ struct ti_sci_dev_ops {
  * managed by driver for that purpose.
  */
 struct ti_sci_clk_ops {
-	int (*get_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*get_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 			 bool needs_ssc, bool can_change_freq,
 			 bool enable_input_term);
-	int (*idle_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid);
-	int (*put_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid);
-	int (*is_auto)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*idle_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid);
+	int (*put_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid);
+	int (*is_auto)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 		       bool *req_state);
-	int (*is_on)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*is_on)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 		     bool *req_state, bool *current_state);
-	int (*is_off)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*is_off)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 		      bool *req_state, bool *current_state);
-	int (*set_parent)(const struct ti_sci_handle *handle, u32 did, u8 cid,
-			  u8 parent_id);
-	int (*get_parent)(const struct ti_sci_handle *handle, u32 did, u8 cid,
-			  u8 *parent_id);
+	int (*set_parent)(const struct ti_sci_handle *handle, u32 did, u32 cid,
+			  u32 parent_id);
+	int (*get_parent)(const struct ti_sci_handle *handle, u32 did, u32 cid,
+			  u32 *parent_id);
 	int (*get_num_parents)(const struct ti_sci_handle *handle, u32 did,
-			       u8 cid, u8 *num_parents);
+			       u32 cid, u32 *num_parents);
 	int (*get_best_match_freq)(const struct ti_sci_handle *handle, u32 did,
-				   u8 cid, u64 min_freq, u64 target_freq,
+				   u32 cid, u64 min_freq, u64 target_freq,
 				   u64 max_freq, u64 *match_freq);
-	int (*set_freq)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*set_freq)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 			u64 min_freq, u64 target_freq, u64 max_freq);
-	int (*get_freq)(const struct ti_sci_handle *handle, u32 did, u8 cid,
+	int (*get_freq)(const struct ti_sci_handle *handle, u32 did, u32 cid,
 			u64 *current_freq);
 };
 
-- 
cgit v1.2.3


From 5740671e596bdc3986a5391997de194300970201 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 12 Jun 2019 14:28:30 +0100
Subject: dma-fence/reservation: Markup rcu protected access for DEBUG_MUTEXES
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mark the access to reservation_object.fence as being protected to
silence sparse.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190612132830.31221-1-chris@chris-wilson.co.uk
---
 include/linux/reservation.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index ee750765cc94..644a22dbe53b 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -216,8 +216,12 @@ reservation_object_unlock(struct reservation_object *obj)
 {
 #ifdef CONFIG_DEBUG_MUTEXES
 	/* Test shared fence slot reservation */
-	if (obj->fence)
-		obj->fence->shared_max = obj->fence->shared_count;
+	if (rcu_access_pointer(obj->fence)) {
+		struct reservation_object_list *fence =
+			reservation_object_get_list(obj);
+
+		fence->shared_max = fence->shared_count;
+	}
 #endif
 	ww_mutex_unlock(&obj->lock);
 }
-- 
cgit v1.2.3


From 09cc560951dbcc74c7f9419f7cf703b57472b7a2 Mon Sep 17 00:00:00 2001
From: Sean Paul <seanpaul@chromium.org>
Date: Wed, 12 Jun 2019 11:00:34 -0400
Subject: drm: Tweak drm_encoder_helper_funcs.enable kerneldoc

I copied the kerneldoc for encoder_funcs.atomic_enable from encoder_funcs.enable
in a recent patch [1]. Sam rightly pointed out in the review that "for symmetry
with" text is awkward [2]. So here's a patch to fix up the source of the awkward
language.

[1] https://patchwork.freedesktop.org/patch/msgid/20190611160844.257498-2-sean@poorly.run
[2] https://patchwork.freedesktop.org/patch/msgid/20190611185352.GA16305@ravnborg.org

Suggested-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Sean Paul <seanpaul@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190612150038.194843-1-sean@poorly.run
---
 include/drm/drm_modeset_helper_vtables.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_modeset_helper_vtables.h b/include/drm/drm_modeset_helper_vtables.h
index f9c94c2a1364..df80131bb10f 100644
--- a/include/drm/drm_modeset_helper_vtables.h
+++ b/include/drm/drm_modeset_helper_vtables.h
@@ -719,11 +719,11 @@ struct drm_encoder_helper_funcs {
 	 * hooks and call them from CRTC's callback by looping over all encoders
 	 * connected to it using for_each_encoder_on_crtc().
 	 *
-	 * This hook is used only by atomic helpers, for symmetry with @disable.
-	 * Atomic drivers don't need to implement it if there's no need to
-	 * enable anything at the encoder level. To ensure that runtime PM handling
-	 * (using either DPMS or the new "ACTIVE" property) works
-	 * @enable must be the inverse of @disable for atomic drivers.
+	 * This hook is only used by atomic helpers, it is the opposite of
+	 * @disable. Atomic drivers don't need to implement it if there's no
+	 * need to enable anything at the encoder level. To ensure that
+	 * runtime PM handling (using either DPMS or the new "ACTIVE" property)
+	 * works @enable must be the inverse of @disable for atomic drivers.
 	 */
 	void (*enable)(struct drm_encoder *encoder);
 
-- 
cgit v1.2.3


From d664c43958e0d9e0b34e23b6f8a8f4cf8ec61a2e Mon Sep 17 00:00:00 2001
From: Enrico Weigelt <info@metux.net>
Date: Wed, 12 Jun 2019 23:59:36 +0200
Subject: gpio: Fix build warnings on undefined struct pinctrl_dev

This fixes the warnings:

* include/linux/gpio.h:254:11: warning: 'struct pinctrl_dev' declared
  inside parameter list will not be visible outside of this definition
  or declaration
* include/linux/gpio/driver.h:602:11: warning: 'struct pinctrl_dev'
  declared inside parameter list will not be visible outside of this
  definition or declaration

Fixes: 78b99577b393 ("pinctrl: remove unused pin_is_valid()")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Enrico Weigelt <info@metux.net>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio.h        | 1 +
 include/linux/gpio/driver.h | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 39745b8bdd65..40915b461f18 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -106,6 +106,7 @@ void devm_gpio_free(struct device *dev, unsigned int gpio);
 
 struct device;
 struct gpio_chip;
+struct pinctrl_dev;
 
 static inline bool gpio_is_valid(int number)
 {
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index a1d273c96016..b58b27c11355 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -590,6 +590,8 @@ void gpiochip_remove_pin_ranges(struct gpio_chip *chip);
 
 #else
 
+struct pinctrl_dev;
+
 static inline int
 gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name,
 		       unsigned int gpio_offset, unsigned int pin_offset,
-- 
cgit v1.2.3


From 68608b5e5063dd12942f1118286c6f595d0c4a05 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Mon, 10 Jun 2019 12:18:56 +0300
Subject: firmware: ti_sci: Add resource management APIs for ringacc, psi-l and
 udma

Configuration of NAVSS resource, like rings, UDMAP channels, flows
and PSI-L thread management need to be done via TISCI.

Add the needed structures and functions for NAVSS resource configuration of
the following:
Rings from Ring Accelerator
PSI-L thread management
UDMAP tchan, rchan and rflow configuration.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Reviewed-by: Lokesh Vutla <lokeshvutla@ti.com>
Signed-off-by: Tero Kristo <t-kristo@ti.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 drivers/firmware/ti_sci.c              | 488 ++++++++++++++++++++++++
 drivers/firmware/ti_sci.h              | 675 +++++++++++++++++++++++++++++++++
 include/linux/soc/ti/ti_sci_protocol.h | 215 +++++++++++
 3 files changed, 1378 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c
index 36ce11a67235..02fa196428d8 100644
--- a/drivers/firmware/ti_sci.c
+++ b/drivers/firmware/ti_sci.c
@@ -2004,6 +2004,481 @@ static int ti_sci_cmd_free_event_map(const struct ti_sci_handle *handle,
 			       ia_id, vint, global_event, vint_status_bit, 0);
 }
 
+/**
+ * ti_sci_cmd_ring_config() - configure RA ring
+ * @handle:		Pointer to TI SCI handle.
+ * @valid_params:	Bitfield defining validity of ring configuration
+ *			parameters
+ * @nav_id:		Device ID of Navigator Subsystem from which the ring is
+ *			allocated
+ * @index:		Ring index
+ * @addr_lo:		The ring base address lo 32 bits
+ * @addr_hi:		The ring base address hi 32 bits
+ * @count:		Number of ring elements
+ * @mode:		The mode of the ring
+ * @size:		The ring element size.
+ * @order_id:		Specifies the ring's bus order ID
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ *
+ * See @ti_sci_msg_rm_ring_cfg_req for more info.
+ */
+static int ti_sci_cmd_ring_config(const struct ti_sci_handle *handle,
+				  u32 valid_params, u16 nav_id, u16 index,
+				  u32 addr_lo, u32 addr_hi, u32 count,
+				  u8 mode, u8 size, u8 order_id)
+{
+	struct ti_sci_msg_rm_ring_cfg_req *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR_OR_NULL(handle))
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_CFG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(info->dev, "RM_RA:Message config failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_rm_ring_cfg_req *)xfer->xfer_buf;
+	req->valid_params = valid_params;
+	req->nav_id = nav_id;
+	req->index = index;
+	req->addr_lo = addr_lo;
+	req->addr_hi = addr_hi;
+	req->count = count;
+	req->mode = mode;
+	req->size = size;
+	req->order_id = order_id;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(info->dev, "RM_RA:Mbox config send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+	dev_dbg(info->dev, "RM_RA:config ring %u ret:%d\n", index, ret);
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_ring_get_config() - get RA ring configuration
+ * @handle:	Pointer to TI SCI handle.
+ * @nav_id:	Device ID of Navigator Subsystem from which the ring is
+ *		allocated
+ * @index:	Ring index
+ * @addr_lo:	Returns ring's base address lo 32 bits
+ * @addr_hi:	Returns ring's base address hi 32 bits
+ * @count:	Returns number of ring elements
+ * @mode:	Returns mode of the ring
+ * @size:	Returns ring element size
+ * @order_id:	Returns ring's bus order ID
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ *
+ * See @ti_sci_msg_rm_ring_get_cfg_req for more info.
+ */
+static int ti_sci_cmd_ring_get_config(const struct ti_sci_handle *handle,
+				      u32 nav_id, u32 index, u8 *mode,
+				      u32 *addr_lo, u32 *addr_hi,
+				      u32 *count, u8 *size, u8 *order_id)
+{
+	struct ti_sci_msg_rm_ring_get_cfg_resp *resp;
+	struct ti_sci_msg_rm_ring_get_cfg_req *req;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR_OR_NULL(handle))
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_GET_CFG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(info->dev,
+			"RM_RA:Message get config failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_rm_ring_get_cfg_req *)xfer->xfer_buf;
+	req->nav_id = nav_id;
+	req->index = index;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(info->dev, "RM_RA:Mbox get config send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_rm_ring_get_cfg_resp *)xfer->xfer_buf;
+
+	if (!ti_sci_is_response_ack(resp)) {
+		ret = -ENODEV;
+	} else {
+		if (mode)
+			*mode = resp->mode;
+		if (addr_lo)
+			*addr_lo = resp->addr_lo;
+		if (addr_hi)
+			*addr_hi = resp->addr_hi;
+		if (count)
+			*count = resp->count;
+		if (size)
+			*size = resp->size;
+		if (order_id)
+			*order_id = resp->order_id;
+	};
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+	dev_dbg(info->dev, "RM_RA:get config ring %u ret:%d\n", index, ret);
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_rm_psil_pair() - Pair PSI-L source to destination thread
+ * @handle:	Pointer to TI SCI handle.
+ * @nav_id:	Device ID of Navigator Subsystem which should be used for
+ *		pairing
+ * @src_thread:	Source PSI-L thread ID
+ * @dst_thread: Destination PSI-L thread ID
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_rm_psil_pair(const struct ti_sci_handle *handle,
+				   u32 nav_id, u32 src_thread, u32 dst_thread)
+{
+	struct ti_sci_msg_psil_pair *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (!handle)
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_PSIL_PAIR,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "RM_PSIL:Message reconfig failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_psil_pair *)xfer->xfer_buf;
+	req->nav_id = nav_id;
+	req->src_thread = src_thread;
+	req->dst_thread = dst_thread;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "RM_PSIL:Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_rm_psil_unpair() - Unpair PSI-L source from destination thread
+ * @handle:	Pointer to TI SCI handle.
+ * @nav_id:	Device ID of Navigator Subsystem which should be used for
+ *		unpairing
+ * @src_thread:	Source PSI-L thread ID
+ * @dst_thread:	Destination PSI-L thread ID
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_rm_psil_unpair(const struct ti_sci_handle *handle,
+				     u32 nav_id, u32 src_thread, u32 dst_thread)
+{
+	struct ti_sci_msg_psil_unpair *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (!handle)
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_PSIL_UNPAIR,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "RM_PSIL:Message reconfig failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_psil_unpair *)xfer->xfer_buf;
+	req->nav_id = nav_id;
+	req->src_thread = src_thread;
+	req->dst_thread = dst_thread;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "RM_PSIL:Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_rm_udmap_tx_ch_cfg() - Configure a UDMAP TX channel
+ * @handle:	Pointer to TI SCI handle.
+ * @params:	Pointer to ti_sci_msg_rm_udmap_tx_ch_cfg TX channel config
+ *		structure
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ *
+ * See @ti_sci_msg_rm_udmap_tx_ch_cfg and @ti_sci_msg_rm_udmap_tx_ch_cfg_req for
+ * more info.
+ */
+static int ti_sci_cmd_rm_udmap_tx_ch_cfg(const struct ti_sci_handle *handle,
+			const struct ti_sci_msg_rm_udmap_tx_ch_cfg *params)
+{
+	struct ti_sci_msg_rm_udmap_tx_ch_cfg_req *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR_OR_NULL(handle))
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_TX_CH_CFG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(info->dev, "Message TX_CH_CFG alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_rm_udmap_tx_ch_cfg_req *)xfer->xfer_buf;
+	req->valid_params = params->valid_params;
+	req->nav_id = params->nav_id;
+	req->index = params->index;
+	req->tx_pause_on_err = params->tx_pause_on_err;
+	req->tx_filt_einfo = params->tx_filt_einfo;
+	req->tx_filt_pswords = params->tx_filt_pswords;
+	req->tx_atype = params->tx_atype;
+	req->tx_chan_type = params->tx_chan_type;
+	req->tx_supr_tdpkt = params->tx_supr_tdpkt;
+	req->tx_fetch_size = params->tx_fetch_size;
+	req->tx_credit_count = params->tx_credit_count;
+	req->txcq_qnum = params->txcq_qnum;
+	req->tx_priority = params->tx_priority;
+	req->tx_qos = params->tx_qos;
+	req->tx_orderid = params->tx_orderid;
+	req->fdepth = params->fdepth;
+	req->tx_sched_priority = params->tx_sched_priority;
+	req->tx_burst_size = params->tx_burst_size;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(info->dev, "Mbox send TX_CH_CFG fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+	dev_dbg(info->dev, "TX_CH_CFG: chn %u ret:%u\n", params->index, ret);
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_rm_udmap_rx_ch_cfg() - Configure a UDMAP RX channel
+ * @handle:	Pointer to TI SCI handle.
+ * @params:	Pointer to ti_sci_msg_rm_udmap_rx_ch_cfg RX channel config
+ *		structure
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ *
+ * See @ti_sci_msg_rm_udmap_rx_ch_cfg and @ti_sci_msg_rm_udmap_rx_ch_cfg_req for
+ * more info.
+ */
+static int ti_sci_cmd_rm_udmap_rx_ch_cfg(const struct ti_sci_handle *handle,
+			const struct ti_sci_msg_rm_udmap_rx_ch_cfg *params)
+{
+	struct ti_sci_msg_rm_udmap_rx_ch_cfg_req *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR_OR_NULL(handle))
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_RX_CH_CFG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(info->dev, "Message RX_CH_CFG alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_rm_udmap_rx_ch_cfg_req *)xfer->xfer_buf;
+	req->valid_params = params->valid_params;
+	req->nav_id = params->nav_id;
+	req->index = params->index;
+	req->rx_fetch_size = params->rx_fetch_size;
+	req->rxcq_qnum = params->rxcq_qnum;
+	req->rx_priority = params->rx_priority;
+	req->rx_qos = params->rx_qos;
+	req->rx_orderid = params->rx_orderid;
+	req->rx_sched_priority = params->rx_sched_priority;
+	req->flowid_start = params->flowid_start;
+	req->flowid_cnt = params->flowid_cnt;
+	req->rx_pause_on_err = params->rx_pause_on_err;
+	req->rx_atype = params->rx_atype;
+	req->rx_chan_type = params->rx_chan_type;
+	req->rx_ignore_short = params->rx_ignore_short;
+	req->rx_ignore_long = params->rx_ignore_long;
+	req->rx_burst_size = params->rx_burst_size;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(info->dev, "Mbox send RX_CH_CFG fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+	dev_dbg(info->dev, "RX_CH_CFG: chn %u ret:%d\n", params->index, ret);
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_rm_udmap_rx_flow_cfg() - Configure UDMAP RX FLOW
+ * @handle:	Pointer to TI SCI handle.
+ * @params:	Pointer to ti_sci_msg_rm_udmap_flow_cfg RX FLOW config
+ *		structure
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ *
+ * See @ti_sci_msg_rm_udmap_flow_cfg and @ti_sci_msg_rm_udmap_flow_cfg_req for
+ * more info.
+ */
+static int ti_sci_cmd_rm_udmap_rx_flow_cfg(const struct ti_sci_handle *handle,
+			const struct ti_sci_msg_rm_udmap_flow_cfg *params)
+{
+	struct ti_sci_msg_rm_udmap_flow_cfg_req *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_xfer *xfer;
+	struct ti_sci_info *info;
+	struct device *dev;
+	int ret = 0;
+
+	if (IS_ERR_OR_NULL(handle))
+		return -EINVAL;
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_FLOW_CFG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "RX_FL_CFG: Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_rm_udmap_flow_cfg_req *)xfer->xfer_buf;
+	req->valid_params = params->valid_params;
+	req->nav_id = params->nav_id;
+	req->flow_index = params->flow_index;
+	req->rx_einfo_present = params->rx_einfo_present;
+	req->rx_psinfo_present = params->rx_psinfo_present;
+	req->rx_error_handling = params->rx_error_handling;
+	req->rx_desc_type = params->rx_desc_type;
+	req->rx_sop_offset = params->rx_sop_offset;
+	req->rx_dest_qnum = params->rx_dest_qnum;
+	req->rx_src_tag_hi = params->rx_src_tag_hi;
+	req->rx_src_tag_lo = params->rx_src_tag_lo;
+	req->rx_dest_tag_hi = params->rx_dest_tag_hi;
+	req->rx_dest_tag_lo = params->rx_dest_tag_lo;
+	req->rx_src_tag_hi_sel = params->rx_src_tag_hi_sel;
+	req->rx_src_tag_lo_sel = params->rx_src_tag_lo_sel;
+	req->rx_dest_tag_hi_sel = params->rx_dest_tag_hi_sel;
+	req->rx_dest_tag_lo_sel = params->rx_dest_tag_lo_sel;
+	req->rx_fdq0_sz0_qnum = params->rx_fdq0_sz0_qnum;
+	req->rx_fdq1_qnum = params->rx_fdq1_qnum;
+	req->rx_fdq2_qnum = params->rx_fdq2_qnum;
+	req->rx_fdq3_qnum = params->rx_fdq3_qnum;
+	req->rx_ps_location = params->rx_ps_location;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "RX_FL_CFG: Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
+	ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+	dev_dbg(info->dev, "RX_FL_CFG: %u ret:%d\n", params->flow_index, ret);
+	return ret;
+}
+
 /*
  * ti_sci_setup_ops() - Setup the operations structures
  * @info:	pointer to TISCI pointer
@@ -2016,6 +2491,9 @@ static void ti_sci_setup_ops(struct ti_sci_info *info)
 	struct ti_sci_clk_ops *cops = &ops->clk_ops;
 	struct ti_sci_rm_core_ops *rm_core_ops = &ops->rm_core_ops;
 	struct ti_sci_rm_irq_ops *iops = &ops->rm_irq_ops;
+	struct ti_sci_rm_ringacc_ops *rops = &ops->rm_ring_ops;
+	struct ti_sci_rm_psil_ops *psilops = &ops->rm_psil_ops;
+	struct ti_sci_rm_udmap_ops *udmap_ops = &ops->rm_udmap_ops;
 
 	core_ops->reboot_device = ti_sci_cmd_core_reboot;
 
@@ -2055,6 +2533,16 @@ static void ti_sci_setup_ops(struct ti_sci_info *info)
 	iops->set_event_map = ti_sci_cmd_set_event_map;
 	iops->free_irq = ti_sci_cmd_free_irq;
 	iops->free_event_map = ti_sci_cmd_free_event_map;
+
+	rops->config = ti_sci_cmd_ring_config;
+	rops->get_config = ti_sci_cmd_ring_get_config;
+
+	psilops->pair = ti_sci_cmd_rm_psil_pair;
+	psilops->unpair = ti_sci_cmd_rm_psil_unpair;
+
+	udmap_ops->tx_ch_cfg = ti_sci_cmd_rm_udmap_tx_ch_cfg;
+	udmap_ops->rx_ch_cfg = ti_sci_cmd_rm_udmap_rx_ch_cfg;
+	udmap_ops->rx_flow_cfg = ti_sci_cmd_rm_udmap_rx_flow_cfg;
 }
 
 /**
diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h
index 4983827151bf..2bb81ec7793c 100644
--- a/drivers/firmware/ti_sci.h
+++ b/drivers/firmware/ti_sci.h
@@ -42,6 +42,35 @@
 #define TI_SCI_MSG_SET_IRQ		0x1000
 #define TI_SCI_MSG_FREE_IRQ		0x1001
 
+/* NAVSS resource management */
+/* Ringacc requests */
+#define TI_SCI_MSG_RM_RING_ALLOCATE		0x1100
+#define TI_SCI_MSG_RM_RING_FREE			0x1101
+#define TI_SCI_MSG_RM_RING_RECONFIG		0x1102
+#define TI_SCI_MSG_RM_RING_RESET		0x1103
+#define TI_SCI_MSG_RM_RING_CFG			0x1110
+#define TI_SCI_MSG_RM_RING_GET_CFG		0x1111
+
+/* PSI-L requests */
+#define TI_SCI_MSG_RM_PSIL_PAIR			0x1280
+#define TI_SCI_MSG_RM_PSIL_UNPAIR		0x1281
+
+#define TI_SCI_MSG_RM_UDMAP_TX_ALLOC		0x1200
+#define TI_SCI_MSG_RM_UDMAP_TX_FREE		0x1201
+#define TI_SCI_MSG_RM_UDMAP_RX_ALLOC		0x1210
+#define TI_SCI_MSG_RM_UDMAP_RX_FREE		0x1211
+#define TI_SCI_MSG_RM_UDMAP_FLOW_CFG		0x1220
+#define TI_SCI_MSG_RM_UDMAP_OPT_FLOW_CFG	0x1221
+
+#define TISCI_MSG_RM_UDMAP_TX_CH_CFG		0x1205
+#define TISCI_MSG_RM_UDMAP_TX_CH_GET_CFG	0x1206
+#define TISCI_MSG_RM_UDMAP_RX_CH_CFG		0x1215
+#define TISCI_MSG_RM_UDMAP_RX_CH_GET_CFG	0x1216
+#define TISCI_MSG_RM_UDMAP_FLOW_CFG		0x1230
+#define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_CFG	0x1231
+#define TISCI_MSG_RM_UDMAP_FLOW_GET_CFG		0x1232
+#define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_GET_CFG	0x1233
+
 /**
  * struct ti_sci_msg_hdr - Generic Message Header for All messages and responses
  * @type:	Type of messages: One of TI_SCI_MSG* values
@@ -563,4 +592,650 @@ struct ti_sci_msg_req_manage_irq {
 	u8 secondary_host;
 } __packed;
 
+/**
+ * struct ti_sci_msg_rm_ring_cfg_req - Configure a Navigator Subsystem ring
+ *
+ * Configures the non-real-time registers of a Navigator Subsystem ring.
+ * @hdr:	Generic Header
+ * @valid_params: Bitfield defining validity of ring configuration parameters.
+ *	The ring configuration fields are not valid, and will not be used for
+ *	ring configuration, if their corresponding valid bit is zero.
+ *	Valid bit usage:
+ *	0 - Valid bit for @tisci_msg_rm_ring_cfg_req addr_lo
+ *	1 - Valid bit for @tisci_msg_rm_ring_cfg_req addr_hi
+ *	2 - Valid bit for @tisci_msg_rm_ring_cfg_req count
+ *	3 - Valid bit for @tisci_msg_rm_ring_cfg_req mode
+ *	4 - Valid bit for @tisci_msg_rm_ring_cfg_req size
+ *	5 - Valid bit for @tisci_msg_rm_ring_cfg_req order_id
+ * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated
+ * @index: ring index to be configured.
+ * @addr_lo: 32 LSBs of ring base address to be programmed into the ring's
+ *	RING_BA_LO register
+ * @addr_hi: 16 MSBs of ring base address to be programmed into the ring's
+ *	RING_BA_HI register.
+ * @count: Number of ring elements. Must be even if mode is CREDENTIALS or QM
+ *	modes.
+ * @mode: Specifies the mode the ring is to be configured.
+ * @size: Specifies encoded ring element size. To calculate the encoded size use
+ *	the formula (log2(size_bytes) - 2), where size_bytes cannot be
+ *	greater than 256.
+ * @order_id: Specifies the ring's bus order ID.
+ */
+struct ti_sci_msg_rm_ring_cfg_req {
+	struct ti_sci_msg_hdr hdr;
+	u32 valid_params;
+	u16 nav_id;
+	u16 index;
+	u32 addr_lo;
+	u32 addr_hi;
+	u32 count;
+	u8 mode;
+	u8 size;
+	u8 order_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_rm_ring_get_cfg_req - Get RA ring's configuration
+ *
+ * Gets the configuration of the non-real-time register fields of a ring.  The
+ * host, or a supervisor of the host, who owns the ring must be the requesting
+ * host.  The values of the non-real-time registers are returned in
+ * @ti_sci_msg_rm_ring_get_cfg_resp.
+ *
+ * @hdr: Generic Header
+ * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated
+ * @index: ring index.
+ */
+struct ti_sci_msg_rm_ring_get_cfg_req {
+	struct ti_sci_msg_hdr hdr;
+	u16 nav_id;
+	u16 index;
+} __packed;
+
+/**
+ * struct ti_sci_msg_rm_ring_get_cfg_resp -  Ring get configuration response
+ *
+ * Response received by host processor after RM has handled
+ * @ti_sci_msg_rm_ring_get_cfg_req. The response contains the ring's
+ * non-real-time register values.
+ *
+ * @hdr: Generic Header
+ * @addr_lo: Ring 32 LSBs of base address
+ * @addr_hi: Ring 16 MSBs of base address.
+ * @count: Ring number of elements.
+ * @mode: Ring mode.
+ * @size: encoded Ring element size
+ * @order_id: ing order ID.
+ */
+struct ti_sci_msg_rm_ring_get_cfg_resp {
+	struct ti_sci_msg_hdr hdr;
+	u32 addr_lo;
+	u32 addr_hi;
+	u32 count;
+	u8 mode;
+	u8 size;
+	u8 order_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_psil_pair - Pairs a PSI-L source thread to a destination
+ *				 thread
+ * @hdr:	Generic Header
+ * @nav_id:	SoC Navigator Subsystem device ID whose PSI-L config proxy is
+ *		used to pair the source and destination threads.
+ * @src_thread:	PSI-L source thread ID within the PSI-L System thread map.
+ *
+ * UDMAP transmit channels mapped to source threads will have their
+ * TCHAN_THRD_ID register programmed with the destination thread if the pairing
+ * is successful.
+
+ * @dst_thread: PSI-L destination thread ID within the PSI-L System thread map.
+ * PSI-L destination threads start at index 0x8000.  The request is NACK'd if
+ * the destination thread is not greater than or equal to 0x8000.
+ *
+ * UDMAP receive channels mapped to destination threads will have their
+ * RCHAN_THRD_ID register programmed with the source thread if the pairing
+ * is successful.
+ *
+ * Request type is TI_SCI_MSG_RM_PSIL_PAIR, response is a generic ACK or NACK
+ * message.
+ */
+struct ti_sci_msg_psil_pair {
+	struct ti_sci_msg_hdr hdr;
+	u32 nav_id;
+	u32 src_thread;
+	u32 dst_thread;
+} __packed;
+
+/**
+ * struct ti_sci_msg_psil_unpair - Unpairs a PSI-L source thread from a
+ *				   destination thread
+ * @hdr:	Generic Header
+ * @nav_id:	SoC Navigator Subsystem device ID whose PSI-L config proxy is
+ *		used to unpair the source and destination threads.
+ * @src_thread:	PSI-L source thread ID within the PSI-L System thread map.
+ *
+ * UDMAP transmit channels mapped to source threads will have their
+ * TCHAN_THRD_ID register cleared if the unpairing is successful.
+ *
+ * @dst_thread: PSI-L destination thread ID within the PSI-L System thread map.
+ * PSI-L destination threads start at index 0x8000.  The request is NACK'd if
+ * the destination thread is not greater than or equal to 0x8000.
+ *
+ * UDMAP receive channels mapped to destination threads will have their
+ * RCHAN_THRD_ID register cleared if the unpairing is successful.
+ *
+ * Request type is TI_SCI_MSG_RM_PSIL_UNPAIR, response is a generic ACK or NACK
+ * message.
+ */
+struct ti_sci_msg_psil_unpair {
+	struct ti_sci_msg_hdr hdr;
+	u32 nav_id;
+	u32 src_thread;
+	u32 dst_thread;
+} __packed;
+
+/**
+ * struct ti_sci_msg_udmap_rx_flow_cfg -  UDMAP receive flow configuration
+ *					  message
+ * @hdr: Generic Header
+ * @nav_id: SoC Navigator Subsystem device ID from which the receive flow is
+ *	allocated
+ * @flow_index: UDMAP receive flow index for non-optional configuration.
+ * @rx_ch_index: Specifies the index of the receive channel using the flow_index
+ * @rx_einfo_present: UDMAP receive flow extended packet info present.
+ * @rx_psinfo_present: UDMAP receive flow PS words present.
+ * @rx_error_handling: UDMAP receive flow error handling configuration. Valid
+ *	values are TI_SCI_RM_UDMAP_RX_FLOW_ERR_DROP/RETRY.
+ * @rx_desc_type: UDMAP receive flow descriptor type. It can be one of
+ *	TI_SCI_RM_UDMAP_RX_FLOW_DESC_HOST/MONO.
+ * @rx_sop_offset: UDMAP receive flow start of packet offset.
+ * @rx_dest_qnum: UDMAP receive flow destination queue number.
+ * @rx_ps_location: UDMAP receive flow PS words location.
+ *	0 - end of packet descriptor
+ *	1 - Beginning of the data buffer
+ * @rx_src_tag_hi: UDMAP receive flow source tag high byte constant
+ * @rx_src_tag_lo: UDMAP receive flow source tag low byte constant
+ * @rx_dest_tag_hi: UDMAP receive flow destination tag high byte constant
+ * @rx_dest_tag_lo: UDMAP receive flow destination tag low byte constant
+ * @rx_src_tag_hi_sel: UDMAP receive flow source tag high byte selector
+ * @rx_src_tag_lo_sel: UDMAP receive flow source tag low byte selector
+ * @rx_dest_tag_hi_sel: UDMAP receive flow destination tag high byte selector
+ * @rx_dest_tag_lo_sel: UDMAP receive flow destination tag low byte selector
+ * @rx_size_thresh_en: UDMAP receive flow packet size based free buffer queue
+ *	enable. If enabled, the ti_sci_rm_udmap_rx_flow_opt_cfg also need to be
+ *	configured and sent.
+ * @rx_fdq0_sz0_qnum: UDMAP receive flow free descriptor queue 0.
+ * @rx_fdq1_qnum: UDMAP receive flow free descriptor queue 1.
+ * @rx_fdq2_qnum: UDMAP receive flow free descriptor queue 2.
+ * @rx_fdq3_qnum: UDMAP receive flow free descriptor queue 3.
+ *
+ * For detailed information on the settings, see the UDMAP section of the TRM.
+ */
+struct ti_sci_msg_udmap_rx_flow_cfg {
+	struct ti_sci_msg_hdr hdr;
+	u32 nav_id;
+	u32 flow_index;
+	u32 rx_ch_index;
+	u8 rx_einfo_present;
+	u8 rx_psinfo_present;
+	u8 rx_error_handling;
+	u8 rx_desc_type;
+	u16 rx_sop_offset;
+	u16 rx_dest_qnum;
+	u8 rx_ps_location;
+	u8 rx_src_tag_hi;
+	u8 rx_src_tag_lo;
+	u8 rx_dest_tag_hi;
+	u8 rx_dest_tag_lo;
+	u8 rx_src_tag_hi_sel;
+	u8 rx_src_tag_lo_sel;
+	u8 rx_dest_tag_hi_sel;
+	u8 rx_dest_tag_lo_sel;
+	u8 rx_size_thresh_en;
+	u16 rx_fdq0_sz0_qnum;
+	u16 rx_fdq1_qnum;
+	u16 rx_fdq2_qnum;
+	u16 rx_fdq3_qnum;
+} __packed;
+
+/**
+ * struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg - parameters for UDMAP receive
+ *						flow optional configuration
+ * @hdr: Generic Header
+ * @nav_id: SoC Navigator Subsystem device ID from which the receive flow is
+ *	allocated
+ * @flow_index: UDMAP receive flow index for optional configuration.
+ * @rx_ch_index: Specifies the index of the receive channel using the flow_index
+ * @rx_size_thresh0: UDMAP receive flow packet size threshold 0.
+ * @rx_size_thresh1: UDMAP receive flow packet size threshold 1.
+ * @rx_size_thresh2: UDMAP receive flow packet size threshold 2.
+ * @rx_fdq0_sz1_qnum: UDMAP receive flow free descriptor queue for size
+ *	threshold 1.
+ * @rx_fdq0_sz2_qnum: UDMAP receive flow free descriptor queue for size
+ *	threshold 2.
+ * @rx_fdq0_sz3_qnum: UDMAP receive flow free descriptor queue for size
+ *	threshold 3.
+ *
+ * For detailed information on the settings, see the UDMAP section of the TRM.
+ */
+struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg {
+	struct ti_sci_msg_hdr hdr;
+	u32 nav_id;
+	u32 flow_index;
+	u32 rx_ch_index;
+	u16 rx_size_thresh0;
+	u16 rx_size_thresh1;
+	u16 rx_size_thresh2;
+	u16 rx_fdq0_sz1_qnum;
+	u16 rx_fdq0_sz2_qnum;
+	u16 rx_fdq0_sz3_qnum;
+} __packed;
+
+/**
+ * Configures a Navigator Subsystem UDMAP transmit channel
+ *
+ * Configures the non-real-time registers of a Navigator Subsystem UDMAP
+ * transmit channel.  The channel index must be assigned to the host defined
+ * in the TISCI header via the RM board configuration resource assignment
+ * range list.
+ *
+ * @hdr: Generic Header
+ *
+ * @valid_params: Bitfield defining validity of tx channel configuration
+ * parameters. The tx channel configuration fields are not valid, and will not
+ * be used for ch configuration, if their corresponding valid bit is zero.
+ * Valid bit usage:
+ *    0 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_pause_on_err
+ *    1 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_atype
+ *    2 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_chan_type
+ *    3 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_fetch_size
+ *    4 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::txcq_qnum
+ *    5 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_priority
+ *    6 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_qos
+ *    7 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_orderid
+ *    8 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_sched_priority
+ *    9 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_filt_einfo
+ *   10 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_filt_pswords
+ *   11 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_supr_tdpkt
+ *   12 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_credit_count
+ *   13 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::fdepth
+ *   14 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_burst_size
+ *
+ * @nav_id: SoC device ID of Navigator Subsystem where tx channel is located
+ *
+ * @index: UDMAP transmit channel index.
+ *
+ * @tx_pause_on_err: UDMAP transmit channel pause on error configuration to
+ * be programmed into the tx_pause_on_err field of the channel's TCHAN_TCFG
+ * register.
+ *
+ * @tx_filt_einfo: UDMAP transmit channel extended packet information passing
+ * configuration to be programmed into the tx_filt_einfo field of the
+ * channel's TCHAN_TCFG register.
+ *
+ * @tx_filt_pswords: UDMAP transmit channel protocol specific word passing
+ * configuration to be programmed into the tx_filt_pswords field of the
+ * channel's TCHAN_TCFG register.
+ *
+ * @tx_atype: UDMAP transmit channel non Ring Accelerator access pointer
+ * interpretation configuration to be programmed into the tx_atype field of
+ * the channel's TCHAN_TCFG register.
+ *
+ * @tx_chan_type: UDMAP transmit channel functional channel type and work
+ * passing mechanism configuration to be programmed into the tx_chan_type
+ * field of the channel's TCHAN_TCFG register.
+ *
+ * @tx_supr_tdpkt: UDMAP transmit channel teardown packet generation suppression
+ * configuration to be programmed into the tx_supr_tdpkt field of the channel's
+ * TCHAN_TCFG register.
+ *
+ * @tx_fetch_size: UDMAP transmit channel number of 32-bit descriptor words to
+ * fetch configuration to be programmed into the tx_fetch_size field of the
+ * channel's TCHAN_TCFG register.  The user must make sure to set the maximum
+ * word count that can pass through the channel for any allowed descriptor type.
+ *
+ * @tx_credit_count: UDMAP transmit channel transfer request credit count
+ * configuration to be programmed into the count field of the TCHAN_TCREDIT
+ * register.  Specifies how many credits for complete TRs are available.
+ *
+ * @txcq_qnum: UDMAP transmit channel completion queue configuration to be
+ * programmed into the txcq_qnum field of the TCHAN_TCQ register. The specified
+ * completion queue must be assigned to the host, or a subordinate of the host,
+ * requesting configuration of the transmit channel.
+ *
+ * @tx_priority: UDMAP transmit channel transmit priority value to be programmed
+ * into the priority field of the channel's TCHAN_TPRI_CTRL register.
+ *
+ * @tx_qos: UDMAP transmit channel transmit qos value to be programmed into the
+ * qos field of the channel's TCHAN_TPRI_CTRL register.
+ *
+ * @tx_orderid: UDMAP transmit channel bus order id value to be programmed into
+ * the orderid field of the channel's TCHAN_TPRI_CTRL register.
+ *
+ * @fdepth: UDMAP transmit channel FIFO depth configuration to be programmed
+ * into the fdepth field of the TCHAN_TFIFO_DEPTH register. Sets the number of
+ * Tx FIFO bytes which are allowed to be stored for the channel. Check the UDMAP
+ * section of the TRM for restrictions regarding this parameter.
+ *
+ * @tx_sched_priority: UDMAP transmit channel tx scheduling priority
+ * configuration to be programmed into the priority field of the channel's
+ * TCHAN_TST_SCHED register.
+ *
+ * @tx_burst_size: UDMAP transmit channel burst size configuration to be
+ * programmed into the tx_burst_size field of the TCHAN_TCFG register.
+ */
+struct ti_sci_msg_rm_udmap_tx_ch_cfg_req {
+	struct ti_sci_msg_hdr hdr;
+	u32 valid_params;
+	u16 nav_id;
+	u16 index;
+	u8 tx_pause_on_err;
+	u8 tx_filt_einfo;
+	u8 tx_filt_pswords;
+	u8 tx_atype;
+	u8 tx_chan_type;
+	u8 tx_supr_tdpkt;
+	u16 tx_fetch_size;
+	u8 tx_credit_count;
+	u16 txcq_qnum;
+	u8 tx_priority;
+	u8 tx_qos;
+	u8 tx_orderid;
+	u16 fdepth;
+	u8 tx_sched_priority;
+	u8 tx_burst_size;
+} __packed;
+
+/**
+ * Configures a Navigator Subsystem UDMAP receive channel
+ *
+ * Configures the non-real-time registers of a Navigator Subsystem UDMAP
+ * receive channel.  The channel index must be assigned to the host defined
+ * in the TISCI header via the RM board configuration resource assignment
+ * range list.
+ *
+ * @hdr: Generic Header
+ *
+ * @valid_params: Bitfield defining validity of rx channel configuration
+ * parameters.
+ * The rx channel configuration fields are not valid, and will not be used for
+ * ch configuration, if their corresponding valid bit is zero.
+ * Valid bit usage:
+ *    0 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_pause_on_err
+ *    1 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_atype
+ *    2 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_chan_type
+ *    3 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_fetch_size
+ *    4 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rxcq_qnum
+ *    5 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_priority
+ *    6 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_qos
+ *    7 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_orderid
+ *    8 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_sched_priority
+ *    9 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::flowid_start
+ *   10 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::flowid_cnt
+ *   11 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_ignore_short
+ *   12 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_ignore_long
+ *   14 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_burst_size
+ *
+ * @nav_id: SoC device ID of Navigator Subsystem where rx channel is located
+ *
+ * @index: UDMAP receive channel index.
+ *
+ * @rx_fetch_size: UDMAP receive channel number of 32-bit descriptor words to
+ * fetch configuration to be programmed into the rx_fetch_size field of the
+ * channel's RCHAN_RCFG register.
+ *
+ * @rxcq_qnum: UDMAP receive channel completion queue configuration to be
+ * programmed into the rxcq_qnum field of the RCHAN_RCQ register.
+ * The specified completion queue must be assigned to the host, or a subordinate
+ * of the host, requesting configuration of the receive channel.
+ *
+ * @rx_priority: UDMAP receive channel receive priority value to be programmed
+ * into the priority field of the channel's RCHAN_RPRI_CTRL register.
+ *
+ * @rx_qos: UDMAP receive channel receive qos value to be programmed into the
+ * qos field of the channel's RCHAN_RPRI_CTRL register.
+ *
+ * @rx_orderid: UDMAP receive channel bus order id value to be programmed into
+ * the orderid field of the channel's RCHAN_RPRI_CTRL register.
+ *
+ * @rx_sched_priority: UDMAP receive channel rx scheduling priority
+ * configuration to be programmed into the priority field of the channel's
+ * RCHAN_RST_SCHED register.
+ *
+ * @flowid_start: UDMAP receive channel additional flows starting index
+ * configuration to program into the flow_start field of the RCHAN_RFLOW_RNG
+ * register. Specifies the starting index for flow IDs the receive channel is to
+ * make use of beyond the default flow. flowid_start and @ref flowid_cnt must be
+ * set as valid and configured together. The starting flow ID set by
+ * @ref flowid_cnt must be a flow index within the Navigator Subsystem's subset
+ * of flows beyond the default flows statically mapped to receive channels.
+ * The additional flows must be assigned to the host, or a subordinate of the
+ * host, requesting configuration of the receive channel.
+ *
+ * @flowid_cnt: UDMAP receive channel additional flows count configuration to
+ * program into the flowid_cnt field of the RCHAN_RFLOW_RNG register.
+ * This field specifies how many flow IDs are in the additional contiguous range
+ * of legal flow IDs for the channel.  @ref flowid_start and flowid_cnt must be
+ * set as valid and configured together. Disabling the valid_params field bit
+ * for flowid_cnt indicates no flow IDs other than the default are to be
+ * allocated and used by the receive channel. @ref flowid_start plus flowid_cnt
+ * cannot be greater than the number of receive flows in the receive channel's
+ * Navigator Subsystem.  The additional flows must be assigned to the host, or a
+ * subordinate of the host, requesting configuration of the receive channel.
+ *
+ * @rx_pause_on_err: UDMAP receive channel pause on error configuration to be
+ * programmed into the rx_pause_on_err field of the channel's RCHAN_RCFG
+ * register.
+ *
+ * @rx_atype: UDMAP receive channel non Ring Accelerator access pointer
+ * interpretation configuration to be programmed into the rx_atype field of the
+ * channel's RCHAN_RCFG register.
+ *
+ * @rx_chan_type: UDMAP receive channel functional channel type and work passing
+ * mechanism configuration to be programmed into the rx_chan_type field of the
+ * channel's RCHAN_RCFG register.
+ *
+ * @rx_ignore_short: UDMAP receive channel short packet treatment configuration
+ * to be programmed into the rx_ignore_short field of the RCHAN_RCFG register.
+ *
+ * @rx_ignore_long: UDMAP receive channel long packet treatment configuration to
+ * be programmed into the rx_ignore_long field of the RCHAN_RCFG register.
+ *
+ * @rx_burst_size: UDMAP receive channel burst size configuration to be
+ * programmed into the rx_burst_size field of the RCHAN_RCFG register.
+ */
+struct ti_sci_msg_rm_udmap_rx_ch_cfg_req {
+	struct ti_sci_msg_hdr hdr;
+	u32 valid_params;
+	u16 nav_id;
+	u16 index;
+	u16 rx_fetch_size;
+	u16 rxcq_qnum;
+	u8 rx_priority;
+	u8 rx_qos;
+	u8 rx_orderid;
+	u8 rx_sched_priority;
+	u16 flowid_start;
+	u16 flowid_cnt;
+	u8 rx_pause_on_err;
+	u8 rx_atype;
+	u8 rx_chan_type;
+	u8 rx_ignore_short;
+	u8 rx_ignore_long;
+	u8 rx_burst_size;
+} __packed;
+
+/**
+ * Configures a Navigator Subsystem UDMAP receive flow
+ *
+ * Configures a Navigator Subsystem UDMAP receive flow's registers.
+ * Configuration does not include the flow registers which handle size-based
+ * free descriptor queue routing.
+ *
+ * The flow index must be assigned to the host defined in the TISCI header via
+ * the RM board configuration resource assignment range list.
+ *
+ * @hdr: Standard TISCI header
+ *
+ * @valid_params
+ * Bitfield defining validity of rx flow configuration parameters.  The
+ * rx flow configuration fields are not valid, and will not be used for flow
+ * configuration, if their corresponding valid bit is zero.  Valid bit usage:
+ *     0 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_einfo_present
+ *     1 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_psinfo_present
+ *     2 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_error_handling
+ *     3 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_desc_type
+ *     4 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_sop_offset
+ *     5 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_qnum
+ *     6 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_hi
+ *     7 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_lo
+ *     8 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_hi
+ *     9 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_lo
+ *    10 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_hi_sel
+ *    11 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_lo_sel
+ *    12 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_hi_sel
+ *    13 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_lo_sel
+ *    14 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq0_sz0_qnum
+ *    15 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq1_sz0_qnum
+ *    16 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq2_sz0_qnum
+ *    17 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq3_sz0_qnum
+ *    18 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_ps_location
+ *
+ * @nav_id: SoC device ID of Navigator Subsystem from which the receive flow is
+ * allocated
+ *
+ * @flow_index: UDMAP receive flow index for non-optional configuration.
+ *
+ * @rx_einfo_present:
+ * UDMAP receive flow extended packet info present configuration to be
+ * programmed into the rx_einfo_present field of the flow's RFLOW_RFA register.
+ *
+ * @rx_psinfo_present:
+ * UDMAP receive flow PS words present configuration to be programmed into the
+ * rx_psinfo_present field of the flow's RFLOW_RFA register.
+ *
+ * @rx_error_handling:
+ * UDMAP receive flow error handling configuration to be programmed into the
+ * rx_error_handling field of the flow's RFLOW_RFA register.
+ *
+ * @rx_desc_type:
+ * UDMAP receive flow descriptor type configuration to be programmed into the
+ * rx_desc_type field field of the flow's RFLOW_RFA register.
+ *
+ * @rx_sop_offset:
+ * UDMAP receive flow start of packet offset configuration to be programmed
+ * into the rx_sop_offset field of the RFLOW_RFA register.  See the UDMAP
+ * section of the TRM for more information on this setting.  Valid values for
+ * this field are 0-255 bytes.
+ *
+ * @rx_dest_qnum:
+ * UDMAP receive flow destination queue configuration to be programmed into the
+ * rx_dest_qnum field of the flow's RFLOW_RFA register.  The specified
+ * destination queue must be valid within the Navigator Subsystem and must be
+ * owned by the host, or a subordinate of the host, requesting allocation and
+ * configuration of the receive flow.
+ *
+ * @rx_src_tag_hi:
+ * UDMAP receive flow source tag high byte constant configuration to be
+ * programmed into the rx_src_tag_hi field of the flow's RFLOW_RFB register.
+ * See the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_src_tag_lo:
+ * UDMAP receive flow source tag low byte constant configuration to be
+ * programmed into the rx_src_tag_lo field of the flow's RFLOW_RFB register.
+ * See the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_dest_tag_hi:
+ * UDMAP receive flow destination tag high byte constant configuration to be
+ * programmed into the rx_dest_tag_hi field of the flow's RFLOW_RFB register.
+ * See the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_dest_tag_lo:
+ * UDMAP receive flow destination tag low byte constant configuration to be
+ * programmed into the rx_dest_tag_lo field of the flow's RFLOW_RFB register.
+ * See the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_src_tag_hi_sel:
+ * UDMAP receive flow source tag high byte selector configuration to be
+ * programmed into the rx_src_tag_hi_sel field of the RFLOW_RFC register.  See
+ * the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_src_tag_lo_sel:
+ * UDMAP receive flow source tag low byte selector configuration to be
+ * programmed into the rx_src_tag_lo_sel field of the RFLOW_RFC register.  See
+ * the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_dest_tag_hi_sel:
+ * UDMAP receive flow destination tag high byte selector configuration to be
+ * programmed into the rx_dest_tag_hi_sel field of the RFLOW_RFC register.  See
+ * the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_dest_tag_lo_sel:
+ * UDMAP receive flow destination tag low byte selector configuration to be
+ * programmed into the rx_dest_tag_lo_sel field of the RFLOW_RFC register.  See
+ * the UDMAP section of the TRM for more information on this setting.
+ *
+ * @rx_fdq0_sz0_qnum:
+ * UDMAP receive flow free descriptor queue 0 configuration to be programmed
+ * into the rx_fdq0_sz0_qnum field of the flow's RFLOW_RFD register.  See the
+ * UDMAP section of the TRM for more information on this setting. The specified
+ * free queue must be valid within the Navigator Subsystem and must be owned
+ * by the host, or a subordinate of the host, requesting allocation and
+ * configuration of the receive flow.
+ *
+ * @rx_fdq1_qnum:
+ * UDMAP receive flow free descriptor queue 1 configuration to be programmed
+ * into the rx_fdq1_qnum field of the flow's RFLOW_RFD register.  See the
+ * UDMAP section of the TRM for more information on this setting.  The specified
+ * free queue must be valid within the Navigator Subsystem and must be owned
+ * by the host, or a subordinate of the host, requesting allocation and
+ * configuration of the receive flow.
+ *
+ * @rx_fdq2_qnum:
+ * UDMAP receive flow free descriptor queue 2 configuration to be programmed
+ * into the rx_fdq2_qnum field of the flow's RFLOW_RFE register.  See the
+ * UDMAP section of the TRM for more information on this setting.  The specified
+ * free queue must be valid within the Navigator Subsystem and must be owned
+ * by the host, or a subordinate of the host, requesting allocation and
+ * configuration of the receive flow.
+ *
+ * @rx_fdq3_qnum:
+ * UDMAP receive flow free descriptor queue 3 configuration to be programmed
+ * into the rx_fdq3_qnum field of the flow's RFLOW_RFE register.  See the
+ * UDMAP section of the TRM for more information on this setting.  The specified
+ * free queue must be valid within the Navigator Subsystem and must be owned
+ * by the host, or a subordinate of the host, requesting allocation and
+ * configuration of the receive flow.
+ *
+ * @rx_ps_location:
+ * UDMAP receive flow PS words location configuration to be programmed into the
+ * rx_ps_location field of the flow's RFLOW_RFA register.
+ */
+struct ti_sci_msg_rm_udmap_flow_cfg_req {
+	struct ti_sci_msg_hdr hdr;
+	u32 valid_params;
+	u16 nav_id;
+	u16 flow_index;
+	u8 rx_einfo_present;
+	u8 rx_psinfo_present;
+	u8 rx_error_handling;
+	u8 rx_desc_type;
+	u16 rx_sop_offset;
+	u16 rx_dest_qnum;
+	u8 rx_src_tag_hi;
+	u8 rx_src_tag_lo;
+	u8 rx_dest_tag_hi;
+	u8 rx_dest_tag_lo;
+	u8 rx_src_tag_hi_sel;
+	u8 rx_src_tag_lo_sel;
+	u8 rx_dest_tag_hi_sel;
+	u8 rx_dest_tag_lo_sel;
+	u16 rx_fdq0_sz0_qnum;
+	u16 rx_fdq1_qnum;
+	u16 rx_fdq2_qnum;
+	u16 rx_fdq3_qnum;
+	u8 rx_ps_location;
+} __packed;
+
 #endif /* __TI_SCI_H */
diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h
index 568722a041bf..4fd9bff5806b 100644
--- a/include/linux/soc/ti/ti_sci_protocol.h
+++ b/include/linux/soc/ti/ti_sci_protocol.h
@@ -241,6 +241,218 @@ struct ti_sci_rm_irq_ops {
 			      u16 global_event, u8 vint_status_bit);
 };
 
+/* RA config.addr_lo parameter is valid for RM ring configure TI_SCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_ADDR_LO_VALID	BIT(0)
+/* RA config.addr_hi parameter is valid for RM ring configure TI_SCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_ADDR_HI_VALID	BIT(1)
+ /* RA config.count parameter is valid for RM ring configure TI_SCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_COUNT_VALID	BIT(2)
+/* RA config.mode parameter is valid for RM ring configure TI_SCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_MODE_VALID	BIT(3)
+/* RA config.size parameter is valid for RM ring configure TI_SCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID	BIT(4)
+/* RA config.order_id parameter is valid for RM ring configure TISCI message */
+#define TI_SCI_MSG_VALUE_RM_RING_ORDER_ID_VALID	BIT(5)
+
+#define TI_SCI_MSG_VALUE_RM_ALL_NO_ORDER \
+	(TI_SCI_MSG_VALUE_RM_RING_ADDR_LO_VALID | \
+	TI_SCI_MSG_VALUE_RM_RING_ADDR_HI_VALID | \
+	TI_SCI_MSG_VALUE_RM_RING_COUNT_VALID | \
+	TI_SCI_MSG_VALUE_RM_RING_MODE_VALID | \
+	TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID)
+
+/**
+ * struct ti_sci_rm_ringacc_ops - Ring Accelerator Management operations
+ * @config: configure the SoC Navigator Subsystem Ring Accelerator ring
+ * @get_config: get the SoC Navigator Subsystem Ring Accelerator ring
+ *		configuration
+ */
+struct ti_sci_rm_ringacc_ops {
+	int (*config)(const struct ti_sci_handle *handle,
+		      u32 valid_params, u16 nav_id, u16 index,
+		      u32 addr_lo, u32 addr_hi, u32 count, u8 mode,
+		      u8 size, u8 order_id
+	);
+	int (*get_config)(const struct ti_sci_handle *handle,
+			  u32 nav_id, u32 index, u8 *mode,
+			  u32 *addr_lo, u32 *addr_hi, u32 *count,
+			  u8 *size, u8 *order_id);
+};
+
+/**
+ * struct ti_sci_rm_psil_ops - PSI-L thread operations
+ * @pair: pair PSI-L source thread to a destination thread.
+ *	If the src_thread is mapped to UDMA tchan, the corresponding channel's
+ *	TCHAN_THRD_ID register is updated.
+ *	If the dst_thread is mapped to UDMA rchan, the corresponding channel's
+ *	RCHAN_THRD_ID register is updated.
+ * @unpair: unpair PSI-L source thread from a destination thread.
+ *	If the src_thread is mapped to UDMA tchan, the corresponding channel's
+ *	TCHAN_THRD_ID register is cleared.
+ *	If the dst_thread is mapped to UDMA rchan, the corresponding channel's
+ *	RCHAN_THRD_ID register is cleared.
+ */
+struct ti_sci_rm_psil_ops {
+	int (*pair)(const struct ti_sci_handle *handle, u32 nav_id,
+		    u32 src_thread, u32 dst_thread);
+	int (*unpair)(const struct ti_sci_handle *handle, u32 nav_id,
+		      u32 src_thread, u32 dst_thread);
+};
+
+/* UDMAP channel types */
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR		2
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR_SB		3	/* RX only */
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_PBRR		10
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_PBVR		11
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBRR	12
+#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBVR	13
+
+#define TI_SCI_RM_UDMAP_RX_FLOW_DESC_HOST		0
+#define TI_SCI_RM_UDMAP_RX_FLOW_DESC_MONO		2
+
+#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES	1
+#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES	2
+#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES	3
+
+/* UDMAP TX/RX channel valid_params common declarations */
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID		BIT(0)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID                BIT(1)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_CHAN_TYPE_VALID            BIT(2)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_FETCH_SIZE_VALID           BIT(3)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID              BIT(4)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_PRIORITY_VALID             BIT(5)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_QOS_VALID                  BIT(6)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_ORDER_ID_VALID             BIT(7)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_SCHED_PRIORITY_VALID       BIT(8)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_BURST_SIZE_VALID		BIT(14)
+
+/**
+ * Configures a Navigator Subsystem UDMAP transmit channel
+ *
+ * Configures a Navigator Subsystem UDMAP transmit channel registers.
+ * See @ti_sci_msg_rm_udmap_tx_ch_cfg_req
+ */
+struct ti_sci_msg_rm_udmap_tx_ch_cfg {
+	u32 valid_params;
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_EINFO_VALID        BIT(9)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_PSWORDS_VALID      BIT(10)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_SUPR_TDPKT_VALID        BIT(11)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_CREDIT_COUNT_VALID      BIT(12)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FDEPTH_VALID            BIT(13)
+	u16 nav_id;
+	u16 index;
+	u8 tx_pause_on_err;
+	u8 tx_filt_einfo;
+	u8 tx_filt_pswords;
+	u8 tx_atype;
+	u8 tx_chan_type;
+	u8 tx_supr_tdpkt;
+	u16 tx_fetch_size;
+	u8 tx_credit_count;
+	u16 txcq_qnum;
+	u8 tx_priority;
+	u8 tx_qos;
+	u8 tx_orderid;
+	u16 fdepth;
+	u8 tx_sched_priority;
+	u8 tx_burst_size;
+};
+
+/**
+ * Configures a Navigator Subsystem UDMAP receive channel
+ *
+ * Configures a Navigator Subsystem UDMAP receive channel registers.
+ * See @ti_sci_msg_rm_udmap_rx_ch_cfg_req
+ */
+struct ti_sci_msg_rm_udmap_rx_ch_cfg {
+	u32 valid_params;
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID      BIT(9)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID        BIT(10)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_IGNORE_SHORT_VALID      BIT(11)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_IGNORE_LONG_VALID       BIT(12)
+	u16 nav_id;
+	u16 index;
+	u16 rx_fetch_size;
+	u16 rxcq_qnum;
+	u8 rx_priority;
+	u8 rx_qos;
+	u8 rx_orderid;
+	u8 rx_sched_priority;
+	u16 flowid_start;
+	u16 flowid_cnt;
+	u8 rx_pause_on_err;
+	u8 rx_atype;
+	u8 rx_chan_type;
+	u8 rx_ignore_short;
+	u8 rx_ignore_long;
+	u8 rx_burst_size;
+};
+
+/**
+ * Configures a Navigator Subsystem UDMAP receive flow
+ *
+ * Configures a Navigator Subsystem UDMAP receive flow's registers.
+ * See @tis_ci_msg_rm_udmap_flow_cfg_req
+ */
+struct ti_sci_msg_rm_udmap_flow_cfg {
+	u32 valid_params;
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_EINFO_PRESENT_VALID	BIT(0)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_PSINFO_PRESENT_VALID     BIT(1)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_ERROR_HANDLING_VALID     BIT(2)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DESC_TYPE_VALID          BIT(3)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SOP_OFFSET_VALID         BIT(4)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_QNUM_VALID          BIT(5)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_HI_VALID         BIT(6)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_LO_VALID         BIT(7)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_HI_VALID        BIT(8)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_LO_VALID        BIT(9)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_HI_SEL_VALID     BIT(10)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_LO_SEL_VALID     BIT(11)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_HI_SEL_VALID    BIT(12)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_LO_SEL_VALID    BIT(13)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ0_SZ0_QNUM_VALID      BIT(14)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ1_QNUM_VALID          BIT(15)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ2_QNUM_VALID          BIT(16)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ3_QNUM_VALID          BIT(17)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_PS_LOCATION_VALID        BIT(18)
+	u16 nav_id;
+	u16 flow_index;
+	u8 rx_einfo_present;
+	u8 rx_psinfo_present;
+	u8 rx_error_handling;
+	u8 rx_desc_type;
+	u16 rx_sop_offset;
+	u16 rx_dest_qnum;
+	u8 rx_src_tag_hi;
+	u8 rx_src_tag_lo;
+	u8 rx_dest_tag_hi;
+	u8 rx_dest_tag_lo;
+	u8 rx_src_tag_hi_sel;
+	u8 rx_src_tag_lo_sel;
+	u8 rx_dest_tag_hi_sel;
+	u8 rx_dest_tag_lo_sel;
+	u16 rx_fdq0_sz0_qnum;
+	u16 rx_fdq1_qnum;
+	u16 rx_fdq2_qnum;
+	u16 rx_fdq3_qnum;
+	u8 rx_ps_location;
+};
+
+/**
+ * struct ti_sci_rm_udmap_ops - UDMA Management operations
+ * @tx_ch_cfg: configure SoC Navigator Subsystem UDMA transmit channel.
+ * @rx_ch_cfg: configure SoC Navigator Subsystem UDMA receive channel.
+ * @rx_flow_cfg1: configure SoC Navigator Subsystem UDMA receive flow.
+ */
+struct ti_sci_rm_udmap_ops {
+	int (*tx_ch_cfg)(const struct ti_sci_handle *handle,
+			 const struct ti_sci_msg_rm_udmap_tx_ch_cfg *params);
+	int (*rx_ch_cfg)(const struct ti_sci_handle *handle,
+			 const struct ti_sci_msg_rm_udmap_rx_ch_cfg *params);
+	int (*rx_flow_cfg)(const struct ti_sci_handle *handle,
+			   const struct ti_sci_msg_rm_udmap_flow_cfg *params);
+};
+
 /**
  * struct ti_sci_ops - Function support for TI SCI
  * @dev_ops:	Device specific operations
@@ -254,6 +466,9 @@ struct ti_sci_ops {
 	struct ti_sci_clk_ops clk_ops;
 	struct ti_sci_rm_core_ops rm_core_ops;
 	struct ti_sci_rm_irq_ops rm_irq_ops;
+	struct ti_sci_rm_ringacc_ops rm_ring_ops;
+	struct ti_sci_rm_psil_ops rm_psil_ops;
+	struct ti_sci_rm_udmap_ops rm_udmap_ops;
 };
 
 /**
-- 
cgit v1.2.3


From 1e407f337f4015c8ffc56e7cfd70e06b2e9fc9da Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Wed, 5 Jun 2019 17:33:34 -0500
Subject: firmware: ti_sci: Add support for processor control

Texas Instrument's System Control Interface (TI-SCI) Message Protocol
is used in Texas Instrument's System on Chip (SoC) such as those
in K3 family AM654 SoC to communicate between various compute
processors with a central system controller entity.

The system controller provides various services including the control
of other compute processors within the SoC. Extend the TI-SCI protocol
support to add various TI-SCI commands to invoke services associated
with power and reset control, and boot vector management of the
various compute processors from the Linux kernel.

Signed-off-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Tero Kristo <t-kristo@ti.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 drivers/firmware/ti_sci.c              | 350 +++++++++++++++++++++++++++++++++
 drivers/firmware/ti_sci.h              | 135 +++++++++++++
 include/linux/soc/ti/ti_sci_protocol.h |  31 +++
 3 files changed, 516 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c
index 02fa196428d8..b47e33e7411f 100644
--- a/drivers/firmware/ti_sci.c
+++ b/drivers/firmware/ti_sci.c
@@ -2479,6 +2479,348 @@ fail:
 	return ret;
 }
 
+/**
+ * ti_sci_cmd_proc_request() - Command to request a physical processor control
+ * @handle:	Pointer to TI SCI handle
+ * @proc_id:	Processor ID this request is for
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_request(const struct ti_sci_handle *handle,
+				   u8 proc_id)
+{
+	struct ti_sci_msg_req_proc_request *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_REQUEST,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_proc_request *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_release() - Command to release a physical processor control
+ * @handle:	Pointer to TI SCI handle
+ * @proc_id:	Processor ID this request is for
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_release(const struct ti_sci_handle *handle,
+				   u8 proc_id)
+{
+	struct ti_sci_msg_req_proc_release *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_RELEASE,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_proc_release *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_handover() - Command to handover a physical processor
+ *				control to a host in the processor's access
+ *				control list.
+ * @handle:	Pointer to TI SCI handle
+ * @proc_id:	Processor ID this request is for
+ * @host_id:	Host ID to get the control of the processor
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_handover(const struct ti_sci_handle *handle,
+				    u8 proc_id, u8 host_id)
+{
+	struct ti_sci_msg_req_proc_handover *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_HANDOVER,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_proc_handover *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+	req->host_id = host_id;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_set_config() - Command to set the processor boot
+ *				    configuration flags
+ * @handle:		Pointer to TI SCI handle
+ * @proc_id:		Processor ID this request is for
+ * @config_flags_set:	Configuration flags to be set
+ * @config_flags_clear:	Configuration flags to be cleared.
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_set_config(const struct ti_sci_handle *handle,
+				      u8 proc_id, u64 bootvector,
+				      u32 config_flags_set,
+				      u32 config_flags_clear)
+{
+	struct ti_sci_msg_req_set_config *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_SET_CONFIG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_set_config *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+	req->bootvector_low = bootvector & TI_SCI_ADDR_LOW_MASK;
+	req->bootvector_high = (bootvector & TI_SCI_ADDR_HIGH_MASK) >>
+				TI_SCI_ADDR_HIGH_SHIFT;
+	req->config_flags_set = config_flags_set;
+	req->config_flags_clear = config_flags_clear;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_set_control() - Command to set the processor boot
+ *				     control flags
+ * @handle:			Pointer to TI SCI handle
+ * @proc_id:			Processor ID this request is for
+ * @control_flags_set:		Control flags to be set
+ * @control_flags_clear:	Control flags to be cleared
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_set_control(const struct ti_sci_handle *handle,
+				       u8 proc_id, u32 control_flags_set,
+				       u32 control_flags_clear)
+{
+	struct ti_sci_msg_req_set_ctrl *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_SET_CTRL,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_set_ctrl *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+	req->control_flags_set = control_flags_set;
+	req->control_flags_clear = control_flags_clear;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_get_boot_status() - Command to get the processor boot status
+ * @handle:	Pointer to TI SCI handle
+ * @proc_id:	Processor ID this request is for
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_get_status(const struct ti_sci_handle *handle,
+				      u8 proc_id, u64 *bv, u32 *cfg_flags,
+				      u32 *ctrl_flags, u32 *sts_flags)
+{
+	struct ti_sci_msg_resp_get_status *resp;
+	struct ti_sci_msg_req_get_status *req;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_GET_STATUS,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_get_status *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_resp_get_status *)xfer->tx_message.buf;
+
+	if (!ti_sci_is_response_ack(resp)) {
+		ret = -ENODEV;
+	} else {
+		*bv = (resp->bootvector_low & TI_SCI_ADDR_LOW_MASK) |
+		      (((u64)resp->bootvector_high << TI_SCI_ADDR_HIGH_SHIFT) &
+		       TI_SCI_ADDR_HIGH_MASK);
+		*cfg_flags = resp->config_flags;
+		*ctrl_flags = resp->control_flags;
+		*sts_flags = resp->status_flags;
+	}
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
 /*
  * ti_sci_setup_ops() - Setup the operations structures
  * @info:	pointer to TISCI pointer
@@ -2494,6 +2836,7 @@ static void ti_sci_setup_ops(struct ti_sci_info *info)
 	struct ti_sci_rm_ringacc_ops *rops = &ops->rm_ring_ops;
 	struct ti_sci_rm_psil_ops *psilops = &ops->rm_psil_ops;
 	struct ti_sci_rm_udmap_ops *udmap_ops = &ops->rm_udmap_ops;
+	struct ti_sci_proc_ops *pops = &ops->proc_ops;
 
 	core_ops->reboot_device = ti_sci_cmd_core_reboot;
 
@@ -2543,6 +2886,13 @@ static void ti_sci_setup_ops(struct ti_sci_info *info)
 	udmap_ops->tx_ch_cfg = ti_sci_cmd_rm_udmap_tx_ch_cfg;
 	udmap_ops->rx_ch_cfg = ti_sci_cmd_rm_udmap_rx_ch_cfg;
 	udmap_ops->rx_flow_cfg = ti_sci_cmd_rm_udmap_rx_flow_cfg;
+
+	pops->request = ti_sci_cmd_proc_request;
+	pops->release = ti_sci_cmd_proc_release;
+	pops->handover = ti_sci_cmd_proc_handover;
+	pops->set_config = ti_sci_cmd_proc_set_config;
+	pops->set_control = ti_sci_cmd_proc_set_control;
+	pops->get_status = ti_sci_cmd_proc_get_status;
 }
 
 /**
diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h
index 2bb81ec7793c..662dcffef311 100644
--- a/drivers/firmware/ti_sci.h
+++ b/drivers/firmware/ti_sci.h
@@ -71,6 +71,14 @@
 #define TISCI_MSG_RM_UDMAP_FLOW_GET_CFG		0x1232
 #define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_GET_CFG	0x1233
 
+/* Processor Control requests */
+#define TI_SCI_MSG_PROC_REQUEST		0xc000
+#define TI_SCI_MSG_PROC_RELEASE		0xc001
+#define TI_SCI_MSG_PROC_HANDOVER	0xc005
+#define TI_SCI_MSG_SET_CONFIG		0xc100
+#define TI_SCI_MSG_SET_CTRL		0xc101
+#define TI_SCI_MSG_GET_STATUS		0xc400
+
 /**
  * struct ti_sci_msg_hdr - Generic Message Header for All messages and responses
  * @type:	Type of messages: One of TI_SCI_MSG* values
@@ -1238,4 +1246,131 @@ struct ti_sci_msg_rm_udmap_flow_cfg_req {
 	u8 rx_ps_location;
 } __packed;
 
+/**
+ * struct ti_sci_msg_req_proc_request - Request a processor
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being requested
+ *
+ * Request type is TI_SCI_MSG_PROC_REQUEST, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_proc_request {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_proc_release - Release a processor
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being released
+ *
+ * Request type is TI_SCI_MSG_PROC_RELEASE, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_proc_release {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_proc_handover - Handover a processor to a host
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being handed over
+ * @host_id:		Host ID the control needs to be transferred to
+ *
+ * Request type is TI_SCI_MSG_PROC_HANDOVER, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_proc_handover {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u8 host_id;
+} __packed;
+
+/* Boot Vector masks */
+#define TI_SCI_ADDR_LOW_MASK			GENMASK_ULL(31, 0)
+#define TI_SCI_ADDR_HIGH_MASK			GENMASK_ULL(63, 32)
+#define TI_SCI_ADDR_HIGH_SHIFT			32
+
+/**
+ * struct ti_sci_msg_req_set_config - Set Processor boot configuration
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being configured
+ * @bootvector_low:	Lower 32 bit address (Little Endian) of boot vector
+ * @bootvector_high:	Higher 32 bit address (Little Endian) of boot vector
+ * @config_flags_set:	Optional Processor specific Config Flags to set.
+ *			Setting a bit here implies the corresponding mode
+ *			will be set
+ * @config_flags_clear:	Optional Processor specific Config Flags to clear.
+ *			Setting a bit here implies the corresponding mode
+ *			will be cleared
+ *
+ * Request type is TI_SCI_MSG_PROC_HANDOVER, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_set_config {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 bootvector_low;
+	u32 bootvector_high;
+	u32 config_flags_set;
+	u32 config_flags_clear;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_set_ctrl - Set Processor boot control flags
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being configured
+ * @control_flags_set:	Optional Processor specific Control Flags to set.
+ *			Setting a bit here implies the corresponding mode
+ *			will be set
+ * @control_flags_clear:Optional Processor specific Control Flags to clear.
+ *			Setting a bit here implies the corresponding mode
+ *			will be cleared
+ *
+ * Request type is TI_SCI_MSG_SET_CTRL, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_set_ctrl {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 control_flags_set;
+	u32 control_flags_clear;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_get_status - Processor boot status request
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor whose status is being requested
+ *
+ * Request type is TI_SCI_MSG_GET_STATUS, response is an appropriate
+ * message, or NACK in case of inability to satisfy request.
+ */
+struct ti_sci_msg_req_get_status {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_resp_get_status - Processor boot status response
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor whose status is returned
+ * @bootvector_low:	Lower 32 bit address (Little Endian) of boot vector
+ * @bootvector_high:	Higher 32 bit address (Little Endian) of boot vector
+ * @config_flags:	Optional Processor specific Config Flags set currently
+ * @control_flags:	Optional Processor specific Control Flags set currently
+ * @status_flags:	Optional Processor specific Status Flags set currently
+ *
+ * Response structure to a TI_SCI_MSG_GET_STATUS request.
+ */
+struct ti_sci_msg_resp_get_status {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 bootvector_low;
+	u32 bootvector_high;
+	u32 config_flags;
+	u32 control_flags;
+	u32 status_flags;
+} __packed;
+
 #endif /* __TI_SCI_H */
diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h
index 4fd9bff5806b..7b3762f68df9 100644
--- a/include/linux/soc/ti/ti_sci_protocol.h
+++ b/include/linux/soc/ti/ti_sci_protocol.h
@@ -453,12 +453,42 @@ struct ti_sci_rm_udmap_ops {
 			   const struct ti_sci_msg_rm_udmap_flow_cfg *params);
 };
 
+/**
+ * struct ti_sci_proc_ops - Processor Control operations
+ * @request:	Request to control a physical processor. The requesting host
+ *		should be in the processor access list
+ * @release:	Relinquish a physical processor control
+ * @handover:	Handover a physical processor control to another host
+ *		in the permitted list
+ * @set_config:	Set base configuration of a processor
+ * @set_control: Setup limited control flags in specific cases
+ * @get_status: Get the state of physical processor
+ *
+ * NOTE: The following paramteres are generic in nature for all these ops,
+ * -handle:	Pointer to TI SCI handle as retrieved by *ti_sci_get_handle
+ * -pid:	Processor ID
+ * -hid:	Host ID
+ */
+struct ti_sci_proc_ops {
+	int (*request)(const struct ti_sci_handle *handle, u8 pid);
+	int (*release)(const struct ti_sci_handle *handle, u8 pid);
+	int (*handover)(const struct ti_sci_handle *handle, u8 pid, u8 hid);
+	int (*set_config)(const struct ti_sci_handle *handle, u8 pid,
+			  u64 boot_vector, u32 cfg_set, u32 cfg_clr);
+	int (*set_control)(const struct ti_sci_handle *handle, u8 pid,
+			   u32 ctrl_set, u32 ctrl_clr);
+	int (*get_status)(const struct ti_sci_handle *handle, u8 pid,
+			  u64 *boot_vector, u32 *cfg_flags, u32 *ctrl_flags,
+			  u32 *status_flags);
+};
+
 /**
  * struct ti_sci_ops - Function support for TI SCI
  * @dev_ops:	Device specific operations
  * @clk_ops:	Clock specific operations
  * @rm_core_ops:	Resource management core operations.
  * @rm_irq_ops:		IRQ management specific operations
+ * @proc_ops:	Processor Control specific operations
  */
 struct ti_sci_ops {
 	struct ti_sci_core_ops core_ops;
@@ -469,6 +499,7 @@ struct ti_sci_ops {
 	struct ti_sci_rm_ringacc_ops rm_ring_ops;
 	struct ti_sci_rm_psil_ops rm_psil_ops;
 	struct ti_sci_rm_udmap_ops rm_udmap_ops;
+	struct ti_sci_proc_ops proc_ops;
 };
 
 /**
-- 
cgit v1.2.3


From 57c84d5c9348bda5e9129bc4e4e567546915ad8c Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Thu, 13 Jun 2019 09:30:40 +0200
Subject: drm: Remove lock interfaces from GEM VRAM helpers

The lock functions and the locked-pin/unpin functions of GEM VRAM are not
required any longer. Remove them.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Acked-by: Gerd Hoffmann <kraxel@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190613073041.29350-9-tzimmermann@suse.de
---
 drivers/gpu/drm/drm_gem_vram_helper.c | 109 ----------------------------------
 include/drm/drm_gem_vram_helper.h     |   5 --
 2 files changed, 114 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index f3e5803affb0..4cae52054e61 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -151,36 +151,6 @@ void drm_gem_vram_put(struct drm_gem_vram_object *gbo)
 }
 EXPORT_SYMBOL(drm_gem_vram_put);
 
-/**
- * drm_gem_vram_lock() - Locks a VRAM-backed GEM object
- * @gbo:	the GEM VRAM object
- * @no_wait:	don't wait for buffer object to become available
- *
- * See ttm_bo_reserve() for more information.
- *
- * Returns:
- * 0 on success, or
- * a negative error code otherwise
- */
-int drm_gem_vram_lock(struct drm_gem_vram_object *gbo, bool no_wait)
-{
-	return ttm_bo_reserve(&gbo->bo, true, no_wait, NULL);
-}
-EXPORT_SYMBOL(drm_gem_vram_lock);
-
-/**
- * drm_gem_vram_unlock() - \
-	Release a reservation acquired by drm_gem_vram_lock()
- * @gbo:	the GEM VRAM object
- *
- * See ttm_bo_unreserve() for more information.
- */
-void drm_gem_vram_unlock(struct drm_gem_vram_object *gbo)
-{
-	ttm_bo_unreserve(&gbo->bo);
-}
-EXPORT_SYMBOL(drm_gem_vram_unlock);
-
 /**
  * drm_gem_vram_mmap_offset() - Returns a GEM VRAM object's mmap offset
  * @gbo:	the GEM VRAM object
@@ -266,49 +236,6 @@ err_ttm_bo_unreserve:
 }
 EXPORT_SYMBOL(drm_gem_vram_pin);
 
-/**
- * drm_gem_vram_pin_locked() - Pins a GEM VRAM object in a region.
- * @gbo:	the GEM VRAM object
- * @pl_flag:	a bitmask of possible memory regions
- *
- * Pinning a buffer object ensures that it is not evicted from
- * a memory region. A pinned buffer object has to be unpinned before
- * it can be pinned to another region.
- *
- * This function pins a GEM VRAM object that has already been
- * locked. Use drm_gem_vram_pin() if possible.
- *
- * Returns:
- * 0 on success, or
- * a negative error code otherwise.
- */
-int drm_gem_vram_pin_locked(struct drm_gem_vram_object *gbo,
-			    unsigned long pl_flag)
-{
-	int i, ret;
-	struct ttm_operation_ctx ctx = { false, false };
-
-	lockdep_assert_held(&gbo->bo.resv->lock.base);
-
-	if (gbo->pin_count) {
-		++gbo->pin_count;
-		return 0;
-	}
-
-	drm_gem_vram_placement(gbo, pl_flag);
-	for (i = 0; i < gbo->placement.num_placement; ++i)
-		gbo->placements[i].flags |= TTM_PL_FLAG_NO_EVICT;
-
-	ret = ttm_bo_validate(&gbo->bo, &gbo->placement, &ctx);
-	if (ret < 0)
-		return ret;
-
-	gbo->pin_count = 1;
-
-	return 0;
-}
-EXPORT_SYMBOL(drm_gem_vram_pin_locked);
-
 /**
  * drm_gem_vram_unpin() - Unpins a GEM VRAM object
  * @gbo:	the GEM VRAM object
@@ -351,42 +278,6 @@ err_ttm_bo_unreserve:
 }
 EXPORT_SYMBOL(drm_gem_vram_unpin);
 
-/**
- * drm_gem_vram_unpin_locked() - Unpins a GEM VRAM object
- * @gbo:	the GEM VRAM object
- *
- * This function unpins a GEM VRAM object that has already been
- * locked. Use drm_gem_vram_unpin() if possible.
- *
- * Returns:
- * 0 on success, or
- * a negative error code otherwise.
- */
-int drm_gem_vram_unpin_locked(struct drm_gem_vram_object *gbo)
-{
-	int i, ret;
-	struct ttm_operation_ctx ctx = { false, false };
-
-	lockdep_assert_held(&gbo->bo.resv->lock.base);
-
-	if (WARN_ON_ONCE(!gbo->pin_count))
-		return 0;
-
-	--gbo->pin_count;
-	if (gbo->pin_count)
-		return 0;
-
-	for (i = 0; i < gbo->placement.num_placement ; ++i)
-		gbo->placements[i].flags &= ~TTM_PL_FLAG_NO_EVICT;
-
-	ret = ttm_bo_validate(&gbo->bo, &gbo->placement, &ctx);
-	if (ret < 0)
-		return ret;
-
-	return 0;
-}
-EXPORT_SYMBOL(drm_gem_vram_unpin_locked);
-
 /**
  * drm_gem_vram_kmap_at() - Maps a GEM VRAM object into kernel address space
  * @gbo:	the GEM VRAM object
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index 4d1d2c1bf32b..1d4aa87f8dfa 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -77,15 +77,10 @@ struct drm_gem_vram_object *drm_gem_vram_create(struct drm_device *dev,
 						unsigned long pg_align,
 						bool interruptible);
 void drm_gem_vram_put(struct drm_gem_vram_object *gbo);
-int drm_gem_vram_lock(struct drm_gem_vram_object *gbo, bool no_wait);
-void drm_gem_vram_unlock(struct drm_gem_vram_object *gbo);
 u64 drm_gem_vram_mmap_offset(struct drm_gem_vram_object *gbo);
 s64 drm_gem_vram_offset(struct drm_gem_vram_object *gbo);
 int drm_gem_vram_pin(struct drm_gem_vram_object *gbo, unsigned long pl_flag);
-int drm_gem_vram_pin_locked(struct drm_gem_vram_object *gbo,
-			      unsigned long pl_flag);
 int drm_gem_vram_unpin(struct drm_gem_vram_object *gbo);
-int drm_gem_vram_unpin_locked(struct drm_gem_vram_object *gbo);
 void *drm_gem_vram_kmap_at(struct drm_gem_vram_object *gbo, bool map,
 			   bool *is_iomem, struct ttm_bo_kmap_obj *kmap);
 void *drm_gem_vram_kmap(struct drm_gem_vram_object *gbo, bool map,
-- 
cgit v1.2.3


From 921721738f4994b54f916f9a3bdd5dfdf26f753c Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Thu, 13 Jun 2019 09:30:41 +0200
Subject: drm: Remove functions with kmap-object argument from GEM VRAM helpers

The GEM VRAM functions with kmap-object argument are not required any
longer. Remove them.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Acked-by: Gerd Hoffmann <kraxel@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190613073041.29350-10-tzimmermann@suse.de
---
 drivers/gpu/drm/drm_gem_vram_helper.c | 50 ++++++-----------------------------
 include/drm/drm_gem_vram_helper.h     |  4 ---
 2 files changed, 8 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index 4cae52054e61..4de782ca26b2 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -279,12 +279,11 @@ err_ttm_bo_unreserve:
 EXPORT_SYMBOL(drm_gem_vram_unpin);
 
 /**
- * drm_gem_vram_kmap_at() - Maps a GEM VRAM object into kernel address space
+ * drm_gem_vram_kmap() - Maps a GEM VRAM object into kernel address space
  * @gbo:	the GEM VRAM object
  * @map:	establish a mapping if necessary
  * @is_iomem:	returns true if the mapped memory is I/O memory, or false \
 	otherwise; can be NULL
- * @kmap:	the mapping's kmap object
  *
  * This function maps the buffer object into the kernel's address space
  * or returns the current mapping. If the parameter map is false, the
@@ -296,10 +295,11 @@ EXPORT_SYMBOL(drm_gem_vram_unpin);
  * NULL if not mapped, or
  * an ERR_PTR()-encoded error code otherwise.
  */
-void *drm_gem_vram_kmap_at(struct drm_gem_vram_object *gbo, bool map,
-			   bool *is_iomem, struct ttm_bo_kmap_obj *kmap)
+void *drm_gem_vram_kmap(struct drm_gem_vram_object *gbo, bool map,
+			bool *is_iomem)
 {
 	int ret;
+	struct ttm_bo_kmap_obj *kmap = &gbo->kmap;
 
 	if (kmap->virtual || !map)
 		goto out;
@@ -317,56 +317,22 @@ out:
 	}
 	return ttm_kmap_obj_virtual(kmap, is_iomem);
 }
-EXPORT_SYMBOL(drm_gem_vram_kmap_at);
-
-/**
- * drm_gem_vram_kmap() - Maps a GEM VRAM object into kernel address space
- * @gbo:	the GEM VRAM object
- * @map:	establish a mapping if necessary
- * @is_iomem:	returns true if the mapped memory is I/O memory, or false \
-	otherwise; can be NULL
- *
- * This function maps the buffer object into the kernel's address space
- * or returns the current mapping. If the parameter map is false, the
- * function only queries the current mapping, but does not establish a
- * new one.
- *
- * Returns:
- * The buffers virtual address if mapped, or
- * NULL if not mapped, or
- * an ERR_PTR()-encoded error code otherwise.
- */
-void *drm_gem_vram_kmap(struct drm_gem_vram_object *gbo, bool map,
-			bool *is_iomem)
-{
-	return drm_gem_vram_kmap_at(gbo, map, is_iomem, &gbo->kmap);
-}
 EXPORT_SYMBOL(drm_gem_vram_kmap);
 
 /**
- * drm_gem_vram_kunmap_at() - Unmaps a GEM VRAM object
+ * drm_gem_vram_kunmap() - Unmaps a GEM VRAM object
  * @gbo:	the GEM VRAM object
- * @kmap:	the mapping's kmap object
  */
-void drm_gem_vram_kunmap_at(struct drm_gem_vram_object *gbo,
-			    struct ttm_bo_kmap_obj *kmap)
+void drm_gem_vram_kunmap(struct drm_gem_vram_object *gbo)
 {
+	struct ttm_bo_kmap_obj *kmap = &gbo->kmap;
+
 	if (!kmap->virtual)
 		return;
 
 	ttm_bo_kunmap(kmap);
 	kmap->virtual = NULL;
 }
-EXPORT_SYMBOL(drm_gem_vram_kunmap_at);
-
-/**
- * drm_gem_vram_kunmap() - Unmaps a GEM VRAM object
- * @gbo:	the GEM VRAM object
- */
-void drm_gem_vram_kunmap(struct drm_gem_vram_object *gbo)
-{
-	drm_gem_vram_kunmap_at(gbo, &gbo->kmap);
-}
 EXPORT_SYMBOL(drm_gem_vram_kunmap);
 
 /**
diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index 1d4aa87f8dfa..9581ea0a4f7e 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -81,12 +81,8 @@ u64 drm_gem_vram_mmap_offset(struct drm_gem_vram_object *gbo);
 s64 drm_gem_vram_offset(struct drm_gem_vram_object *gbo);
 int drm_gem_vram_pin(struct drm_gem_vram_object *gbo, unsigned long pl_flag);
 int drm_gem_vram_unpin(struct drm_gem_vram_object *gbo);
-void *drm_gem_vram_kmap_at(struct drm_gem_vram_object *gbo, bool map,
-			   bool *is_iomem, struct ttm_bo_kmap_obj *kmap);
 void *drm_gem_vram_kmap(struct drm_gem_vram_object *gbo, bool map,
 			bool *is_iomem);
-void drm_gem_vram_kunmap_at(struct drm_gem_vram_object *gbo,
-			    struct ttm_bo_kmap_obj *kmap);
 void drm_gem_vram_kunmap(struct drm_gem_vram_object *gbo);
 
 int drm_gem_vram_fill_create_dumb(struct drm_file *file,
-- 
cgit v1.2.3


From 18c8c0954d15105b02f7d2f556b9eafae426871f Mon Sep 17 00:00:00 2001
From: Wesley Sheng <wesley.sheng@microchip.com>
Date: Tue, 30 Apr 2019 18:04:29 +0800
Subject: NTB: correct ntb_dev_ops and ntb_dev comment typos

The comment for ntb_dev_ops and ntb_dev incorrectly referred to
ntb_ctx_ops and ntb_device.

Signed-off-by: Wesley Sheng <wesley.sheng@microchip.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 include/linux/ntb.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index 56a92e3ae3ae..604abc883741 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -205,7 +205,7 @@ static inline int ntb_ctx_ops_is_valid(const struct ntb_ctx_ops *ops)
 }
 
 /**
- * struct ntb_ctx_ops - ntb device operations
+ * struct ntb_dev_ops - ntb device operations
  * @port_number:	See ntb_port_number().
  * @peer_port_count:	See ntb_peer_port_count().
  * @peer_port_number:	See ntb_peer_port_number().
@@ -404,7 +404,7 @@ struct ntb_client {
 #define drv_ntb_client(__drv) container_of((__drv), struct ntb_client, drv)
 
 /**
- * struct ntb_device - ntb device
+ * struct ntb_dev - ntb device
  * @dev:		Linux device object.
  * @pdev:		PCI device entry of the ntb.
  * @topo:		Detected topology of the ntb.
-- 
cgit v1.2.3


From d7cc609fb679e11dc2b00cbe6c50cbd37ec4bfa2 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Thu, 23 May 2019 16:30:51 -0600
Subject: PCI/MSI: Support allocating virtual MSI interrupts

For NTB devices, we want to be able to trigger MSI interrupts
through a memory window. In these cases we may want to use
more interrupts than the NTB PCI device has available in its MSI-X
table.

We allow for this by creating a new 'virtual' interrupt. These
interrupts are allocated as usual but are not programmed into the
MSI-X table (as there may not be space for them).

The MSI address and data will then handled through an NTB MSI library
introduced later in this series.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 drivers/pci/msi.c   | 54 ++++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/msi.h |  8 ++++++++
 include/linux/pci.h |  9 +++++++++
 3 files changed, 62 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index e039b740fe74..ace978deaf93 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -192,6 +192,9 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
 
 static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
 {
+	if (desc->msi_attrib.is_virtual)
+		return NULL;
+
 	return desc->mask_base +
 		desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
 }
@@ -206,14 +209,19 @@ static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
 u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag)
 {
 	u32 mask_bits = desc->masked;
+	void __iomem *desc_addr;
 
 	if (pci_msi_ignore_mask)
 		return 0;
+	desc_addr = pci_msix_desc_addr(desc);
+	if (!desc_addr)
+		return 0;
 
 	mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
 	if (flag)
 		mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
-	writel(mask_bits, pci_msix_desc_addr(desc) + PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+	writel(mask_bits, desc_addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
 
 	return mask_bits;
 }
@@ -273,6 +281,11 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 	if (entry->msi_attrib.is_msix) {
 		void __iomem *base = pci_msix_desc_addr(entry);
 
+		if (!base) {
+			WARN_ON(1);
+			return;
+		}
+
 		msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR);
 		msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR);
 		msg->data = readl(base + PCI_MSIX_ENTRY_DATA);
@@ -303,6 +316,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 	} else if (entry->msi_attrib.is_msix) {
 		void __iomem *base = pci_msix_desc_addr(entry);
 
+		if (!base)
+			goto skip;
+
 		writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
 		writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
 		writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
@@ -327,7 +343,13 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 					      msg->data);
 		}
 	}
+
+skip:
 	entry->msg = *msg;
+
+	if (entry->write_msi_msg)
+		entry->write_msi_msg(entry, entry->write_msi_msg_data);
+
 }
 
 void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
@@ -550,6 +572,7 @@ msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd)
 
 	entry->msi_attrib.is_msix	= 0;
 	entry->msi_attrib.is_64		= !!(control & PCI_MSI_FLAGS_64BIT);
+	entry->msi_attrib.is_virtual    = 0;
 	entry->msi_attrib.entry_nr	= 0;
 	entry->msi_attrib.maskbit	= !!(control & PCI_MSI_FLAGS_MASKBIT);
 	entry->msi_attrib.default_irq	= dev->irq;	/* Save IOAPIC IRQ */
@@ -674,6 +697,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 	struct irq_affinity_desc *curmsk, *masks = NULL;
 	struct msi_desc *entry;
 	int ret, i;
+	int vec_count = pci_msix_vec_count(dev);
 
 	if (affd)
 		masks = irq_create_affinity_masks(nvec, affd);
@@ -696,6 +720,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 			entry->msi_attrib.entry_nr = entries[i].entry;
 		else
 			entry->msi_attrib.entry_nr = i;
+
+		entry->msi_attrib.is_virtual =
+			entry->msi_attrib.entry_nr >= vec_count;
+
 		entry->msi_attrib.default_irq	= dev->irq;
 		entry->mask_base		= base;
 
@@ -714,12 +742,19 @@ static void msix_program_entries(struct pci_dev *dev,
 {
 	struct msi_desc *entry;
 	int i = 0;
+	void __iomem *desc_addr;
 
 	for_each_pci_msi_entry(entry, dev) {
 		if (entries)
 			entries[i++].vector = entry->irq;
-		entry->masked = readl(pci_msix_desc_addr(entry) +
-				PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+		desc_addr = pci_msix_desc_addr(entry);
+		if (desc_addr)
+			entry->masked = readl(desc_addr +
+					      PCI_MSIX_ENTRY_VECTOR_CTRL);
+		else
+			entry->masked = 0;
+
 		msix_mask_irq(entry, 1);
 	}
 }
@@ -932,7 +967,7 @@ int pci_msix_vec_count(struct pci_dev *dev)
 EXPORT_SYMBOL(pci_msix_vec_count);
 
 static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
-			     int nvec, struct irq_affinity *affd)
+			     int nvec, struct irq_affinity *affd, int flags)
 {
 	int nr_entries;
 	int i, j;
@@ -943,7 +978,7 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
 	nr_entries = pci_msix_vec_count(dev);
 	if (nr_entries < 0)
 		return nr_entries;
-	if (nvec > nr_entries)
+	if (nvec > nr_entries && !(flags & PCI_IRQ_VIRTUAL))
 		return nr_entries;
 
 	if (entries) {
@@ -1079,7 +1114,8 @@ EXPORT_SYMBOL(pci_enable_msi);
 
 static int __pci_enable_msix_range(struct pci_dev *dev,
 				   struct msix_entry *entries, int minvec,
-				   int maxvec, struct irq_affinity *affd)
+				   int maxvec, struct irq_affinity *affd,
+				   int flags)
 {
 	int rc, nvec = maxvec;
 
@@ -1096,7 +1132,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
 				return -ENOSPC;
 		}
 
-		rc = __pci_enable_msix(dev, entries, nvec, affd);
+		rc = __pci_enable_msix(dev, entries, nvec, affd, flags);
 		if (rc == 0)
 			return nvec;
 
@@ -1127,7 +1163,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
 int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
 		int minvec, int maxvec)
 {
-	return __pci_enable_msix_range(dev, entries, minvec, maxvec, NULL);
+	return __pci_enable_msix_range(dev, entries, minvec, maxvec, NULL, 0);
 }
 EXPORT_SYMBOL(pci_enable_msix_range);
 
@@ -1167,7 +1203,7 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 
 	if (flags & PCI_IRQ_MSIX) {
 		msix_vecs = __pci_enable_msix_range(dev, NULL, min_vecs,
-						    max_vecs, affd);
+						    max_vecs, affd, flags);
 		if (msix_vecs > 0)
 			return msix_vecs;
 	}
diff --git a/include/linux/msi.h b/include/linux/msi.h
index d48e919d55ae..8ad679e9d9c0 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -64,6 +64,10 @@ struct ti_sci_inta_msi_desc {
  * @msg:	The last set MSI message cached for reuse
  * @affinity:	Optional pointer to a cpu affinity mask for this descriptor
  *
+ * @write_msi_msg:	Callback that may be called when the MSI message
+ *			address or data changes
+ * @write_msi_msg_data:	Data parameter for the callback.
+ *
  * @masked:	[PCI MSI/X] Mask bits
  * @is_msix:	[PCI MSI/X] True if MSI-X
  * @multiple:	[PCI MSI/X] log2 num of messages allocated
@@ -90,6 +94,9 @@ struct msi_desc {
 	const void			*iommu_cookie;
 #endif
 
+	void (*write_msi_msg)(struct msi_desc *entry, void *data);
+	void *write_msi_msg_data;
+
 	union {
 		/* PCI MSI/X specific data */
 		struct {
@@ -100,6 +107,7 @@ struct msi_desc {
 				u8	multi_cap	: 3;
 				u8	maskbit		: 1;
 				u8	is_64		: 1;
+				u8	is_virtual	: 1;
 				u16	entry_nr;
 				unsigned default_irq;
 			} msi_attrib;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4a5a84d7bdd4..19b5c27c6f63 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1362,6 +1362,15 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode,
 #define PCI_IRQ_MSI		(1 << 1) /* Allow MSI interrupts */
 #define PCI_IRQ_MSIX		(1 << 2) /* Allow MSI-X interrupts */
 #define PCI_IRQ_AFFINITY	(1 << 3) /* Auto-assign affinity */
+
+/*
+ * Virtual interrupts allow for more interrupts to be allocated
+ * than the device has interrupts for. These are not programmed
+ * into the device's MSI-X table and must be handled by some
+ * other driver means.
+ */
+#define PCI_IRQ_VIRTUAL		(1 << 4)
+
 #define PCI_IRQ_ALL_TYPES \
 	(PCI_IRQ_LEGACY | PCI_IRQ_MSI | PCI_IRQ_MSIX)
 
-- 
cgit v1.2.3


From 246a42c51bc5dd247629f86c87d5e1b7628343c4 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Thu, 23 May 2019 16:30:53 -0600
Subject: NTB: Introduce helper functions to calculate logical port number

This patch introduces the "Logical Port Number" which is similar to the
"Port Number" in that it enumerates the ports in the system.

The original (or Physical) "Port Number" can be any number used by the
hardware to uniquely identify a port in the system. The "Logical Port
Number" enumerates all ports in the system from 0 to the number of
ports minus one.

For example a system with 5 ports might have the following port numbers
which would be enumerated thusly:

Port Number:           1  2  5  7  116
Logical Port Number:   0  1  2  3  4

The logical port number is useful when calculating which resources
to use for which peers. So we thus define two helper functions:
ntb_logical_port_number() and ntb_peer_logical_port_number() which
provide the "Logical Port Number" for the local port and any peer
respectively.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Allen Hubbe <allenbh@gmail.com>
Cc: Serge Semin <fancer.lancer@gmail.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 include/linux/ntb.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index 604abc883741..2fadd0352683 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -616,7 +616,6 @@ static inline int ntb_port_number(struct ntb_dev *ntb)
 
 	return ntb->ops->port_number(ntb);
 }
-
 /**
  * ntb_peer_port_count() - get the number of peer device ports
  * @ntb:	NTB device context.
@@ -653,6 +652,58 @@ static inline int ntb_peer_port_number(struct ntb_dev *ntb, int pidx)
 	return ntb->ops->peer_port_number(ntb, pidx);
 }
 
+/**
+ * ntb_logical_port_number() - get the logical port number of the local port
+ * @ntb:	NTB device context.
+ *
+ * The Logical Port Number is defined to be a unique number for each
+ * port starting from zero through to the number of ports minus one.
+ * This is in contrast to the Port Number where each port can be assigned
+ * any unique physical number by the hardware.
+ *
+ * The logical port number is useful for calculating the resource indexes
+ * used by peers.
+ *
+ * Return: the logical port number or negative value indicating an error
+ */
+static inline int ntb_logical_port_number(struct ntb_dev *ntb)
+{
+	int lport = ntb_port_number(ntb);
+	int pidx;
+
+	if (lport < 0)
+		return lport;
+
+	for (pidx = 0; pidx < ntb_peer_port_count(ntb); pidx++)
+		if (lport <= ntb_peer_port_number(ntb, pidx))
+			return pidx;
+
+	return pidx;
+}
+
+/**
+ * ntb_peer_logical_port_number() - get the logical peer port by given index
+ * @ntb:	NTB device context.
+ * @pidx:	Peer port index.
+ *
+ * The Logical Port Number is defined to be a unique number for each
+ * port starting from zero through to the number of ports minus one.
+ * This is in contrast to the Port Number where each port can be assigned
+ * any unique physical number by the hardware.
+ *
+ * The logical port number is useful for calculating the resource indexes
+ * used by peers.
+ *
+ * Return: the peer's logical port number or negative value indicating an error
+ */
+static inline int ntb_peer_logical_port_number(struct ntb_dev *ntb, int pidx)
+{
+	if (ntb_peer_port_number(ntb, pidx) < ntb_port_number(ntb))
+		return pidx;
+	else
+		return pidx + 1;
+}
+
 /**
  * ntb_peer_port_idx() - get the peer device port index by given port number
  * @ntb:	NTB device context.
-- 
cgit v1.2.3


From 5f1b1f065c791de71017502ed3ba46779e231d9b Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Thu, 23 May 2019 16:30:54 -0600
Subject: NTB: Introduce functions to calculate multi-port resource index

When using multi-ports each port uses resources (dbs, msgs, mws, etc)
on every other port. Creating a mapping for these resources such that
each port has a corresponding resource on every other port is a bit
tricky.

Introduce the ntb_peer_resource_idx() function for this purpose.
It returns the peer resource number that will correspond with the
local peer index on the remote peer.

Also, introduce ntb_peer_highest_mw_idx() which will use
ntb_peer_resource_idx() but return the MW index starting with the
highest index and working down.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Allen Hubbe <allenbh@gmail.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 include/linux/ntb.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

(limited to 'include')

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index 2fadd0352683..bed421b9579b 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -1557,4 +1557,74 @@ static inline int ntb_peer_msg_write(struct ntb_dev *ntb, int pidx, int midx,
 	return ntb->ops->peer_msg_write(ntb, pidx, midx, msg);
 }
 
+/**
+ * ntb_peer_resource_idx() - get a resource index for a given peer idx
+ * @ntb:	NTB device context.
+ * @pidx:	Peer port index.
+ *
+ * When constructing a graph of peers, each remote peer must use a different
+ * resource index (mw, doorbell, etc) to communicate with each other
+ * peer.
+ *
+ * In a two peer system, this function should always return 0 such that
+ * resource 0 points to the remote peer on both ports.
+ *
+ * In a 5 peer system, this function will return the following matrix
+ *
+ * pidx \ port    0    1    2    3    4
+ * 0              0    0    1    2    3
+ * 1              0    1    1    2    3
+ * 2              0    1    2    2    3
+ * 3              0    1    2    3    3
+ *
+ * For example, if this function is used to program peer's memory
+ * windows, port 0 will program MW 0 on all it's peers to point to itself.
+ * port 1 will program MW 0 in port 0 to point to itself and MW 1 on all
+ * other ports. etc.
+ *
+ * For the legacy two host case, ntb_port_number() and ntb_peer_port_number()
+ * both return zero and therefore this function will always return zero.
+ * So MW 0 on each host would be programmed to point to the other host.
+ *
+ * Return: the resource index to use for that peer.
+ */
+static inline int ntb_peer_resource_idx(struct ntb_dev *ntb, int pidx)
+{
+	int local_port, peer_port;
+
+	if (pidx >= ntb_peer_port_count(ntb))
+		return -EINVAL;
+
+	local_port = ntb_logical_port_number(ntb);
+	peer_port = ntb_peer_logical_port_number(ntb, pidx);
+
+	if (peer_port < local_port)
+		return local_port - 1;
+	else
+		return local_port;
+}
+
+/**
+ * ntb_peer_highest_mw_idx() - get a memory window index for a given peer idx
+ *	using the highest index memory windows first
+ *
+ * @ntb:	NTB device context.
+ * @pidx:	Peer port index.
+ *
+ * Like ntb_peer_resource_idx(), except it returns indexes starting with
+ * last memory window index.
+ *
+ * Return: the resource index to use for that peer.
+ */
+static inline int ntb_peer_highest_mw_idx(struct ntb_dev *ntb, int pidx)
+{
+	int ret;
+
+	ret = ntb_peer_resource_idx(ntb, pidx);
+	if (ret < 0)
+		return ret;
+
+	return ntb_mw_count(ntb, pidx) - ret - 1;
+}
+
 #endif
-- 
cgit v1.2.3


From 26b3a37b928457ba2cd98eaf6d7b0feca5a30fa6 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Thu, 23 May 2019 16:30:56 -0600
Subject: NTB: Introduce MSI library

The NTB MSI library allows passing MSI interrupts across a memory
window. This offers similar functionality to doorbells or messages
except will often have much better latency and the client can
potentially use significantly more remote interrupts than typical hardware
provides for doorbells. (Which can be important in high-multiport
setups.)

The library utilizes one memory window per peer and uses the highest
index memory windows. Before any ntb_msi function may be used, the user
must call ntb_msi_init(). It may then setup and tear down the memory
windows when the link state changes using ntb_msi_setup_mws() and
ntb_msi_clear_mws().

The peer which receives the interrupt must call ntb_msim_request_irq()
to assign the interrupt handler (this function is functionally
similar to devm_request_irq()) and the returned descriptor must be
transferred to the peer which can use it to trigger the interrupt.
The triggering peer, once having received the descriptor, can
trigger the interrupt by calling ntb_msi_peer_trigger().

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Allen Hubbe <allenbh@gmail.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 drivers/ntb/Kconfig  |  11 ++
 drivers/ntb/Makefile |   3 +-
 drivers/ntb/msi.c    | 415 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/ntb.h  |  73 +++++++++
 4 files changed, 501 insertions(+), 1 deletion(-)
 create mode 100644 drivers/ntb/msi.c

(limited to 'include')

diff --git a/drivers/ntb/Kconfig b/drivers/ntb/Kconfig
index 95944e52fa36..5760764052be 100644
--- a/drivers/ntb/Kconfig
+++ b/drivers/ntb/Kconfig
@@ -12,6 +12,17 @@ menuconfig NTB
 
 if NTB
 
+config NTB_MSI
+	bool "MSI Interrupt Support"
+	depends on PCI_MSI
+	help
+	 Support using MSI interrupt forwarding instead of (or in addition to)
+	 hardware doorbells. MSI interrupts typically offer lower latency
+	 than doorbells and more MSI interrupts can be made available to
+	 clients. However this requires an extra memory window and support
+	 in the hardware driver for creating the MSI interrupts.
+
+	 If unsure, say N.
 source "drivers/ntb/hw/Kconfig"
 
 source "drivers/ntb/test/Kconfig"
diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile
index 537226f8e78d..cc27ad2ef150 100644
--- a/drivers/ntb/Makefile
+++ b/drivers/ntb/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_NTB) += ntb.o hw/ test/
 obj-$(CONFIG_NTB_TRANSPORT) += ntb_transport.o
 
-ntb-y := core.o
+ntb-y			:= core.o
+ntb-$(CONFIG_NTB_MSI)	+= msi.o
diff --git a/drivers/ntb/msi.c b/drivers/ntb/msi.c
new file mode 100644
index 000000000000..9dddf133658f
--- /dev/null
+++ b/drivers/ntb/msi.c
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/ntb.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("0.1");
+MODULE_AUTHOR("Logan Gunthorpe <logang@deltatee.com>");
+MODULE_DESCRIPTION("NTB MSI Interrupt Library");
+
+struct ntb_msi {
+	u64 base_addr;
+	u64 end_addr;
+
+	void (*desc_changed)(void *ctx);
+
+	u32 __iomem *peer_mws[];
+};
+
+/**
+ * ntb_msi_init() - Initialize the MSI context
+ * @ntb:	NTB device context
+ *
+ * This function must be called before any other ntb_msi function.
+ * It initializes the context for MSI operations and maps
+ * the peer memory windows.
+ *
+ * This function reserves the last N outbound memory windows (where N
+ * is the number of peers).
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_init(struct ntb_dev *ntb,
+		 void (*desc_changed)(void *ctx))
+{
+	phys_addr_t mw_phys_addr;
+	resource_size_t mw_size;
+	size_t struct_size;
+	int peer_widx;
+	int peers;
+	int ret;
+	int i;
+
+	peers = ntb_peer_port_count(ntb);
+	if (peers <= 0)
+		return -EINVAL;
+
+	struct_size = sizeof(*ntb->msi) + sizeof(*ntb->msi->peer_mws) * peers;
+
+	ntb->msi = devm_kzalloc(&ntb->dev, struct_size, GFP_KERNEL);
+	if (!ntb->msi)
+		return -ENOMEM;
+
+	ntb->msi->desc_changed = desc_changed;
+
+	for (i = 0; i < peers; i++) {
+		peer_widx = ntb_peer_mw_count(ntb) - 1 - i;
+
+		ret = ntb_peer_mw_get_addr(ntb, peer_widx, &mw_phys_addr,
+					   &mw_size);
+		if (ret)
+			goto unroll;
+
+		ntb->msi->peer_mws[i] = devm_ioremap(&ntb->dev, mw_phys_addr,
+						     mw_size);
+		if (!ntb->msi->peer_mws[i]) {
+			ret = -EFAULT;
+			goto unroll;
+		}
+	}
+
+	return 0;
+
+unroll:
+	for (i = 0; i < peers; i++)
+		if (ntb->msi->peer_mws[i])
+			devm_iounmap(&ntb->dev, ntb->msi->peer_mws[i]);
+
+	devm_kfree(&ntb->dev, ntb->msi);
+	ntb->msi = NULL;
+	return ret;
+}
+EXPORT_SYMBOL(ntb_msi_init);
+
+/**
+ * ntb_msi_setup_mws() - Initialize the MSI inbound memory windows
+ * @ntb:	NTB device context
+ *
+ * This function sets up the required inbound memory windows. It should be
+ * called from a work function after a link up event.
+ *
+ * Over the entire network, this function will reserves the last N
+ * inbound memory windows for each peer (where N is the number of peers).
+ *
+ * ntb_msi_init() must be called before this function.
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_setup_mws(struct ntb_dev *ntb)
+{
+	struct msi_desc *desc;
+	u64 addr;
+	int peer, peer_widx;
+	resource_size_t addr_align, size_align, size_max;
+	resource_size_t mw_size = SZ_32K;
+	resource_size_t mw_min_size = mw_size;
+	int i;
+	int ret;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	desc = first_msi_entry(&ntb->pdev->dev);
+	addr = desc->msg.address_lo + ((uint64_t)desc->msg.address_hi << 32);
+
+	for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, peer);
+		if (peer_widx < 0)
+			return peer_widx;
+
+		ret = ntb_mw_get_align(ntb, peer, peer_widx, &addr_align,
+				       NULL, NULL);
+		if (ret)
+			return ret;
+
+		addr &= ~(addr_align - 1);
+	}
+
+	for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, peer);
+		if (peer_widx < 0) {
+			ret = peer_widx;
+			goto error_out;
+		}
+
+		ret = ntb_mw_get_align(ntb, peer, peer_widx, NULL,
+				       &size_align, &size_max);
+		if (ret)
+			goto error_out;
+
+		mw_size = round_up(mw_size, size_align);
+		mw_size = max(mw_size, size_max);
+		if (mw_size < mw_min_size)
+			mw_min_size = mw_size;
+
+		ret = ntb_mw_set_trans(ntb, peer, peer_widx,
+				       addr, mw_size);
+		if (ret)
+			goto error_out;
+	}
+
+	ntb->msi->base_addr = addr;
+	ntb->msi->end_addr = addr + mw_min_size;
+
+	return 0;
+
+error_out:
+	for (i = 0; i < peer; i++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, peer);
+		if (peer_widx < 0)
+			continue;
+
+		ntb_mw_clear_trans(ntb, i, peer_widx);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ntb_msi_setup_mws);
+
+/**
+ * ntb_msi_clear_mws() - Clear all inbound memory windows
+ * @ntb:	NTB device context
+ *
+ * This function tears down the resources used by ntb_msi_setup_mws().
+ */
+void ntb_msi_clear_mws(struct ntb_dev *ntb)
+{
+	int peer;
+	int peer_widx;
+
+	for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, peer);
+		if (peer_widx < 0)
+			continue;
+
+		ntb_mw_clear_trans(ntb, peer, peer_widx);
+	}
+}
+EXPORT_SYMBOL(ntb_msi_clear_mws);
+
+struct ntb_msi_devres {
+	struct ntb_dev *ntb;
+	struct msi_desc *entry;
+	struct ntb_msi_desc *msi_desc;
+};
+
+static int ntb_msi_set_desc(struct ntb_dev *ntb, struct msi_desc *entry,
+			    struct ntb_msi_desc *msi_desc)
+{
+	u64 addr;
+
+	addr = entry->msg.address_lo +
+		((uint64_t)entry->msg.address_hi << 32);
+
+	if (addr < ntb->msi->base_addr || addr >= ntb->msi->end_addr) {
+		dev_warn_once(&ntb->dev,
+			      "IRQ %d: MSI Address not within the memory window (%llx, [%llx %llx])\n",
+			      entry->irq, addr, ntb->msi->base_addr,
+			      ntb->msi->end_addr);
+		return -EFAULT;
+	}
+
+	msi_desc->addr_offset = addr - ntb->msi->base_addr;
+	msi_desc->data = entry->msg.data;
+
+	return 0;
+}
+
+static void ntb_msi_write_msg(struct msi_desc *entry, void *data)
+{
+	struct ntb_msi_devres *dr = data;
+
+	WARN_ON(ntb_msi_set_desc(dr->ntb, entry, dr->msi_desc));
+
+	if (dr->ntb->msi->desc_changed)
+		dr->ntb->msi->desc_changed(dr->ntb->ctx);
+}
+
+static void ntbm_msi_callback_release(struct device *dev, void *res)
+{
+	struct ntb_msi_devres *dr = res;
+
+	dr->entry->write_msi_msg = NULL;
+	dr->entry->write_msi_msg_data = NULL;
+}
+
+static int ntbm_msi_setup_callback(struct ntb_dev *ntb, struct msi_desc *entry,
+				   struct ntb_msi_desc *msi_desc)
+{
+	struct ntb_msi_devres *dr;
+
+	dr = devres_alloc(ntbm_msi_callback_release,
+			  sizeof(struct ntb_msi_devres), GFP_KERNEL);
+	if (!dr)
+		return -ENOMEM;
+
+	dr->ntb = ntb;
+	dr->entry = entry;
+	dr->msi_desc = msi_desc;
+
+	devres_add(&ntb->dev, dr);
+
+	dr->entry->write_msi_msg = ntb_msi_write_msg;
+	dr->entry->write_msi_msg_data = dr;
+
+	return 0;
+}
+
+/**
+ * ntbm_msi_request_threaded_irq() - allocate an MSI interrupt
+ * @ntb:	NTB device context
+ * @handler:	Function to be called when the IRQ occurs
+ * @thread_fn:  Function to be called in a threaded interrupt context. NULL
+ *              for clients which handle everything in @handler
+ * @devname:    An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id:     A cookie passed back to the handler function
+ *
+ * This function assigns an interrupt handler to an unused
+ * MSI interrupt and returns the descriptor used to trigger
+ * it. The descriptor can then be sent to a peer to trigger
+ * the interrupt.
+ *
+ * The interrupt resource is managed with devres so it will
+ * be automatically freed when the NTB device is torn down.
+ *
+ * If an IRQ allocated with this function needs to be freed
+ * separately, ntbm_free_irq() must be used.
+ *
+ * Return: IRQ number assigned on success, otherwise a negative error number.
+ */
+int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler,
+				  irq_handler_t thread_fn,
+				  const char *name, void *dev_id,
+				  struct ntb_msi_desc *msi_desc)
+{
+	struct msi_desc *entry;
+	struct irq_desc *desc;
+	int ret;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	for_each_pci_msi_entry(entry, ntb->pdev) {
+		desc = irq_to_desc(entry->irq);
+		if (desc->action)
+			continue;
+
+		ret = devm_request_threaded_irq(&ntb->dev, entry->irq, handler,
+						thread_fn, 0, name, dev_id);
+		if (ret)
+			continue;
+
+		if (ntb_msi_set_desc(ntb, entry, msi_desc)) {
+			devm_free_irq(&ntb->dev, entry->irq, dev_id);
+			continue;
+		}
+
+		ret = ntbm_msi_setup_callback(ntb, entry, msi_desc);
+		if (ret) {
+			devm_free_irq(&ntb->dev, entry->irq, dev_id);
+			return ret;
+		}
+
+
+		return entry->irq;
+	}
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL(ntbm_msi_request_threaded_irq);
+
+static int ntbm_msi_callback_match(struct device *dev, void *res, void *data)
+{
+	struct ntb_dev *ntb = dev_ntb(dev);
+	struct ntb_msi_devres *dr = res;
+
+	return dr->ntb == ntb && dr->entry == data;
+}
+
+/**
+ * ntbm_msi_free_irq() - free an interrupt
+ * @ntb:	NTB device context
+ * @irq:	Interrupt line to free
+ * @dev_id:	Device identity to free
+ *
+ * This function should be used to manually free IRQs allocated with
+ * ntbm_request_[threaded_]irq().
+ */
+void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq, void *dev_id)
+{
+	struct msi_desc *entry = irq_get_msi_desc(irq);
+
+	entry->write_msi_msg = NULL;
+	entry->write_msi_msg_data = NULL;
+
+	WARN_ON(devres_destroy(&ntb->dev, ntbm_msi_callback_release,
+			       ntbm_msi_callback_match, entry));
+
+	devm_free_irq(&ntb->dev, irq, dev_id);
+}
+EXPORT_SYMBOL(ntbm_msi_free_irq);
+
+/**
+ * ntb_msi_peer_trigger() - Trigger an interrupt handler on a peer
+ * @ntb:	NTB device context
+ * @peer:	Peer index
+ * @desc:	MSI descriptor data which triggers the interrupt
+ *
+ * This function triggers an interrupt on a peer. It requires
+ * the descriptor structure to have been passed from that peer
+ * by some other means.
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer,
+			 struct ntb_msi_desc *desc)
+{
+	int idx;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	idx = desc->addr_offset / sizeof(*ntb->msi->peer_mws[peer]);
+
+	iowrite32(desc->data, &ntb->msi->peer_mws[peer][idx]);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_msi_peer_trigger);
+
+/**
+ * ntb_msi_peer_addr() - Get the DMA address to trigger a peer's MSI interrupt
+ * @ntb:	NTB device context
+ * @peer:	Peer index
+ * @desc:	MSI descriptor data which triggers the interrupt
+ * @msi_addr:   Physical address to trigger the interrupt
+ *
+ * This function allows using DMA engines to trigger an interrupt
+ * (for example, trigger an interrupt to process the data after
+ * sending it). To trigger the interrupt, write @desc.data to the address
+ * returned in @msi_addr
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer,
+		      struct ntb_msi_desc *desc,
+		      phys_addr_t *msi_addr)
+{
+	int peer_widx = ntb_peer_mw_count(ntb) - 1 - peer;
+	phys_addr_t mw_phys_addr;
+	int ret;
+
+	ret = ntb_peer_mw_get_addr(ntb, peer_widx, &mw_phys_addr, NULL);
+	if (ret)
+		return ret;
+
+	if (msi_addr)
+		*msi_addr = mw_phys_addr + desc->addr_offset;
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_msi_peer_addr);
diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index bed421b9579b..8c13538aeffe 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -58,9 +58,11 @@
 
 #include <linux/completion.h>
 #include <linux/device.h>
+#include <linux/interrupt.h>
 
 struct ntb_client;
 struct ntb_dev;
+struct ntb_msi;
 struct pci_dev;
 
 /**
@@ -426,6 +428,10 @@ struct ntb_dev {
 	spinlock_t			ctx_lock;
 	/* block unregister until device is fully released */
 	struct completion		released;
+
+#ifdef CONFIG_NTB_MSI
+	struct ntb_msi *msi;
+#endif
 };
 #define dev_ntb(__dev) container_of((__dev), struct ntb_dev, dev)
 
@@ -1627,4 +1633,71 @@ static inline int ntb_peer_highest_mw_idx(struct ntb_dev *ntb, int pidx)
 	return ntb_mw_count(ntb, pidx) - ret - 1;
 }
 
+struct ntb_msi_desc {
+	u32 addr_offset;
+	u32 data;
+};
+
+#ifdef CONFIG_NTB_MSI
+
+int ntb_msi_init(struct ntb_dev *ntb, void (*desc_changed)(void *ctx));
+int ntb_msi_setup_mws(struct ntb_dev *ntb);
+void ntb_msi_clear_mws(struct ntb_dev *ntb);
+int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler,
+				  irq_handler_t thread_fn,
+				  const char *name, void *dev_id,
+				  struct ntb_msi_desc *msi_desc);
+void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq, void *dev_id);
+int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer,
+			 struct ntb_msi_desc *desc);
+int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer,
+		      struct ntb_msi_desc *desc,
+		      phys_addr_t *msi_addr);
+
+#else /* not CONFIG_NTB_MSI */
+
+static inline int ntb_msi_init(struct ntb_dev *ntb,
+			       void (*desc_changed)(void *ctx))
+{
+	return -EOPNOTSUPP;
+}
+static inline int ntb_msi_setup_mws(struct ntb_dev *ntb)
+{
+	return -EOPNOTSUPP;
+}
+static inline void ntb_msi_clear_mws(struct ntb_dev *ntb) {}
+static inline int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb,
+						irq_handler_t handler,
+						irq_handler_t thread_fn,
+						const char *name, void *dev_id,
+						struct ntb_msi_desc *msi_desc)
+{
+	return -EOPNOTSUPP;
+}
+static inline void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq,
+				     void *dev_id) {}
+static inline int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer,
+				       struct ntb_msi_desc *desc)
+{
+	return -EOPNOTSUPP;
+}
+static inline int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer,
+				    struct ntb_msi_desc *desc,
+				    phys_addr_t *msi_addr)
+{
+	return -EOPNOTSUPP;
+
+}
+
+#endif /* CONFIG_NTB_MSI */
+
+static inline int ntbm_msi_request_irq(struct ntb_dev *ntb,
+				       irq_handler_t handler,
+				       const char *name, void *dev_id,
+				       struct ntb_msi_desc *msi_desc)
+{
+	return ntbm_msi_request_threaded_irq(ntb, handler, NULL, name,
+					     dev_id, msi_desc);
+}
+
 #endif
-- 
cgit v1.2.3


From 43c76d72ea853ccaeb22a497c13d3cd946869ec6 Mon Sep 17 00:00:00 2001
From: Sean Paul <seanpaul@chromium.org>
Date: Tue, 11 Jun 2019 16:49:53 -0400
Subject: drm: Add atomic variants of enable/disable to encoder helper funcs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds atomic_enable and atomic_disable callbacks to the
encoder helpers. This will allow encoders to make informed decisions in
their start-up/shutdown based on the committed state.

Aside from the new hooks, this patch also introduces the new signature
for .atomic_* functions going forward. Instead of passing object state
(well, encoders don't have atomic state, but let's ignore that), we pass
the entire atomic state so the driver can inspect more than what's
happening locally.

This is particularly important for the upcoming self refresh helpers.

Changes in v3:
- Added patch to the set
Changes in v4:
- Move atomic_disable above prepare (Daniel)
- Add breadcrumb to .enable() docbook (Daniel)
Changes in v5:
- None
Changes in v6:
- Tweak kerneldoc some more (Sam)

Link to v3: https://patchwork.freedesktop.org/patch/msgid/20190502194956.218441-2-sean@poorly.run
Link to v4: https://patchwork.freedesktop.org/patch/msgid/20190508160920.144739-2-sean@poorly.run
Link to v5: https://patchwork.freedesktop.org/patch/msgid/20190611160844.257498-2-sean@poorly.run

Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Sean Paul <seanpaul@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190611204959.180855-1-sean@poorly.run
---
 drivers/gpu/drm/drm_atomic_helper.c      |  8 ++++--
 include/drm/drm_modeset_helper_vtables.h | 49 ++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index 0fc63d682245..62e29b5ebb6e 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -1002,7 +1002,9 @@ disable_outputs(struct drm_device *dev, struct drm_atomic_state *old_state)
 
 		/* Right function depends upon target state. */
 		if (funcs) {
-			if (new_conn_state->crtc && funcs->prepare)
+			if (funcs->atomic_disable)
+				funcs->atomic_disable(encoder, old_state);
+			else if (new_conn_state->crtc && funcs->prepare)
 				funcs->prepare(encoder);
 			else if (funcs->disable)
 				funcs->disable(encoder);
@@ -1311,7 +1313,9 @@ void drm_atomic_helper_commit_modeset_enables(struct drm_device *dev,
 		drm_bridge_pre_enable(encoder->bridge);
 
 		if (funcs) {
-			if (funcs->enable)
+			if (funcs->atomic_enable)
+				funcs->atomic_enable(encoder, old_state);
+			else if (funcs->enable)
 				funcs->enable(encoder);
 			else if (funcs->commit)
 				funcs->commit(encoder);
diff --git a/include/drm/drm_modeset_helper_vtables.h b/include/drm/drm_modeset_helper_vtables.h
index df80131bb10f..13f0e3ca0a29 100644
--- a/include/drm/drm_modeset_helper_vtables.h
+++ b/include/drm/drm_modeset_helper_vtables.h
@@ -679,6 +679,52 @@ struct drm_encoder_helper_funcs {
 	enum drm_connector_status (*detect)(struct drm_encoder *encoder,
 					    struct drm_connector *connector);
 
+	/**
+	 * @atomic_disable:
+	 *
+	 * This callback should be used to disable the encoder. With the atomic
+	 * drivers it is called before this encoder's CRTC has been shut off
+	 * using their own &drm_crtc_helper_funcs.atomic_disable hook. If that
+	 * sequence is too simple drivers can just add their own driver private
+	 * encoder hooks and call them from CRTC's callback by looping over all
+	 * encoders connected to it using for_each_encoder_on_crtc().
+	 *
+	 * This callback is a variant of @disable that provides the atomic state
+	 * to the driver. If @atomic_disable is implemented, @disable is not
+	 * called by the helpers.
+	 *
+	 * This hook is only used by atomic helpers. Atomic drivers don't need
+	 * to implement it if there's no need to disable anything at the encoder
+	 * level. To ensure that runtime PM handling (using either DPMS or the
+	 * new "ACTIVE" property) works @atomic_disable must be the inverse of
+	 * @atomic_enable.
+	 */
+	void (*atomic_disable)(struct drm_encoder *encoder,
+			       struct drm_atomic_state *state);
+
+	/**
+	 * @atomic_enable:
+	 *
+	 * This callback should be used to enable the encoder. It is called
+	 * after this encoder's CRTC has been enabled using their own
+	 * &drm_crtc_helper_funcs.atomic_enable hook. If that sequence is
+	 * too simple drivers can just add their own driver private encoder
+	 * hooks and call them from CRTC's callback by looping over all encoders
+	 * connected to it using for_each_encoder_on_crtc().
+	 *
+	 * This callback is a variant of @enable that provides the atomic state
+	 * to the driver. If @atomic_enable is implemented, @enable is not
+	 * called by the helpers.
+	 *
+	 * This hook is only used by atomic helpers, it is the opposite of
+	 * @atomic_disable. Atomic drivers don't need to implement it if there's
+	 * no need to enable anything at the encoder level. To ensure that
+	 * runtime PM handling works @atomic_enable must be the inverse of
+	 * @atomic_disable.
+	 */
+	void (*atomic_enable)(struct drm_encoder *encoder,
+			      struct drm_atomic_state *state);
+
 	/**
 	 * @disable:
 	 *
@@ -695,6 +741,9 @@ struct drm_encoder_helper_funcs {
 	 * handling (using either DPMS or the new "ACTIVE" property) works
 	 * @disable must be the inverse of @enable for atomic drivers.
 	 *
+	 * For atomic drivers also consider @atomic_disable and save yourself
+	 * from having to read the NOTE below!
+	 *
 	 * NOTE:
 	 *
 	 * With legacy CRTC helpers there's a big semantic difference between
-- 
cgit v1.2.3


From 1b27fbdde1df172dba604855c45078d741f8c858 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Tue, 11 Jun 2019 16:51:43 -0400
Subject: drm: Add drm_atomic_get_(old|new)_connector_for_encoder() helpers

Add functions to the atomic core to retrieve the old and new connectors
associated with an encoder in a drm_atomic_state. This is useful for
encoders and bridges that need to access the connector, for instance for
the drm_display_info.

The CRTC associated with the encoder can also be retrieved through the
connector state, and from it, the old and new CRTC states.

Changed in v4:
- Added to the set
Changed in v5:
- Fix up docbook (Daniel & Laurent)
Changed in v6:
- Updated commit subject (Sam)

Link to v4: https://patchwork.freedesktop.org/patch/msgid/20190508160920.144739-3-sean@poorly.run
Link to v5: https://patchwork.freedesktop.org/patch/msgid/20190611160844.257498-3-sean@poorly.run

Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Sam Ravnborg <sam@ravnborg.org>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>

[seanpaul removed WARNs from helpers and added docs to explain why
returning NULL might be valid]
Signed-off-by: Sean Paul <seanpaul@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190611205147.181298-1-sean@poorly.run
---
 drivers/gpu/drm/drm_atomic.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
 include/drm/drm_atomic.h     |  7 +++++
 include/drm/drm_connector.h  |  5 ++++
 3 files changed, 81 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c
index 169f06d7c5ce..a4e779deab0f 100644
--- a/drivers/gpu/drm/drm_atomic.c
+++ b/drivers/gpu/drm/drm_atomic.c
@@ -846,6 +846,75 @@ drm_atomic_get_new_private_obj_state(struct drm_atomic_state *state,
 }
 EXPORT_SYMBOL(drm_atomic_get_new_private_obj_state);
 
+/**
+ * drm_atomic_get_old_connector_for_encoder - Get old connector for an encoder
+ * @state: Atomic state
+ * @encoder: The encoder to fetch the connector state for
+ *
+ * This function finds and returns the connector that was connected to @encoder
+ * as specified by the @state.
+ *
+ * If there is no connector in @state which previously had @encoder connected to
+ * it, this function will return NULL. While this may seem like an invalid use
+ * case, it is sometimes useful to differentiate commits which had no prior
+ * connectors attached to @encoder vs ones that did (and to inspect their
+ * state). This is especially true in enable hooks because the pipeline has
+ * changed.
+ *
+ * Returns: The old connector connected to @encoder, or NULL if the encoder is
+ * not connected.
+ */
+struct drm_connector *
+drm_atomic_get_old_connector_for_encoder(struct drm_atomic_state *state,
+					 struct drm_encoder *encoder)
+{
+	struct drm_connector_state *conn_state;
+	struct drm_connector *connector;
+	unsigned int i;
+
+	for_each_old_connector_in_state(state, connector, conn_state, i) {
+		if (conn_state->best_encoder == encoder)
+			return connector;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(drm_atomic_get_old_connector_for_encoder);
+
+/**
+ * drm_atomic_get_new_connector_for_encoder - Get new connector for an encoder
+ * @state: Atomic state
+ * @encoder: The encoder to fetch the connector state for
+ *
+ * This function finds and returns the connector that will be connected to
+ * @encoder as specified by the @state.
+ *
+ * If there is no connector in @state which will have @encoder connected to it,
+ * this function will return NULL. While this may seem like an invalid use case,
+ * it is sometimes useful to differentiate commits which have no connectors
+ * attached to @encoder vs ones that do (and to inspect their state). This is
+ * especially true in disable hooks because the pipeline will change.
+ *
+ * Returns: The new connector connected to @encoder, or NULL if the encoder is
+ * not connected.
+ */
+struct drm_connector *
+drm_atomic_get_new_connector_for_encoder(struct drm_atomic_state *state,
+					 struct drm_encoder *encoder)
+{
+	struct drm_connector_state *conn_state;
+	struct drm_connector *connector;
+	unsigned int i;
+
+	for_each_new_connector_in_state(state, connector, conn_state, i) {
+		if (conn_state->best_encoder == encoder)
+			return connector;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(drm_atomic_get_new_connector_for_encoder);
+
 /**
  * drm_atomic_get_connector_state - get connector state
  * @state: global atomic state object
diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h
index e937ff2beb04..f12215647801 100644
--- a/include/drm/drm_atomic.h
+++ b/include/drm/drm_atomic.h
@@ -459,6 +459,13 @@ struct drm_private_state *
 drm_atomic_get_new_private_obj_state(struct drm_atomic_state *state,
 				     struct drm_private_obj *obj);
 
+struct drm_connector *
+drm_atomic_get_old_connector_for_encoder(struct drm_atomic_state *state,
+					 struct drm_encoder *encoder);
+struct drm_connector *
+drm_atomic_get_new_connector_for_encoder(struct drm_atomic_state *state,
+					 struct drm_encoder *encoder);
+
 /**
  * drm_atomic_get_existing_crtc_state - get crtc state, if it exists
  * @state: global atomic state object
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index 47e749b74e5f..071143bc0ebd 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -518,6 +518,11 @@ struct drm_connector_state {
 	 * &drm_connector_helper_funcs.atomic_best_encoder or
 	 * &drm_connector_helper_funcs.best_encoder callbacks.
 	 *
+	 * This is also used in the atomic helpers to map encoders to their
+	 * current and previous connectors, see
+	 * &drm_atomic_get_old_connector_for_encoder() and
+	 * &drm_atomic_get_new_connector_for_encoder().
+	 *
 	 * NOTE: Atomic drivers must fill this out (either themselves or through
 	 * helpers), for otherwise the GETCONNECTOR and GETENCODER IOCTLs will
 	 * not return correct data to userspace.
-- 
cgit v1.2.3


From 5ade071ba13e3bb24e3a9d5f8d6a3cf145deeb18 Mon Sep 17 00:00:00 2001
From: Sean Paul <seanpaul@chromium.org>
Date: Tue, 11 Jun 2019 12:08:17 -0400
Subject: drm: Add atomic variants for bridge enable/disable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds atomic variants for all of
pre_enable/enable/disable/post_disable bridge functions. These will be
called from the appropriate atomic helper functions. If the bridge
driver doesn't implement the atomic version of the function, we will
fall back to the vanilla implementation.

Note that some drivers call drm_bridge_disable directly, and these cases
are not covered. It's up to the driver to decide whether to implement
both atomic_disable and disable, or if it's not necessary.

Changes in v3:
- Added to the patchset
Changes in v4:
- Fix up docbook references (Daniel)
Changes in v5:
- None

Link to v3: https://patchwork.freedesktop.org/patch/msgid/20190502194956.218441-4-sean@poorly.run
Link to v4: https://patchwork.freedesktop.org/patch/msgid/20190508160920.144739-4-sean@poorly.run

Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Andrzej Hajda <a.hajda@samsung.com>
Signed-off-by: Sean Paul <seanpaul@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190611160844.257498-4-sean@poorly.run
---
 drivers/gpu/drm/drm_atomic_helper.c |   8 +--
 drivers/gpu/drm/drm_bridge.c        | 110 ++++++++++++++++++++++++++++++++++++
 include/drm/drm_bridge.h            | 106 ++++++++++++++++++++++++++++++++++
 3 files changed, 220 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index 62e29b5ebb6e..2133f6253917 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -998,7 +998,7 @@ disable_outputs(struct drm_device *dev, struct drm_atomic_state *old_state)
 		 * Each encoder has at most one connector (since we always steal
 		 * it away), so we won't call disable hooks twice.
 		 */
-		drm_bridge_disable(encoder->bridge);
+		drm_atomic_bridge_disable(encoder->bridge, old_state);
 
 		/* Right function depends upon target state. */
 		if (funcs) {
@@ -1012,7 +1012,7 @@ disable_outputs(struct drm_device *dev, struct drm_atomic_state *old_state)
 				funcs->dpms(encoder, DRM_MODE_DPMS_OFF);
 		}
 
-		drm_bridge_post_disable(encoder->bridge);
+		drm_atomic_bridge_post_disable(encoder->bridge, old_state);
 	}
 
 	for_each_oldnew_crtc_in_state(old_state, crtc, old_crtc_state, new_crtc_state, i) {
@@ -1310,7 +1310,7 @@ void drm_atomic_helper_commit_modeset_enables(struct drm_device *dev,
 		 * Each encoder has at most one connector (since we always steal
 		 * it away), so we won't call enable hooks twice.
 		 */
-		drm_bridge_pre_enable(encoder->bridge);
+		drm_atomic_bridge_pre_enable(encoder->bridge, old_state);
 
 		if (funcs) {
 			if (funcs->atomic_enable)
@@ -1321,7 +1321,7 @@ void drm_atomic_helper_commit_modeset_enables(struct drm_device *dev,
 				funcs->commit(encoder);
 		}
 
-		drm_bridge_enable(encoder->bridge);
+		drm_atomic_bridge_enable(encoder->bridge, old_state);
 	}
 
 	drm_atomic_helper_commit_writebacks(dev, old_state);
diff --git a/drivers/gpu/drm/drm_bridge.c b/drivers/gpu/drm/drm_bridge.c
index 138b2711d389..cba537c99e43 100644
--- a/drivers/gpu/drm/drm_bridge.c
+++ b/drivers/gpu/drm/drm_bridge.c
@@ -352,6 +352,116 @@ void drm_bridge_enable(struct drm_bridge *bridge)
 }
 EXPORT_SYMBOL(drm_bridge_enable);
 
+/**
+ * drm_atomic_bridge_disable - disables all bridges in the encoder chain
+ * @bridge: bridge control structure
+ * @state: atomic state being committed
+ *
+ * Calls &drm_bridge_funcs.atomic_disable (falls back on
+ * &drm_bridge_funcs.disable) op for all the bridges in the encoder chain,
+ * starting from the last bridge to the first. These are called before calling
+ * &drm_encoder_helper_funcs.atomic_disable
+ *
+ * Note: the bridge passed should be the one closest to the encoder
+ */
+void drm_atomic_bridge_disable(struct drm_bridge *bridge,
+			       struct drm_atomic_state *state)
+{
+	if (!bridge)
+		return;
+
+	drm_atomic_bridge_disable(bridge->next, state);
+
+	if (bridge->funcs->atomic_disable)
+		bridge->funcs->atomic_disable(bridge, state);
+	else if (bridge->funcs->disable)
+		bridge->funcs->disable(bridge);
+}
+EXPORT_SYMBOL(drm_atomic_bridge_disable);
+
+/**
+ * drm_atomic_bridge_post_disable - cleans up after disabling all bridges in the
+ *				    encoder chain
+ * @bridge: bridge control structure
+ * @state: atomic state being committed
+ *
+ * Calls &drm_bridge_funcs.atomic_post_disable (falls back on
+ * &drm_bridge_funcs.post_disable) op for all the bridges in the encoder chain,
+ * starting from the first bridge to the last. These are called after completing
+ * &drm_encoder_helper_funcs.atomic_disable
+ *
+ * Note: the bridge passed should be the one closest to the encoder
+ */
+void drm_atomic_bridge_post_disable(struct drm_bridge *bridge,
+				    struct drm_atomic_state *state)
+{
+	if (!bridge)
+		return;
+
+	if (bridge->funcs->atomic_post_disable)
+		bridge->funcs->atomic_post_disable(bridge, state);
+	else if (bridge->funcs->post_disable)
+		bridge->funcs->post_disable(bridge);
+
+	drm_atomic_bridge_post_disable(bridge->next, state);
+}
+EXPORT_SYMBOL(drm_atomic_bridge_post_disable);
+
+/**
+ * drm_atomic_bridge_pre_enable - prepares for enabling all bridges in the
+ *				  encoder chain
+ * @bridge: bridge control structure
+ * @state: atomic state being committed
+ *
+ * Calls &drm_bridge_funcs.atomic_pre_enable (falls back on
+ * &drm_bridge_funcs.pre_enable) op for all the bridges in the encoder chain,
+ * starting from the last bridge to the first. These are called before calling
+ * &drm_encoder_helper_funcs.atomic_enable
+ *
+ * Note: the bridge passed should be the one closest to the encoder
+ */
+void drm_atomic_bridge_pre_enable(struct drm_bridge *bridge,
+				  struct drm_atomic_state *state)
+{
+	if (!bridge)
+		return;
+
+	drm_atomic_bridge_pre_enable(bridge->next, state);
+
+	if (bridge->funcs->atomic_pre_enable)
+		bridge->funcs->atomic_pre_enable(bridge, state);
+	else if (bridge->funcs->pre_enable)
+		bridge->funcs->pre_enable(bridge);
+}
+EXPORT_SYMBOL(drm_atomic_bridge_pre_enable);
+
+/**
+ * drm_atomic_bridge_enable - enables all bridges in the encoder chain
+ * @bridge: bridge control structure
+ * @state: atomic state being committed
+ *
+ * Calls &drm_bridge_funcs.atomic_enable (falls back on
+ * &drm_bridge_funcs.enable) op for all the bridges in the encoder chain,
+ * starting from the first bridge to the last. These are called after completing
+ * &drm_encoder_helper_funcs.atomic_enable
+ *
+ * Note: the bridge passed should be the one closest to the encoder
+ */
+void drm_atomic_bridge_enable(struct drm_bridge *bridge,
+			      struct drm_atomic_state *state)
+{
+	if (!bridge)
+		return;
+
+	if (bridge->funcs->atomic_enable)
+		bridge->funcs->atomic_enable(bridge, state);
+	else if (bridge->funcs->enable)
+		bridge->funcs->enable(bridge);
+
+	drm_atomic_bridge_enable(bridge->next, state);
+}
+EXPORT_SYMBOL(drm_atomic_bridge_enable);
+
 #ifdef CONFIG_OF
 /**
  * of_drm_find_bridge - find the bridge corresponding to the device node in
diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h
index d4428913a4e1..322801884814 100644
--- a/include/drm/drm_bridge.h
+++ b/include/drm/drm_bridge.h
@@ -237,6 +237,103 @@ struct drm_bridge_funcs {
 	 * The enable callback is optional.
 	 */
 	void (*enable)(struct drm_bridge *bridge);
+
+	/**
+	 * @atomic_pre_enable:
+	 *
+	 * This callback should enable the bridge. It is called right before
+	 * the preceding element in the display pipe is enabled. If the
+	 * preceding element is a bridge this means it's called before that
+	 * bridge's @atomic_pre_enable or @pre_enable function. If the preceding
+	 * element is a &drm_encoder it's called right before the encoder's
+	 * &drm_encoder_helper_funcs.atomic_enable hook.
+	 *
+	 * The display pipe (i.e. clocks and timing signals) feeding this bridge
+	 * will not yet be running when this callback is called. The bridge must
+	 * not enable the display link feeding the next bridge in the chain (if
+	 * there is one) when this callback is called.
+	 *
+	 * Note that this function will only be invoked in the context of an
+	 * atomic commit. It will not be invoked from &drm_bridge_pre_enable. It
+	 * would be prudent to also provide an implementation of @pre_enable if
+	 * you are expecting driver calls into &drm_bridge_pre_enable.
+	 *
+	 * The @atomic_pre_enable callback is optional.
+	 */
+	void (*atomic_pre_enable)(struct drm_bridge *bridge,
+				  struct drm_atomic_state *state);
+
+	/**
+	 * @atomic_enable:
+	 *
+	 * This callback should enable the bridge. It is called right after
+	 * the preceding element in the display pipe is enabled. If the
+	 * preceding element is a bridge this means it's called after that
+	 * bridge's @atomic_enable or @enable function. If the preceding element
+	 * is a &drm_encoder it's called right after the encoder's
+	 * &drm_encoder_helper_funcs.atomic_enable hook.
+	 *
+	 * The bridge can assume that the display pipe (i.e. clocks and timing
+	 * signals) feeding it is running when this callback is called. This
+	 * callback must enable the display link feeding the next bridge in the
+	 * chain if there is one.
+	 *
+	 * Note that this function will only be invoked in the context of an
+	 * atomic commit. It will not be invoked from &drm_bridge_enable. It
+	 * would be prudent to also provide an implementation of @enable if
+	 * you are expecting driver calls into &drm_bridge_enable.
+	 *
+	 * The enable callback is optional.
+	 */
+	void (*atomic_enable)(struct drm_bridge *bridge,
+			      struct drm_atomic_state *state);
+	/**
+	 * @atomic_disable:
+	 *
+	 * This callback should disable the bridge. It is called right before
+	 * the preceding element in the display pipe is disabled. If the
+	 * preceding element is a bridge this means it's called before that
+	 * bridge's @atomic_disable or @disable vfunc. If the preceding element
+	 * is a &drm_encoder it's called right before the
+	 * &drm_encoder_helper_funcs.atomic_disable hook.
+	 *
+	 * The bridge can assume that the display pipe (i.e. clocks and timing
+	 * signals) feeding it is still running when this callback is called.
+	 *
+	 * Note that this function will only be invoked in the context of an
+	 * atomic commit. It will not be invoked from &drm_bridge_disable. It
+	 * would be prudent to also provide an implementation of @disable if
+	 * you are expecting driver calls into &drm_bridge_disable.
+	 *
+	 * The disable callback is optional.
+	 */
+	void (*atomic_disable)(struct drm_bridge *bridge,
+			       struct drm_atomic_state *state);
+
+	/**
+	 * @atomic_post_disable:
+	 *
+	 * This callback should disable the bridge. It is called right after the
+	 * preceding element in the display pipe is disabled. If the preceding
+	 * element is a bridge this means it's called after that bridge's
+	 * @atomic_post_disable or @post_disable function. If the preceding
+	 * element is a &drm_encoder it's called right after the encoder's
+	 * &drm_encoder_helper_funcs.atomic_disable hook.
+	 *
+	 * The bridge must assume that the display pipe (i.e. clocks and timing
+	 * signals) feeding it is no longer running when this callback is
+	 * called.
+	 *
+	 * Note that this function will only be invoked in the context of an
+	 * atomic commit. It will not be invoked from &drm_bridge_post_disable.
+	 * It would be prudent to also provide an implementation of
+	 * @post_disable if you are expecting driver calls into
+	 * &drm_bridge_post_disable.
+	 *
+	 * The post_disable callback is optional.
+	 */
+	void (*atomic_post_disable)(struct drm_bridge *bridge,
+				    struct drm_atomic_state *state);
 };
 
 /**
@@ -314,6 +411,15 @@ void drm_bridge_mode_set(struct drm_bridge *bridge,
 void drm_bridge_pre_enable(struct drm_bridge *bridge);
 void drm_bridge_enable(struct drm_bridge *bridge);
 
+void drm_atomic_bridge_disable(struct drm_bridge *bridge,
+			       struct drm_atomic_state *state);
+void drm_atomic_bridge_post_disable(struct drm_bridge *bridge,
+				    struct drm_atomic_state *state);
+void drm_atomic_bridge_pre_enable(struct drm_bridge *bridge,
+				  struct drm_atomic_state *state);
+void drm_atomic_bridge_enable(struct drm_bridge *bridge,
+			      struct drm_atomic_state *state);
+
 #ifdef CONFIG_DRM_PANEL_BRIDGE
 struct drm_bridge *drm_panel_bridge_add(struct drm_panel *panel,
 					u32 connector_type);
-- 
cgit v1.2.3


From 6f3b62781bbd2670756a4847113d410a827a2593 Mon Sep 17 00:00:00 2001
From: Sean Paul <seanpaul@chromium.org>
Date: Tue, 11 Jun 2019 12:08:18 -0400
Subject: drm: Convert connector_helper_funcs->atomic_check to accept
 drm_atomic_state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Everyone who implements connector_helper_funcs->atomic_check reaches
into the connector state to get the atomic state. Instead of continuing
this pattern, change the callback signature to just give atomic state
and let the driver determine what it does and does not need from it.

Eventually all atomic functions should do this, but that's just too much
busy work for me.

Changes in v3:
- Added to the set
Changes in v4:
- None
Changes in v5:
- intel_digital_connector_atomic_check declaration moved to i915_atomic.h

Link to v3: https://patchwork.freedesktop.org/patch/msgid/20190502194956.218441-5-sean@poorly.run
Link to v4: https://patchwork.freedesktop.org/patch/msgid/20190508160920.144739-5-sean@poorly.run

Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Kieran Bingham <kieran.bingham+renesas@ideasonboard.com>
Cc: Eric Anholt <eric@anholt.net>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com> [for rcar lvds]
Acked-by: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Sean Paul <seanpaul@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190611160844.257498-5-sean@poorly.run
---
 drivers/gpu/drm/drm_atomic_helper.c      |  4 ++--
 drivers/gpu/drm/i915/intel_atomic.c      |  8 +++++---
 drivers/gpu/drm/i915/intel_atomic.h      |  2 +-
 drivers/gpu/drm/i915/intel_dp_mst.c      |  7 ++++---
 drivers/gpu/drm/i915/intel_sdvo.c        |  9 +++++----
 drivers/gpu/drm/i915/intel_tv.c          |  8 +++++---
 drivers/gpu/drm/nouveau/dispnv50/disp.c  |  5 +++--
 drivers/gpu/drm/rcar-du/rcar_lvds.c      | 12 +++++++-----
 drivers/gpu/drm/vc4/vc4_txp.c            |  7 ++++---
 include/drm/drm_modeset_helper_vtables.h |  2 +-
 10 files changed, 37 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index 2133f6253917..e58be6996069 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -686,7 +686,7 @@ drm_atomic_helper_check_modeset(struct drm_device *dev,
 		}
 
 		if (funcs->atomic_check)
-			ret = funcs->atomic_check(connector, new_connector_state);
+			ret = funcs->atomic_check(connector, state);
 		if (ret)
 			return ret;
 
@@ -728,7 +728,7 @@ drm_atomic_helper_check_modeset(struct drm_device *dev,
 			continue;
 
 		if (funcs->atomic_check)
-			ret = funcs->atomic_check(connector, new_connector_state);
+			ret = funcs->atomic_check(connector, state);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/gpu/drm/i915/intel_atomic.c b/drivers/gpu/drm/i915/intel_atomic.c
index 58b8049649a0..ab40448a19d5 100644
--- a/drivers/gpu/drm/i915/intel_atomic.c
+++ b/drivers/gpu/drm/i915/intel_atomic.c
@@ -106,12 +106,14 @@ int intel_digital_connector_atomic_set_property(struct drm_connector *connector,
 }
 
 int intel_digital_connector_atomic_check(struct drm_connector *conn,
-					 struct drm_connector_state *new_state)
+					 struct drm_atomic_state *state)
 {
+	struct drm_connector_state *new_state =
+		drm_atomic_get_new_connector_state(state, conn);
 	struct intel_digital_connector_state *new_conn_state =
 		to_intel_digital_connector_state(new_state);
 	struct drm_connector_state *old_state =
-		drm_atomic_get_old_connector_state(new_state->state, conn);
+		drm_atomic_get_old_connector_state(state, conn);
 	struct intel_digital_connector_state *old_conn_state =
 		to_intel_digital_connector_state(old_state);
 	struct drm_crtc_state *crtc_state;
@@ -121,7 +123,7 @@ int intel_digital_connector_atomic_check(struct drm_connector *conn,
 	if (!new_state->crtc)
 		return 0;
 
-	crtc_state = drm_atomic_get_new_crtc_state(new_state->state, new_state->crtc);
+	crtc_state = drm_atomic_get_new_crtc_state(state, new_state->crtc);
 
 	/*
 	 * These properties are handled by fastset, and might not end
diff --git a/drivers/gpu/drm/i915/intel_atomic.h b/drivers/gpu/drm/i915/intel_atomic.h
index 1c8507da1a69..58065d3161a3 100644
--- a/drivers/gpu/drm/i915/intel_atomic.h
+++ b/drivers/gpu/drm/i915/intel_atomic.h
@@ -28,7 +28,7 @@ int intel_digital_connector_atomic_set_property(struct drm_connector *connector,
 						struct drm_property *property,
 						u64 val);
 int intel_digital_connector_atomic_check(struct drm_connector *conn,
-					 struct drm_connector_state *new_state);
+					 struct drm_atomic_state *state);
 struct drm_connector_state *
 intel_digital_connector_duplicate_state(struct drm_connector *connector);
 
diff --git a/drivers/gpu/drm/i915/intel_dp_mst.c b/drivers/gpu/drm/i915/intel_dp_mst.c
index 0caf645fbbb8..60652ebbdf61 100644
--- a/drivers/gpu/drm/i915/intel_dp_mst.c
+++ b/drivers/gpu/drm/i915/intel_dp_mst.c
@@ -151,9 +151,10 @@ static int intel_dp_mst_compute_config(struct intel_encoder *encoder,
 
 static int
 intel_dp_mst_atomic_check(struct drm_connector *connector,
-			  struct drm_connector_state *new_conn_state)
+			  struct drm_atomic_state *state)
 {
-	struct drm_atomic_state *state = new_conn_state->state;
+	struct drm_connector_state *new_conn_state =
+		drm_atomic_get_new_connector_state(state, connector);
 	struct drm_connector_state *old_conn_state =
 		drm_atomic_get_old_connector_state(state, connector);
 	struct intel_connector *intel_connector =
@@ -163,7 +164,7 @@ intel_dp_mst_atomic_check(struct drm_connector *connector,
 	struct drm_dp_mst_topology_mgr *mgr;
 	int ret;
 
-	ret = intel_digital_connector_atomic_check(connector, new_conn_state);
+	ret = intel_digital_connector_atomic_check(connector, state);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c
index 6c98b35790db..368b99f6f7fb 100644
--- a/drivers/gpu/drm/i915/intel_sdvo.c
+++ b/drivers/gpu/drm/i915/intel_sdvo.c
@@ -2353,9 +2353,10 @@ static const struct drm_connector_funcs intel_sdvo_connector_funcs = {
 };
 
 static int intel_sdvo_atomic_check(struct drm_connector *conn,
-				   struct drm_connector_state *new_conn_state)
+				   struct drm_atomic_state *state)
 {
-	struct drm_atomic_state *state = new_conn_state->state;
+	struct drm_connector_state *new_conn_state =
+		drm_atomic_get_new_connector_state(state, conn);
 	struct drm_connector_state *old_conn_state =
 		drm_atomic_get_old_connector_state(state, conn);
 	struct intel_sdvo_connector_state *old_state =
@@ -2367,13 +2368,13 @@ static int intel_sdvo_atomic_check(struct drm_connector *conn,
 	    (memcmp(&old_state->tv, &new_state->tv, sizeof(old_state->tv)) ||
 	     memcmp(&old_conn_state->tv, &new_conn_state->tv, sizeof(old_conn_state->tv)))) {
 		struct drm_crtc_state *crtc_state =
-			drm_atomic_get_new_crtc_state(new_conn_state->state,
+			drm_atomic_get_new_crtc_state(state,
 						      new_conn_state->crtc);
 
 		crtc_state->connectors_changed = true;
 	}
 
-	return intel_digital_connector_atomic_check(conn, new_conn_state);
+	return intel_digital_connector_atomic_check(conn, state);
 }
 
 static const struct drm_connector_helper_funcs intel_sdvo_connector_helper_funcs = {
diff --git a/drivers/gpu/drm/i915/intel_tv.c b/drivers/gpu/drm/i915/intel_tv.c
index 5dc594eafaf2..0a95df6c6a57 100644
--- a/drivers/gpu/drm/i915/intel_tv.c
+++ b/drivers/gpu/drm/i915/intel_tv.c
@@ -1821,16 +1821,18 @@ static const struct drm_connector_funcs intel_tv_connector_funcs = {
 };
 
 static int intel_tv_atomic_check(struct drm_connector *connector,
-				 struct drm_connector_state *new_state)
+				 struct drm_atomic_state *state)
 {
+	struct drm_connector_state *new_state;
 	struct drm_crtc_state *new_crtc_state;
 	struct drm_connector_state *old_state;
 
+	new_state = drm_atomic_get_new_connector_state(state, connector);
 	if (!new_state->crtc)
 		return 0;
 
-	old_state = drm_atomic_get_old_connector_state(new_state->state, connector);
-	new_crtc_state = drm_atomic_get_new_crtc_state(new_state->state, new_state->crtc);
+	old_state = drm_atomic_get_old_connector_state(state, connector);
+	new_crtc_state = drm_atomic_get_new_crtc_state(state, new_state->crtc);
 
 	if (old_state->tv.mode != new_state->tv.mode ||
 	    old_state->tv.margins.left != new_state->tv.margins.left ||
diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c
index 4b1650f51955..7ba373f493b2 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/disp.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c
@@ -948,11 +948,12 @@ nv50_mstc_get_modes(struct drm_connector *connector)
 
 static int
 nv50_mstc_atomic_check(struct drm_connector *connector,
-		       struct drm_connector_state *new_conn_state)
+		       struct drm_atomic_state *state)
 {
-	struct drm_atomic_state *state = new_conn_state->state;
 	struct nv50_mstc *mstc = nv50_mstc(connector);
 	struct drm_dp_mst_topology_mgr *mgr = &mstc->mstm->mgr;
+	struct drm_connector_state *new_conn_state =
+		drm_atomic_get_new_connector_state(state, connector);
 	struct drm_connector_state *old_conn_state =
 		drm_atomic_get_old_connector_state(state, connector);
 	struct drm_crtc_state *crtc_state;
diff --git a/drivers/gpu/drm/rcar-du/rcar_lvds.c b/drivers/gpu/drm/rcar-du/rcar_lvds.c
index 620b51aab291..5b81ba2a7f27 100644
--- a/drivers/gpu/drm/rcar-du/rcar_lvds.c
+++ b/drivers/gpu/drm/rcar-du/rcar_lvds.c
@@ -92,13 +92,15 @@ static int rcar_lvds_connector_get_modes(struct drm_connector *connector)
 }
 
 static int rcar_lvds_connector_atomic_check(struct drm_connector *connector,
-					    struct drm_connector_state *state)
+					    struct drm_atomic_state *state)
 {
 	struct rcar_lvds *lvds = connector_to_rcar_lvds(connector);
 	const struct drm_display_mode *panel_mode;
+	struct drm_connector_state *conn_state;
 	struct drm_crtc_state *crtc_state;
 
-	if (!state->crtc)
+	conn_state = drm_atomic_get_new_connector_state(state, connector);
+	if (!conn_state->crtc)
 		return 0;
 
 	if (list_empty(&connector->modes)) {
@@ -110,9 +112,9 @@ static int rcar_lvds_connector_atomic_check(struct drm_connector *connector,
 				      struct drm_display_mode, head);
 
 	/* We're not allowed to modify the resolution. */
-	crtc_state = drm_atomic_get_crtc_state(state->state, state->crtc);
-	if (IS_ERR(crtc_state))
-		return PTR_ERR(crtc_state);
+	crtc_state = drm_atomic_get_crtc_state(state, conn_state->crtc);
+	if (!crtc_state)
+		return -EINVAL;
 
 	if (crtc_state->mode.hdisplay != panel_mode->hdisplay ||
 	    crtc_state->mode.vdisplay != panel_mode->vdisplay)
diff --git a/drivers/gpu/drm/vc4/vc4_txp.c b/drivers/gpu/drm/vc4/vc4_txp.c
index c8b89a78f9f4..96f91c1b4b6e 100644
--- a/drivers/gpu/drm/vc4/vc4_txp.c
+++ b/drivers/gpu/drm/vc4/vc4_txp.c
@@ -221,17 +221,18 @@ static const u32 txp_fmts[] = {
 };
 
 static int vc4_txp_connector_atomic_check(struct drm_connector *conn,
-					struct drm_connector_state *conn_state)
+					  struct drm_atomic_state *state)
 {
+	struct drm_connector_state *conn_state;
 	struct drm_crtc_state *crtc_state;
 	struct drm_framebuffer *fb;
 	int i;
 
+	conn_state = drm_atomic_get_new_connector_state(state, conn);
 	if (!conn_state->writeback_job || !conn_state->writeback_job->fb)
 		return 0;
 
-	crtc_state = drm_atomic_get_new_crtc_state(conn_state->state,
-						   conn_state->crtc);
+	crtc_state = drm_atomic_get_new_crtc_state(state, conn_state->crtc);
 
 	fb = conn_state->writeback_job->fb;
 	if (fb->width != crtc_state->mode.hdisplay ||
diff --git a/include/drm/drm_modeset_helper_vtables.h b/include/drm/drm_modeset_helper_vtables.h
index 13f0e3ca0a29..f36be43bd052 100644
--- a/include/drm/drm_modeset_helper_vtables.h
+++ b/include/drm/drm_modeset_helper_vtables.h
@@ -1028,7 +1028,7 @@ struct drm_connector_helper_funcs {
 	 * deadlock.
 	 */
 	int (*atomic_check)(struct drm_connector *connector,
-			    struct drm_connector_state *state);
+			    struct drm_atomic_state *state);
 
 	/**
 	 * @atomic_commit:
-- 
cgit v1.2.3


From 1452c25b0e60278820f3d2155c65f1bfcce5ee79 Mon Sep 17 00:00:00 2001
From: Sean Paul <seanpaul@chromium.org>
Date: Wed, 12 Jun 2019 10:50:19 -0400
Subject: drm: Add helpers to kick off self refresh mode in drivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds a new drm helper library to help drivers implement
self refresh. Drivers choosing to use it will register crtcs and
will receive callbacks when it's time to enter or exit self refresh
mode.

In its current form, it has a timer which will trigger after a
driver-specified amount of inactivity. When the timer triggers, the
helpers will submit a new atomic commit to shut the refreshing pipe
off. On the next atomic commit, the drm core will revert the self
refresh state and bring everything back up to be actively driven.

From the driver's perspective, this works like a regular disable/enable
cycle. The driver need only check the 'self_refresh_active' state in
crtc_state. It should initiate self refresh mode on the panel and enter
an off or low-power state.

Changes in v2:
- s/psr/self_refresh/ (Daniel)
- integrated the psr exit into the commit that wakes it up (Jose/Daniel)
- made the psr state per-crtc (Jose/Daniel)
Changes in v3:
- Remove the self_refresh_(active|changed) from connector state (Daniel)
- Simplify loop in drm_self_refresh_helper_alter_state (Daniel)
- Improve self_refresh_aware comment (Daniel)
- s/self_refresh_state/self_refresh_data/ (Daniel)
Changes in v4:
- Move docbook location below panel (Daniel)
- Improve docbook with references and more detailed explanation (Daniel)
- Instead of register/unregister, use init/cleanup (Daniel)
Changes in v5:
- Resolved conflict in drm_atomic_helper.c #include block
- Resolved conflict in rst with HDCP helper docs
Changes in v6:
- Fix include ordering, clean up forward declarations (Sam)

Link to v1: https://patchwork.freedesktop.org/patch/msgid/20190228210939.83386-2-sean@poorly.run
Link to v2: https://patchwork.freedesktop.org/patch/msgid/20190326204509.96515-1-sean@poorly.run
Link to v3: https://patchwork.freedesktop.org/patch/msgid/20190502194956.218441-6-sean@poorly.run
Link to v4: https://patchwork.freedesktop.org/patch/msgid/20190508160920.144739-6-sean@poorly.run
Link to v5: https://patchwork.freedesktop.org/patch/msgid/20190611160844.257498-6-sean@poorly.run

Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Jose Souza <jose.souza@intel.com>
Cc: Zain Wang <wzz@rock-chips.com>
Cc: Tomasz Figa <tfiga@chromium.org>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Sean Paul <seanpaul@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190612145026.191846-1-sean@poorly.run
---
 Documentation/gpu/drm-kms-helpers.rst     |   9 ++
 drivers/gpu/drm/Makefile                  |   2 +-
 drivers/gpu/drm/drm_atomic.c              |   2 +
 drivers/gpu/drm/drm_atomic_helper.c       |  35 ++++-
 drivers/gpu/drm/drm_atomic_state_helper.c |   4 +
 drivers/gpu/drm/drm_atomic_uapi.c         |   7 +-
 drivers/gpu/drm/drm_self_refresh_helper.c | 216 ++++++++++++++++++++++++++++++
 include/drm/drm_atomic.h                  |  15 +++
 include/drm/drm_connector.h               |  14 ++
 include/drm/drm_crtc.h                    |  19 +++
 include/drm/drm_self_refresh_helper.h     |  20 +++
 11 files changed, 338 insertions(+), 5 deletions(-)
 create mode 100644 drivers/gpu/drm/drm_self_refresh_helper.c
 create mode 100644 include/drm/drm_self_refresh_helper.h

(limited to 'include')

diff --git a/Documentation/gpu/drm-kms-helpers.rst b/Documentation/gpu/drm-kms-helpers.rst
index 0fe726a6ee67..b327bbc11182 100644
--- a/Documentation/gpu/drm-kms-helpers.rst
+++ b/Documentation/gpu/drm-kms-helpers.rst
@@ -181,6 +181,15 @@ Panel Helper Reference
 .. kernel-doc:: drivers/gpu/drm/drm_panel_orientation_quirks.c
    :export:
 
+Panel Self Refresh Helper Reference
+===================================
+
+.. kernel-doc:: drivers/gpu/drm/drm_self_refresh_helper.c
+   :doc: overview
+
+.. kernel-doc:: drivers/gpu/drm/drm_self_refresh_helper.c
+   :export:
+
 HDCP Helper Functions Reference
 ===============================
 
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index d36feb4a6233..9d630a28a788 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -43,7 +43,7 @@ drm_kms_helper-y := drm_crtc_helper.o drm_dp_helper.o drm_dsc.o drm_probe_helper
 		drm_simple_kms_helper.o drm_modeset_helper.o \
 		drm_scdc_helper.o drm_gem_framebuffer_helper.o \
 		drm_atomic_state_helper.o drm_damage_helper.o \
-		drm_format_helper.o
+		drm_format_helper.o drm_self_refresh_helper.o
 
 drm_kms_helper-$(CONFIG_DRM_PANEL_BRIDGE) += bridge/panel.o
 drm_kms_helper-$(CONFIG_DRM_FBDEV_EMULATION) += drm_fb_helper.o
diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c
index a4e779deab0f..419381abbdd1 100644
--- a/drivers/gpu/drm/drm_atomic.c
+++ b/drivers/gpu/drm/drm_atomic.c
@@ -384,6 +384,7 @@ static void drm_atomic_crtc_print_state(struct drm_printer *p,
 	drm_printf(p, "crtc[%u]: %s\n", crtc->base.id, crtc->name);
 	drm_printf(p, "\tenable=%d\n", state->enable);
 	drm_printf(p, "\tactive=%d\n", state->active);
+	drm_printf(p, "\tself_refresh_active=%d\n", state->self_refresh_active);
 	drm_printf(p, "\tplanes_changed=%d\n", state->planes_changed);
 	drm_printf(p, "\tmode_changed=%d\n", state->mode_changed);
 	drm_printf(p, "\tactive_changed=%d\n", state->active_changed);
@@ -999,6 +1000,7 @@ static void drm_atomic_connector_print_state(struct drm_printer *p,
 
 	drm_printf(p, "connector[%u]: %s\n", connector->base.id, connector->name);
 	drm_printf(p, "\tcrtc=%s\n", state->crtc ? state->crtc->name : "(null)");
+	drm_printf(p, "\tself_refresh_aware=%d\n", state->self_refresh_aware);
 
 	if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK)
 		if (state->writeback_job && state->writeback_job->fb)
diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index e58be6996069..d9ea7e51ffdc 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -34,6 +34,7 @@
 #include <drm/drm_device.h>
 #include <drm/drm_plane_helper.h>
 #include <drm/drm_print.h>
+#include <drm/drm_self_refresh_helper.h>
 #include <drm/drm_vblank.h>
 #include <drm/drm_writeback.h>
 
@@ -953,10 +954,33 @@ int drm_atomic_helper_check(struct drm_device *dev,
 	if (state->legacy_cursor_update)
 		state->async_update = !drm_atomic_helper_async_check(dev, state);
 
+	drm_self_refresh_helper_alter_state(state);
+
 	return ret;
 }
 EXPORT_SYMBOL(drm_atomic_helper_check);
 
+static bool
+crtc_needs_disable(struct drm_crtc_state *old_state,
+		   struct drm_crtc_state *new_state)
+{
+	/*
+	 * No new_state means the crtc is off, so the only criteria is whether
+	 * it's currently active or in self refresh mode.
+	 */
+	if (!new_state)
+		return drm_atomic_crtc_effectively_active(old_state);
+
+	/*
+	 * We need to run through the crtc_funcs->disable() function if the crtc
+	 * is currently on, if it's transitioning to self refresh mode, or if
+	 * it's in self refresh mode and needs to be fully disabled.
+	 */
+	return old_state->active ||
+	       (old_state->self_refresh_active && !new_state->enable) ||
+	       new_state->self_refresh_active;
+}
+
 static void
 disable_outputs(struct drm_device *dev, struct drm_atomic_state *old_state)
 {
@@ -977,7 +1001,14 @@ disable_outputs(struct drm_device *dev, struct drm_atomic_state *old_state)
 
 		old_crtc_state = drm_atomic_get_old_crtc_state(old_state, old_conn_state->crtc);
 
-		if (!old_crtc_state->active ||
+		if (new_conn_state->crtc)
+			new_crtc_state = drm_atomic_get_new_crtc_state(
+						old_state,
+						new_conn_state->crtc);
+		else
+			new_crtc_state = NULL;
+
+		if (!crtc_needs_disable(old_crtc_state, new_crtc_state) ||
 		    !drm_atomic_crtc_needs_modeset(old_conn_state->crtc->state))
 			continue;
 
@@ -1023,7 +1054,7 @@ disable_outputs(struct drm_device *dev, struct drm_atomic_state *old_state)
 		if (!drm_atomic_crtc_needs_modeset(new_crtc_state))
 			continue;
 
-		if (!old_crtc_state->active)
+		if (!crtc_needs_disable(old_crtc_state, new_crtc_state))
 			continue;
 
 		funcs = crtc->helper_private;
diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c
index 97ab26679b96..7d7347a6f194 100644
--- a/drivers/gpu/drm/drm_atomic_state_helper.c
+++ b/drivers/gpu/drm/drm_atomic_state_helper.c
@@ -129,6 +129,10 @@ void __drm_atomic_helper_crtc_duplicate_state(struct drm_crtc *crtc,
 	state->commit = NULL;
 	state->event = NULL;
 	state->pageflip_flags = 0;
+
+	/* Self refresh should be canceled when a new update is available */
+	state->active = drm_atomic_crtc_effectively_active(state);
+	state->self_refresh_active = false;
 }
 EXPORT_SYMBOL(__drm_atomic_helper_crtc_duplicate_state);
 
diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c
index eb22e8bdd853..abe38bdf85ae 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -490,7 +490,7 @@ drm_atomic_crtc_get_property(struct drm_crtc *crtc,
 	struct drm_mode_config *config = &dev->mode_config;
 
 	if (property == config->prop_active)
-		*val = state->active;
+		*val = drm_atomic_crtc_effectively_active(state);
 	else if (property == config->prop_mode_id)
 		*val = (state->mode_blob) ? state->mode_blob->base.id : 0;
 	else if (property == config->prop_vrr_enabled)
@@ -788,7 +788,10 @@ drm_atomic_connector_get_property(struct drm_connector *connector,
 	if (property == config->prop_crtc_id) {
 		*val = (state->crtc) ? state->crtc->base.id : 0;
 	} else if (property == config->dpms_property) {
-		*val = connector->dpms;
+		if (state->crtc && state->crtc->state->self_refresh_active)
+			*val = DRM_MODE_DPMS_ON;
+		else
+			*val = connector->dpms;
 	} else if (property == config->tv_select_subconnector_property) {
 		*val = state->tv.subconnector;
 	} else if (property == config->tv_left_margin_property) {
diff --git a/drivers/gpu/drm/drm_self_refresh_helper.c b/drivers/gpu/drm/drm_self_refresh_helper.c
new file mode 100644
index 000000000000..2b3daaf77841
--- /dev/null
+++ b/drivers/gpu/drm/drm_self_refresh_helper.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright (C) 2019 Google, Inc.
+ *
+ * Authors:
+ * Sean Paul <seanpaul@chromium.org>
+ */
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include <drm/drm_atomic.h>
+#include <drm/drm_atomic_helper.h>
+#include <drm/drm_connector.h>
+#include <drm/drm_crtc.h>
+#include <drm/drm_device.h>
+#include <drm/drm_mode_config.h>
+#include <drm/drm_modeset_lock.h>
+#include <drm/drm_print.h>
+#include <drm/drm_self_refresh_helper.h>
+
+/**
+ * DOC: overview
+ *
+ * This helper library provides an easy way for drivers to leverage the atomic
+ * framework to implement panel self refresh (SR) support. Drivers are
+ * responsible for initializing and cleaning up the SR helpers on load/unload
+ * (see &drm_self_refresh_helper_init/&drm_self_refresh_helper_cleanup).
+ * The connector is responsible for setting
+ * &drm_connector_state.self_refresh_aware to true at runtime if it is SR-aware
+ * (meaning it knows how to initiate self refresh on the panel).
+ *
+ * Once a crtc has enabled SR using &drm_self_refresh_helper_init, the
+ * helpers will monitor activity and call back into the driver to enable/disable
+ * SR as appropriate. The best way to think about this is that it's a DPMS
+ * on/off request with &drm_crtc_state.self_refresh_active set in crtc state
+ * that tells you to disable/enable SR on the panel instead of power-cycling it.
+ *
+ * During SR, drivers may choose to fully disable their crtc/encoder/bridge
+ * hardware (in which case no driver changes are necessary), or they can inspect
+ * &drm_crtc_state.self_refresh_active if they want to enter low power mode
+ * without full disable (in case full disable/enable is too slow).
+ *
+ * SR will be deactivated if there are any atomic updates affecting the
+ * pipe that is in SR mode. If a crtc is driving multiple connectors, all
+ * connectors must be SR aware and all will enter/exit SR mode at the same time.
+ *
+ * If the crtc and connector are SR aware, but the panel connected does not
+ * support it (or is otherwise unable to enter SR), the driver should fail
+ * atomic_check when &drm_crtc_state.self_refresh_active is true.
+ */
+
+struct drm_self_refresh_data {
+	struct drm_crtc *crtc;
+	struct delayed_work entry_work;
+	struct drm_atomic_state *save_state;
+	unsigned int entry_delay_ms;
+};
+
+static void drm_self_refresh_helper_entry_work(struct work_struct *work)
+{
+	struct drm_self_refresh_data *sr_data = container_of(
+				to_delayed_work(work),
+				struct drm_self_refresh_data, entry_work);
+	struct drm_crtc *crtc = sr_data->crtc;
+	struct drm_device *dev = crtc->dev;
+	struct drm_modeset_acquire_ctx ctx;
+	struct drm_atomic_state *state;
+	struct drm_connector *conn;
+	struct drm_connector_state *conn_state;
+	struct drm_crtc_state *crtc_state;
+	int i, ret;
+
+	drm_modeset_acquire_init(&ctx, 0);
+
+	state = drm_atomic_state_alloc(dev);
+	if (!state) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+retry:
+	state->acquire_ctx = &ctx;
+
+	crtc_state = drm_atomic_get_crtc_state(state, crtc);
+	if (IS_ERR(crtc_state)) {
+		ret = PTR_ERR(crtc_state);
+		goto out;
+	}
+
+	if (!crtc_state->enable)
+		goto out;
+
+	ret = drm_atomic_add_affected_connectors(state, crtc);
+	if (ret)
+		goto out;
+
+	for_each_new_connector_in_state(state, conn, conn_state, i) {
+		if (!conn_state->self_refresh_aware)
+			goto out;
+	}
+
+	crtc_state->active = false;
+	crtc_state->self_refresh_active = true;
+
+	ret = drm_atomic_commit(state);
+	if (ret)
+		goto out;
+
+out:
+	if (ret == -EDEADLK) {
+		drm_atomic_state_clear(state);
+		ret = drm_modeset_backoff(&ctx);
+		if (!ret)
+			goto retry;
+	}
+
+	drm_atomic_state_put(state);
+	drm_modeset_drop_locks(&ctx);
+	drm_modeset_acquire_fini(&ctx);
+}
+
+/**
+ * drm_self_refresh_helper_alter_state - Alters the atomic state for SR exit
+ * @state: the state currently being checked
+ *
+ * Called at the end of atomic check. This function checks the state for flags
+ * incompatible with self refresh exit and changes them. This is a bit
+ * disingenuous since userspace is expecting one thing and we're giving it
+ * another. However in order to keep self refresh entirely hidden from
+ * userspace, this is required.
+ *
+ * At the end, we queue up the self refresh entry work so we can enter PSR after
+ * the desired delay.
+ */
+void drm_self_refresh_helper_alter_state(struct drm_atomic_state *state)
+{
+	struct drm_crtc *crtc;
+	struct drm_crtc_state *crtc_state;
+	int i;
+
+	if (state->async_update || !state->allow_modeset) {
+		for_each_old_crtc_in_state(state, crtc, crtc_state, i) {
+			if (crtc_state->self_refresh_active) {
+				state->async_update = false;
+				state->allow_modeset = true;
+				break;
+			}
+		}
+	}
+
+	for_each_new_crtc_in_state(state, crtc, crtc_state, i) {
+		struct drm_self_refresh_data *sr_data;
+
+		/* Don't trigger the entry timer when we're already in SR */
+		if (crtc_state->self_refresh_active)
+			continue;
+
+		sr_data = crtc->self_refresh_data;
+		if (!sr_data)
+			continue;
+
+		mod_delayed_work(system_wq, &sr_data->entry_work,
+				 msecs_to_jiffies(sr_data->entry_delay_ms));
+	}
+}
+EXPORT_SYMBOL(drm_self_refresh_helper_alter_state);
+
+/**
+ * drm_self_refresh_helper_init - Initializes self refresh helpers for a crtc
+ * @crtc: the crtc which supports self refresh supported displays
+ * @entry_delay_ms: amount of inactivity to wait before entering self refresh
+ *
+ * Returns zero if successful or -errno on failure
+ */
+int drm_self_refresh_helper_init(struct drm_crtc *crtc,
+				 unsigned int entry_delay_ms)
+{
+	struct drm_self_refresh_data *sr_data = crtc->self_refresh_data;
+
+	/* Helper is already initialized */
+	if (WARN_ON(sr_data))
+		return -EINVAL;
+
+	sr_data = kzalloc(sizeof(*sr_data), GFP_KERNEL);
+	if (!sr_data)
+		return -ENOMEM;
+
+	INIT_DELAYED_WORK(&sr_data->entry_work,
+			  drm_self_refresh_helper_entry_work);
+	sr_data->entry_delay_ms = entry_delay_ms;
+	sr_data->crtc = crtc;
+
+	crtc->self_refresh_data = sr_data;
+	return 0;
+}
+EXPORT_SYMBOL(drm_self_refresh_helper_init);
+
+/**
+ * drm_self_refresh_helper_cleanup - Cleans up self refresh helpers for a crtc
+ * @crtc: the crtc to cleanup
+ */
+void drm_self_refresh_helper_cleanup(struct drm_crtc *crtc)
+{
+	struct drm_self_refresh_data *sr_data = crtc->self_refresh_data;
+
+	/* Helper is already uninitialized */
+	if (sr_data)
+		return;
+
+	crtc->self_refresh_data = NULL;
+
+	cancel_delayed_work_sync(&sr_data->entry_work);
+	kfree(sr_data);
+}
+EXPORT_SYMBOL(drm_self_refresh_helper_cleanup);
diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h
index f12215647801..927e1205d7aa 100644
--- a/include/drm/drm_atomic.h
+++ b/include/drm/drm_atomic.h
@@ -957,4 +957,19 @@ drm_atomic_crtc_needs_modeset(const struct drm_crtc_state *state)
 	       state->connectors_changed;
 }
 
+/**
+ * drm_atomic_crtc_effectively_active - compute whether crtc is actually active
+ * @state: &drm_crtc_state for the CRTC
+ *
+ * When in self refresh mode, the crtc_state->active value will be false, since
+ * the crtc is off. However in some cases we're interested in whether the crtc
+ * is active, or effectively active (ie: it's connected to an active display).
+ * In these cases, use this function instead of just checking active.
+ */
+static inline bool
+drm_atomic_crtc_effectively_active(const struct drm_crtc_state *state)
+{
+	return state->active || state->self_refresh_active;
+}
+
 #endif /* DRM_ATOMIC_H_ */
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index 071143bc0ebd..c6f8486d8b8f 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -548,6 +548,20 @@ struct drm_connector_state {
 	/** @tv: TV connector state */
 	struct drm_tv_connector_state tv;
 
+	/**
+	 * @self_refresh_aware:
+	 *
+	 * This tracks whether a connector is aware of the self refresh state.
+	 * It should be set to true for those connector implementations which
+	 * understand the self refresh state. This is needed since the crtc
+	 * registers the self refresh helpers and it doesn't know if the
+	 * connectors downstream have implemented self refresh entry/exit.
+	 *
+	 * Drivers should set this to true in atomic_check if they know how to
+	 * handle self_refresh requests.
+	 */
+	bool self_refresh_aware;
+
 	/**
 	 * @picture_aspect_ratio: Connector property to control the
 	 * HDMI infoframe aspect ratio setting.
diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h
index dc42b9e35333..128d8b210621 100644
--- a/include/drm/drm_crtc.h
+++ b/include/drm/drm_crtc.h
@@ -54,6 +54,7 @@ struct drm_mode_set;
 struct drm_file;
 struct drm_clip_rect;
 struct drm_printer;
+struct drm_self_refresh_data;
 struct device_node;
 struct dma_fence;
 struct edid;
@@ -300,6 +301,17 @@ struct drm_crtc_state {
 	 */
 	bool vrr_enabled;
 
+	/**
+	 * @self_refresh_active:
+	 *
+	 * Used by the self refresh helpers to denote when a self refresh
+	 * transition is occurring. This will be set on enable/disable callbacks
+	 * when self refresh is being enabled or disabled. In some cases, it may
+	 * not be desirable to fully shut off the crtc during self refresh.
+	 * CRTC's can inspect this flag and determine the best course of action.
+	 */
+	bool self_refresh_active;
+
 	/**
 	 * @event:
 	 *
@@ -1088,6 +1100,13 @@ struct drm_crtc {
 	 * The name of the CRTC's fence timeline.
 	 */
 	char timeline_name[32];
+
+	/**
+	 * @self_refresh_data: Holds the state for the self refresh helpers
+	 *
+	 * Initialized via drm_self_refresh_helper_register().
+	 */
+	struct drm_self_refresh_data *self_refresh_data;
 };
 
 /**
diff --git a/include/drm/drm_self_refresh_helper.h b/include/drm/drm_self_refresh_helper.h
new file mode 100644
index 000000000000..397a583ccca7
--- /dev/null
+++ b/include/drm/drm_self_refresh_helper.h
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright (C) 2019 Google, Inc.
+ *
+ * Authors:
+ * Sean Paul <seanpaul@chromium.org>
+ */
+#ifndef DRM_SELF_REFRESH_HELPER_H_
+#define DRM_SELF_REFRESH_HELPER_H_
+
+struct drm_atomic_state;
+struct drm_crtc;
+
+void drm_self_refresh_helper_alter_state(struct drm_atomic_state *state);
+
+int drm_self_refresh_helper_init(struct drm_crtc *crtc,
+				 unsigned int entry_delay_ms);
+
+void drm_self_refresh_helper_cleanup(struct drm_crtc *crtc);
+#endif
-- 
cgit v1.2.3


From de76cda215d56256ffcda7ffa538b70f9fb301a7 Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <Gustavo.Pimentel@synopsys.com>
Date: Tue, 4 Jun 2019 18:24:43 +0200
Subject: PCI: Decode PCIe 32 GT/s link speed

PCIe r5.0, sec 7.5.3.18, defines a new 32.0 GT/s bit in the Supported Link
Speeds Vector of Link Capabilities 2.  Decode this new speed.  This does
not affect the speed of the link, which should be negotiated automatically
by the hardware; it only adds decoding when showing the speed to the user.

Previously, reading the speed of a link operating at this speed showed
"Unknown speed" instead of "32.0 GT/s".

Link: https://lore.kernel.org/lkml/92365e3caf0fc559f9ab14bcd053bfc92d4f661c.1559664969.git.gustavo.pimentel@synopsys.com
Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci-sysfs.c       | 3 +++
 drivers/pci/pci.c             | 4 +++-
 drivers/pci/probe.c           | 2 +-
 drivers/pci/slot.c            | 1 +
 include/linux/pci.h           | 1 +
 include/uapi/linux/pci_regs.h | 4 ++++
 6 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 6d27475e39b2..d52d30448e41 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -182,6 +182,9 @@ static ssize_t current_link_speed_show(struct device *dev,
 		return -EINVAL;
 
 	switch (linkstat & PCI_EXP_LNKSTA_CLS) {
+	case PCI_EXP_LNKSTA_CLS_32_0GB:
+		speed = "32 GT/s";
+		break;
 	case PCI_EXP_LNKSTA_CLS_16_0GB:
 		speed = "16 GT/s";
 		break;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8abc843b1615..4729a7c7a9d9 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5621,7 +5621,9 @@ enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
 	 */
 	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2);
 	if (lnkcap2) { /* PCIe r3.0-compliant */
-		if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB)
+		if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_32_0GB)
+			return PCIE_SPEED_32_0GT;
+		else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB)
 			return PCIE_SPEED_16_0GT;
 		else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB)
 			return PCIE_SPEED_8_0GT;
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0e8e2c186f50..c5f27c8cd140 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -668,7 +668,7 @@ const unsigned char pcie_link_speed[] = {
 	PCIE_SPEED_5_0GT,		/* 2 */
 	PCIE_SPEED_8_0GT,		/* 3 */
 	PCIE_SPEED_16_0GT,		/* 4 */
-	PCI_SPEED_UNKNOWN,		/* 5 */
+	PCIE_SPEED_32_0GT,		/* 5 */
 	PCI_SPEED_UNKNOWN,		/* 6 */
 	PCI_SPEED_UNKNOWN,		/* 7 */
 	PCI_SPEED_UNKNOWN,		/* 8 */
diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index f4d92b1afe7b..ae4aa0e1f2f4 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -75,6 +75,7 @@ static const char *pci_bus_speed_strings[] = {
 	"5.0 GT/s PCIe",	/* 0x15 */
 	"8.0 GT/s PCIe",	/* 0x16 */
 	"16.0 GT/s PCIe",	/* 0x17 */
+	"32.0 GT/s PCIe",	/* 0x18 */
 };
 
 static ssize_t bus_speed_read(enum pci_bus_speed speed, char *buf)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4a5a84d7bdd4..2173e6b75579 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -258,6 +258,7 @@ enum pci_bus_speed {
 	PCIE_SPEED_5_0GT		= 0x15,
 	PCIE_SPEED_8_0GT		= 0x16,
 	PCIE_SPEED_16_0GT		= 0x17,
+	PCIE_SPEED_32_0GT		= 0x18,
 	PCI_SPEED_UNKNOWN		= 0xff,
 };
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 27164769d184..f28e562d7ca8 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -528,6 +528,7 @@
 #define  PCI_EXP_LNKCAP_SLS_5_0GB 0x00000002 /* LNKCAP2 SLS Vector bit 1 */
 #define  PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */
 #define  PCI_EXP_LNKCAP_SLS_16_0GB 0x00000004 /* LNKCAP2 SLS Vector bit 3 */
+#define  PCI_EXP_LNKCAP_SLS_32_0GB 0x00000005 /* LNKCAP2 SLS Vector bit 4 */
 #define  PCI_EXP_LNKCAP_MLW	0x000003f0 /* Maximum Link Width */
 #define  PCI_EXP_LNKCAP_ASPMS	0x00000c00 /* ASPM Support */
 #define  PCI_EXP_LNKCAP_L0SEL	0x00007000 /* L0s Exit Latency */
@@ -556,6 +557,7 @@
 #define  PCI_EXP_LNKSTA_CLS_5_0GB 0x0002 /* Current Link Speed 5.0GT/s */
 #define  PCI_EXP_LNKSTA_CLS_8_0GB 0x0003 /* Current Link Speed 8.0GT/s */
 #define  PCI_EXP_LNKSTA_CLS_16_0GB 0x0004 /* Current Link Speed 16.0GT/s */
+#define  PCI_EXP_LNKSTA_CLS_32_0GB 0x0005 /* Current Link Speed 32.0GT/s */
 #define  PCI_EXP_LNKSTA_NLW	0x03f0	/* Negotiated Link Width */
 #define  PCI_EXP_LNKSTA_NLW_X1	0x0010	/* Current Link Width x1 */
 #define  PCI_EXP_LNKSTA_NLW_X2	0x0020	/* Current Link Width x2 */
@@ -661,6 +663,7 @@
 #define  PCI_EXP_LNKCAP2_SLS_5_0GB	0x00000004 /* Supported Speed 5GT/s */
 #define  PCI_EXP_LNKCAP2_SLS_8_0GB	0x00000008 /* Supported Speed 8GT/s */
 #define  PCI_EXP_LNKCAP2_SLS_16_0GB	0x00000010 /* Supported Speed 16GT/s */
+#define  PCI_EXP_LNKCAP2_SLS_32_0GB	0x00000020 /* Supported Speed 32GT/s */
 #define  PCI_EXP_LNKCAP2_CROSSLINK	0x00000100 /* Crosslink supported */
 #define PCI_EXP_LNKCTL2		48	/* Link Control 2 */
 #define  PCI_EXP_LNKCTL2_TLS		0x000f
@@ -668,6 +671,7 @@
 #define  PCI_EXP_LNKCTL2_TLS_5_0GT	0x0002 /* Supported Speed 5GT/s */
 #define  PCI_EXP_LNKCTL2_TLS_8_0GT	0x0003 /* Supported Speed 8GT/s */
 #define  PCI_EXP_LNKCTL2_TLS_16_0GT	0x0004 /* Supported Speed 16GT/s */
+#define  PCI_EXP_LNKCTL2_TLS_32_0GT	0x0005 /* Supported Speed 32GT/s */
 #define PCI_EXP_LNKSTA2		50	/* Link Status 2 */
 #define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2	52	/* v2 endpoints with link end here */
 #define PCI_EXP_SLTCAP2		52	/* Slot Capabilities 2 */
-- 
cgit v1.2.3


From 2305ff225c0b1691ec2e93f3d6990e13a2e63c95 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 4 Jun 2019 20:16:32 +0900
Subject: ocxl: do not use C++ style comments in uapi header

Linux kernel tolerates C++ style comments these days. Actually, the
SPDX License tags for .c files start with //.

On the other hand, uapi headers are written in more strict C, where
the C++ comment style is forbidden.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Frederic Barrat <fbarrat@linux.ibm.com>
Acked-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 include/uapi/misc/ocxl.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/misc/ocxl.h b/include/uapi/misc/ocxl.h
index 97937cfa3baa..6d29a60a896a 100644
--- a/include/uapi/misc/ocxl.h
+++ b/include/uapi/misc/ocxl.h
@@ -33,23 +33,23 @@ struct ocxl_ioctl_attach {
 };
 
 struct ocxl_ioctl_metadata {
-	__u16 version; // struct version, always backwards compatible
+	__u16 version; /* struct version, always backwards compatible */
 
-	// Version 0 fields
+	/* Version 0 fields */
 	__u8  afu_version_major;
 	__u8  afu_version_minor;
-	__u32 pasid;		// PASID assigned to the current context
+	__u32 pasid;		/* PASID assigned to the current context */
 
-	__u64 pp_mmio_size;	// Per PASID MMIO size
+	__u64 pp_mmio_size;	/* Per PASID MMIO size */
 	__u64 global_mmio_size;
 
-	// End version 0 fields
+	/* End version 0 fields */
 
-	__u64 reserved[13]; // Total of 16*u64
+	__u64 reserved[13]; /* Total of 16*u64 */
 };
 
 struct ocxl_ioctl_p9_wait {
-	__u16 thread_id; // The thread ID required to wake this thread
+	__u16 thread_id; /* The thread ID required to wake this thread */
 	__u16 reserved1;
 	__u32 reserved2;
 	__u64 reserved3[3];
-- 
cgit v1.2.3


From ed63bb1d1f8469586006a9ca63c42344401aa2ab Mon Sep 17 00:00:00 2001
From: Greg Hackmann <ghackmann@google.com>
Date: Thu, 13 Jun 2019 15:34:06 -0700
Subject: dma-buf: give each buffer a full-fledged inode

By traversing /proc/*/fd and /proc/*/map_files, processes with CAP_ADMIN
can get a lot of fine-grained data about how shmem buffers are shared
among processes.  stat(2) on each entry gives the caller a unique
ID (st_ino), the buffer's size (st_size), and even the number of pages
currently charged to the buffer (st_blocks / 512).

In contrast, all dma-bufs share the same anonymous inode.  So while we
can count how many dma-buf fds or mappings a process has, we can't get
the size of the backing buffers or tell if two entries point to the same
dma-buf.  On systems with debugfs, we can get a per-buffer breakdown of
size and reference count, but can't tell which processes are actually
holding the references to each buffer.

Replace the singleton inode with full-fledged inodes allocated by
alloc_anon_inode().  This involves creating and mounting a
mini-pseudo-filesystem for dma-buf, following the example in fs/aio.c.

Signed-off-by: Greg Hackmann <ghackmann@google.com>
Signed-off-by: Chenbo Feng <fengc@google.com>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190613223408.139221-2-fengc@google.com
---
 drivers/dma-buf/dma-buf.c  | 63 +++++++++++++++++++++++++++++++++++++++++-----
 include/uapi/linux/magic.h |  1 +
 2 files changed, 58 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index f4104a21b069..3612ccededd6 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -34,8 +34,10 @@
 #include <linux/poll.h>
 #include <linux/reservation.h>
 #include <linux/mm.h>
+#include <linux/mount.h>
 
 #include <uapi/linux/dma-buf.h>
+#include <uapi/linux/magic.h>
 
 static inline int is_dma_buf_file(struct file *);
 
@@ -46,6 +48,25 @@ struct dma_buf_list {
 
 static struct dma_buf_list db_list;
 
+static const struct dentry_operations dma_buf_dentry_ops = {
+	.d_dname = simple_dname,
+};
+
+static struct vfsmount *dma_buf_mnt;
+
+static struct dentry *dma_buf_fs_mount(struct file_system_type *fs_type,
+		int flags, const char *name, void *data)
+{
+	return mount_pseudo(fs_type, "dmabuf:", NULL, &dma_buf_dentry_ops,
+			DMA_BUF_MAGIC);
+}
+
+static struct file_system_type dma_buf_fs_type = {
+	.name = "dmabuf",
+	.mount = dma_buf_fs_mount,
+	.kill_sb = kill_anon_super,
+};
+
 static int dma_buf_release(struct inode *inode, struct file *file)
 {
 	struct dma_buf *dmabuf;
@@ -342,6 +363,31 @@ static inline int is_dma_buf_file(struct file *file)
 	return file->f_op == &dma_buf_fops;
 }
 
+static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags)
+{
+	struct file *file;
+	struct inode *inode = alloc_anon_inode(dma_buf_mnt->mnt_sb);
+
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	inode->i_size = dmabuf->size;
+	inode_set_bytes(inode, dmabuf->size);
+
+	file = alloc_file_pseudo(inode, dma_buf_mnt, "dmabuf",
+				 flags, &dma_buf_fops);
+	if (IS_ERR(file))
+		goto err_alloc_file;
+	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
+	file->private_data = dmabuf;
+
+	return file;
+
+err_alloc_file:
+	iput(inode);
+	return file;
+}
+
 /**
  * DOC: dma buf device access
  *
@@ -436,8 +482,7 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info)
 	}
 	dmabuf->resv = resv;
 
-	file = anon_inode_getfile("dmabuf", &dma_buf_fops, dmabuf,
-					exp_info->flags);
+	file = dma_buf_getfile(dmabuf, exp_info->flags);
 	if (IS_ERR(file)) {
 		ret = PTR_ERR(file);
 		goto err_dmabuf;
@@ -1055,8 +1100,8 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 		return ret;
 
 	seq_puts(s, "\nDma-buf Objects:\n");
-	seq_printf(s, "%-8s\t%-8s\t%-8s\t%-8s\texp_name\n",
-		   "size", "flags", "mode", "count");
+	seq_printf(s, "%-8s\t%-8s\t%-8s\t%-8s\texp_name\t%-8s\n",
+		   "size", "flags", "mode", "count", "ino");
 
 	list_for_each_entry(buf_obj, &db_list.head, list_node) {
 		ret = mutex_lock_interruptible(&buf_obj->lock);
@@ -1067,11 +1112,12 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 			continue;
 		}
 
-		seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\n",
+		seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\n",
 				buf_obj->size,
 				buf_obj->file->f_flags, buf_obj->file->f_mode,
 				file_count(buf_obj->file),
-				buf_obj->exp_name);
+				buf_obj->exp_name,
+				file_inode(buf_obj->file)->i_ino);
 
 		robj = buf_obj->resv;
 		while (true) {
@@ -1167,6 +1213,10 @@ static inline void dma_buf_uninit_debugfs(void)
 
 static int __init dma_buf_init(void)
 {
+	dma_buf_mnt = kern_mount(&dma_buf_fs_type);
+	if (IS_ERR(dma_buf_mnt))
+		return PTR_ERR(dma_buf_mnt);
+
 	mutex_init(&db_list.lock);
 	INIT_LIST_HEAD(&db_list.head);
 	dma_buf_init_debugfs();
@@ -1177,5 +1227,6 @@ subsys_initcall(dma_buf_init);
 static void __exit dma_buf_deinit(void)
 {
 	dma_buf_uninit_debugfs();
+	kern_unmount(dma_buf_mnt);
 }
 __exitcall(dma_buf_deinit);
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index f8c00045d537..665e18627f78 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -91,5 +91,6 @@
 #define UDF_SUPER_MAGIC		0x15013346
 #define BALLOON_KVM_MAGIC	0x13661366
 #define ZSMALLOC_MAGIC		0x58295829
+#define DMA_BUF_MAGIC		0x444d4142	/* "DMAB" */
 
 #endif /* __LINUX_MAGIC_H__ */
-- 
cgit v1.2.3


From bb2bb903042517b8fb17b2bc21e00512f2dcac01 Mon Sep 17 00:00:00 2001
From: Greg Hackmann <ghackmann@google.com>
Date: Thu, 13 Jun 2019 15:34:07 -0700
Subject: dma-buf: add DMA_BUF_SET_NAME ioctls

This patch adds complimentary DMA_BUF_SET_NAME  ioctls, which lets
userspace processes attach a free-form name to each buffer.

This information can be extremely helpful for tracking and accounting
shared buffers.  For example, on Android, we know what each buffer will
be used for at allocation time: GL, multimedia, camera, etc.  The
userspace allocator can use DMA_BUF_SET_NAME to associate that
information with the buffer, so we can later give developers a
breakdown of how much memory they're allocating for graphics, camera,
etc.

Signed-off-by: Greg Hackmann <ghackmann@google.com>
Signed-off-by: Chenbo Feng <fengc@google.com>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190613223408.139221-3-fengc@google.com
---
 drivers/dma-buf/dma-buf.c    | 65 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/dma-buf.h      |  5 +++-
 include/uapi/linux/dma-buf.h |  3 ++
 3 files changed, 69 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 3612ccededd6..ab96410d1dcd 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -48,8 +48,24 @@ struct dma_buf_list {
 
 static struct dma_buf_list db_list;
 
+static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct dma_buf *dmabuf;
+	char name[DMA_BUF_NAME_LEN];
+	size_t ret = 0;
+
+	dmabuf = dentry->d_fsdata;
+	mutex_lock(&dmabuf->lock);
+	if (dmabuf->name)
+		ret = strlcpy(name, dmabuf->name, DMA_BUF_NAME_LEN);
+	mutex_unlock(&dmabuf->lock);
+
+	return dynamic_dname(dentry, buffer, buflen, "/%s:%s",
+			     dentry->d_name.name, ret > 0 ? name : "");
+}
+
 static const struct dentry_operations dma_buf_dentry_ops = {
-	.d_dname = simple_dname,
+	.d_dname = dmabuffs_dname,
 };
 
 static struct vfsmount *dma_buf_mnt;
@@ -301,6 +317,43 @@ out:
 	return events;
 }
 
+/**
+ * dma_buf_set_name - Set a name to a specific dma_buf to track the usage.
+ * The name of the dma-buf buffer can only be set when the dma-buf is not
+ * attached to any devices. It could theoritically support changing the
+ * name of the dma-buf if the same piece of memory is used for multiple
+ * purpose between different devices.
+ *
+ * @dmabuf [in]     dmabuf buffer that will be renamed.
+ * @buf:   [in]     A piece of userspace memory that contains the name of
+ *                  the dma-buf.
+ *
+ * Returns 0 on success. If the dma-buf buffer is already attached to
+ * devices, return -EBUSY.
+ *
+ */
+static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf)
+{
+	char *name = strndup_user(buf, DMA_BUF_NAME_LEN);
+	long ret = 0;
+
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	mutex_lock(&dmabuf->lock);
+	if (!list_empty(&dmabuf->attachments)) {
+		ret = -EBUSY;
+		kfree(name);
+		goto out_unlock;
+	}
+	kfree(dmabuf->name);
+	dmabuf->name = name;
+
+out_unlock:
+	mutex_unlock(&dmabuf->lock);
+	return ret;
+}
+
 static long dma_buf_ioctl(struct file *file,
 			  unsigned int cmd, unsigned long arg)
 {
@@ -339,6 +392,10 @@ static long dma_buf_ioctl(struct file *file,
 			ret = dma_buf_begin_cpu_access(dmabuf, direction);
 
 		return ret;
+
+	case DMA_BUF_SET_NAME:
+		return dma_buf_set_name(dmabuf, (const char __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -380,6 +437,7 @@ static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags)
 		goto err_alloc_file;
 	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
 	file->private_data = dmabuf;
+	file->f_path.dentry->d_fsdata = dmabuf;
 
 	return file;
 
@@ -1112,12 +1170,13 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 			continue;
 		}
 
-		seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\n",
+		seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n",
 				buf_obj->size,
 				buf_obj->file->f_flags, buf_obj->file->f_mode,
 				file_count(buf_obj->file),
 				buf_obj->exp_name,
-				file_inode(buf_obj->file)->i_ino);
+				file_inode(buf_obj->file)->i_ino,
+				buf_obj->name ?: "");
 
 		robj = buf_obj->resv;
 		while (true) {
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 8a327566d7f4..01ad5b942a6f 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -280,10 +280,12 @@ struct dma_buf_ops {
  * @file: file pointer used for sharing buffers across, and for refcounting.
  * @attachments: list of dma_buf_attachment that denotes all devices attached.
  * @ops: dma_buf_ops associated with this buffer object.
- * @lock: used internally to serialize list manipulation, attach/detach and vmap/unmap
+ * @lock: used internally to serialize list manipulation, attach/detach and
+ *        vmap/unmap, and accesses to name
  * @vmapping_counter: used internally to refcnt the vmaps
  * @vmap_ptr: the current vmap ptr if vmapping_counter > 0
  * @exp_name: name of the exporter; useful for debugging.
+ * @name: userspace-provided name; useful for accounting and debugging.
  * @owner: pointer to exporter module; used for refcounting when exporter is a
  *         kernel module.
  * @list_node: node for dma_buf accounting and debugging.
@@ -311,6 +313,7 @@ struct dma_buf {
 	unsigned vmapping_counter;
 	void *vmap_ptr;
 	const char *exp_name;
+	const char *name;
 	struct module *owner;
 	struct list_head list_node;
 	void *priv;
diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h
index d75df5210a4a..dbc7092e04b5 100644
--- a/include/uapi/linux/dma-buf.h
+++ b/include/uapi/linux/dma-buf.h
@@ -35,7 +35,10 @@ struct dma_buf_sync {
 #define DMA_BUF_SYNC_VALID_FLAGS_MASK \
 	(DMA_BUF_SYNC_RW | DMA_BUF_SYNC_END)
 
+#define DMA_BUF_NAME_LEN	32
+
 #define DMA_BUF_BASE		'b'
 #define DMA_BUF_IOCTL_SYNC	_IOW(DMA_BUF_BASE, 0, struct dma_buf_sync)
+#define DMA_BUF_SET_NAME	_IOW(DMA_BUF_BASE, 1, const char *)
 
 #endif
-- 
cgit v1.2.3


From f208b26e61df411cf0bee413bee57ff434e8b38a Mon Sep 17 00:00:00 2001
From: Steve Longerbeam <slongerbeam@gmail.com>
Date: Tue, 21 May 2019 18:03:14 -0700
Subject: gpu: ipu-v3: ipu-ic: Fully describe colorspace conversions

Only providing the input and output RGB/YUV space to the IC task init
functions is not sufficient. To fully characterize a colorspace
conversion, the Y'CbCr encoding standard, and quantization also
need to be specified.

Define a 'struct ipu_ic_colorspace' that includes all the above.

This allows to actually enforce the fact that the IC:

- can only encode to/from YUV and RGB full range. A follow-up patch will
  remove this restriction.
- can only encode using BT.601 standard. A follow-up patch will add
  Rec.709 encoding support.

The determination of the CSC coefficients based on the input/output
'struct ipu_ic_colorspace' are moved to a new exported function
ipu_ic_calc_csc(), and 'struct ic_csc_params' is exported as
'struct ipu_ic_csc_params'. ipu_ic_calc_csc() fills a 'struct ipu_ic_csc'
with the input/output 'struct ipu_ic_colorspace' and the calculated
'struct ic_csc_params' from those input/output colorspaces.

The functions ipu_ic_task_init(_rsc)() now take a filled 'struct
ipu_ic_csc'.

The existing CSC coefficient tables and ipu_ic_calc_csc() are moved
to a new module ipu-ic-csc.c. This is in preparation for adding more
coefficient tables for limited range quantization and more encoding
standards.

The existing ycbcr2rgb and inverse rgb2ycbcr tables defined the BT.601
Y'CbCr encoding coefficients. The rgb2ycbcr table specifically described
the BT.601 encoding from full range RGB to full range YUV. Table
comments have been added in ipu-ic-csc.c to make this more clear.

The ycbcr2rgb inverse table described encoding YUV limited range to RGB
full range. To be consistent with the rgb2ycbcr table, this table is
converted to YUV full range to RGB full range, and the comments are
expanded in ipu-ic-csc.c.

The ic_csc_rgb2rgb table was just an identity matrix, so it is renamed
'identity' in ipu-ic-csc.c.

Signed-off-by: Steve Longerbeam <slongerbeam@gmail.com>
[p.zabel@pengutronix.de: removed a superfluous blank line]
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/gpu/ipu-v3/Makefile                 |   4 +-
 drivers/gpu/ipu-v3/ipu-ic-csc.c             | 128 ++++++++++++++++++++++++++
 drivers/gpu/ipu-v3/ipu-ic.c                 | 138 +++++++++-------------------
 drivers/gpu/ipu-v3/ipu-image-convert.c      |  28 +++---
 drivers/staging/media/imx/imx-ic-prpencvf.c |  34 +++++--
 include/video/imx-ipu-v3.h                  |  56 +++++++++--
 6 files changed, 270 insertions(+), 118 deletions(-)
 create mode 100644 drivers/gpu/ipu-v3/ipu-ic-csc.c

(limited to 'include')

diff --git a/drivers/gpu/ipu-v3/Makefile b/drivers/gpu/ipu-v3/Makefile
index 7cc8b47e488b..5fe5ef20701a 100644
--- a/drivers/gpu/ipu-v3/Makefile
+++ b/drivers/gpu/ipu-v3/Makefile
@@ -2,8 +2,8 @@
 obj-$(CONFIG_IMX_IPUV3_CORE) += imx-ipu-v3.o
 
 imx-ipu-v3-objs := ipu-common.o ipu-cpmem.o ipu-csi.o ipu-dc.o ipu-di.o \
-		ipu-dp.o ipu-dmfc.o ipu-ic.o ipu-image-convert.o \
-		ipu-smfc.o ipu-vdi.o
+		ipu-dp.o ipu-dmfc.o ipu-ic.o ipu-ic-csc.o \
+		ipu-image-convert.o ipu-smfc.o ipu-vdi.o
 
 ifdef CONFIG_DRM
 	imx-ipu-v3-objs += ipu-pre.o ipu-prg.o
diff --git a/drivers/gpu/ipu-v3/ipu-ic-csc.c b/drivers/gpu/ipu-v3/ipu-ic-csc.c
new file mode 100644
index 000000000000..a5a26fe37608
--- /dev/null
+++ b/drivers/gpu/ipu-v3/ipu-ic-csc.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Mentor Graphics Inc.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/sizes.h>
+#include "ipu-prv.h"
+
+/* identity matrix */
+static const struct ipu_ic_csc_params identity = {
+	.coeff = {
+		{  128,    0,    0, },
+		{    0,  128,    0, },
+		{    0,    0,  128, },
+	},
+	.offset = { 0, 0, 0, },
+	.scale = 2,
+};
+
+static const struct ipu_ic_csc_params *rgb2rgb[] = {
+	&identity,
+};
+
+static const struct ipu_ic_csc_params *yuv2yuv[] = {
+	&identity,
+};
+
+/*
+ * BT.601 RGB full-range to YUV full-range
+ *
+ * Y =  .2990 * R + .5870 * G + .1140 * B
+ * U = -.1687 * R - .3313 * G + .5000 * B + 128
+ * V =  .5000 * R - .4187 * G - .0813 * B + 128
+ */
+static const struct ipu_ic_csc_params rgbf2yuvf_601 = {
+	.coeff = {
+		{   77,  150,   29, },
+		{  -43,  -85,  128, },
+		{  128, -107,  -21, },
+	},
+	.offset = { 0, 512, 512, },
+	.scale = 1,
+};
+
+/*
+ * BT.601 YUV full-range to RGB full-range
+ *
+ * R = 1. * Y +      0 * (Cb - 128) + 1.4020 * (Cr - 128)
+ * G = 1. * Y -  .3441 * (Cb - 128) -  .7141 * (Cr - 128)
+ * B = 1. * Y + 1.7720 * (Cb - 128) +      0 * (Cr - 128)
+ *
+ * equivalently (factoring out the offsets):
+ *
+ * R = 1. * Y  +      0 * Cb + 1.4020 * Cr - 179.456
+ * G = 1. * Y  -  .3441 * Cb -  .7141 * Cr + 135.450
+ * B = 1. * Y  + 1.7720 * Cb +      0 * Cr - 226.816
+ */
+static const struct ipu_ic_csc_params yuvf2rgbf_601 = {
+	.coeff = {
+		{  128,    0,  179, },
+		{  128,  -44,  -91, },
+		{  128,  227,    0, },
+	},
+	.offset = { -359, 271, -454, },
+	.scale = 2,
+};
+
+static const struct ipu_ic_csc_params *rgb2yuv_601[] = {
+	&rgbf2yuvf_601,
+};
+
+static const struct ipu_ic_csc_params *yuv2rgb_601[] = {
+	&yuvf2rgbf_601,
+};
+
+static int calc_csc_coeffs(struct ipu_ic_csc *csc)
+{
+	if (csc->out_cs.enc != V4L2_YCBCR_ENC_601)
+		return -ENOTSUPP;
+
+	if ((csc->in_cs.cs == IPUV3_COLORSPACE_YUV &&
+	     csc->in_cs.quant != V4L2_QUANTIZATION_FULL_RANGE) ||
+	    (csc->out_cs.cs == IPUV3_COLORSPACE_YUV &&
+	     csc->out_cs.quant != V4L2_QUANTIZATION_FULL_RANGE))
+		return -ENOTSUPP;
+
+	if ((csc->in_cs.cs == IPUV3_COLORSPACE_RGB &&
+	     csc->in_cs.quant != V4L2_QUANTIZATION_FULL_RANGE) ||
+	    (csc->out_cs.cs == IPUV3_COLORSPACE_RGB &&
+	     csc->out_cs.quant != V4L2_QUANTIZATION_FULL_RANGE))
+		return -ENOTSUPP;
+
+	if (csc->in_cs.cs == csc->out_cs.cs) {
+		csc->params = (csc->in_cs.cs == IPUV3_COLORSPACE_YUV) ?
+			*yuv2yuv[0] : *rgb2rgb[0];
+		return 0;
+	}
+
+	csc->params = (csc->in_cs.cs == IPUV3_COLORSPACE_YUV) ?
+		*yuv2rgb_601[0] : *rgb2yuv_601[0];
+
+	return 0;
+}
+
+int __ipu_ic_calc_csc(struct ipu_ic_csc *csc)
+{
+	return calc_csc_coeffs(csc);
+}
+EXPORT_SYMBOL_GPL(__ipu_ic_calc_csc);
+
+int ipu_ic_calc_csc(struct ipu_ic_csc *csc,
+		    enum v4l2_ycbcr_encoding in_enc,
+		    enum v4l2_quantization in_quant,
+		    enum ipu_color_space in_cs,
+		    enum v4l2_ycbcr_encoding out_enc,
+		    enum v4l2_quantization out_quant,
+		    enum ipu_color_space out_cs)
+{
+	ipu_ic_fill_colorspace(&csc->in_cs, in_enc, in_quant, in_cs);
+	ipu_ic_fill_colorspace(&csc->out_cs, out_enc, out_quant, out_cs);
+
+	return __ipu_ic_calc_csc(csc);
+}
+EXPORT_SYMBOL_GPL(ipu_ic_calc_csc);
diff --git a/drivers/gpu/ipu-v3/ipu-ic.c b/drivers/gpu/ipu-v3/ipu-ic.c
index 3428b0e72bc5..846461bac70d 100644
--- a/drivers/gpu/ipu-v3/ipu-ic.c
+++ b/drivers/gpu/ipu-v3/ipu-ic.c
@@ -140,8 +140,10 @@ struct ipu_ic {
 	const struct ic_task_regoffs *reg;
 	const struct ic_task_bitfields *bit;
 
-	enum ipu_color_space in_cs, g_in_cs;
-	enum ipu_color_space out_cs;
+	struct ipu_ic_colorspace in_cs;
+	struct ipu_ic_colorspace g_in_cs;
+	struct ipu_ic_colorspace out_cs;
+
 	bool graphics;
 	bool rotation;
 	bool in_use;
@@ -169,60 +171,11 @@ static inline void ipu_ic_write(struct ipu_ic *ic, u32 value, unsigned offset)
 	writel(value, ic->priv->base + offset);
 }
 
-struct ic_csc_params {
-	s16 coeff[3][3];	/* signed 9-bit integer coefficients */
-	s16 offset[3];		/* signed 11+2-bit fixed point offset */
-	u8 scale:2;		/* scale coefficients * 2^(scale-1) */
-	bool sat:1;		/* saturate to (16, 235(Y) / 240(U, V)) */
-};
-
-/*
- * Y = R *  .299 + G *  .587 + B *  .114;
- * U = R * -.169 + G * -.332 + B *  .500 + 128.;
- * V = R *  .500 + G * -.419 + B * -.0813 + 128.;
- */
-static const struct ic_csc_params ic_csc_rgb2ycbcr = {
-	.coeff = {
-		{ 77, 150, 29 },
-		{ 469, 427, 128 },
-		{ 128, 405, 491 },
-	},
-	.offset = { 0, 512, 512 },
-	.scale = 1,
-};
-
-/* transparent RGB->RGB matrix for graphics combining */
-static const struct ic_csc_params ic_csc_rgb2rgb = {
-	.coeff = {
-		{ 128, 0, 0 },
-		{ 0, 128, 0 },
-		{ 0, 0, 128 },
-	},
-	.scale = 2,
-};
-
-/*
- * R = (1.164 * (Y - 16)) + (1.596 * (Cr - 128));
- * G = (1.164 * (Y - 16)) - (0.392 * (Cb - 128)) - (0.813 * (Cr - 128));
- * B = (1.164 * (Y - 16)) + (2.017 * (Cb - 128);
- */
-static const struct ic_csc_params ic_csc_ycbcr2rgb = {
-	.coeff = {
-		{ 149, 0, 204 },
-		{ 149, 462, 408 },
-		{ 149, 255, 0 },
-	},
-	.offset = { -446, 266, -554 },
-	.scale = 2,
-};
-
 static int init_csc(struct ipu_ic *ic,
-		    enum ipu_color_space inf,
-		    enum ipu_color_space outf,
+		    const struct ipu_ic_csc *csc,
 		    int csc_index)
 {
 	struct ipu_ic_priv *priv = ic->priv;
-	const struct ic_csc_params *params;
 	u32 __iomem *base;
 	const u16 (*c)[3];
 	const u16 *a;
@@ -231,27 +184,16 @@ static int init_csc(struct ipu_ic *ic,
 	base = (u32 __iomem *)
 		(priv->tpmem_base + ic->reg->tpmem_csc[csc_index]);
 
-	if (inf == IPUV3_COLORSPACE_YUV && outf == IPUV3_COLORSPACE_RGB)
-		params = &ic_csc_ycbcr2rgb;
-	else if (inf == IPUV3_COLORSPACE_RGB && outf == IPUV3_COLORSPACE_YUV)
-		params = &ic_csc_rgb2ycbcr;
-	else if (inf == IPUV3_COLORSPACE_RGB && outf == IPUV3_COLORSPACE_RGB)
-		params = &ic_csc_rgb2rgb;
-	else {
-		dev_err(priv->ipu->dev, "Unsupported color space conversion\n");
-		return -EINVAL;
-	}
-
 	/* Cast to unsigned */
-	c = (const u16 (*)[3])params->coeff;
-	a = (const u16 *)params->offset;
+	c = (const u16 (*)[3])csc->params.coeff;
+	a = (const u16 *)csc->params.offset;
 
 	param = ((a[0] & 0x1f) << 27) | ((c[0][0] & 0x1ff) << 18) |
 		((c[1][1] & 0x1ff) << 9) | (c[2][2] & 0x1ff);
 	writel(param, base++);
 
-	param = ((a[0] & 0x1fe0) >> 5) | (params->scale << 8) |
-		(params->sat << 10);
+	param = ((a[0] & 0x1fe0) >> 5) | (csc->params.scale << 8) |
+		(csc->params.sat << 10);
 	writel(param, base++);
 
 	param = ((a[1] & 0x1f) << 27) | ((c[0][1] & 0x1ff) << 18) |
@@ -338,14 +280,14 @@ void ipu_ic_task_enable(struct ipu_ic *ic)
 	if (ic->rotation)
 		ic_conf |= ic->bit->ic_conf_rot_en;
 
-	if (ic->in_cs != ic->out_cs)
+	if (ic->in_cs.cs != ic->out_cs.cs)
 		ic_conf |= ic->bit->ic_conf_csc1_en;
 
 	if (ic->graphics) {
 		ic_conf |= ic->bit->ic_conf_cmb_en;
 		ic_conf |= ic->bit->ic_conf_csc1_en;
 
-		if (ic->g_in_cs != ic->out_cs)
+		if (ic->g_in_cs.cs != ic->out_cs.cs)
 			ic_conf |= ic->bit->ic_conf_csc2_en;
 	}
 
@@ -380,11 +322,12 @@ void ipu_ic_task_disable(struct ipu_ic *ic)
 EXPORT_SYMBOL_GPL(ipu_ic_task_disable);
 
 int ipu_ic_task_graphics_init(struct ipu_ic *ic,
-			      enum ipu_color_space in_g_cs,
+			      const struct ipu_ic_colorspace *g_in_cs,
 			      bool galpha_en, u32 galpha,
 			      bool colorkey_en, u32 colorkey)
 {
 	struct ipu_ic_priv *priv = ic->priv;
+	struct ipu_ic_csc csc2;
 	unsigned long flags;
 	u32 reg, ic_conf;
 	int ret = 0;
@@ -397,21 +340,36 @@ int ipu_ic_task_graphics_init(struct ipu_ic *ic,
 	ic_conf = ipu_ic_read(ic, IC_CONF);
 
 	if (!(ic_conf & ic->bit->ic_conf_csc1_en)) {
-		/* need transparent CSC1 conversion */
-		ret = init_csc(ic, IPUV3_COLORSPACE_RGB,
-			       IPUV3_COLORSPACE_RGB, 0);
+		struct ipu_ic_csc csc1;
+
+		ret = ipu_ic_calc_csc(&csc1,
+				      V4L2_YCBCR_ENC_601,
+				      V4L2_QUANTIZATION_FULL_RANGE,
+				      IPUV3_COLORSPACE_RGB,
+				      V4L2_YCBCR_ENC_601,
+				      V4L2_QUANTIZATION_FULL_RANGE,
+				      IPUV3_COLORSPACE_RGB);
 		if (ret)
 			goto unlock;
-	}
-
-	ic->g_in_cs = in_g_cs;
 
-	if (ic->g_in_cs != ic->out_cs) {
-		ret = init_csc(ic, ic->g_in_cs, ic->out_cs, 1);
+		/* need transparent CSC1 conversion */
+		ret = init_csc(ic, &csc1, 0);
 		if (ret)
 			goto unlock;
 	}
 
+	ic->g_in_cs = *g_in_cs;
+	csc2.in_cs = ic->g_in_cs;
+	csc2.out_cs = ic->out_cs;
+
+	ret = __ipu_ic_calc_csc(&csc2);
+	if (ret)
+		goto unlock;
+
+	ret = init_csc(ic, &csc2, 1);
+	if (ret)
+		goto unlock;
+
 	if (galpha_en) {
 		ic_conf |= IC_CONF_IC_GLB_LOC_A;
 		reg = ipu_ic_read(ic, IC_CMBP_1);
@@ -437,10 +395,9 @@ unlock:
 EXPORT_SYMBOL_GPL(ipu_ic_task_graphics_init);
 
 int ipu_ic_task_init_rsc(struct ipu_ic *ic,
+			 const struct ipu_ic_csc *csc,
 			 int in_width, int in_height,
 			 int out_width, int out_height,
-			 enum ipu_color_space in_cs,
-			 enum ipu_color_space out_cs,
 			 u32 rsc)
 {
 	struct ipu_ic_priv *priv = ic->priv;
@@ -472,28 +429,23 @@ int ipu_ic_task_init_rsc(struct ipu_ic *ic,
 	ipu_ic_write(ic, rsc, ic->reg->rsc);
 
 	/* Setup color space conversion */
-	ic->in_cs = in_cs;
-	ic->out_cs = out_cs;
+	ic->in_cs = csc->in_cs;
+	ic->out_cs = csc->out_cs;
 
-	if (ic->in_cs != ic->out_cs) {
-		ret = init_csc(ic, ic->in_cs, ic->out_cs, 0);
-		if (ret)
-			goto unlock;
-	}
+	ret = init_csc(ic, csc, 0);
 
-unlock:
 	spin_unlock_irqrestore(&priv->lock, flags);
 	return ret;
 }
 
 int ipu_ic_task_init(struct ipu_ic *ic,
+		     const struct ipu_ic_csc *csc,
 		     int in_width, int in_height,
-		     int out_width, int out_height,
-		     enum ipu_color_space in_cs,
-		     enum ipu_color_space out_cs)
+		     int out_width, int out_height)
 {
-	return ipu_ic_task_init_rsc(ic, in_width, in_height, out_width,
-				    out_height, in_cs, out_cs, 0);
+	return ipu_ic_task_init_rsc(ic, csc,
+				    in_width, in_height,
+				    out_width, out_height, 0);
 }
 EXPORT_SYMBOL_GPL(ipu_ic_task_init);
 
diff --git a/drivers/gpu/ipu-v3/ipu-image-convert.c b/drivers/gpu/ipu-v3/ipu-image-convert.c
index 36e88434513a..d2457fde25fa 100644
--- a/drivers/gpu/ipu-v3/ipu-image-convert.c
+++ b/drivers/gpu/ipu-v3/ipu-image-convert.c
@@ -146,6 +146,7 @@ struct ipu_image_convert_ctx {
 	/* Source/destination image data and rotation mode */
 	struct ipu_image_convert_image in;
 	struct ipu_image_convert_image out;
+	struct ipu_ic_csc csc;
 	enum ipu_rotate_mode rot_mode;
 	u32 downsize_coeff_h;
 	u32 downsize_coeff_v;
@@ -1308,7 +1309,6 @@ static int convert_start(struct ipu_image_convert_run *run, unsigned int tile)
 	struct ipu_image_convert_priv *priv = chan->priv;
 	struct ipu_image_convert_image *s_image = &ctx->in;
 	struct ipu_image_convert_image *d_image = &ctx->out;
-	enum ipu_color_space src_cs, dest_cs;
 	unsigned int dst_tile = ctx->out_tile_map[tile];
 	unsigned int dest_width, dest_height;
 	unsigned int col, row;
@@ -1318,9 +1318,6 @@ static int convert_start(struct ipu_image_convert_run *run, unsigned int tile)
 	dev_dbg(priv->ipu->dev, "%s: task %u: starting ctx %p run %p tile %u -> %u\n",
 		__func__, chan->ic_task, ctx, run, tile, dst_tile);
 
-	src_cs = ipu_pixelformat_to_colorspace(s_image->fmt->fourcc);
-	dest_cs = ipu_pixelformat_to_colorspace(d_image->fmt->fourcc);
-
 	if (ipu_rot_mode_is_irt(ctx->rot_mode)) {
 		/* swap width/height for resizer */
 		dest_width = d_image->tile[dst_tile].height;
@@ -1343,13 +1340,12 @@ static int convert_start(struct ipu_image_convert_run *run, unsigned int tile)
 		s_image->tile[tile].height, dest_width, dest_height, rsc);
 
 	/* setup the IC resizer and CSC */
-	ret = ipu_ic_task_init_rsc(chan->ic,
-			       s_image->tile[tile].width,
-			       s_image->tile[tile].height,
-			       dest_width,
-			       dest_height,
-			       src_cs, dest_cs,
-			       rsc);
+	ret = ipu_ic_task_init_rsc(chan->ic, &ctx->csc,
+				   s_image->tile[tile].width,
+				   s_image->tile[tile].height,
+				   dest_width,
+				   dest_height,
+				   rsc);
 	if (ret) {
 		dev_err(priv->ipu->dev, "ipu_ic_task_init failed, %d\n", ret);
 		return ret;
@@ -2049,6 +2045,16 @@ ipu_image_convert_prepare(struct ipu_soc *ipu, enum ipu_ic_task ic_task,
 
 	calc_tile_resize_coefficients(ctx);
 
+	ret = ipu_ic_calc_csc(&ctx->csc,
+			s_image->base.pix.ycbcr_enc,
+			s_image->base.pix.quantization,
+			ipu_pixelformat_to_colorspace(s_image->fmt->fourcc),
+			d_image->base.pix.ycbcr_enc,
+			d_image->base.pix.quantization,
+			ipu_pixelformat_to_colorspace(d_image->fmt->fourcc));
+	if (ret)
+		goto out_free;
+
 	dump_format(ctx, s_image);
 	dump_format(ctx, d_image);
 
diff --git a/drivers/staging/media/imx/imx-ic-prpencvf.c b/drivers/staging/media/imx/imx-ic-prpencvf.c
index 64037b0a8387..e8b36a181ccc 100644
--- a/drivers/staging/media/imx/imx-ic-prpencvf.c
+++ b/drivers/staging/media/imx/imx-ic-prpencvf.c
@@ -456,6 +456,7 @@ static int prp_setup_rotation(struct prp_priv *priv)
 	const struct imx_media_pixfmt *outcc, *incc;
 	struct v4l2_mbus_framefmt *infmt;
 	struct v4l2_pix_format *outfmt;
+	struct ipu_ic_csc csc;
 	dma_addr_t phys[2];
 	int ret;
 
@@ -464,6 +465,17 @@ static int prp_setup_rotation(struct prp_priv *priv)
 	incc = priv->cc[PRPENCVF_SINK_PAD];
 	outcc = vdev->cc;
 
+	ret = ipu_ic_calc_csc(&csc,
+			      infmt->ycbcr_enc, infmt->quantization,
+			      incc->cs,
+			      outfmt->ycbcr_enc, outfmt->quantization,
+			      outcc->cs);
+	if (ret) {
+		v4l2_err(&ic_priv->sd, "ipu_ic_calc_csc failed, %d\n",
+			 ret);
+		return ret;
+	}
+
 	ret = imx_media_alloc_dma_buf(priv->md, &priv->rot_buf[0],
 				      outfmt->sizeimage);
 	if (ret) {
@@ -477,10 +489,9 @@ static int prp_setup_rotation(struct prp_priv *priv)
 		goto free_rot0;
 	}
 
-	ret = ipu_ic_task_init(priv->ic,
+	ret = ipu_ic_task_init(priv->ic, &csc,
 			       infmt->width, infmt->height,
-			       outfmt->height, outfmt->width,
-			       incc->cs, outcc->cs);
+			       outfmt->height, outfmt->width);
 	if (ret) {
 		v4l2_err(&ic_priv->sd, "ipu_ic_task_init failed, %d\n", ret);
 		goto free_rot1;
@@ -572,6 +583,7 @@ static int prp_setup_norotation(struct prp_priv *priv)
 	const struct imx_media_pixfmt *outcc, *incc;
 	struct v4l2_mbus_framefmt *infmt;
 	struct v4l2_pix_format *outfmt;
+	struct ipu_ic_csc csc;
 	dma_addr_t phys[2];
 	int ret;
 
@@ -580,10 +592,20 @@ static int prp_setup_norotation(struct prp_priv *priv)
 	incc = priv->cc[PRPENCVF_SINK_PAD];
 	outcc = vdev->cc;
 
-	ret = ipu_ic_task_init(priv->ic,
+	ret = ipu_ic_calc_csc(&csc,
+			      infmt->ycbcr_enc, infmt->quantization,
+			      incc->cs,
+			      outfmt->ycbcr_enc, outfmt->quantization,
+			      outcc->cs);
+	if (ret) {
+		v4l2_err(&ic_priv->sd, "ipu_ic_calc_csc failed, %d\n",
+			 ret);
+		return ret;
+	}
+
+	ret = ipu_ic_task_init(priv->ic, &csc,
 			       infmt->width, infmt->height,
-			       outfmt->width, outfmt->height,
-			       incc->cs, outcc->cs);
+			       outfmt->width, outfmt->height);
 	if (ret) {
 		v4l2_err(&ic_priv->sd, "ipu_ic_task_init failed, %d\n", ret);
 		return ret;
diff --git a/include/video/imx-ipu-v3.h b/include/video/imx-ipu-v3.h
index b03fafa1ff58..06b0b57e996c 100644
--- a/include/video/imx-ipu-v3.h
+++ b/include/video/imx-ipu-v3.h
@@ -387,20 +387,64 @@ enum ipu_ic_task {
 	IC_NUM_TASKS,
 };
 
+/*
+ * The parameters that describe a colorspace according to the
+ * Image Converter:
+ *    - Y'CbCr encoding
+ *    - quantization
+ *    - "colorspace" (RGB or YUV).
+ */
+struct ipu_ic_colorspace {
+	enum v4l2_ycbcr_encoding enc;
+	enum v4l2_quantization quant;
+	enum ipu_color_space cs;
+};
+
+static inline void
+ipu_ic_fill_colorspace(struct ipu_ic_colorspace *ic_cs,
+		       enum v4l2_ycbcr_encoding enc,
+		       enum v4l2_quantization quant,
+		       enum ipu_color_space cs)
+{
+	ic_cs->enc = enc;
+	ic_cs->quant = quant;
+	ic_cs->cs = cs;
+}
+
+struct ipu_ic_csc_params {
+	s16 coeff[3][3];	/* signed 9-bit integer coefficients */
+	s16 offset[3];		/* signed 11+2-bit fixed point offset */
+	u8 scale:2;		/* scale coefficients * 2^(scale-1) */
+	bool sat:1;		/* saturate to (16, 235(Y) / 240(U, V)) */
+};
+
+struct ipu_ic_csc {
+	struct ipu_ic_colorspace in_cs;
+	struct ipu_ic_colorspace out_cs;
+	struct ipu_ic_csc_params params;
+};
+
 struct ipu_ic;
+
+int __ipu_ic_calc_csc(struct ipu_ic_csc *csc);
+int ipu_ic_calc_csc(struct ipu_ic_csc *csc,
+		    enum v4l2_ycbcr_encoding in_enc,
+		    enum v4l2_quantization in_quant,
+		    enum ipu_color_space in_cs,
+		    enum v4l2_ycbcr_encoding out_enc,
+		    enum v4l2_quantization out_quant,
+		    enum ipu_color_space out_cs);
 int ipu_ic_task_init(struct ipu_ic *ic,
+		     const struct ipu_ic_csc *csc,
 		     int in_width, int in_height,
-		     int out_width, int out_height,
-		     enum ipu_color_space in_cs,
-		     enum ipu_color_space out_cs);
+		     int out_width, int out_height);
 int ipu_ic_task_init_rsc(struct ipu_ic *ic,
+			 const struct ipu_ic_csc *csc,
 			 int in_width, int in_height,
 			 int out_width, int out_height,
-			 enum ipu_color_space in_cs,
-			 enum ipu_color_space out_cs,
 			 u32 rsc);
 int ipu_ic_task_graphics_init(struct ipu_ic *ic,
-			      enum ipu_color_space in_g_cs,
+			      const struct ipu_ic_colorspace *g_in_cs,
 			      bool galpha_en, u32 galpha,
 			      bool colorkey_en, u32 colorkey);
 void ipu_ic_task_enable(struct ipu_ic *ic);
-- 
cgit v1.2.3


From 7974033e527a5dd12d96126d09d4cff4f9b65c69 Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Tue, 28 May 2019 17:06:49 +0300
Subject: drm/dp: Add DP_DPCD_QUIRK_NO_SINK_COUNT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CH7511 eDP->LVDS bridge doesn't seem to set SINK_COUNT properly
causing i915 to detect it as disconnected. Add a quirk to ignore
SINK_COUNT on these devices.

Cc: David S. <david@majinbuu.com>
Cc: Peteris Rudzusiks <peteris.rudzusiks@gmail.com>
Tested-by: Peteris Rudzusiks <peteris.rudzusiks@gmail.com>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105406
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190528140650.19230-1-ville.syrjala@linux.intel.com
Acked-by: Jani Nikula <jani.nikula@intel.com> #irc
---
 drivers/gpu/drm/drm_dp_helper.c | 4 +++-
 include/drm/drm_dp_helper.h     | 7 +++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_dp_helper.c b/drivers/gpu/drm/drm_dp_helper.c
index e6af758a7d22..0b994d083a89 100644
--- a/drivers/gpu/drm/drm_dp_helper.c
+++ b/drivers/gpu/drm/drm_dp_helper.c
@@ -1280,7 +1280,9 @@ static const struct dpcd_quirk dpcd_quirk_list[] = {
 	/* LG LP140WF6-SPM1 eDP panel */
 	{ OUI(0x00, 0x22, 0xb9), DEVICE_ID('s', 'i', 'v', 'a', 'r', 'T'), false, BIT(DP_DPCD_QUIRK_CONSTANT_N) },
 	/* Apple panels need some additional handling to support PSR */
-	{ OUI(0x00, 0x10, 0xfa), DEVICE_ID_ANY, false, BIT(DP_DPCD_QUIRK_NO_PSR) }
+	{ OUI(0x00, 0x10, 0xfa), DEVICE_ID_ANY, false, BIT(DP_DPCD_QUIRK_NO_PSR) },
+	/* CH7511 seems to leave SINK_COUNT zeroed */
+	{ OUI(0x00, 0x00, 0x00), DEVICE_ID('C', 'H', '7', '5', '1', '1'), false, BIT(DP_DPCD_QUIRK_NO_SINK_COUNT) },
 };
 
 #undef OUI
diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
index 3fc534ee8174..7e52eb81284a 100644
--- a/include/drm/drm_dp_helper.h
+++ b/include/drm/drm_dp_helper.h
@@ -1414,6 +1414,13 @@ enum drm_dp_quirk {
 	 * driver still need to implement proper handling for such device.
 	 */
 	DP_DPCD_QUIRK_NO_PSR,
+	/**
+	 * @DP_DPCD_QUIRK_NO_SINK_COUNT:
+	 *
+	 * The device does not set SINK_COUNT to a non-zero value.
+	 * The driver should ignore SINK_COUNT during detection.
+	 */
+	DP_DPCD_QUIRK_NO_SINK_COUNT,
 };
 
 /**
-- 
cgit v1.2.3


From 151f4e2bdc7a04020ae5c533896fb91a16e1f501 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Thu, 13 Jun 2019 07:10:36 -0300
Subject: docs: power: convert docs to ReST and rename to *.rst

Convert the PM documents to ReST, in order to allow them to
build with Sphinx.

The conversion is actually:
  - add blank lines and indentation in order to identify paragraphs;
  - fix tables markups;
  - add some lists markups;
  - mark literal blocks;
  - adjust title markups.

At its new index.rst, let's add a :orphan: while this is not linked to
the main index.rst file, in order to avoid build warnings.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
---
 Documentation/ABI/testing/sysfs-class-powercap     |    2 +-
 Documentation/admin-guide/kernel-parameters.txt    |    6 +-
 Documentation/cpu-freq/core.txt                    |    2 +-
 Documentation/driver-api/pm/devices.rst            |    6 +-
 Documentation/driver-api/usb/power-management.rst  |    2 +-
 Documentation/power/apm-acpi.rst                   |   36 +
 Documentation/power/apm-acpi.txt                   |   32 -
 Documentation/power/basic-pm-debugging.rst         |  269 +++++
 Documentation/power/basic-pm-debugging.txt         |  254 -----
 Documentation/power/charger-manager.rst            |  205 ++++
 Documentation/power/charger-manager.txt            |  200 ----
 Documentation/power/drivers-testing.rst            |   51 +
 Documentation/power/drivers-testing.txt            |   46 -
 Documentation/power/energy-model.rst               |  147 +++
 Documentation/power/energy-model.txt               |  144 ---
 Documentation/power/freezing-of-tasks.rst          |  244 +++++
 Documentation/power/freezing-of-tasks.txt          |  231 ----
 Documentation/power/index.rst                      |   46 +
 Documentation/power/interface.rst                  |   79 ++
 Documentation/power/interface.txt                  |   77 --
 Documentation/power/opp.rst                        |  379 +++++++
 Documentation/power/opp.txt                        |  342 ------
 Documentation/power/pci.rst                        | 1135 ++++++++++++++++++++
 Documentation/power/pci.txt                        | 1094 -------------------
 Documentation/power/pm_qos_interface.rst           |  225 ++++
 Documentation/power/pm_qos_interface.txt           |  212 ----
 Documentation/power/power_supply_class.rst         |  282 +++++
 Documentation/power/power_supply_class.txt         |  231 ----
 Documentation/power/powercap/powercap.rst          |  257 +++++
 Documentation/power/powercap/powercap.txt          |  236 ----
 Documentation/power/regulator/consumer.rst         |  229 ++++
 Documentation/power/regulator/consumer.txt         |  218 ----
 Documentation/power/regulator/design.rst           |   38 +
 Documentation/power/regulator/design.txt           |   33 -
 Documentation/power/regulator/machine.rst          |   97 ++
 Documentation/power/regulator/machine.txt          |   96 --
 Documentation/power/regulator/overview.rst         |  178 +++
 Documentation/power/regulator/overview.txt         |  171 ---
 Documentation/power/regulator/regulator.rst        |   32 +
 Documentation/power/regulator/regulator.txt        |   30 -
 Documentation/power/runtime_pm.rst                 |  940 ++++++++++++++++
 Documentation/power/runtime_pm.txt                 |  928 ----------------
 Documentation/power/s2ram.rst                      |   87 ++
 Documentation/power/s2ram.txt                      |   85 --
 Documentation/power/suspend-and-cpuhotplug.rst     |  286 +++++
 Documentation/power/suspend-and-cpuhotplug.txt     |  274 -----
 Documentation/power/suspend-and-interrupts.rst     |  137 +++
 Documentation/power/suspend-and-interrupts.txt     |  135 ---
 Documentation/power/swsusp-and-swap-files.rst      |   63 ++
 Documentation/power/swsusp-and-swap-files.txt      |   60 --
 Documentation/power/swsusp-dmcrypt.rst             |  140 +++
 Documentation/power/swsusp-dmcrypt.txt             |  138 ---
 Documentation/power/swsusp.rst                     |  501 +++++++++
 Documentation/power/swsusp.txt                     |  446 --------
 Documentation/power/tricks.rst                     |   29 +
 Documentation/power/tricks.txt                     |   27 -
 Documentation/power/userland-swsusp.rst            |  191 ++++
 Documentation/power/userland-swsusp.txt            |  170 ---
 Documentation/power/video.rst                      |  213 ++++
 Documentation/power/video.txt                      |  185 ----
 Documentation/process/submitting-drivers.rst       |    2 +-
 Documentation/scheduler/sched-energy.txt           |    6 +-
 Documentation/trace/coresight-cpu-debug.txt        |    2 +-
 .../zh_CN/process/submitting-drivers.rst           |    2 +-
 MAINTAINERS                                        |    4 +-
 arch/x86/Kconfig                                   |    2 +-
 drivers/gpu/drm/i915/i915_drv.h                    |    2 +-
 drivers/opp/Kconfig                                |    2 +-
 drivers/power/supply/power_supply_core.c           |    2 +-
 include/linux/interrupt.h                          |    2 +-
 include/linux/pci.h                                |    2 +-
 include/linux/pm.h                                 |    2 +-
 kernel/power/Kconfig                               |    6 +-
 net/wireless/Kconfig                               |    2 +-
 74 files changed, 6544 insertions(+), 6123 deletions(-)
 create mode 100644 Documentation/power/apm-acpi.rst
 delete mode 100644 Documentation/power/apm-acpi.txt
 create mode 100644 Documentation/power/basic-pm-debugging.rst
 delete mode 100644 Documentation/power/basic-pm-debugging.txt
 create mode 100644 Documentation/power/charger-manager.rst
 delete mode 100644 Documentation/power/charger-manager.txt
 create mode 100644 Documentation/power/drivers-testing.rst
 delete mode 100644 Documentation/power/drivers-testing.txt
 create mode 100644 Documentation/power/energy-model.rst
 delete mode 100644 Documentation/power/energy-model.txt
 create mode 100644 Documentation/power/freezing-of-tasks.rst
 delete mode 100644 Documentation/power/freezing-of-tasks.txt
 create mode 100644 Documentation/power/index.rst
 create mode 100644 Documentation/power/interface.rst
 delete mode 100644 Documentation/power/interface.txt
 create mode 100644 Documentation/power/opp.rst
 delete mode 100644 Documentation/power/opp.txt
 create mode 100644 Documentation/power/pci.rst
 delete mode 100644 Documentation/power/pci.txt
 create mode 100644 Documentation/power/pm_qos_interface.rst
 delete mode 100644 Documentation/power/pm_qos_interface.txt
 create mode 100644 Documentation/power/power_supply_class.rst
 delete mode 100644 Documentation/power/power_supply_class.txt
 create mode 100644 Documentation/power/powercap/powercap.rst
 delete mode 100644 Documentation/power/powercap/powercap.txt
 create mode 100644 Documentation/power/regulator/consumer.rst
 delete mode 100644 Documentation/power/regulator/consumer.txt
 create mode 100644 Documentation/power/regulator/design.rst
 delete mode 100644 Documentation/power/regulator/design.txt
 create mode 100644 Documentation/power/regulator/machine.rst
 delete mode 100644 Documentation/power/regulator/machine.txt
 create mode 100644 Documentation/power/regulator/overview.rst
 delete mode 100644 Documentation/power/regulator/overview.txt
 create mode 100644 Documentation/power/regulator/regulator.rst
 delete mode 100644 Documentation/power/regulator/regulator.txt
 create mode 100644 Documentation/power/runtime_pm.rst
 delete mode 100644 Documentation/power/runtime_pm.txt
 create mode 100644 Documentation/power/s2ram.rst
 delete mode 100644 Documentation/power/s2ram.txt
 create mode 100644 Documentation/power/suspend-and-cpuhotplug.rst
 delete mode 100644 Documentation/power/suspend-and-cpuhotplug.txt
 create mode 100644 Documentation/power/suspend-and-interrupts.rst
 delete mode 100644 Documentation/power/suspend-and-interrupts.txt
 create mode 100644 Documentation/power/swsusp-and-swap-files.rst
 delete mode 100644 Documentation/power/swsusp-and-swap-files.txt
 create mode 100644 Documentation/power/swsusp-dmcrypt.rst
 delete mode 100644 Documentation/power/swsusp-dmcrypt.txt
 create mode 100644 Documentation/power/swsusp.rst
 delete mode 100644 Documentation/power/swsusp.txt
 create mode 100644 Documentation/power/tricks.rst
 delete mode 100644 Documentation/power/tricks.txt
 create mode 100644 Documentation/power/userland-swsusp.rst
 delete mode 100644 Documentation/power/userland-swsusp.txt
 create mode 100644 Documentation/power/video.rst
 delete mode 100644 Documentation/power/video.txt

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-powercap b/Documentation/ABI/testing/sysfs-class-powercap
index db3b3ff70d84..742dfd966592 100644
--- a/Documentation/ABI/testing/sysfs-class-powercap
+++ b/Documentation/ABI/testing/sysfs-class-powercap
@@ -5,7 +5,7 @@ Contact:	linux-pm@vger.kernel.org
 Description:
 		The powercap/ class sub directory belongs to the power cap
 		subsystem. Refer to
-		Documentation/power/powercap/powercap.txt for details.
+		Documentation/power/powercap/powercap.rst for details.
 
 What:		/sys/class/powercap/<control type>
 Date:		September 2013
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 138f6664b2e2..7f5ca6e7c4d3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -13,7 +13,7 @@
 			For ARM64, ONLY "acpi=off", "acpi=on" or "acpi=force"
 			are available
 
-			See also Documentation/power/runtime_pm.txt, pci=noacpi
+			See also Documentation/power/runtime_pm.rst, pci=noacpi
 
 	acpi_apic_instance=	[ACPI, IOAPIC]
 			Format: <int>
@@ -223,7 +223,7 @@
 	acpi_sleep=	[HW,ACPI] Sleep options
 			Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
 				  old_ordering, nonvs, sci_force_enable, nobl }
-			See Documentation/power/video.txt for information on
+			See Documentation/power/video.rst for information on
 			s3_bios and s3_mode.
 			s3_beep is for debugging; it makes the PC's speaker beep
 			as soon as the kernel's real-mode entry point is called.
@@ -4108,7 +4108,7 @@
 			Specify the offset from the beginning of the partition
 			given by "resume=" at which the swap header is located,
 			in <PAGE_SIZE> units (needed only for swap files).
-			See  Documentation/power/swsusp-and-swap-files.txt
+			See  Documentation/power/swsusp-and-swap-files.rst
 
 	resumedelay=	[HIBERNATION] Delay (in seconds) to pause before attempting to
 			read the resume files
diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt
index 073f128af5a7..55193e680250 100644
--- a/Documentation/cpu-freq/core.txt
+++ b/Documentation/cpu-freq/core.txt
@@ -95,7 +95,7 @@ flags	- flags of the cpufreq driver
 
 3. CPUFreq Table Generation with Operating Performance Point (OPP)
 ==================================================================
-For details about OPP, see Documentation/power/opp.txt
+For details about OPP, see Documentation/power/opp.rst
 
 dev_pm_opp_init_cpufreq_table -
 	This function provides a ready to use conversion routine to translate
diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst
index 30835683616a..f66c7b9126ea 100644
--- a/Documentation/driver-api/pm/devices.rst
+++ b/Documentation/driver-api/pm/devices.rst
@@ -225,7 +225,7 @@ system-wide transition to a sleep state even though its :c:member:`runtime_auto`
 flag is clear.
 
 For more information about the runtime power management framework, refer to
-:file:`Documentation/power/runtime_pm.txt`.
+:file:`Documentation/power/runtime_pm.rst`.
 
 
 Calling Drivers to Enter and Leave System Sleep States
@@ -728,7 +728,7 @@ it into account in any way.
 
 Devices may be defined as IRQ-safe which indicates to the PM core that their
 runtime PM callbacks may be invoked with disabled interrupts (see
-:file:`Documentation/power/runtime_pm.txt` for more information).  If an
+:file:`Documentation/power/runtime_pm.rst` for more information).  If an
 IRQ-safe device belongs to a PM domain, the runtime PM of the domain will be
 disallowed, unless the domain itself is defined as IRQ-safe. However, it
 makes sense to define a PM domain as IRQ-safe only if all the devices in it
@@ -795,7 +795,7 @@ so on) and the final state of the device must reflect the "active" runtime PM
 status in that case.
 
 During system-wide resume from a sleep state it's easiest to put devices into
-the full-power state, as explained in :file:`Documentation/power/runtime_pm.txt`.
+the full-power state, as explained in :file:`Documentation/power/runtime_pm.rst`.
 [Refer to that document for more information regarding this particular issue as
 well as for information on the device runtime power management framework in
 general.]
diff --git a/Documentation/driver-api/usb/power-management.rst b/Documentation/driver-api/usb/power-management.rst
index 4a74cf6f2797..2525c3622cae 100644
--- a/Documentation/driver-api/usb/power-management.rst
+++ b/Documentation/driver-api/usb/power-management.rst
@@ -46,7 +46,7 @@ device is turned off while the system as a whole remains running, we
 call it a "dynamic suspend" (also known as a "runtime suspend" or
 "selective suspend").  This document concentrates mostly on how
 dynamic PM is implemented in the USB subsystem, although system PM is
-covered to some extent (see ``Documentation/power/*.txt`` for more
+covered to some extent (see ``Documentation/power/*.rst`` for more
 information about system PM).
 
 System PM support is present only if the kernel was built with
diff --git a/Documentation/power/apm-acpi.rst b/Documentation/power/apm-acpi.rst
new file mode 100644
index 000000000000..5b90d947126d
--- /dev/null
+++ b/Documentation/power/apm-acpi.rst
@@ -0,0 +1,36 @@
+============
+APM or ACPI?
+============
+
+If you have a relatively recent x86 mobile, desktop, or server system,
+odds are it supports either Advanced Power Management (APM) or
+Advanced Configuration and Power Interface (ACPI).  ACPI is the newer
+of the two technologies and puts power management in the hands of the
+operating system, allowing for more intelligent power management than
+is possible with BIOS controlled APM.
+
+The best way to determine which, if either, your system supports is to
+build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is
+enabled by default).  If a working ACPI implementation is found, the
+ACPI driver will override and disable APM, otherwise the APM driver
+will be used.
+
+No, sorry, you cannot have both ACPI and APM enabled and running at
+once.  Some people with broken ACPI or broken APM implementations
+would like to use both to get a full set of working features, but you
+simply cannot mix and match the two.  Only one power management
+interface can be in control of the machine at once.  Think about it..
+
+User-space Daemons
+------------------
+Both APM and ACPI rely on user-space daemons, apmd and acpid
+respectively, to be completely functional.  Obtain both of these
+daemons from your Linux distribution or from the Internet (see below)
+and be sure that they are started sometime in the system boot process.
+Go ahead and start both.  If ACPI or APM is not available on your
+system the associated daemon will exit gracefully.
+
+  =====  =======================================
+  apmd   http://ftp.debian.org/pool/main/a/apmd/
+  acpid  http://acpid.sf.net/
+  =====  =======================================
diff --git a/Documentation/power/apm-acpi.txt b/Documentation/power/apm-acpi.txt
deleted file mode 100644
index 6cc423d3662e..000000000000
--- a/Documentation/power/apm-acpi.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-APM or ACPI?
-------------
-If you have a relatively recent x86 mobile, desktop, or server system,
-odds are it supports either Advanced Power Management (APM) or
-Advanced Configuration and Power Interface (ACPI).  ACPI is the newer
-of the two technologies and puts power management in the hands of the
-operating system, allowing for more intelligent power management than
-is possible with BIOS controlled APM.
-
-The best way to determine which, if either, your system supports is to
-build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is
-enabled by default).  If a working ACPI implementation is found, the
-ACPI driver will override and disable APM, otherwise the APM driver
-will be used.
-
-No, sorry, you cannot have both ACPI and APM enabled and running at
-once.  Some people with broken ACPI or broken APM implementations
-would like to use both to get a full set of working features, but you
-simply cannot mix and match the two.  Only one power management
-interface can be in control of the machine at once.  Think about it..
-
-User-space Daemons
-------------------
-Both APM and ACPI rely on user-space daemons, apmd and acpid
-respectively, to be completely functional.  Obtain both of these
-daemons from your Linux distribution or from the Internet (see below)
-and be sure that they are started sometime in the system boot process.
-Go ahead and start both.  If ACPI or APM is not available on your
-system the associated daemon will exit gracefully.
-
-  apmd:   http://ftp.debian.org/pool/main/a/apmd/
-  acpid:  http://acpid.sf.net/
diff --git a/Documentation/power/basic-pm-debugging.rst b/Documentation/power/basic-pm-debugging.rst
new file mode 100644
index 000000000000..69862e759c30
--- /dev/null
+++ b/Documentation/power/basic-pm-debugging.rst
@@ -0,0 +1,269 @@
+=================================
+Debugging hibernation and suspend
+=================================
+
+	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Testing hibernation (aka suspend to disk or STD)
+===================================================
+
+To check if hibernation works, you can try to hibernate in the "reboot" mode::
+
+	# echo reboot > /sys/power/disk
+	# echo disk > /sys/power/state
+
+and the system should create a hibernation image, reboot, resume and get back to
+the command prompt where you have started the transition.  If that happens,
+hibernation is most likely to work correctly.  Still, you need to repeat the
+test at least a couple of times in a row for confidence.  [This is necessary,
+because some problems only show up on a second attempt at suspending and
+resuming the system.]  Moreover, hibernating in the "reboot" and "shutdown"
+modes causes the PM core to skip some platform-related callbacks which on ACPI
+systems might be necessary to make hibernation work.  Thus, if your machine
+fails to hibernate or resume in the "reboot" mode, you should try the
+"platform" mode::
+
+	# echo platform > /sys/power/disk
+	# echo disk > /sys/power/state
+
+which is the default and recommended mode of hibernation.
+
+Unfortunately, the "platform" mode of hibernation does not work on some systems
+with broken BIOSes.  In such cases the "shutdown" mode of hibernation might
+work::
+
+	# echo shutdown > /sys/power/disk
+	# echo disk > /sys/power/state
+
+(it is similar to the "reboot" mode, but it requires you to press the power
+button to make the system resume).
+
+If neither "platform" nor "shutdown" hibernation mode works, you will need to
+identify what goes wrong.
+
+a) Test modes of hibernation
+----------------------------
+
+To find out why hibernation fails on your system, you can use a special testing
+facility available if the kernel is compiled with CONFIG_PM_DEBUG set.  Then,
+there is the file /sys/power/pm_test that can be used to make the hibernation
+core run in a test mode.  There are 5 test modes available:
+
+freezer
+	- test the freezing of processes
+
+devices
+	- test the freezing of processes and suspending of devices
+
+platform
+	- test the freezing of processes, suspending of devices and platform
+	  global control methods [1]_
+
+processors
+	- test the freezing of processes, suspending of devices, platform
+	  global control methods [1]_ and the disabling of nonboot CPUs
+
+core
+	- test the freezing of processes, suspending of devices, platform global
+	  control methods\ [1]_, the disabling of nonboot CPUs and suspending
+	  of platform/system devices
+
+.. [1]
+
+    the platform global control methods are only available on ACPI systems
+    and are only tested if the hibernation mode is set to "platform"
+
+To use one of them it is necessary to write the corresponding string to
+/sys/power/pm_test (eg. "devices" to test the freezing of processes and
+suspending devices) and issue the standard hibernation commands.  For example,
+to use the "devices" test mode along with the "platform" mode of hibernation,
+you should do the following::
+
+	# echo devices > /sys/power/pm_test
+	# echo platform > /sys/power/disk
+	# echo disk > /sys/power/state
+
+Then, the kernel will try to freeze processes, suspend devices, wait a few
+seconds (5 by default, but configurable by the suspend.pm_test_delay module
+parameter), resume devices and thaw processes.  If "platform" is written to
+/sys/power/pm_test , then after suspending devices the kernel will additionally
+invoke the global control methods (eg. ACPI global control methods) used to
+prepare the platform firmware for hibernation.  Next, it will wait a
+configurable number of seconds and invoke the platform (eg. ACPI) global
+methods used to cancel hibernation etc.
+
+Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal
+hibernation/suspend operations.  Also, when open for reading, /sys/power/pm_test
+contains a space-separated list of all available tests (including "none" that
+represents the normal functionality) in which the current test level is
+indicated by square brackets.
+
+Generally, as you can see, each test level is more "invasive" than the previous
+one and the "core" level tests the hardware and drivers as deeply as possible
+without creating a hibernation image.  Obviously, if the "devices" test fails,
+the "platform" test will fail as well and so on.  Thus, as a rule of thumb, you
+should try the test modes starting from "freezer", through "devices", "platform"
+and "processors" up to "core" (repeat the test on each level a couple of times
+to make sure that any random factors are avoided).
+
+If the "freezer" test fails, there is a task that cannot be frozen (in that case
+it usually is possible to identify the offending task by analysing the output of
+dmesg obtained after the failing test).  Failure at this level usually means
+that there is a problem with the tasks freezer subsystem that should be
+reported.
+
+If the "devices" test fails, most likely there is a driver that cannot suspend
+or resume its device (in the latter case the system may hang or become unstable
+after the test, so please take that into consideration).  To find this driver,
+you can carry out a binary search according to the rules:
+
+- if the test fails, unload a half of the drivers currently loaded and repeat
+  (that would probably involve rebooting the system, so always note what drivers
+  have been loaded before the test),
+- if the test succeeds, load a half of the drivers you have unloaded most
+  recently and repeat.
+
+Once you have found the failing driver (there can be more than just one of
+them), you have to unload it every time before hibernation.  In that case please
+make sure to report the problem with the driver.
+
+It is also possible that the "devices" test will still fail after you have
+unloaded all modules. In that case, you may want to look in your kernel
+configuration for the drivers that can be compiled as modules (and test again
+with these drivers compiled as modules).  You may also try to use some special
+kernel command line options such as "noapic", "noacpi" or even "acpi=off".
+
+If the "platform" test fails, there is a problem with the handling of the
+platform (eg. ACPI) firmware on your system.  In that case the "platform" mode
+of hibernation is not likely to work.  You can try the "shutdown" mode, but that
+is rather a poor man's workaround.
+
+If the "processors" test fails, the disabling/enabling of nonboot CPUs does not
+work (of course, this only may be an issue on SMP systems) and the problem
+should be reported.  In that case you can also try to switch the nonboot CPUs
+off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and
+see if that works.
+
+If the "core" test fails, which means that suspending of the system/platform
+devices has failed (these devices are suspended on one CPU with interrupts off),
+the problem is most probably hardware-related and serious, so it should be
+reported.
+
+A failure of any of the "platform", "processors" or "core" tests may cause your
+system to hang or become unstable, so please beware.  Such a failure usually
+indicates a serious problem that very well may be related to the hardware, but
+please report it anyway.
+
+b) Testing minimal configuration
+--------------------------------
+
+If all of the hibernation test modes work, you can boot the system with the
+"init=/bin/bash" command line parameter and attempt to hibernate in the
+"reboot", "shutdown" and "platform" modes.  If that does not work, there
+probably is a problem with a driver statically compiled into the kernel and you
+can try to compile more drivers as modules, so that they can be tested
+individually.  Otherwise, there is a problem with a modular driver and you can
+find it by loading a half of the modules you normally use and binary searching
+in accordance with the algorithm:
+- if there are n modules loaded and the attempt to suspend and resume fails,
+unload n/2 of the modules and try again (that would probably involve rebooting
+the system),
+- if there are n modules loaded and the attempt to suspend and resume succeeds,
+load n/2 modules more and try again.
+
+Again, if you find the offending module(s), it(they) must be unloaded every time
+before hibernation, and please report the problem with it(them).
+
+c) Using the "test_resume" hibernation option
+---------------------------------------------
+
+/sys/power/disk generally tells the kernel what to do after creating a
+hibernation image.  One of the available options is "test_resume" which
+causes the just created image to be used for immediate restoration.  Namely,
+after doing::
+
+	# echo test_resume > /sys/power/disk
+	# echo disk > /sys/power/state
+
+a hibernation image will be created and a resume from it will be triggered
+immediately without involving the platform firmware in any way.
+
+That test can be used to check if failures to resume from hibernation are
+related to bad interactions with the platform firmware.  That is, if the above
+works every time, but resume from actual hibernation does not work or is
+unreliable, the platform firmware may be responsible for the failures.
+
+On architectures and platforms that support using different kernels to restore
+hibernation images (that is, the kernel used to read the image from storage and
+load it into memory is different from the one included in the image) or support
+kernel address space randomization, it also can be used to check if failures
+to resume may be related to the differences between the restore and image
+kernels.
+
+d) Advanced debugging
+---------------------
+
+In case that hibernation does not work on your system even in the minimal
+configuration and compiling more drivers as modules is not practical or some
+modules cannot be unloaded, you can use one of the more advanced debugging
+techniques to find the problem.  First, if there is a serial port in your box,
+you can boot the kernel with the 'no_console_suspend' parameter and try to log
+kernel messages using the serial console.  This may provide you with some
+information about the reasons of the suspend (resume) failure.  Alternatively,
+it may be possible to use a FireWire port for debugging with firescope
+(http://v3.sk/~lkundrak/firescope/).  On x86 it is also possible to
+use the PM_TRACE mechanism documented in Documentation/power/s2ram.rst .
+
+2. Testing suspend to RAM (STR)
+===============================
+
+To verify that the STR works, it is generally more convenient to use the s2ram
+tool available from http://suspend.sf.net and documented at
+http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK).
+
+Namely, after writing "freezer", "devices", "platform", "processors", or "core"
+into /sys/power/pm_test (available if the kernel is compiled with
+CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding
+to given string.  The STR test modes are defined in the same way as for
+hibernation, so please refer to Section 1 for more information about them.  In
+particular, the "core" test allows you to test everything except for the actual
+invocation of the platform firmware in order to put the system into the sleep
+state.
+
+Among other things, the testing with the help of /sys/power/pm_test may allow
+you to identify drivers that fail to suspend or resume their devices.  They
+should be unloaded every time before an STR transition.
+
+Next, you can follow the instructions at S2RAM_LINK to test the system, but if
+it does not work "out of the box", you may need to boot it with
+"init=/bin/bash" and test s2ram in the minimal configuration.  In that case,
+you may be able to search for failing drivers by following the procedure
+analogous to the one described in section 1.  If you find some failing drivers,
+you will have to unload them every time before an STR transition (ie. before
+you run s2ram), and please report the problems with them.
+
+There is a debugfs entry which shows the suspend to RAM statistics. Here is an
+example of its output::
+
+	# mount -t debugfs none /sys/kernel/debug
+	# cat /sys/kernel/debug/suspend_stats
+	success: 20
+	fail: 5
+	failed_freeze: 0
+	failed_prepare: 0
+	failed_suspend: 5
+	failed_suspend_noirq: 0
+	failed_resume: 0
+	failed_resume_noirq: 0
+	failures:
+	  last_failed_dev:	alarm
+				adc
+	  last_failed_errno:	-16
+				-16
+	  last_failed_step:	suspend
+				suspend
+
+Field success means the success number of suspend to RAM, and field fail means
+the failure number. Others are the failure number of different steps of suspend
+to RAM. suspend_stats just lists the last 2 failed devices, error number and
+failed step of suspend.
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
deleted file mode 100644
index 708f87f78a75..000000000000
--- a/Documentation/power/basic-pm-debugging.txt
+++ /dev/null
@@ -1,254 +0,0 @@
-Debugging hibernation and suspend
-	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-1. Testing hibernation (aka suspend to disk or STD)
-
-To check if hibernation works, you can try to hibernate in the "reboot" mode:
-
-# echo reboot > /sys/power/disk
-# echo disk > /sys/power/state
-
-and the system should create a hibernation image, reboot, resume and get back to
-the command prompt where you have started the transition.  If that happens,
-hibernation is most likely to work correctly.  Still, you need to repeat the
-test at least a couple of times in a row for confidence.  [This is necessary,
-because some problems only show up on a second attempt at suspending and
-resuming the system.]  Moreover, hibernating in the "reboot" and "shutdown"
-modes causes the PM core to skip some platform-related callbacks which on ACPI
-systems might be necessary to make hibernation work.  Thus, if your machine fails
-to hibernate or resume in the "reboot" mode, you should try the "platform" mode:
-
-# echo platform > /sys/power/disk
-# echo disk > /sys/power/state
-
-which is the default and recommended mode of hibernation.
-
-Unfortunately, the "platform" mode of hibernation does not work on some systems
-with broken BIOSes.  In such cases the "shutdown" mode of hibernation might
-work:
-
-# echo shutdown > /sys/power/disk
-# echo disk > /sys/power/state
-
-(it is similar to the "reboot" mode, but it requires you to press the power
-button to make the system resume).
-
-If neither "platform" nor "shutdown" hibernation mode works, you will need to
-identify what goes wrong.
-
-a) Test modes of hibernation
-
-To find out why hibernation fails on your system, you can use a special testing
-facility available if the kernel is compiled with CONFIG_PM_DEBUG set.  Then,
-there is the file /sys/power/pm_test that can be used to make the hibernation
-core run in a test mode.  There are 5 test modes available:
-
-freezer
-- test the freezing of processes
-
-devices
-- test the freezing of processes and suspending of devices
-
-platform
-- test the freezing of processes, suspending of devices and platform
-  global control methods(*)
-
-processors
-- test the freezing of processes, suspending of devices, platform
-  global control methods(*) and the disabling of nonboot CPUs
-
-core
-- test the freezing of processes, suspending of devices, platform global
-  control methods(*), the disabling of nonboot CPUs and suspending of
-  platform/system devices
-
-(*) the platform global control methods are only available on ACPI systems
-    and are only tested if the hibernation mode is set to "platform"
-
-To use one of them it is necessary to write the corresponding string to
-/sys/power/pm_test (eg. "devices" to test the freezing of processes and
-suspending devices) and issue the standard hibernation commands.  For example,
-to use the "devices" test mode along with the "platform" mode of hibernation,
-you should do the following:
-
-# echo devices > /sys/power/pm_test
-# echo platform > /sys/power/disk
-# echo disk > /sys/power/state
-
-Then, the kernel will try to freeze processes, suspend devices, wait a few
-seconds (5 by default, but configurable by the suspend.pm_test_delay module
-parameter), resume devices and thaw processes.  If "platform" is written to
-/sys/power/pm_test , then after suspending devices the kernel will additionally
-invoke the global control methods (eg. ACPI global control methods) used to
-prepare the platform firmware for hibernation.  Next, it will wait a
-configurable number of seconds and invoke the platform (eg. ACPI) global
-methods used to cancel hibernation etc.
-
-Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal
-hibernation/suspend operations.  Also, when open for reading, /sys/power/pm_test
-contains a space-separated list of all available tests (including "none" that
-represents the normal functionality) in which the current test level is
-indicated by square brackets.
-
-Generally, as you can see, each test level is more "invasive" than the previous
-one and the "core" level tests the hardware and drivers as deeply as possible
-without creating a hibernation image.  Obviously, if the "devices" test fails,
-the "platform" test will fail as well and so on.  Thus, as a rule of thumb, you
-should try the test modes starting from "freezer", through "devices", "platform"
-and "processors" up to "core" (repeat the test on each level a couple of times
-to make sure that any random factors are avoided).
-
-If the "freezer" test fails, there is a task that cannot be frozen (in that case
-it usually is possible to identify the offending task by analysing the output of
-dmesg obtained after the failing test).  Failure at this level usually means
-that there is a problem with the tasks freezer subsystem that should be
-reported.
-
-If the "devices" test fails, most likely there is a driver that cannot suspend
-or resume its device (in the latter case the system may hang or become unstable
-after the test, so please take that into consideration).  To find this driver,
-you can carry out a binary search according to the rules:
-- if the test fails, unload a half of the drivers currently loaded and repeat
-(that would probably involve rebooting the system, so always note what drivers
-have been loaded before the test),
-- if the test succeeds, load a half of the drivers you have unloaded most
-recently and repeat.
-
-Once you have found the failing driver (there can be more than just one of
-them), you have to unload it every time before hibernation.  In that case please
-make sure to report the problem with the driver.
-
-It is also possible that the "devices" test will still fail after you have
-unloaded all modules. In that case, you may want to look in your kernel
-configuration for the drivers that can be compiled as modules (and test again
-with these drivers compiled as modules).  You may also try to use some special
-kernel command line options such as "noapic", "noacpi" or even "acpi=off".
-
-If the "platform" test fails, there is a problem with the handling of the
-platform (eg. ACPI) firmware on your system.  In that case the "platform" mode
-of hibernation is not likely to work.  You can try the "shutdown" mode, but that
-is rather a poor man's workaround.
-
-If the "processors" test fails, the disabling/enabling of nonboot CPUs does not
-work (of course, this only may be an issue on SMP systems) and the problem
-should be reported.  In that case you can also try to switch the nonboot CPUs
-off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and
-see if that works.
-
-If the "core" test fails, which means that suspending of the system/platform
-devices has failed (these devices are suspended on one CPU with interrupts off),
-the problem is most probably hardware-related and serious, so it should be
-reported.
-
-A failure of any of the "platform", "processors" or "core" tests may cause your
-system to hang or become unstable, so please beware.  Such a failure usually
-indicates a serious problem that very well may be related to the hardware, but
-please report it anyway.
-
-b) Testing minimal configuration
-
-If all of the hibernation test modes work, you can boot the system with the
-"init=/bin/bash" command line parameter and attempt to hibernate in the
-"reboot", "shutdown" and "platform" modes.  If that does not work, there
-probably is a problem with a driver statically compiled into the kernel and you
-can try to compile more drivers as modules, so that they can be tested
-individually.  Otherwise, there is a problem with a modular driver and you can
-find it by loading a half of the modules you normally use and binary searching
-in accordance with the algorithm:
-- if there are n modules loaded and the attempt to suspend and resume fails,
-unload n/2 of the modules and try again (that would probably involve rebooting
-the system),
-- if there are n modules loaded and the attempt to suspend and resume succeeds,
-load n/2 modules more and try again.
-
-Again, if you find the offending module(s), it(they) must be unloaded every time
-before hibernation, and please report the problem with it(them).
-
-c) Using the "test_resume" hibernation option
-
-/sys/power/disk generally tells the kernel what to do after creating a
-hibernation image.  One of the available options is "test_resume" which
-causes the just created image to be used for immediate restoration.  Namely,
-after doing:
-
-# echo test_resume > /sys/power/disk
-# echo disk > /sys/power/state
-
-a hibernation image will be created and a resume from it will be triggered
-immediately without involving the platform firmware in any way.
-
-That test can be used to check if failures to resume from hibernation are
-related to bad interactions with the platform firmware.  That is, if the above
-works every time, but resume from actual hibernation does not work or is
-unreliable, the platform firmware may be responsible for the failures.
-
-On architectures and platforms that support using different kernels to restore
-hibernation images (that is, the kernel used to read the image from storage and
-load it into memory is different from the one included in the image) or support
-kernel address space randomization, it also can be used to check if failures
-to resume may be related to the differences between the restore and image
-kernels.
-
-d) Advanced debugging
-
-In case that hibernation does not work on your system even in the minimal
-configuration and compiling more drivers as modules is not practical or some
-modules cannot be unloaded, you can use one of the more advanced debugging
-techniques to find the problem.  First, if there is a serial port in your box,
-you can boot the kernel with the 'no_console_suspend' parameter and try to log
-kernel messages using the serial console.  This may provide you with some
-information about the reasons of the suspend (resume) failure.  Alternatively,
-it may be possible to use a FireWire port for debugging with firescope
-(http://v3.sk/~lkundrak/firescope/).  On x86 it is also possible to
-use the PM_TRACE mechanism documented in Documentation/power/s2ram.txt .
-
-2. Testing suspend to RAM (STR)
-
-To verify that the STR works, it is generally more convenient to use the s2ram
-tool available from http://suspend.sf.net and documented at
-http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK).
-
-Namely, after writing "freezer", "devices", "platform", "processors", or "core"
-into /sys/power/pm_test (available if the kernel is compiled with
-CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding
-to given string.  The STR test modes are defined in the same way as for
-hibernation, so please refer to Section 1 for more information about them.  In
-particular, the "core" test allows you to test everything except for the actual
-invocation of the platform firmware in order to put the system into the sleep
-state.
-
-Among other things, the testing with the help of /sys/power/pm_test may allow
-you to identify drivers that fail to suspend or resume their devices.  They
-should be unloaded every time before an STR transition.
-
-Next, you can follow the instructions at S2RAM_LINK to test the system, but if
-it does not work "out of the box", you may need to boot it with
-"init=/bin/bash" and test s2ram in the minimal configuration.  In that case,
-you may be able to search for failing drivers by following the procedure
-analogous to the one described in section 1.  If you find some failing drivers,
-you will have to unload them every time before an STR transition (ie. before
-you run s2ram), and please report the problems with them.
-
-There is a debugfs entry which shows the suspend to RAM statistics. Here is an
-example of its output.
-	# mount -t debugfs none /sys/kernel/debug
-	# cat /sys/kernel/debug/suspend_stats
-	success: 20
-	fail: 5
-	failed_freeze: 0
-	failed_prepare: 0
-	failed_suspend: 5
-	failed_suspend_noirq: 0
-	failed_resume: 0
-	failed_resume_noirq: 0
-	failures:
-	  last_failed_dev:	alarm
-				adc
-	  last_failed_errno:	-16
-				-16
-	  last_failed_step:	suspend
-				suspend
-Field success means the success number of suspend to RAM, and field fail means
-the failure number. Others are the failure number of different steps of suspend
-to RAM. suspend_stats just lists the last 2 failed devices, error number and
-failed step of suspend.
diff --git a/Documentation/power/charger-manager.rst b/Documentation/power/charger-manager.rst
new file mode 100644
index 000000000000..84fab9376792
--- /dev/null
+++ b/Documentation/power/charger-manager.rst
@@ -0,0 +1,205 @@
+===============
+Charger Manager
+===============
+
+	(C) 2011 MyungJoo Ham <myungjoo.ham@samsung.com>, GPL
+
+Charger Manager provides in-kernel battery charger management that
+requires temperature monitoring during suspend-to-RAM state
+and where each battery may have multiple chargers attached and the userland
+wants to look at the aggregated information of the multiple chargers.
+
+Charger Manager is a platform_driver with power-supply-class entries.
+An instance of Charger Manager (a platform-device created with Charger-Manager)
+represents an independent battery with chargers. If there are multiple
+batteries with their own chargers acting independently in a system,
+the system may need multiple instances of Charger Manager.
+
+1. Introduction
+===============
+
+Charger Manager supports the following:
+
+* Support for multiple chargers (e.g., a device with USB, AC, and solar panels)
+	A system may have multiple chargers (or power sources) and some of
+	they may be activated at the same time. Each charger may have its
+	own power-supply-class and each power-supply-class can provide
+	different information about the battery status. This framework
+	aggregates charger-related information from multiple sources and
+	shows combined information as a single power-supply-class.
+
+* Support for in suspend-to-RAM polling (with suspend_again callback)
+	While the battery is being charged and the system is in suspend-to-RAM,
+	we may need to monitor the battery health by looking at the ambient or
+	battery temperature. We can accomplish this by waking up the system
+	periodically. However, such a method wakes up devices unnecessarily for
+	monitoring the battery health and tasks, and user processes that are
+	supposed to be kept suspended. That, in turn, incurs unnecessary power
+	consumption and slow down charging process. Or even, such peak power
+	consumption can stop chargers in the middle of charging
+	(external power input < device power consumption), which not
+	only affects the charging time, but the lifespan of the battery.
+
+	Charger Manager provides a function "cm_suspend_again" that can be
+	used as suspend_again callback of platform_suspend_ops. If the platform
+	requires tasks other than cm_suspend_again, it may implement its own
+	suspend_again callback that calls cm_suspend_again in the middle.
+	Normally, the platform will need to resume and suspend some devices
+	that are used by Charger Manager.
+
+* Support for premature full-battery event handling
+	If the battery voltage drops by "fullbatt_vchkdrop_uV" after
+	"fullbatt_vchkdrop_ms" from the full-battery event, the framework
+	restarts charging. This check is also performed while suspended by
+	setting wakeup time accordingly and using suspend_again.
+
+* Support for uevent-notify
+	With the charger-related events, the device sends
+	notification to users with UEVENT.
+
+2. Global Charger-Manager Data related with suspend_again
+=========================================================
+In order to setup Charger Manager with suspend-again feature
+(in-suspend monitoring), the user should provide charger_global_desc
+with setup_charger_manager(`struct charger_global_desc *`).
+This charger_global_desc data for in-suspend monitoring is global
+as the name suggests. Thus, the user needs to provide only once even
+if there are multiple batteries. If there are multiple batteries, the
+multiple instances of Charger Manager share the same charger_global_desc
+and it will manage in-suspend monitoring for all instances of Charger Manager.
+
+The user needs to provide all the three entries to `struct charger_global_desc`
+properly in order to activate in-suspend monitoring:
+
+`char *rtc_name;`
+	The name of rtc (e.g., "rtc0") used to wakeup the system from
+	suspend for Charger Manager. The alarm interrupt (AIE) of the rtc
+	should be able to wake up the system from suspend. Charger Manager
+	saves and restores the alarm value and use the previously-defined
+	alarm if it is going to go off earlier than Charger Manager so that
+	Charger Manager does not interfere with previously-defined alarms.
+
+`bool (*rtc_only_wakeup)(void);`
+	This callback should let CM know whether
+	the wakeup-from-suspend is caused only by the alarm of "rtc" in the
+	same struct. If there is any other wakeup source triggered the
+	wakeup, it should return false. If the "rtc" is the only wakeup
+	reason, it should return true.
+
+`bool assume_timer_stops_in_suspend;`
+	if true, Charger Manager assumes that
+	the timer (CM uses jiffies as timer) stops during suspend. Then, CM
+	assumes that the suspend-duration is same as the alarm length.
+
+
+3. How to setup suspend_again
+=============================
+Charger Manager provides a function "extern bool cm_suspend_again(void)".
+When cm_suspend_again is called, it monitors every battery. The suspend_ops
+callback of the system's platform_suspend_ops can call cm_suspend_again
+function to know whether Charger Manager wants to suspend again or not.
+If there are no other devices or tasks that want to use suspend_again
+feature, the platform_suspend_ops may directly refer to cm_suspend_again
+for its suspend_again callback.
+
+The cm_suspend_again() returns true (meaning "I want to suspend again")
+if the system was woken up by Charger Manager and the polling
+(in-suspend monitoring) results in "normal".
+
+4. Charger-Manager Data (struct charger_desc)
+=============================================
+For each battery charged independently from other batteries (if a series of
+batteries are charged by a single charger, they are counted as one independent
+battery), an instance of Charger Manager is attached to it. The following
+
+struct charger_desc elements:
+
+`char *psy_name;`
+	The power-supply-class name of the battery. Default is
+	"battery" if psy_name is NULL. Users can access the psy entries
+	at "/sys/class/power_supply/[psy_name]/".
+
+`enum polling_modes polling_mode;`
+	  CM_POLL_DISABLE:
+		do not poll this battery.
+	  CM_POLL_ALWAYS:
+		always poll this battery.
+	  CM_POLL_EXTERNAL_POWER_ONLY:
+		poll this battery if and only if an external power
+		source is attached.
+	  CM_POLL_CHARGING_ONLY:
+		poll this battery if and only if the battery is being charged.
+
+`unsigned int fullbatt_vchkdrop_ms; / unsigned int fullbatt_vchkdrop_uV;`
+	If both have non-zero values, Charger Manager will check the
+	battery voltage drop fullbatt_vchkdrop_ms after the battery is fully
+	charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger
+	Manager will try to recharge the battery by disabling and enabling
+	chargers. Recharge with voltage drop condition only (without delay
+	condition) is needed to be implemented with hardware interrupts from
+	fuel gauges or charger devices/chips.
+
+`unsigned int fullbatt_uV;`
+	If specified with a non-zero value, Charger Manager assumes
+	that the battery is full (capacity = 100) if the battery is not being
+	charged and the battery voltage is equal to or greater than
+	fullbatt_uV.
+
+`unsigned int polling_interval_ms;`
+	Required polling interval in ms. Charger Manager will poll
+	this battery every polling_interval_ms or more frequently.
+
+`enum data_source battery_present;`
+	CM_BATTERY_PRESENT:
+		assume that the battery exists.
+	CM_NO_BATTERY:
+		assume that the battery does not exists.
+	CM_FUEL_GAUGE:
+		get battery presence information from fuel gauge.
+	CM_CHARGER_STAT:
+		get battery presence from chargers.
+
+`char **psy_charger_stat;`
+	An array ending with NULL that has power-supply-class names of
+	chargers. Each power-supply-class should provide "PRESENT" (if
+	battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an
+	external power source is attached or not), and "STATUS" (shows whether
+	the battery is {"FULL" or not FULL} or {"FULL", "Charging",
+	"Discharging", "NotCharging"}).
+
+`int num_charger_regulators; / struct regulator_bulk_data *charger_regulators;`
+	Regulators representing the chargers in the form for
+	regulator framework's bulk functions.
+
+`char *psy_fuel_gauge;`
+	Power-supply-class name of the fuel gauge.
+
+`int (*temperature_out_of_range)(int *mC); / bool measure_battery_temp;`
+	This callback returns 0 if the temperature is safe for charging,
+	a positive number if it is too hot to charge, and a negative number
+	if it is too cold to charge. With the variable mC, the callback returns
+	the temperature in 1/1000 of centigrade.
+	The source of temperature can be battery or ambient one according to
+	the value of measure_battery_temp.
+
+
+5. Notify Charger-Manager of charger events: cm_notify_event()
+==============================================================
+If there is an charger event is required to notify
+Charger Manager, a charger device driver that triggers the event can call
+cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager.
+In the function, psy is the charger driver's power_supply pointer, which is
+associated with Charger-Manager. The parameter "type"
+is the same as irq's type (enum cm_event_types). The event message "msg" is
+optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS".
+
+6. Other Considerations
+=======================
+
+At the charger/battery-related events such as battery-pulled-out,
+charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped,
+and others critical to chargers, the system should be configured to wake up.
+At least the following should wake up the system from a suspend:
+a) charger-on/off b) external-power-in/out c) battery-in/out (while charging)
+
+It is usually accomplished by configuring the PMIC as a wakeup source.
diff --git a/Documentation/power/charger-manager.txt b/Documentation/power/charger-manager.txt
deleted file mode 100644
index 9ff1105e58d6..000000000000
--- a/Documentation/power/charger-manager.txt
+++ /dev/null
@@ -1,200 +0,0 @@
-Charger Manager
-	(C) 2011 MyungJoo Ham <myungjoo.ham@samsung.com>, GPL
-
-Charger Manager provides in-kernel battery charger management that
-requires temperature monitoring during suspend-to-RAM state
-and where each battery may have multiple chargers attached and the userland
-wants to look at the aggregated information of the multiple chargers.
-
-Charger Manager is a platform_driver with power-supply-class entries.
-An instance of Charger Manager (a platform-device created with Charger-Manager)
-represents an independent battery with chargers. If there are multiple
-batteries with their own chargers acting independently in a system,
-the system may need multiple instances of Charger Manager.
-
-1. Introduction
-===============
-
-Charger Manager supports the following:
-
-* Support for multiple chargers (e.g., a device with USB, AC, and solar panels)
-	A system may have multiple chargers (or power sources) and some of
-	they may be activated at the same time. Each charger may have its
-	own power-supply-class and each power-supply-class can provide
-	different information about the battery status. This framework
-	aggregates charger-related information from multiple sources and
-	shows combined information as a single power-supply-class.
-
-* Support for in suspend-to-RAM polling (with suspend_again callback)
-	While the battery is being charged and the system is in suspend-to-RAM,
-	we may need to monitor the battery health by looking at the ambient or
-	battery temperature. We can accomplish this by waking up the system
-	periodically. However, such a method wakes up devices unnecessarily for
-	monitoring the battery health and tasks, and user processes that are
-	supposed to be kept suspended. That, in turn, incurs unnecessary power
-	consumption and slow down charging process. Or even, such peak power
-	consumption can stop chargers in the middle of charging
-	(external power input < device power consumption), which not
-	only affects the charging time, but the lifespan of the battery.
-
-	Charger Manager provides a function "cm_suspend_again" that can be
-	used as suspend_again callback of platform_suspend_ops. If the platform
-	requires tasks other than cm_suspend_again, it may implement its own
-	suspend_again callback that calls cm_suspend_again in the middle.
-	Normally, the platform will need to resume and suspend some devices
-	that are used by Charger Manager.
-
-* Support for premature full-battery event handling
-	If the battery voltage drops by "fullbatt_vchkdrop_uV" after
-	"fullbatt_vchkdrop_ms" from the full-battery event, the framework
-	restarts charging. This check is also performed while suspended by
-	setting wakeup time accordingly and using suspend_again.
-
-* Support for uevent-notify
-	With the charger-related events, the device sends
-	notification to users with UEVENT.
-
-2. Global Charger-Manager Data related with suspend_again
-========================================================
-In order to setup Charger Manager with suspend-again feature
-(in-suspend monitoring), the user should provide charger_global_desc
-with setup_charger_manager(struct charger_global_desc *).
-This charger_global_desc data for in-suspend monitoring is global
-as the name suggests. Thus, the user needs to provide only once even
-if there are multiple batteries. If there are multiple batteries, the
-multiple instances of Charger Manager share the same charger_global_desc
-and it will manage in-suspend monitoring for all instances of Charger Manager.
-
-The user needs to provide all the three entries properly in order to activate
-in-suspend monitoring:
-
-struct charger_global_desc {
-
-char *rtc_name;
-	: The name of rtc (e.g., "rtc0") used to wakeup the system from
-	suspend for Charger Manager. The alarm interrupt (AIE) of the rtc
-	should be able to wake up the system from suspend. Charger Manager
-	saves and restores the alarm value and use the previously-defined
-	alarm if it is going to go off earlier than Charger Manager so that
-	Charger Manager does not interfere with previously-defined alarms.
-
-bool (*rtc_only_wakeup)(void);
-	: This callback should let CM know whether
-	the wakeup-from-suspend is caused only by the alarm of "rtc" in the
-	same struct. If there is any other wakeup source triggered the
-	wakeup, it should return false. If the "rtc" is the only wakeup
-	reason, it should return true.
-
-bool assume_timer_stops_in_suspend;
-	: if true, Charger Manager assumes that
-	the timer (CM uses jiffies as timer) stops during suspend. Then, CM
-	assumes that the suspend-duration is same as the alarm length.
-};
-
-3. How to setup suspend_again
-=============================
-Charger Manager provides a function "extern bool cm_suspend_again(void)".
-When cm_suspend_again is called, it monitors every battery. The suspend_ops
-callback of the system's platform_suspend_ops can call cm_suspend_again
-function to know whether Charger Manager wants to suspend again or not.
-If there are no other devices or tasks that want to use suspend_again
-feature, the platform_suspend_ops may directly refer to cm_suspend_again
-for its suspend_again callback.
-
-The cm_suspend_again() returns true (meaning "I want to suspend again")
-if the system was woken up by Charger Manager and the polling
-(in-suspend monitoring) results in "normal".
-
-4. Charger-Manager Data (struct charger_desc)
-=============================================
-For each battery charged independently from other batteries (if a series of
-batteries are charged by a single charger, they are counted as one independent
-battery), an instance of Charger Manager is attached to it.
-
-struct charger_desc {
-
-char *psy_name;
-	: The power-supply-class name of the battery. Default is
-	"battery" if psy_name is NULL. Users can access the psy entries
-	at "/sys/class/power_supply/[psy_name]/".
-
-enum polling_modes polling_mode;
-	: CM_POLL_DISABLE: do not poll this battery.
-	  CM_POLL_ALWAYS: always poll this battery.
-	  CM_POLL_EXTERNAL_POWER_ONLY: poll this battery if and only if
-				       an external power source is attached.
-	  CM_POLL_CHARGING_ONLY: poll this battery if and only if the
-				 battery is being charged.
-
-unsigned int fullbatt_vchkdrop_ms;
-unsigned int fullbatt_vchkdrop_uV;
-	: If both have non-zero values, Charger Manager will check the
-	battery voltage drop fullbatt_vchkdrop_ms after the battery is fully
-	charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger
-	Manager will try to recharge the battery by disabling and enabling
-	chargers. Recharge with voltage drop condition only (without delay
-	condition) is needed to be implemented with hardware interrupts from
-	fuel gauges or charger devices/chips.
-
-unsigned int fullbatt_uV;
-	: If specified with a non-zero value, Charger Manager assumes
-	that the battery is full (capacity = 100) if the battery is not being
-	charged and the battery voltage is equal to or greater than
-	fullbatt_uV.
-
-unsigned int polling_interval_ms;
-	: Required polling interval in ms. Charger Manager will poll
-	this battery every polling_interval_ms or more frequently.
-
-enum data_source battery_present;
-	: CM_BATTERY_PRESENT: assume that the battery exists.
-	CM_NO_BATTERY: assume that the battery does not exists.
-	CM_FUEL_GAUGE: get battery presence information from fuel gauge.
-	CM_CHARGER_STAT: get battery presence from chargers.
-
-char **psy_charger_stat;
-	: An array ending with NULL that has power-supply-class names of
-	chargers. Each power-supply-class should provide "PRESENT" (if
-	battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an
-	external power source is attached or not), and "STATUS" (shows whether
-	the battery is {"FULL" or not FULL} or {"FULL", "Charging",
-	"Discharging", "NotCharging"}).
-
-int num_charger_regulators;
-struct regulator_bulk_data *charger_regulators;
-	: Regulators representing the chargers in the form for
-	regulator framework's bulk functions.
-
-char *psy_fuel_gauge;
-	: Power-supply-class name of the fuel gauge.
-
-int (*temperature_out_of_range)(int *mC);
-bool measure_battery_temp;
-	: This callback returns 0 if the temperature is safe for charging,
-	a positive number if it is too hot to charge, and a negative number
-	if it is too cold to charge. With the variable mC, the callback returns
-	the temperature in 1/1000 of centigrade.
-	The source of temperature can be battery or ambient one according to
-	the value of measure_battery_temp.
-};
-
-5. Notify Charger-Manager of charger events: cm_notify_event()
-=========================================================
-If there is an charger event is required to notify
-Charger Manager, a charger device driver that triggers the event can call
-cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager.
-In the function, psy is the charger driver's power_supply pointer, which is
-associated with Charger-Manager. The parameter "type"
-is the same as irq's type (enum cm_event_types). The event message "msg" is
-optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS".
-
-6. Other Considerations
-=======================
-
-At the charger/battery-related events such as battery-pulled-out,
-charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped,
-and others critical to chargers, the system should be configured to wake up.
-At least the following should wake up the system from a suspend:
-a) charger-on/off b) external-power-in/out c) battery-in/out (while charging)
-
-It is usually accomplished by configuring the PMIC as a wakeup source.
diff --git a/Documentation/power/drivers-testing.rst b/Documentation/power/drivers-testing.rst
new file mode 100644
index 000000000000..e53f1999fc39
--- /dev/null
+++ b/Documentation/power/drivers-testing.rst
@@ -0,0 +1,51 @@
+====================================================
+Testing suspend and resume support in device drivers
+====================================================
+
+	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Preparing the test system
+============================
+
+Unfortunately, to effectively test the support for the system-wide suspend and
+resume transitions in a driver, it is necessary to suspend and resume a fully
+functional system with this driver loaded.  Moreover, that should be done
+several times, preferably several times in a row, and separately for hibernation
+(aka suspend to disk or STD) and suspend to RAM (STR), because each of these
+cases involves slightly different operations and different interactions with
+the machine's BIOS.
+
+Of course, for this purpose the test system has to be known to suspend and
+resume without the driver being tested.  Thus, if possible, you should first
+resolve all suspend/resume-related problems in the test system before you start
+testing the new driver.  Please see Documentation/power/basic-pm-debugging.rst
+for more information about the debugging of suspend/resume functionality.
+
+2. Testing the driver
+=====================
+
+Once you have resolved the suspend/resume-related problems with your test system
+without the new driver, you are ready to test it:
+
+a) Build the driver as a module, load it and try the test modes of hibernation
+   (see: Documentation/power/basic-pm-debugging.rst, 1).
+
+b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
+   "platform" modes (see: Documentation/power/basic-pm-debugging.rst, 1).
+
+c) Compile the driver directly into the kernel and try the test modes of
+   hibernation.
+
+d) Attempt to hibernate with the driver compiled directly into the kernel
+   in the "reboot", "shutdown" and "platform" modes.
+
+e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.rst,
+   2).  [As far as the STR tests are concerned, it should not matter whether or
+   not the driver is built as a module.]
+
+f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
+   (see: Documentation/power/basic-pm-debugging.rst, 2).
+
+Each of the above tests should be repeated several times and the STD tests
+should be mixed with the STR tests.  If any of them fails, the driver cannot be
+regarded as suspend/resume-safe.
diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt
deleted file mode 100644
index 638afdf4d6b8..000000000000
--- a/Documentation/power/drivers-testing.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-Testing suspend and resume support in device drivers
-	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-1. Preparing the test system
-
-Unfortunately, to effectively test the support for the system-wide suspend and
-resume transitions in a driver, it is necessary to suspend and resume a fully
-functional system with this driver loaded.  Moreover, that should be done
-several times, preferably several times in a row, and separately for hibernation
-(aka suspend to disk or STD) and suspend to RAM (STR), because each of these
-cases involves slightly different operations and different interactions with
-the machine's BIOS.
-
-Of course, for this purpose the test system has to be known to suspend and
-resume without the driver being tested.  Thus, if possible, you should first
-resolve all suspend/resume-related problems in the test system before you start
-testing the new driver.  Please see Documentation/power/basic-pm-debugging.txt
-for more information about the debugging of suspend/resume functionality.
-
-2. Testing the driver
-
-Once you have resolved the suspend/resume-related problems with your test system
-without the new driver, you are ready to test it:
-
-a) Build the driver as a module, load it and try the test modes of hibernation
-   (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
-   "platform" modes (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-c) Compile the driver directly into the kernel and try the test modes of
-   hibernation.
-
-d) Attempt to hibernate with the driver compiled directly into the kernel
-   in the "reboot", "shutdown" and "platform" modes.
-
-e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.txt,
-   2).  [As far as the STR tests are concerned, it should not matter whether or
-   not the driver is built as a module.]
-
-f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
-   (see: Documentation/power/basic-pm-debugging.txt, 2).
-
-Each of the above tests should be repeated several times and the STD tests
-should be mixed with the STR tests.  If any of them fails, the driver cannot be
-regarded as suspend/resume-safe.
diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst
new file mode 100644
index 000000000000..90a345d57ae9
--- /dev/null
+++ b/Documentation/power/energy-model.rst
@@ -0,0 +1,147 @@
+====================
+Energy Model of CPUs
+====================
+
+1. Overview
+-----------
+
+The Energy Model (EM) framework serves as an interface between drivers knowing
+the power consumed by CPUs at various performance levels, and the kernel
+subsystems willing to use that information to make energy-aware decisions.
+
+The source of the information about the power consumed by CPUs can vary greatly
+from one platform to another. These power costs can be estimated using
+devicetree data in some cases. In others, the firmware will know better.
+Alternatively, userspace might be best positioned. And so on. In order to avoid
+each and every client subsystem to re-implement support for each and every
+possible source of information on its own, the EM framework intervenes as an
+abstraction layer which standardizes the format of power cost tables in the
+kernel, hence enabling to avoid redundant work.
+
+The figure below depicts an example of drivers (Arm-specific here, but the
+approach is applicable to any architecture) providing power costs to the EM
+framework, and interested clients reading the data from it::
+
+       +---------------+  +-----------------+  +---------------+
+       | Thermal (IPA) |  | Scheduler (EAS) |  |     Other     |
+       +---------------+  +-----------------+  +---------------+
+               |                   | em_pd_energy()    |
+               |                   | em_cpu_get()      |
+               +---------+         |         +---------+
+                         |         |         |
+                         v         v         v
+                        +---------------------+
+                        |    Energy Model     |
+                        |     Framework       |
+                        +---------------------+
+                           ^       ^       ^
+                           |       |       | em_register_perf_domain()
+                +----------+       |       +---------+
+                |                  |                 |
+        +---------------+  +---------------+  +--------------+
+        |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
+        +---------------+  +---------------+  +--------------+
+                ^                  ^                 ^
+                |                  |                 |
+        +--------------+   +---------------+  +--------------+
+        | Device Tree  |   |   Firmware    |  |      ?       |
+        +--------------+   +---------------+  +--------------+
+
+The EM framework manages power cost tables per 'performance domain' in the
+system. A performance domain is a group of CPUs whose performance is scaled
+together. Performance domains generally have a 1-to-1 mapping with CPUFreq
+policies. All CPUs in a performance domain are required to have the same
+micro-architecture. CPUs in different performance domains can have different
+micro-architectures.
+
+
+2. Core APIs
+------------
+
+2.1 Config options
+^^^^^^^^^^^^^^^^^^
+
+CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
+
+
+2.2 Registration of performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Drivers are expected to register performance domains into the EM framework by
+calling the following API::
+
+  int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+			      struct em_data_callback *cb);
+
+Drivers must specify the CPUs of the performance domains using the cpumask
+argument, and provide a callback function returning <frequency, power> tuples
+for each capacity state. The callback function provided by the driver is free
+to fetch data from any relevant location (DT, firmware, ...), and by any mean
+deemed necessary. See Section 3. for an example of driver implementing this
+callback, and kernel/power/energy_model.c for further documentation on this
+API.
+
+
+2.3 Accessing performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Subsystems interested in the energy model of a CPU can retrieve it using the
+em_cpu_get() API. The energy model tables are allocated once upon creation of
+the performance domains, and kept in memory untouched.
+
+The energy consumed by a performance domain can be estimated using the
+em_pd_energy() API. The estimation is performed assuming that the schedutil
+CPUfreq governor is in use.
+
+More details about the above APIs can be found in include/linux/energy_model.h.
+
+
+3. Example driver
+-----------------
+
+This section provides a simple example of a CPUFreq driver registering a
+performance domain in the Energy Model framework using the (fake) 'foo'
+protocol. The driver implements an est_power() function to be provided to the
+EM framework::
+
+  -> drivers/cpufreq/foo_cpufreq.c
+
+  01	static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
+  02	{
+  03		long freq, power;
+  04
+  05		/* Use the 'foo' protocol to ceil the frequency */
+  06		freq = foo_get_freq_ceil(cpu, *KHz);
+  07		if (freq < 0);
+  08			return freq;
+  09
+  10		/* Estimate the power cost for the CPU at the relevant freq. */
+  11		power = foo_estimate_power(cpu, freq);
+  12		if (power < 0);
+  13			return power;
+  14
+  15		/* Return the values to the EM framework */
+  16		*mW = power;
+  17		*KHz = freq;
+  18
+  19		return 0;
+  20	}
+  21
+  22	static int foo_cpufreq_init(struct cpufreq_policy *policy)
+  23	{
+  24		struct em_data_callback em_cb = EM_DATA_CB(est_power);
+  25		int nr_opp, ret;
+  26
+  27		/* Do the actual CPUFreq init work ... */
+  28		ret = do_foo_cpufreq_init(policy);
+  29		if (ret)
+  30			return ret;
+  31
+  32		/* Find the number of OPPs for this policy */
+  33		nr_opp = foo_get_nr_opp(policy);
+  34
+  35		/* And register the new performance domain */
+  36		em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
+  37
+  38	        return 0;
+  39	}
diff --git a/Documentation/power/energy-model.txt b/Documentation/power/energy-model.txt
deleted file mode 100644
index a2b0ae4c76bd..000000000000
--- a/Documentation/power/energy-model.txt
+++ /dev/null
@@ -1,144 +0,0 @@
-                           ====================
-                           Energy Model of CPUs
-                           ====================
-
-1. Overview
------------
-
-The Energy Model (EM) framework serves as an interface between drivers knowing
-the power consumed by CPUs at various performance levels, and the kernel
-subsystems willing to use that information to make energy-aware decisions.
-
-The source of the information about the power consumed by CPUs can vary greatly
-from one platform to another. These power costs can be estimated using
-devicetree data in some cases. In others, the firmware will know better.
-Alternatively, userspace might be best positioned. And so on. In order to avoid
-each and every client subsystem to re-implement support for each and every
-possible source of information on its own, the EM framework intervenes as an
-abstraction layer which standardizes the format of power cost tables in the
-kernel, hence enabling to avoid redundant work.
-
-The figure below depicts an example of drivers (Arm-specific here, but the
-approach is applicable to any architecture) providing power costs to the EM
-framework, and interested clients reading the data from it.
-
-       +---------------+  +-----------------+  +---------------+
-       | Thermal (IPA) |  | Scheduler (EAS) |  |     Other     |
-       +---------------+  +-----------------+  +---------------+
-               |                   | em_pd_energy()    |
-               |                   | em_cpu_get()      |
-               +---------+         |         +---------+
-                         |         |         |
-                         v         v         v
-                        +---------------------+
-                        |    Energy Model     |
-                        |     Framework       |
-                        +---------------------+
-                           ^       ^       ^
-                           |       |       | em_register_perf_domain()
-                +----------+       |       +---------+
-                |                  |                 |
-        +---------------+  +---------------+  +--------------+
-        |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
-        +---------------+  +---------------+  +--------------+
-                ^                  ^                 ^
-                |                  |                 |
-        +--------------+   +---------------+  +--------------+
-        | Device Tree  |   |   Firmware    |  |      ?       |
-        +--------------+   +---------------+  +--------------+
-
-The EM framework manages power cost tables per 'performance domain' in the
-system. A performance domain is a group of CPUs whose performance is scaled
-together. Performance domains generally have a 1-to-1 mapping with CPUFreq
-policies. All CPUs in a performance domain are required to have the same
-micro-architecture. CPUs in different performance domains can have different
-micro-architectures.
-
-
-2. Core APIs
-------------
-
-  2.1 Config options
-
-CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
-
-
-  2.2 Registration of performance domains
-
-Drivers are expected to register performance domains into the EM framework by
-calling the following API:
-
-  int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
-			      struct em_data_callback *cb);
-
-Drivers must specify the CPUs of the performance domains using the cpumask
-argument, and provide a callback function returning <frequency, power> tuples
-for each capacity state. The callback function provided by the driver is free
-to fetch data from any relevant location (DT, firmware, ...), and by any mean
-deemed necessary. See Section 3. for an example of driver implementing this
-callback, and kernel/power/energy_model.c for further documentation on this
-API.
-
-
-  2.3 Accessing performance domains
-
-Subsystems interested in the energy model of a CPU can retrieve it using the
-em_cpu_get() API. The energy model tables are allocated once upon creation of
-the performance domains, and kept in memory untouched.
-
-The energy consumed by a performance domain can be estimated using the
-em_pd_energy() API. The estimation is performed assuming that the schedutil
-CPUfreq governor is in use.
-
-More details about the above APIs can be found in include/linux/energy_model.h.
-
-
-3. Example driver
------------------
-
-This section provides a simple example of a CPUFreq driver registering a
-performance domain in the Energy Model framework using the (fake) 'foo'
-protocol. The driver implements an est_power() function to be provided to the
-EM framework.
-
- -> drivers/cpufreq/foo_cpufreq.c
-
-01	static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
-02	{
-03		long freq, power;
-04
-05		/* Use the 'foo' protocol to ceil the frequency */
-06		freq = foo_get_freq_ceil(cpu, *KHz);
-07		if (freq < 0);
-08			return freq;
-09
-10		/* Estimate the power cost for the CPU at the relevant freq. */
-11		power = foo_estimate_power(cpu, freq);
-12		if (power < 0);
-13			return power;
-14
-15		/* Return the values to the EM framework */
-16		*mW = power;
-17		*KHz = freq;
-18
-19		return 0;
-20	}
-21
-22	static int foo_cpufreq_init(struct cpufreq_policy *policy)
-23	{
-24		struct em_data_callback em_cb = EM_DATA_CB(est_power);
-25		int nr_opp, ret;
-26
-27		/* Do the actual CPUFreq init work ... */
-28		ret = do_foo_cpufreq_init(policy);
-29		if (ret)
-30			return ret;
-31
-32		/* Find the number of OPPs for this policy */
-33		nr_opp = foo_get_nr_opp(policy);
-34
-35		/* And register the new performance domain */
-36		em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
-37
-38	        return 0;
-39	}
diff --git a/Documentation/power/freezing-of-tasks.rst b/Documentation/power/freezing-of-tasks.rst
new file mode 100644
index 000000000000..ef110fe55e82
--- /dev/null
+++ b/Documentation/power/freezing-of-tasks.rst
@@ -0,0 +1,244 @@
+=================
+Freezing of tasks
+=================
+
+(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+I. What is the freezing of tasks?
+=================================
+
+The freezing of tasks is a mechanism by which user space processes and some
+kernel threads are controlled during hibernation or system-wide suspend (on some
+architectures).
+
+II. How does it work?
+=====================
+
+There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN
+and PF_FREEZER_SKIP (the last one is auxiliary).  The tasks that have
+PF_NOFREEZE unset (all user space processes and some kernel threads) are
+regarded as 'freezable' and treated in a special way before the system enters a
+suspend state as well as before a hibernation image is created (in what follows
+we only consider hibernation, but the description also applies to suspend).
+
+Namely, as the first step of the hibernation procedure the function
+freeze_processes() (defined in kernel/power/process.c) is called.  A system-wide
+variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate
+whether the system is to undergo a freezing operation. And freeze_processes()
+sets this variable.  After this, it executes try_to_freeze_tasks() that sends a
+fake signal to all user space processes, and wakes up all the kernel threads.
+All freezable tasks must react to that by calling try_to_freeze(), which
+results in a call to __refrigerator() (defined in kernel/freezer.c), which sets
+the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes
+it loop until PF_FROZEN is cleared for it. Then, we say that the task is
+'frozen' and therefore the set of functions handling this mechanism is referred
+to as 'the freezer' (these functions are defined in kernel/power/process.c,
+kernel/freezer.c & include/linux/freezer.h). User space processes are generally
+frozen before kernel threads.
+
+__refrigerator() must not be called directly.  Instead, use the
+try_to_freeze() function (defined in include/linux/freezer.h), that checks
+if the task is to be frozen and makes the task enter __refrigerator().
+
+For user space processes try_to_freeze() is called automatically from the
+signal-handling code, but the freezable kernel threads need to call it
+explicitly in suitable places or use the wait_event_freezable() or
+wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
+that combine interruptible sleep with checking if the task is to be frozen and
+calling try_to_freeze().  The main loop of a freezable kernel thread may look
+like the following one::
+
+	set_freezable();
+	do {
+		hub_events();
+		wait_event_freezable(khubd_wait,
+				!list_empty(&hub_event_list) ||
+				kthread_should_stop());
+	} while (!kthread_should_stop() || !list_empty(&hub_event_list));
+
+(from drivers/usb/core/hub.c::hub_thread()).
+
+If a freezable kernel thread fails to call try_to_freeze() after the freezer has
+initiated a freezing operation, the freezing of tasks will fail and the entire
+hibernation operation will be cancelled.  For this reason, freezable kernel
+threads must call try_to_freeze() somewhere or use one of the
+wait_event_freezable() and wait_event_freezable_timeout() macros.
+
+After the system memory state has been restored from a hibernation image and
+devices have been reinitialized, the function thaw_processes() is called in
+order to clear the PF_FROZEN flag for each frozen task.  Then, the tasks that
+have been frozen leave __refrigerator() and continue running.
+
+
+Rationale behind the functions dealing with freezing and thawing of tasks
+-------------------------------------------------------------------------
+
+freeze_processes():
+  - freezes only userspace tasks
+
+freeze_kernel_threads():
+  - freezes all tasks (including kernel threads) because we can't freeze
+    kernel threads without freezing userspace tasks
+
+thaw_kernel_threads():
+  - thaws only kernel threads; this is particularly useful if we need to do
+    anything special in between thawing of kernel threads and thawing of
+    userspace tasks, or if we want to postpone the thawing of userspace tasks
+
+thaw_processes():
+  - thaws all tasks (including kernel threads) because we can't thaw userspace
+    tasks without thawing kernel threads
+
+
+III. Which kernel threads are freezable?
+========================================
+
+Kernel threads are not freezable by default.  However, a kernel thread may clear
+PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
+directly is not allowed).  From this point it is regarded as freezable
+and must call try_to_freeze() in a suitable place.
+
+IV. Why do we do that?
+======================
+
+Generally speaking, there is a couple of reasons to use the freezing of tasks:
+
+1. The principal reason is to prevent filesystems from being damaged after
+   hibernation.  At the moment we have no simple means of checkpointing
+   filesystems, so if there are any modifications made to filesystem data and/or
+   metadata on disks, we cannot bring them back to the state from before the
+   modifications.  At the same time each hibernation image contains some
+   filesystem-related information that must be consistent with the state of the
+   on-disk data and metadata after the system memory state has been restored
+   from the image (otherwise the filesystems will be damaged in a nasty way,
+   usually making them almost impossible to repair).  We therefore freeze
+   tasks that might cause the on-disk filesystems' data and metadata to be
+   modified after the hibernation image has been created and before the
+   system is finally powered off. The majority of these are user space
+   processes, but if any of the kernel threads may cause something like this
+   to happen, they have to be freezable.
+
+2. Next, to create the hibernation image we need to free a sufficient amount of
+   memory (approximately 50% of available RAM) and we need to do that before
+   devices are deactivated, because we generally need them for swapping out.
+   Then, after the memory for the image has been freed, we don't want tasks
+   to allocate additional memory and we prevent them from doing that by
+   freezing them earlier. [Of course, this also means that device drivers
+   should not allocate substantial amounts of memory from their .suspend()
+   callbacks before hibernation, but this is a separate issue.]
+
+3. The third reason is to prevent user space processes and some kernel threads
+   from interfering with the suspending and resuming of devices.  A user space
+   process running on a second CPU while we are suspending devices may, for
+   example, be troublesome and without the freezing of tasks we would need some
+   safeguards against race conditions that might occur in such a case.
+
+Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
+of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
+
+"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
+
+Linus: In many ways, 'at all'.
+
+I **do** realize the IO request queue issues, and that we cannot actually do
+s2ram with some devices in the middle of a DMA.  So we want to be able to
+avoid *that*, there's no question about that.  And I suspect that stopping
+user threads and then waiting for a sync is practically one of the easier
+ways to do so.
+
+So in practice, the 'at all' may become a 'why freeze kernel threads?' and
+freezing user threads I don't find really objectionable."
+
+Still, there are kernel threads that may want to be freezable.  For example, if
+a kernel thread that belongs to a device driver accesses the device directly, it
+in principle needs to know when the device is suspended, so that it doesn't try
+to access it at that time.  However, if the kernel thread is freezable, it will
+be frozen before the driver's .suspend() callback is executed and it will be
+thawed after the driver's .resume() callback has run, so it won't be accessing
+the device while it's suspended.
+
+4. Another reason for freezing tasks is to prevent user space processes from
+   realizing that hibernation (or suspend) operation takes place.  Ideally, user
+   space processes should not notice that such a system-wide operation has
+   occurred and should continue running without any problems after the restore
+   (or resume from suspend).  Unfortunately, in the most general case this
+   is quite difficult to achieve without the freezing of tasks.  Consider,
+   for example, a process that depends on all CPUs being online while it's
+   running.  Since we need to disable nonboot CPUs during the hibernation,
+   if this process is not frozen, it may notice that the number of CPUs has
+   changed and may start to work incorrectly because of that.
+
+V. Are there any problems related to the freezing of tasks?
+===========================================================
+
+Yes, there are.
+
+First of all, the freezing of kernel threads may be tricky if they depend one
+on another.  For example, if kernel thread A waits for a completion (in the
+TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
+and B is frozen in the meantime, then A will be blocked until B is thawed, which
+may be undesirable.  That's why kernel threads are not freezable by default.
+
+Second, there are the following two problems related to the freezing of user
+space processes:
+
+1. Putting processes into an uninterruptible sleep distorts the load average.
+2. Now that we have FUSE, plus the framework for doing device drivers in
+   userspace, it gets even more complicated because some userspace processes are
+   now doing the sorts of things that kernel threads do
+   (https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
+
+The problem 1. seems to be fixable, although it hasn't been fixed so far.  The
+other one is more serious, but it seems that we can work around it by using
+hibernation (and suspend) notifiers (in that case, though, we won't be able to
+avoid the realization by the user space processes that the hibernation is taking
+place).
+
+There are also problems that the freezing of tasks tends to expose, although
+they are not directly related to it.  For example, if request_firmware() is
+called from a device driver's .resume() routine, it will timeout and eventually
+fail, because the user land process that should respond to the request is frozen
+at this point.  So, seemingly, the failure is due to the freezing of tasks.
+Suppose, however, that the firmware file is located on a filesystem accessible
+only through another device that hasn't been resumed yet.  In that case,
+request_firmware() will fail regardless of whether or not the freezing of tasks
+is used.  Consequently, the problem is not really related to the freezing of
+tasks, since it generally exists anyway.
+
+A driver must have all firmwares it may need in RAM before suspend() is called.
+If keeping them is not practical, for example due to their size, they must be
+requested early enough using the suspend notifier API described in
+Documentation/driver-api/pm/notifiers.rst.
+
+VI. Are there any precautions to be taken to prevent freezing failures?
+=======================================================================
+
+Yes, there are.
+
+First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code
+from system-wide sleep such as suspend/hibernation is not encouraged.
+If possible, that piece of code must instead hook onto the suspend/hibernation
+notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code
+(kernel/cpu.c) for an example.
+
+However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary,
+it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since
+that could lead to freezing failures, because if the suspend/hibernate code
+successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed
+to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE
+state. As a consequence, the freezer would not be able to freeze that task,
+leading to freezing failure.
+
+However, the [un]lock_system_sleep() APIs are safe to use in this scenario,
+since they ask the freezer to skip freezing this task, since it is anyway
+"frozen enough" as it is blocked on 'system_transition_mutex', which will be released
+only after the entire suspend/hibernation sequence is complete.
+So, to summarize, use [un]lock_system_sleep() instead of directly using
+mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures.
+
+V. Miscellaneous
+================
+
+/sys/power/pm_freeze_timeout controls how long it will cost at most to freeze
+all user space processes or all freezable kernel threads, in unit of millisecond.
+The default value is 20000, with range of unsigned integer.
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
deleted file mode 100644
index cd283190855a..000000000000
--- a/Documentation/power/freezing-of-tasks.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Freezing of tasks
-	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-I. What is the freezing of tasks?
-
-The freezing of tasks is a mechanism by which user space processes and some
-kernel threads are controlled during hibernation or system-wide suspend (on some
-architectures).
-
-II. How does it work?
-
-There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN
-and PF_FREEZER_SKIP (the last one is auxiliary).  The tasks that have
-PF_NOFREEZE unset (all user space processes and some kernel threads) are
-regarded as 'freezable' and treated in a special way before the system enters a
-suspend state as well as before a hibernation image is created (in what follows
-we only consider hibernation, but the description also applies to suspend).
-
-Namely, as the first step of the hibernation procedure the function
-freeze_processes() (defined in kernel/power/process.c) is called.  A system-wide
-variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate
-whether the system is to undergo a freezing operation. And freeze_processes()
-sets this variable.  After this, it executes try_to_freeze_tasks() that sends a
-fake signal to all user space processes, and wakes up all the kernel threads.
-All freezable tasks must react to that by calling try_to_freeze(), which
-results in a call to __refrigerator() (defined in kernel/freezer.c), which sets
-the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes
-it loop until PF_FROZEN is cleared for it. Then, we say that the task is
-'frozen' and therefore the set of functions handling this mechanism is referred
-to as 'the freezer' (these functions are defined in kernel/power/process.c,
-kernel/freezer.c & include/linux/freezer.h). User space processes are generally
-frozen before kernel threads.
-
-__refrigerator() must not be called directly.  Instead, use the
-try_to_freeze() function (defined in include/linux/freezer.h), that checks
-if the task is to be frozen and makes the task enter __refrigerator().
-
-For user space processes try_to_freeze() is called automatically from the
-signal-handling code, but the freezable kernel threads need to call it
-explicitly in suitable places or use the wait_event_freezable() or
-wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
-that combine interruptible sleep with checking if the task is to be frozen and
-calling try_to_freeze().  The main loop of a freezable kernel thread may look
-like the following one:
-
-	set_freezable();
-	do {
-		hub_events();
-		wait_event_freezable(khubd_wait,
-				!list_empty(&hub_event_list) ||
-				kthread_should_stop());
-	} while (!kthread_should_stop() || !list_empty(&hub_event_list));
-
-(from drivers/usb/core/hub.c::hub_thread()).
-
-If a freezable kernel thread fails to call try_to_freeze() after the freezer has
-initiated a freezing operation, the freezing of tasks will fail and the entire
-hibernation operation will be cancelled.  For this reason, freezable kernel
-threads must call try_to_freeze() somewhere or use one of the
-wait_event_freezable() and wait_event_freezable_timeout() macros.
-
-After the system memory state has been restored from a hibernation image and
-devices have been reinitialized, the function thaw_processes() is called in
-order to clear the PF_FROZEN flag for each frozen task.  Then, the tasks that
-have been frozen leave __refrigerator() and continue running.
-
-
-Rationale behind the functions dealing with freezing and thawing of tasks:
--------------------------------------------------------------------------
-
-freeze_processes():
-  - freezes only userspace tasks
-
-freeze_kernel_threads():
-  - freezes all tasks (including kernel threads) because we can't freeze
-    kernel threads without freezing userspace tasks
-
-thaw_kernel_threads():
-  - thaws only kernel threads; this is particularly useful if we need to do
-    anything special in between thawing of kernel threads and thawing of
-    userspace tasks, or if we want to postpone the thawing of userspace tasks
-
-thaw_processes():
-  - thaws all tasks (including kernel threads) because we can't thaw userspace
-    tasks without thawing kernel threads
-
-
-III. Which kernel threads are freezable?
-
-Kernel threads are not freezable by default.  However, a kernel thread may clear
-PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
-directly is not allowed).  From this point it is regarded as freezable
-and must call try_to_freeze() in a suitable place.
-
-IV. Why do we do that?
-
-Generally speaking, there is a couple of reasons to use the freezing of tasks:
-
-1. The principal reason is to prevent filesystems from being damaged after
-hibernation.  At the moment we have no simple means of checkpointing
-filesystems, so if there are any modifications made to filesystem data and/or
-metadata on disks, we cannot bring them back to the state from before the
-modifications.  At the same time each hibernation image contains some
-filesystem-related information that must be consistent with the state of the
-on-disk data and metadata after the system memory state has been restored from
-the image (otherwise the filesystems will be damaged in a nasty way, usually
-making them almost impossible to repair).  We therefore freeze tasks that might
-cause the on-disk filesystems' data and metadata to be modified after the
-hibernation image has been created and before the system is finally powered off.
-The majority of these are user space processes, but if any of the kernel threads
-may cause something like this to happen, they have to be freezable.
-
-2. Next, to create the hibernation image we need to free a sufficient amount of
-memory (approximately 50% of available RAM) and we need to do that before
-devices are deactivated, because we generally need them for swapping out.  Then,
-after the memory for the image has been freed, we don't want tasks to allocate
-additional memory and we prevent them from doing that by freezing them earlier.
-[Of course, this also means that device drivers should not allocate substantial
-amounts of memory from their .suspend() callbacks before hibernation, but this
-is a separate issue.]
-
-3. The third reason is to prevent user space processes and some kernel threads
-from interfering with the suspending and resuming of devices.  A user space
-process running on a second CPU while we are suspending devices may, for
-example, be troublesome and without the freezing of tasks we would need some
-safeguards against race conditions that might occur in such a case.
-
-Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
-of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
-
-"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
-
-Linus: In many ways, 'at all'.
-
-I _do_ realize the IO request queue issues, and that we cannot actually do
-s2ram with some devices in the middle of a DMA.  So we want to be able to
-avoid *that*, there's no question about that.  And I suspect that stopping
-user threads and then waiting for a sync is practically one of the easier
-ways to do so.
-
-So in practice, the 'at all' may become a 'why freeze kernel threads?' and
-freezing user threads I don't find really objectionable."
-
-Still, there are kernel threads that may want to be freezable.  For example, if
-a kernel thread that belongs to a device driver accesses the device directly, it
-in principle needs to know when the device is suspended, so that it doesn't try
-to access it at that time.  However, if the kernel thread is freezable, it will
-be frozen before the driver's .suspend() callback is executed and it will be
-thawed after the driver's .resume() callback has run, so it won't be accessing
-the device while it's suspended.
-
-4. Another reason for freezing tasks is to prevent user space processes from
-realizing that hibernation (or suspend) operation takes place.  Ideally, user
-space processes should not notice that such a system-wide operation has occurred
-and should continue running without any problems after the restore (or resume
-from suspend).  Unfortunately, in the most general case this is quite difficult
-to achieve without the freezing of tasks.  Consider, for example, a process
-that depends on all CPUs being online while it's running.  Since we need to
-disable nonboot CPUs during the hibernation, if this process is not frozen, it
-may notice that the number of CPUs has changed and may start to work incorrectly
-because of that.
-
-V. Are there any problems related to the freezing of tasks?
-
-Yes, there are.
-
-First of all, the freezing of kernel threads may be tricky if they depend one
-on another.  For example, if kernel thread A waits for a completion (in the
-TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
-and B is frozen in the meantime, then A will be blocked until B is thawed, which
-may be undesirable.  That's why kernel threads are not freezable by default.
-
-Second, there are the following two problems related to the freezing of user
-space processes:
-1. Putting processes into an uninterruptible sleep distorts the load average.
-2. Now that we have FUSE, plus the framework for doing device drivers in
-userspace, it gets even more complicated because some userspace processes are
-now doing the sorts of things that kernel threads do
-(https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
-
-The problem 1. seems to be fixable, although it hasn't been fixed so far.  The
-other one is more serious, but it seems that we can work around it by using
-hibernation (and suspend) notifiers (in that case, though, we won't be able to
-avoid the realization by the user space processes that the hibernation is taking
-place).
-
-There are also problems that the freezing of tasks tends to expose, although
-they are not directly related to it.  For example, if request_firmware() is
-called from a device driver's .resume() routine, it will timeout and eventually
-fail, because the user land process that should respond to the request is frozen
-at this point.  So, seemingly, the failure is due to the freezing of tasks.
-Suppose, however, that the firmware file is located on a filesystem accessible
-only through another device that hasn't been resumed yet.  In that case,
-request_firmware() will fail regardless of whether or not the freezing of tasks
-is used.  Consequently, the problem is not really related to the freezing of
-tasks, since it generally exists anyway.
-
-A driver must have all firmwares it may need in RAM before suspend() is called.
-If keeping them is not practical, for example due to their size, they must be
-requested early enough using the suspend notifier API described in
-Documentation/driver-api/pm/notifiers.rst.
-
-VI. Are there any precautions to be taken to prevent freezing failures?
-
-Yes, there are.
-
-First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code
-from system-wide sleep such as suspend/hibernation is not encouraged.
-If possible, that piece of code must instead hook onto the suspend/hibernation
-notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code
-(kernel/cpu.c) for an example.
-
-However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary,
-it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since
-that could lead to freezing failures, because if the suspend/hibernate code
-successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed
-to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE
-state. As a consequence, the freezer would not be able to freeze that task,
-leading to freezing failure.
-
-However, the [un]lock_system_sleep() APIs are safe to use in this scenario,
-since they ask the freezer to skip freezing this task, since it is anyway
-"frozen enough" as it is blocked on 'system_transition_mutex', which will be released
-only after the entire suspend/hibernation sequence is complete.
-So, to summarize, use [un]lock_system_sleep() instead of directly using
-mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures.
-
-V. Miscellaneous
-/sys/power/pm_freeze_timeout controls how long it will cost at most to freeze
-all user space processes or all freezable kernel threads, in unit of millisecond.
-The default value is 20000, with range of unsigned integer.
diff --git a/Documentation/power/index.rst b/Documentation/power/index.rst
new file mode 100644
index 000000000000..20415f21e48a
--- /dev/null
+++ b/Documentation/power/index.rst
@@ -0,0 +1,46 @@
+:orphan:
+
+================
+Power Management
+================
+
+.. toctree::
+    :maxdepth: 1
+
+    apm-acpi
+    basic-pm-debugging
+    charger-manager
+    drivers-testing
+    energy-model
+    freezing-of-tasks
+    interface
+    opp
+    pci
+    pm_qos_interface
+    power_supply_class
+    runtime_pm
+    s2ram
+    suspend-and-cpuhotplug
+    suspend-and-interrupts
+    swsusp-and-swap-files
+    swsusp-dmcrypt
+    swsusp
+    video
+    tricks
+
+    userland-swsusp
+
+    powercap/powercap
+
+    regulator/consumer
+    regulator/design
+    regulator/machine
+    regulator/overview
+    regulator/regulator
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/power/interface.rst b/Documentation/power/interface.rst
new file mode 100644
index 000000000000..8d270ed27228
--- /dev/null
+++ b/Documentation/power/interface.rst
@@ -0,0 +1,79 @@
+===========================================
+Power Management Interface for System Sleep
+===========================================
+
+Copyright (c) 2016 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+The power management subsystem provides userspace with a unified sysfs interface
+for system sleep regardless of the underlying system architecture or platform.
+The interface is located in the /sys/power/ directory (assuming that sysfs is
+mounted at /sys).
+
+/sys/power/state is the system sleep state control file.
+
+Reading from it returns a list of supported sleep states, encoded as:
+
+- 'freeze' (Suspend-to-Idle)
+- 'standby' (Power-On Suspend)
+- 'mem' (Suspend-to-RAM)
+- 'disk' (Suspend-to-Disk)
+
+Suspend-to-Idle is always supported.  Suspend-to-Disk is always supported
+too as long the kernel has been configured to support hibernation at all
+(ie. CONFIG_HIBERNATION is set in the kernel configuration file).  Support
+for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the
+platform.
+
+If one of the strings listed in /sys/power/state is written to it, the system
+will attempt to transition into the corresponding sleep state.  Refer to
+Documentation/admin-guide/pm/sleep-states.rst for a description of each of
+those states.
+
+/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk).
+Specifically, it tells the kernel what to do after creating a hibernation image.
+
+Reading from it returns a list of supported options encoded as:
+
+- 'platform' (put the system into sleep using a platform-provided method)
+- 'shutdown' (shut the system down)
+- 'reboot' (reboot the system)
+- 'suspend' (trigger a Suspend-to-RAM transition)
+- 'test_resume' (resume-after-hibernation test mode)
+
+The currently selected option is printed in square brackets.
+
+The 'platform' option is only available if the platform provides a special
+mechanism to put the system to sleep after creating a hibernation image (ACPI
+does that, for example).  The 'suspend' option is available if Suspend-to-RAM
+is supported.  Refer to Documentation/power/basic-pm-debugging.rst for the
+description of the 'test_resume' option.
+
+To select an option, write the string representing it to /sys/power/disk.
+
+/sys/power/image_size controls the size of hibernation images.
+
+It can be written a string representing a non-negative integer that will be
+used as a best-effort upper limit of the image size, in bytes.  The hibernation
+core will do its best to ensure that the image size will not exceed that number.
+However, if that turns out to be impossible to achieve, a hibernation image will
+still be created and its size will be as small as possible.  In particular,
+writing '0' to this file will enforce hibernation images to be as small as
+possible.
+
+Reading from this file returns the current image size limit, which is set to
+around 2/5 of available RAM by default.
+
+/sys/power/pm_trace controls the PM trace mechanism saving the last suspend
+or resume event point in the RTC across reboots.
+
+It helps to debug hard lockups or reboots due to device driver failures that
+occur during system suspend or resume (which is more common) more effectively.
+
+If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume
+event point in turn will be stored in the RTC memory (overwriting the actual
+RTC information), so it will survive a system crash if one occurs right after
+storing it and it can be used later to identify the driver that caused the crash
+to happen (see Documentation/power/s2ram.rst for more information).
+
+Initially it contains '0' which may be changed to '1' by writing a string
+representing a nonzero integer into it.
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
deleted file mode 100644
index 27df7f98668a..000000000000
--- a/Documentation/power/interface.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-Power Management Interface for System Sleep
-
-Copyright (c) 2016 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-The power management subsystem provides userspace with a unified sysfs interface
-for system sleep regardless of the underlying system architecture or platform.
-The interface is located in the /sys/power/ directory (assuming that sysfs is
-mounted at /sys).
-
-/sys/power/state is the system sleep state control file.
-
-Reading from it returns a list of supported sleep states, encoded as:
-
-'freeze' (Suspend-to-Idle)
-'standby' (Power-On Suspend)
-'mem' (Suspend-to-RAM)
-'disk' (Suspend-to-Disk)
-
-Suspend-to-Idle is always supported.  Suspend-to-Disk is always supported
-too as long the kernel has been configured to support hibernation at all
-(ie. CONFIG_HIBERNATION is set in the kernel configuration file).  Support
-for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the
-platform.
-
-If one of the strings listed in /sys/power/state is written to it, the system
-will attempt to transition into the corresponding sleep state.  Refer to
-Documentation/admin-guide/pm/sleep-states.rst for a description of each of
-those states.
-
-/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk).
-Specifically, it tells the kernel what to do after creating a hibernation image.
-
-Reading from it returns a list of supported options encoded as:
-
-'platform' (put the system into sleep using a platform-provided method)
-'shutdown' (shut the system down)
-'reboot' (reboot the system)
-'suspend' (trigger a Suspend-to-RAM transition)
-'test_resume' (resume-after-hibernation test mode)
-
-The currently selected option is printed in square brackets.
-
-The 'platform' option is only available if the platform provides a special
-mechanism to put the system to sleep after creating a hibernation image (ACPI
-does that, for example).  The 'suspend' option is available if Suspend-to-RAM
-is supported.  Refer to Documentation/power/basic-pm-debugging.txt for the
-description of the 'test_resume' option.
-
-To select an option, write the string representing it to /sys/power/disk.
-
-/sys/power/image_size controls the size of hibernation images.
-
-It can be written a string representing a non-negative integer that will be
-used as a best-effort upper limit of the image size, in bytes.  The hibernation
-core will do its best to ensure that the image size will not exceed that number.
-However, if that turns out to be impossible to achieve, a hibernation image will
-still be created and its size will be as small as possible.  In particular,
-writing '0' to this file will enforce hibernation images to be as small as
-possible.
-
-Reading from this file returns the current image size limit, which is set to
-around 2/5 of available RAM by default.
-
-/sys/power/pm_trace controls the PM trace mechanism saving the last suspend
-or resume event point in the RTC across reboots.
-
-It helps to debug hard lockups or reboots due to device driver failures that
-occur during system suspend or resume (which is more common) more effectively.
-
-If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume
-event point in turn will be stored in the RTC memory (overwriting the actual
-RTC information), so it will survive a system crash if one occurs right after
-storing it and it can be used later to identify the driver that caused the crash
-to happen (see Documentation/power/s2ram.txt for more information).
-
-Initially it contains '0' which may be changed to '1' by writing a string
-representing a nonzero integer into it.
diff --git a/Documentation/power/opp.rst b/Documentation/power/opp.rst
new file mode 100644
index 000000000000..b3cf1def9dee
--- /dev/null
+++ b/Documentation/power/opp.rst
@@ -0,0 +1,379 @@
+==========================================
+Operating Performance Points (OPP) Library
+==========================================
+
+(C) 2009-2010 Nishanth Menon <nm@ti.com>, Texas Instruments Incorporated
+
+.. Contents
+
+  1. Introduction
+  2. Initial OPP List Registration
+  3. OPP Search Functions
+  4. OPP Availability Control Functions
+  5. OPP Data Retrieval Functions
+  6. Data Structures
+
+1. Introduction
+===============
+
+1.1 What is an Operating Performance Point (OPP)?
+-------------------------------------------------
+
+Complex SoCs of today consists of a multiple sub-modules working in conjunction.
+In an operational system executing varied use cases, not all modules in the SoC
+need to function at their highest performing frequency all the time. To
+facilitate this, sub-modules in a SoC are grouped into domains, allowing some
+domains to run at lower voltage and frequency while other domains run at
+voltage/frequency pairs that are higher.
+
+The set of discrete tuples consisting of frequency and voltage pairs that
+the device will support per domain are called Operating Performance Points or
+OPPs.
+
+As an example:
+
+Let us consider an MPU device which supports the following:
+{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V},
+{1GHz at minimum voltage of 1.3V}
+
+We can represent these as three OPPs as the following {Hz, uV} tuples:
+
+- {300000000, 1000000}
+- {800000000, 1200000}
+- {1000000000, 1300000}
+
+1.2 Operating Performance Points Library
+----------------------------------------
+
+OPP library provides a set of helper functions to organize and query the OPP
+information. The library is located in drivers/base/power/opp.c and the header
+is located in include/linux/pm_opp.h. OPP library can be enabled by enabling
+CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on
+CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to
+optionally boot at a certain OPP without needing cpufreq.
+
+Typical usage of the OPP library is as follows::
+
+ (users)	-> registers a set of default OPPs		-> (library)
+ SoC framework	-> modifies on required cases certain OPPs	-> OPP layer
+		-> queries to search/retrieve information	->
+
+OPP layer expects each domain to be represented by a unique device pointer. SoC
+framework registers a set of initial OPPs per device with the OPP layer. This
+list is expected to be an optimally small number typically around 5 per device.
+This initial list contains a set of OPPs that the framework expects to be safely
+enabled by default in the system.
+
+Note on OPP Availability
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+As the system proceeds to operate, SoC framework may choose to make certain
+OPPs available or not available on each device based on various external
+factors. Example usage: Thermal management or other exceptional situations where
+SoC framework might choose to disable a higher frequency OPP to safely continue
+operations until that OPP could be re-enabled if possible.
+
+OPP library facilitates this concept in it's implementation. The following
+operational functions operate only on available opps:
+opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count
+
+dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then
+be used for dev_pm_opp_enable/disable functions to make an opp available as required.
+
+WARNING: Users of OPP library should refresh their availability count using
+get_opp_count if dev_pm_opp_enable/disable functions are invoked for a device, the
+exact mechanism to trigger these or the notification mechanism to other
+dependent subsystems such as cpufreq are left to the discretion of the SoC
+specific framework which uses the OPP library. Similar care needs to be taken
+care to refresh the cpufreq table in cases of these operations.
+
+2. Initial OPP List Registration
+================================
+The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per
+device. It is expected that the SoC framework will register the OPP entries
+optimally- typical numbers range to be less than 5. The list generated by
+registering the OPPs is maintained by OPP library throughout the device
+operation. The SoC framework can subsequently control the availability of the
+OPPs dynamically using the dev_pm_opp_enable / disable functions.
+
+dev_pm_opp_add
+	Add a new OPP for a specific domain represented by the device pointer.
+	The OPP is defined using the frequency and voltage. Once added, the OPP
+	is assumed to be available and control of it's availability can be done
+	with the dev_pm_opp_enable/disable functions. OPP library internally stores
+	and manages this information in the opp struct. This function may be
+	used by SoC framework to define a optimal list as per the demands of
+	SoC usage environment.
+
+	WARNING:
+		Do not use this function in interrupt context.
+
+	Example::
+
+	 soc_pm_init()
+	 {
+		/* Do things */
+		r = dev_pm_opp_add(mpu_dev, 1000000, 900000);
+		if (!r) {
+			pr_err("%s: unable to register mpu opp(%d)\n", r);
+			goto no_cpufreq;
+		}
+		/* Do cpufreq things */
+	 no_cpufreq:
+		/* Do remaining things */
+	 }
+
+3. OPP Search Functions
+=======================
+High level framework such as cpufreq operates on frequencies. To map the
+frequency back to the corresponding OPP, OPP library provides handy functions
+to search the OPP list that OPP library internally manages. These search
+functions return the matching pointer representing the opp if a match is
+found, else returns error. These errors are expected to be handled by standard
+error checks such as IS_ERR() and appropriate actions taken by the caller.
+
+Callers of these functions shall call dev_pm_opp_put() after they have used the
+OPP. Otherwise the memory for the OPP will never get freed and result in
+memleak.
+
+dev_pm_opp_find_freq_exact
+	Search for an OPP based on an *exact* frequency and
+	availability. This function is especially useful to enable an OPP which
+	is not available by default.
+	Example: In a case when SoC framework detects a situation where a
+	higher frequency could be made available, it can use this function to
+	find the OPP prior to call the dev_pm_opp_enable to actually make
+	it available::
+
+	 opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+	 dev_pm_opp_put(opp);
+	 /* dont operate on the pointer.. just do a sanity check.. */
+	 if (IS_ERR(opp)) {
+		pr_err("frequency not disabled!\n");
+		/* trigger appropriate actions.. */
+	 } else {
+		dev_pm_opp_enable(dev,1000000000);
+	 }
+
+	NOTE:
+	  This is the only search function that operates on OPPs which are
+	  not available.
+
+dev_pm_opp_find_freq_floor
+	Search for an available OPP which is *at most* the
+	provided frequency. This function is useful while searching for a lesser
+	match OR operating on OPP information in the order of decreasing
+	frequency.
+	Example: To find the highest opp for a device::
+
+	 freq = ULONG_MAX;
+	 opp = dev_pm_opp_find_freq_floor(dev, &freq);
+	 dev_pm_opp_put(opp);
+
+dev_pm_opp_find_freq_ceil
+	Search for an available OPP which is *at least* the
+	provided frequency. This function is useful while searching for a
+	higher match OR operating on OPP information in the order of increasing
+	frequency.
+	Example 1: To find the lowest opp for a device::
+
+	 freq = 0;
+	 opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+	 dev_pm_opp_put(opp);
+
+	Example 2: A simplified implementation of a SoC cpufreq_driver->target::
+
+	 soc_cpufreq_target(..)
+	 {
+		/* Do stuff like policy checks etc. */
+		/* Find the best frequency match for the req */
+		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+		dev_pm_opp_put(opp);
+		if (!IS_ERR(opp))
+			soc_switch_to_freq_voltage(freq);
+		else
+			/* do something when we can't satisfy the req */
+		/* do other stuff */
+	 }
+
+4. OPP Availability Control Functions
+=====================================
+A default OPP list registered with the OPP library may not cater to all possible
+situation. The OPP library provides a set of functions to modify the
+availability of a OPP within the OPP list. This allows SoC frameworks to have
+fine grained dynamic control of which sets of OPPs are operationally available.
+These functions are intended to *temporarily* remove an OPP in conditions such
+as thermal considerations (e.g. don't use OPPx until the temperature drops).
+
+WARNING:
+	Do not use these functions in interrupt context.
+
+dev_pm_opp_enable
+	Make a OPP available for operation.
+	Example: Lets say that 1GHz OPP is to be made available only if the
+	SoC temperature is lower than a certain threshold. The SoC framework
+	implementation might choose to do something as follows::
+
+	 if (cur_temp < temp_low_thresh) {
+		/* Enable 1GHz if it was disabled */
+		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+		dev_pm_opp_put(opp);
+		/* just error check */
+		if (!IS_ERR(opp))
+			ret = dev_pm_opp_enable(dev, 1000000000);
+		else
+			goto try_something_else;
+	 }
+
+dev_pm_opp_disable
+	Make an OPP to be not available for operation
+	Example: Lets say that 1GHz OPP is to be disabled if the temperature
+	exceeds a threshold value. The SoC framework implementation might
+	choose to do something as follows::
+
+	 if (cur_temp > temp_high_thresh) {
+		/* Disable 1GHz if it was enabled */
+		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true);
+		dev_pm_opp_put(opp);
+		/* just error check */
+		if (!IS_ERR(opp))
+			ret = dev_pm_opp_disable(dev, 1000000000);
+		else
+			goto try_something_else;
+	 }
+
+5. OPP Data Retrieval Functions
+===============================
+Since OPP library abstracts away the OPP information, a set of functions to pull
+information from the OPP structure is necessary. Once an OPP pointer is
+retrieved using the search functions, the following functions can be used by SoC
+framework to retrieve the information represented inside the OPP layer.
+
+dev_pm_opp_get_voltage
+	Retrieve the voltage represented by the opp pointer.
+	Example: At a cpufreq transition to a different frequency, SoC
+	framework requires to set the voltage represented by the OPP using
+	the regulator framework to the Power Management chip providing the
+	voltage::
+
+	 soc_switch_to_freq_voltage(freq)
+	 {
+		/* do things */
+		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+		v = dev_pm_opp_get_voltage(opp);
+		dev_pm_opp_put(opp);
+		if (v)
+			regulator_set_voltage(.., v);
+		/* do other things */
+	 }
+
+dev_pm_opp_get_freq
+	Retrieve the freq represented by the opp pointer.
+	Example: Lets say the SoC framework uses a couple of helper functions
+	we could pass opp pointers instead of doing additional parameters to
+	handle quiet a bit of data parameters::
+
+	 soc_cpufreq_target(..)
+	 {
+		/* do things.. */
+		 max_freq = ULONG_MAX;
+		 max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq);
+		 requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq);
+		 if (!IS_ERR(max_opp) && !IS_ERR(requested_opp))
+			r = soc_test_validity(max_opp, requested_opp);
+		 dev_pm_opp_put(max_opp);
+		 dev_pm_opp_put(requested_opp);
+		/* do other things */
+	 }
+	 soc_test_validity(..)
+	 {
+		 if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp))
+			 return -EINVAL;
+		 if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp))
+			 return -EINVAL;
+		/* do things.. */
+	 }
+
+dev_pm_opp_get_opp_count
+	Retrieve the number of available opps for a device
+	Example: Lets say a co-processor in the SoC needs to know the available
+	frequencies in a table, the main processor can notify as following::
+
+	 soc_notify_coproc_available_frequencies()
+	 {
+		/* Do things */
+		num_available = dev_pm_opp_get_opp_count(dev);
+		speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL);
+		/* populate the table in increasing order */
+		freq = 0;
+		while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) {
+			speeds[i] = freq;
+			freq++;
+			i++;
+			dev_pm_opp_put(opp);
+		}
+
+		soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available);
+		/* Do other things */
+	 }
+
+6. Data Structures
+==================
+Typically an SoC contains multiple voltage domains which are variable. Each
+domain is represented by a device pointer. The relationship to OPP can be
+represented as follows::
+
+  SoC
+   |- device 1
+   |	|- opp 1 (availability, freq, voltage)
+   |	|- opp 2 ..
+   ...	...
+   |	`- opp n ..
+   |- device 2
+   ...
+   `- device m
+
+OPP library maintains a internal list that the SoC framework populates and
+accessed by various functions as described above. However, the structures
+representing the actual OPPs and domains are internal to the OPP library itself
+to allow for suitable abstraction reusable across systems.
+
+struct dev_pm_opp
+	The internal data structure of OPP library which is used to
+	represent an OPP. In addition to the freq, voltage, availability
+	information, it also contains internal book keeping information required
+	for the OPP library to operate on.  Pointer to this structure is
+	provided back to the users such as SoC framework to be used as a
+	identifier for OPP in the interactions with OPP layer.
+
+	WARNING:
+	  The struct dev_pm_opp pointer should not be parsed or modified by the
+	  users. The defaults of for an instance is populated by
+	  dev_pm_opp_add, but the availability of the OPP can be modified
+	  by dev_pm_opp_enable/disable functions.
+
+struct device
+	This is used to identify a domain to the OPP layer. The
+	nature of the device and it's implementation is left to the user of
+	OPP library such as the SoC framework.
+
+Overall, in a simplistic view, the data structure operations is represented as
+following::
+
+  Initialization / modification:
+              +-----+        /- dev_pm_opp_enable
+  dev_pm_opp_add --> | opp | <-------
+    |         +-----+        \- dev_pm_opp_disable
+    \-------> domain_info(device)
+
+  Search functions:
+               /-- dev_pm_opp_find_freq_ceil  ---\   +-----+
+  domain_info<---- dev_pm_opp_find_freq_exact -----> | opp |
+               \-- dev_pm_opp_find_freq_floor ---/   +-----+
+
+  Retrieval functions:
+  +-----+     /- dev_pm_opp_get_voltage
+  | opp | <---
+  +-----+     \- dev_pm_opp_get_freq
+
+  domain_info <- dev_pm_opp_get_opp_count
diff --git a/Documentation/power/opp.txt b/Documentation/power/opp.txt
deleted file mode 100644
index 0c007e250cd1..000000000000
--- a/Documentation/power/opp.txt
+++ /dev/null
@@ -1,342 +0,0 @@
-Operating Performance Points (OPP) Library
-==========================================
-
-(C) 2009-2010 Nishanth Menon <nm@ti.com>, Texas Instruments Incorporated
-
-Contents
---------
-1. Introduction
-2. Initial OPP List Registration
-3. OPP Search Functions
-4. OPP Availability Control Functions
-5. OPP Data Retrieval Functions
-6. Data Structures
-
-1. Introduction
-===============
-1.1 What is an Operating Performance Point (OPP)?
-
-Complex SoCs of today consists of a multiple sub-modules working in conjunction.
-In an operational system executing varied use cases, not all modules in the SoC
-need to function at their highest performing frequency all the time. To
-facilitate this, sub-modules in a SoC are grouped into domains, allowing some
-domains to run at lower voltage and frequency while other domains run at
-voltage/frequency pairs that are higher.
-
-The set of discrete tuples consisting of frequency and voltage pairs that
-the device will support per domain are called Operating Performance Points or
-OPPs.
-
-As an example:
-Let us consider an MPU device which supports the following:
-{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V},
-{1GHz at minimum voltage of 1.3V}
-
-We can represent these as three OPPs as the following {Hz, uV} tuples:
-{300000000, 1000000}
-{800000000, 1200000}
-{1000000000, 1300000}
-
-1.2 Operating Performance Points Library
-
-OPP library provides a set of helper functions to organize and query the OPP
-information. The library is located in drivers/base/power/opp.c and the header
-is located in include/linux/pm_opp.h. OPP library can be enabled by enabling
-CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on
-CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to
-optionally boot at a certain OPP without needing cpufreq.
-
-Typical usage of the OPP library is as follows:
-(users)		-> registers a set of default OPPs		-> (library)
-SoC framework	-> modifies on required cases certain OPPs	-> OPP layer
-		-> queries to search/retrieve information	->
-
-OPP layer expects each domain to be represented by a unique device pointer. SoC
-framework registers a set of initial OPPs per device with the OPP layer. This
-list is expected to be an optimally small number typically around 5 per device.
-This initial list contains a set of OPPs that the framework expects to be safely
-enabled by default in the system.
-
-Note on OPP Availability:
-------------------------
-As the system proceeds to operate, SoC framework may choose to make certain
-OPPs available or not available on each device based on various external
-factors. Example usage: Thermal management or other exceptional situations where
-SoC framework might choose to disable a higher frequency OPP to safely continue
-operations until that OPP could be re-enabled if possible.
-
-OPP library facilitates this concept in it's implementation. The following
-operational functions operate only on available opps:
-opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count
-
-dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then
-be used for dev_pm_opp_enable/disable functions to make an opp available as required.
-
-WARNING: Users of OPP library should refresh their availability count using
-get_opp_count if dev_pm_opp_enable/disable functions are invoked for a device, the
-exact mechanism to trigger these or the notification mechanism to other
-dependent subsystems such as cpufreq are left to the discretion of the SoC
-specific framework which uses the OPP library. Similar care needs to be taken
-care to refresh the cpufreq table in cases of these operations.
-
-2. Initial OPP List Registration
-================================
-The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per
-device. It is expected that the SoC framework will register the OPP entries
-optimally- typical numbers range to be less than 5. The list generated by
-registering the OPPs is maintained by OPP library throughout the device
-operation. The SoC framework can subsequently control the availability of the
-OPPs dynamically using the dev_pm_opp_enable / disable functions.
-
-dev_pm_opp_add - Add a new OPP for a specific domain represented by the device pointer.
-	The OPP is defined using the frequency and voltage. Once added, the OPP
-	is assumed to be available and control of it's availability can be done
-	with the dev_pm_opp_enable/disable functions. OPP library internally stores
-	and manages this information in the opp struct. This function may be
-	used by SoC framework to define a optimal list as per the demands of
-	SoC usage environment.
-
-	WARNING: Do not use this function in interrupt context.
-
-	Example:
-	 soc_pm_init()
-	 {
-		/* Do things */
-		r = dev_pm_opp_add(mpu_dev, 1000000, 900000);
-		if (!r) {
-			pr_err("%s: unable to register mpu opp(%d)\n", r);
-			goto no_cpufreq;
-		}
-		/* Do cpufreq things */
-	 no_cpufreq:
-		/* Do remaining things */
-	 }
-
-3. OPP Search Functions
-=======================
-High level framework such as cpufreq operates on frequencies. To map the
-frequency back to the corresponding OPP, OPP library provides handy functions
-to search the OPP list that OPP library internally manages. These search
-functions return the matching pointer representing the opp if a match is
-found, else returns error. These errors are expected to be handled by standard
-error checks such as IS_ERR() and appropriate actions taken by the caller.
-
-Callers of these functions shall call dev_pm_opp_put() after they have used the
-OPP. Otherwise the memory for the OPP will never get freed and result in
-memleak.
-
-dev_pm_opp_find_freq_exact - Search for an OPP based on an *exact* frequency and
-	availability. This function is especially useful to enable an OPP which
-	is not available by default.
-	Example: In a case when SoC framework detects a situation where a
-	higher frequency could be made available, it can use this function to
-	find the OPP prior to call the dev_pm_opp_enable to actually make it available.
-	 opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
-	 dev_pm_opp_put(opp);
-	 /* dont operate on the pointer.. just do a sanity check.. */
-	 if (IS_ERR(opp)) {
-		pr_err("frequency not disabled!\n");
-		/* trigger appropriate actions.. */
-	 } else {
-		dev_pm_opp_enable(dev,1000000000);
-	 }
-
-	NOTE: This is the only search function that operates on OPPs which are
-	not available.
-
-dev_pm_opp_find_freq_floor - Search for an available OPP which is *at most* the
-	provided frequency. This function is useful while searching for a lesser
-	match OR operating on OPP information in the order of decreasing
-	frequency.
-	Example: To find the highest opp for a device:
-	 freq = ULONG_MAX;
-	 opp = dev_pm_opp_find_freq_floor(dev, &freq);
-	 dev_pm_opp_put(opp);
-
-dev_pm_opp_find_freq_ceil - Search for an available OPP which is *at least* the
-	provided frequency. This function is useful while searching for a
-	higher match OR operating on OPP information in the order of increasing
-	frequency.
-	Example 1: To find the lowest opp for a device:
-	 freq = 0;
-	 opp = dev_pm_opp_find_freq_ceil(dev, &freq);
-	 dev_pm_opp_put(opp);
-	Example 2: A simplified implementation of a SoC cpufreq_driver->target:
-	 soc_cpufreq_target(..)
-	 {
-		/* Do stuff like policy checks etc. */
-		/* Find the best frequency match for the req */
-		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
-		dev_pm_opp_put(opp);
-		if (!IS_ERR(opp))
-			soc_switch_to_freq_voltage(freq);
-		else
-			/* do something when we can't satisfy the req */
-		/* do other stuff */
-	 }
-
-4. OPP Availability Control Functions
-=====================================
-A default OPP list registered with the OPP library may not cater to all possible
-situation. The OPP library provides a set of functions to modify the
-availability of a OPP within the OPP list. This allows SoC frameworks to have
-fine grained dynamic control of which sets of OPPs are operationally available.
-These functions are intended to *temporarily* remove an OPP in conditions such
-as thermal considerations (e.g. don't use OPPx until the temperature drops).
-
-WARNING: Do not use these functions in interrupt context.
-
-dev_pm_opp_enable - Make a OPP available for operation.
-	Example: Lets say that 1GHz OPP is to be made available only if the
-	SoC temperature is lower than a certain threshold. The SoC framework
-	implementation might choose to do something as follows:
-	 if (cur_temp < temp_low_thresh) {
-		/* Enable 1GHz if it was disabled */
-		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
-		dev_pm_opp_put(opp);
-		/* just error check */
-		if (!IS_ERR(opp))
-			ret = dev_pm_opp_enable(dev, 1000000000);
-		else
-			goto try_something_else;
-	 }
-
-dev_pm_opp_disable - Make an OPP to be not available for operation
-	Example: Lets say that 1GHz OPP is to be disabled if the temperature
-	exceeds a threshold value. The SoC framework implementation might
-	choose to do something as follows:
-	 if (cur_temp > temp_high_thresh) {
-		/* Disable 1GHz if it was enabled */
-		opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true);
-		dev_pm_opp_put(opp);
-		/* just error check */
-		if (!IS_ERR(opp))
-			ret = dev_pm_opp_disable(dev, 1000000000);
-		else
-			goto try_something_else;
-	 }
-
-5. OPP Data Retrieval Functions
-===============================
-Since OPP library abstracts away the OPP information, a set of functions to pull
-information from the OPP structure is necessary. Once an OPP pointer is
-retrieved using the search functions, the following functions can be used by SoC
-framework to retrieve the information represented inside the OPP layer.
-
-dev_pm_opp_get_voltage - Retrieve the voltage represented by the opp pointer.
-	Example: At a cpufreq transition to a different frequency, SoC
-	framework requires to set the voltage represented by the OPP using
-	the regulator framework to the Power Management chip providing the
-	voltage.
-	 soc_switch_to_freq_voltage(freq)
-	 {
-		/* do things */
-		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
-		v = dev_pm_opp_get_voltage(opp);
-		dev_pm_opp_put(opp);
-		if (v)
-			regulator_set_voltage(.., v);
-		/* do other things */
-	 }
-
-dev_pm_opp_get_freq - Retrieve the freq represented by the opp pointer.
-	Example: Lets say the SoC framework uses a couple of helper functions
-	we could pass opp pointers instead of doing additional parameters to
-	handle quiet a bit of data parameters.
-	 soc_cpufreq_target(..)
-	 {
-		/* do things.. */
-		 max_freq = ULONG_MAX;
-		 max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq);
-		 requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq);
-		 if (!IS_ERR(max_opp) && !IS_ERR(requested_opp))
-			r = soc_test_validity(max_opp, requested_opp);
-		 dev_pm_opp_put(max_opp);
-		 dev_pm_opp_put(requested_opp);
-		/* do other things */
-	 }
-	 soc_test_validity(..)
-	 {
-		 if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp))
-			 return -EINVAL;
-		 if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp))
-			 return -EINVAL;
-		/* do things.. */
-	 }
-
-dev_pm_opp_get_opp_count - Retrieve the number of available opps for a device
-	Example: Lets say a co-processor in the SoC needs to know the available
-	frequencies in a table, the main processor can notify as following:
-	 soc_notify_coproc_available_frequencies()
-	 {
-		/* Do things */
-		num_available = dev_pm_opp_get_opp_count(dev);
-		speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL);
-		/* populate the table in increasing order */
-		freq = 0;
-		while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) {
-			speeds[i] = freq;
-			freq++;
-			i++;
-			dev_pm_opp_put(opp);
-		}
-
-		soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available);
-		/* Do other things */
-	 }
-
-6. Data Structures
-==================
-Typically an SoC contains multiple voltage domains which are variable. Each
-domain is represented by a device pointer. The relationship to OPP can be
-represented as follows:
-SoC
- |- device 1
- |	|- opp 1 (availability, freq, voltage)
- |	|- opp 2 ..
- ...	...
- |	`- opp n ..
- |- device 2
- ...
- `- device m
-
-OPP library maintains a internal list that the SoC framework populates and
-accessed by various functions as described above. However, the structures
-representing the actual OPPs and domains are internal to the OPP library itself
-to allow for suitable abstraction reusable across systems.
-
-struct dev_pm_opp - The internal data structure of OPP library which is used to
-	represent an OPP. In addition to the freq, voltage, availability
-	information, it also contains internal book keeping information required
-	for the OPP library to operate on.  Pointer to this structure is
-	provided back to the users such as SoC framework to be used as a
-	identifier for OPP in the interactions with OPP layer.
-
-	WARNING: The struct dev_pm_opp pointer should not be parsed or modified by the
-	users. The defaults of for an instance is populated by dev_pm_opp_add, but the
-	availability of the OPP can be modified by dev_pm_opp_enable/disable functions.
-
-struct device - This is used to identify a domain to the OPP layer. The
-	nature of the device and it's implementation is left to the user of
-	OPP library such as the SoC framework.
-
-Overall, in a simplistic view, the data structure operations is represented as
-following:
-
-Initialization / modification:
-            +-----+        /- dev_pm_opp_enable
-dev_pm_opp_add --> | opp | <-------
-  |         +-----+        \- dev_pm_opp_disable
-  \-------> domain_info(device)
-
-Search functions:
-             /-- dev_pm_opp_find_freq_ceil  ---\   +-----+
-domain_info<---- dev_pm_opp_find_freq_exact -----> | opp |
-             \-- dev_pm_opp_find_freq_floor ---/   +-----+
-
-Retrieval functions:
-+-----+     /- dev_pm_opp_get_voltage
-| opp | <---
-+-----+     \- dev_pm_opp_get_freq
-
-domain_info <- dev_pm_opp_get_opp_count
diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst
new file mode 100644
index 000000000000..0e2ef7429304
--- /dev/null
+++ b/Documentation/power/pci.rst
@@ -0,0 +1,1135 @@
+====================
+PCI Power Management
+====================
+
+Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+
+An overview of concepts and the Linux kernel's interfaces related to PCI power
+management.  Based on previous work by Patrick Mochel <mochel@transmeta.com>
+(and others).
+
+This document only covers the aspects of power management specific to PCI
+devices.  For general description of the kernel's interfaces related to device
+power management refer to Documentation/driver-api/pm/devices.rst and
+Documentation/power/runtime_pm.rst.
+
+.. contents:
+
+   1. Hardware and Platform Support for PCI Power Management
+   2. PCI Subsystem and Device Power Management
+   3. PCI Device Drivers and Power Management
+   4. Resources
+
+
+1. Hardware and Platform Support for PCI Power Management
+=========================================================
+
+1.1. Native and Platform-Based Power Management
+-----------------------------------------------
+
+In general, power management is a feature allowing one to save energy by putting
+devices into states in which they draw less power (low-power states) at the
+price of reduced functionality or performance.
+
+Usually, a device is put into a low-power state when it is underutilized or
+completely inactive.  However, when it is necessary to use the device once
+again, it has to be put back into the "fully functional" state (full-power
+state).  This may happen when there are some data for the device to handle or
+as a result of an external event requiring the device to be active, which may
+be signaled by the device itself.
+
+PCI devices may be put into low-power states in two ways, by using the device
+capabilities introduced by the PCI Bus Power Management Interface Specification,
+or with the help of platform firmware, such as an ACPI BIOS.  In the first
+approach, that is referred to as the native PCI power management (native PCI PM)
+in what follows, the device power state is changed as a result of writing a
+specific value into one of its standard configuration registers.  The second
+approach requires the platform firmware to provide special methods that may be
+used by the kernel to change the device's power state.
+
+Devices supporting the native PCI PM usually can generate wakeup signals called
+Power Management Events (PMEs) to let the kernel know about external events
+requiring the device to be active.  After receiving a PME the kernel is supposed
+to put the device that sent it into the full-power state.  However, the PCI Bus
+Power Management Interface Specification doesn't define any standard method of
+delivering the PME from the device to the CPU and the operating system kernel.
+It is assumed that the platform firmware will perform this task and therefore,
+even though a PCI device is set up to generate PMEs, it also may be necessary to
+prepare the platform firmware for notifying the CPU of the PMEs coming from the
+device (e.g. by generating interrupts).
+
+In turn, if the methods provided by the platform firmware are used for changing
+the power state of a device, usually the platform also provides a method for
+preparing the device to generate wakeup signals.  In that case, however, it
+often also is necessary to prepare the device for generating PMEs using the
+native PCI PM mechanism, because the method provided by the platform depends on
+that.
+
+Thus in many situations both the native and the platform-based power management
+mechanisms have to be used simultaneously to obtain the desired result.
+
+1.2. Native PCI Power Management
+--------------------------------
+
+The PCI Bus Power Management Interface Specification (PCI PM Spec) was
+introduced between the PCI 2.1 and PCI 2.2 Specifications.  It defined a
+standard interface for performing various operations related to power
+management.
+
+The implementation of the PCI PM Spec is optional for conventional PCI devices,
+but it is mandatory for PCI Express devices.  If a device supports the PCI PM
+Spec, it has an 8 byte power management capability field in its PCI
+configuration space.  This field is used to describe and control the standard
+features related to the native PCI power management.
+
+The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses
+(B0-B3).  The higher the number, the less power is drawn by the device or bus
+in that state.  However, the higher the number, the longer the latency for
+the device or bus to return to the full-power state (D0 or B0, respectively).
+
+There are two variants of the D3 state defined by the specification.  The first
+one is D3hot, referred to as the software accessible D3, because devices can be
+programmed to go into it.  The second one, D3cold, is the state that PCI devices
+are in when the supply voltage (Vcc) is removed from them.  It is not possible
+to program a PCI device to go into D3cold, although there may be a programmable
+interface for putting the bus the device is on into a state in which Vcc is
+removed from all devices on the bus.
+
+PCI bus power management, however, is not supported by the Linux kernel at the
+time of this writing and therefore it is not covered by this document.
+
+Note that every PCI device can be in the full-power state (D0) or in D3cold,
+regardless of whether or not it implements the PCI PM Spec.  In addition to
+that, if the PCI PM Spec is implemented by the device, it must support D3hot
+as well as D0.  The support for the D1 and D2 power states is optional.
+
+PCI devices supporting the PCI PM Spec can be programmed to go to any of the
+supported low-power states (except for D3cold).  While in D1-D3hot the
+standard configuration registers of the device must be accessible to software
+(i.e. the device is required to respond to PCI configuration accesses), although
+its I/O and memory spaces are then disabled.  This allows the device to be
+programmatically put into D0.  Thus the kernel can switch the device back and
+forth between D0 and the supported low-power states (except for D3cold) and the
+possible power state transitions the device can undergo are the following:
+
++----------------------------+
+| Current State | New State  |
++----------------------------+
+| D0            | D1, D2, D3 |
++----------------------------+
+| D1            | D2, D3     |
++----------------------------+
+| D2            | D3         |
++----------------------------+
+| D1, D2, D3    | D0         |
++----------------------------+
+
+The transition from D3cold to D0 occurs when the supply voltage is provided to
+the device (i.e. power is restored).  In that case the device returns to D0 with
+a full power-on reset sequence and the power-on defaults are restored to the
+device by hardware just as at initial power up.
+
+PCI devices supporting the PCI PM Spec can be programmed to generate PMEs
+while in a low-power state (D1-D3), but they are not required to be capable
+of generating PMEs from all supported low-power states.  In particular, the
+capability of generating PMEs from D3cold is optional and depends on the
+presence of additional voltage (3.3Vaux) allowing the device to remain
+sufficiently active to generate a wakeup signal.
+
+1.3. ACPI Device Power Management
+---------------------------------
+
+The platform firmware support for the power management of PCI devices is
+system-specific.  However, if the system in question is compliant with the
+Advanced Configuration and Power Interface (ACPI) Specification, like the
+majority of x86-based systems, it is supposed to implement device power
+management interfaces defined by the ACPI standard.
+
+For this purpose the ACPI BIOS provides special functions called "control
+methods" that may be executed by the kernel to perform specific tasks, such as
+putting a device into a low-power state.  These control methods are encoded
+using special byte-code language called the ACPI Machine Language (AML) and
+stored in the machine's BIOS.  The kernel loads them from the BIOS and executes
+them as needed using an AML interpreter that translates the AML byte code into
+computations and memory or I/O space accesses.  This way, in theory, a BIOS
+writer can provide the kernel with a means to perform actions depending
+on the system design in a system-specific fashion.
+
+ACPI control methods may be divided into global control methods, that are not
+associated with any particular devices, and device control methods, that have
+to be defined separately for each device supposed to be handled with the help of
+the platform.  This means, in particular, that ACPI device control methods can
+only be used to handle devices that the BIOS writer knew about in advance.  The
+ACPI methods used for device power management fall into that category.
+
+The ACPI specification assumes that devices can be in one of four power states
+labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM
+D0-D3 states (although the difference between D3hot and D3cold is not taken
+into account by ACPI).  Moreover, for each power state of a device there is a
+set of power resources that have to be enabled for the device to be put into
+that state.  These power resources are controlled (i.e. enabled or disabled)
+with the help of their own control methods, _ON and _OFF, that have to be
+defined individually for each of them.
+
+To put a device into the ACPI power state Dx (where x is a number between 0 and
+3 inclusive) the kernel is supposed to (1) enable the power resources required
+by the device in this state using their _ON control methods and (2) execute the
+_PSx control method defined for the device.  In addition to that, if the device
+is going to be put into a low-power state (D1-D3) and is supposed to generate
+wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI
+3.0) control method defined for it has to be executed before _PSx.  Power
+resources that are not required by the device in the target power state and are
+not required any more by any other device should be disabled (by executing their
+_OFF control methods).  If the current power state of the device is D3, it can
+only be put into D0 this way.
+
+However, quite often the power states of devices are changed during a
+system-wide transition into a sleep state or back into the working state.  ACPI
+defines four system sleep states, S1, S2, S3, and S4, and denotes the system
+working state as S0.  In general, the target system sleep (or working) state
+determines the highest power (lowest number) state the device can be put
+into and the kernel is supposed to obtain this information by executing the
+device's _SxD control method (where x is a number between 0 and 4 inclusive).
+If the device is required to wake up the system from the target sleep state, the
+lowest power (highest number) state it can be put into is also determined by the
+target state of the system.  The kernel is then supposed to use the device's
+_SxW control method to obtain the number of that state.  It also is supposed to
+use the device's _PRW control method to learn which power resources need to be
+enabled for the device to be able to generate wakeup signals.
+
+1.4. Wakeup Signaling
+---------------------
+
+Wakeup signals generated by PCI devices, either as native PCI PMEs, or as
+a result of the execution of the _DSW (or _PSW) ACPI control method before
+putting the device into a low-power state, have to be caught and handled as
+appropriate.  If they are sent while the system is in the working state
+(ACPI S0), they should be translated into interrupts so that the kernel can
+put the devices generating them into the full-power state and take care of the
+events that triggered them.  In turn, if they are sent while the system is
+sleeping, they should cause the system's core logic to trigger wakeup.
+
+On ACPI-based systems wakeup signals sent by conventional PCI devices are
+converted into ACPI General-Purpose Events (GPEs) which are hardware signals
+from the system core logic generated in response to various events that need to
+be acted upon.  Every GPE is associated with one or more sources of potentially
+interesting events.  In particular, a GPE may be associated with a PCI device
+capable of signaling wakeup.  The information on the connections between GPEs
+and event sources is recorded in the system's ACPI BIOS from where it can be
+read by the kernel.
+
+If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE
+associated with it (if there is one) is triggered.  The GPEs associated with PCI
+bridges may also be triggered in response to a wakeup signal from one of the
+devices below the bridge (this also is the case for root bridges) and, for
+example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be
+handled this way.
+
+A GPE may be triggered when the system is sleeping (i.e. when it is in one of
+the ACPI S1-S4 states), in which case system wakeup is started by its core logic
+(the device that was the source of the signal causing the system wakeup to occur
+may be identified later).  The GPEs used in such situations are referred to as
+wakeup GPEs.
+
+Usually, however, GPEs are also triggered when the system is in the working
+state (ACPI S0) and in that case the system's core logic generates a System
+Control Interrupt (SCI) to notify the kernel of the event.  Then, the SCI
+handler identifies the GPE that caused the interrupt to be generated which,
+in turn, allows the kernel to identify the source of the event (that may be
+a PCI device signaling wakeup).  The GPEs used for notifying the kernel of
+events occurring while the system is in the working state are referred to as
+runtime GPEs.
+
+Unfortunately, there is no standard way of handling wakeup signals sent by
+conventional PCI devices on systems that are not ACPI-based, but there is one
+for PCI Express devices.  Namely, the PCI Express Base Specification introduced
+a native mechanism for converting native PCI PMEs into interrupts generated by
+root ports.  For conventional PCI devices native PMEs are out-of-band, so they
+are routed separately and they need not pass through bridges (in principle they
+may be routed directly to the system's core logic), but for PCI Express devices
+they are in-band messages that have to pass through the PCI Express hierarchy,
+including the root port on the path from the device to the Root Complex.  Thus
+it was possible to introduce a mechanism by which a root port generates an
+interrupt whenever it receives a PME message from one of the devices below it.
+The PCI Express Requester ID of the device that sent the PME message is then
+recorded in one of the root port's configuration registers from where it may be
+read by the interrupt handler allowing the device to be identified.  [PME
+messages sent by PCI Express endpoints integrated with the Root Complex don't
+pass through root ports, but instead they cause a Root Complex Event Collector
+(if there is one) to generate interrupts.]
+
+In principle the native PCI Express PME signaling may also be used on ACPI-based
+systems along with the GPEs, but to use it the kernel has to ask the system's
+ACPI BIOS to release control of root port configuration registers.  The ACPI
+BIOS, however, is not required to allow the kernel to control these registers
+and if it doesn't do that, the kernel must not modify their contents.  Of course
+the native PCI Express PME signaling cannot be used by the kernel in that case.
+
+
+2. PCI Subsystem and Device Power Management
+============================================
+
+2.1. Device Power Management Callbacks
+--------------------------------------
+
+The PCI Subsystem participates in the power management of PCI devices in a
+number of ways.  First of all, it provides an intermediate code layer between
+the device power management core (PM core) and PCI device drivers.
+Specifically, the pm field of the PCI subsystem's struct bus_type object,
+pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing
+pointers to several device power management callbacks::
+
+  const struct dev_pm_ops pci_dev_pm_ops = {
+	.prepare = pci_pm_prepare,
+	.complete = pci_pm_complete,
+	.suspend = pci_pm_suspend,
+	.resume = pci_pm_resume,
+	.freeze = pci_pm_freeze,
+	.thaw = pci_pm_thaw,
+	.poweroff = pci_pm_poweroff,
+	.restore = pci_pm_restore,
+	.suspend_noirq = pci_pm_suspend_noirq,
+	.resume_noirq = pci_pm_resume_noirq,
+	.freeze_noirq = pci_pm_freeze_noirq,
+	.thaw_noirq = pci_pm_thaw_noirq,
+	.poweroff_noirq = pci_pm_poweroff_noirq,
+	.restore_noirq = pci_pm_restore_noirq,
+	.runtime_suspend = pci_pm_runtime_suspend,
+	.runtime_resume = pci_pm_runtime_resume,
+	.runtime_idle = pci_pm_runtime_idle,
+  };
+
+These callbacks are executed by the PM core in various situations related to
+device power management and they, in turn, execute power management callbacks
+provided by PCI device drivers.  They also perform power management operations
+involving some standard configuration registers of PCI devices that device
+drivers need not know or care about.
+
+The structure representing a PCI device, struct pci_dev, contains several fields
+that these callbacks operate on::
+
+  struct pci_dev {
+	...
+	pci_power_t     current_state;  /* Current operating state. */
+	int		pm_cap;		/* PM capability offset in the
+					   configuration space */
+	unsigned int	pme_support:5;	/* Bitmask of states from which PME#
+					   can be generated */
+	unsigned int	pme_interrupt:1;/* Is native PCIe PME signaling used? */
+	unsigned int	d1_support:1;	/* Low power state D1 is supported */
+	unsigned int	d2_support:1;	/* Low power state D2 is supported */
+	unsigned int	no_d1d2:1;	/* D1 and D2 are forbidden */
+	unsigned int	wakeup_prepared:1;  /* Device prepared for wake up */
+	unsigned int	d3_delay;	/* D3->D0 transition time in ms */
+	...
+  };
+
+They also indirectly use some fields of the struct device that is embedded in
+struct pci_dev.
+
+2.2. Device Initialization
+--------------------------
+
+The PCI subsystem's first task related to device power management is to
+prepare the device for power management and initialize the fields of struct
+pci_dev used for this purpose.  This happens in two functions defined in
+drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init().
+
+The first of these functions checks if the device supports native PCI PM
+and if that's the case the offset of its power management capability structure
+in the configuration space is stored in the pm_cap field of the device's struct
+pci_dev object.  Next, the function checks which PCI low-power states are
+supported by the device and from which low-power states the device can generate
+native PCI PMEs.  The power management fields of the device's struct pci_dev and
+the struct device embedded in it are updated accordingly and the generation of
+PMEs by the device is disabled.
+
+The second function checks if the device can be prepared to signal wakeup with
+the help of the platform firmware, such as the ACPI BIOS.  If that is the case,
+the function updates the wakeup fields in struct device embedded in the
+device's struct pci_dev and uses the firmware-provided method to prevent the
+device from signaling wakeup.
+
+At this point the device is ready for power management.  For driverless devices,
+however, this functionality is limited to a few basic operations carried out
+during system-wide transitions to a sleep state and back to the working state.
+
+2.3. Runtime Device Power Management
+------------------------------------
+
+The PCI subsystem plays a vital role in the runtime power management of PCI
+devices.  For this purpose it uses the general runtime power management
+(runtime PM) framework described in Documentation/power/runtime_pm.rst.
+Namely, it provides subsystem-level callbacks::
+
+	pci_pm_runtime_suspend()
+	pci_pm_runtime_resume()
+	pci_pm_runtime_idle()
+
+that are executed by the core runtime PM routines.  It also implements the
+entire mechanics necessary for handling runtime wakeup signals from PCI devices
+in low-power states, which at the time of this writing works for both the native
+PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in
+Section 1.
+
+First, a PCI device is put into a low-power state, or suspended, with the help
+of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call
+pci_pm_runtime_suspend() to do the actual job.  For this to work, the device's
+driver has to provide a pm->runtime_suspend() callback (see below), which is
+run by pci_pm_runtime_suspend() as the first action.  If the driver's callback
+returns successfully, the device's standard configuration registers are saved,
+the device is prepared to generate wakeup signals and, finally, it is put into
+the target low-power state.
+
+The low-power state to put the device into is the lowest-power (highest number)
+state from which it can signal wakeup.  The exact method of signaling wakeup is
+system-dependent and is determined by the PCI subsystem on the basis of the
+reported capabilities of the device and the platform firmware.  To prepare the
+device for signaling wakeup and put it into the selected low-power state, the
+PCI subsystem can use the platform firmware as well as the device's native PCI
+PM capabilities, if supported.
+
+It is expected that the device driver's pm->runtime_suspend() callback will
+not attempt to prepare the device for signaling wakeup or to put it into a
+low-power state.  The driver ought to leave these tasks to the PCI subsystem
+that has all of the information necessary to perform them.
+
+A suspended device is brought back into the "active" state, or resumed,
+with the help of pm_request_resume() or pm_runtime_resume() which both call
+pci_pm_runtime_resume() for PCI devices.  Again, this only works if the device's
+driver provides a pm->runtime_resume() callback (see below).  However, before
+the driver's callback is executed, pci_pm_runtime_resume() brings the device
+back into the full-power state, prevents it from signaling wakeup while in that
+state and restores its standard configuration registers.  Thus the driver's
+callback need not worry about the PCI-specific aspects of the device resume.
+
+Note that generally pci_pm_runtime_resume() may be called in two different
+situations.  First, it may be called at the request of the device's driver, for
+example if there are some data for it to process.  Second, it may be called
+as a result of a wakeup signal from the device itself (this sometimes is
+referred to as "remote wakeup").  Of course, for this purpose the wakeup signal
+is handled in one of the ways described in Section 1 and finally converted into
+a notification for the PCI subsystem after the source device has been
+identified.
+
+The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
+and pm_request_idle(), executes the device driver's pm->runtime_idle()
+callback, if defined, and if that callback doesn't return error code (or is not
+present at all), suspends the device with the help of pm_runtime_suspend().
+Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
+example, it is called right after the device has just been resumed), in which
+cases it is expected to suspend the device if that makes sense.  Usually,
+however, the PCI subsystem doesn't really know if the device really can be
+suspended, so it lets the device's driver decide by running its
+pm->runtime_idle() callback.
+
+2.4. System-Wide Power Transitions
+----------------------------------
+There are a few different types of system-wide power transitions, described in
+Documentation/driver-api/pm/devices.rst.  Each of them requires devices to be handled
+in a specific way and the PM core executes subsystem-level power management
+callbacks for this purpose.  They are executed in phases such that each phase
+involves executing the same subsystem-level callback for every device belonging
+to the given subsystem before the next phase begins.  These phases always run
+after tasks have been frozen.
+
+2.4.1. System Suspend
+^^^^^^^^^^^^^^^^^^^^^
+
+When the system is going into a sleep state in which the contents of memory will
+be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
+
+	prepare, suspend, suspend_noirq.
+
+The following PCI bus type's callbacks, respectively, are used in these phases::
+
+	pci_pm_prepare()
+	pci_pm_suspend()
+	pci_pm_suspend_noirq()
+
+The pci_pm_prepare() routine first puts the device into the "fully functional"
+state with the help of pm_runtime_resume().  Then, it executes the device
+driver's pm->prepare() callback if defined (i.e. if the driver's struct
+dev_pm_ops object is present and the prepare pointer in that object is valid).
+
+The pci_pm_suspend() routine first checks if the device's driver implements
+legacy PCI suspend routines (see Section 3), in which case the driver's legacy
+suspend callback is executed, if present, and its result is returned.  Next, if
+the device's driver doesn't provide a struct dev_pm_ops object (containing
+pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
+simply turns off the device's bus master capability and runs
+pcibios_disable_device() to disable it, unless the device is a bridge (PCI
+bridges are ignored by this routine).  Next, the device driver's pm->suspend()
+callback is executed, if defined, and its result is returned if it fails.
+Finally, pci_fixup_device() is called to apply hardware suspend quirks related
+to the device if necessary.
+
+Note that the suspend phase is carried out asynchronously for PCI devices, so
+the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
+devices that don't depend on each other in a known way (i.e. none of the paths
+in the device tree from the root bridge to a leaf device contains both of them).
+
+The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
+been called, which means that the device driver's interrupt handler won't be
+invoked while this routine is running.  It first checks if the device's driver
+implements legacy PCI suspends routines (Section 3), in which case the legacy
+late suspend routine is called and its result is returned (the standard
+configuration registers of the device are saved if the driver's callback hasn't
+done that).  Second, if the device driver's struct dev_pm_ops object is not
+present, the device's standard configuration registers are saved and the routine
+returns success.  Otherwise the device driver's pm->suspend_noirq() callback is
+executed, if present, and its result is returned if it fails.  Next, if the
+device's standard configuration registers haven't been saved yet (one of the
+device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
+saves them, prepares the device to signal wakeup (if necessary) and puts it into
+a low-power state.
+
+The low-power state to put the device into is the lowest-power (highest number)
+state from which it can signal wakeup while the system is in the target sleep
+state.  Just like in the runtime PM case described above, the mechanism of
+signaling wakeup is system-dependent and determined by the PCI subsystem, which
+is also responsible for preparing the device to signal wakeup from the system's
+target sleep state as appropriate.
+
+PCI device drivers (that don't implement legacy power management callbacks) are
+generally not expected to prepare devices for signaling wakeup or to put them
+into low-power states.  However, if one of the driver's suspend callbacks
+(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
+registers, pci_pm_suspend_noirq() will assume that the device has been prepared
+to signal wakeup and put into a low-power state by the driver (the driver is
+then assumed to have used the helper functions provided by the PCI subsystem for
+this purpose).  PCI device drivers are not encouraged to do that, but in some
+rare cases doing that in the driver may be the optimum approach.
+
+2.4.2. System Resume
+^^^^^^^^^^^^^^^^^^^^
+
+When the system is undergoing a transition from a sleep state in which the
+contents of memory have been preserved, such as one of the ACPI sleep states
+S1-S3, into the working state (ACPI S0), the phases are:
+
+	resume_noirq, resume, complete.
+
+The following PCI bus type's callbacks, respectively, are executed in these
+phases::
+
+	pci_pm_resume_noirq()
+	pci_pm_resume()
+	pci_pm_complete()
+
+The pci_pm_resume_noirq() routine first puts the device into the full-power
+state, restores its standard configuration registers and applies early resume
+hardware quirks related to the device, if necessary.  This is done
+unconditionally, regardless of whether or not the device's driver implements
+legacy PCI power management callbacks (this way all PCI devices are in the
+full-power state and their standard configuration registers have been restored
+when their interrupt handlers are invoked for the first time during resume,
+which allows the kernel to avoid problems with the handling of shared interrupts
+by drivers whose devices are still suspended).  If legacy PCI power management
+callbacks (see Section 3) are implemented by the device's driver, the legacy
+early resume callback is executed and its result is returned.  Otherwise, the
+device driver's pm->resume_noirq() callback is executed, if defined, and its
+result is returned.
+
+The pci_pm_resume() routine first checks if the device's standard configuration
+registers have been restored and restores them if that's not the case (this
+only is necessary in the error path during a failing suspend).  Next, resume
+hardware quirks related to the device are applied, if necessary, and if the
+device's driver implements legacy PCI power management callbacks (see
+Section 3), the driver's legacy resume callback is executed and its result is
+returned.  Otherwise, the device's wakeup signaling mechanisms are blocked and
+its driver's pm->resume() callback is executed, if defined (the callback's
+result is then returned).
+
+The resume phase is carried out asynchronously for PCI devices, like the
+suspend phase described above, which means that if two PCI devices don't depend
+on each other in a known way, the pci_pm_resume() routine may be executed for
+the both of them in parallel.
+
+The pci_pm_complete() routine only executes the device driver's pm->complete()
+callback, if defined.
+
+2.4.3. System Hibernation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+System hibernation is more complicated than system suspend, because it requires
+a system image to be created and written into a persistent storage medium.  The
+image is created atomically and all devices are quiesced, or frozen, before that
+happens.
+
+The freezing of devices is carried out after enough memory has been freed (at
+the time of this writing the image creation requires at least 50% of system RAM
+to be free) in the following three phases:
+
+	prepare, freeze, freeze_noirq
+
+that correspond to the PCI bus type's callbacks::
+
+	pci_pm_prepare()
+	pci_pm_freeze()
+	pci_pm_freeze_noirq()
+
+This means that the prepare phase is exactly the same as for system suspend.
+The other two phases, however, are different.
+
+The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
+the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
+and it doesn't apply the suspend-related hardware quirks.  It is executed
+asynchronously for different PCI devices that don't depend on each other in a
+known way.
+
+The pci_pm_freeze_noirq() routine, in turn, is similar to
+pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
+routine instead of pm->suspend_noirq().  It also doesn't attempt to prepare the
+device for signaling wakeup and put it into a low-power state.  Still, it saves
+the device's standard configuration registers if they haven't been saved by one
+of the driver's callbacks.
+
+Once the image has been created, it has to be saved.  However, at this point all
+devices are frozen and they cannot handle I/O, while their ability to handle
+I/O is obviously necessary for the image saving.  Thus they have to be brought
+back to the fully functional state and this is done in the following phases:
+
+	thaw_noirq, thaw, complete
+
+using the following PCI bus type's callbacks::
+
+	pci_pm_thaw_noirq()
+	pci_pm_thaw()
+	pci_pm_complete()
+
+respectively.
+
+The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
+but it doesn't put the device into the full power state and doesn't attempt to
+restore its standard configuration registers.  It also executes the device
+driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
+
+The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
+driver's pm->thaw() callback instead of pm->resume().  It is executed
+asynchronously for different PCI devices that don't depend on each other in a
+known way.
+
+The complete phase it the same as for system resume.
+
+After saving the image, devices need to be powered down before the system can
+enter the target sleep state (ACPI S4 for ACPI-based systems).  This is done in
+three phases:
+
+	prepare, poweroff, poweroff_noirq
+
+where the prepare phase is exactly the same as for system suspend.  The other
+two phases are analogous to the suspend and suspend_noirq phases, respectively.
+The PCI subsystem-level callbacks they correspond to::
+
+	pci_pm_poweroff()
+	pci_pm_poweroff_noirq()
+
+work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively,
+although they don't attempt to save the device's standard configuration
+registers.
+
+2.4.4. System Restore
+^^^^^^^^^^^^^^^^^^^^^
+
+System restore requires a hibernation image to be loaded into memory and the
+pre-hibernation memory contents to be restored before the pre-hibernation system
+activity can be resumed.
+
+As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded
+into memory by a fresh instance of the kernel, called the boot kernel, which in
+turn is loaded and run by a boot loader in the usual way.  After the boot kernel
+has loaded the image, it needs to replace its own code and data with the code
+and data of the "hibernated" kernel stored within the image, called the image
+kernel.  For this purpose all devices are frozen just like before creating
+the image during hibernation, in the
+
+	prepare, freeze, freeze_noirq
+
+phases described above.  However, the devices affected by these phases are only
+those having drivers in the boot kernel; other devices will still be in whatever
+state the boot loader left them.
+
+Should the restoration of the pre-hibernation memory contents fail, the boot
+kernel would go through the "thawing" procedure described above, using the
+thaw_noirq, thaw, and complete phases (that will only affect the devices having
+drivers in the boot kernel), and then continue running normally.
+
+If the pre-hibernation memory contents are restored successfully, which is the
+usual situation, control is passed to the image kernel, which then becomes
+responsible for bringing the system back to the working state.  To achieve this,
+it must restore the devices' pre-hibernation functionality, which is done much
+like waking up from the memory sleep state, although it involves different
+phases:
+
+	restore_noirq, restore, complete
+
+The first two of these are analogous to the resume_noirq and resume phases
+described above, respectively, and correspond to the following PCI subsystem
+callbacks::
+
+	pci_pm_restore_noirq()
+	pci_pm_restore()
+
+These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
+respectively, but they execute the device driver's pm->restore_noirq() and
+pm->restore() callbacks, if available.
+
+The complete phase is carried out in exactly the same way as during system
+resume.
+
+
+3. PCI Device Drivers and Power Management
+==========================================
+
+3.1. Power Management Callbacks
+-------------------------------
+
+PCI device drivers participate in power management by providing callbacks to be
+executed by the PCI subsystem's power management routines described above and by
+controlling the runtime power management of their devices.
+
+At the time of this writing there are two ways to define power management
+callbacks for a PCI device driver, the recommended one, based on using a
+dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the
+"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and
+.resume() callbacks from struct pci_driver are used.  The legacy approach,
+however, doesn't allow one to define runtime power management callbacks and is
+not really suitable for any new drivers.  Therefore it is not covered by this
+document (refer to the source code to learn more about it).
+
+It is recommended that all PCI device drivers define a struct dev_pm_ops object
+containing pointers to power management (PM) callbacks that will be executed by
+the PCI subsystem's PM routines in various circumstances.  A pointer to the
+driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
+its struct pci_driver object.  Once that has happened, the "legacy" PM callbacks
+in struct pci_driver are ignored (even if they are not NULL).
+
+The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
+defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
+subsystem will handle the device in a simplified default manner.  If they are
+defined, though, they are expected to behave as described in the following
+subsections.
+
+3.1.1. prepare()
+^^^^^^^^^^^^^^^^
+
+The prepare() callback is executed during system suspend, during hibernation
+(when a hibernation image is about to be created), during power-off after
+saving a hibernation image and during system restore, when a hibernation image
+has just been loaded into memory.
+
+This callback is only necessary if the driver's device has children that in
+general may be registered at any time.  In that case the role of the prepare()
+callback is to prevent new children of the device from being registered until
+one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
+
+In addition to that the prepare() callback may carry out some operations
+preparing the device to be suspended, although it should not allocate memory
+(if additional memory is required to suspend the device, it has to be
+preallocated earlier, for example in a suspend/hibernate notifier as described
+in Documentation/driver-api/pm/notifiers.rst).
+
+3.1.2. suspend()
+^^^^^^^^^^^^^^^^
+
+The suspend() callback is only executed during system suspend, after prepare()
+callbacks have been executed for all devices in the system.
+
+This callback is expected to quiesce the device and prepare it to be put into a
+low-power state by the PCI subsystem.  It is not required (in fact it even is
+not recommended) that a PCI driver's suspend() callback save the standard
+configuration registers of the device, prepare it for waking up the system, or
+put it into a low-power state.  All of these operations can very well be taken
+care of by the PCI subsystem, without the driver's participation.
+
+However, in some rare case it is convenient to carry out these operations in
+a PCI driver.  Then, pci_save_state(), pci_prepare_to_sleep(), and
+pci_set_power_state() should be used to save the device's standard configuration
+registers, to prepare it for system wakeup (if necessary), and to put it into a
+low-power state, respectively.  Moreover, if the driver calls pci_save_state(),
+the PCI subsystem will not execute either pci_prepare_to_sleep(), or
+pci_set_power_state() for its device, so the driver is then responsible for
+handling the device as appropriate.
+
+While the suspend() callback is being executed, the driver's interrupt handler
+can be invoked to handle an interrupt from the device, so all suspend-related
+operations relying on the driver's ability to handle interrupts should be
+carried out in this callback.
+
+3.1.3. suspend_noirq()
+^^^^^^^^^^^^^^^^^^^^^^
+
+The suspend_noirq() callback is only executed during system suspend, after
+suspend() callbacks have been executed for all devices in the system and
+after device interrupts have been disabled by the PM core.
+
+The difference between suspend_noirq() and suspend() is that the driver's
+interrupt handler will not be invoked while suspend_noirq() is running.  Thus
+suspend_noirq() can carry out operations that would cause race conditions to
+arise if they were performed in suspend().
+
+3.1.4. freeze()
+^^^^^^^^^^^^^^^
+
+The freeze() callback is hibernation-specific and is executed in two situations,
+during hibernation, after prepare() callbacks have been executed for all devices
+in preparation for the creation of a system image, and during restore,
+after a system image has been loaded into memory from persistent storage and the
+prepare() callbacks have been executed for all devices.
+
+The role of this callback is analogous to the role of the suspend() callback
+described above.  In fact, they only need to be different in the rare cases when
+the driver takes the responsibility for putting the device into a low-power
+state.
+
+In that cases the freeze() callback should not prepare the device system wakeup
+or put it into a low-power state.  Still, either it or freeze_noirq() should
+save the device's standard configuration registers using pci_save_state().
+
+3.1.5. freeze_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The freeze_noirq() callback is hibernation-specific.  It is executed during
+hibernation, after prepare() and freeze() callbacks have been executed for all
+devices in preparation for the creation of a system image, and during restore,
+after a system image has been loaded into memory and after prepare() and
+freeze() callbacks have been executed for all devices.  It is always executed
+after device interrupts have been disabled by the PM core.
+
+The role of this callback is analogous to the role of the suspend_noirq()
+callback described above and it very rarely is necessary to define
+freeze_noirq().
+
+The difference between freeze_noirq() and freeze() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.6. poweroff()
+^^^^^^^^^^^^^^^^^
+
+The poweroff() callback is hibernation-specific.  It is executed when the system
+is about to be powered off after saving a hibernation image to a persistent
+storage.  prepare() callbacks are executed for all devices before poweroff() is
+called.
+
+The role of this callback is analogous to the role of the suspend() and freeze()
+callbacks described above, although it does not need to save the contents of
+the device's registers.  In particular, if the driver wants to put the device
+into a low-power state itself instead of allowing the PCI subsystem to do that,
+the poweroff() callback should use pci_prepare_to_sleep() and
+pci_set_power_state() to prepare the device for system wakeup and to put it
+into a low-power state, respectively, but it need not save the device's standard
+configuration registers.
+
+3.1.7. poweroff_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The poweroff_noirq() callback is hibernation-specific.  It is executed after
+poweroff() callbacks have been executed for all devices in the system.
+
+The role of this callback is analogous to the role of the suspend_noirq() and
+freeze_noirq() callbacks described above, but it does not need to save the
+contents of the device's registers.
+
+The difference between poweroff_noirq() and poweroff() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.8. resume_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The resume_noirq() callback is only executed during system resume, after the
+PM core has enabled the non-boot CPUs.  The driver's interrupt handler will not
+be invoked while resume_noirq() is running, so this callback can carry out
+operations that might race with the interrupt handler.
+
+Since the PCI subsystem unconditionally puts all devices into the full power
+state in the resume_noirq phase of system resume and restores their standard
+configuration registers, resume_noirq() is usually not necessary.  In general
+it should only be used for performing operations that would lead to race
+conditions if carried out by resume().
+
+3.1.9. resume()
+^^^^^^^^^^^^^^^
+
+The resume() callback is only executed during system resume, after
+resume_noirq() callbacks have been executed for all devices in the system and
+device interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-suspend configuration of the
+device and bringing it back to the fully functional state.  The device should be
+able to process I/O in a usual way after resume() has returned.
+
+3.1.10. thaw_noirq()
+^^^^^^^^^^^^^^^^^^^^
+
+The thaw_noirq() callback is hibernation-specific.  It is executed after a
+system image has been created and the non-boot CPUs have been enabled by the PM
+core, in the thaw_noirq phase of hibernation.  It also may be executed if the
+loading of a hibernation image fails during system restore (it is then executed
+after enabling the non-boot CPUs).  The driver's interrupt handler will not be
+invoked while thaw_noirq() is running.
+
+The role of this callback is analogous to the role of resume_noirq().  The
+difference between these two callbacks is that thaw_noirq() is executed after
+freeze() and freeze_noirq(), so in general it does not need to modify the
+contents of the device's registers.
+
+3.1.11. thaw()
+^^^^^^^^^^^^^^
+
+The thaw() callback is hibernation-specific.  It is executed after thaw_noirq()
+callbacks have been executed for all devices in the system and after device
+interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-freeze configuration of
+the device, so that it will work in a usual way after thaw() has returned.
+
+3.1.12. restore_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The restore_noirq() callback is hibernation-specific.  It is executed in the
+restore_noirq phase of hibernation, when the boot kernel has passed control to
+the image kernel and the non-boot CPUs have been enabled by the image kernel's
+PM core.
+
+This callback is analogous to resume_noirq() with the exception that it cannot
+make any assumption on the previous state of the device, even if the BIOS (or
+generally the platform firmware) is known to preserve that state over a
+suspend-resume cycle.
+
+For the vast majority of PCI device drivers there is no difference between
+resume_noirq() and restore_noirq().
+
+3.1.13. restore()
+^^^^^^^^^^^^^^^^^
+
+The restore() callback is hibernation-specific.  It is executed after
+restore_noirq() callbacks have been executed for all devices in the system and
+after the PM core has enabled device drivers' interrupt handlers to be invoked.
+
+This callback is analogous to resume(), just like restore_noirq() is analogous
+to resume_noirq().  Consequently, the difference between restore_noirq() and
+restore() is analogous to the difference between resume_noirq() and resume().
+
+For the vast majority of PCI device drivers there is no difference between
+resume() and restore().
+
+3.1.14. complete()
+^^^^^^^^^^^^^^^^^^
+
+The complete() callback is executed in the following situations:
+
+  - during system resume, after resume() callbacks have been executed for all
+    devices,
+  - during hibernation, before saving the system image, after thaw() callbacks
+    have been executed for all devices,
+  - during system restore, when the system is going back to its pre-hibernation
+    state, after restore() callbacks have been executed for all devices.
+
+It also may be executed if the loading of a hibernation image into memory fails
+(in that case it is run after thaw() callbacks have been executed for all
+devices that have drivers in the boot kernel).
+
+This callback is entirely optional, although it may be necessary if the
+prepare() callback performs operations that need to be reversed.
+
+3.1.15. runtime_suspend()
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_suspend() callback is specific to device runtime power management
+(runtime PM).  It is executed by the PM core's runtime PM framework when the
+device is about to be suspended (i.e. quiesced and put into a low-power state)
+at run time.
+
+This callback is responsible for freezing the device and preparing it to be
+put into a low-power state, but it must allow the PCI subsystem to perform all
+of the PCI-specific actions necessary for suspending the device.
+
+3.1.16. runtime_resume()
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_resume() callback is specific to device runtime PM.  It is executed
+by the PM core's runtime PM framework when the device is about to be resumed
+(i.e. put into the full-power state and programmed to process I/O normally) at
+run time.
+
+This callback is responsible for restoring the normal functionality of the
+device after it has been put into the full-power state by the PCI subsystem.
+The device is expected to be able to process I/O in the usual way after
+runtime_resume() has returned.
+
+3.1.17. runtime_idle()
+^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_idle() callback is specific to device runtime PM.  It is executed
+by the PM core's runtime PM framework whenever it may be desirable to suspend
+the device according to the PM core's information.  In particular, it is
+automatically executed right after runtime_resume() has returned in case the
+resume of the device has happened as a result of a spurious event.
+
+This callback is optional, but if it is not implemented or if it returns 0, the
+PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
+cause the driver's runtime_suspend() callback to be executed.
+
+3.1.18. Pointing Multiple Callback Pointers to One Routine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Although in principle each of the callbacks described in the previous
+subsections can be defined as a separate function, it often is convenient to
+point two or more members of struct dev_pm_ops to the same routine.  There are
+a few convenience macros that can be used for this purpose.
+
+The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
+suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
+members and one resume routine pointed to by the .resume(), .thaw(), and
+.restore() members.  The other function pointers in this struct dev_pm_ops are
+unset.
+
+The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
+additionally sets the .runtime_resume() pointer to the same value as
+.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
+the same value as .suspend() (and .freeze() and .poweroff()).
+
+The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct
+dev_pm_ops to indicate that one suspend routine is to be pointed to by the
+.suspend(), .freeze(), and .poweroff() members and one resume routine is to
+be pointed to by the .resume(), .thaw(), and .restore() members.
+
+3.1.19. Driver Flags for Power Management
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The PM core allows device drivers to set flags that influence the handling of
+power management for the devices by the core itself and by middle layer code
+including the PCI bus type.  The flags should be set once at the driver probe
+time with the help of the dev_pm_set_driver_flags() function and they should not
+be updated directly afterwards.
+
+The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete
+mechanism allowing device suspend/resume callbacks to be skipped if the device
+is in runtime suspend when the system suspend starts.  That also affects all of
+the ancestors of the device, so this flag should only be used if absolutely
+necessary.
+
+The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
+positive value from pci_pm_prepare() if the ->prepare callback provided by the
+driver of the device returns a positive value.  That allows the driver to opt
+out from using the direct-complete mechanism dynamically.
+
+The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
+perspective the device can be safely left in runtime suspend during system
+suspend.  That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
+to skip resuming the device from runtime suspend unless there are PCI-specific
+reasons for doing that.  Also, it causes pci_pm_suspend_late/noirq(),
+pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
+if the device remains in runtime suspend in the beginning of the "late" phase
+of the system-wide transition under way.  Moreover, if the device is in
+runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime
+power management status will be changed to "active" (as it is going to be put
+into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(),
+the function will set the power.direct_complete flag for it (to make the PM core
+skip the subsequent "thaw" callbacks for it) and return.
+
+Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
+device to be left in suspend after system-wide transitions to the working state.
+This flag is checked by the PM core, but the PCI bus type informs the PM core
+which devices may be left in suspend from its perspective (that happens during
+the "noirq" phase of system-wide suspend and analogous transitions) and next it
+uses the dev_pm_may_skip_resume() helper to decide whether or not to return from
+pci_pm_resume_noirq() early, as the PM core will skip the remaining resume
+callbacks for the device during the transition under way and will set its
+runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for
+it.
+
+3.2. Device Runtime Power Management
+------------------------------------
+
+In addition to providing device power management callbacks PCI device drivers
+are responsible for controlling the runtime power management (runtime PM) of
+their devices.
+
+The PCI device runtime PM is optional, but it is recommended that PCI device
+drivers implement it at least in the cases where there is a reliable way of
+verifying that the device is not used (like when the network cable is detached
+from an Ethernet adapter or there are no devices attached to a USB controller).
+
+To support the PCI runtime PM the driver first needs to implement the
+runtime_suspend() and runtime_resume() callbacks.  It also may need to implement
+the runtime_idle() callback to prevent the device from being suspended again
+every time right after the runtime_resume() callback has returned
+(alternatively, the runtime_suspend() callback will have to check if the
+device should really be suspended and return -EAGAIN if that is not the case).
+
+The runtime PM of PCI devices is enabled by default by the PCI core.  PCI
+device drivers do not need to enable it and should not attempt to do so.
+However, it is blocked by pci_pm_init() that runs the pm_runtime_forbid()
+helper function.  In addition to that, the runtime PM usage counter of
+each PCI device is incremented by local_pci_probe() before executing the
+probe callback provided by the device's driver.
+
+If a PCI driver implements the runtime PM callbacks and intends to use the
+runtime PM framework provided by the PM core and the PCI subsystem, it needs
+to decrement the device's runtime PM usage counter in its probe callback
+function.  If it doesn't do that, the counter will always be different from
+zero for the device and it will never be runtime-suspended.  The simplest
+way to do that is by calling pm_runtime_put_noidle(), but if the driver
+wants to schedule an autosuspend right away, for example, it may call
+pm_runtime_put_autosuspend() instead for this purpose.  Generally, it
+just needs to call a function that decrements the devices usage counter
+from its probe routine to make runtime PM work for the device.
+
+It is important to remember that the driver's runtime_suspend() callback
+may be executed right after the usage counter has been decremented, because
+user space may already have caused the pm_runtime_allow() helper function
+unblocking the runtime PM of the device to run via sysfs, so the driver must
+be prepared to cope with that.
+
+The driver itself should not call pm_runtime_allow(), though.  Instead, it
+should let user space or some platform-specific code do that (user space can
+do it via sysfs as stated above), but it must be prepared to handle the
+runtime PM of the device correctly as soon as pm_runtime_allow() is called
+(which may happen at any time, even before the driver is loaded).
+
+When the driver's remove callback runs, it has to balance the decrementation
+of the device's runtime PM usage counter at the probe time.  For this reason,
+if it has decremented the counter in its probe callback, it must run
+pm_runtime_get_noresume() in its remove callback.  [Since the core carries
+out a runtime resume of the device and bumps up the device's usage counter
+before running the driver's remove callback, the runtime PM of the device
+is effectively disabled for the duration of the remove execution and all
+runtime PM helper functions incrementing the device's usage counter are
+then effectively equivalent to pm_runtime_get_noresume().]
+
+The runtime PM framework works by processing requests to suspend or resume
+devices, or to check if they are idle (in which cases it is reasonable to
+subsequently request that they be suspended).  These requests are represented
+by work items put into the power management workqueue, pm_wq.  Although there
+are a few situations in which power management requests are automatically
+queued by the PM core (for example, after processing a request to resume a
+device the PM core automatically queues a request to check if the device is
+idle), device drivers are generally responsible for queuing power management
+requests for their devices.  For this purpose they should use the runtime PM
+helper functions provided by the PM core, discussed in
+Documentation/power/runtime_pm.rst.
+
+Devices can also be suspended and resumed synchronously, without placing a
+request into pm_wq.  In the majority of cases this also is done by their
+drivers that use helper functions provided by the PM core for this purpose.
+
+For more information on the runtime PM of devices refer to
+Documentation/power/runtime_pm.rst.
+
+
+4. Resources
+============
+
+PCI Local Bus Specification, Rev. 3.0
+
+PCI Bus Power Management Interface Specification, Rev. 1.2
+
+Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
+
+PCI Express Base Specification, Rev. 2.0
+
+Documentation/driver-api/pm/devices.rst
+
+Documentation/power/runtime_pm.rst
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
deleted file mode 100644
index 8eaf9ee24d43..000000000000
--- a/Documentation/power/pci.txt
+++ /dev/null
@@ -1,1094 +0,0 @@
-PCI Power Management
-
-Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
-
-An overview of concepts and the Linux kernel's interfaces related to PCI power
-management.  Based on previous work by Patrick Mochel <mochel@transmeta.com>
-(and others).
-
-This document only covers the aspects of power management specific to PCI
-devices.  For general description of the kernel's interfaces related to device
-power management refer to Documentation/driver-api/pm/devices.rst and
-Documentation/power/runtime_pm.txt.
-
----------------------------------------------------------------------------
-
-1. Hardware and Platform Support for PCI Power Management
-2. PCI Subsystem and Device Power Management
-3. PCI Device Drivers and Power Management
-4. Resources
-
-
-1. Hardware and Platform Support for PCI Power Management
-=========================================================
-
-1.1. Native and Platform-Based Power Management
------------------------------------------------
-In general, power management is a feature allowing one to save energy by putting
-devices into states in which they draw less power (low-power states) at the
-price of reduced functionality or performance.
-
-Usually, a device is put into a low-power state when it is underutilized or
-completely inactive.  However, when it is necessary to use the device once
-again, it has to be put back into the "fully functional" state (full-power
-state).  This may happen when there are some data for the device to handle or
-as a result of an external event requiring the device to be active, which may
-be signaled by the device itself.
-
-PCI devices may be put into low-power states in two ways, by using the device
-capabilities introduced by the PCI Bus Power Management Interface Specification,
-or with the help of platform firmware, such as an ACPI BIOS.  In the first
-approach, that is referred to as the native PCI power management (native PCI PM)
-in what follows, the device power state is changed as a result of writing a
-specific value into one of its standard configuration registers.  The second
-approach requires the platform firmware to provide special methods that may be
-used by the kernel to change the device's power state.
-
-Devices supporting the native PCI PM usually can generate wakeup signals called
-Power Management Events (PMEs) to let the kernel know about external events
-requiring the device to be active.  After receiving a PME the kernel is supposed
-to put the device that sent it into the full-power state.  However, the PCI Bus
-Power Management Interface Specification doesn't define any standard method of
-delivering the PME from the device to the CPU and the operating system kernel.
-It is assumed that the platform firmware will perform this task and therefore,
-even though a PCI device is set up to generate PMEs, it also may be necessary to
-prepare the platform firmware for notifying the CPU of the PMEs coming from the
-device (e.g. by generating interrupts).
-
-In turn, if the methods provided by the platform firmware are used for changing
-the power state of a device, usually the platform also provides a method for
-preparing the device to generate wakeup signals.  In that case, however, it
-often also is necessary to prepare the device for generating PMEs using the
-native PCI PM mechanism, because the method provided by the platform depends on
-that.
-
-Thus in many situations both the native and the platform-based power management
-mechanisms have to be used simultaneously to obtain the desired result.
-
-1.2. Native PCI Power Management
---------------------------------
-The PCI Bus Power Management Interface Specification (PCI PM Spec) was
-introduced between the PCI 2.1 and PCI 2.2 Specifications.  It defined a
-standard interface for performing various operations related to power
-management.
-
-The implementation of the PCI PM Spec is optional for conventional PCI devices,
-but it is mandatory for PCI Express devices.  If a device supports the PCI PM
-Spec, it has an 8 byte power management capability field in its PCI
-configuration space.  This field is used to describe and control the standard
-features related to the native PCI power management.
-
-The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses
-(B0-B3).  The higher the number, the less power is drawn by the device or bus
-in that state.  However, the higher the number, the longer the latency for
-the device or bus to return to the full-power state (D0 or B0, respectively).
-
-There are two variants of the D3 state defined by the specification.  The first
-one is D3hot, referred to as the software accessible D3, because devices can be
-programmed to go into it.  The second one, D3cold, is the state that PCI devices
-are in when the supply voltage (Vcc) is removed from them.  It is not possible
-to program a PCI device to go into D3cold, although there may be a programmable
-interface for putting the bus the device is on into a state in which Vcc is
-removed from all devices on the bus.
-
-PCI bus power management, however, is not supported by the Linux kernel at the
-time of this writing and therefore it is not covered by this document.
-
-Note that every PCI device can be in the full-power state (D0) or in D3cold,
-regardless of whether or not it implements the PCI PM Spec.  In addition to
-that, if the PCI PM Spec is implemented by the device, it must support D3hot
-as well as D0.  The support for the D1 and D2 power states is optional.
-
-PCI devices supporting the PCI PM Spec can be programmed to go to any of the
-supported low-power states (except for D3cold).  While in D1-D3hot the
-standard configuration registers of the device must be accessible to software
-(i.e. the device is required to respond to PCI configuration accesses), although
-its I/O and memory spaces are then disabled.  This allows the device to be
-programmatically put into D0.  Thus the kernel can switch the device back and
-forth between D0 and the supported low-power states (except for D3cold) and the
-possible power state transitions the device can undergo are the following:
-
-+----------------------------+
-| Current State | New State  |
-+----------------------------+
-| D0            | D1, D2, D3 |
-+----------------------------+
-| D1            | D2, D3     |
-+----------------------------+
-| D2            | D3         |
-+----------------------------+
-| D1, D2, D3    | D0         |
-+----------------------------+
-
-The transition from D3cold to D0 occurs when the supply voltage is provided to
-the device (i.e. power is restored).  In that case the device returns to D0 with
-a full power-on reset sequence and the power-on defaults are restored to the
-device by hardware just as at initial power up.
-
-PCI devices supporting the PCI PM Spec can be programmed to generate PMEs
-while in a low-power state (D1-D3), but they are not required to be capable
-of generating PMEs from all supported low-power states.  In particular, the
-capability of generating PMEs from D3cold is optional and depends on the
-presence of additional voltage (3.3Vaux) allowing the device to remain
-sufficiently active to generate a wakeup signal.
-
-1.3. ACPI Device Power Management
----------------------------------
-The platform firmware support for the power management of PCI devices is
-system-specific.  However, if the system in question is compliant with the
-Advanced Configuration and Power Interface (ACPI) Specification, like the
-majority of x86-based systems, it is supposed to implement device power
-management interfaces defined by the ACPI standard.
-
-For this purpose the ACPI BIOS provides special functions called "control
-methods" that may be executed by the kernel to perform specific tasks, such as
-putting a device into a low-power state.  These control methods are encoded
-using special byte-code language called the ACPI Machine Language (AML) and
-stored in the machine's BIOS.  The kernel loads them from the BIOS and executes
-them as needed using an AML interpreter that translates the AML byte code into
-computations and memory or I/O space accesses.  This way, in theory, a BIOS
-writer can provide the kernel with a means to perform actions depending
-on the system design in a system-specific fashion.
-
-ACPI control methods may be divided into global control methods, that are not
-associated with any particular devices, and device control methods, that have
-to be defined separately for each device supposed to be handled with the help of
-the platform.  This means, in particular, that ACPI device control methods can
-only be used to handle devices that the BIOS writer knew about in advance.  The
-ACPI methods used for device power management fall into that category.
-
-The ACPI specification assumes that devices can be in one of four power states
-labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM
-D0-D3 states (although the difference between D3hot and D3cold is not taken
-into account by ACPI).  Moreover, for each power state of a device there is a
-set of power resources that have to be enabled for the device to be put into
-that state.  These power resources are controlled (i.e. enabled or disabled)
-with the help of their own control methods, _ON and _OFF, that have to be
-defined individually for each of them.
-
-To put a device into the ACPI power state Dx (where x is a number between 0 and
-3 inclusive) the kernel is supposed to (1) enable the power resources required
-by the device in this state using their _ON control methods and (2) execute the
-_PSx control method defined for the device.  In addition to that, if the device
-is going to be put into a low-power state (D1-D3) and is supposed to generate
-wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI
-3.0) control method defined for it has to be executed before _PSx.  Power
-resources that are not required by the device in the target power state and are
-not required any more by any other device should be disabled (by executing their
-_OFF control methods).  If the current power state of the device is D3, it can
-only be put into D0 this way.
-
-However, quite often the power states of devices are changed during a
-system-wide transition into a sleep state or back into the working state.  ACPI
-defines four system sleep states, S1, S2, S3, and S4, and denotes the system
-working state as S0.  In general, the target system sleep (or working) state
-determines the highest power (lowest number) state the device can be put
-into and the kernel is supposed to obtain this information by executing the
-device's _SxD control method (where x is a number between 0 and 4 inclusive).
-If the device is required to wake up the system from the target sleep state, the
-lowest power (highest number) state it can be put into is also determined by the
-target state of the system.  The kernel is then supposed to use the device's
-_SxW control method to obtain the number of that state.  It also is supposed to
-use the device's _PRW control method to learn which power resources need to be
-enabled for the device to be able to generate wakeup signals.
-
-1.4. Wakeup Signaling
----------------------
-Wakeup signals generated by PCI devices, either as native PCI PMEs, or as
-a result of the execution of the _DSW (or _PSW) ACPI control method before
-putting the device into a low-power state, have to be caught and handled as
-appropriate.  If they are sent while the system is in the working state
-(ACPI S0), they should be translated into interrupts so that the kernel can
-put the devices generating them into the full-power state and take care of the
-events that triggered them.  In turn, if they are sent while the system is
-sleeping, they should cause the system's core logic to trigger wakeup.
-
-On ACPI-based systems wakeup signals sent by conventional PCI devices are
-converted into ACPI General-Purpose Events (GPEs) which are hardware signals
-from the system core logic generated in response to various events that need to
-be acted upon.  Every GPE is associated with one or more sources of potentially
-interesting events.  In particular, a GPE may be associated with a PCI device
-capable of signaling wakeup.  The information on the connections between GPEs
-and event sources is recorded in the system's ACPI BIOS from where it can be
-read by the kernel.
-
-If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE
-associated with it (if there is one) is triggered.  The GPEs associated with PCI
-bridges may also be triggered in response to a wakeup signal from one of the
-devices below the bridge (this also is the case for root bridges) and, for
-example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be
-handled this way.
-
-A GPE may be triggered when the system is sleeping (i.e. when it is in one of
-the ACPI S1-S4 states), in which case system wakeup is started by its core logic
-(the device that was the source of the signal causing the system wakeup to occur
-may be identified later).  The GPEs used in such situations are referred to as
-wakeup GPEs.
-
-Usually, however, GPEs are also triggered when the system is in the working
-state (ACPI S0) and in that case the system's core logic generates a System
-Control Interrupt (SCI) to notify the kernel of the event.  Then, the SCI
-handler identifies the GPE that caused the interrupt to be generated which,
-in turn, allows the kernel to identify the source of the event (that may be
-a PCI device signaling wakeup).  The GPEs used for notifying the kernel of
-events occurring while the system is in the working state are referred to as
-runtime GPEs.
-
-Unfortunately, there is no standard way of handling wakeup signals sent by
-conventional PCI devices on systems that are not ACPI-based, but there is one
-for PCI Express devices.  Namely, the PCI Express Base Specification introduced
-a native mechanism for converting native PCI PMEs into interrupts generated by
-root ports.  For conventional PCI devices native PMEs are out-of-band, so they
-are routed separately and they need not pass through bridges (in principle they
-may be routed directly to the system's core logic), but for PCI Express devices
-they are in-band messages that have to pass through the PCI Express hierarchy,
-including the root port on the path from the device to the Root Complex.  Thus
-it was possible to introduce a mechanism by which a root port generates an
-interrupt whenever it receives a PME message from one of the devices below it.
-The PCI Express Requester ID of the device that sent the PME message is then
-recorded in one of the root port's configuration registers from where it may be
-read by the interrupt handler allowing the device to be identified.  [PME
-messages sent by PCI Express endpoints integrated with the Root Complex don't
-pass through root ports, but instead they cause a Root Complex Event Collector
-(if there is one) to generate interrupts.]
-
-In principle the native PCI Express PME signaling may also be used on ACPI-based
-systems along with the GPEs, but to use it the kernel has to ask the system's
-ACPI BIOS to release control of root port configuration registers.  The ACPI
-BIOS, however, is not required to allow the kernel to control these registers
-and if it doesn't do that, the kernel must not modify their contents.  Of course
-the native PCI Express PME signaling cannot be used by the kernel in that case.
-
-
-2. PCI Subsystem and Device Power Management
-============================================
-
-2.1. Device Power Management Callbacks
---------------------------------------
-The PCI Subsystem participates in the power management of PCI devices in a
-number of ways.  First of all, it provides an intermediate code layer between
-the device power management core (PM core) and PCI device drivers.
-Specifically, the pm field of the PCI subsystem's struct bus_type object,
-pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing
-pointers to several device power management callbacks:
-
-const struct dev_pm_ops pci_dev_pm_ops = {
-	.prepare = pci_pm_prepare,
-	.complete = pci_pm_complete,
-	.suspend = pci_pm_suspend,
-	.resume = pci_pm_resume,
-	.freeze = pci_pm_freeze,
-	.thaw = pci_pm_thaw,
-	.poweroff = pci_pm_poweroff,
-	.restore = pci_pm_restore,
-	.suspend_noirq = pci_pm_suspend_noirq,
-	.resume_noirq = pci_pm_resume_noirq,
-	.freeze_noirq = pci_pm_freeze_noirq,
-	.thaw_noirq = pci_pm_thaw_noirq,
-	.poweroff_noirq = pci_pm_poweroff_noirq,
-	.restore_noirq = pci_pm_restore_noirq,
-	.runtime_suspend = pci_pm_runtime_suspend,
-	.runtime_resume = pci_pm_runtime_resume,
-	.runtime_idle = pci_pm_runtime_idle,
-};
-
-These callbacks are executed by the PM core in various situations related to
-device power management and they, in turn, execute power management callbacks
-provided by PCI device drivers.  They also perform power management operations
-involving some standard configuration registers of PCI devices that device
-drivers need not know or care about.
-
-The structure representing a PCI device, struct pci_dev, contains several fields
-that these callbacks operate on:
-
-struct pci_dev {
-	...
-	pci_power_t     current_state;  /* Current operating state. */
-	int		pm_cap;		/* PM capability offset in the
-					   configuration space */
-	unsigned int	pme_support:5;	/* Bitmask of states from which PME#
-					   can be generated */
-	unsigned int	pme_interrupt:1;/* Is native PCIe PME signaling used? */
-	unsigned int	d1_support:1;	/* Low power state D1 is supported */
-	unsigned int	d2_support:1;	/* Low power state D2 is supported */
-	unsigned int	no_d1d2:1;	/* D1 and D2 are forbidden */
-	unsigned int	wakeup_prepared:1;  /* Device prepared for wake up */
-	unsigned int	d3_delay;	/* D3->D0 transition time in ms */
-	...
-};
-
-They also indirectly use some fields of the struct device that is embedded in
-struct pci_dev.
-
-2.2. Device Initialization
---------------------------
-The PCI subsystem's first task related to device power management is to
-prepare the device for power management and initialize the fields of struct
-pci_dev used for this purpose.  This happens in two functions defined in
-drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init().
-
-The first of these functions checks if the device supports native PCI PM
-and if that's the case the offset of its power management capability structure
-in the configuration space is stored in the pm_cap field of the device's struct
-pci_dev object.  Next, the function checks which PCI low-power states are
-supported by the device and from which low-power states the device can generate
-native PCI PMEs.  The power management fields of the device's struct pci_dev and
-the struct device embedded in it are updated accordingly and the generation of
-PMEs by the device is disabled.
-
-The second function checks if the device can be prepared to signal wakeup with
-the help of the platform firmware, such as the ACPI BIOS.  If that is the case,
-the function updates the wakeup fields in struct device embedded in the
-device's struct pci_dev and uses the firmware-provided method to prevent the
-device from signaling wakeup.
-
-At this point the device is ready for power management.  For driverless devices,
-however, this functionality is limited to a few basic operations carried out
-during system-wide transitions to a sleep state and back to the working state.
-
-2.3. Runtime Device Power Management
-------------------------------------
-The PCI subsystem plays a vital role in the runtime power management of PCI
-devices.  For this purpose it uses the general runtime power management
-(runtime PM) framework described in Documentation/power/runtime_pm.txt.
-Namely, it provides subsystem-level callbacks:
-
-	pci_pm_runtime_suspend()
-	pci_pm_runtime_resume()
-	pci_pm_runtime_idle()
-
-that are executed by the core runtime PM routines.  It also implements the
-entire mechanics necessary for handling runtime wakeup signals from PCI devices
-in low-power states, which at the time of this writing works for both the native
-PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in
-Section 1.
-
-First, a PCI device is put into a low-power state, or suspended, with the help
-of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call
-pci_pm_runtime_suspend() to do the actual job.  For this to work, the device's
-driver has to provide a pm->runtime_suspend() callback (see below), which is
-run by pci_pm_runtime_suspend() as the first action.  If the driver's callback
-returns successfully, the device's standard configuration registers are saved,
-the device is prepared to generate wakeup signals and, finally, it is put into
-the target low-power state.
-
-The low-power state to put the device into is the lowest-power (highest number)
-state from which it can signal wakeup.  The exact method of signaling wakeup is
-system-dependent and is determined by the PCI subsystem on the basis of the
-reported capabilities of the device and the platform firmware.  To prepare the
-device for signaling wakeup and put it into the selected low-power state, the
-PCI subsystem can use the platform firmware as well as the device's native PCI
-PM capabilities, if supported.
-
-It is expected that the device driver's pm->runtime_suspend() callback will
-not attempt to prepare the device for signaling wakeup or to put it into a
-low-power state.  The driver ought to leave these tasks to the PCI subsystem
-that has all of the information necessary to perform them.
-
-A suspended device is brought back into the "active" state, or resumed,
-with the help of pm_request_resume() or pm_runtime_resume() which both call
-pci_pm_runtime_resume() for PCI devices.  Again, this only works if the device's
-driver provides a pm->runtime_resume() callback (see below).  However, before
-the driver's callback is executed, pci_pm_runtime_resume() brings the device
-back into the full-power state, prevents it from signaling wakeup while in that
-state and restores its standard configuration registers.  Thus the driver's
-callback need not worry about the PCI-specific aspects of the device resume.
-
-Note that generally pci_pm_runtime_resume() may be called in two different
-situations.  First, it may be called at the request of the device's driver, for
-example if there are some data for it to process.  Second, it may be called
-as a result of a wakeup signal from the device itself (this sometimes is
-referred to as "remote wakeup").  Of course, for this purpose the wakeup signal
-is handled in one of the ways described in Section 1 and finally converted into
-a notification for the PCI subsystem after the source device has been
-identified.
-
-The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
-and pm_request_idle(), executes the device driver's pm->runtime_idle()
-callback, if defined, and if that callback doesn't return error code (or is not
-present at all), suspends the device with the help of pm_runtime_suspend().
-Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
-example, it is called right after the device has just been resumed), in which
-cases it is expected to suspend the device if that makes sense.  Usually,
-however, the PCI subsystem doesn't really know if the device really can be
-suspended, so it lets the device's driver decide by running its
-pm->runtime_idle() callback.
-
-2.4. System-Wide Power Transitions
-----------------------------------
-There are a few different types of system-wide power transitions, described in
-Documentation/driver-api/pm/devices.rst.  Each of them requires devices to be handled
-in a specific way and the PM core executes subsystem-level power management
-callbacks for this purpose.  They are executed in phases such that each phase
-involves executing the same subsystem-level callback for every device belonging
-to the given subsystem before the next phase begins.  These phases always run
-after tasks have been frozen.
-
-2.4.1. System Suspend
-
-When the system is going into a sleep state in which the contents of memory will
-be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
-
-	prepare, suspend, suspend_noirq.
-
-The following PCI bus type's callbacks, respectively, are used in these phases:
-
-	pci_pm_prepare()
-	pci_pm_suspend()
-	pci_pm_suspend_noirq()
-
-The pci_pm_prepare() routine first puts the device into the "fully functional"
-state with the help of pm_runtime_resume().  Then, it executes the device
-driver's pm->prepare() callback if defined (i.e. if the driver's struct
-dev_pm_ops object is present and the prepare pointer in that object is valid).
-
-The pci_pm_suspend() routine first checks if the device's driver implements
-legacy PCI suspend routines (see Section 3), in which case the driver's legacy
-suspend callback is executed, if present, and its result is returned.  Next, if
-the device's driver doesn't provide a struct dev_pm_ops object (containing
-pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
-simply turns off the device's bus master capability and runs
-pcibios_disable_device() to disable it, unless the device is a bridge (PCI
-bridges are ignored by this routine).  Next, the device driver's pm->suspend()
-callback is executed, if defined, and its result is returned if it fails.
-Finally, pci_fixup_device() is called to apply hardware suspend quirks related
-to the device if necessary.
-
-Note that the suspend phase is carried out asynchronously for PCI devices, so
-the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
-devices that don't depend on each other in a known way (i.e. none of the paths
-in the device tree from the root bridge to a leaf device contains both of them).
-
-The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
-been called, which means that the device driver's interrupt handler won't be
-invoked while this routine is running.  It first checks if the device's driver
-implements legacy PCI suspends routines (Section 3), in which case the legacy
-late suspend routine is called and its result is returned (the standard
-configuration registers of the device are saved if the driver's callback hasn't
-done that).  Second, if the device driver's struct dev_pm_ops object is not
-present, the device's standard configuration registers are saved and the routine
-returns success.  Otherwise the device driver's pm->suspend_noirq() callback is
-executed, if present, and its result is returned if it fails.  Next, if the
-device's standard configuration registers haven't been saved yet (one of the
-device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
-saves them, prepares the device to signal wakeup (if necessary) and puts it into
-a low-power state.
-
-The low-power state to put the device into is the lowest-power (highest number)
-state from which it can signal wakeup while the system is in the target sleep
-state.  Just like in the runtime PM case described above, the mechanism of
-signaling wakeup is system-dependent and determined by the PCI subsystem, which
-is also responsible for preparing the device to signal wakeup from the system's
-target sleep state as appropriate.
-
-PCI device drivers (that don't implement legacy power management callbacks) are
-generally not expected to prepare devices for signaling wakeup or to put them
-into low-power states.  However, if one of the driver's suspend callbacks
-(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
-registers, pci_pm_suspend_noirq() will assume that the device has been prepared
-to signal wakeup and put into a low-power state by the driver (the driver is
-then assumed to have used the helper functions provided by the PCI subsystem for
-this purpose).  PCI device drivers are not encouraged to do that, but in some
-rare cases doing that in the driver may be the optimum approach.
-
-2.4.2. System Resume
-
-When the system is undergoing a transition from a sleep state in which the
-contents of memory have been preserved, such as one of the ACPI sleep states
-S1-S3, into the working state (ACPI S0), the phases are:
-
-	resume_noirq, resume, complete.
-
-The following PCI bus type's callbacks, respectively, are executed in these
-phases:
-
-	pci_pm_resume_noirq()
-	pci_pm_resume()
-	pci_pm_complete()
-
-The pci_pm_resume_noirq() routine first puts the device into the full-power
-state, restores its standard configuration registers and applies early resume
-hardware quirks related to the device, if necessary.  This is done
-unconditionally, regardless of whether or not the device's driver implements
-legacy PCI power management callbacks (this way all PCI devices are in the
-full-power state and their standard configuration registers have been restored
-when their interrupt handlers are invoked for the first time during resume,
-which allows the kernel to avoid problems with the handling of shared interrupts
-by drivers whose devices are still suspended).  If legacy PCI power management
-callbacks (see Section 3) are implemented by the device's driver, the legacy
-early resume callback is executed and its result is returned.  Otherwise, the
-device driver's pm->resume_noirq() callback is executed, if defined, and its
-result is returned.
-
-The pci_pm_resume() routine first checks if the device's standard configuration
-registers have been restored and restores them if that's not the case (this
-only is necessary in the error path during a failing suspend).  Next, resume
-hardware quirks related to the device are applied, if necessary, and if the
-device's driver implements legacy PCI power management callbacks (see
-Section 3), the driver's legacy resume callback is executed and its result is
-returned.  Otherwise, the device's wakeup signaling mechanisms are blocked and
-its driver's pm->resume() callback is executed, if defined (the callback's
-result is then returned).
-
-The resume phase is carried out asynchronously for PCI devices, like the
-suspend phase described above, which means that if two PCI devices don't depend
-on each other in a known way, the pci_pm_resume() routine may be executed for
-the both of them in parallel.
-
-The pci_pm_complete() routine only executes the device driver's pm->complete()
-callback, if defined.
-
-2.4.3. System Hibernation
-
-System hibernation is more complicated than system suspend, because it requires
-a system image to be created and written into a persistent storage medium.  The
-image is created atomically and all devices are quiesced, or frozen, before that
-happens.
-
-The freezing of devices is carried out after enough memory has been freed (at
-the time of this writing the image creation requires at least 50% of system RAM
-to be free) in the following three phases:
-
-	prepare, freeze, freeze_noirq
-
-that correspond to the PCI bus type's callbacks:
-
-	pci_pm_prepare()
-	pci_pm_freeze()
-	pci_pm_freeze_noirq()
-
-This means that the prepare phase is exactly the same as for system suspend.
-The other two phases, however, are different.
-
-The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
-the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
-and it doesn't apply the suspend-related hardware quirks.  It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The pci_pm_freeze_noirq() routine, in turn, is similar to
-pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
-routine instead of pm->suspend_noirq().  It also doesn't attempt to prepare the
-device for signaling wakeup and put it into a low-power state.  Still, it saves
-the device's standard configuration registers if they haven't been saved by one
-of the driver's callbacks.
-
-Once the image has been created, it has to be saved.  However, at this point all
-devices are frozen and they cannot handle I/O, while their ability to handle
-I/O is obviously necessary for the image saving.  Thus they have to be brought
-back to the fully functional state and this is done in the following phases:
-
-	thaw_noirq, thaw, complete
-
-using the following PCI bus type's callbacks:
-
-	pci_pm_thaw_noirq()
-	pci_pm_thaw()
-	pci_pm_complete()
-
-respectively.
-
-The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
-but it doesn't put the device into the full power state and doesn't attempt to
-restore its standard configuration registers.  It also executes the device
-driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
-
-The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
-driver's pm->thaw() callback instead of pm->resume().  It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The complete phase it the same as for system resume.
-
-After saving the image, devices need to be powered down before the system can
-enter the target sleep state (ACPI S4 for ACPI-based systems).  This is done in
-three phases:
-
-	prepare, poweroff, poweroff_noirq
-
-where the prepare phase is exactly the same as for system suspend.  The other
-two phases are analogous to the suspend and suspend_noirq phases, respectively.
-The PCI subsystem-level callbacks they correspond to
-
-	pci_pm_poweroff()
-	pci_pm_poweroff_noirq()
-
-work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively,
-although they don't attempt to save the device's standard configuration
-registers.
-
-2.4.4. System Restore
-
-System restore requires a hibernation image to be loaded into memory and the
-pre-hibernation memory contents to be restored before the pre-hibernation system
-activity can be resumed.
-
-As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded
-into memory by a fresh instance of the kernel, called the boot kernel, which in
-turn is loaded and run by a boot loader in the usual way.  After the boot kernel
-has loaded the image, it needs to replace its own code and data with the code
-and data of the "hibernated" kernel stored within the image, called the image
-kernel.  For this purpose all devices are frozen just like before creating
-the image during hibernation, in the
-
-	prepare, freeze, freeze_noirq
-
-phases described above.  However, the devices affected by these phases are only
-those having drivers in the boot kernel; other devices will still be in whatever
-state the boot loader left them.
-
-Should the restoration of the pre-hibernation memory contents fail, the boot
-kernel would go through the "thawing" procedure described above, using the
-thaw_noirq, thaw, and complete phases (that will only affect the devices having
-drivers in the boot kernel), and then continue running normally.
-
-If the pre-hibernation memory contents are restored successfully, which is the
-usual situation, control is passed to the image kernel, which then becomes
-responsible for bringing the system back to the working state.  To achieve this,
-it must restore the devices' pre-hibernation functionality, which is done much
-like waking up from the memory sleep state, although it involves different
-phases:
-
-	restore_noirq, restore, complete
-
-The first two of these are analogous to the resume_noirq and resume phases
-described above, respectively, and correspond to the following PCI subsystem
-callbacks:
-
-	pci_pm_restore_noirq()
-	pci_pm_restore()
-
-These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
-respectively, but they execute the device driver's pm->restore_noirq() and
-pm->restore() callbacks, if available.
-
-The complete phase is carried out in exactly the same way as during system
-resume.
-
-
-3. PCI Device Drivers and Power Management
-==========================================
-
-3.1. Power Management Callbacks
--------------------------------
-PCI device drivers participate in power management by providing callbacks to be
-executed by the PCI subsystem's power management routines described above and by
-controlling the runtime power management of their devices.
-
-At the time of this writing there are two ways to define power management
-callbacks for a PCI device driver, the recommended one, based on using a
-dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the
-"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and
-.resume() callbacks from struct pci_driver are used.  The legacy approach,
-however, doesn't allow one to define runtime power management callbacks and is
-not really suitable for any new drivers.  Therefore it is not covered by this
-document (refer to the source code to learn more about it).
-
-It is recommended that all PCI device drivers define a struct dev_pm_ops object
-containing pointers to power management (PM) callbacks that will be executed by
-the PCI subsystem's PM routines in various circumstances.  A pointer to the
-driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
-its struct pci_driver object.  Once that has happened, the "legacy" PM callbacks
-in struct pci_driver are ignored (even if they are not NULL).
-
-The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
-defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
-subsystem will handle the device in a simplified default manner.  If they are
-defined, though, they are expected to behave as described in the following
-subsections.
-
-3.1.1. prepare()
-
-The prepare() callback is executed during system suspend, during hibernation
-(when a hibernation image is about to be created), during power-off after
-saving a hibernation image and during system restore, when a hibernation image
-has just been loaded into memory.
-
-This callback is only necessary if the driver's device has children that in
-general may be registered at any time.  In that case the role of the prepare()
-callback is to prevent new children of the device from being registered until
-one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
-
-In addition to that the prepare() callback may carry out some operations
-preparing the device to be suspended, although it should not allocate memory
-(if additional memory is required to suspend the device, it has to be
-preallocated earlier, for example in a suspend/hibernate notifier as described
-in Documentation/driver-api/pm/notifiers.rst).
-
-3.1.2. suspend()
-
-The suspend() callback is only executed during system suspend, after prepare()
-callbacks have been executed for all devices in the system.
-
-This callback is expected to quiesce the device and prepare it to be put into a
-low-power state by the PCI subsystem.  It is not required (in fact it even is
-not recommended) that a PCI driver's suspend() callback save the standard
-configuration registers of the device, prepare it for waking up the system, or
-put it into a low-power state.  All of these operations can very well be taken
-care of by the PCI subsystem, without the driver's participation.
-
-However, in some rare case it is convenient to carry out these operations in
-a PCI driver.  Then, pci_save_state(), pci_prepare_to_sleep(), and
-pci_set_power_state() should be used to save the device's standard configuration
-registers, to prepare it for system wakeup (if necessary), and to put it into a
-low-power state, respectively.  Moreover, if the driver calls pci_save_state(),
-the PCI subsystem will not execute either pci_prepare_to_sleep(), or
-pci_set_power_state() for its device, so the driver is then responsible for
-handling the device as appropriate.
-
-While the suspend() callback is being executed, the driver's interrupt handler
-can be invoked to handle an interrupt from the device, so all suspend-related
-operations relying on the driver's ability to handle interrupts should be
-carried out in this callback.
-
-3.1.3. suspend_noirq()
-
-The suspend_noirq() callback is only executed during system suspend, after
-suspend() callbacks have been executed for all devices in the system and
-after device interrupts have been disabled by the PM core.
-
-The difference between suspend_noirq() and suspend() is that the driver's
-interrupt handler will not be invoked while suspend_noirq() is running.  Thus
-suspend_noirq() can carry out operations that would cause race conditions to
-arise if they were performed in suspend().
-
-3.1.4. freeze()
-
-The freeze() callback is hibernation-specific and is executed in two situations,
-during hibernation, after prepare() callbacks have been executed for all devices
-in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory from persistent storage and the
-prepare() callbacks have been executed for all devices.
-
-The role of this callback is analogous to the role of the suspend() callback
-described above.  In fact, they only need to be different in the rare cases when
-the driver takes the responsibility for putting the device into a low-power
-state.
-
-In that cases the freeze() callback should not prepare the device system wakeup
-or put it into a low-power state.  Still, either it or freeze_noirq() should
-save the device's standard configuration registers using pci_save_state().
-
-3.1.5. freeze_noirq()
-
-The freeze_noirq() callback is hibernation-specific.  It is executed during
-hibernation, after prepare() and freeze() callbacks have been executed for all
-devices in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory and after prepare() and
-freeze() callbacks have been executed for all devices.  It is always executed
-after device interrupts have been disabled by the PM core.
-
-The role of this callback is analogous to the role of the suspend_noirq()
-callback described above and it very rarely is necessary to define
-freeze_noirq().
-
-The difference between freeze_noirq() and freeze() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.6. poweroff()
-
-The poweroff() callback is hibernation-specific.  It is executed when the system
-is about to be powered off after saving a hibernation image to a persistent
-storage.  prepare() callbacks are executed for all devices before poweroff() is
-called.
-
-The role of this callback is analogous to the role of the suspend() and freeze()
-callbacks described above, although it does not need to save the contents of
-the device's registers.  In particular, if the driver wants to put the device
-into a low-power state itself instead of allowing the PCI subsystem to do that,
-the poweroff() callback should use pci_prepare_to_sleep() and
-pci_set_power_state() to prepare the device for system wakeup and to put it
-into a low-power state, respectively, but it need not save the device's standard
-configuration registers.
-
-3.1.7. poweroff_noirq()
-
-The poweroff_noirq() callback is hibernation-specific.  It is executed after
-poweroff() callbacks have been executed for all devices in the system.
-
-The role of this callback is analogous to the role of the suspend_noirq() and
-freeze_noirq() callbacks described above, but it does not need to save the
-contents of the device's registers.
-
-The difference between poweroff_noirq() and poweroff() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.8. resume_noirq()
-
-The resume_noirq() callback is only executed during system resume, after the
-PM core has enabled the non-boot CPUs.  The driver's interrupt handler will not
-be invoked while resume_noirq() is running, so this callback can carry out
-operations that might race with the interrupt handler.
-
-Since the PCI subsystem unconditionally puts all devices into the full power
-state in the resume_noirq phase of system resume and restores their standard
-configuration registers, resume_noirq() is usually not necessary.  In general
-it should only be used for performing operations that would lead to race
-conditions if carried out by resume().
-
-3.1.9. resume()
-
-The resume() callback is only executed during system resume, after
-resume_noirq() callbacks have been executed for all devices in the system and
-device interrupts have been enabled by the PM core.
-
-This callback is responsible for restoring the pre-suspend configuration of the
-device and bringing it back to the fully functional state.  The device should be
-able to process I/O in a usual way after resume() has returned.
-
-3.1.10. thaw_noirq()
-
-The thaw_noirq() callback is hibernation-specific.  It is executed after a
-system image has been created and the non-boot CPUs have been enabled by the PM
-core, in the thaw_noirq phase of hibernation.  It also may be executed if the
-loading of a hibernation image fails during system restore (it is then executed
-after enabling the non-boot CPUs).  The driver's interrupt handler will not be
-invoked while thaw_noirq() is running.
-
-The role of this callback is analogous to the role of resume_noirq().  The
-difference between these two callbacks is that thaw_noirq() is executed after
-freeze() and freeze_noirq(), so in general it does not need to modify the
-contents of the device's registers.
-
-3.1.11. thaw()
-
-The thaw() callback is hibernation-specific.  It is executed after thaw_noirq()
-callbacks have been executed for all devices in the system and after device
-interrupts have been enabled by the PM core.
-
-This callback is responsible for restoring the pre-freeze configuration of
-the device, so that it will work in a usual way after thaw() has returned.
-
-3.1.12. restore_noirq()
-
-The restore_noirq() callback is hibernation-specific.  It is executed in the
-restore_noirq phase of hibernation, when the boot kernel has passed control to
-the image kernel and the non-boot CPUs have been enabled by the image kernel's
-PM core.
-
-This callback is analogous to resume_noirq() with the exception that it cannot
-make any assumption on the previous state of the device, even if the BIOS (or
-generally the platform firmware) is known to preserve that state over a
-suspend-resume cycle.
-
-For the vast majority of PCI device drivers there is no difference between
-resume_noirq() and restore_noirq().
-
-3.1.13. restore()
-
-The restore() callback is hibernation-specific.  It is executed after
-restore_noirq() callbacks have been executed for all devices in the system and
-after the PM core has enabled device drivers' interrupt handlers to be invoked.
-
-This callback is analogous to resume(), just like restore_noirq() is analogous
-to resume_noirq().  Consequently, the difference between restore_noirq() and
-restore() is analogous to the difference between resume_noirq() and resume().
-
-For the vast majority of PCI device drivers there is no difference between
-resume() and restore().
-
-3.1.14. complete()
-
-The complete() callback is executed in the following situations:
-  - during system resume, after resume() callbacks have been executed for all
-    devices,
-  - during hibernation, before saving the system image, after thaw() callbacks
-    have been executed for all devices,
-  - during system restore, when the system is going back to its pre-hibernation
-    state, after restore() callbacks have been executed for all devices.
-It also may be executed if the loading of a hibernation image into memory fails
-(in that case it is run after thaw() callbacks have been executed for all
-devices that have drivers in the boot kernel).
-
-This callback is entirely optional, although it may be necessary if the
-prepare() callback performs operations that need to be reversed.
-
-3.1.15. runtime_suspend()
-
-The runtime_suspend() callback is specific to device runtime power management
-(runtime PM).  It is executed by the PM core's runtime PM framework when the
-device is about to be suspended (i.e. quiesced and put into a low-power state)
-at run time.
-
-This callback is responsible for freezing the device and preparing it to be
-put into a low-power state, but it must allow the PCI subsystem to perform all
-of the PCI-specific actions necessary for suspending the device.
-
-3.1.16. runtime_resume()
-
-The runtime_resume() callback is specific to device runtime PM.  It is executed
-by the PM core's runtime PM framework when the device is about to be resumed
-(i.e. put into the full-power state and programmed to process I/O normally) at
-run time.
-
-This callback is responsible for restoring the normal functionality of the
-device after it has been put into the full-power state by the PCI subsystem.
-The device is expected to be able to process I/O in the usual way after
-runtime_resume() has returned.
-
-3.1.17. runtime_idle()
-
-The runtime_idle() callback is specific to device runtime PM.  It is executed
-by the PM core's runtime PM framework whenever it may be desirable to suspend
-the device according to the PM core's information.  In particular, it is
-automatically executed right after runtime_resume() has returned in case the
-resume of the device has happened as a result of a spurious event.
-
-This callback is optional, but if it is not implemented or if it returns 0, the
-PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
-cause the driver's runtime_suspend() callback to be executed.
-
-3.1.18. Pointing Multiple Callback Pointers to One Routine
-
-Although in principle each of the callbacks described in the previous
-subsections can be defined as a separate function, it often is convenient to
-point two or more members of struct dev_pm_ops to the same routine.  There are
-a few convenience macros that can be used for this purpose.
-
-The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
-suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
-members and one resume routine pointed to by the .resume(), .thaw(), and
-.restore() members.  The other function pointers in this struct dev_pm_ops are
-unset.
-
-The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
-additionally sets the .runtime_resume() pointer to the same value as
-.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
-the same value as .suspend() (and .freeze() and .poweroff()).
-
-The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct
-dev_pm_ops to indicate that one suspend routine is to be pointed to by the
-.suspend(), .freeze(), and .poweroff() members and one resume routine is to
-be pointed to by the .resume(), .thaw(), and .restore() members.
-
-3.1.19. Driver Flags for Power Management
-
-The PM core allows device drivers to set flags that influence the handling of
-power management for the devices by the core itself and by middle layer code
-including the PCI bus type.  The flags should be set once at the driver probe
-time with the help of the dev_pm_set_driver_flags() function and they should not
-be updated directly afterwards.
-
-The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete
-mechanism allowing device suspend/resume callbacks to be skipped if the device
-is in runtime suspend when the system suspend starts.  That also affects all of
-the ancestors of the device, so this flag should only be used if absolutely
-necessary.
-
-The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
-positive value from pci_pm_prepare() if the ->prepare callback provided by the
-driver of the device returns a positive value.  That allows the driver to opt
-out from using the direct-complete mechanism dynamically.
-
-The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
-perspective the device can be safely left in runtime suspend during system
-suspend.  That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
-to skip resuming the device from runtime suspend unless there are PCI-specific
-reasons for doing that.  Also, it causes pci_pm_suspend_late/noirq(),
-pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
-if the device remains in runtime suspend in the beginning of the "late" phase
-of the system-wide transition under way.  Moreover, if the device is in
-runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime
-power management status will be changed to "active" (as it is going to be put
-into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(),
-the function will set the power.direct_complete flag for it (to make the PM core
-skip the subsequent "thaw" callbacks for it) and return.
-
-Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
-device to be left in suspend after system-wide transitions to the working state.
-This flag is checked by the PM core, but the PCI bus type informs the PM core
-which devices may be left in suspend from its perspective (that happens during
-the "noirq" phase of system-wide suspend and analogous transitions) and next it
-uses the dev_pm_may_skip_resume() helper to decide whether or not to return from
-pci_pm_resume_noirq() early, as the PM core will skip the remaining resume
-callbacks for the device during the transition under way and will set its
-runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for
-it.
-
-3.2. Device Runtime Power Management
-------------------------------------
-In addition to providing device power management callbacks PCI device drivers
-are responsible for controlling the runtime power management (runtime PM) of
-their devices.
-
-The PCI device runtime PM is optional, but it is recommended that PCI device
-drivers implement it at least in the cases where there is a reliable way of
-verifying that the device is not used (like when the network cable is detached
-from an Ethernet adapter or there are no devices attached to a USB controller).
-
-To support the PCI runtime PM the driver first needs to implement the
-runtime_suspend() and runtime_resume() callbacks.  It also may need to implement
-the runtime_idle() callback to prevent the device from being suspended again
-every time right after the runtime_resume() callback has returned
-(alternatively, the runtime_suspend() callback will have to check if the
-device should really be suspended and return -EAGAIN if that is not the case).
-
-The runtime PM of PCI devices is enabled by default by the PCI core.  PCI
-device drivers do not need to enable it and should not attempt to do so.
-However, it is blocked by pci_pm_init() that runs the pm_runtime_forbid()
-helper function.  In addition to that, the runtime PM usage counter of
-each PCI device is incremented by local_pci_probe() before executing the
-probe callback provided by the device's driver.
-
-If a PCI driver implements the runtime PM callbacks and intends to use the
-runtime PM framework provided by the PM core and the PCI subsystem, it needs
-to decrement the device's runtime PM usage counter in its probe callback
-function.  If it doesn't do that, the counter will always be different from
-zero for the device and it will never be runtime-suspended.  The simplest
-way to do that is by calling pm_runtime_put_noidle(), but if the driver
-wants to schedule an autosuspend right away, for example, it may call
-pm_runtime_put_autosuspend() instead for this purpose.  Generally, it
-just needs to call a function that decrements the devices usage counter
-from its probe routine to make runtime PM work for the device.
-
-It is important to remember that the driver's runtime_suspend() callback
-may be executed right after the usage counter has been decremented, because
-user space may already have caused the pm_runtime_allow() helper function
-unblocking the runtime PM of the device to run via sysfs, so the driver must
-be prepared to cope with that.
-
-The driver itself should not call pm_runtime_allow(), though.  Instead, it
-should let user space or some platform-specific code do that (user space can
-do it via sysfs as stated above), but it must be prepared to handle the
-runtime PM of the device correctly as soon as pm_runtime_allow() is called
-(which may happen at any time, even before the driver is loaded).
-
-When the driver's remove callback runs, it has to balance the decrementation
-of the device's runtime PM usage counter at the probe time.  For this reason,
-if it has decremented the counter in its probe callback, it must run
-pm_runtime_get_noresume() in its remove callback.  [Since the core carries
-out a runtime resume of the device and bumps up the device's usage counter
-before running the driver's remove callback, the runtime PM of the device
-is effectively disabled for the duration of the remove execution and all
-runtime PM helper functions incrementing the device's usage counter are
-then effectively equivalent to pm_runtime_get_noresume().]
-
-The runtime PM framework works by processing requests to suspend or resume
-devices, or to check if they are idle (in which cases it is reasonable to
-subsequently request that they be suspended).  These requests are represented
-by work items put into the power management workqueue, pm_wq.  Although there
-are a few situations in which power management requests are automatically
-queued by the PM core (for example, after processing a request to resume a
-device the PM core automatically queues a request to check if the device is
-idle), device drivers are generally responsible for queuing power management
-requests for their devices.  For this purpose they should use the runtime PM
-helper functions provided by the PM core, discussed in
-Documentation/power/runtime_pm.txt.
-
-Devices can also be suspended and resumed synchronously, without placing a
-request into pm_wq.  In the majority of cases this also is done by their
-drivers that use helper functions provided by the PM core for this purpose.
-
-For more information on the runtime PM of devices refer to
-Documentation/power/runtime_pm.txt.
-
-
-4. Resources
-============
-
-PCI Local Bus Specification, Rev. 3.0
-PCI Bus Power Management Interface Specification, Rev. 1.2
-Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
-PCI Express Base Specification, Rev. 2.0
-Documentation/driver-api/pm/devices.rst
-Documentation/power/runtime_pm.txt
diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst
new file mode 100644
index 000000000000..945fc6d760c9
--- /dev/null
+++ b/Documentation/power/pm_qos_interface.rst
@@ -0,0 +1,225 @@
+===============================
+PM Quality Of Service Interface
+===============================
+
+This interface provides a kernel and user mode interface for registering
+performance expectations by drivers, subsystems and user space applications on
+one of the parameters.
+
+Two different PM QoS frameworks are available:
+1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput,
+memory_bandwidth.
+2. the per-device PM QoS framework provides the API to manage the per-device latency
+constraints and PM QoS flags.
+
+Each parameters have defined units:
+
+ * latency: usec
+ * timeout: usec
+ * throughput: kbs (kilo bit / sec)
+ * memory bandwidth: mbs (mega bit / sec)
+
+
+1. PM QoS framework
+===================
+
+The infrastructure exposes multiple misc device nodes one per implemented
+parameter.  The set of parameters implement is defined by pm_qos_power_init()
+and pm_qos_params.h.  This is done because having the available parameters
+being runtime configurable or changeable from a driver was seen as too easy to
+abuse.
+
+For each parameter a list of performance requests is maintained along with
+an aggregated target value.  The aggregated target value is updated with
+changes to the request list or elements of the list.  Typically the
+aggregated target value is simply the max or min of the request values held
+in the parameter list elements.
+Note: the aggregated target value is implemented as an atomic variable so that
+reading the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is simple:
+
+void pm_qos_add_request(handle, param_class, target_value):
+  Will insert an element into the list for that identified PM QoS class with the
+  target value.  Upon change to this list the new target is recomputed and any
+  registered notifiers are called only if the target value is now different.
+  Clients of pm_qos need to save the returned handle for future use in other
+  pm_qos API functions.
+
+void pm_qos_update_request(handle, new_target_value):
+  Will update the list element pointed to by the handle with the new target value
+  and recompute the new aggregated target, calling the notification tree if the
+  target is changed.
+
+void pm_qos_remove_request(handle):
+  Will remove the element.  After removal it will update the aggregate target and
+  call the notification tree if the target was changed as a result of removing
+  the request.
+
+int pm_qos_request(param_class):
+  Returns the aggregated value for a given PM QoS class.
+
+int pm_qos_request_active(handle):
+  Returns if the request is still active, i.e. it has not been removed from a
+  PM QoS class constraints list.
+
+int pm_qos_add_notifier(param_class, notifier):
+  Adds a notification callback function to the PM QoS class. The callback is
+  called when the aggregated value for the PM QoS class is changed.
+
+int pm_qos_remove_notifier(int param_class, notifier):
+  Removes the notification callback function for the PM QoS class.
+
+
+From user mode:
+
+Only processes can register a pm_qos request.  To provide for automatic
+cleanup of a process, the interface requires the process to register its
+parameter requests in the following way:
+
+To register the default pm_qos target for the specific parameter, the process
+must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
+
+As long as the device node is held open that process has a registered
+request on the parameter.
+
+To change the requested target value the process needs to write an s32 value to
+the open device node.  Alternatively the user mode program could write a hex
+string for the value using 10 char long format e.g. "0x12345678".  This
+translates to a pm_qos_update_request call.
+
+To remove the user mode request for a target value simply close the device
+node.
+
+
+2. PM QoS per-device latency and flags framework
+================================================
+
+For each device, there are three lists of PM QoS requests. Two of them are
+maintained along with the aggregated targets of resume latency and active
+state latency tolerance (in microseconds) and the third one is for PM QoS flags.
+Values are updated in response to changes of the request list.
+
+The target values of resume latency and active state latency tolerance are
+simply the minimum of the request values held in the parameter list elements.
+The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements'
+values.  One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF.
+
+Note: The aggregated target values are implemented in such a way that reading
+the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is the following:
+
+int dev_pm_qos_add_request(device, handle, type, value):
+  Will insert an element into the list for that identified device with the
+  target value.  Upon change to this list the new target is recomputed and any
+  registered notifiers are called only if the target value is now different.
+  Clients of dev_pm_qos need to save the handle for future use in other
+  dev_pm_qos API functions.
+
+int dev_pm_qos_update_request(handle, new_value):
+  Will update the list element pointed to by the handle with the new target
+  value and recompute the new aggregated target, calling the notification
+  trees if the target is changed.
+
+int dev_pm_qos_remove_request(handle):
+  Will remove the element.  After removal it will update the aggregate target
+  and call the notification trees if the target was changed as a result of
+  removing the request.
+
+s32 dev_pm_qos_read_value(device):
+  Returns the aggregated value for a given device's constraints list.
+
+enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
+  Check PM QoS flags of the given device against the given mask of flags.
+  The meaning of the return values is as follows:
+
+	PM_QOS_FLAGS_ALL:
+		All flags from the mask are set
+	PM_QOS_FLAGS_SOME:
+		Some flags from the mask are set
+	PM_QOS_FLAGS_NONE:
+		No flags from the mask are set
+	PM_QOS_FLAGS_UNDEFINED:
+		The device's PM QoS structure has not been initialized
+		or the list of requests is empty.
+
+int dev_pm_qos_add_ancestor_request(dev, handle, type, value)
+  Add a PM QoS request for the first direct ancestor of the given device whose
+  power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests)
+  or whose power.set_latency_tolerance callback pointer is not NULL (for
+  DEV_PM_QOS_LATENCY_TOLERANCE requests).
+
+int dev_pm_qos_expose_latency_limit(device, value)
+  Add a request to the device's PM QoS list of resume latency constraints and
+  create a sysfs attribute pm_qos_resume_latency_us under the device's power
+  directory allowing user space to manipulate that request.
+
+void dev_pm_qos_hide_latency_limit(device)
+  Drop the request added by dev_pm_qos_expose_latency_limit() from the device's
+  PM QoS list of resume latency constraints and remove sysfs attribute
+  pm_qos_resume_latency_us from the device's power directory.
+
+int dev_pm_qos_expose_flags(device, value)
+  Add a request to the device's PM QoS list of flags and create sysfs attribute
+  pm_qos_no_power_off under the device's power directory allowing user space to
+  change the value of the PM_QOS_FLAG_NO_POWER_OFF flag.
+
+void dev_pm_qos_hide_flags(device)
+  Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list
+  of flags and remove sysfs attribute pm_qos_no_power_off from the device's power
+  directory.
+
+Notification mechanisms:
+
+The per-device PM QoS framework has a per-device notification tree.
+
+int dev_pm_qos_add_notifier(device, notifier):
+  Adds a notification callback function for the device.
+  The callback is called when the aggregated value of the device constraints list
+  is changed (for resume latency device PM QoS only).
+
+int dev_pm_qos_remove_notifier(device, notifier):
+  Removes the notification callback function for the device.
+
+
+Active state latency tolerance
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This device PM QoS type is used to support systems in which hardware may switch
+to energy-saving operation modes on the fly.  In those systems, if the operation
+mode chosen by the hardware attempts to save energy in an overly aggressive way,
+it may cause excess latencies to be visible to software, causing it to miss
+certain protocol requirements or target frame or sample rates etc.
+
+If there is a latency tolerance control mechanism for a given device available
+to software, the .set_latency_tolerance callback in that device's dev_pm_info
+structure should be populated.  The routine pointed to by it is should implement
+whatever is necessary to transfer the effective requirement value to the
+hardware.
+
+Whenever the effective latency tolerance changes for the device, its
+.set_latency_tolerance() callback will be executed and the effective value will
+be passed to it.  If that value is negative, which means that the list of
+latency tolerance requirements for the device is empty, the callback is expected
+to switch the underlying hardware latency tolerance control mechanism to an
+autonomous mode if available.  If that value is PM_QOS_LATENCY_ANY, in turn, and
+the hardware supports a special "no requirement" setting, the callback is
+expected to use it.  That allows software to prevent the hardware from
+automatically updating the device's latency tolerance in response to its power
+state changes (e.g. during transitions from D3cold to D0), which generally may
+be done in the autonomous latency tolerance control mode.
+
+If .set_latency_tolerance() is present for the device, sysfs attribute
+pm_qos_latency_tolerance_us will be present in the devivce's power directory.
+Then, user space can use that attribute to specify its latency tolerance
+requirement for the device, if any.  Writing "any" to it means "no requirement,
+but do not let the hardware control latency tolerance" and writing "auto" to it
+allows the hardware to be switched to the autonomous mode if there are no other
+requirements from the kernel side in the device's list.
+
+Kernel code can use the functions described above along with the
+DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
+latency tolerance requirements for devices.
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
deleted file mode 100644
index 19c5f7b1a7ba..000000000000
--- a/Documentation/power/pm_qos_interface.txt
+++ /dev/null
@@ -1,212 +0,0 @@
-PM Quality Of Service Interface.
-
-This interface provides a kernel and user mode interface for registering
-performance expectations by drivers, subsystems and user space applications on
-one of the parameters.
-
-Two different PM QoS frameworks are available:
-1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput,
-memory_bandwidth.
-2. the per-device PM QoS framework provides the API to manage the per-device latency
-constraints and PM QoS flags.
-
-Each parameters have defined units:
- * latency: usec
- * timeout: usec
- * throughput: kbs (kilo bit / sec)
- * memory bandwidth: mbs (mega bit / sec)
-
-
-1. PM QoS framework
-
-The infrastructure exposes multiple misc device nodes one per implemented
-parameter.  The set of parameters implement is defined by pm_qos_power_init()
-and pm_qos_params.h.  This is done because having the available parameters
-being runtime configurable or changeable from a driver was seen as too easy to
-abuse.
-
-For each parameter a list of performance requests is maintained along with
-an aggregated target value.  The aggregated target value is updated with
-changes to the request list or elements of the list.  Typically the
-aggregated target value is simply the max or min of the request values held
-in the parameter list elements.
-Note: the aggregated target value is implemented as an atomic variable so that
-reading the aggregated value does not require any locking mechanism.
-
-
-From kernel mode the use of this interface is simple:
-
-void pm_qos_add_request(handle, param_class, target_value):
-Will insert an element into the list for that identified PM QoS class with the
-target value.  Upon change to this list the new target is recomputed and any
-registered notifiers are called only if the target value is now different.
-Clients of pm_qos need to save the returned handle for future use in other
-pm_qos API functions.
-
-void pm_qos_update_request(handle, new_target_value):
-Will update the list element pointed to by the handle with the new target value
-and recompute the new aggregated target, calling the notification tree if the
-target is changed.
-
-void pm_qos_remove_request(handle):
-Will remove the element.  After removal it will update the aggregate target and
-call the notification tree if the target was changed as a result of removing
-the request.
-
-int pm_qos_request(param_class):
-Returns the aggregated value for a given PM QoS class.
-
-int pm_qos_request_active(handle):
-Returns if the request is still active, i.e. it has not been removed from a
-PM QoS class constraints list.
-
-int pm_qos_add_notifier(param_class, notifier):
-Adds a notification callback function to the PM QoS class. The callback is
-called when the aggregated value for the PM QoS class is changed.
-
-int pm_qos_remove_notifier(int param_class, notifier):
-Removes the notification callback function for the PM QoS class.
-
-
-From user mode:
-Only processes can register a pm_qos request.  To provide for automatic
-cleanup of a process, the interface requires the process to register its
-parameter requests in the following way:
-
-To register the default pm_qos target for the specific parameter, the process
-must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
-
-As long as the device node is held open that process has a registered
-request on the parameter.
-
-To change the requested target value the process needs to write an s32 value to
-the open device node.  Alternatively the user mode program could write a hex
-string for the value using 10 char long format e.g. "0x12345678".  This
-translates to a pm_qos_update_request call.
-
-To remove the user mode request for a target value simply close the device
-node.
-
-
-2. PM QoS per-device latency and flags framework
-
-For each device, there are three lists of PM QoS requests. Two of them are
-maintained along with the aggregated targets of resume latency and active
-state latency tolerance (in microseconds) and the third one is for PM QoS flags.
-Values are updated in response to changes of the request list.
-
-The target values of resume latency and active state latency tolerance are
-simply the minimum of the request values held in the parameter list elements.
-The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements'
-values.  One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF.
-
-Note: The aggregated target values are implemented in such a way that reading
-the aggregated value does not require any locking mechanism.
-
-
-From kernel mode the use of this interface is the following:
-
-int dev_pm_qos_add_request(device, handle, type, value):
-Will insert an element into the list for that identified device with the
-target value.  Upon change to this list the new target is recomputed and any
-registered notifiers are called only if the target value is now different.
-Clients of dev_pm_qos need to save the handle for future use in other
-dev_pm_qos API functions.
-
-int dev_pm_qos_update_request(handle, new_value):
-Will update the list element pointed to by the handle with the new target value
-and recompute the new aggregated target, calling the notification trees if the
-target is changed.
-
-int dev_pm_qos_remove_request(handle):
-Will remove the element.  After removal it will update the aggregate target and
-call the notification trees if the target was changed as a result of removing
-the request.
-
-s32 dev_pm_qos_read_value(device):
-Returns the aggregated value for a given device's constraints list.
-
-enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
-Check PM QoS flags of the given device against the given mask of flags.
-The meaning of the return values is as follows:
-	PM_QOS_FLAGS_ALL: All flags from the mask are set
-	PM_QOS_FLAGS_SOME: Some flags from the mask are set
-	PM_QOS_FLAGS_NONE: No flags from the mask are set
-	PM_QOS_FLAGS_UNDEFINED: The device's PM QoS structure has not been
-			initialized or the list of requests is empty.
-
-int dev_pm_qos_add_ancestor_request(dev, handle, type, value)
-Add a PM QoS request for the first direct ancestor of the given device whose
-power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests)
-or whose power.set_latency_tolerance callback pointer is not NULL (for
-DEV_PM_QOS_LATENCY_TOLERANCE requests).
-
-int dev_pm_qos_expose_latency_limit(device, value)
-Add a request to the device's PM QoS list of resume latency constraints and
-create a sysfs attribute pm_qos_resume_latency_us under the device's power
-directory allowing user space to manipulate that request.
-
-void dev_pm_qos_hide_latency_limit(device)
-Drop the request added by dev_pm_qos_expose_latency_limit() from the device's
-PM QoS list of resume latency constraints and remove sysfs attribute
-pm_qos_resume_latency_us from the device's power directory.
-
-int dev_pm_qos_expose_flags(device, value)
-Add a request to the device's PM QoS list of flags and create sysfs attribute
-pm_qos_no_power_off under the device's power directory allowing user space to
-change the value of the PM_QOS_FLAG_NO_POWER_OFF flag.
-
-void dev_pm_qos_hide_flags(device)
-Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list
-of flags and remove sysfs attribute pm_qos_no_power_off from the device's power
-directory.
-
-Notification mechanisms:
-The per-device PM QoS framework has a per-device notification tree.
-
-int dev_pm_qos_add_notifier(device, notifier):
-Adds a notification callback function for the device.
-The callback is called when the aggregated value of the device constraints list
-is changed (for resume latency device PM QoS only).
-
-int dev_pm_qos_remove_notifier(device, notifier):
-Removes the notification callback function for the device.
-
-
-Active state latency tolerance
-
-This device PM QoS type is used to support systems in which hardware may switch
-to energy-saving operation modes on the fly.  In those systems, if the operation
-mode chosen by the hardware attempts to save energy in an overly aggressive way,
-it may cause excess latencies to be visible to software, causing it to miss
-certain protocol requirements or target frame or sample rates etc.
-
-If there is a latency tolerance control mechanism for a given device available
-to software, the .set_latency_tolerance callback in that device's dev_pm_info
-structure should be populated.  The routine pointed to by it is should implement
-whatever is necessary to transfer the effective requirement value to the
-hardware.
-
-Whenever the effective latency tolerance changes for the device, its
-.set_latency_tolerance() callback will be executed and the effective value will
-be passed to it.  If that value is negative, which means that the list of
-latency tolerance requirements for the device is empty, the callback is expected
-to switch the underlying hardware latency tolerance control mechanism to an
-autonomous mode if available.  If that value is PM_QOS_LATENCY_ANY, in turn, and
-the hardware supports a special "no requirement" setting, the callback is
-expected to use it.  That allows software to prevent the hardware from
-automatically updating the device's latency tolerance in response to its power
-state changes (e.g. during transitions from D3cold to D0), which generally may
-be done in the autonomous latency tolerance control mode.
-
-If .set_latency_tolerance() is present for the device, sysfs attribute
-pm_qos_latency_tolerance_us will be present in the devivce's power directory.
-Then, user space can use that attribute to specify its latency tolerance
-requirement for the device, if any.  Writing "any" to it means "no requirement,
-but do not let the hardware control latency tolerance" and writing "auto" to it
-allows the hardware to be switched to the autonomous mode if there are no other
-requirements from the kernel side in the device's list.
-
-Kernel code can use the functions described above along with the
-DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
-latency tolerance requirements for devices.
diff --git a/Documentation/power/power_supply_class.rst b/Documentation/power/power_supply_class.rst
new file mode 100644
index 000000000000..3f2c3fe38a61
--- /dev/null
+++ b/Documentation/power/power_supply_class.rst
@@ -0,0 +1,282 @@
+========================
+Linux power supply class
+========================
+
+Synopsis
+~~~~~~~~
+Power supply class used to represent battery, UPS, AC or DC power supply
+properties to user-space.
+
+It defines core set of attributes, which should be applicable to (almost)
+every power supply out there. Attributes are available via sysfs and uevent
+interfaces.
+
+Each attribute has well defined meaning, up to unit of measure used. While
+the attributes provided are believed to be universally applicable to any
+power supply, specific monitoring hardware may not be able to provide them
+all, so any of them may be skipped.
+
+Power supply class is extensible, and allows to define drivers own attributes.
+The core attribute set is subject to the standard Linux evolution (i.e.
+if it will be found that some attribute is applicable to many power supply
+types or their drivers, it can be added to the core set).
+
+It also integrates with LED framework, for the purpose of providing
+typically expected feedback of battery charging/fully charged status and
+AC/USB power supply online status. (Note that specific details of the
+indication (including whether to use it at all) are fully controllable by
+user and/or specific machine defaults, per design principles of LED
+framework).
+
+
+Attributes/properties
+~~~~~~~~~~~~~~~~~~~~~
+Power supply class has predefined set of attributes, this eliminates code
+duplication across drivers. Power supply class insist on reusing its
+predefined attributes *and* their units.
+
+So, userspace gets predictable set of attributes and their units for any
+kind of power supply, and can process/present them to a user in consistent
+manner. Results for different power supplies and machines are also directly
+comparable.
+
+See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c
+for the example how to declare and handle attributes.
+
+
+Units
+~~~~~
+Quoting include/linux/power_supply.h:
+
+  All voltages, currents, charges, energies, time and temperatures in µV,
+  µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise
+  stated. It's driver's job to convert its raw values to units in which
+  this class operates.
+
+
+Attributes/properties detailed
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++--------------------------------------------------------------------------+
+|               **Charge/Energy/Capacity - how to not confuse**            |
++--------------------------------------------------------------------------+
+| **Because both "charge" (µAh) and "energy" (µWh) represents "capacity"   |
+| of battery, this class distinguish these terms. Don't mix them!**        |
+|                                                                          |
+| - `CHARGE_*`                                                             |
+|	attributes represents capacity in µAh only.                        |
+| - `ENERGY_*`                                                             |
+|	attributes represents capacity in µWh only.                        |
+| - `CAPACITY`                                                             |
+|	attribute represents capacity in *percents*, from 0 to 100.        |
++--------------------------------------------------------------------------+
+
+Postfixes:
+
+_AVG
+  *hardware* averaged value, use it if your hardware is really able to
+  report averaged values.
+_NOW
+  momentary/instantaneous values.
+
+STATUS
+  this attribute represents operating status (charging, full,
+  discharging (i.e. powering a load), etc.). This corresponds to
+  `BATTERY_STATUS_*` values, as defined in battery.h.
+
+CHARGE_TYPE
+  batteries can typically charge at different rates.
+  This defines trickle and fast charges.  For batteries that
+  are already charged or discharging, 'n/a' can be displayed (or
+  'unknown', if the status is not known).
+
+AUTHENTIC
+  indicates the power supply (battery or charger) connected
+  to the platform is authentic(1) or non authentic(0).
+
+HEALTH
+  represents health of the battery, values corresponds to
+  POWER_SUPPLY_HEALTH_*, defined in battery.h.
+
+VOLTAGE_OCV
+  open circuit voltage of the battery.
+
+VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN
+  design values for maximal and minimal power supply voltages.
+  Maximal/minimal means values of voltages when battery considered
+  "full"/"empty" at normal conditions. Yes, there is no direct relation
+  between voltage and battery capacity, but some dumb
+  batteries use voltage for very approximated calculation of capacity.
+  Battery driver also can use this attribute just to inform userspace
+  about maximal and minimal voltage thresholds of a given battery.
+
+VOLTAGE_MAX, VOLTAGE_MIN
+  same as _DESIGN voltage values except that these ones should be used
+  if hardware could only guess (measure and retain) the thresholds of a
+  given power supply.
+
+VOLTAGE_BOOT
+  Reports the voltage measured during boot
+
+CURRENT_BOOT
+  Reports the current measured during boot
+
+CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN
+  design charge values, when battery considered full/empty.
+
+ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN
+  same as above but for energy.
+
+CHARGE_FULL, CHARGE_EMPTY
+  These attributes means "last remembered value of charge when battery
+  became full/empty". It also could mean "value of charge when battery
+  considered full/empty at given conditions (temperature, age)".
+  I.e. these attributes represents real thresholds, not design values.
+
+ENERGY_FULL, ENERGY_EMPTY
+  same as above but for energy.
+
+CHARGE_COUNTER
+  the current charge counter (in µAh).  This could easily
+  be negative; there is no empty or full value.  It is only useful for
+  relative, time-based measurements.
+
+PRECHARGE_CURRENT
+  the maximum charge current during precharge phase of charge cycle
+  (typically 20% of battery capacity).
+
+CHARGE_TERM_CURRENT
+  Charge termination current. The charge cycle terminates when battery
+  voltage is above recharge threshold, and charge current is below
+  this setting (typically 10% of battery capacity).
+
+CONSTANT_CHARGE_CURRENT
+  constant charge current programmed by charger.
+
+
+CONSTANT_CHARGE_CURRENT_MAX
+  maximum charge current supported by the power supply object.
+
+CONSTANT_CHARGE_VOLTAGE
+  constant charge voltage programmed by charger.
+CONSTANT_CHARGE_VOLTAGE_MAX
+  maximum charge voltage supported by the power supply object.
+
+INPUT_CURRENT_LIMIT
+  input current limit programmed by charger. Indicates
+  the current drawn from a charging source.
+
+CHARGE_CONTROL_LIMIT
+  current charge control limit setting
+CHARGE_CONTROL_LIMIT_MAX
+  maximum charge control limit setting
+
+CALIBRATE
+  battery or coulomb counter calibration status
+
+CAPACITY
+  capacity in percents.
+CAPACITY_ALERT_MIN
+  minimum capacity alert value in percents.
+CAPACITY_ALERT_MAX
+  maximum capacity alert value in percents.
+CAPACITY_LEVEL
+  capacity level. This corresponds to POWER_SUPPLY_CAPACITY_LEVEL_*.
+
+TEMP
+  temperature of the power supply.
+TEMP_ALERT_MIN
+  minimum battery temperature alert.
+TEMP_ALERT_MAX
+  maximum battery temperature alert.
+TEMP_AMBIENT
+  ambient temperature.
+TEMP_AMBIENT_ALERT_MIN
+  minimum ambient temperature alert.
+TEMP_AMBIENT_ALERT_MAX
+  maximum ambient temperature alert.
+TEMP_MIN
+  minimum operatable temperature
+TEMP_MAX
+  maximum operatable temperature
+
+TIME_TO_EMPTY
+  seconds left for battery to be considered empty
+  (i.e. while battery powers a load)
+TIME_TO_FULL
+  seconds left for battery to be considered full
+  (i.e. while battery is charging)
+
+
+Battery <-> external power supply interaction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Often power supplies are acting as supplies and supplicants at the same
+time. Batteries are good example. So, batteries usually care if they're
+externally powered or not.
+
+For that case, power supply class implements notification mechanism for
+batteries.
+
+External power supply (AC) lists supplicants (batteries) names in
+"supplied_to" struct member, and each power_supply_changed() call
+issued by external power supply will notify supplicants via
+external_power_changed callback.
+
+
+Devicetree battery characteristics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Drivers should call power_supply_get_battery_info() to obtain battery
+characteristics from a devicetree battery node, defined in
+Documentation/devicetree/bindings/power/supply/battery.txt. This is
+implemented in drivers/power/supply/bq27xxx_battery.c.
+
+Properties in struct power_supply_battery_info and their counterparts in the
+battery node have names corresponding to elements in enum power_supply_property,
+for naming consistency between sysfs attributes and battery node properties.
+
+
+QA
+~~
+
+Q:
+   Where is POWER_SUPPLY_PROP_XYZ attribute?
+A:
+   If you cannot find attribute suitable for your driver needs, feel free
+   to add it and send patch along with your driver.
+
+   The attributes available currently are the ones currently provided by the
+   drivers written.
+
+   Good candidates to add in future: model/part#, cycle_time, manufacturer,
+   etc.
+
+
+Q:
+   I have some very specific attribute (e.g. battery color), should I add
+   this attribute to standard ones?
+A:
+   Most likely, no. Such attribute can be placed in the driver itself, if
+   it is useful. Of course, if the attribute in question applicable to
+   large set of batteries, provided by many drivers, and/or comes from
+   some general battery specification/standard, it may be a candidate to
+   be added to the core attribute set.
+
+
+Q:
+   Suppose, my battery monitoring chip/firmware does not provides capacity
+   in percents, but provides charge_{now,full,empty}. Should I calculate
+   percentage capacity manually, inside the driver, and register CAPACITY
+   attribute? The same question about time_to_empty/time_to_full.
+A:
+   Most likely, no. This class is designed to export properties which are
+   directly measurable by the specific hardware available.
+
+   Inferring not available properties using some heuristics or mathematical
+   model is not subject of work for a battery driver. Such functionality
+   should be factored out, and in fact, apm_power, the driver to serve
+   legacy APM API on top of power supply class, uses a simple heuristic of
+   approximating remaining battery capacity based on its charge, current,
+   voltage and so on. But full-fledged battery model is likely not subject
+   for kernel at all, as it would require floating point calculation to deal
+   with things like differential equations and Kalman filters. This is
+   better be handled by batteryd/libbattery, yet to be written.
diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt
deleted file mode 100644
index 300d37896e51..000000000000
--- a/Documentation/power/power_supply_class.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Linux power supply class
-========================
-
-Synopsis
-~~~~~~~~
-Power supply class used to represent battery, UPS, AC or DC power supply
-properties to user-space.
-
-It defines core set of attributes, which should be applicable to (almost)
-every power supply out there. Attributes are available via sysfs and uevent
-interfaces.
-
-Each attribute has well defined meaning, up to unit of measure used. While
-the attributes provided are believed to be universally applicable to any
-power supply, specific monitoring hardware may not be able to provide them
-all, so any of them may be skipped.
-
-Power supply class is extensible, and allows to define drivers own attributes.
-The core attribute set is subject to the standard Linux evolution (i.e.
-if it will be found that some attribute is applicable to many power supply
-types or their drivers, it can be added to the core set).
-
-It also integrates with LED framework, for the purpose of providing
-typically expected feedback of battery charging/fully charged status and
-AC/USB power supply online status. (Note that specific details of the
-indication (including whether to use it at all) are fully controllable by
-user and/or specific machine defaults, per design principles of LED
-framework).
-
-
-Attributes/properties
-~~~~~~~~~~~~~~~~~~~~~
-Power supply class has predefined set of attributes, this eliminates code
-duplication across drivers. Power supply class insist on reusing its
-predefined attributes *and* their units.
-
-So, userspace gets predictable set of attributes and their units for any
-kind of power supply, and can process/present them to a user in consistent
-manner. Results for different power supplies and machines are also directly
-comparable.
-
-See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c
-for the example how to declare and handle attributes.
-
-
-Units
-~~~~~
-Quoting include/linux/power_supply.h:
-
-  All voltages, currents, charges, energies, time and temperatures in µV,
-  µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise
-  stated. It's driver's job to convert its raw values to units in which
-  this class operates.
-
-
-Attributes/properties detailed
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-~ ~ ~ ~ ~ ~ ~  Charge/Energy/Capacity - how to not confuse  ~ ~ ~ ~ ~ ~ ~
-~                                                                       ~
-~ Because both "charge" (µAh) and "energy" (µWh) represents "capacity"  ~
-~ of battery, this class distinguish these terms. Don't mix them!       ~
-~                                                                       ~
-~ CHARGE_* attributes represents capacity in µAh only.                  ~
-~ ENERGY_* attributes represents capacity in µWh only.                  ~
-~ CAPACITY attribute represents capacity in *percents*, from 0 to 100.  ~
-~                                                                       ~
-~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
-
-Postfixes:
-_AVG - *hardware* averaged value, use it if your hardware is really able to
-report averaged values.
-_NOW - momentary/instantaneous values.
-
-STATUS - this attribute represents operating status (charging, full,
-discharging (i.e. powering a load), etc.). This corresponds to
-BATTERY_STATUS_* values, as defined in battery.h.
-
-CHARGE_TYPE - batteries can typically charge at different rates.
-This defines trickle and fast charges.  For batteries that
-are already charged or discharging, 'n/a' can be displayed (or
-'unknown', if the status is not known).
-
-AUTHENTIC - indicates the power supply (battery or charger) connected
-to the platform is authentic(1) or non authentic(0).
-
-HEALTH - represents health of the battery, values corresponds to
-POWER_SUPPLY_HEALTH_*, defined in battery.h.
-
-VOLTAGE_OCV - open circuit voltage of the battery.
-
-VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN - design values for maximal and
-minimal power supply voltages. Maximal/minimal means values of voltages
-when battery considered "full"/"empty" at normal conditions. Yes, there is
-no direct relation between voltage and battery capacity, but some dumb
-batteries use voltage for very approximated calculation of capacity.
-Battery driver also can use this attribute just to inform userspace
-about maximal and minimal voltage thresholds of a given battery.
-
-VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that
-these ones should be used if hardware could only guess (measure and
-retain) the thresholds of a given power supply.
-
-VOLTAGE_BOOT - Reports the voltage measured during boot
-
-CURRENT_BOOT - Reports the current measured during boot
-
-CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when
-battery considered full/empty.
-
-ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN - same as above but for energy.
-
-CHARGE_FULL, CHARGE_EMPTY - These attributes means "last remembered value
-of charge when battery became full/empty". It also could mean "value of
-charge when battery considered full/empty at given conditions (temperature,
-age)". I.e. these attributes represents real thresholds, not design values.
-
-ENERGY_FULL, ENERGY_EMPTY - same as above but for energy.
-
-CHARGE_COUNTER - the current charge counter (in µAh).  This could easily
-be negative; there is no empty or full value.  It is only useful for
-relative, time-based measurements.
-
-PRECHARGE_CURRENT - the maximum charge current during precharge phase
-of charge cycle (typically 20% of battery capacity).
-CHARGE_TERM_CURRENT - Charge termination current. The charge cycle
-terminates when battery voltage is above recharge threshold, and charge
-current is below this setting (typically 10% of battery capacity).
-
-CONSTANT_CHARGE_CURRENT - constant charge current programmed by charger.
-CONSTANT_CHARGE_CURRENT_MAX - maximum charge current supported by the
-power supply object.
-
-CONSTANT_CHARGE_VOLTAGE - constant charge voltage programmed by charger.
-CONSTANT_CHARGE_VOLTAGE_MAX - maximum charge voltage supported by the
-power supply object.
-
-INPUT_CURRENT_LIMIT - input current limit programmed by charger. Indicates
-the current drawn from a charging source.
-
-CHARGE_CONTROL_LIMIT - current charge control limit setting
-CHARGE_CONTROL_LIMIT_MAX - maximum charge control limit setting
-
-CALIBRATE - battery or coulomb counter calibration status
-
-CAPACITY - capacity in percents.
-CAPACITY_ALERT_MIN - minimum capacity alert value in percents.
-CAPACITY_ALERT_MAX - maximum capacity alert value in percents.
-CAPACITY_LEVEL - capacity level. This corresponds to
-POWER_SUPPLY_CAPACITY_LEVEL_*.
-
-TEMP - temperature of the power supply.
-TEMP_ALERT_MIN - minimum battery temperature alert.
-TEMP_ALERT_MAX - maximum battery temperature alert.
-TEMP_AMBIENT - ambient temperature.
-TEMP_AMBIENT_ALERT_MIN - minimum ambient temperature alert.
-TEMP_AMBIENT_ALERT_MAX - maximum ambient temperature alert.
-TEMP_MIN - minimum operatable temperature
-TEMP_MAX - maximum operatable temperature
-
-TIME_TO_EMPTY - seconds left for battery to be considered empty (i.e.
-while battery powers a load)
-TIME_TO_FULL - seconds left for battery to be considered full (i.e.
-while battery is charging)
-
-
-Battery <-> external power supply interaction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Often power supplies are acting as supplies and supplicants at the same
-time. Batteries are good example. So, batteries usually care if they're
-externally powered or not.
-
-For that case, power supply class implements notification mechanism for
-batteries.
-
-External power supply (AC) lists supplicants (batteries) names in
-"supplied_to" struct member, and each power_supply_changed() call
-issued by external power supply will notify supplicants via
-external_power_changed callback.
-
-
-Devicetree battery characteristics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Drivers should call power_supply_get_battery_info() to obtain battery
-characteristics from a devicetree battery node, defined in
-Documentation/devicetree/bindings/power/supply/battery.txt. This is
-implemented in drivers/power/supply/bq27xxx_battery.c.
-
-Properties in struct power_supply_battery_info and their counterparts in the
-battery node have names corresponding to elements in enum power_supply_property,
-for naming consistency between sysfs attributes and battery node properties.
-
-
-QA
-~~
-Q: Where is POWER_SUPPLY_PROP_XYZ attribute?
-A: If you cannot find attribute suitable for your driver needs, feel free
-   to add it and send patch along with your driver.
-
-   The attributes available currently are the ones currently provided by the
-   drivers written.
-
-   Good candidates to add in future: model/part#, cycle_time, manufacturer,
-   etc.
-
-
-Q: I have some very specific attribute (e.g. battery color), should I add
-   this attribute to standard ones?
-A: Most likely, no. Such attribute can be placed in the driver itself, if
-   it is useful. Of course, if the attribute in question applicable to
-   large set of batteries, provided by many drivers, and/or comes from
-   some general battery specification/standard, it may be a candidate to
-   be added to the core attribute set.
-
-
-Q: Suppose, my battery monitoring chip/firmware does not provides capacity
-   in percents, but provides charge_{now,full,empty}. Should I calculate
-   percentage capacity manually, inside the driver, and register CAPACITY
-   attribute? The same question about time_to_empty/time_to_full.
-A: Most likely, no. This class is designed to export properties which are
-   directly measurable by the specific hardware available.
-
-   Inferring not available properties using some heuristics or mathematical
-   model is not subject of work for a battery driver. Such functionality
-   should be factored out, and in fact, apm_power, the driver to serve
-   legacy APM API on top of power supply class, uses a simple heuristic of
-   approximating remaining battery capacity based on its charge, current,
-   voltage and so on. But full-fledged battery model is likely not subject
-   for kernel at all, as it would require floating point calculation to deal
-   with things like differential equations and Kalman filters. This is
-   better be handled by batteryd/libbattery, yet to be written.
diff --git a/Documentation/power/powercap/powercap.rst b/Documentation/power/powercap/powercap.rst
new file mode 100644
index 000000000000..7ae3b44c7624
--- /dev/null
+++ b/Documentation/power/powercap/powercap.rst
@@ -0,0 +1,257 @@
+=======================
+Power Capping Framework
+=======================
+
+The power capping framework provides a consistent interface between the kernel
+and the user space that allows power capping drivers to expose the settings to
+user space in a uniform way.
+
+Terminology
+===========
+
+The framework exposes power capping devices to user space via sysfs in the
+form of a tree of objects. The objects at the root level of the tree represent
+'control types', which correspond to different methods of power capping.  For
+example, the intel-rapl control type represents the Intel "Running Average
+Power Limit" (RAPL) technology, whereas the 'idle-injection' control type
+corresponds to the use of idle injection for controlling power.
+
+Power zones represent different parts of the system, which can be controlled and
+monitored using the power capping method determined by the control type the
+given zone belongs to. They each contain attributes for monitoring power, as
+well as controls represented in the form of power constraints.  If the parts of
+the system represented by different power zones are hierarchical (that is, one
+bigger part consists of multiple smaller parts that each have their own power
+controls), those power zones may also be organized in a hierarchy with one
+parent power zone containing multiple subzones and so on to reflect the power
+control topology of the system.  In that case, it is possible to apply power
+capping to a set of devices together using the parent power zone and if more
+fine grained control is required, it can be applied through the subzones.
+
+
+Example sysfs interface tree::
+
+  /sys/devices/virtual/powercap
+  └──intel-rapl
+      ├──intel-rapl:0
+      │   ├──constraint_0_name
+      │   ├──constraint_0_power_limit_uw
+      │   ├──constraint_0_time_window_us
+      │   ├──constraint_1_name
+      │   ├──constraint_1_power_limit_uw
+      │   ├──constraint_1_time_window_us
+      │   ├──device -> ../../intel-rapl
+      │   ├──energy_uj
+      │   ├──intel-rapl:0:0
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:0
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──intel-rapl:0:1
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:0
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──max_energy_range_uj
+      │   ├──max_power_range_uw
+      │   ├──name
+      │   ├──enabled
+      │   ├──power
+      │   │   ├──async
+      │   │   []
+      │   ├──subsystem -> ../../../../../class/power_cap
+      │   ├──enabled
+      │   ├──uevent
+      ├──intel-rapl:1
+      │   ├──constraint_0_name
+      │   ├──constraint_0_power_limit_uw
+      │   ├──constraint_0_time_window_us
+      │   ├──constraint_1_name
+      │   ├──constraint_1_power_limit_uw
+      │   ├──constraint_1_time_window_us
+      │   ├──device -> ../../intel-rapl
+      │   ├──energy_uj
+      │   ├──intel-rapl:1:0
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:1
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──intel-rapl:1:1
+      │   │   ├──constraint_0_name
+      │   │   ├──constraint_0_power_limit_uw
+      │   │   ├──constraint_0_time_window_us
+      │   │   ├──constraint_1_name
+      │   │   ├──constraint_1_power_limit_uw
+      │   │   ├──constraint_1_time_window_us
+      │   │   ├──device -> ../../intel-rapl:1
+      │   │   ├──energy_uj
+      │   │   ├──max_energy_range_uj
+      │   │   ├──name
+      │   │   ├──enabled
+      │   │   ├──power
+      │   │   │   ├──async
+      │   │   │   []
+      │   │   ├──subsystem -> ../../../../../../class/power_cap
+      │   │   └──uevent
+      │   ├──max_energy_range_uj
+      │   ├──max_power_range_uw
+      │   ├──name
+      │   ├──enabled
+      │   ├──power
+      │   │   ├──async
+      │   │   []
+      │   ├──subsystem -> ../../../../../class/power_cap
+      │   ├──uevent
+      ├──power
+      │   ├──async
+      │   []
+      ├──subsystem -> ../../../../class/power_cap
+      ├──enabled
+      └──uevent
+
+The above example illustrates a case in which the Intel RAPL technology,
+available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one
+control type called intel-rapl which contains two power zones, intel-rapl:0 and
+intel-rapl:1, representing CPU packages.  Each of these power zones contains
+two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the
+"core" and the "uncore" parts of the given CPU package, respectively.  All of
+the zones and subzones contain energy monitoring attributes (energy_uj,
+max_energy_range_uj) and constraint attributes (constraint_*) allowing controls
+to be applied (the constraints in the 'package' power zones apply to the whole
+CPU packages and the subzone constraints only apply to the respective parts of
+the given package individually). Since Intel RAPL doesn't provide instantaneous
+power value, there is no power_uw attribute.
+
+In addition to that, each power zone contains a name attribute, allowing the
+part of the system represented by that zone to be identified.
+For example::
+
+	cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name
+
+package-0
+---------
+
+The Intel RAPL technology allows two constraints, short term and long term,
+with two different time windows to be applied to each power zone.  Thus for
+each zone there are 2 attributes representing the constraint names, 2 power
+limits and 2 attributes representing the sizes of the time windows. Such that,
+constraint_j_* attributes correspond to the jth constraint (j = 0,1).
+
+For example::
+
+	constraint_0_name
+	constraint_0_power_limit_uw
+	constraint_0_time_window_us
+	constraint_1_name
+	constraint_1_power_limit_uw
+	constraint_1_time_window_us
+
+Power Zone Attributes
+=====================
+
+Monitoring attributes
+---------------------
+
+energy_uj (rw)
+	Current energy counter in micro joules. Write "0" to reset.
+	If the counter can not be reset, then this attribute is read only.
+
+max_energy_range_uj (ro)
+	Range of the above energy counter in micro-joules.
+
+power_uw (ro)
+	Current power in micro watts.
+
+max_power_range_uw (ro)
+	Range of the above power value in micro-watts.
+
+name (ro)
+	Name of this power zone.
+
+It is possible that some domains have both power ranges and energy counter ranges;
+however, only one is mandatory.
+
+Constraints
+-----------
+
+constraint_X_power_limit_uw (rw)
+	Power limit in micro watts, which should be applicable for the
+	time window specified by "constraint_X_time_window_us".
+
+constraint_X_time_window_us (rw)
+	Time window in micro seconds.
+
+constraint_X_name (ro)
+	An optional name of the constraint
+
+constraint_X_max_power_uw(ro)
+	Maximum allowed power in micro watts.
+
+constraint_X_min_power_uw(ro)
+	Minimum allowed power in micro watts.
+
+constraint_X_max_time_window_us(ro)
+	Maximum allowed time window in micro seconds.
+
+constraint_X_min_time_window_us(ro)
+	Minimum allowed time window in micro seconds.
+
+Except power_limit_uw and time_window_us other fields are optional.
+
+Common zone and control type attributes
+---------------------------------------
+
+enabled (rw): Enable/Disable controls at zone level or for all zones using
+a control type.
+
+Power Cap Client Driver Interface
+=================================
+
+The API summary:
+
+Call powercap_register_control_type() to register control type object.
+Call powercap_register_zone() to register a power zone (under a given
+control type), either as a top-level power zone or as a subzone of another
+power zone registered earlier.
+The number of constraints in a power zone and the corresponding callbacks have
+to be defined prior to calling powercap_register_zone() to register that zone.
+
+To Free a power zone call powercap_unregister_zone().
+To free a control type object call powercap_unregister_control_type().
+Detailed API can be generated using kernel-doc on include/linux/powercap.h.
diff --git a/Documentation/power/powercap/powercap.txt b/Documentation/power/powercap/powercap.txt
deleted file mode 100644
index 1e6ef164e07a..000000000000
--- a/Documentation/power/powercap/powercap.txt
+++ /dev/null
@@ -1,236 +0,0 @@
-Power Capping Framework
-==================================
-
-The power capping framework provides a consistent interface between the kernel
-and the user space that allows power capping drivers to expose the settings to
-user space in a uniform way.
-
-Terminology
-=========================
-The framework exposes power capping devices to user space via sysfs in the
-form of a tree of objects. The objects at the root level of the tree represent
-'control types', which correspond to different methods of power capping.  For
-example, the intel-rapl control type represents the Intel "Running Average
-Power Limit" (RAPL) technology, whereas the 'idle-injection' control type
-corresponds to the use of idle injection for controlling power.
-
-Power zones represent different parts of the system, which can be controlled and
-monitored using the power capping method determined by the control type the
-given zone belongs to. They each contain attributes for monitoring power, as
-well as controls represented in the form of power constraints.  If the parts of
-the system represented by different power zones are hierarchical (that is, one
-bigger part consists of multiple smaller parts that each have their own power
-controls), those power zones may also be organized in a hierarchy with one
-parent power zone containing multiple subzones and so on to reflect the power
-control topology of the system.  In that case, it is possible to apply power
-capping to a set of devices together using the parent power zone and if more
-fine grained control is required, it can be applied through the subzones.
-
-
-Example sysfs interface tree:
-
-/sys/devices/virtual/powercap
-??? intel-rapl
-    ??? intel-rapl:0
-    ?   ??? constraint_0_name
-    ?   ??? constraint_0_power_limit_uw
-    ?   ??? constraint_0_time_window_us
-    ?   ??? constraint_1_name
-    ?   ??? constraint_1_power_limit_uw
-    ?   ??? constraint_1_time_window_us
-    ?   ??? device -> ../../intel-rapl
-    ?   ??? energy_uj
-    ?   ??? intel-rapl:0:0
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:0
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? intel-rapl:0:1
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:0
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? max_energy_range_uj
-    ?   ??? max_power_range_uw
-    ?   ??? name
-    ?   ??? enabled
-    ?   ??? power
-    ?   ?   ??? async
-    ?   ?   []
-    ?   ??? subsystem -> ../../../../../class/power_cap
-    ?   ??? enabled
-    ?   ??? uevent
-    ??? intel-rapl:1
-    ?   ??? constraint_0_name
-    ?   ??? constraint_0_power_limit_uw
-    ?   ??? constraint_0_time_window_us
-    ?   ??? constraint_1_name
-    ?   ??? constraint_1_power_limit_uw
-    ?   ??? constraint_1_time_window_us
-    ?   ??? device -> ../../intel-rapl
-    ?   ??? energy_uj
-    ?   ??? intel-rapl:1:0
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:1
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? intel-rapl:1:1
-    ?   ?   ??? constraint_0_name
-    ?   ?   ??? constraint_0_power_limit_uw
-    ?   ?   ??? constraint_0_time_window_us
-    ?   ?   ??? constraint_1_name
-    ?   ?   ??? constraint_1_power_limit_uw
-    ?   ?   ??? constraint_1_time_window_us
-    ?   ?   ??? device -> ../../intel-rapl:1
-    ?   ?   ??? energy_uj
-    ?   ?   ??? max_energy_range_uj
-    ?   ?   ??? name
-    ?   ?   ??? enabled
-    ?   ?   ??? power
-    ?   ?   ?   ??? async
-    ?   ?   ?   []
-    ?   ?   ??? subsystem -> ../../../../../../class/power_cap
-    ?   ?   ??? uevent
-    ?   ??? max_energy_range_uj
-    ?   ??? max_power_range_uw
-    ?   ??? name
-    ?   ??? enabled
-    ?   ??? power
-    ?   ?   ??? async
-    ?   ?   []
-    ?   ??? subsystem -> ../../../../../class/power_cap
-    ?   ??? uevent
-    ??? power
-    ?   ??? async
-    ?   []
-    ??? subsystem -> ../../../../class/power_cap
-    ??? enabled
-    ??? uevent
-
-The above example illustrates a case in which the Intel RAPL technology,
-available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one
-control type called intel-rapl which contains two power zones, intel-rapl:0 and
-intel-rapl:1, representing CPU packages.  Each of these power zones contains
-two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the
-"core" and the "uncore" parts of the given CPU package, respectively.  All of
-the zones and subzones contain energy monitoring attributes (energy_uj,
-max_energy_range_uj) and constraint attributes (constraint_*) allowing controls
-to be applied (the constraints in the 'package' power zones apply to the whole
-CPU packages and the subzone constraints only apply to the respective parts of
-the given package individually). Since Intel RAPL doesn't provide instantaneous
-power value, there is no power_uw attribute.
-
-In addition to that, each power zone contains a name attribute, allowing the
-part of the system represented by that zone to be identified.
-For example:
-
-cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name
-package-0
-
-The Intel RAPL technology allows two constraints, short term and long term,
-with two different time windows to be applied to each power zone.  Thus for
-each zone there are 2 attributes representing the constraint names, 2 power
-limits and 2 attributes representing the sizes of the time windows. Such that,
-constraint_j_* attributes correspond to the jth constraint (j = 0,1).
-
-For example:
-	constraint_0_name
-	constraint_0_power_limit_uw
-	constraint_0_time_window_us
-	constraint_1_name
-	constraint_1_power_limit_uw
-	constraint_1_time_window_us
-
-Power Zone Attributes
-=================================
-Monitoring attributes
-----------------------
-
-energy_uj (rw): Current energy counter in micro joules. Write "0" to reset.
-If the counter can not be reset, then this attribute is read only.
-
-max_energy_range_uj (ro): Range of the above energy counter in micro-joules.
-
-power_uw (ro): Current power in micro watts.
-
-max_power_range_uw (ro): Range of the above power value in micro-watts.
-
-name (ro): Name of this power zone.
-
-It is possible that some domains have both power ranges and energy counter ranges;
-however, only one is mandatory.
-
-Constraints
-----------------
-constraint_X_power_limit_uw (rw): Power limit in micro watts, which should be
-applicable for the time window specified by "constraint_X_time_window_us".
-
-constraint_X_time_window_us (rw): Time window in micro seconds.
-
-constraint_X_name (ro): An optional name of the constraint
-
-constraint_X_max_power_uw(ro): Maximum allowed power in micro watts.
-
-constraint_X_min_power_uw(ro): Minimum allowed power in micro watts.
-
-constraint_X_max_time_window_us(ro): Maximum allowed time window in micro seconds.
-
-constraint_X_min_time_window_us(ro): Minimum allowed time window in micro seconds.
-
-Except power_limit_uw and time_window_us other fields are optional.
-
-Common zone and control type attributes
-----------------------------------------
-enabled (rw): Enable/Disable controls at zone level or for all zones using
-a control type.
-
-Power Cap Client Driver Interface
-==================================
-The API summary:
-
-Call powercap_register_control_type() to register control type object.
-Call powercap_register_zone() to register a power zone (under a given
-control type), either as a top-level power zone or as a subzone of another
-power zone registered earlier.
-The number of constraints in a power zone and the corresponding callbacks have
-to be defined prior to calling powercap_register_zone() to register that zone.
-
-To Free a power zone call powercap_unregister_zone().
-To free a control type object call powercap_unregister_control_type().
-Detailed API can be generated using kernel-doc on include/linux/powercap.h.
diff --git a/Documentation/power/regulator/consumer.rst b/Documentation/power/regulator/consumer.rst
new file mode 100644
index 000000000000..0cd8cc1275a7
--- /dev/null
+++ b/Documentation/power/regulator/consumer.rst
@@ -0,0 +1,229 @@
+===================================
+Regulator Consumer Driver Interface
+===================================
+
+This text describes the regulator interface for consumer device drivers.
+Please see overview.txt for a description of the terms used in this text.
+
+
+1. Consumer Regulator Access (static & dynamic drivers)
+=======================================================
+
+A consumer driver can get access to its supply regulator by calling ::
+
+	regulator = regulator_get(dev, "Vcc");
+
+The consumer passes in its struct device pointer and power supply ID. The core
+then finds the correct regulator by consulting a machine specific lookup table.
+If the lookup is successful then this call will return a pointer to the struct
+regulator that supplies this consumer.
+
+To release the regulator the consumer driver should call ::
+
+	regulator_put(regulator);
+
+Consumers can be supplied by more than one regulator e.g. codec consumer with
+analog and digital supplies ::
+
+	digital = regulator_get(dev, "Vcc");  /* digital core */
+	analog = regulator_get(dev, "Avdd");  /* analog */
+
+The regulator access functions regulator_get() and regulator_put() will
+usually be called in your device drivers probe() and remove() respectively.
+
+
+2. Regulator Output Enable & Disable (static & dynamic drivers)
+===============================================================
+
+
+A consumer can enable its power supply by calling::
+
+	int regulator_enable(regulator);
+
+NOTE:
+  The supply may already be enabled before regulator_enabled() is called.
+  This may happen if the consumer shares the regulator or the regulator has been
+  previously enabled by bootloader or kernel board initialization code.
+
+A consumer can determine if a regulator is enabled by calling::
+
+	int regulator_is_enabled(regulator);
+
+This will return > zero when the regulator is enabled.
+
+
+A consumer can disable its supply when no longer needed by calling::
+
+	int regulator_disable(regulator);
+
+NOTE:
+  This may not disable the supply if it's shared with other consumers. The
+  regulator will only be disabled when the enabled reference count is zero.
+
+Finally, a regulator can be forcefully disabled in the case of an emergency::
+
+	int regulator_force_disable(regulator);
+
+NOTE:
+  this will immediately and forcefully shutdown the regulator output. All
+  consumers will be powered off.
+
+
+3. Regulator Voltage Control & Status (dynamic drivers)
+=======================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+voltage to match system operating points. e.g. CPUfreq drivers can scale
+voltage along with frequency to save power, SD drivers may need to select the
+correct card voltage, etc.
+
+Consumers can control their supply voltage by calling::
+
+	int regulator_set_voltage(regulator, min_uV, max_uV);
+
+Where min_uV and max_uV are the minimum and maximum acceptable voltages in
+microvolts.
+
+NOTE: this can be called when the regulator is enabled or disabled. If called
+when enabled, then the voltage changes instantly, otherwise the voltage
+configuration changes and the voltage is physically set when the regulator is
+next enabled.
+
+The regulators configured voltage output can be found by calling::
+
+	int regulator_get_voltage(regulator);
+
+NOTE:
+  get_voltage() will return the configured output voltage whether the
+  regulator is enabled or disabled and should NOT be used to determine regulator
+  output state. However this can be used in conjunction with is_enabled() to
+  determine the regulator physical output voltage.
+
+
+4. Regulator Current Limit Control & Status (dynamic drivers)
+=============================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+current limit to match system operating points. e.g. LCD backlight driver can
+change the current limit to vary the backlight brightness, USB drivers may want
+to set the limit to 500mA when supplying power.
+
+Consumers can control their supply current limit by calling::
+
+	int regulator_set_current_limit(regulator, min_uA, max_uA);
+
+Where min_uA and max_uA are the minimum and maximum acceptable current limit in
+microamps.
+
+NOTE:
+  this can be called when the regulator is enabled or disabled. If called
+  when enabled, then the current limit changes instantly, otherwise the current
+  limit configuration changes and the current limit is physically set when the
+  regulator is next enabled.
+
+A regulators current limit can be found by calling::
+
+	int regulator_get_current_limit(regulator);
+
+NOTE:
+  get_current_limit() will return the current limit whether the regulator
+  is enabled or disabled and should not be used to determine regulator current
+  load.
+
+
+5. Regulator Operating Mode Control & Status (dynamic drivers)
+==============================================================
+
+Some consumers can further save system power by changing the operating mode of
+their supply regulator to be more efficient when the consumers operating state
+changes. e.g. consumer driver is idle and subsequently draws less current
+
+Regulator operating mode can be changed indirectly or directly.
+
+Indirect operating mode control.
+--------------------------------
+Consumer drivers can request a change in their supply regulator operating mode
+by calling::
+
+	int regulator_set_load(struct regulator *regulator, int load_uA);
+
+This will cause the core to recalculate the total load on the regulator (based
+on all its consumers) and change operating mode (if necessary and permitted)
+to best match the current operating load.
+
+The load_uA value can be determined from the consumer's datasheet. e.g. most
+datasheets have tables showing the maximum current consumed in certain
+situations.
+
+Most consumers will use indirect operating mode control since they have no
+knowledge of the regulator or whether the regulator is shared with other
+consumers.
+
+Direct operating mode control.
+------------------------------
+
+Bespoke or tightly coupled drivers may want to directly control regulator
+operating mode depending on their operating point. This can be achieved by
+calling::
+
+	int regulator_set_mode(struct regulator *regulator, unsigned int mode);
+	unsigned int regulator_get_mode(struct regulator *regulator);
+
+Direct mode will only be used by consumers that *know* about the regulator and
+are not sharing the regulator with other consumers.
+
+
+6. Regulator Events
+===================
+
+Regulators can notify consumers of external events. Events could be received by
+consumers under regulator stress or failure conditions.
+
+Consumers can register interest in regulator events by calling::
+
+	int regulator_register_notifier(struct regulator *regulator,
+					struct notifier_block *nb);
+
+Consumers can unregister interest by calling::
+
+	int regulator_unregister_notifier(struct regulator *regulator,
+					  struct notifier_block *nb);
+
+Regulators use the kernel notifier framework to send event to their interested
+consumers.
+
+7. Regulator Direct Register Access
+===================================
+
+Some kinds of power management hardware or firmware are designed such that
+they need to do low-level hardware access to regulators, with no involvement
+from the kernel. Examples of such devices are:
+
+- clocksource with a voltage-controlled oscillator and control logic to change
+  the supply voltage over I2C to achieve a desired output clock rate
+- thermal management firmware that can issue an arbitrary I2C transaction to
+  perform system poweroff during overtemperature conditions
+
+To set up such a device/firmware, various parameters like I2C address of the
+regulator, addresses of various regulator registers etc. need to be configured
+to it. The regulator framework provides the following helpers for querying
+these details.
+
+Bus-specific details, like I2C addresses or transfer rates are handled by the
+regmap framework. To get the regulator's regmap (if supported), use::
+
+	struct regmap *regulator_get_regmap(struct regulator *regulator);
+
+To obtain the hardware register offset and bitmask for the regulator's voltage
+selector register, use::
+
+	int regulator_get_hardware_vsel_register(struct regulator *regulator,
+						 unsigned *vsel_reg,
+						 unsigned *vsel_mask);
+
+To convert a regulator framework voltage selector code (used by
+regulator_list_voltage) to a hardware-specific voltage selector that can be
+directly written to the voltage selector register, use::
+
+	int regulator_list_hardware_vsel(struct regulator *regulator,
+					 unsigned selector);
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
deleted file mode 100644
index e51564c1a140..000000000000
--- a/Documentation/power/regulator/consumer.txt
+++ /dev/null
@@ -1,218 +0,0 @@
-Regulator Consumer Driver Interface
-===================================
-
-This text describes the regulator interface for consumer device drivers.
-Please see overview.txt for a description of the terms used in this text.
-
-
-1. Consumer Regulator Access (static & dynamic drivers)
-=======================================================
-
-A consumer driver can get access to its supply regulator by calling :-
-
-regulator = regulator_get(dev, "Vcc");
-
-The consumer passes in its struct device pointer and power supply ID. The core
-then finds the correct regulator by consulting a machine specific lookup table.
-If the lookup is successful then this call will return a pointer to the struct
-regulator that supplies this consumer.
-
-To release the regulator the consumer driver should call :-
-
-regulator_put(regulator);
-
-Consumers can be supplied by more than one regulator e.g. codec consumer with
-analog and digital supplies :-
-
-digital = regulator_get(dev, "Vcc");  /* digital core */
-analog = regulator_get(dev, "Avdd");  /* analog */
-
-The regulator access functions regulator_get() and regulator_put() will
-usually be called in your device drivers probe() and remove() respectively.
-
-
-2. Regulator Output Enable & Disable (static & dynamic drivers)
-====================================================================
-
-A consumer can enable its power supply by calling:-
-
-int regulator_enable(regulator);
-
-NOTE: The supply may already be enabled before regulator_enabled() is called.
-This may happen if the consumer shares the regulator or the regulator has been
-previously enabled by bootloader or kernel board initialization code.
-
-A consumer can determine if a regulator is enabled by calling :-
-
-int regulator_is_enabled(regulator);
-
-This will return > zero when the regulator is enabled.
-
-
-A consumer can disable its supply when no longer needed by calling :-
-
-int regulator_disable(regulator);
-
-NOTE: This may not disable the supply if it's shared with other consumers. The
-regulator will only be disabled when the enabled reference count is zero.
-
-Finally, a regulator can be forcefully disabled in the case of an emergency :-
-
-int regulator_force_disable(regulator);
-
-NOTE: this will immediately and forcefully shutdown the regulator output. All
-consumers will be powered off.
-
-
-3. Regulator Voltage Control & Status (dynamic drivers)
-======================================================
-
-Some consumer drivers need to be able to dynamically change their supply
-voltage to match system operating points. e.g. CPUfreq drivers can scale
-voltage along with frequency to save power, SD drivers may need to select the
-correct card voltage, etc.
-
-Consumers can control their supply voltage by calling :-
-
-int regulator_set_voltage(regulator, min_uV, max_uV);
-
-Where min_uV and max_uV are the minimum and maximum acceptable voltages in
-microvolts.
-
-NOTE: this can be called when the regulator is enabled or disabled. If called
-when enabled, then the voltage changes instantly, otherwise the voltage
-configuration changes and the voltage is physically set when the regulator is
-next enabled.
-
-The regulators configured voltage output can be found by calling :-
-
-int regulator_get_voltage(regulator);
-
-NOTE: get_voltage() will return the configured output voltage whether the
-regulator is enabled or disabled and should NOT be used to determine regulator
-output state. However this can be used in conjunction with is_enabled() to
-determine the regulator physical output voltage.
-
-
-4. Regulator Current Limit Control & Status (dynamic drivers)
-===========================================================
-
-Some consumer drivers need to be able to dynamically change their supply
-current limit to match system operating points. e.g. LCD backlight driver can
-change the current limit to vary the backlight brightness, USB drivers may want
-to set the limit to 500mA when supplying power.
-
-Consumers can control their supply current limit by calling :-
-
-int regulator_set_current_limit(regulator, min_uA, max_uA);
-
-Where min_uA and max_uA are the minimum and maximum acceptable current limit in
-microamps.
-
-NOTE: this can be called when the regulator is enabled or disabled. If called
-when enabled, then the current limit changes instantly, otherwise the current
-limit configuration changes and the current limit is physically set when the
-regulator is next enabled.
-
-A regulators current limit can be found by calling :-
-
-int regulator_get_current_limit(regulator);
-
-NOTE: get_current_limit() will return the current limit whether the regulator
-is enabled or disabled and should not be used to determine regulator current
-load.
-
-
-5. Regulator Operating Mode Control & Status (dynamic drivers)
-=============================================================
-
-Some consumers can further save system power by changing the operating mode of
-their supply regulator to be more efficient when the consumers operating state
-changes. e.g. consumer driver is idle and subsequently draws less current
-
-Regulator operating mode can be changed indirectly or directly.
-
-Indirect operating mode control.
---------------------------------
-Consumer drivers can request a change in their supply regulator operating mode
-by calling :-
-
-int regulator_set_load(struct regulator *regulator, int load_uA);
-
-This will cause the core to recalculate the total load on the regulator (based
-on all its consumers) and change operating mode (if necessary and permitted)
-to best match the current operating load.
-
-The load_uA value can be determined from the consumer's datasheet. e.g. most
-datasheets have tables showing the maximum current consumed in certain
-situations.
-
-Most consumers will use indirect operating mode control since they have no
-knowledge of the regulator or whether the regulator is shared with other
-consumers.
-
-Direct operating mode control.
-------------------------------
-Bespoke or tightly coupled drivers may want to directly control regulator
-operating mode depending on their operating point. This can be achieved by
-calling :-
-
-int regulator_set_mode(struct regulator *regulator, unsigned int mode);
-unsigned int regulator_get_mode(struct regulator *regulator);
-
-Direct mode will only be used by consumers that *know* about the regulator and
-are not sharing the regulator with other consumers.
-
-
-6. Regulator Events
-===================
-Regulators can notify consumers of external events. Events could be received by
-consumers under regulator stress or failure conditions.
-
-Consumers can register interest in regulator events by calling :-
-
-int regulator_register_notifier(struct regulator *regulator,
-			      struct notifier_block *nb);
-
-Consumers can unregister interest by calling :-
-
-int regulator_unregister_notifier(struct regulator *regulator,
-				struct notifier_block *nb);
-
-Regulators use the kernel notifier framework to send event to their interested
-consumers.
-
-7. Regulator Direct Register Access
-===================================
-Some kinds of power management hardware or firmware are designed such that
-they need to do low-level hardware access to regulators, with no involvement
-from the kernel. Examples of such devices are:
-
-- clocksource with a voltage-controlled oscillator and control logic to change
-  the supply voltage over I2C to achieve a desired output clock rate
-- thermal management firmware that can issue an arbitrary I2C transaction to
-  perform system poweroff during overtemperature conditions
-
-To set up such a device/firmware, various parameters like I2C address of the
-regulator, addresses of various regulator registers etc. need to be configured
-to it. The regulator framework provides the following helpers for querying
-these details.
-
-Bus-specific details, like I2C addresses or transfer rates are handled by the
-regmap framework. To get the regulator's regmap (if supported), use :-
-
-struct regmap *regulator_get_regmap(struct regulator *regulator);
-
-To obtain the hardware register offset and bitmask for the regulator's voltage
-selector register, use :-
-
-int regulator_get_hardware_vsel_register(struct regulator *regulator,
-					 unsigned *vsel_reg,
-					 unsigned *vsel_mask);
-
-To convert a regulator framework voltage selector code (used by
-regulator_list_voltage) to a hardware-specific voltage selector that can be
-directly written to the voltage selector register, use :-
-
-int regulator_list_hardware_vsel(struct regulator *regulator,
-				 unsigned selector);
diff --git a/Documentation/power/regulator/design.rst b/Documentation/power/regulator/design.rst
new file mode 100644
index 000000000000..3b09c6841dc4
--- /dev/null
+++ b/Documentation/power/regulator/design.rst
@@ -0,0 +1,38 @@
+==========================
+Regulator API design notes
+==========================
+
+This document provides a brief, partially structured, overview of some
+of the design considerations which impact the regulator API design.
+
+Safety
+------
+
+ - Errors in regulator configuration can have very serious consequences
+   for the system, potentially including lasting hardware damage.
+ - It is not possible to automatically determine the power configuration
+   of the system - software-equivalent variants of the same chip may
+   have different power requirements, and not all components with power
+   requirements are visible to software.
+
+.. note::
+
+     The API should make no changes to the hardware state unless it has
+     specific knowledge that these changes are safe to perform on this
+     particular system.
+
+Consumer use cases
+------------------
+
+ - The overwhelming majority of devices in a system will have no
+   requirement to do any runtime configuration of their power beyond
+   being able to turn it on or off.
+
+ - Many of the power supplies in the system will be shared between many
+   different consumers.
+
+.. note::
+
+     The consumer API should be structured so that these use cases are
+     very easy to handle and so that consumers will work with shared
+     supplies without any additional effort.
diff --git a/Documentation/power/regulator/design.txt b/Documentation/power/regulator/design.txt
deleted file mode 100644
index fdd919b96830..000000000000
--- a/Documentation/power/regulator/design.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-Regulator API design notes
-==========================
-
-This document provides a brief, partially structured, overview of some
-of the design considerations which impact the regulator API design.
-
-Safety
-------
-
- - Errors in regulator configuration can have very serious consequences
-   for the system, potentially including lasting hardware damage.
- - It is not possible to automatically determine the power configuration
-   of the system - software-equivalent variants of the same chip may
-   have different power requirements, and not all components with power
-   requirements are visible to software.
-
-  => The API should make no changes to the hardware state unless it has
-     specific knowledge that these changes are safe to perform on this
-     particular system.
-
-Consumer use cases
-------------------
-
- - The overwhelming majority of devices in a system will have no
-   requirement to do any runtime configuration of their power beyond
-   being able to turn it on or off.
-
- - Many of the power supplies in the system will be shared between many
-   different consumers.
-
-  => The consumer API should be structured so that these use cases are
-     very easy to handle and so that consumers will work with shared
-     supplies without any additional effort.
diff --git a/Documentation/power/regulator/machine.rst b/Documentation/power/regulator/machine.rst
new file mode 100644
index 000000000000..22fffefaa3ad
--- /dev/null
+++ b/Documentation/power/regulator/machine.rst
@@ -0,0 +1,97 @@
+==================================
+Regulator Machine Driver Interface
+==================================
+
+The regulator machine driver interface is intended for board/machine specific
+initialisation code to configure the regulator subsystem.
+
+Consider the following machine::
+
+  Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V]
+               |
+               +-> [Consumer B @ 3.3V]
+
+The drivers for consumers A & B must be mapped to the correct regulator in
+order to control their power supplies. This mapping can be achieved in machine
+initialisation code by creating a struct regulator_consumer_supply for
+each regulator::
+
+  struct regulator_consumer_supply {
+	const char *dev_name;	/* consumer dev_name() */
+	const char *supply;	/* consumer supply - e.g. "vcc" */
+  };
+
+e.g. for the machine above::
+
+  static struct regulator_consumer_supply regulator1_consumers[] = {
+	REGULATOR_SUPPLY("Vcc", "consumer B"),
+  };
+
+  static struct regulator_consumer_supply regulator2_consumers[] = {
+	REGULATOR_SUPPLY("Vcc", "consumer A"),
+  };
+
+This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2
+to the 'Vcc' supply for Consumer A.
+
+Constraints can now be registered by defining a struct regulator_init_data
+for each regulator power domain. This structure also maps the consumers
+to their supply regulators::
+
+  static struct regulator_init_data regulator1_data = {
+	.constraints = {
+		.name = "Regulator-1",
+		.min_uV = 3300000,
+		.max_uV = 3300000,
+		.valid_modes_mask = REGULATOR_MODE_NORMAL,
+	},
+	.num_consumer_supplies = ARRAY_SIZE(regulator1_consumers),
+	.consumer_supplies = regulator1_consumers,
+  };
+
+The name field should be set to something that is usefully descriptive
+for the board for configuration of supplies for other regulators and
+for use in logging and other diagnostic output.  Normally the name
+used for the supply rail in the schematic is a good choice.  If no
+name is provided then the subsystem will choose one.
+
+Regulator-1 supplies power to Regulator-2. This relationship must be registered
+with the core so that Regulator-1 is also enabled when Consumer A enables its
+supply (Regulator-2). The supply regulator is set by the supply_regulator
+field below and co::
+
+  static struct regulator_init_data regulator2_data = {
+	.supply_regulator = "Regulator-1",
+	.constraints = {
+		.min_uV = 1800000,
+		.max_uV = 2000000,
+		.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
+		.valid_modes_mask = REGULATOR_MODE_NORMAL,
+	},
+	.num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
+	.consumer_supplies = regulator2_consumers,
+  };
+
+Finally the regulator devices must be registered in the usual manner::
+
+  static struct platform_device regulator_devices[] = {
+	{
+		.name = "regulator",
+		.id = DCDC_1,
+		.dev = {
+			.platform_data = &regulator1_data,
+		},
+	},
+	{
+		.name = "regulator",
+		.id = DCDC_2,
+		.dev = {
+			.platform_data = &regulator2_data,
+		},
+	},
+  };
+  /* register regulator 1 device */
+  platform_device_register(&regulator_devices[0]);
+
+  /* register regulator 2 device */
+  platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
deleted file mode 100644
index eff4dcaaa252..000000000000
--- a/Documentation/power/regulator/machine.txt
+++ /dev/null
@@ -1,96 +0,0 @@
-Regulator Machine Driver Interface
-===================================
-
-The regulator machine driver interface is intended for board/machine specific
-initialisation code to configure the regulator subsystem.
-
-Consider the following machine :-
-
-  Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V]
-               |
-               +-> [Consumer B @ 3.3V]
-
-The drivers for consumers A & B must be mapped to the correct regulator in
-order to control their power supplies. This mapping can be achieved in machine
-initialisation code by creating a struct regulator_consumer_supply for
-each regulator.
-
-struct regulator_consumer_supply {
-	const char *dev_name;	/* consumer dev_name() */
-	const char *supply;	/* consumer supply - e.g. "vcc" */
-};
-
-e.g. for the machine above
-
-static struct regulator_consumer_supply regulator1_consumers[] = {
-	REGULATOR_SUPPLY("Vcc", "consumer B"),
-};
-
-static struct regulator_consumer_supply regulator2_consumers[] = {
-	REGULATOR_SUPPLY("Vcc", "consumer A"),
-};
-
-This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2
-to the 'Vcc' supply for Consumer A.
-
-Constraints can now be registered by defining a struct regulator_init_data
-for each regulator power domain. This structure also maps the consumers
-to their supply regulators :-
-
-static struct regulator_init_data regulator1_data = {
-	.constraints = {
-		.name = "Regulator-1",
-		.min_uV = 3300000,
-		.max_uV = 3300000,
-		.valid_modes_mask = REGULATOR_MODE_NORMAL,
-	},
-	.num_consumer_supplies = ARRAY_SIZE(regulator1_consumers),
-	.consumer_supplies = regulator1_consumers,
-};
-
-The name field should be set to something that is usefully descriptive
-for the board for configuration of supplies for other regulators and
-for use in logging and other diagnostic output.  Normally the name
-used for the supply rail in the schematic is a good choice.  If no
-name is provided then the subsystem will choose one.
-
-Regulator-1 supplies power to Regulator-2. This relationship must be registered
-with the core so that Regulator-1 is also enabled when Consumer A enables its
-supply (Regulator-2). The supply regulator is set by the supply_regulator
-field below and co:-
-
-static struct regulator_init_data regulator2_data = {
-	.supply_regulator = "Regulator-1",
-	.constraints = {
-		.min_uV = 1800000,
-		.max_uV = 2000000,
-		.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
-		.valid_modes_mask = REGULATOR_MODE_NORMAL,
-	},
-	.num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
-	.consumer_supplies = regulator2_consumers,
-};
-
-Finally the regulator devices must be registered in the usual manner.
-
-static struct platform_device regulator_devices[] = {
-	{
-		.name = "regulator",
-		.id = DCDC_1,
-		.dev = {
-			.platform_data = &regulator1_data,
-		},
-	},
-	{
-		.name = "regulator",
-		.id = DCDC_2,
-		.dev = {
-			.platform_data = &regulator2_data,
-		},
-	},
-};
-/* register regulator 1 device */
-platform_device_register(&regulator_devices[0]);
-
-/* register regulator 2 device */
-platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/overview.rst b/Documentation/power/regulator/overview.rst
new file mode 100644
index 000000000000..ee494c70a7c4
--- /dev/null
+++ b/Documentation/power/regulator/overview.rst
@@ -0,0 +1,178 @@
+=============================================
+Linux voltage and current regulator framework
+=============================================
+
+About
+=====
+
+This framework is designed to provide a standard kernel interface to control
+voltage and current regulators.
+
+The intention is to allow systems to dynamically control regulator power output
+in order to save power and prolong battery life. This applies to both voltage
+regulators (where voltage output is controllable) and current sinks (where
+current limit is controllable).
+
+(C) 2008  Wolfson Microelectronics PLC.
+
+Author: Liam Girdwood <lrg@slimlogic.co.uk>
+
+
+Nomenclature
+============
+
+Some terms used in this document:
+
+  - Regulator
+                 - Electronic device that supplies power to other devices.
+                   Most regulators can enable and disable their output while
+                   some can control their output voltage and or current.
+
+                   Input Voltage -> Regulator -> Output Voltage
+
+
+  - PMIC
+                 - Power Management IC. An IC that contains numerous
+                   regulators and often contains other subsystems.
+
+
+  - Consumer
+                 - Electronic device that is supplied power by a regulator.
+                   Consumers can be classified into two types:-
+
+                   Static: consumer does not change its supply voltage or
+                   current limit. It only needs to enable or disable its
+                   power supply. Its supply voltage is set by the hardware,
+                   bootloader, firmware or kernel board initialisation code.
+
+                   Dynamic: consumer needs to change its supply voltage or
+                   current limit to meet operation demands.
+
+
+  - Power Domain
+                 - Electronic circuit that is supplied its input power by the
+                   output power of a regulator, switch or by another power
+                   domain.
+
+                   The supply regulator may be behind a switch(s). i.e.::
+
+                     Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
+                                |             |
+                                |             +-> [Consumer B], [Consumer C]
+                                |
+                                +-> [Consumer D], [Consumer E]
+
+                   That is one regulator and three power domains:
+
+                   - Domain 1: Switch-1, Consumers D & E.
+                   - Domain 2: Switch-2, Consumers B & C.
+                   - Domain 3: Consumer A.
+
+                   and this represents a "supplies" relationship:
+
+                   Domain-1 --> Domain-2 --> Domain-3.
+
+                   A power domain may have regulators that are supplied power
+                   by other regulators. i.e.::
+
+                     Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
+                                  |
+                                  +-> [Consumer B]
+
+                   This gives us two regulators and two power domains:
+
+                   - Domain 1: Regulator-2, Consumer B.
+                   - Domain 2: Consumer A.
+
+                   and a "supplies" relationship:
+
+                   Domain-1 --> Domain-2
+
+
+  - Constraints
+                 - Constraints are used to define power levels for performance
+                   and hardware protection. Constraints exist at three levels:
+
+                   Regulator Level: This is defined by the regulator hardware
+                   operating parameters and is specified in the regulator
+                   datasheet. i.e.
+
+                     - voltage output is in the range 800mV -> 3500mV.
+                     - regulator current output limit is 20mA @ 5V but is
+                       10mA @ 10V.
+
+                   Power Domain Level: This is defined in software by kernel
+                   level board initialisation code. It is used to constrain a
+                   power domain to a particular power range. i.e.
+
+                     - Domain-1 voltage is 3300mV
+                     - Domain-2 voltage is 1400mV -> 1600mV
+                     - Domain-3 current limit is 0mA -> 20mA.
+
+                   Consumer Level: This is defined by consumer drivers
+                   dynamically setting voltage or current limit levels.
+
+                   e.g. a consumer backlight driver asks for a current increase
+                   from 5mA to 10mA to increase LCD illumination. This passes
+                   to through the levels as follows :-
+
+                   Consumer: need to increase LCD brightness. Lookup and
+                   request next current mA value in brightness table (the
+                   consumer driver could be used on several different
+                   personalities based upon the same reference device).
+
+                   Power Domain: is the new current limit within the domain
+                   operating limits for this domain and system state (e.g.
+                   battery power, USB power)
+
+                   Regulator Domains: is the new current limit within the
+                   regulator operating parameters for input/output voltage.
+
+                   If the regulator request passes all the constraint tests
+                   then the new regulator value is applied.
+
+
+Design
+======
+
+The framework is designed and targeted at SoC based devices but may also be
+relevant to non SoC devices and is split into the following four interfaces:-
+
+
+   1. Consumer driver interface.
+
+      This uses a similar API to the kernel clock interface in that consumer
+      drivers can get and put a regulator (like they can with clocks atm) and
+      get/set voltage, current limit, mode, enable and disable. This should
+      allow consumers complete control over their supply voltage and current
+      limit. This also compiles out if not in use so drivers can be reused in
+      systems with no regulator based power control.
+
+        See Documentation/power/regulator/consumer.rst
+
+   2. Regulator driver interface.
+
+      This allows regulator drivers to register their regulators and provide
+      operations to the core. It also has a notifier call chain for propagating
+      regulator events to clients.
+
+        See Documentation/power/regulator/regulator.rst
+
+   3. Machine interface.
+
+      This interface is for machine specific code and allows the creation of
+      voltage/current domains (with constraints) for each regulator. It can
+      provide regulator constraints that will prevent device damage through
+      overvoltage or overcurrent caused by buggy client drivers. It also
+      allows the creation of a regulator tree whereby some regulators are
+      supplied by others (similar to a clock tree).
+
+        See Documentation/power/regulator/machine.rst
+
+   4. Userspace ABI.
+
+      The framework also exports a lot of useful voltage/current/opmode data to
+      userspace via sysfs. This could be used to help monitor device power
+      consumption and status.
+
+        See Documentation/ABI/testing/sysfs-class-regulator
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
deleted file mode 100644
index 721b4739ec32..000000000000
--- a/Documentation/power/regulator/overview.txt
+++ /dev/null
@@ -1,171 +0,0 @@
-Linux voltage and current regulator framework
-=============================================
-
-About
-=====
-
-This framework is designed to provide a standard kernel interface to control
-voltage and current regulators.
-
-The intention is to allow systems to dynamically control regulator power output
-in order to save power and prolong battery life. This applies to both voltage
-regulators (where voltage output is controllable) and current sinks (where
-current limit is controllable).
-
-(C) 2008  Wolfson Microelectronics PLC.
-Author: Liam Girdwood <lrg@slimlogic.co.uk>
-
-
-Nomenclature
-============
-
-Some terms used in this document:-
-
-  o Regulator    - Electronic device that supplies power to other devices.
-                   Most regulators can enable and disable their output while
-                   some can control their output voltage and or current.
-
-                   Input Voltage -> Regulator -> Output Voltage
-
-
-  o PMIC         - Power Management IC. An IC that contains numerous regulators
-                   and often contains other subsystems.
-
-
-  o Consumer     - Electronic device that is supplied power by a regulator.
-                   Consumers can be classified into two types:-
-
-                   Static: consumer does not change its supply voltage or
-                   current limit. It only needs to enable or disable its
-                   power supply. Its supply voltage is set by the hardware,
-                   bootloader, firmware or kernel board initialisation code.
-
-                   Dynamic: consumer needs to change its supply voltage or
-                   current limit to meet operation demands.
-
-
-  o Power Domain - Electronic circuit that is supplied its input power by the
-                   output power of a regulator, switch or by another power
-                   domain.
-
-                   The supply regulator may be behind a switch(s). i.e.
-
-                   Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
-                              |             |
-                              |             +-> [Consumer B], [Consumer C]
-                              |
-                              +-> [Consumer D], [Consumer E]
-
-                   That is one regulator and three power domains:
-
-                   Domain 1: Switch-1, Consumers D & E.
-                   Domain 2: Switch-2, Consumers B & C.
-                   Domain 3: Consumer A.
-
-                   and this represents a "supplies" relationship:
-
-                   Domain-1 --> Domain-2 --> Domain-3.
-
-                   A power domain may have regulators that are supplied power
-                   by other regulators. i.e.
-
-                   Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
-                                |
-                                +-> [Consumer B]
-
-                   This gives us two regulators and two power domains:
-
-                   Domain 1: Regulator-2, Consumer B.
-                   Domain 2: Consumer A.
-
-                   and a "supplies" relationship:
-
-                   Domain-1 --> Domain-2
-
-
-  o Constraints  - Constraints are used to define power levels for performance
-                   and hardware protection. Constraints exist at three levels:
-
-                   Regulator Level: This is defined by the regulator hardware
-                   operating parameters and is specified in the regulator
-                   datasheet. i.e.
-
-                     - voltage output is in the range 800mV -> 3500mV.
-                     - regulator current output limit is 20mA @ 5V but is
-                       10mA @ 10V.
-
-                   Power Domain Level: This is defined in software by kernel
-                   level board initialisation code. It is used to constrain a
-                   power domain to a particular power range. i.e.
-
-                     - Domain-1 voltage is 3300mV
-                     - Domain-2 voltage is 1400mV -> 1600mV
-                     - Domain-3 current limit is 0mA -> 20mA.
-
-                   Consumer Level: This is defined by consumer drivers
-                   dynamically setting voltage or current limit levels.
-
-                   e.g. a consumer backlight driver asks for a current increase
-                   from 5mA to 10mA to increase LCD illumination. This passes
-                   to through the levels as follows :-
-
-                   Consumer: need to increase LCD brightness. Lookup and
-                   request next current mA value in brightness table (the
-                   consumer driver could be used on several different
-                   personalities based upon the same reference device).
-
-                   Power Domain: is the new current limit within the domain
-                   operating limits for this domain and system state (e.g.
-                   battery power, USB power)
-
-                   Regulator Domains: is the new current limit within the
-                   regulator operating parameters for input/output voltage.
-
-                   If the regulator request passes all the constraint tests
-                   then the new regulator value is applied.
-
-
-Design
-======
-
-The framework is designed and targeted at SoC based devices but may also be
-relevant to non SoC devices and is split into the following four interfaces:-
-
-
-   1. Consumer driver interface.
-
-      This uses a similar API to the kernel clock interface in that consumer
-      drivers can get and put a regulator (like they can with clocks atm) and
-      get/set voltage, current limit, mode, enable and disable. This should
-      allow consumers complete control over their supply voltage and current
-      limit. This also compiles out if not in use so drivers can be reused in
-      systems with no regulator based power control.
-
-        See Documentation/power/regulator/consumer.txt
-
-   2. Regulator driver interface.
-
-      This allows regulator drivers to register their regulators and provide
-      operations to the core. It also has a notifier call chain for propagating
-      regulator events to clients.
-
-        See Documentation/power/regulator/regulator.txt
-
-   3. Machine interface.
-
-      This interface is for machine specific code and allows the creation of
-      voltage/current domains (with constraints) for each regulator. It can
-      provide regulator constraints that will prevent device damage through
-      overvoltage or overcurrent caused by buggy client drivers. It also
-      allows the creation of a regulator tree whereby some regulators are
-      supplied by others (similar to a clock tree).
-
-        See Documentation/power/regulator/machine.txt
-
-   4. Userspace ABI.
-
-      The framework also exports a lot of useful voltage/current/opmode data to
-      userspace via sysfs. This could be used to help monitor device power
-      consumption and status.
-
-        See Documentation/ABI/testing/sysfs-class-regulator
diff --git a/Documentation/power/regulator/regulator.rst b/Documentation/power/regulator/regulator.rst
new file mode 100644
index 000000000000..794b3256fbb9
--- /dev/null
+++ b/Documentation/power/regulator/regulator.rst
@@ -0,0 +1,32 @@
+==========================
+Regulator Driver Interface
+==========================
+
+The regulator driver interface is relatively simple and designed to allow
+regulator drivers to register their services with the core framework.
+
+
+Registration
+============
+
+Drivers can register a regulator by calling::
+
+  struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
+					   const struct regulator_config *config);
+
+This will register the regulator's capabilities and operations to the regulator
+core.
+
+Regulators can be unregistered by calling::
+
+  void regulator_unregister(struct regulator_dev *rdev);
+
+
+Regulator Events
+================
+
+Regulators can send events (e.g. overtemperature, undervoltage, etc) to
+consumer drivers by calling::
+
+  int regulator_notifier_call_chain(struct regulator_dev *rdev,
+				    unsigned long event, void *data);
diff --git a/Documentation/power/regulator/regulator.txt b/Documentation/power/regulator/regulator.txt
deleted file mode 100644
index b17e5833ce21..000000000000
--- a/Documentation/power/regulator/regulator.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-Regulator Driver Interface
-==========================
-
-The regulator driver interface is relatively simple and designed to allow
-regulator drivers to register their services with the core framework.
-
-
-Registration
-============
-
-Drivers can register a regulator by calling :-
-
-struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
-					 const struct regulator_config *config);
-
-This will register the regulator's capabilities and operations to the regulator
-core.
-
-Regulators can be unregistered by calling :-
-
-void regulator_unregister(struct regulator_dev *rdev);
-
-
-Regulator Events
-================
-Regulators can send events (e.g. overtemperature, undervoltage, etc) to
-consumer drivers by calling :-
-
-int regulator_notifier_call_chain(struct regulator_dev *rdev,
-				  unsigned long event, void *data);
diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst
new file mode 100644
index 000000000000..2c2ec99b5088
--- /dev/null
+++ b/Documentation/power/runtime_pm.rst
@@ -0,0 +1,940 @@
+==================================================
+Runtime Power Management Framework for I/O Devices
+==================================================
+
+(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+
+(C) 2010 Alan Stern <stern@rowland.harvard.edu>
+
+(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+1. Introduction
+===============
+
+Support for runtime power management (runtime PM) of I/O devices is provided
+at the power management core (PM core) level by means of:
+
+* The power management workqueue pm_wq in which bus types and device drivers can
+  put their PM-related work items.  It is strongly recommended that pm_wq be
+  used for queuing all work items related to runtime PM, because this allows
+  them to be synchronized with system-wide power transitions (suspend to RAM,
+  hibernation and resume from system sleep states).  pm_wq is declared in
+  include/linux/pm_runtime.h and defined in kernel/power/main.c.
+
+* A number of runtime PM fields in the 'power' member of 'struct device' (which
+  is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can
+  be used for synchronizing runtime PM operations with one another.
+
+* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in
+  include/linux/pm.h).
+
+* A set of helper functions defined in drivers/base/power/runtime.c that can be
+  used for carrying out runtime PM operations in such a way that the
+  synchronization between them is taken care of by the PM core.  Bus types and
+  device drivers are encouraged to use these functions.
+
+The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM
+fields of 'struct dev_pm_info' and the core helper functions provided for
+runtime PM are described below.
+
+2. Device Runtime PM Callbacks
+==============================
+
+There are three device runtime PM callbacks defined in 'struct dev_pm_ops'::
+
+  struct dev_pm_ops {
+	...
+	int (*runtime_suspend)(struct device *dev);
+	int (*runtime_resume)(struct device *dev);
+	int (*runtime_idle)(struct device *dev);
+	...
+  };
+
+The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks
+are executed by the PM core for the device's subsystem that may be either of
+the following:
+
+  1. PM domain of the device, if the device's PM domain object, dev->pm_domain,
+     is present.
+
+  2. Device type of the device, if both dev->type and dev->type->pm are present.
+
+  3. Device class of the device, if both dev->class and dev->class->pm are
+     present.
+
+  4. Bus type of the device, if both dev->bus and dev->bus->pm are present.
+
+If the subsystem chosen by applying the above rules doesn't provide the relevant
+callback, the PM core will invoke the corresponding driver callback stored in
+dev->driver->pm directly (if present).
+
+The PM core always checks which callback to use in the order given above, so the
+priority order of callbacks from high to low is: PM domain, device type, class
+and bus type.  Moreover, the high-priority one will always take precedence over
+a low-priority one.  The PM domain, bus type, device type and class callbacks
+are referred to as subsystem-level callbacks in what follows.
+
+By default, the callbacks are always invoked in process context with interrupts
+enabled.  However, the pm_runtime_irq_safe() helper function can be used to tell
+the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume()
+and ->runtime_idle() callbacks for the given device in atomic context with
+interrupts disabled.  This implies that the callback routines in question must
+not block or sleep, but it also means that the synchronous helper functions
+listed at the end of Section 4 may be used for that device within an interrupt
+handler or generally in an atomic context.
+
+The subsystem-level suspend callback, if present, is _entirely_ _responsible_
+for handling the suspend of the device as appropriate, which may, but need not
+include executing the device driver's own ->runtime_suspend() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_suspend()
+callback in a device driver as long as the subsystem-level suspend callback
+knows what to do to handle the device).
+
+  * Once the subsystem-level suspend callback (or the driver suspend callback,
+    if invoked directly) has completed successfully for the given device, the PM
+    core regards the device as suspended, which need not mean that it has been
+    put into a low power state.  It is supposed to mean, however, that the
+    device will not process data and will not communicate with the CPU(s) and
+    RAM until the appropriate resume callback is executed for it.  The runtime
+    PM status of a device after successful execution of the suspend callback is
+    'suspended'.
+
+  * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM
+    status remains 'active', which means that the device _must_ be fully
+    operational afterwards.
+
+  * If the suspend callback returns an error code different from -EBUSY and
+    -EAGAIN, the PM core regards this as a fatal error and will refuse to run
+    the helper functions described in Section 4 for the device until its status
+    is directly set to  either 'active', or 'suspended' (the PM core provides
+    special helper functions for this purpose).
+
+In particular, if the driver requires remote wakeup capability (i.e. hardware
+mechanism allowing the device to request a change of its power state, such as
+PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the
+device, then ->runtime_suspend() should return -EBUSY.  On the other hand, if
+device_can_wakeup() returns 'true' for the device and the device is put into a
+low-power state during the execution of the suspend callback, it is expected
+that remote wakeup will be enabled for the device.  Generally, remote wakeup
+should be enabled for all input devices put into low-power states at run time.
+
+The subsystem-level resume callback, if present, is **entirely responsible** for
+handling the resume of the device as appropriate, which may, but need not
+include executing the device driver's own ->runtime_resume() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_resume()
+callback in a device driver as long as the subsystem-level resume callback knows
+what to do to handle the device).
+
+  * Once the subsystem-level resume callback (or the driver resume callback, if
+    invoked directly) has completed successfully, the PM core regards the device
+    as fully operational, which means that the device _must_ be able to complete
+    I/O operations as needed.  The runtime PM status of the device is then
+    'active'.
+
+  * If the resume callback returns an error code, the PM core regards this as a
+    fatal error and will refuse to run the helper functions described in Section
+    4 for the device, until its status is directly set to either 'active', or
+    'suspended' (by means of special helper functions provided by the PM core
+    for this purpose).
+
+The idle callback (a subsystem-level one, if present, or the driver one) is
+executed by the PM core whenever the device appears to be idle, which is
+indicated to the PM core by two counters, the device's usage counter and the
+counter of 'active' children of the device.
+
+  * If any of these counters is decreased using a helper function provided by
+    the PM core and it turns out to be equal to zero, the other counter is
+    checked.  If that counter also is equal to zero, the PM core executes the
+    idle callback with the device as its argument.
+
+The action performed by the idle callback is totally dependent on the subsystem
+(or driver) in question, but the expected and recommended action is to check
+if the device can be suspended (i.e. if all of the conditions necessary for
+suspending the device are satisfied) and to queue up a suspend request for the
+device in that case.  If there is no idle callback, or if the callback returns
+0, then the PM core will attempt to carry out a runtime suspend of the device,
+also respecting devices configured for autosuspend.  In essence this means a
+call to pm_runtime_autosuspend() (do note that drivers needs to update the
+device last busy mark, pm_runtime_mark_last_busy(), to control the delay under
+this circumstance).  To prevent this (for example, if the callback routine has
+started a delayed suspend), the routine must return a non-zero value.  Negative
+error return codes are ignored by the PM core.
+
+The helper functions provided by the PM core, described in Section 4, guarantee
+that the following constraints are met with respect to runtime PM callbacks for
+one device:
+
+(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
+    ->runtime_suspend() in parallel with ->runtime_resume() or with another
+    instance of ->runtime_suspend() for the same device) with the exception that
+    ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
+    ->runtime_idle() (although ->runtime_idle() will not be started while any
+    of the other callbacks is being executed for the same device).
+
+(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active'
+    devices (i.e. the PM core will only execute ->runtime_idle() or
+    ->runtime_suspend() for the devices the runtime PM status of which is
+    'active').
+
+(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device
+    the usage counter of which is equal to zero _and_ either the counter of
+    'active' children of which is equal to zero, or the 'power.ignore_children'
+    flag of which is set.
+
+(4) ->runtime_resume() can only be executed for 'suspended' devices  (i.e. the
+    PM core will only execute ->runtime_resume() for the devices the runtime
+    PM status of which is 'suspended').
+
+Additionally, the helper functions provided by the PM core obey the following
+rules:
+
+  * If ->runtime_suspend() is about to be executed or there's a pending request
+    to execute it, ->runtime_idle() will not be executed for the same device.
+
+  * A request to execute or to schedule the execution of ->runtime_suspend()
+    will cancel any pending requests to execute ->runtime_idle() for the same
+    device.
+
+  * If ->runtime_resume() is about to be executed or there's a pending request
+    to execute it, the other callbacks will not be executed for the same device.
+
+  * A request to execute ->runtime_resume() will cancel any pending or
+    scheduled requests to execute the other callbacks for the same device,
+    except for scheduled autosuspends.
+
+3. Runtime PM Device Fields
+===========================
+
+The following device runtime PM fields are present in 'struct dev_pm_info', as
+defined in include/linux/pm.h:
+
+  `struct timer_list suspend_timer;`
+    - timer used for scheduling (delayed) suspend and autosuspend requests
+
+  `unsigned long timer_expires;`
+    - timer expiration time, in jiffies (if this is different from zero, the
+      timer is running and will expire at that time, otherwise the timer is not
+      running)
+
+  `struct work_struct work;`
+    - work structure used for queuing up requests (i.e. work items in pm_wq)
+
+  `wait_queue_head_t wait_queue;`
+    - wait queue used if any of the helper functions needs to wait for another
+      one to complete
+
+  `spinlock_t lock;`
+    - lock used for synchronization
+
+  `atomic_t usage_count;`
+    - the usage counter of the device
+
+  `atomic_t child_count;`
+    - the count of 'active' children of the device
+
+  `unsigned int ignore_children;`
+    - if set, the value of child_count is ignored (but still updated)
+
+  `unsigned int disable_depth;`
+    - used for disabling the helper functions (they work normally if this is
+      equal to zero); the initial value of it is 1 (i.e. runtime PM is
+      initially disabled for all devices)
+
+  `int runtime_error;`
+    - if set, there was a fatal error (one of the callbacks returned error code
+      as described in Section 2), so the helper functions will not work until
+      this flag is cleared; this is the error code returned by the failing
+      callback
+
+  `unsigned int idle_notification;`
+    - if set, ->runtime_idle() is being executed
+
+  `unsigned int request_pending;`
+    - if set, there's a pending request (i.e. a work item queued up into pm_wq)
+
+  `enum rpm_request request;`
+    - type of request that's pending (valid if request_pending is set)
+
+  `unsigned int deferred_resume;`
+    - set if ->runtime_resume() is about to be run while ->runtime_suspend() is
+      being executed for that device and it is not practical to wait for the
+      suspend to complete; means "start a resume as soon as you've suspended"
+
+  `enum rpm_status runtime_status;`
+    - the runtime PM status of the device; this field's initial value is
+      RPM_SUSPENDED, which means that each device is initially regarded by the
+      PM core as 'suspended', regardless of its real hardware status
+
+  `unsigned int runtime_auto;`
+    - if set, indicates that the user space has allowed the device driver to
+      power manage the device at run time via the /sys/devices/.../power/control
+      `interface;` it may only be modified with the help of the pm_runtime_allow()
+      and pm_runtime_forbid() helper functions
+
+  `unsigned int no_callbacks;`
+    - indicates that the device does not use the runtime PM callbacks (see
+      Section 8); it may be modified only by the pm_runtime_no_callbacks()
+      helper function
+
+  `unsigned int irq_safe;`
+    - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks
+      will be invoked with the spinlock held and interrupts disabled
+
+  `unsigned int use_autosuspend;`
+    - indicates that the device's driver supports delayed autosuspend (see
+      Section 9); it may be modified only by the
+      pm_runtime{_dont}_use_autosuspend() helper functions
+
+  `unsigned int timer_autosuspends;`
+    - indicates that the PM core should attempt to carry out an autosuspend
+      when the timer expires rather than a normal suspend
+
+  `int autosuspend_delay;`
+    - the delay time (in milliseconds) to be used for autosuspend
+
+  `unsigned long last_busy;`
+    - the time (in jiffies) when the pm_runtime_mark_last_busy() helper
+      function was last called for this device; used in calculating inactivity
+      periods for autosuspend
+
+All of the above fields are members of the 'power' member of 'struct device'.
+
+4. Runtime PM Device Helper Functions
+=====================================
+
+The following runtime PM helper functions are defined in
+drivers/base/power/runtime.c and include/linux/pm_runtime.h:
+
+  `void pm_runtime_init(struct device *dev);`
+    - initialize the device runtime PM fields in 'struct dev_pm_info'
+
+  `void pm_runtime_remove(struct device *dev);`
+    - make sure that the runtime PM of the device will be disabled after
+      removing the device from device hierarchy
+
+  `int pm_runtime_idle(struct device *dev);`
+    - execute the subsystem-level idle callback for the device; returns an
+      error code on failure, where -EINPROGRESS means that ->runtime_idle() is
+      already being executed; if there is no callback or the callback returns 0
+      then run pm_runtime_autosuspend(dev) and return its result
+
+  `int pm_runtime_suspend(struct device *dev);`
+    - execute the subsystem-level suspend callback for the device; returns 0 on
+      success, 1 if the device's runtime PM status was already 'suspended', or
+      error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
+      to suspend the device again in future and -EACCES means that
+      'power.disable_depth' is different from 0
+
+  `int pm_runtime_autosuspend(struct device *dev);`
+    - same as pm_runtime_suspend() except that the autosuspend delay is taken
+      `into account;` if pm_runtime_autosuspend_expiration() says the delay has
+      not yet expired then an autosuspend is scheduled for the appropriate time
+      and 0 is returned
+
+  `int pm_runtime_resume(struct device *dev);`
+    - execute the subsystem-level resume callback for the device; returns 0 on
+      success, 1 if the device's runtime PM status was already 'active' or
+      error code on failure, where -EAGAIN means it may be safe to attempt to
+      resume the device again in future, but 'power.runtime_error' should be
+      checked additionally, and -EACCES means that 'power.disable_depth' is
+      different from 0
+
+  `int pm_request_idle(struct device *dev);`
+    - submit a request to execute the subsystem-level idle callback for the
+      device (the request is represented by a work item in pm_wq); returns 0 on
+      success or error code if the request has not been queued up
+
+  `int pm_request_autosuspend(struct device *dev);`
+    - schedule the execution of the subsystem-level suspend callback for the
+      device when the autosuspend delay has expired; if the delay has already
+      expired then the work item is queued up immediately
+
+  `int pm_schedule_suspend(struct device *dev, unsigned int delay);`
+    - schedule the execution of the subsystem-level suspend callback for the
+      device in future, where 'delay' is the time to wait before queuing up a
+      suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work
+      item is queued up immediately); returns 0 on success, 1 if the device's PM
+      runtime status was already 'suspended', or error code if the request
+      hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
+      ->runtime_suspend() is already scheduled and not yet expired, the new
+      value of 'delay' will be used as the time to wait
+
+  `int pm_request_resume(struct device *dev);`
+    - submit a request to execute the subsystem-level resume callback for the
+      device (the request is represented by a work item in pm_wq); returns 0 on
+      success, 1 if the device's runtime PM status was already 'active', or
+      error code if the request hasn't been queued up
+
+  `void pm_runtime_get_noresume(struct device *dev);`
+    - increment the device's usage counter
+
+  `int pm_runtime_get(struct device *dev);`
+    - increment the device's usage counter, run pm_request_resume(dev) and
+      return its result
+
+  `int pm_runtime_get_sync(struct device *dev);`
+    - increment the device's usage counter, run pm_runtime_resume(dev) and
+      return its result
+
+  `int pm_runtime_get_if_in_use(struct device *dev);`
+    - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the
+      runtime PM status is RPM_ACTIVE and the runtime PM usage counter is
+      nonzero, increment the counter and return 1; otherwise return 0 without
+      changing the counter
+
+  `void pm_runtime_put_noidle(struct device *dev);`
+    - decrement the device's usage counter
+
+  `int pm_runtime_put(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_request_idle(dev) and return its result
+
+  `int pm_runtime_put_autosuspend(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_request_autosuspend(dev) and return its result
+
+  `int pm_runtime_put_sync(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_runtime_idle(dev) and return its result
+
+  `int pm_runtime_put_sync_suspend(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_runtime_suspend(dev) and return its result
+
+  `int pm_runtime_put_sync_autosuspend(struct device *dev);`
+    - decrement the device's usage counter; if the result is 0 then run
+      pm_runtime_autosuspend(dev) and return its result
+
+  `void pm_runtime_enable(struct device *dev);`
+    - decrement the device's 'power.disable_depth' field; if that field is equal
+      to zero, the runtime PM helper functions can execute subsystem-level
+      callbacks described in Section 2 for the device
+
+  `int pm_runtime_disable(struct device *dev);`
+    - increment the device's 'power.disable_depth' field (if the value of that
+      field was previously zero, this prevents subsystem-level runtime PM
+      callbacks from being run for the device), make sure that all of the
+      pending runtime PM operations on the device are either completed or
+      canceled; returns 1 if there was a resume request pending and it was
+      necessary to execute the subsystem-level resume callback for the device
+      to satisfy that request, otherwise 0 is returned
+
+  `int pm_runtime_barrier(struct device *dev);`
+    - check if there's a resume request pending for the device and resume it
+      (synchronously) in that case, cancel any other pending runtime PM requests
+      regarding it and wait for all runtime PM operations on it in progress to
+      complete; returns 1 if there was a resume request pending and it was
+      necessary to execute the subsystem-level resume callback for the device to
+      satisfy that request, otherwise 0 is returned
+
+  `void pm_suspend_ignore_children(struct device *dev, bool enable);`
+    - set/unset the power.ignore_children flag of the device
+
+  `int pm_runtime_set_active(struct device *dev);`
+    - clear the device's 'power.runtime_error' flag, set the device's runtime
+      PM status to 'active' and update its parent's counter of 'active'
+      children as appropriate (it is only valid to use this function if
+      'power.runtime_error' is set or 'power.disable_depth' is greater than
+      zero); it will fail and return error code if the device has a parent
+      which is not active and the 'power.ignore_children' flag of which is unset
+
+  `void pm_runtime_set_suspended(struct device *dev);`
+    - clear the device's 'power.runtime_error' flag, set the device's runtime
+      PM status to 'suspended' and update its parent's counter of 'active'
+      children as appropriate (it is only valid to use this function if
+      'power.runtime_error' is set or 'power.disable_depth' is greater than
+      zero)
+
+  `bool pm_runtime_active(struct device *dev);`
+    - return true if the device's runtime PM status is 'active' or its
+      'power.disable_depth' field is not equal to zero, or false otherwise
+
+  `bool pm_runtime_suspended(struct device *dev);`
+    - return true if the device's runtime PM status is 'suspended' and its
+      'power.disable_depth' field is equal to zero, or false otherwise
+
+  `bool pm_runtime_status_suspended(struct device *dev);`
+    - return true if the device's runtime PM status is 'suspended'
+
+  `void pm_runtime_allow(struct device *dev);`
+    - set the power.runtime_auto flag for the device and decrease its usage
+      counter (used by the /sys/devices/.../power/control interface to
+      effectively allow the device to be power managed at run time)
+
+  `void pm_runtime_forbid(struct device *dev);`
+    - unset the power.runtime_auto flag for the device and increase its usage
+      counter (used by the /sys/devices/.../power/control interface to
+      effectively prevent the device from being power managed at run time)
+
+  `void pm_runtime_no_callbacks(struct device *dev);`
+    - set the power.no_callbacks flag for the device and remove the runtime
+      PM attributes from /sys/devices/.../power (or prevent them from being
+      added when the device is registered)
+
+  `void pm_runtime_irq_safe(struct device *dev);`
+    - set the power.irq_safe flag for the device, causing the runtime-PM
+      callbacks to be invoked with interrupts off
+
+  `bool pm_runtime_is_irq_safe(struct device *dev);`
+    - return true if power.irq_safe flag was set for the device, causing
+      the runtime-PM callbacks to be invoked with interrupts off
+
+  `void pm_runtime_mark_last_busy(struct device *dev);`
+    - set the power.last_busy field to the current time
+
+  `void pm_runtime_use_autosuspend(struct device *dev);`
+    - set the power.use_autosuspend flag, enabling autosuspend delays; call
+      pm_runtime_get_sync if the flag was previously cleared and
+      power.autosuspend_delay is negative
+
+  `void pm_runtime_dont_use_autosuspend(struct device *dev);`
+    - clear the power.use_autosuspend flag, disabling autosuspend delays;
+      decrement the device's usage counter if the flag was previously set and
+      power.autosuspend_delay is negative; call pm_runtime_idle
+
+  `void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);`
+    - set the power.autosuspend_delay value to 'delay' (expressed in
+      milliseconds); if 'delay' is negative then runtime suspends are
+      prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be
+      called or the device's usage counter may be decremented and
+      pm_runtime_idle called depending on if power.autosuspend_delay is
+      changed to or from a negative value; if power.use_autosuspend is clear,
+      pm_runtime_idle is called
+
+  `unsigned long pm_runtime_autosuspend_expiration(struct device *dev);`
+    - calculate the time when the current autosuspend delay period will expire,
+      based on power.last_busy and power.autosuspend_delay; if the delay time
+      is 1000 ms or larger then the expiration time is rounded up to the
+      nearest second; returns 0 if the delay period has already expired or
+      power.use_autosuspend isn't set, otherwise returns the expiration time
+      in jiffies
+
+It is safe to execute the following helper functions from interrupt context:
+
+- pm_request_idle()
+- pm_request_autosuspend()
+- pm_schedule_suspend()
+- pm_request_resume()
+- pm_runtime_get_noresume()
+- pm_runtime_get()
+- pm_runtime_put_noidle()
+- pm_runtime_put()
+- pm_runtime_put_autosuspend()
+- pm_runtime_enable()
+- pm_suspend_ignore_children()
+- pm_runtime_set_active()
+- pm_runtime_set_suspended()
+- pm_runtime_suspended()
+- pm_runtime_mark_last_busy()
+- pm_runtime_autosuspend_expiration()
+
+If pm_runtime_irq_safe() has been called for a device then the following helper
+functions may also be used in interrupt context:
+
+- pm_runtime_idle()
+- pm_runtime_suspend()
+- pm_runtime_autosuspend()
+- pm_runtime_resume()
+- pm_runtime_get_sync()
+- pm_runtime_put_sync()
+- pm_runtime_put_sync_suspend()
+- pm_runtime_put_sync_autosuspend()
+
+5. Runtime PM Initialization, Device Probing and Removal
+========================================================
+
+Initially, the runtime PM is disabled for all devices, which means that the
+majority of the runtime PM helper functions described in Section 4 will return
+-EAGAIN until pm_runtime_enable() is called for the device.
+
+In addition to that, the initial runtime PM status of all devices is
+'suspended', but it need not reflect the actual physical state of the device.
+Thus, if the device is initially active (i.e. it is able to process I/O), its
+runtime PM status must be changed to 'active', with the help of
+pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
+
+However, if the device has a parent and the parent's runtime PM is enabled,
+calling pm_runtime_set_active() for the device will affect the parent, unless
+the parent's 'power.ignore_children' flag is set.  Namely, in that case the
+parent won't be able to suspend at run time, using the PM core's helper
+functions, as long as the child's status is 'active', even if the child's
+runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
+the child yet or pm_runtime_disable() has been called for it).  For this reason,
+once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
+should be called for it too as soon as reasonably possible or its runtime PM
+status should be changed back to 'suspended' with the help of
+pm_runtime_set_suspended().
+
+If the default initial runtime PM status of the device (i.e. 'suspended')
+reflects the actual state of the device, its bus type's or its driver's
+->probe() callback will likely need to wake it up using one of the PM core's
+helper functions described in Section 4.  In that case, pm_runtime_resume()
+should be used.  Of course, for this purpose the device's runtime PM has to be
+enabled earlier by calling pm_runtime_enable().
+
+Note, if the device may execute pm_runtime calls during the probe (such as
+if it is registers with a subsystem that may call back in) then the
+pm_runtime_get_sync() call paired with a pm_runtime_put() call will be
+appropriate to ensure that the device is not put back to sleep during the
+probe. This can happen with systems such as the network device layer.
+
+It may be desirable to suspend the device once ->probe() has finished.
+Therefore the driver core uses the asynchronous pm_request_idle() to submit a
+request to execute the subsystem-level idle callback for the device at that
+time.  A driver that makes use of the runtime autosuspend feature, may want to
+update the last busy mark before returning from ->probe().
+
+Moreover, the driver core prevents runtime PM callbacks from racing with the bus
+notifier callback in __device_release_driver(), which is necessary, because the
+notifier is used by some subsystems to carry out operations affecting the
+runtime PM functionality.  It does so by calling pm_runtime_get_sync() before
+driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications.  This
+resumes the device if it's in the suspended state and prevents it from
+being suspended again while those routines are being executed.
+
+To allow bus types and drivers to put devices into the suspended state by
+calling pm_runtime_suspend() from their ->remove() routines, the driver core
+executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER
+notifications in __device_release_driver().  This requires bus types and
+drivers to make their ->remove() callbacks avoid races with runtime PM directly,
+but also it allows of more flexibility in the handling of devices during the
+removal of their drivers.
+
+Drivers in ->remove() callback should undo the runtime PM changes done
+in ->probe(). Usually this means calling pm_runtime_disable(),
+pm_runtime_dont_use_autosuspend() etc.
+
+The user space can effectively disallow the driver of the device to power manage
+it at run time by changing the value of its /sys/devices/.../power/control
+attribute to "on", which causes pm_runtime_forbid() to be called.  In principle,
+this mechanism may also be used by the driver to effectively turn off the
+runtime power management of the device until the user space turns it on.
+Namely, during the initialization the driver can make sure that the runtime PM
+status of the device is 'active' and call pm_runtime_forbid().  It should be
+noted, however, that if the user space has already intentionally changed the
+value of /sys/devices/.../power/control to "auto" to allow the driver to power
+manage the device at run time, the driver may confuse it by using
+pm_runtime_forbid() this way.
+
+6. Runtime PM and System Sleep
+==============================
+
+Runtime PM and system sleep (i.e., system suspend and hibernation, also known
+as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of
+ways.  If a device is active when a system sleep starts, everything is
+straightforward.  But what should happen if the device is already suspended?
+
+The device may have different wake-up settings for runtime PM and system sleep.
+For example, remote wake-up may be enabled for runtime suspend but disallowed
+for system sleep (device_may_wakeup(dev) returns 'false').  When this happens,
+the subsystem-level system suspend callback is responsible for changing the
+device's wake-up setting (it may leave that to the device driver's system
+suspend routine).  It may be necessary to resume the device and suspend it again
+in order to do so.  The same is true if the driver uses different power levels
+or other settings for runtime suspend and system sleep.
+
+During system resume, the simplest approach is to bring all devices back to full
+power, even if they had been suspended before the system suspend began.  There
+are several reasons for this, including:
+
+  * The device might need to switch power levels, wake-up settings, etc.
+
+  * Remote wake-up events might have been lost by the firmware.
+
+  * The device's children may need the device to be at full power in order
+    to resume themselves.
+
+  * The driver's idea of the device state may not agree with the device's
+    physical state.  This can happen during resume from hibernation.
+
+  * The device might need to be reset.
+
+  * Even though the device was suspended, if its usage counter was > 0 then most
+    likely it would need a runtime resume in the near future anyway.
+
+If the device had been suspended before the system suspend began and it's
+brought back to full power during resume, then its runtime PM status will have
+to be updated to reflect the actual post-system sleep status.  The way to do
+this is:
+
+	 - pm_runtime_disable(dev);
+	 - pm_runtime_set_active(dev);
+	 - pm_runtime_enable(dev);
+
+The PM core always increments the runtime usage counter before calling the
+->suspend() callback and decrements it after calling the ->resume() callback.
+Hence disabling runtime PM temporarily like this will not cause any runtime
+suspend attempts to be permanently lost.  If the usage count goes to zero
+following the return of the ->resume() callback, the ->runtime_idle() callback
+will be invoked as usual.
+
+On some systems, however, system sleep is not entered through a global firmware
+or hardware operation.  Instead, all hardware components are put into low-power
+states directly by the kernel in a coordinated way.  Then, the system sleep
+state effectively follows from the states the hardware components end up in
+and the system is woken up from that state by a hardware interrupt or a similar
+mechanism entirely under the kernel's control.  As a result, the kernel never
+gives control away and the states of all devices during resume are precisely
+known to it.  If that is the case and none of the situations listed above takes
+place (in particular, if the system is not waking up from hibernation), it may
+be more efficient to leave the devices that had been suspended before the system
+suspend began in the suspended state.
+
+To this end, the PM core provides a mechanism allowing some coordination between
+different levels of device hierarchy.  Namely, if a system suspend .prepare()
+callback returns a positive number for a device, that indicates to the PM core
+that the device appears to be runtime-suspended and its state is fine, so it
+may be left in runtime suspend provided that all of its descendants are also
+left in runtime suspend.  If that happens, the PM core will not execute any
+system suspend and resume callbacks for all of those devices, except for the
+complete callback, which is then entirely responsible for handling the device
+as appropriate.  This only applies to system suspend transitions that are not
+related to hibernation (see Documentation/driver-api/pm/devices.rst for more
+information).
+
+The PM core does its best to reduce the probability of race conditions between
+the runtime PM and system suspend/resume (and hibernation) callbacks by carrying
+out the following operations:
+
+  * During system suspend pm_runtime_get_noresume() is called for every device
+    right before executing the subsystem-level .prepare() callback for it and
+    pm_runtime_barrier() is called for every device right before executing the
+    subsystem-level .suspend() callback for it.  In addition to that the PM core
+    calls  __pm_runtime_disable() with 'false' as the second argument for every
+    device right before executing the subsystem-level .suspend_late() callback
+    for it.
+
+  * During system resume pm_runtime_enable() and pm_runtime_put() are called for
+    every device right after executing the subsystem-level .resume_early()
+    callback and right after executing the subsystem-level .complete() callback
+    for it, respectively.
+
+7. Generic subsystem callbacks
+
+Subsystems may wish to conserve code space by using the set of generic power
+management callbacks provided by the PM core, defined in
+driver/base/power/generic_ops.c:
+
+  `int pm_generic_runtime_suspend(struct device *dev);`
+    - invoke the ->runtime_suspend() callback provided by the driver of this
+      device and return its result, or return 0 if not defined
+
+  `int pm_generic_runtime_resume(struct device *dev);`
+    - invoke the ->runtime_resume() callback provided by the driver of this
+      device and return its result, or return 0 if not defined
+
+  `int pm_generic_suspend(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->suspend()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_suspend_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_resume(struct device *dev);`
+    - invoke the ->resume() callback provided by the driver of this device and,
+      if successful, change the device's runtime PM status to 'active'
+
+  `int pm_generic_resume_noirq(struct device *dev);`
+    - invoke the ->resume_noirq() callback provided by the driver of this device
+
+  `int pm_generic_freeze(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->freeze()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_freeze_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_thaw(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->thaw()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_thaw_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_poweroff(struct device *dev);`
+    - if the device has not been suspended at run time, invoke the ->poweroff()
+      callback provided by its driver and return its result, or return 0 if not
+      defined
+
+  `int pm_generic_poweroff_noirq(struct device *dev);`
+    - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq()
+      callback provided by the device's driver and return its result, or return
+      0 if not defined
+
+  `int pm_generic_restore(struct device *dev);`
+    - invoke the ->restore() callback provided by the driver of this device and,
+      if successful, change the device's runtime PM status to 'active'
+
+  `int pm_generic_restore_noirq(struct device *dev);`
+    - invoke the ->restore_noirq() callback provided by the device's driver
+
+These functions are the defaults used by the PM core, if a subsystem doesn't
+provide its own callbacks for ->runtime_idle(), ->runtime_suspend(),
+->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(),
+->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(),
+->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the
+subsystem-level dev_pm_ops structure.
+
+Device drivers that wish to use the same function as a system suspend, freeze,
+poweroff and runtime suspend callback, and similarly for system resume, thaw,
+restore, and runtime resume, can achieve this with the help of the
+UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its
+last argument to NULL).
+
+8. "No-Callback" Devices
+========================
+
+Some "devices" are only logical sub-devices of their parent and cannot be
+power-managed on their own.  (The prototype example is a USB interface.  Entire
+USB devices can go into low-power mode or send wake-up requests, but neither is
+possible for individual interfaces.)  The drivers for these devices have no
+need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend()
+and ->runtime_resume() would always return 0 without doing anything else and
+->runtime_idle() would always call pm_runtime_suspend().
+
+Subsystems can tell the PM core about these devices by calling
+pm_runtime_no_callbacks().  This should be done after the device structure is
+initialized and before it is registered (although after device registration is
+also okay).  The routine will set the device's power.no_callbacks flag and
+prevent the non-debugging runtime PM sysfs attributes from being created.
+
+When power.no_callbacks is set, the PM core will not invoke the
+->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks.
+Instead it will assume that suspends and resumes always succeed and that idle
+devices should be suspended.
+
+As a consequence, the PM core will never directly inform the device's subsystem
+or driver about runtime power changes.  Instead, the driver for the device's
+parent must take responsibility for telling the device's driver when the
+parent's power state changes.
+
+9. Autosuspend, or automatically-delayed suspends
+=================================================
+
+Changing a device's power state isn't free; it requires both time and energy.
+A device should be put in a low-power state only when there's some reason to
+think it will remain in that state for a substantial time.  A common heuristic
+says that a device which hasn't been used for a while is liable to remain
+unused; following this advice, drivers should not allow devices to be suspended
+at runtime until they have been inactive for some minimum period.  Even when
+the heuristic ends up being non-optimal, it will still prevent devices from
+"bouncing" too rapidly between low-power and full-power states.
+
+The term "autosuspend" is an historical remnant.  It doesn't mean that the
+device is automatically suspended (the subsystem or driver still has to call
+the appropriate PM routines); rather it means that runtime suspends will
+automatically be delayed until the desired period of inactivity has elapsed.
+
+Inactivity is determined based on the power.last_busy field.  Drivers should
+call pm_runtime_mark_last_busy() to update this field after carrying out I/O,
+typically just before calling pm_runtime_put_autosuspend().  The desired length
+of the inactivity period is a matter of policy.  Subsystems can set this length
+initially by calling pm_runtime_set_autosuspend_delay(), but after device
+registration the length should be controlled by user space, using the
+/sys/devices/.../power/autosuspend_delay_ms attribute.
+
+In order to use autosuspend, subsystems or drivers must call
+pm_runtime_use_autosuspend() (preferably before registering the device), and
+thereafter they should use the various `*_autosuspend()` helper functions
+instead of the non-autosuspend counterparts::
+
+	Instead of: pm_runtime_suspend    use: pm_runtime_autosuspend;
+	Instead of: pm_schedule_suspend   use: pm_request_autosuspend;
+	Instead of: pm_runtime_put        use: pm_runtime_put_autosuspend;
+	Instead of: pm_runtime_put_sync   use: pm_runtime_put_sync_autosuspend.
+
+Drivers may also continue to use the non-autosuspend helper functions; they
+will behave normally, which means sometimes taking the autosuspend delay into
+account (see pm_runtime_idle).
+
+Under some circumstances a driver or subsystem may want to prevent a device
+from autosuspending immediately, even though the usage counter is zero and the
+autosuspend delay time has expired.  If the ->runtime_suspend() callback
+returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is
+in the future (as it normally would be if the callback invoked
+pm_runtime_mark_last_busy()), the PM core will automatically reschedule the
+autosuspend.  The ->runtime_suspend() callback can't do this rescheduling
+itself because no suspend requests of any kind are accepted while the device is
+suspending (i.e., while the callback is running).
+
+The implementation is well suited for asynchronous use in interrupt contexts.
+However such use inevitably involves races, because the PM core can't
+synchronize ->runtime_suspend() callbacks with the arrival of I/O requests.
+This synchronization must be handled by the driver, using its private lock.
+Here is a schematic pseudo-code example::
+
+	foo_read_or_write(struct foo_priv *foo, void *data)
+	{
+		lock(&foo->private_lock);
+		add_request_to_io_queue(foo, data);
+		if (foo->num_pending_requests++ == 0)
+			pm_runtime_get(&foo->dev);
+		if (!foo->is_suspended)
+			foo_process_next_request(foo);
+		unlock(&foo->private_lock);
+	}
+
+	foo_io_completion(struct foo_priv *foo, void *req)
+	{
+		lock(&foo->private_lock);
+		if (--foo->num_pending_requests == 0) {
+			pm_runtime_mark_last_busy(&foo->dev);
+			pm_runtime_put_autosuspend(&foo->dev);
+		} else {
+			foo_process_next_request(foo);
+		}
+		unlock(&foo->private_lock);
+		/* Send req result back to the user ... */
+	}
+
+	int foo_runtime_suspend(struct device *dev)
+	{
+		struct foo_priv foo = container_of(dev, ...);
+		int ret = 0;
+
+		lock(&foo->private_lock);
+		if (foo->num_pending_requests > 0) {
+			ret = -EBUSY;
+		} else {
+			/* ... suspend the device ... */
+			foo->is_suspended = 1;
+		}
+		unlock(&foo->private_lock);
+		return ret;
+	}
+
+	int foo_runtime_resume(struct device *dev)
+	{
+		struct foo_priv foo = container_of(dev, ...);
+
+		lock(&foo->private_lock);
+		/* ... resume the device ... */
+		foo->is_suspended = 0;
+		pm_runtime_mark_last_busy(&foo->dev);
+		if (foo->num_pending_requests > 0)
+			foo_process_next_request(foo);
+		unlock(&foo->private_lock);
+		return 0;
+	}
+
+The important point is that after foo_io_completion() asks for an autosuspend,
+the foo_runtime_suspend() callback may race with foo_read_or_write().
+Therefore foo_runtime_suspend() has to check whether there are any pending I/O
+requests (while holding the private lock) before allowing the suspend to
+proceed.
+
+In addition, the power.autosuspend_delay field can be changed by user space at
+any time.  If a driver cares about this, it can call
+pm_runtime_autosuspend_expiration() from within the ->runtime_suspend()
+callback while holding its private lock.  If the function returns a nonzero
+value then the delay has not yet expired and the callback should return
+-EAGAIN.
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
deleted file mode 100644
index 937e33c46211..000000000000
--- a/Documentation/power/runtime_pm.txt
+++ /dev/null
@@ -1,928 +0,0 @@
-Runtime Power Management Framework for I/O Devices
-
-(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
-(C) 2010 Alan Stern <stern@rowland.harvard.edu>
-(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-1. Introduction
-
-Support for runtime power management (runtime PM) of I/O devices is provided
-at the power management core (PM core) level by means of:
-
-* The power management workqueue pm_wq in which bus types and device drivers can
-  put their PM-related work items.  It is strongly recommended that pm_wq be
-  used for queuing all work items related to runtime PM, because this allows
-  them to be synchronized with system-wide power transitions (suspend to RAM,
-  hibernation and resume from system sleep states).  pm_wq is declared in
-  include/linux/pm_runtime.h and defined in kernel/power/main.c.
-
-* A number of runtime PM fields in the 'power' member of 'struct device' (which
-  is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can
-  be used for synchronizing runtime PM operations with one another.
-
-* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in
-  include/linux/pm.h).
-
-* A set of helper functions defined in drivers/base/power/runtime.c that can be
-  used for carrying out runtime PM operations in such a way that the
-  synchronization between them is taken care of by the PM core.  Bus types and
-  device drivers are encouraged to use these functions.
-
-The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM
-fields of 'struct dev_pm_info' and the core helper functions provided for
-runtime PM are described below.
-
-2. Device Runtime PM Callbacks
-
-There are three device runtime PM callbacks defined in 'struct dev_pm_ops':
-
-struct dev_pm_ops {
-	...
-	int (*runtime_suspend)(struct device *dev);
-	int (*runtime_resume)(struct device *dev);
-	int (*runtime_idle)(struct device *dev);
-	...
-};
-
-The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks
-are executed by the PM core for the device's subsystem that may be either of
-the following:
-
-  1. PM domain of the device, if the device's PM domain object, dev->pm_domain,
-     is present.
-
-  2. Device type of the device, if both dev->type and dev->type->pm are present.
-
-  3. Device class of the device, if both dev->class and dev->class->pm are
-     present.
-
-  4. Bus type of the device, if both dev->bus and dev->bus->pm are present.
-
-If the subsystem chosen by applying the above rules doesn't provide the relevant
-callback, the PM core will invoke the corresponding driver callback stored in
-dev->driver->pm directly (if present).
-
-The PM core always checks which callback to use in the order given above, so the
-priority order of callbacks from high to low is: PM domain, device type, class
-and bus type.  Moreover, the high-priority one will always take precedence over
-a low-priority one.  The PM domain, bus type, device type and class callbacks
-are referred to as subsystem-level callbacks in what follows.
-
-By default, the callbacks are always invoked in process context with interrupts
-enabled.  However, the pm_runtime_irq_safe() helper function can be used to tell
-the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume()
-and ->runtime_idle() callbacks for the given device in atomic context with
-interrupts disabled.  This implies that the callback routines in question must
-not block or sleep, but it also means that the synchronous helper functions
-listed at the end of Section 4 may be used for that device within an interrupt
-handler or generally in an atomic context.
-
-The subsystem-level suspend callback, if present, is _entirely_ _responsible_
-for handling the suspend of the device as appropriate, which may, but need not
-include executing the device driver's own ->runtime_suspend() callback (from the
-PM core's point of view it is not necessary to implement a ->runtime_suspend()
-callback in a device driver as long as the subsystem-level suspend callback
-knows what to do to handle the device).
-
-  * Once the subsystem-level suspend callback (or the driver suspend callback,
-    if invoked directly) has completed successfully for the given device, the PM
-    core regards the device as suspended, which need not mean that it has been
-    put into a low power state.  It is supposed to mean, however, that the
-    device will not process data and will not communicate with the CPU(s) and
-    RAM until the appropriate resume callback is executed for it.  The runtime
-    PM status of a device after successful execution of the suspend callback is
-    'suspended'.
-
-  * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM
-    status remains 'active', which means that the device _must_ be fully
-    operational afterwards.
-
-  * If the suspend callback returns an error code different from -EBUSY and
-    -EAGAIN, the PM core regards this as a fatal error and will refuse to run
-    the helper functions described in Section 4 for the device until its status
-    is directly set to  either 'active', or 'suspended' (the PM core provides
-    special helper functions for this purpose).
-
-In particular, if the driver requires remote wakeup capability (i.e. hardware
-mechanism allowing the device to request a change of its power state, such as
-PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the
-device, then ->runtime_suspend() should return -EBUSY.  On the other hand, if
-device_can_wakeup() returns 'true' for the device and the device is put into a
-low-power state during the execution of the suspend callback, it is expected
-that remote wakeup will be enabled for the device.  Generally, remote wakeup
-should be enabled for all input devices put into low-power states at run time.
-
-The subsystem-level resume callback, if present, is _entirely_ _responsible_ for
-handling the resume of the device as appropriate, which may, but need not
-include executing the device driver's own ->runtime_resume() callback (from the
-PM core's point of view it is not necessary to implement a ->runtime_resume()
-callback in a device driver as long as the subsystem-level resume callback knows
-what to do to handle the device).
-
-  * Once the subsystem-level resume callback (or the driver resume callback, if
-    invoked directly) has completed successfully, the PM core regards the device
-    as fully operational, which means that the device _must_ be able to complete
-    I/O operations as needed.  The runtime PM status of the device is then
-    'active'.
-
-  * If the resume callback returns an error code, the PM core regards this as a
-    fatal error and will refuse to run the helper functions described in Section
-    4 for the device, until its status is directly set to either 'active', or
-    'suspended' (by means of special helper functions provided by the PM core
-    for this purpose).
-
-The idle callback (a subsystem-level one, if present, or the driver one) is
-executed by the PM core whenever the device appears to be idle, which is
-indicated to the PM core by two counters, the device's usage counter and the
-counter of 'active' children of the device.
-
-  * If any of these counters is decreased using a helper function provided by
-    the PM core and it turns out to be equal to zero, the other counter is
-    checked.  If that counter also is equal to zero, the PM core executes the
-    idle callback with the device as its argument.
-
-The action performed by the idle callback is totally dependent on the subsystem
-(or driver) in question, but the expected and recommended action is to check
-if the device can be suspended (i.e. if all of the conditions necessary for
-suspending the device are satisfied) and to queue up a suspend request for the
-device in that case.  If there is no idle callback, or if the callback returns
-0, then the PM core will attempt to carry out a runtime suspend of the device,
-also respecting devices configured for autosuspend.  In essence this means a
-call to pm_runtime_autosuspend() (do note that drivers needs to update the
-device last busy mark, pm_runtime_mark_last_busy(), to control the delay under
-this circumstance).  To prevent this (for example, if the callback routine has
-started a delayed suspend), the routine must return a non-zero value.  Negative
-error return codes are ignored by the PM core.
-
-The helper functions provided by the PM core, described in Section 4, guarantee
-that the following constraints are met with respect to runtime PM callbacks for
-one device:
-
-(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
-    ->runtime_suspend() in parallel with ->runtime_resume() or with another
-    instance of ->runtime_suspend() for the same device) with the exception that
-    ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
-    ->runtime_idle() (although ->runtime_idle() will not be started while any
-    of the other callbacks is being executed for the same device).
-
-(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active'
-    devices (i.e. the PM core will only execute ->runtime_idle() or
-    ->runtime_suspend() for the devices the runtime PM status of which is
-    'active').
-
-(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device
-    the usage counter of which is equal to zero _and_ either the counter of
-    'active' children of which is equal to zero, or the 'power.ignore_children'
-    flag of which is set.
-
-(4) ->runtime_resume() can only be executed for 'suspended' devices  (i.e. the
-    PM core will only execute ->runtime_resume() for the devices the runtime
-    PM status of which is 'suspended').
-
-Additionally, the helper functions provided by the PM core obey the following
-rules:
-
-  * If ->runtime_suspend() is about to be executed or there's a pending request
-    to execute it, ->runtime_idle() will not be executed for the same device.
-
-  * A request to execute or to schedule the execution of ->runtime_suspend()
-    will cancel any pending requests to execute ->runtime_idle() for the same
-    device.
-
-  * If ->runtime_resume() is about to be executed or there's a pending request
-    to execute it, the other callbacks will not be executed for the same device.
-
-  * A request to execute ->runtime_resume() will cancel any pending or
-    scheduled requests to execute the other callbacks for the same device,
-    except for scheduled autosuspends.
-
-3. Runtime PM Device Fields
-
-The following device runtime PM fields are present in 'struct dev_pm_info', as
-defined in include/linux/pm.h:
-
-  struct timer_list suspend_timer;
-    - timer used for scheduling (delayed) suspend and autosuspend requests
-
-  unsigned long timer_expires;
-    - timer expiration time, in jiffies (if this is different from zero, the
-      timer is running and will expire at that time, otherwise the timer is not
-      running)
-
-  struct work_struct work;
-    - work structure used for queuing up requests (i.e. work items in pm_wq)
-
-  wait_queue_head_t wait_queue;
-    - wait queue used if any of the helper functions needs to wait for another
-      one to complete
-
-  spinlock_t lock;
-    - lock used for synchronization
-
-  atomic_t usage_count;
-    - the usage counter of the device
-
-  atomic_t child_count;
-    - the count of 'active' children of the device
-
-  unsigned int ignore_children;
-    - if set, the value of child_count is ignored (but still updated)
-
-  unsigned int disable_depth;
-    - used for disabling the helper functions (they work normally if this is
-      equal to zero); the initial value of it is 1 (i.e. runtime PM is
-      initially disabled for all devices)
-
-  int runtime_error;
-    - if set, there was a fatal error (one of the callbacks returned error code
-      as described in Section 2), so the helper functions will not work until
-      this flag is cleared; this is the error code returned by the failing
-      callback
-
-  unsigned int idle_notification;
-    - if set, ->runtime_idle() is being executed
-
-  unsigned int request_pending;
-    - if set, there's a pending request (i.e. a work item queued up into pm_wq)
-
-  enum rpm_request request;
-    - type of request that's pending (valid if request_pending is set)
-
-  unsigned int deferred_resume;
-    - set if ->runtime_resume() is about to be run while ->runtime_suspend() is
-      being executed for that device and it is not practical to wait for the
-      suspend to complete; means "start a resume as soon as you've suspended"
-
-  enum rpm_status runtime_status;
-    - the runtime PM status of the device; this field's initial value is
-      RPM_SUSPENDED, which means that each device is initially regarded by the
-      PM core as 'suspended', regardless of its real hardware status
-
-  unsigned int runtime_auto;
-    - if set, indicates that the user space has allowed the device driver to
-      power manage the device at run time via the /sys/devices/.../power/control
-      interface; it may only be modified with the help of the pm_runtime_allow()
-      and pm_runtime_forbid() helper functions
-
-  unsigned int no_callbacks;
-    - indicates that the device does not use the runtime PM callbacks (see
-      Section 8); it may be modified only by the pm_runtime_no_callbacks()
-      helper function
-
-  unsigned int irq_safe;
-    - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks
-      will be invoked with the spinlock held and interrupts disabled
-
-  unsigned int use_autosuspend;
-    - indicates that the device's driver supports delayed autosuspend (see
-      Section 9); it may be modified only by the
-      pm_runtime{_dont}_use_autosuspend() helper functions
-
-  unsigned int timer_autosuspends;
-    - indicates that the PM core should attempt to carry out an autosuspend
-      when the timer expires rather than a normal suspend
-
-  int autosuspend_delay;
-    - the delay time (in milliseconds) to be used for autosuspend
-
-  unsigned long last_busy;
-    - the time (in jiffies) when the pm_runtime_mark_last_busy() helper
-      function was last called for this device; used in calculating inactivity
-      periods for autosuspend
-
-All of the above fields are members of the 'power' member of 'struct device'.
-
-4. Runtime PM Device Helper Functions
-
-The following runtime PM helper functions are defined in
-drivers/base/power/runtime.c and include/linux/pm_runtime.h:
-
-  void pm_runtime_init(struct device *dev);
-    - initialize the device runtime PM fields in 'struct dev_pm_info'
-
-  void pm_runtime_remove(struct device *dev);
-    - make sure that the runtime PM of the device will be disabled after
-      removing the device from device hierarchy
-
-  int pm_runtime_idle(struct device *dev);
-    - execute the subsystem-level idle callback for the device; returns an
-      error code on failure, where -EINPROGRESS means that ->runtime_idle() is
-      already being executed; if there is no callback or the callback returns 0
-      then run pm_runtime_autosuspend(dev) and return its result
-
-  int pm_runtime_suspend(struct device *dev);
-    - execute the subsystem-level suspend callback for the device; returns 0 on
-      success, 1 if the device's runtime PM status was already 'suspended', or
-      error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
-      to suspend the device again in future and -EACCES means that
-      'power.disable_depth' is different from 0
-
-  int pm_runtime_autosuspend(struct device *dev);
-    - same as pm_runtime_suspend() except that the autosuspend delay is taken
-      into account; if pm_runtime_autosuspend_expiration() says the delay has
-      not yet expired then an autosuspend is scheduled for the appropriate time
-      and 0 is returned
-
-  int pm_runtime_resume(struct device *dev);
-    - execute the subsystem-level resume callback for the device; returns 0 on
-      success, 1 if the device's runtime PM status was already 'active' or
-      error code on failure, where -EAGAIN means it may be safe to attempt to
-      resume the device again in future, but 'power.runtime_error' should be
-      checked additionally, and -EACCES means that 'power.disable_depth' is
-      different from 0
-
-  int pm_request_idle(struct device *dev);
-    - submit a request to execute the subsystem-level idle callback for the
-      device (the request is represented by a work item in pm_wq); returns 0 on
-      success or error code if the request has not been queued up
-
-  int pm_request_autosuspend(struct device *dev);
-    - schedule the execution of the subsystem-level suspend callback for the
-      device when the autosuspend delay has expired; if the delay has already
-      expired then the work item is queued up immediately
-
-  int pm_schedule_suspend(struct device *dev, unsigned int delay);
-    - schedule the execution of the subsystem-level suspend callback for the
-      device in future, where 'delay' is the time to wait before queuing up a
-      suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work
-      item is queued up immediately); returns 0 on success, 1 if the device's PM
-      runtime status was already 'suspended', or error code if the request
-      hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
-      ->runtime_suspend() is already scheduled and not yet expired, the new
-      value of 'delay' will be used as the time to wait
-
-  int pm_request_resume(struct device *dev);
-    - submit a request to execute the subsystem-level resume callback for the
-      device (the request is represented by a work item in pm_wq); returns 0 on
-      success, 1 if the device's runtime PM status was already 'active', or
-      error code if the request hasn't been queued up
-
-  void pm_runtime_get_noresume(struct device *dev);
-    - increment the device's usage counter
-
-  int pm_runtime_get(struct device *dev);
-    - increment the device's usage counter, run pm_request_resume(dev) and
-      return its result
-
-  int pm_runtime_get_sync(struct device *dev);
-    - increment the device's usage counter, run pm_runtime_resume(dev) and
-      return its result
-
-  int pm_runtime_get_if_in_use(struct device *dev);
-    - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the
-      runtime PM status is RPM_ACTIVE and the runtime PM usage counter is
-      nonzero, increment the counter and return 1; otherwise return 0 without
-      changing the counter
-
-  void pm_runtime_put_noidle(struct device *dev);
-    - decrement the device's usage counter
-
-  int pm_runtime_put(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_request_idle(dev) and return its result
-
-  int pm_runtime_put_autosuspend(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_request_autosuspend(dev) and return its result
-
-  int pm_runtime_put_sync(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_runtime_idle(dev) and return its result
-
-  int pm_runtime_put_sync_suspend(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_runtime_suspend(dev) and return its result
-
-  int pm_runtime_put_sync_autosuspend(struct device *dev);
-    - decrement the device's usage counter; if the result is 0 then run
-      pm_runtime_autosuspend(dev) and return its result
-
-  void pm_runtime_enable(struct device *dev);
-    - decrement the device's 'power.disable_depth' field; if that field is equal
-      to zero, the runtime PM helper functions can execute subsystem-level
-      callbacks described in Section 2 for the device
-
-  int pm_runtime_disable(struct device *dev);
-    - increment the device's 'power.disable_depth' field (if the value of that
-      field was previously zero, this prevents subsystem-level runtime PM
-      callbacks from being run for the device), make sure that all of the
-      pending runtime PM operations on the device are either completed or
-      canceled; returns 1 if there was a resume request pending and it was
-      necessary to execute the subsystem-level resume callback for the device
-      to satisfy that request, otherwise 0 is returned
-
-  int pm_runtime_barrier(struct device *dev);
-    - check if there's a resume request pending for the device and resume it
-      (synchronously) in that case, cancel any other pending runtime PM requests
-      regarding it and wait for all runtime PM operations on it in progress to
-      complete; returns 1 if there was a resume request pending and it was
-      necessary to execute the subsystem-level resume callback for the device to
-      satisfy that request, otherwise 0 is returned
-
-  void pm_suspend_ignore_children(struct device *dev, bool enable);
-    - set/unset the power.ignore_children flag of the device
-
-  int pm_runtime_set_active(struct device *dev);
-    - clear the device's 'power.runtime_error' flag, set the device's runtime
-      PM status to 'active' and update its parent's counter of 'active'
-      children as appropriate (it is only valid to use this function if
-      'power.runtime_error' is set or 'power.disable_depth' is greater than
-      zero); it will fail and return error code if the device has a parent
-      which is not active and the 'power.ignore_children' flag of which is unset
-
-  void pm_runtime_set_suspended(struct device *dev);
-    - clear the device's 'power.runtime_error' flag, set the device's runtime
-      PM status to 'suspended' and update its parent's counter of 'active'
-      children as appropriate (it is only valid to use this function if
-      'power.runtime_error' is set or 'power.disable_depth' is greater than
-      zero)
-
-  bool pm_runtime_active(struct device *dev);
-    - return true if the device's runtime PM status is 'active' or its
-      'power.disable_depth' field is not equal to zero, or false otherwise
-
-  bool pm_runtime_suspended(struct device *dev);
-    - return true if the device's runtime PM status is 'suspended' and its
-      'power.disable_depth' field is equal to zero, or false otherwise
-
-  bool pm_runtime_status_suspended(struct device *dev);
-    - return true if the device's runtime PM status is 'suspended'
-
-  void pm_runtime_allow(struct device *dev);
-    - set the power.runtime_auto flag for the device and decrease its usage
-      counter (used by the /sys/devices/.../power/control interface to
-      effectively allow the device to be power managed at run time)
-
-  void pm_runtime_forbid(struct device *dev);
-    - unset the power.runtime_auto flag for the device and increase its usage
-      counter (used by the /sys/devices/.../power/control interface to
-      effectively prevent the device from being power managed at run time)
-
-  void pm_runtime_no_callbacks(struct device *dev);
-    - set the power.no_callbacks flag for the device and remove the runtime
-      PM attributes from /sys/devices/.../power (or prevent them from being
-      added when the device is registered)
-
-  void pm_runtime_irq_safe(struct device *dev);
-    - set the power.irq_safe flag for the device, causing the runtime-PM
-      callbacks to be invoked with interrupts off
-
-  bool pm_runtime_is_irq_safe(struct device *dev);
-    - return true if power.irq_safe flag was set for the device, causing
-      the runtime-PM callbacks to be invoked with interrupts off
-
-  void pm_runtime_mark_last_busy(struct device *dev);
-    - set the power.last_busy field to the current time
-
-  void pm_runtime_use_autosuspend(struct device *dev);
-    - set the power.use_autosuspend flag, enabling autosuspend delays; call
-      pm_runtime_get_sync if the flag was previously cleared and
-      power.autosuspend_delay is negative
-
-  void pm_runtime_dont_use_autosuspend(struct device *dev);
-    - clear the power.use_autosuspend flag, disabling autosuspend delays;
-      decrement the device's usage counter if the flag was previously set and
-      power.autosuspend_delay is negative; call pm_runtime_idle
-
-  void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
-    - set the power.autosuspend_delay value to 'delay' (expressed in
-      milliseconds); if 'delay' is negative then runtime suspends are
-      prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be
-      called or the device's usage counter may be decremented and
-      pm_runtime_idle called depending on if power.autosuspend_delay is
-      changed to or from a negative value; if power.use_autosuspend is clear,
-      pm_runtime_idle is called
-
-  unsigned long pm_runtime_autosuspend_expiration(struct device *dev);
-    - calculate the time when the current autosuspend delay period will expire,
-      based on power.last_busy and power.autosuspend_delay; if the delay time
-      is 1000 ms or larger then the expiration time is rounded up to the
-      nearest second; returns 0 if the delay period has already expired or
-      power.use_autosuspend isn't set, otherwise returns the expiration time
-      in jiffies
-
-It is safe to execute the following helper functions from interrupt context:
-
-pm_request_idle()
-pm_request_autosuspend()
-pm_schedule_suspend()
-pm_request_resume()
-pm_runtime_get_noresume()
-pm_runtime_get()
-pm_runtime_put_noidle()
-pm_runtime_put()
-pm_runtime_put_autosuspend()
-pm_runtime_enable()
-pm_suspend_ignore_children()
-pm_runtime_set_active()
-pm_runtime_set_suspended()
-pm_runtime_suspended()
-pm_runtime_mark_last_busy()
-pm_runtime_autosuspend_expiration()
-
-If pm_runtime_irq_safe() has been called for a device then the following helper
-functions may also be used in interrupt context:
-
-pm_runtime_idle()
-pm_runtime_suspend()
-pm_runtime_autosuspend()
-pm_runtime_resume()
-pm_runtime_get_sync()
-pm_runtime_put_sync()
-pm_runtime_put_sync_suspend()
-pm_runtime_put_sync_autosuspend()
-
-5. Runtime PM Initialization, Device Probing and Removal
-
-Initially, the runtime PM is disabled for all devices, which means that the
-majority of the runtime PM helper functions described in Section 4 will return
--EAGAIN until pm_runtime_enable() is called for the device.
-
-In addition to that, the initial runtime PM status of all devices is
-'suspended', but it need not reflect the actual physical state of the device.
-Thus, if the device is initially active (i.e. it is able to process I/O), its
-runtime PM status must be changed to 'active', with the help of
-pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
-
-However, if the device has a parent and the parent's runtime PM is enabled,
-calling pm_runtime_set_active() for the device will affect the parent, unless
-the parent's 'power.ignore_children' flag is set.  Namely, in that case the
-parent won't be able to suspend at run time, using the PM core's helper
-functions, as long as the child's status is 'active', even if the child's
-runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
-the child yet or pm_runtime_disable() has been called for it).  For this reason,
-once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
-should be called for it too as soon as reasonably possible or its runtime PM
-status should be changed back to 'suspended' with the help of
-pm_runtime_set_suspended().
-
-If the default initial runtime PM status of the device (i.e. 'suspended')
-reflects the actual state of the device, its bus type's or its driver's
-->probe() callback will likely need to wake it up using one of the PM core's
-helper functions described in Section 4.  In that case, pm_runtime_resume()
-should be used.  Of course, for this purpose the device's runtime PM has to be
-enabled earlier by calling pm_runtime_enable().
-
-Note, if the device may execute pm_runtime calls during the probe (such as
-if it is registers with a subsystem that may call back in) then the
-pm_runtime_get_sync() call paired with a pm_runtime_put() call will be
-appropriate to ensure that the device is not put back to sleep during the
-probe. This can happen with systems such as the network device layer.
-
-It may be desirable to suspend the device once ->probe() has finished.
-Therefore the driver core uses the asynchronous pm_request_idle() to submit a
-request to execute the subsystem-level idle callback for the device at that
-time.  A driver that makes use of the runtime autosuspend feature, may want to
-update the last busy mark before returning from ->probe().
-
-Moreover, the driver core prevents runtime PM callbacks from racing with the bus
-notifier callback in __device_release_driver(), which is necessary, because the
-notifier is used by some subsystems to carry out operations affecting the
-runtime PM functionality.  It does so by calling pm_runtime_get_sync() before
-driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications.  This
-resumes the device if it's in the suspended state and prevents it from
-being suspended again while those routines are being executed.
-
-To allow bus types and drivers to put devices into the suspended state by
-calling pm_runtime_suspend() from their ->remove() routines, the driver core
-executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER
-notifications in __device_release_driver().  This requires bus types and
-drivers to make their ->remove() callbacks avoid races with runtime PM directly,
-but also it allows of more flexibility in the handling of devices during the
-removal of their drivers.
-
-Drivers in ->remove() callback should undo the runtime PM changes done
-in ->probe(). Usually this means calling pm_runtime_disable(),
-pm_runtime_dont_use_autosuspend() etc.
-
-The user space can effectively disallow the driver of the device to power manage
-it at run time by changing the value of its /sys/devices/.../power/control
-attribute to "on", which causes pm_runtime_forbid() to be called.  In principle,
-this mechanism may also be used by the driver to effectively turn off the
-runtime power management of the device until the user space turns it on.
-Namely, during the initialization the driver can make sure that the runtime PM
-status of the device is 'active' and call pm_runtime_forbid().  It should be
-noted, however, that if the user space has already intentionally changed the
-value of /sys/devices/.../power/control to "auto" to allow the driver to power
-manage the device at run time, the driver may confuse it by using
-pm_runtime_forbid() this way.
-
-6. Runtime PM and System Sleep
-
-Runtime PM and system sleep (i.e., system suspend and hibernation, also known
-as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of
-ways.  If a device is active when a system sleep starts, everything is
-straightforward.  But what should happen if the device is already suspended?
-
-The device may have different wake-up settings for runtime PM and system sleep.
-For example, remote wake-up may be enabled for runtime suspend but disallowed
-for system sleep (device_may_wakeup(dev) returns 'false').  When this happens,
-the subsystem-level system suspend callback is responsible for changing the
-device's wake-up setting (it may leave that to the device driver's system
-suspend routine).  It may be necessary to resume the device and suspend it again
-in order to do so.  The same is true if the driver uses different power levels
-or other settings for runtime suspend and system sleep.
-
-During system resume, the simplest approach is to bring all devices back to full
-power, even if they had been suspended before the system suspend began.  There
-are several reasons for this, including:
-
-  * The device might need to switch power levels, wake-up settings, etc.
-
-  * Remote wake-up events might have been lost by the firmware.
-
-  * The device's children may need the device to be at full power in order
-    to resume themselves.
-
-  * The driver's idea of the device state may not agree with the device's
-    physical state.  This can happen during resume from hibernation.
-
-  * The device might need to be reset.
-
-  * Even though the device was suspended, if its usage counter was > 0 then most
-    likely it would need a runtime resume in the near future anyway.
-
-If the device had been suspended before the system suspend began and it's
-brought back to full power during resume, then its runtime PM status will have
-to be updated to reflect the actual post-system sleep status.  The way to do
-this is:
-
-	pm_runtime_disable(dev);
-	pm_runtime_set_active(dev);
-	pm_runtime_enable(dev);
-
-The PM core always increments the runtime usage counter before calling the
-->suspend() callback and decrements it after calling the ->resume() callback.
-Hence disabling runtime PM temporarily like this will not cause any runtime
-suspend attempts to be permanently lost.  If the usage count goes to zero
-following the return of the ->resume() callback, the ->runtime_idle() callback
-will be invoked as usual.
-
-On some systems, however, system sleep is not entered through a global firmware
-or hardware operation.  Instead, all hardware components are put into low-power
-states directly by the kernel in a coordinated way.  Then, the system sleep
-state effectively follows from the states the hardware components end up in
-and the system is woken up from that state by a hardware interrupt or a similar
-mechanism entirely under the kernel's control.  As a result, the kernel never
-gives control away and the states of all devices during resume are precisely
-known to it.  If that is the case and none of the situations listed above takes
-place (in particular, if the system is not waking up from hibernation), it may
-be more efficient to leave the devices that had been suspended before the system
-suspend began in the suspended state.
-
-To this end, the PM core provides a mechanism allowing some coordination between
-different levels of device hierarchy.  Namely, if a system suspend .prepare()
-callback returns a positive number for a device, that indicates to the PM core
-that the device appears to be runtime-suspended and its state is fine, so it
-may be left in runtime suspend provided that all of its descendants are also
-left in runtime suspend.  If that happens, the PM core will not execute any
-system suspend and resume callbacks for all of those devices, except for the
-complete callback, which is then entirely responsible for handling the device
-as appropriate.  This only applies to system suspend transitions that are not
-related to hibernation (see Documentation/driver-api/pm/devices.rst for more
-information).
-
-The PM core does its best to reduce the probability of race conditions between
-the runtime PM and system suspend/resume (and hibernation) callbacks by carrying
-out the following operations:
-
-  * During system suspend pm_runtime_get_noresume() is called for every device
-    right before executing the subsystem-level .prepare() callback for it and
-    pm_runtime_barrier() is called for every device right before executing the
-    subsystem-level .suspend() callback for it.  In addition to that the PM core
-    calls  __pm_runtime_disable() with 'false' as the second argument for every
-    device right before executing the subsystem-level .suspend_late() callback
-    for it.
-
-  * During system resume pm_runtime_enable() and pm_runtime_put() are called for
-    every device right after executing the subsystem-level .resume_early()
-    callback and right after executing the subsystem-level .complete() callback
-    for it, respectively.
-
-7. Generic subsystem callbacks
-
-Subsystems may wish to conserve code space by using the set of generic power
-management callbacks provided by the PM core, defined in
-driver/base/power/generic_ops.c:
-
-  int pm_generic_runtime_suspend(struct device *dev);
-    - invoke the ->runtime_suspend() callback provided by the driver of this
-      device and return its result, or return 0 if not defined
-
-  int pm_generic_runtime_resume(struct device *dev);
-    - invoke the ->runtime_resume() callback provided by the driver of this
-      device and return its result, or return 0 if not defined
-
-  int pm_generic_suspend(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->suspend()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_suspend_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_resume(struct device *dev);
-    - invoke the ->resume() callback provided by the driver of this device and,
-      if successful, change the device's runtime PM status to 'active'
-
-  int pm_generic_resume_noirq(struct device *dev);
-    - invoke the ->resume_noirq() callback provided by the driver of this device
-
-  int pm_generic_freeze(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->freeze()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_freeze_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_thaw(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->thaw()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_thaw_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_poweroff(struct device *dev);
-    - if the device has not been suspended at run time, invoke the ->poweroff()
-      callback provided by its driver and return its result, or return 0 if not
-      defined
-
-  int pm_generic_poweroff_noirq(struct device *dev);
-    - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq()
-      callback provided by the device's driver and return its result, or return
-      0 if not defined
-
-  int pm_generic_restore(struct device *dev);
-    - invoke the ->restore() callback provided by the driver of this device and,
-      if successful, change the device's runtime PM status to 'active'
-
-  int pm_generic_restore_noirq(struct device *dev);
-    - invoke the ->restore_noirq() callback provided by the device's driver
-
-These functions are the defaults used by the PM core, if a subsystem doesn't
-provide its own callbacks for ->runtime_idle(), ->runtime_suspend(),
-->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(),
-->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(),
-->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the
-subsystem-level dev_pm_ops structure.
-
-Device drivers that wish to use the same function as a system suspend, freeze,
-poweroff and runtime suspend callback, and similarly for system resume, thaw,
-restore, and runtime resume, can achieve this with the help of the
-UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its
-last argument to NULL).
-
-8. "No-Callback" Devices
-
-Some "devices" are only logical sub-devices of their parent and cannot be
-power-managed on their own.  (The prototype example is a USB interface.  Entire
-USB devices can go into low-power mode or send wake-up requests, but neither is
-possible for individual interfaces.)  The drivers for these devices have no
-need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend()
-and ->runtime_resume() would always return 0 without doing anything else and
-->runtime_idle() would always call pm_runtime_suspend().
-
-Subsystems can tell the PM core about these devices by calling
-pm_runtime_no_callbacks().  This should be done after the device structure is
-initialized and before it is registered (although after device registration is
-also okay).  The routine will set the device's power.no_callbacks flag and
-prevent the non-debugging runtime PM sysfs attributes from being created.
-
-When power.no_callbacks is set, the PM core will not invoke the
-->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks.
-Instead it will assume that suspends and resumes always succeed and that idle
-devices should be suspended.
-
-As a consequence, the PM core will never directly inform the device's subsystem
-or driver about runtime power changes.  Instead, the driver for the device's
-parent must take responsibility for telling the device's driver when the
-parent's power state changes.
-
-9. Autosuspend, or automatically-delayed suspends
-
-Changing a device's power state isn't free; it requires both time and energy.
-A device should be put in a low-power state only when there's some reason to
-think it will remain in that state for a substantial time.  A common heuristic
-says that a device which hasn't been used for a while is liable to remain
-unused; following this advice, drivers should not allow devices to be suspended
-at runtime until they have been inactive for some minimum period.  Even when
-the heuristic ends up being non-optimal, it will still prevent devices from
-"bouncing" too rapidly between low-power and full-power states.
-
-The term "autosuspend" is an historical remnant.  It doesn't mean that the
-device is automatically suspended (the subsystem or driver still has to call
-the appropriate PM routines); rather it means that runtime suspends will
-automatically be delayed until the desired period of inactivity has elapsed.
-
-Inactivity is determined based on the power.last_busy field.  Drivers should
-call pm_runtime_mark_last_busy() to update this field after carrying out I/O,
-typically just before calling pm_runtime_put_autosuspend().  The desired length
-of the inactivity period is a matter of policy.  Subsystems can set this length
-initially by calling pm_runtime_set_autosuspend_delay(), but after device
-registration the length should be controlled by user space, using the
-/sys/devices/.../power/autosuspend_delay_ms attribute.
-
-In order to use autosuspend, subsystems or drivers must call
-pm_runtime_use_autosuspend() (preferably before registering the device), and
-thereafter they should use the various *_autosuspend() helper functions instead
-of the non-autosuspend counterparts:
-
-	Instead of: pm_runtime_suspend    use: pm_runtime_autosuspend;
-	Instead of: pm_schedule_suspend   use: pm_request_autosuspend;
-	Instead of: pm_runtime_put        use: pm_runtime_put_autosuspend;
-	Instead of: pm_runtime_put_sync   use: pm_runtime_put_sync_autosuspend.
-
-Drivers may also continue to use the non-autosuspend helper functions; they
-will behave normally, which means sometimes taking the autosuspend delay into
-account (see pm_runtime_idle).
-
-Under some circumstances a driver or subsystem may want to prevent a device
-from autosuspending immediately, even though the usage counter is zero and the
-autosuspend delay time has expired.  If the ->runtime_suspend() callback
-returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is
-in the future (as it normally would be if the callback invoked
-pm_runtime_mark_last_busy()), the PM core will automatically reschedule the
-autosuspend.  The ->runtime_suspend() callback can't do this rescheduling
-itself because no suspend requests of any kind are accepted while the device is
-suspending (i.e., while the callback is running).
-
-The implementation is well suited for asynchronous use in interrupt contexts.
-However such use inevitably involves races, because the PM core can't
-synchronize ->runtime_suspend() callbacks with the arrival of I/O requests.
-This synchronization must be handled by the driver, using its private lock.
-Here is a schematic pseudo-code example:
-
-	foo_read_or_write(struct foo_priv *foo, void *data)
-	{
-		lock(&foo->private_lock);
-		add_request_to_io_queue(foo, data);
-		if (foo->num_pending_requests++ == 0)
-			pm_runtime_get(&foo->dev);
-		if (!foo->is_suspended)
-			foo_process_next_request(foo);
-		unlock(&foo->private_lock);
-	}
-
-	foo_io_completion(struct foo_priv *foo, void *req)
-	{
-		lock(&foo->private_lock);
-		if (--foo->num_pending_requests == 0) {
-			pm_runtime_mark_last_busy(&foo->dev);
-			pm_runtime_put_autosuspend(&foo->dev);
-		} else {
-			foo_process_next_request(foo);
-		}
-		unlock(&foo->private_lock);
-		/* Send req result back to the user ... */
-	}
-
-	int foo_runtime_suspend(struct device *dev)
-	{
-		struct foo_priv foo = container_of(dev, ...);
-		int ret = 0;
-
-		lock(&foo->private_lock);
-		if (foo->num_pending_requests > 0) {
-			ret = -EBUSY;
-		} else {
-			/* ... suspend the device ... */
-			foo->is_suspended = 1;
-		}
-		unlock(&foo->private_lock);
-		return ret;
-	}
-
-	int foo_runtime_resume(struct device *dev)
-	{
-		struct foo_priv foo = container_of(dev, ...);
-
-		lock(&foo->private_lock);
-		/* ... resume the device ... */
-		foo->is_suspended = 0;
-		pm_runtime_mark_last_busy(&foo->dev);
-		if (foo->num_pending_requests > 0)
-			foo_process_next_request(foo);
-		unlock(&foo->private_lock);
-		return 0;
-	}
-
-The important point is that after foo_io_completion() asks for an autosuspend,
-the foo_runtime_suspend() callback may race with foo_read_or_write().
-Therefore foo_runtime_suspend() has to check whether there are any pending I/O
-requests (while holding the private lock) before allowing the suspend to
-proceed.
-
-In addition, the power.autosuspend_delay field can be changed by user space at
-any time.  If a driver cares about this, it can call
-pm_runtime_autosuspend_expiration() from within the ->runtime_suspend()
-callback while holding its private lock.  If the function returns a nonzero
-value then the delay has not yet expired and the callback should return
--EAGAIN.
diff --git a/Documentation/power/s2ram.rst b/Documentation/power/s2ram.rst
new file mode 100644
index 000000000000..d739aa7c742c
--- /dev/null
+++ b/Documentation/power/s2ram.rst
@@ -0,0 +1,87 @@
+========================
+How to get s2ram working
+========================
+
+2006 Linus Torvalds
+2006 Pavel Machek
+
+1) Check suspend.sf.net, program s2ram there has long whitelist of
+   "known ok" machines, along with tricks to use on each one.
+
+2) If that does not help, try reading tricks.txt and
+   video.txt. Perhaps problem is as simple as broken module, and
+   simple module unload can fix it.
+
+3) You can use Linus' TRACE_RESUME infrastructure, described below.
+
+Using TRACE_RESUME
+~~~~~~~~~~~~~~~~~~
+
+I've been working at making the machines I have able to STR, and almost
+always it's a driver that is buggy. Thank God for the suspend/resume
+debugging - the thing that Chuck tried to disable. That's often the _only_
+way to debug these things, and it's actually pretty powerful (but
+time-consuming - having to insert TRACE_RESUME() markers into the device
+driver that doesn't resume and recompile and reboot).
+
+Anyway, the way to debug this for people who are interested (have a
+machine that doesn't boot) is:
+
+ - enable PM_DEBUG, and PM_TRACE
+
+ - use a script like this::
+
+	#!/bin/sh
+	sync
+	echo 1 > /sys/power/pm_trace
+	echo mem > /sys/power/state
+
+   to suspend
+
+ - if it doesn't come back up (which is usually the problem), reboot by
+   holding the power button down, and look at the dmesg output for things
+   like::
+
+	Magic number: 4:156:725
+	hash matches drivers/base/power/resume.c:28
+	hash matches device 0000:01:00.0
+
+   which means that the last trace event was just before trying to resume
+   device 0000:01:00.0. Then figure out what driver is controlling that
+   device (lspci and /sys/devices/pci* is your friend), and see if you can
+   fix it, disable it, or trace into its resume function.
+
+   If no device matches the hash (or any matches appear to be false positives),
+   the culprit may be a device from a loadable kernel module that is not loaded
+   until after the hash is checked. You can check the hash against the current
+   devices again after more modules are loaded using sysfs::
+
+	cat /sys/power/pm_trace_dev_match
+
+For example, the above happens to be the VGA device on my EVO, which I
+used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
+that "radeonfb" simply cannot resume that device - it tries to set the
+PLL's, and it just _hangs_. Using the regular VGA console and letting X
+resume it instead works fine.
+
+NOTE
+====
+pm_trace uses the system's Real Time Clock (RTC) to save the magic number.
+Reason for this is that the RTC is the only reliably available piece of
+hardware during resume operations where a value can be set that will
+survive a reboot.
+
+pm_trace is not compatible with asynchronous suspend, so it turns
+asynchronous suspend off (which may work around timing or
+ordering-sensitive bugs).
+
+Consequence is that after a resume (even if it is successful) your system
+clock will have a value corresponding to the magic number instead of the
+correct date/time! It is therefore advisable to use a program like ntp-date
+or rdate to reset the correct date/time from an external time source when
+using this trace option.
+
+As the clock keeps ticking it is also essential that the reboot is done
+quickly after the resume failure. The trace option does not use the seconds
+or the low order bits of the minutes of the RTC, but a too long delay will
+corrupt the magic value.
diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt
deleted file mode 100644
index 4685aee197fd..000000000000
--- a/Documentation/power/s2ram.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-			How to get s2ram working
-			~~~~~~~~~~~~~~~~~~~~~~~~
-			2006 Linus Torvalds
-			2006 Pavel Machek
-
-1) Check suspend.sf.net, program s2ram there has long whitelist of
-   "known ok" machines, along with tricks to use on each one.
-
-2) If that does not help, try reading tricks.txt and
-   video.txt. Perhaps problem is as simple as broken module, and
-   simple module unload can fix it.
-
-3) You can use Linus' TRACE_RESUME infrastructure, described below.
-
-		      Using TRACE_RESUME
-		      ~~~~~~~~~~~~~~~~~~
-
-I've been working at making the machines I have able to STR, and almost
-always it's a driver that is buggy. Thank God for the suspend/resume
-debugging - the thing that Chuck tried to disable. That's often the _only_
-way to debug these things, and it's actually pretty powerful (but
-time-consuming - having to insert TRACE_RESUME() markers into the device
-driver that doesn't resume and recompile and reboot).
-
-Anyway, the way to debug this for people who are interested (have a
-machine that doesn't boot) is:
-
- - enable PM_DEBUG, and PM_TRACE
-
- - use a script like this:
-
-	#!/bin/sh
-	sync
-	echo 1 > /sys/power/pm_trace
-	echo mem > /sys/power/state
-
-   to suspend
-
- - if it doesn't come back up (which is usually the problem), reboot by
-   holding the power button down, and look at the dmesg output for things
-   like
-
-	Magic number: 4:156:725
-	hash matches drivers/base/power/resume.c:28
-	hash matches device 0000:01:00.0
-
-   which means that the last trace event was just before trying to resume
-   device 0000:01:00.0. Then figure out what driver is controlling that
-   device (lspci and /sys/devices/pci* is your friend), and see if you can
-   fix it, disable it, or trace into its resume function.
-
-   If no device matches the hash (or any matches appear to be false positives),
-   the culprit may be a device from a loadable kernel module that is not loaded
-   until after the hash is checked. You can check the hash against the current
-   devices again after more modules are loaded using sysfs:
-
-	cat /sys/power/pm_trace_dev_match
-
-For example, the above happens to be the VGA device on my EVO, which I
-used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
-that "radeonfb" simply cannot resume that device - it tries to set the
-PLL's, and it just _hangs_. Using the regular VGA console and letting X
-resume it instead works fine.
-
-NOTE
-====
-pm_trace uses the system's Real Time Clock (RTC) to save the magic number.
-Reason for this is that the RTC is the only reliably available piece of
-hardware during resume operations where a value can be set that will
-survive a reboot.
-
-pm_trace is not compatible with asynchronous suspend, so it turns
-asynchronous suspend off (which may work around timing or
-ordering-sensitive bugs).
-
-Consequence is that after a resume (even if it is successful) your system
-clock will have a value corresponding to the magic number instead of the
-correct date/time! It is therefore advisable to use a program like ntp-date
-or rdate to reset the correct date/time from an external time source when
-using this trace option.
-
-As the clock keeps ticking it is also essential that the reboot is done
-quickly after the resume failure. The trace option does not use the seconds
-or the low order bits of the minutes of the RTC, but a too long delay will
-corrupt the magic value.
diff --git a/Documentation/power/suspend-and-cpuhotplug.rst b/Documentation/power/suspend-and-cpuhotplug.rst
new file mode 100644
index 000000000000..7ac8e1f549f4
--- /dev/null
+++ b/Documentation/power/suspend-and-cpuhotplug.rst
@@ -0,0 +1,286 @@
+====================================================================
+Interaction of Suspend code (S3) with the CPU hotplug infrastructure
+====================================================================
+
+(C) 2011 - 2014 Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
+
+
+I. Differences between CPU hotplug and Suspend-to-RAM
+======================================================
+
+How does the regular CPU hotplug code differ from how the Suspend-to-RAM
+infrastructure uses it internally? And where do they share common code?
+
+Well, a picture is worth a thousand words... So ASCII art follows :-)
+
+[This depicts the current design in the kernel, and focusses only on the
+interactions involving the freezer and CPU hotplug and also tries to explain
+the locking involved. It outlines the notifications involved as well.
+But please note that here, only the call paths are illustrated, with the aim
+of describing where they take different paths and where they share code.
+What happens when regular CPU hotplug and Suspend-to-RAM race with each other
+is not depicted here.]
+
+On a high level, the suspend-resume cycle goes like this::
+
+  |Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw |
+  |tasks |    |     cpus      |    |          |    |     cpus     |    |tasks|
+
+
+More details follow::
+
+                                Suspend call path
+                                -----------------
+
+                                  Write 'mem' to
+                                /sys/power/state
+                                    sysfs file
+                                        |
+                                        v
+                               Acquire system_transition_mutex lock
+                                        |
+                                        v
+                             Send PM_SUSPEND_PREPARE
+                                   notifications
+                                        |
+                                        v
+                                   Freeze tasks
+                                        |
+                                        |
+                                        v
+                              disable_nonboot_cpus()
+                                   /* start */
+                                        |
+                                        v
+                            Acquire cpu_add_remove_lock
+                                        |
+                                        v
+                             Iterate over CURRENTLY
+                                   online CPUs
+                                        |
+                                        |
+                                        |                ----------
+                                        v                          | L
+             ======>               _cpu_down()                     |
+            |              [This takes cpuhotplug.lock             |
+  Common    |               before taking down the CPU             |
+   code     |               and releases it when done]             | O
+            |            While it is at it, notifications          |
+            |            are sent when notable events occur,       |
+             ======>     by running all registered callbacks.      |
+                                        |                          | O
+                                        |                          |
+                                        |                          |
+                                        v                          |
+                            Note down these cpus in                | P
+                                frozen_cpus mask         ----------
+                                        |
+                                        v
+                           Disable regular cpu hotplug
+                        by increasing cpu_hotplug_disabled
+                                        |
+                                        v
+                            Release cpu_add_remove_lock
+                                        |
+                                        v
+                       /* disable_nonboot_cpus() complete */
+                                        |
+                                        v
+                                   Do suspend
+
+
+
+Resuming back is likewise, with the counterparts being (in the order of
+execution during resume):
+
+* enable_nonboot_cpus() which involves::
+
+   |  Acquire cpu_add_remove_lock
+   |  Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug
+   |  Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop]
+   |  Release cpu_add_remove_lock
+   v
+
+* thaw tasks
+* send PM_POST_SUSPEND notifications
+* Release system_transition_mutex lock.
+
+
+It is to be noted here that the system_transition_mutex lock is acquired at the very
+beginning, when we are just starting out to suspend, and then released only
+after the entire cycle is complete (i.e., suspend + resume).
+
+::
+
+
+
+                          Regular CPU hotplug call path
+                          -----------------------------
+
+                                Write 0 (or 1) to
+                       /sys/devices/system/cpu/cpu*/online
+                                    sysfs file
+                                        |
+                                        |
+                                        v
+                                    cpu_down()
+                                        |
+                                        v
+                           Acquire cpu_add_remove_lock
+                                        |
+                                        v
+                          If cpu_hotplug_disabled > 0
+                                return gracefully
+                                        |
+                                        |
+                                        v
+             ======>                _cpu_down()
+            |              [This takes cpuhotplug.lock
+  Common    |               before taking down the CPU
+   code     |               and releases it when done]
+            |            While it is at it, notifications
+            |           are sent when notable events occur,
+             ======>    by running all registered callbacks.
+                                        |
+                                        |
+                                        v
+                          Release cpu_add_remove_lock
+                               [That's it!, for
+                              regular CPU hotplug]
+
+
+
+So, as can be seen from the two diagrams (the parts marked as "Common code"),
+regular CPU hotplug and the suspend code path converge at the _cpu_down() and
+_cpu_up() functions. They differ in the arguments passed to these functions,
+in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen'
+argument. But during suspend, since the tasks are already frozen by the time
+the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called
+with the 'tasks_frozen' argument set to 1.
+[See below for some known issues regarding this.]
+
+
+Important files and functions/entry points:
+-------------------------------------------
+
+- kernel/power/process.c : freeze_processes(), thaw_processes()
+- kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish()
+- kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus()
+
+
+
+II. What are the issues involved in CPU hotplug?
+------------------------------------------------
+
+There are some interesting situations involving CPU hotplug and microcode
+update on the CPUs, as discussed below:
+
+[Please bear in mind that the kernel requests the microcode images from
+userspace, using the request_firmware() function defined in
+drivers/base/firmware_loader/main.c]
+
+
+a. When all the CPUs are identical:
+
+   This is the most common situation and it is quite straightforward: we want
+   to apply the same microcode revision to each of the CPUs.
+   To give an example of x86, the collect_cpu_info() function defined in
+   arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU
+   and thereby in applying the correct microcode revision to it.
+   But note that the kernel does not maintain a common microcode image for the
+   all CPUs, in order to handle case 'b' described below.
+
+
+b. When some of the CPUs are different than the rest:
+
+   In this case since we probably need to apply different microcode revisions
+   to different CPUs, the kernel maintains a copy of the correct microcode
+   image for each CPU (after appropriate CPU type/model discovery using
+   functions such as collect_cpu_info()).
+
+
+c. When a CPU is physically hot-unplugged and a new (and possibly different
+   type of) CPU is hot-plugged into the system:
+
+   In the current design of the kernel, whenever a CPU is taken offline during
+   a regular CPU hotplug operation, upon receiving the CPU_DEAD notification
+   (which is sent by the CPU hotplug code), the microcode update driver's
+   callback for that event reacts by freeing the kernel's copy of the
+   microcode image for that CPU.
+
+   Hence, when a new CPU is brought online, since the kernel finds that it
+   doesn't have the microcode image, it does the CPU type/model discovery
+   afresh and then requests the userspace for the appropriate microcode image
+   for that CPU, which is subsequently applied.
+
+   For example, in x86, the mc_cpu_callback() function (which is the microcode
+   update driver's callback registered for CPU hotplug events) calls
+   microcode_update_cpu() which would call microcode_init_cpu() in this case,
+   instead of microcode_resume_cpu() when it finds that the kernel doesn't
+   have a valid microcode image. This ensures that the CPU type/model
+   discovery is performed and the right microcode is applied to the CPU after
+   getting it from userspace.
+
+
+d. Handling microcode update during suspend/hibernate:
+
+   Strictly speaking, during a CPU hotplug operation which does not involve
+   physically removing or inserting CPUs, the CPUs are not actually powered
+   off during a CPU offline. They are just put to the lowest C-states possible.
+   Hence, in such a case, it is not really necessary to re-apply microcode
+   when the CPUs are brought back online, since they wouldn't have lost the
+   image during the CPU offline operation.
+
+   This is the usual scenario encountered during a resume after a suspend.
+   However, in the case of hibernation, since all the CPUs are completely
+   powered off, during restore it becomes necessary to apply the microcode
+   images to all the CPUs.
+
+   [Note that we don't expect someone to physically pull out nodes and insert
+   nodes with a different type of CPUs in-between a suspend-resume or a
+   hibernate/restore cycle.]
+
+   In the current design of the kernel however, during a CPU offline operation
+   as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set),
+   the existing copy of microcode image in the kernel is not freed up.
+   And during the CPU online operations (during resume/restore), since the
+   kernel finds that it already has copies of the microcode images for all the
+   CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU
+   type/model and the need for validating whether the microcode revisions are
+   right for the CPUs or not (due to the above assumption that physical CPU
+   hotplug will not be done in-between suspend/resume or hibernate/restore
+   cycles).
+
+
+III. Known problems
+===================
+
+Are there any known problems when regular CPU hotplug and suspend race
+with each other?
+
+Yes, they are listed below:
+
+1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to
+   the _cpu_down() and _cpu_up() functions is *always* 0.
+   This might not reflect the true current state of the system, since the
+   tasks could have been frozen by an out-of-band event such as a suspend
+   operation in progress. Hence, the cpuhp_tasks_frozen variable will not
+   reflect the frozen state and the CPU hotplug callbacks which evaluate
+   that variable might execute the wrong code path.
+
+2. If a regular CPU hotplug stress test happens to race with the freezer due
+   to a suspend operation in progress at the same time, then we could hit the
+   situation described below:
+
+    * A regular cpu online operation continues its journey from userspace
+      into the kernel, since the freezing has not yet begun.
+    * Then freezer gets to work and freezes userspace.
+    * If cpu online has not yet completed the microcode update stuff by now,
+      it will now start waiting on the frozen userspace in the
+      TASK_UNINTERRUPTIBLE state, in order to get the microcode image.
+    * Now the freezer continues and tries to freeze the remaining tasks. But
+      due to this wait mentioned above, the freezer won't be able to freeze
+      the cpu online hotplug task and hence freezing of tasks fails.
+
+   As a result of this task freezing failure, the suspend operation gets
+   aborted.
diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt
deleted file mode 100644
index a8751b8df10e..000000000000
--- a/Documentation/power/suspend-and-cpuhotplug.txt
+++ /dev/null
@@ -1,274 +0,0 @@
-Interaction of Suspend code (S3) with the CPU hotplug infrastructure
-
-     (C) 2011 - 2014 Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
-
-
-I. How does the regular CPU hotplug code differ from how the Suspend-to-RAM
-   infrastructure uses it internally? And where do they share common code?
-
-Well, a picture is worth a thousand words... So ASCII art follows :-)
-
-[This depicts the current design in the kernel, and focusses only on the
-interactions involving the freezer and CPU hotplug and also tries to explain
-the locking involved. It outlines the notifications involved as well.
-But please note that here, only the call paths are illustrated, with the aim
-of describing where they take different paths and where they share code.
-What happens when regular CPU hotplug and Suspend-to-RAM race with each other
-is not depicted here.]
-
-On a high level, the suspend-resume cycle goes like this:
-
-|Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw |
-|tasks |    |     cpus      |    |          |    |     cpus     |    |tasks|
-
-
-More details follow:
-
-                                Suspend call path
-                                -----------------
-
-                                  Write 'mem' to
-                                /sys/power/state
-                                    sysfs file
-                                        |
-                                        v
-                               Acquire system_transition_mutex lock
-                                        |
-                                        v
-                             Send PM_SUSPEND_PREPARE
-                                   notifications
-                                        |
-                                        v
-                                   Freeze tasks
-                                        |
-                                        |
-                                        v
-                              disable_nonboot_cpus()
-                                   /* start */
-                                        |
-                                        v
-                            Acquire cpu_add_remove_lock
-                                        |
-                                        v
-                             Iterate over CURRENTLY
-                                   online CPUs
-                                        |
-                                        |
-                                        |                ----------
-                                        v                          | L
-             ======>               _cpu_down()                     |
-            |              [This takes cpuhotplug.lock             |
-  Common    |               before taking down the CPU             |
-   code     |               and releases it when done]             | O
-            |            While it is at it, notifications          |
-            |            are sent when notable events occur,       |
-             ======>     by running all registered callbacks.      |
-                                        |                          | O
-                                        |                          |
-                                        |                          |
-                                        v                          |
-                            Note down these cpus in                | P
-                                frozen_cpus mask         ----------
-                                        |
-                                        v
-                           Disable regular cpu hotplug
-                        by increasing cpu_hotplug_disabled
-                                        |
-                                        v
-                            Release cpu_add_remove_lock
-                                        |
-                                        v
-                       /* disable_nonboot_cpus() complete */
-                                        |
-                                        v
-                                   Do suspend
-
-
-
-Resuming back is likewise, with the counterparts being (in the order of
-execution during resume):
-* enable_nonboot_cpus() which involves:
-   |  Acquire cpu_add_remove_lock
-   |  Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug
-   |  Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop]
-   |  Release cpu_add_remove_lock
-   v
-
-* thaw tasks
-* send PM_POST_SUSPEND notifications
-* Release system_transition_mutex lock.
-
-
-It is to be noted here that the system_transition_mutex lock is acquired at the very
-beginning, when we are just starting out to suspend, and then released only
-after the entire cycle is complete (i.e., suspend + resume).
-
-
-
-                          Regular CPU hotplug call path
-                          -----------------------------
-
-                                Write 0 (or 1) to
-                       /sys/devices/system/cpu/cpu*/online
-                                    sysfs file
-                                        |
-                                        |
-                                        v
-                                    cpu_down()
-                                        |
-                                        v
-                           Acquire cpu_add_remove_lock
-                                        |
-                                        v
-                          If cpu_hotplug_disabled > 0
-                                return gracefully
-                                        |
-                                        |
-                                        v
-             ======>                _cpu_down()
-            |              [This takes cpuhotplug.lock
-  Common    |               before taking down the CPU
-   code     |               and releases it when done]
-            |            While it is at it, notifications
-            |           are sent when notable events occur,
-             ======>    by running all registered callbacks.
-                                        |
-                                        |
-                                        v
-                          Release cpu_add_remove_lock
-                               [That's it!, for
-                              regular CPU hotplug]
-
-
-
-So, as can be seen from the two diagrams (the parts marked as "Common code"),
-regular CPU hotplug and the suspend code path converge at the _cpu_down() and
-_cpu_up() functions. They differ in the arguments passed to these functions,
-in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen'
-argument. But during suspend, since the tasks are already frozen by the time
-the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called
-with the 'tasks_frozen' argument set to 1.
-[See below for some known issues regarding this.]
-
-
-Important files and functions/entry points:
-------------------------------------------
-
-kernel/power/process.c : freeze_processes(), thaw_processes()
-kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish()
-kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus()
-
-
-
-II. What are the issues involved in CPU hotplug?
-    -------------------------------------------
-
-There are some interesting situations involving CPU hotplug and microcode
-update on the CPUs, as discussed below:
-
-[Please bear in mind that the kernel requests the microcode images from
-userspace, using the request_firmware() function defined in
-drivers/base/firmware_loader/main.c]
-
-
-a. When all the CPUs are identical:
-
-   This is the most common situation and it is quite straightforward: we want
-   to apply the same microcode revision to each of the CPUs.
-   To give an example of x86, the collect_cpu_info() function defined in
-   arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU
-   and thereby in applying the correct microcode revision to it.
-   But note that the kernel does not maintain a common microcode image for the
-   all CPUs, in order to handle case 'b' described below.
-
-
-b. When some of the CPUs are different than the rest:
-
-   In this case since we probably need to apply different microcode revisions
-   to different CPUs, the kernel maintains a copy of the correct microcode
-   image for each CPU (after appropriate CPU type/model discovery using
-   functions such as collect_cpu_info()).
-
-
-c. When a CPU is physically hot-unplugged and a new (and possibly different
-   type of) CPU is hot-plugged into the system:
-
-   In the current design of the kernel, whenever a CPU is taken offline during
-   a regular CPU hotplug operation, upon receiving the CPU_DEAD notification
-   (which is sent by the CPU hotplug code), the microcode update driver's
-   callback for that event reacts by freeing the kernel's copy of the
-   microcode image for that CPU.
-
-   Hence, when a new CPU is brought online, since the kernel finds that it
-   doesn't have the microcode image, it does the CPU type/model discovery
-   afresh and then requests the userspace for the appropriate microcode image
-   for that CPU, which is subsequently applied.
-
-   For example, in x86, the mc_cpu_callback() function (which is the microcode
-   update driver's callback registered for CPU hotplug events) calls
-   microcode_update_cpu() which would call microcode_init_cpu() in this case,
-   instead of microcode_resume_cpu() when it finds that the kernel doesn't
-   have a valid microcode image. This ensures that the CPU type/model
-   discovery is performed and the right microcode is applied to the CPU after
-   getting it from userspace.
-
-
-d. Handling microcode update during suspend/hibernate:
-
-   Strictly speaking, during a CPU hotplug operation which does not involve
-   physically removing or inserting CPUs, the CPUs are not actually powered
-   off during a CPU offline. They are just put to the lowest C-states possible.
-   Hence, in such a case, it is not really necessary to re-apply microcode
-   when the CPUs are brought back online, since they wouldn't have lost the
-   image during the CPU offline operation.
-
-   This is the usual scenario encountered during a resume after a suspend.
-   However, in the case of hibernation, since all the CPUs are completely
-   powered off, during restore it becomes necessary to apply the microcode
-   images to all the CPUs.
-
-   [Note that we don't expect someone to physically pull out nodes and insert
-   nodes with a different type of CPUs in-between a suspend-resume or a
-   hibernate/restore cycle.]
-
-   In the current design of the kernel however, during a CPU offline operation
-   as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set),
-   the existing copy of microcode image in the kernel is not freed up.
-   And during the CPU online operations (during resume/restore), since the
-   kernel finds that it already has copies of the microcode images for all the
-   CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU
-   type/model and the need for validating whether the microcode revisions are
-   right for the CPUs or not (due to the above assumption that physical CPU
-   hotplug will not be done in-between suspend/resume or hibernate/restore
-   cycles).
-
-
-III. Are there any known problems when regular CPU hotplug and suspend race
-     with each other?
-
-Yes, they are listed below:
-
-1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to
-   the _cpu_down() and _cpu_up() functions is *always* 0.
-   This might not reflect the true current state of the system, since the
-   tasks could have been frozen by an out-of-band event such as a suspend
-   operation in progress. Hence, the cpuhp_tasks_frozen variable will not
-   reflect the frozen state and the CPU hotplug callbacks which evaluate
-   that variable might execute the wrong code path.
-
-2. If a regular CPU hotplug stress test happens to race with the freezer due
-   to a suspend operation in progress at the same time, then we could hit the
-   situation described below:
-
-    * A regular cpu online operation continues its journey from userspace
-      into the kernel, since the freezing has not yet begun.
-    * Then freezer gets to work and freezes userspace.
-    * If cpu online has not yet completed the microcode update stuff by now,
-      it will now start waiting on the frozen userspace in the
-      TASK_UNINTERRUPTIBLE state, in order to get the microcode image.
-    * Now the freezer continues and tries to freeze the remaining tasks. But
-      due to this wait mentioned above, the freezer won't be able to freeze
-      the cpu online hotplug task and hence freezing of tasks fails.
-
-   As a result of this task freezing failure, the suspend operation gets
-   aborted.
diff --git a/Documentation/power/suspend-and-interrupts.rst b/Documentation/power/suspend-and-interrupts.rst
new file mode 100644
index 000000000000..4cda6617709a
--- /dev/null
+++ b/Documentation/power/suspend-and-interrupts.rst
@@ -0,0 +1,137 @@
+====================================
+System Suspend and Device Interrupts
+====================================
+
+Copyright (C) 2014 Intel Corp.
+Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+
+Suspending and Resuming Device IRQs
+-----------------------------------
+
+Device interrupt request lines (IRQs) are generally disabled during system
+suspend after the "late" phase of suspending devices (that is, after all of the
+->prepare, ->suspend and ->suspend_late callbacks have been executed for all
+devices).  That is done by suspend_device_irqs().
+
+The rationale for doing so is that after the "late" phase of device suspend
+there is no legitimate reason why any interrupts from suspended devices should
+trigger and if any devices have not been suspended properly yet, it is better to
+block interrupts from them anyway.  Also, in the past we had problems with
+interrupt handlers for shared IRQs that device drivers implementing them were
+not prepared for interrupts triggering after their devices had been suspended.
+In some cases they would attempt to access, for example, memory address spaces
+of suspended devices and cause unpredictable behavior to ensue as a result.
+Unfortunately, such problems are very difficult to debug and the introduction
+of suspend_device_irqs(), along with the "noirq" phase of device suspend and
+resume, was the only practical way to mitigate them.
+
+Device IRQs are re-enabled during system resume, right before the "early" phase
+of resuming devices (that is, before starting to execute ->resume_early
+callbacks for devices).  The function doing that is resume_device_irqs().
+
+
+The IRQF_NO_SUSPEND Flag
+------------------------
+
+There are interrupts that can legitimately trigger during the entire system
+suspend-resume cycle, including the "noirq" phases of suspending and resuming
+devices as well as during the time when nonboot CPUs are taken offline and
+brought back online.  That applies to timer interrupts in the first place,
+but also to IPIs and to some other special-purpose interrupts.
+
+The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when
+requesting a special-purpose interrupt.  It causes suspend_device_irqs() to
+leave the corresponding IRQ enabled so as to allow the interrupt to work as
+expected during the suspend-resume cycle, but does not guarantee that the
+interrupt will wake the system from a suspended state -- for such cases it is
+necessary to use enable_irq_wake().
+
+Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one
+user of it.  Thus, if the IRQ is shared, all of the interrupt handlers installed
+for it will be executed as usual after suspend_device_irqs(), even if the
+IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of
+the IRQ's users.  For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the
+same time should be avoided.
+
+
+System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake()
+------------------------------------------------------------------
+
+System wakeup interrupts generally need to be configured to wake up the system
+from sleep states, especially if they are used for different purposes (e.g. as
+I/O interrupts) in the working state.
+
+That may involve turning on a special signal handling logic within the platform
+(such as an SoC) so that signals from a given line are routed in a different way
+during system sleep so as to trigger a system wakeup when needed.  For example,
+the platform may include a dedicated interrupt controller used specifically for
+handling system wakeup events.  Then, if a given interrupt line is supposed to
+wake up the system from sleep sates, the corresponding input of that interrupt
+controller needs to be enabled to receive signals from the line in question.
+After wakeup, it generally is better to disable that input to prevent the
+dedicated controller from triggering interrupts unnecessarily.
+
+The IRQ subsystem provides two helper functions to be used by device drivers for
+those purposes.  Namely, enable_irq_wake() turns on the platform's logic for
+handling the given IRQ as a system wakeup interrupt line and disable_irq_wake()
+turns that logic off.
+
+Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ
+in a special way.  Namely, the IRQ remains enabled, by on the first interrupt
+it will be disabled, marked as pending and "suspended" so that it will be
+re-enabled by resume_device_irqs() during the subsequent system resume.  Also
+the PM core is notified about the event which causes the system suspend in
+progress to be aborted (that doesn't have to happen immediately, but at one
+of the points where the suspend thread looks for pending wakeup events).
+
+This way every interrupt from a wakeup interrupt source will either cause the
+system suspend currently in progress to be aborted or wake up the system if
+already suspended.  However, after suspend_device_irqs() interrupt handlers are
+not executed for system wakeup IRQs.  They are only executed for IRQF_NO_SUSPEND
+IRQs at that time, but those IRQs should not be configured for system wakeup
+using enable_irq_wake().
+
+
+Interrupts and Suspend-to-Idle
+------------------------------
+
+Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new
+system sleep state that works by idling all of the processors and waiting for
+interrupts right after the "noirq" phase of suspending devices.
+
+Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag
+set will bring CPUs out of idle while in that state, but they will not cause the
+IRQ subsystem to trigger a system wakeup.
+
+System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in
+analogy with what they do in the full system suspend case.  The only difference
+is that the wakeup from suspend-to-idle is signaled using the usual working
+state interrupt delivery mechanisms and doesn't require the platform to use
+any special interrupt handling logic for it to work.
+
+
+IRQF_NO_SUSPEND and enable_irq_wake()
+-------------------------------------
+
+There are very few valid reasons to use both enable_irq_wake() and the
+IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
+same device.
+
+First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
+interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
+directly at odds with the rules for handling system wakeup interrupts (interrupt
+handlers are not invoked after suspend_device_irqs()).
+
+Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not
+to individual interrupt handlers, so sharing an IRQ between a system wakeup
+interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally
+make sense.
+
+In rare cases an IRQ can be shared between a wakeup device driver and an
+IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver
+must be able to discern spurious IRQs from genuine wakeup events (signalling
+the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to
+ensure that the IRQ will function as a wakeup source, and must request the IRQ
+with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If
+these requirements are not met, it is not valid to use IRQF_COND_SUSPEND.
diff --git a/Documentation/power/suspend-and-interrupts.txt b/Documentation/power/suspend-and-interrupts.txt
deleted file mode 100644
index 8afb29a8604a..000000000000
--- a/Documentation/power/suspend-and-interrupts.txt
+++ /dev/null
@@ -1,135 +0,0 @@
-System Suspend and Device Interrupts
-
-Copyright (C) 2014 Intel Corp.
-Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-
-Suspending and Resuming Device IRQs
------------------------------------
-
-Device interrupt request lines (IRQs) are generally disabled during system
-suspend after the "late" phase of suspending devices (that is, after all of the
-->prepare, ->suspend and ->suspend_late callbacks have been executed for all
-devices).  That is done by suspend_device_irqs().
-
-The rationale for doing so is that after the "late" phase of device suspend
-there is no legitimate reason why any interrupts from suspended devices should
-trigger and if any devices have not been suspended properly yet, it is better to
-block interrupts from them anyway.  Also, in the past we had problems with
-interrupt handlers for shared IRQs that device drivers implementing them were
-not prepared for interrupts triggering after their devices had been suspended.
-In some cases they would attempt to access, for example, memory address spaces
-of suspended devices and cause unpredictable behavior to ensue as a result.
-Unfortunately, such problems are very difficult to debug and the introduction
-of suspend_device_irqs(), along with the "noirq" phase of device suspend and
-resume, was the only practical way to mitigate them.
-
-Device IRQs are re-enabled during system resume, right before the "early" phase
-of resuming devices (that is, before starting to execute ->resume_early
-callbacks for devices).  The function doing that is resume_device_irqs().
-
-
-The IRQF_NO_SUSPEND Flag
-------------------------
-
-There are interrupts that can legitimately trigger during the entire system
-suspend-resume cycle, including the "noirq" phases of suspending and resuming
-devices as well as during the time when nonboot CPUs are taken offline and
-brought back online.  That applies to timer interrupts in the first place,
-but also to IPIs and to some other special-purpose interrupts.
-
-The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when
-requesting a special-purpose interrupt.  It causes suspend_device_irqs() to
-leave the corresponding IRQ enabled so as to allow the interrupt to work as
-expected during the suspend-resume cycle, but does not guarantee that the
-interrupt will wake the system from a suspended state -- for such cases it is
-necessary to use enable_irq_wake().
-
-Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one
-user of it.  Thus, if the IRQ is shared, all of the interrupt handlers installed
-for it will be executed as usual after suspend_device_irqs(), even if the
-IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of
-the IRQ's users.  For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the
-same time should be avoided.
-
-
-System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake()
-------------------------------------------------------------------
-
-System wakeup interrupts generally need to be configured to wake up the system
-from sleep states, especially if they are used for different purposes (e.g. as
-I/O interrupts) in the working state.
-
-That may involve turning on a special signal handling logic within the platform
-(such as an SoC) so that signals from a given line are routed in a different way
-during system sleep so as to trigger a system wakeup when needed.  For example,
-the platform may include a dedicated interrupt controller used specifically for
-handling system wakeup events.  Then, if a given interrupt line is supposed to
-wake up the system from sleep sates, the corresponding input of that interrupt
-controller needs to be enabled to receive signals from the line in question.
-After wakeup, it generally is better to disable that input to prevent the
-dedicated controller from triggering interrupts unnecessarily.
-
-The IRQ subsystem provides two helper functions to be used by device drivers for
-those purposes.  Namely, enable_irq_wake() turns on the platform's logic for
-handling the given IRQ as a system wakeup interrupt line and disable_irq_wake()
-turns that logic off.
-
-Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ
-in a special way.  Namely, the IRQ remains enabled, by on the first interrupt
-it will be disabled, marked as pending and "suspended" so that it will be
-re-enabled by resume_device_irqs() during the subsequent system resume.  Also
-the PM core is notified about the event which causes the system suspend in
-progress to be aborted (that doesn't have to happen immediately, but at one
-of the points where the suspend thread looks for pending wakeup events).
-
-This way every interrupt from a wakeup interrupt source will either cause the
-system suspend currently in progress to be aborted or wake up the system if
-already suspended.  However, after suspend_device_irqs() interrupt handlers are
-not executed for system wakeup IRQs.  They are only executed for IRQF_NO_SUSPEND
-IRQs at that time, but those IRQs should not be configured for system wakeup
-using enable_irq_wake().
-
-
-Interrupts and Suspend-to-Idle
-------------------------------
-
-Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new
-system sleep state that works by idling all of the processors and waiting for
-interrupts right after the "noirq" phase of suspending devices.
-
-Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag
-set will bring CPUs out of idle while in that state, but they will not cause the
-IRQ subsystem to trigger a system wakeup.
-
-System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in
-analogy with what they do in the full system suspend case.  The only difference
-is that the wakeup from suspend-to-idle is signaled using the usual working
-state interrupt delivery mechanisms and doesn't require the platform to use
-any special interrupt handling logic for it to work.
-
-
-IRQF_NO_SUSPEND and enable_irq_wake()
--------------------------------------
-
-There are very few valid reasons to use both enable_irq_wake() and the
-IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
-same device.
-
-First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
-interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
-directly at odds with the rules for handling system wakeup interrupts (interrupt
-handlers are not invoked after suspend_device_irqs()).
-
-Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not
-to individual interrupt handlers, so sharing an IRQ between a system wakeup
-interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally
-make sense.
-
-In rare cases an IRQ can be shared between a wakeup device driver and an
-IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver
-must be able to discern spurious IRQs from genuine wakeup events (signalling
-the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to
-ensure that the IRQ will function as a wakeup source, and must request the IRQ
-with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If
-these requirements are not met, it is not valid to use IRQF_COND_SUSPEND.
diff --git a/Documentation/power/swsusp-and-swap-files.rst b/Documentation/power/swsusp-and-swap-files.rst
new file mode 100644
index 000000000000..a33a2919dbe4
--- /dev/null
+++ b/Documentation/power/swsusp-and-swap-files.rst
@@ -0,0 +1,63 @@
+===============================================
+Using swap files with software suspend (swsusp)
+===============================================
+
+	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+The Linux kernel handles swap files almost in the same way as it handles swap
+partitions and there are only two differences between these two types of swap
+areas:
+(1) swap files need not be contiguous,
+(2) the header of a swap file is not in the first block of the partition that
+holds it.  From the swsusp's point of view (1) is not a problem, because it is
+already taken care of by the swap-handling code, but (2) has to be taken into
+consideration.
+
+In principle the location of a swap file's header may be determined with the
+help of appropriate filesystem driver.  Unfortunately, however, it requires the
+filesystem holding the swap file to be mounted, and if this filesystem is
+journaled, it cannot be mounted during resume from disk.  For this reason to
+identify a swap file swsusp uses the name of the partition that holds the file
+and the offset from the beginning of the partition at which the swap file's
+header is located.  For convenience, this offset is expressed in <PAGE_SIZE>
+units.
+
+In order to use a swap file with swsusp, you need to:
+
+1) Create the swap file and make it active, eg.::
+
+    # dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
+    # mkswap <swap_file_path>
+    # swapon <swap_file_path>
+
+2) Use an application that will bmap the swap file with the help of the
+FIBMAP ioctl and determine the location of the file's swap header, as the
+offset, in <PAGE_SIZE> units, from the beginning of the partition which
+holds the swap file.
+
+3) Add the following parameters to the kernel command line::
+
+    resume=<swap_file_partition> resume_offset=<swap_file_offset>
+
+where <swap_file_partition> is the partition on which the swap file is located
+and <swap_file_offset> is the offset of the swap header determined by the
+application in 2) (of course, this step may be carried out automatically
+by the same application that determines the swap file's header offset using the
+FIBMAP ioctl)
+
+OR
+
+Use a userland suspend application that will set the partition and offset
+with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
+Documentation/power/userland-swsusp.rst (this is the only method to suspend
+to a swap file allowing the resume to be initiated from an initrd or initramfs
+image).
+
+Now, swsusp will use the swap file in the same way in which it would use a swap
+partition.  In particular, the swap file has to be active (ie. be present in
+/proc/swaps) so that it can be used for suspending.
+
+Note that if the swap file used for suspending is deleted and recreated,
+the location of its header need not be the same as before.  Thus every time
+this happens the value of the "resume_offset=" kernel command line parameter
+has to be updated.
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
deleted file mode 100644
index f281886de490..000000000000
--- a/Documentation/power/swsusp-and-swap-files.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-Using swap files with software suspend (swsusp)
-	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
-
-The Linux kernel handles swap files almost in the same way as it handles swap
-partitions and there are only two differences between these two types of swap
-areas:
-(1) swap files need not be contiguous,
-(2) the header of a swap file is not in the first block of the partition that
-holds it.  From the swsusp's point of view (1) is not a problem, because it is
-already taken care of by the swap-handling code, but (2) has to be taken into
-consideration.
-
-In principle the location of a swap file's header may be determined with the
-help of appropriate filesystem driver.  Unfortunately, however, it requires the
-filesystem holding the swap file to be mounted, and if this filesystem is
-journaled, it cannot be mounted during resume from disk.  For this reason to
-identify a swap file swsusp uses the name of the partition that holds the file
-and the offset from the beginning of the partition at which the swap file's
-header is located.  For convenience, this offset is expressed in <PAGE_SIZE>
-units.
-
-In order to use a swap file with swsusp, you need to:
-
-1) Create the swap file and make it active, eg.
-
-# dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
-# mkswap <swap_file_path>
-# swapon <swap_file_path>
-
-2) Use an application that will bmap the swap file with the help of the
-FIBMAP ioctl and determine the location of the file's swap header, as the
-offset, in <PAGE_SIZE> units, from the beginning of the partition which
-holds the swap file.
-
-3) Add the following parameters to the kernel command line:
-
-resume=<swap_file_partition> resume_offset=<swap_file_offset>
-
-where <swap_file_partition> is the partition on which the swap file is located
-and <swap_file_offset> is the offset of the swap header determined by the
-application in 2) (of course, this step may be carried out automatically
-by the same application that determines the swap file's header offset using the
-FIBMAP ioctl)
-
-OR
-
-Use a userland suspend application that will set the partition and offset
-with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
-Documentation/power/userland-swsusp.txt (this is the only method to suspend
-to a swap file allowing the resume to be initiated from an initrd or initramfs
-image).
-
-Now, swsusp will use the swap file in the same way in which it would use a swap
-partition.  In particular, the swap file has to be active (ie. be present in
-/proc/swaps) so that it can be used for suspending.
-
-Note that if the swap file used for suspending is deleted and recreated,
-the location of its header need not be the same as before.  Thus every time
-this happens the value of the "resume_offset=" kernel command line parameter
-has to be updated.
diff --git a/Documentation/power/swsusp-dmcrypt.rst b/Documentation/power/swsusp-dmcrypt.rst
new file mode 100644
index 000000000000..426df59172cd
--- /dev/null
+++ b/Documentation/power/swsusp-dmcrypt.rst
@@ -0,0 +1,140 @@
+=======================================
+How to use dm-crypt and swsusp together
+=======================================
+
+Author: Andreas Steinmetz <ast@domdv.de>
+
+
+
+Some prerequisites:
+You know how dm-crypt works. If not, visit the following web page:
+http://www.saout.de/misc/dm-crypt/
+You have read Documentation/power/swsusp.rst and understand it.
+You did read Documentation/admin-guide/initrd.rst and know how an initrd works.
+You know how to create or how to modify an initrd.
+
+Now your system is properly set up, your disk is encrypted except for
+the swap device(s) and the boot partition which may contain a mini
+system for crypto setup and/or rescue purposes. You may even have
+an initrd that does your current crypto setup already.
+
+At this point you want to encrypt your swap, too. Still you want to
+be able to suspend using swsusp. This, however, means that you
+have to be able to either enter a passphrase or that you read
+the key(s) from an external device like a pcmcia flash disk
+or an usb stick prior to resume. So you need an initrd, that sets
+up dm-crypt and then asks swsusp to resume from the encrypted
+swap device.
+
+The most important thing is that you set up dm-crypt in such
+a way that the swap device you suspend to/resume from has
+always the same major/minor within the initrd as well as
+within your running system. The easiest way to achieve this is
+to always set up this swap device first with dmsetup, so that
+it will always look like the following::
+
+  brw-------  1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0
+
+Now set up your kernel to use /dev/mapper/swap0 as the default
+resume partition, so your kernel .config contains::
+
+  CONFIG_PM_STD_PARTITION="/dev/mapper/swap0"
+
+Prepare your boot loader to use the initrd you will create or
+modify. For lilo the simplest setup looks like the following
+lines::
+
+  image=/boot/vmlinuz
+  initrd=/boot/initrd.gz
+  label=linux
+  append="root=/dev/ram0 init=/linuxrc rw"
+
+Finally you need to create or modify your initrd. Lets assume
+you create an initrd that reads the required dm-crypt setup
+from a pcmcia flash disk card. The card is formatted with an ext2
+fs which resides on /dev/hde1 when the card is inserted. The
+card contains at least the encrypted swap setup in a file
+named "swapkey". /etc/fstab of your initrd contains something
+like the following::
+
+  /dev/hda1   /mnt    ext3      ro                            0 0
+  none        /proc   proc      defaults,noatime,nodiratime   0 0
+  none        /sys    sysfs     defaults,noatime,nodiratime   0 0
+
+/dev/hda1 contains an unencrypted mini system that sets up all
+of your crypto devices, again by reading the setup from the
+pcmcia flash disk. What follows now is a /linuxrc for your
+initrd that allows you to resume from encrypted swap and that
+continues boot with your mini system on /dev/hda1 if resume
+does not happen::
+
+  #!/bin/sh
+  PATH=/sbin:/bin:/usr/sbin:/usr/bin
+  mount /proc
+  mount /sys
+  mapped=0
+  noresume=`grep -c noresume /proc/cmdline`
+  if [ "$*" != "" ]
+  then
+    noresume=1
+  fi
+  dmesg -n 1
+  /sbin/cardmgr -q
+  for i in 1 2 3 4 5 6 7 8 9 0
+  do
+    if [ -f /proc/ide/hde/media ]
+    then
+      usleep 500000
+      mount -t ext2 -o ro /dev/hde1 /mnt
+      if [ -f /mnt/swapkey ]
+      then
+        dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1
+      fi
+      umount /mnt
+      break
+    fi
+    usleep 500000
+  done
+  killproc /sbin/cardmgr
+  dmesg -n 6
+  if [ $mapped = 1 ]
+  then
+    if [ $noresume != 0 ]
+    then
+      mkswap /dev/mapper/swap0 > /dev/null 2>&1
+    fi
+    echo 254:0 > /sys/power/resume
+    dmsetup remove swap0
+  fi
+  umount /sys
+  mount /mnt
+  umount /proc
+  cd /mnt
+  pivot_root . mnt
+  mount /proc
+  umount -l /mnt
+  umount /proc
+  exec chroot . /sbin/init $* < dev/console > dev/console 2>&1
+
+Please don't mind the weird loop above, busybox's msh doesn't know
+the let statement. Now, what is happening in the script?
+First we have to decide if we want to try to resume, or not.
+We will not resume if booting with "noresume" or any parameters
+for init like "single" or "emergency" as boot parameters.
+
+Then we need to set up dmcrypt with the setup data from the
+pcmcia flash disk. If this succeeds we need to reset the swap
+device if we don't want to resume. The line "echo 254:0 > /sys/power/resume"
+then attempts to resume from the first device mapper device.
+Note that it is important to set the device in /sys/power/resume,
+regardless if resuming or not, otherwise later suspend will fail.
+If resume starts, script execution terminates here.
+
+Otherwise we just remove the encrypted swap device and leave it to the
+mini system on /dev/hda1 to set the whole crypto up (it is up to
+you to modify this to your taste).
+
+What then follows is the well known process to change the root
+file system and continue booting from there. I prefer to unmount
+the initrd prior to continue booting but it is up to you to modify
+this.
diff --git a/Documentation/power/swsusp-dmcrypt.txt b/Documentation/power/swsusp-dmcrypt.txt
deleted file mode 100644
index b802fbfd95ef..000000000000
--- a/Documentation/power/swsusp-dmcrypt.txt
+++ /dev/null
@@ -1,138 +0,0 @@
-Author: Andreas Steinmetz <ast@domdv.de>
-
-
-How to use dm-crypt and swsusp together:
-========================================
-
-Some prerequisites:
-You know how dm-crypt works. If not, visit the following web page:
-http://www.saout.de/misc/dm-crypt/
-You have read Documentation/power/swsusp.txt and understand it.
-You did read Documentation/admin-guide/initrd.rst and know how an initrd works.
-You know how to create or how to modify an initrd.
-
-Now your system is properly set up, your disk is encrypted except for
-the swap device(s) and the boot partition which may contain a mini
-system for crypto setup and/or rescue purposes. You may even have
-an initrd that does your current crypto setup already.
-
-At this point you want to encrypt your swap, too. Still you want to
-be able to suspend using swsusp. This, however, means that you
-have to be able to either enter a passphrase or that you read
-the key(s) from an external device like a pcmcia flash disk
-or an usb stick prior to resume. So you need an initrd, that sets
-up dm-crypt and then asks swsusp to resume from the encrypted
-swap device.
-
-The most important thing is that you set up dm-crypt in such
-a way that the swap device you suspend to/resume from has
-always the same major/minor within the initrd as well as
-within your running system. The easiest way to achieve this is
-to always set up this swap device first with dmsetup, so that
-it will always look like the following:
-
-brw-------  1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0
-
-Now set up your kernel to use /dev/mapper/swap0 as the default
-resume partition, so your kernel .config contains:
-
-CONFIG_PM_STD_PARTITION="/dev/mapper/swap0"
-
-Prepare your boot loader to use the initrd you will create or
-modify. For lilo the simplest setup looks like the following
-lines:
-
-image=/boot/vmlinuz
-initrd=/boot/initrd.gz
-label=linux
-append="root=/dev/ram0 init=/linuxrc rw"
-
-Finally you need to create or modify your initrd. Lets assume
-you create an initrd that reads the required dm-crypt setup
-from a pcmcia flash disk card. The card is formatted with an ext2
-fs which resides on /dev/hde1 when the card is inserted. The
-card contains at least the encrypted swap setup in a file
-named "swapkey". /etc/fstab of your initrd contains something
-like the following:
-
-/dev/hda1   /mnt    ext3      ro                            0 0
-none        /proc   proc      defaults,noatime,nodiratime   0 0
-none        /sys    sysfs     defaults,noatime,nodiratime   0 0
-
-/dev/hda1 contains an unencrypted mini system that sets up all
-of your crypto devices, again by reading the setup from the
-pcmcia flash disk. What follows now is a /linuxrc for your
-initrd that allows you to resume from encrypted swap and that
-continues boot with your mini system on /dev/hda1 if resume
-does not happen:
-
-#!/bin/sh
-PATH=/sbin:/bin:/usr/sbin:/usr/bin
-mount /proc
-mount /sys
-mapped=0
-noresume=`grep -c noresume /proc/cmdline`
-if [ "$*" != "" ]
-then
-  noresume=1
-fi
-dmesg -n 1
-/sbin/cardmgr -q
-for i in 1 2 3 4 5 6 7 8 9 0
-do
-  if [ -f /proc/ide/hde/media ]
-  then
-    usleep 500000
-    mount -t ext2 -o ro /dev/hde1 /mnt
-    if [ -f /mnt/swapkey ]
-    then
-      dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1
-    fi
-    umount /mnt
-    break
-  fi
-  usleep 500000
-done
-killproc /sbin/cardmgr
-dmesg -n 6
-if [ $mapped = 1 ]
-then
-  if [ $noresume != 0 ]
-  then
-    mkswap /dev/mapper/swap0 > /dev/null 2>&1
-  fi
-  echo 254:0 > /sys/power/resume
-  dmsetup remove swap0
-fi
-umount /sys
-mount /mnt
-umount /proc
-cd /mnt
-pivot_root . mnt
-mount /proc
-umount -l /mnt
-umount /proc
-exec chroot . /sbin/init $* < dev/console > dev/console 2>&1
-
-Please don't mind the weird loop above, busybox's msh doesn't know
-the let statement. Now, what is happening in the script?
-First we have to decide if we want to try to resume, or not.
-We will not resume if booting with "noresume" or any parameters
-for init like "single" or "emergency" as boot parameters.
-
-Then we need to set up dmcrypt with the setup data from the
-pcmcia flash disk. If this succeeds we need to reset the swap
-device if we don't want to resume. The line "echo 254:0 > /sys/power/resume"
-then attempts to resume from the first device mapper device.
-Note that it is important to set the device in /sys/power/resume,
-regardless if resuming or not, otherwise later suspend will fail.
-If resume starts, script execution terminates here.
-
-Otherwise we just remove the encrypted swap device and leave it to the
-mini system on /dev/hda1 to set the whole crypto up (it is up to
-you to modify this to your taste).
-
-What then follows is the well known process to change the root
-file system and continue booting from there. I prefer to unmount
-the initrd prior to continue booting but it is up to you to modify
-this.
diff --git a/Documentation/power/swsusp.rst b/Documentation/power/swsusp.rst
new file mode 100644
index 000000000000..d000312f6965
--- /dev/null
+++ b/Documentation/power/swsusp.rst
@@ -0,0 +1,501 @@
+============
+Swap suspend
+============
+
+Some warnings, first.
+
+.. warning::
+
+   **BIG FAT WARNING**
+
+   If you touch anything on disk between suspend and resume...
+				...kiss your data goodbye.
+
+   If you do resume from initrd after your filesystems are mounted...
+				...bye bye root partition.
+
+			[this is actually same case as above]
+
+   If you have unsupported ( ) devices using DMA, you may have some
+   problems. If your disk driver does not support suspend... (IDE does),
+   it may cause some problems, too. If you change kernel command line
+   between suspend and resume, it may do something wrong. If you change
+   your hardware while system is suspended... well, it was not good idea;
+   but it will probably only crash.
+
+   ( ) suspend/resume support is needed to make it safe.
+
+   If you have any filesystems on USB devices mounted before software suspend,
+   they won't be accessible after resume and you may lose data, as though
+   you have unplugged the USB devices with mounted filesystems on them;
+   see the FAQ below for details.  (This is not true for more traditional
+   power states like "standby", which normally don't turn USB off.)
+
+Swap partition:
+  You need to append resume=/dev/your_swap_partition to kernel command
+  line or specify it using /sys/power/resume.
+
+Swap file:
+  If using a swapfile you can also specify a resume offset using
+  resume_offset=<number> on the kernel command line or specify it
+  in /sys/power/resume_offset.
+
+After preparing then you suspend by::
+
+	echo shutdown > /sys/power/disk; echo disk > /sys/power/state
+
+- If you feel ACPI works pretty well on your system, you might try::
+
+	echo platform > /sys/power/disk; echo disk > /sys/power/state
+
+- If you would like to write hibernation image to swap and then suspend
+  to RAM (provided your platform supports it), you can try::
+
+	echo suspend > /sys/power/disk; echo disk > /sys/power/state
+
+- If you have SATA disks, you'll need recent kernels with SATA suspend
+  support. For suspend and resume to work, make sure your disk drivers
+  are built into kernel -- not modules. [There's way to make
+  suspend/resume with modular disk drivers, see FAQ, but you probably
+  should not do that.]
+
+If you want to limit the suspend image size to N bytes, do::
+
+	echo N > /sys/power/image_size
+
+before suspend (it is limited to around 2/5 of available RAM by default).
+
+- The resume process checks for the presence of the resume device,
+  if found, it then checks the contents for the hibernation image signature.
+  If both are found, it resumes the hibernation image.
+
+- The resume process may be triggered in two ways:
+
+  1) During lateinit:  If resume=/dev/your_swap_partition is specified on
+     the kernel command line, lateinit runs the resume process.  If the
+     resume device has not been probed yet, the resume process fails and
+     bootup continues.
+  2) Manually from an initrd or initramfs:  May be run from
+     the init script by using the /sys/power/resume file.  It is vital
+     that this be done prior to remounting any filesystems (even as
+     read-only) otherwise data may be corrupted.
+
+Article about goals and implementation of Software Suspend for Linux
+====================================================================
+
+Author: Gábor Kuti
+Last revised: 2003-10-20 by Pavel Machek
+
+Idea and goals to achieve
+-------------------------
+
+Nowadays it is common in several laptops that they have a suspend button. It
+saves the state of the machine to a filesystem or to a partition and switches
+to standby mode. Later resuming the machine the saved state is loaded back to
+ram and the machine can continue its work. It has two real benefits. First we
+save ourselves the time machine goes down and later boots up, energy costs
+are real high when running from batteries. The other gain is that we don't have
+to interrupt our programs so processes that are calculating something for a long
+time shouldn't need to be written interruptible.
+
+swsusp saves the state of the machine into active swaps and then reboots or
+powerdowns.  You must explicitly specify the swap partition to resume from with
+`resume=` kernel option. If signature is found it loads and restores saved
+state. If the option `noresume` is specified as a boot parameter, it skips
+the resuming.  If the option `hibernate=nocompress` is specified as a boot
+parameter, it saves hibernation image without compression.
+
+In the meantime while the system is suspended you should not add/remove any
+of the hardware, write to the filesystems, etc.
+
+Sleep states summary
+====================
+
+There are three different interfaces you can use, /proc/acpi should
+work like this:
+
+In a really perfect world::
+
+  echo 1 > /proc/acpi/sleep       # for standby
+  echo 2 > /proc/acpi/sleep       # for suspend to ram
+  echo 3 > /proc/acpi/sleep       # for suspend to ram, but with more power conservative
+  echo 4 > /proc/acpi/sleep       # for suspend to disk
+  echo 5 > /proc/acpi/sleep       # for shutdown unfriendly the system
+
+and perhaps::
+
+  echo 4b > /proc/acpi/sleep      # for suspend to disk via s4bios
+
+Frequently Asked Questions
+==========================
+
+Q:
+  well, suspending a server is IMHO a really stupid thing,
+  but... (Diego Zuccato):
+
+A:
+  You bought new UPS for your server. How do you install it without
+  bringing machine down? Suspend to disk, rearrange power cables,
+  resume.
+
+  You have your server on UPS. Power died, and UPS is indicating 30
+  seconds to failure. What do you do? Suspend to disk.
+
+
+Q:
+  Maybe I'm missing something, but why don't the regular I/O paths work?
+
+A:
+  We do use the regular I/O paths. However we cannot restore the data
+  to its original location as we load it. That would create an
+  inconsistent kernel state which would certainly result in an oops.
+  Instead, we load the image into unused memory and then atomically copy
+  it back to it original location. This implies, of course, a maximum
+  image size of half the amount of memory.
+
+  There are two solutions to this:
+
+  * require half of memory to be free during suspend. That way you can
+    read "new" data onto free spots, then cli and copy
+
+  * assume we had special "polling" ide driver that only uses memory
+    between 0-640KB. That way, I'd have to make sure that 0-640KB is free
+    during suspending, but otherwise it would work...
+
+  suspend2 shares this fundamental limitation, but does not include user
+  data and disk caches into "used memory" by saving them in
+  advance. That means that the limitation goes away in practice.
+
+Q:
+  Does linux support ACPI S4?
+
+A:
+  Yes. That's what echo platform > /sys/power/disk does.
+
+Q:
+  What is 'suspend2'?
+
+A:
+  suspend2 is 'Software Suspend 2', a forked implementation of
+  suspend-to-disk which is available as separate patches for 2.4 and 2.6
+  kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
+  highmem and preemption. It also has a extensible architecture that
+  allows for arbitrary transformations on the image (compression,
+  encryption) and arbitrary backends for writing the image (eg to swap
+  or an NFS share[Work In Progress]). Questions regarding suspend2
+  should be sent to the mailing list available through the suspend2
+  website, and not to the Linux Kernel Mailing List. We are working
+  toward merging suspend2 into the mainline kernel.
+
+Q:
+  What is the freezing of tasks and why are we using it?
+
+A:
+  The freezing of tasks is a mechanism by which user space processes and some
+  kernel threads are controlled during hibernation or system-wide suspend (on some
+  architectures).  See freezing-of-tasks.txt for details.
+
+Q:
+  What is the difference between "platform" and "shutdown"?
+
+A:
+  shutdown:
+	save state in linux, then tell bios to powerdown
+
+  platform:
+	save state in linux, then tell bios to powerdown and blink
+        "suspended led"
+
+  "platform" is actually right thing to do where supported, but
+  "shutdown" is most reliable (except on ACPI systems).
+
+Q:
+  I do not understand why you have such strong objections to idea of
+  selective suspend.
+
+A:
+  Do selective suspend during runtime power management, that's okay. But
+  it's useless for suspend-to-disk. (And I do not see how you could use
+  it for suspend-to-ram, I hope you do not want that).
+
+  Lets see, so you suggest to
+
+  * SUSPEND all but swap device and parents
+  * Snapshot
+  * Write image to disk
+  * SUSPEND swap device and parents
+  * Powerdown
+
+  Oh no, that does not work, if swap device or its parents uses DMA,
+  you've corrupted data. You'd have to do
+
+  * SUSPEND all but swap device and parents
+  * FREEZE swap device and parents
+  * Snapshot
+  * UNFREEZE swap device and parents
+  * Write
+  * SUSPEND swap device and parents
+
+  Which means that you still need that FREEZE state, and you get more
+  complicated code. (And I have not yet introduce details like system
+  devices).
+
+Q:
+  There don't seem to be any generally useful behavioral
+  distinctions between SUSPEND and FREEZE.
+
+A:
+  Doing SUSPEND when you are asked to do FREEZE is always correct,
+  but it may be unnecessarily slow. If you want your driver to stay simple,
+  slowness may not matter to you. It can always be fixed later.
+
+  For devices like disk it does matter, you do not want to spindown for
+  FREEZE.
+
+Q:
+  After resuming, system is paging heavily, leading to very bad interactivity.
+
+A:
+  Try running::
+
+    cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
+    do
+      test -f "$file" && cat "$file" > /dev/null
+    done
+
+  after resume. swapoff -a; swapon -a may also be useful.
+
+Q:
+  What happens to devices during swsusp? They seem to be resumed
+  during system suspend?
+
+A:
+  That's correct. We need to resume them if we want to write image to
+  disk. Whole sequence goes like
+
+      **Suspend part**
+
+      running system, user asks for suspend-to-disk
+
+      user processes are stopped
+
+      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+      with state snapshot
+
+      state snapshot: copy of whole used memory is taken with interrupts disabled
+
+      resume(): devices are woken up so that we can write image to swap
+
+      write image to swap
+
+      suspend(PMSG_SUSPEND): suspend devices so that we can power off
+
+      turn the power off
+
+      **Resume part**
+
+      (is actually pretty similar)
+
+      running system, user asks for suspend-to-disk
+
+      user processes are stopped (in common case there are none,
+      but with resume-from-initrd, no one knows)
+
+      read image from disk
+
+      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+      with image restoration
+
+      image restoration: rewrite memory with image
+
+      resume(): devices are woken up so that system can continue
+
+      thaw all user processes
+
+Q:
+  What is this 'Encrypt suspend image' for?
+
+A:
+  First of all: it is not a replacement for dm-crypt encrypted swap.
+  It cannot protect your computer while it is suspended. Instead it does
+  protect from leaking sensitive data after resume from suspend.
+
+  Think of the following: you suspend while an application is running
+  that keeps sensitive data in memory. The application itself prevents
+  the data from being swapped out. Suspend, however, must write these
+  data to swap to be able to resume later on. Without suspend encryption
+  your sensitive data are then stored in plaintext on disk.  This means
+  that after resume your sensitive data are accessible to all
+  applications having direct access to the swap device which was used
+  for suspend. If you don't need swap after resume these data can remain
+  on disk virtually forever. Thus it can happen that your system gets
+  broken in weeks later and sensitive data which you thought were
+  encrypted and protected are retrieved and stolen from the swap device.
+  To prevent this situation you should use 'Encrypt suspend image'.
+
+  During suspend a temporary key is created and this key is used to
+  encrypt the data written to disk. When, during resume, the data was
+  read back into memory the temporary key is destroyed which simply
+  means that all data written to disk during suspend are then
+  inaccessible so they can't be stolen later on.  The only thing that
+  you must then take care of is that you call 'mkswap' for the swap
+  partition used for suspend as early as possible during regular
+  boot. This asserts that any temporary key from an oopsed suspend or
+  from a failed or aborted resume is erased from the swap device.
+
+  As a rule of thumb use encrypted swap to protect your data while your
+  system is shut down or suspended. Additionally use the encrypted
+  suspend image to prevent sensitive data from being stolen after
+  resume.
+
+Q:
+  Can I suspend to a swap file?
+
+A:
+  Generally, yes, you can.  However, it requires you to use the "resume=" and
+  "resume_offset=" kernel command line parameters, so the resume from a swap file
+  cannot be initiated from an initrd or initramfs image.  See
+  swsusp-and-swap-files.txt for details.
+
+Q:
+  Is there a maximum system RAM size that is supported by swsusp?
+
+A:
+  It should work okay with highmem.
+
+Q:
+  Does swsusp (to disk) use only one swap partition or can it use
+  multiple swap partitions (aggregate them into one logical space)?
+
+A:
+  Only one swap partition, sorry.
+
+Q:
+  If my application(s) causes lots of memory & swap space to be used
+  (over half of the total system RAM), is it correct that it is likely
+  to be useless to try to suspend to disk while that app is running?
+
+A:
+  No, it should work okay, as long as your app does not mlock()
+  it. Just prepare big enough swap partition.
+
+Q:
+  What information is useful for debugging suspend-to-disk problems?
+
+A:
+  Well, last messages on the screen are always useful. If something
+  is broken, it is usually some kernel driver, therefore trying with as
+  little as possible modules loaded helps a lot. I also prefer people to
+  suspend from console, preferably without X running. Booting with
+  init=/bin/bash, then swapon and starting suspend sequence manually
+  usually does the trick. Then it is good idea to try with latest
+  vanilla kernel.
+
+Q:
+  How can distributions ship a swsusp-supporting kernel with modular
+  disk drivers (especially SATA)?
+
+A:
+  Well, it can be done, load the drivers, then do echo into
+  /sys/power/resume file from initrd. Be sure not to mount
+  anything, not even read-only mount, or you are going to lose your
+  data.
+
+Q:
+  How do I make suspend more verbose?
+
+A:
+  If you want to see any non-error kernel messages on the virtual
+  terminal the kernel switches to during suspend, you have to set the
+  kernel console loglevel to at least 4 (KERN_WARNING), for example by
+  doing::
+
+	# save the old loglevel
+	read LOGLEVEL DUMMY < /proc/sys/kernel/printk
+	# set the loglevel so we see the progress bar.
+	# if the level is higher than needed, we leave it alone.
+	if [ $LOGLEVEL -lt 5 ]; then
+	        echo 5 > /proc/sys/kernel/printk
+		fi
+
+        IMG_SZ=0
+        read IMG_SZ < /sys/power/image_size
+        echo -n disk > /sys/power/state
+        RET=$?
+        #
+        # the logic here is:
+        # if image_size > 0 (without kernel support, IMG_SZ will be zero),
+        # then try again with image_size set to zero.
+	if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size
+                echo 0 > /sys/power/image_size
+                echo -n disk > /sys/power/state
+                RET=$?
+        fi
+
+	# restore previous loglevel
+	echo $LOGLEVEL > /proc/sys/kernel/printk
+	exit $RET
+
+Q:
+  Is this true that if I have a mounted filesystem on a USB device and
+  I suspend to disk, I can lose data unless the filesystem has been mounted
+  with "sync"?
+
+A:
+  That's right ... if you disconnect that device, you may lose data.
+  In fact, even with "-o sync" you can lose data if your programs have
+  information in buffers they haven't written out to a disk you disconnect,
+  or if you disconnect before the device finished saving data you wrote.
+
+  Software suspend normally powers down USB controllers, which is equivalent
+  to disconnecting all USB devices attached to your system.
+
+  Your system might well support low-power modes for its USB controllers
+  while the system is asleep, maintaining the connection, using true sleep
+  modes like "suspend-to-RAM" or "standby".  (Don't write "disk" to the
+  /sys/power/state file; write "standby" or "mem".)  We've not seen any
+  hardware that can use these modes through software suspend, although in
+  theory some systems might support "platform" modes that won't break the
+  USB connections.
+
+  Remember that it's always a bad idea to unplug a disk drive containing a
+  mounted filesystem.  That's true even when your system is asleep!  The
+  safest thing is to unmount all filesystems on removable media (such USB,
+  Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
+  before suspending; then remount them after resuming.
+
+  There is a work-around for this problem.  For more information, see
+  Documentation/driver-api/usb/persist.rst.
+
+Q:
+  Can I suspend-to-disk using a swap partition under LVM?
+
+A:
+  Yes and No.  You can suspend successfully, but the kernel will not be able
+  to resume on its own.  You need an initramfs that can recognize the resume
+  situation, activate the logical volume containing the swap volume (but not
+  touch any filesystems!), and eventually call::
+
+    echo -n "$major:$minor" > /sys/power/resume
+
+  where $major and $minor are the respective major and minor device numbers of
+  the swap volume.
+
+  uswsusp works with LVM, too.  See http://suspend.sourceforge.net/
+
+Q:
+  I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
+  compiled with the similar configuration files. Anyway I found that
+  suspend to disk (and resume) is much slower on 2.6.16 compared to
+  2.6.15. Any idea for why that might happen or how can I speed it up?
+
+A:
+  This is because the size of the suspend image is now greater than
+  for 2.6.15 (by saving more data we can get more responsive system
+  after resume).
+
+  There's the /sys/power/image_size knob that controls the size of the
+  image.  If you set it to 0 (eg. by echo 0 > /sys/power/image_size as
+  root), the 2.6.15 behavior should be restored.  If it is still too
+  slow, take a look at suspend.sf.net -- userland suspend is faster and
+  supports LZF compression to speed it up further.
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
deleted file mode 100644
index 236d1fb13640..000000000000
--- a/Documentation/power/swsusp.txt
+++ /dev/null
@@ -1,446 +0,0 @@
-Some warnings, first.
-
- * BIG FAT WARNING *********************************************************
- *
- * If you touch anything on disk between suspend and resume...
- *				...kiss your data goodbye.
- *
- * If you do resume from initrd after your filesystems are mounted...
- *				...bye bye root partition.
- *			[this is actually same case as above]
- *
- * If you have unsupported (*) devices using DMA, you may have some
- * problems. If your disk driver does not support suspend... (IDE does),
- * it may cause some problems, too. If you change kernel command line
- * between suspend and resume, it may do something wrong. If you change
- * your hardware while system is suspended... well, it was not good idea;
- * but it will probably only crash.
- *
- * (*) suspend/resume support is needed to make it safe.
- *
- * If you have any filesystems on USB devices mounted before software suspend,
- * they won't be accessible after resume and you may lose data, as though
- * you have unplugged the USB devices with mounted filesystems on them;
- * see the FAQ below for details.  (This is not true for more traditional
- * power states like "standby", which normally don't turn USB off.)
-
-Swap partition:
-You need to append resume=/dev/your_swap_partition to kernel command
-line or specify it using /sys/power/resume.
-
-Swap file:
-If using a swapfile you can also specify a resume offset using
-resume_offset=<number> on the kernel command line or specify it
-in /sys/power/resume_offset.
-
-After preparing then you suspend by
-
-echo shutdown > /sys/power/disk; echo disk > /sys/power/state
-
-. If you feel ACPI works pretty well on your system, you might try
-
-echo platform > /sys/power/disk; echo disk > /sys/power/state
-
-. If you would like to write hibernation image to swap and then suspend
-to RAM (provided your platform supports it), you can try
-
-echo suspend > /sys/power/disk; echo disk > /sys/power/state
-
-. If you have SATA disks, you'll need recent kernels with SATA suspend
-support. For suspend and resume to work, make sure your disk drivers
-are built into kernel -- not modules. [There's way to make
-suspend/resume with modular disk drivers, see FAQ, but you probably
-should not do that.]
-
-If you want to limit the suspend image size to N bytes, do
-
-echo N > /sys/power/image_size
-
-before suspend (it is limited to around 2/5 of available RAM by default).
-
-. The resume process checks for the presence of the resume device,
-if found, it then checks the contents for the hibernation image signature.
-If both are found, it resumes the hibernation image.
-
-. The resume process may be triggered in two ways:
-  1) During lateinit:  If resume=/dev/your_swap_partition is specified on
-     the kernel command line, lateinit runs the resume process.  If the
-     resume device has not been probed yet, the resume process fails and
-     bootup continues.
-  2) Manually from an initrd or initramfs:  May be run from
-     the init script by using the /sys/power/resume file.  It is vital
-     that this be done prior to remounting any filesystems (even as
-     read-only) otherwise data may be corrupted.
-
-Article about goals and implementation of Software Suspend for Linux
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Author: Gábor Kuti
-Last revised: 2003-10-20 by Pavel Machek
-
-Idea and goals to achieve
-
-Nowadays it is common in several laptops that they have a suspend button. It
-saves the state of the machine to a filesystem or to a partition and switches
-to standby mode. Later resuming the machine the saved state is loaded back to
-ram and the machine can continue its work. It has two real benefits. First we
-save ourselves the time machine goes down and later boots up, energy costs
-are real high when running from batteries. The other gain is that we don't have to
-interrupt our programs so processes that are calculating something for a long
-time shouldn't need to be written interruptible.
-
-swsusp saves the state of the machine into active swaps and then reboots or
-powerdowns.  You must explicitly specify the swap partition to resume from with
-``resume='' kernel option. If signature is found it loads and restores saved
-state. If the option ``noresume'' is specified as a boot parameter, it skips
-the resuming.  If the option ``hibernate=nocompress'' is specified as a boot
-parameter, it saves hibernation image without compression.
-
-In the meantime while the system is suspended you should not add/remove any
-of the hardware, write to the filesystems, etc.
-
-Sleep states summary
-====================
-
-There are three different interfaces you can use, /proc/acpi should
-work like this:
-
-In a really perfect world:
-echo 1 > /proc/acpi/sleep       # for standby
-echo 2 > /proc/acpi/sleep       # for suspend to ram
-echo 3 > /proc/acpi/sleep       # for suspend to ram, but with more power conservative
-echo 4 > /proc/acpi/sleep       # for suspend to disk
-echo 5 > /proc/acpi/sleep       # for shutdown unfriendly the system
-
-and perhaps
-echo 4b > /proc/acpi/sleep      # for suspend to disk via s4bios
-
-Frequently Asked Questions
-==========================
-
-Q: well, suspending a server is IMHO a really stupid thing,
-but... (Diego Zuccato):
-
-A: You bought new UPS for your server. How do you install it without
-bringing machine down? Suspend to disk, rearrange power cables,
-resume.
-
-You have your server on UPS. Power died, and UPS is indicating 30
-seconds to failure. What do you do? Suspend to disk.
-
-
-Q: Maybe I'm missing something, but why don't the regular I/O paths work?
-
-A: We do use the regular I/O paths. However we cannot restore the data
-to its original location as we load it. That would create an
-inconsistent kernel state which would certainly result in an oops.
-Instead, we load the image into unused memory and then atomically copy
-it back to it original location. This implies, of course, a maximum
-image size of half the amount of memory.
-
-There are two solutions to this:
-
-* require half of memory to be free during suspend. That way you can
-read "new" data onto free spots, then cli and copy
-
-* assume we had special "polling" ide driver that only uses memory
-between 0-640KB. That way, I'd have to make sure that 0-640KB is free
-during suspending, but otherwise it would work...
-
-suspend2 shares this fundamental limitation, but does not include user
-data and disk caches into "used memory" by saving them in
-advance. That means that the limitation goes away in practice.
-
-Q: Does linux support ACPI S4?
-
-A: Yes. That's what echo platform > /sys/power/disk does.
-
-Q: What is 'suspend2'?
-
-A: suspend2 is 'Software Suspend 2', a forked implementation of
-suspend-to-disk which is available as separate patches for 2.4 and 2.6
-kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
-highmem and preemption. It also has a extensible architecture that
-allows for arbitrary transformations on the image (compression,
-encryption) and arbitrary backends for writing the image (eg to swap
-or an NFS share[Work In Progress]). Questions regarding suspend2
-should be sent to the mailing list available through the suspend2
-website, and not to the Linux Kernel Mailing List. We are working
-toward merging suspend2 into the mainline kernel.
-
-Q: What is the freezing of tasks and why are we using it?
-
-A: The freezing of tasks is a mechanism by which user space processes and some
-kernel threads are controlled during hibernation or system-wide suspend (on some
-architectures).  See freezing-of-tasks.txt for details.
-
-Q: What is the difference between "platform" and "shutdown"?
-
-A:
-
-shutdown: save state in linux, then tell bios to powerdown
-
-platform: save state in linux, then tell bios to powerdown and blink
-          "suspended led"
-
-"platform" is actually right thing to do where supported, but
-"shutdown" is most reliable (except on ACPI systems).
-
-Q: I do not understand why you have such strong objections to idea of
-selective suspend.
-
-A: Do selective suspend during runtime power management, that's okay. But
-it's useless for suspend-to-disk. (And I do not see how you could use
-it for suspend-to-ram, I hope you do not want that).
-
-Lets see, so you suggest to
-
-* SUSPEND all but swap device and parents
-* Snapshot
-* Write image to disk
-* SUSPEND swap device and parents
-* Powerdown
-
-Oh no, that does not work, if swap device or its parents uses DMA,
-you've corrupted data. You'd have to do
-
-* SUSPEND all but swap device and parents
-* FREEZE swap device and parents
-* Snapshot
-* UNFREEZE swap device and parents
-* Write
-* SUSPEND swap device and parents
-
-Which means that you still need that FREEZE state, and you get more
-complicated code. (And I have not yet introduce details like system
-devices).
-
-Q: There don't seem to be any generally useful behavioral
-distinctions between SUSPEND and FREEZE.
-
-A: Doing SUSPEND when you are asked to do FREEZE is always correct,
-but it may be unnecessarily slow. If you want your driver to stay simple,
-slowness may not matter to you. It can always be fixed later.
-
-For devices like disk it does matter, you do not want to spindown for
-FREEZE.
-
-Q: After resuming, system is paging heavily, leading to very bad interactivity.
-
-A: Try running
-
-cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
-do
-  test -f "$file" && cat "$file" > /dev/null
-done
-
-after resume. swapoff -a; swapon -a may also be useful.
-
-Q: What happens to devices during swsusp? They seem to be resumed
-during system suspend?
-
-A: That's correct. We need to resume them if we want to write image to
-disk. Whole sequence goes like
-
-      Suspend part
-      ~~~~~~~~~~~~
-      running system, user asks for suspend-to-disk
-
-      user processes are stopped
-
-      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
-      		      with state snapshot
-
-      state snapshot: copy of whole used memory is taken with interrupts disabled
-
-      resume(): devices are woken up so that we can write image to swap
-
-      write image to swap
-
-      suspend(PMSG_SUSPEND): suspend devices so that we can power off
-
-      turn the power off
-
-      Resume part
-      ~~~~~~~~~~~
-      (is actually pretty similar)
-
-      running system, user asks for suspend-to-disk
-
-      user processes are stopped (in common case there are none, but with resume-from-initrd, no one knows)
-
-      read image from disk
-
-      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
-      		      with image restoration
-
-      image restoration: rewrite memory with image
-
-      resume(): devices are woken up so that system can continue
-
-      thaw all user processes
-
-Q: What is this 'Encrypt suspend image' for?
-
-A: First of all: it is not a replacement for dm-crypt encrypted swap.
-It cannot protect your computer while it is suspended. Instead it does
-protect from leaking sensitive data after resume from suspend.
-
-Think of the following: you suspend while an application is running
-that keeps sensitive data in memory. The application itself prevents
-the data from being swapped out. Suspend, however, must write these
-data to swap to be able to resume later on. Without suspend encryption
-your sensitive data are then stored in plaintext on disk.  This means
-that after resume your sensitive data are accessible to all
-applications having direct access to the swap device which was used
-for suspend. If you don't need swap after resume these data can remain
-on disk virtually forever. Thus it can happen that your system gets
-broken in weeks later and sensitive data which you thought were
-encrypted and protected are retrieved and stolen from the swap device.
-To prevent this situation you should use 'Encrypt suspend image'.
-
-During suspend a temporary key is created and this key is used to
-encrypt the data written to disk. When, during resume, the data was
-read back into memory the temporary key is destroyed which simply
-means that all data written to disk during suspend are then
-inaccessible so they can't be stolen later on.  The only thing that
-you must then take care of is that you call 'mkswap' for the swap
-partition used for suspend as early as possible during regular
-boot. This asserts that any temporary key from an oopsed suspend or
-from a failed or aborted resume is erased from the swap device.
-
-As a rule of thumb use encrypted swap to protect your data while your
-system is shut down or suspended. Additionally use the encrypted
-suspend image to prevent sensitive data from being stolen after
-resume.
-
-Q: Can I suspend to a swap file?
-
-A: Generally, yes, you can.  However, it requires you to use the "resume=" and
-"resume_offset=" kernel command line parameters, so the resume from a swap file
-cannot be initiated from an initrd or initramfs image.  See
-swsusp-and-swap-files.txt for details.
-
-Q: Is there a maximum system RAM size that is supported by swsusp?
-
-A: It should work okay with highmem.
-
-Q: Does swsusp (to disk) use only one swap partition or can it use
-multiple swap partitions (aggregate them into one logical space)?
-
-A: Only one swap partition, sorry.
-
-Q: If my application(s) causes lots of memory & swap space to be used
-(over half of the total system RAM), is it correct that it is likely
-to be useless to try to suspend to disk while that app is running?
-
-A: No, it should work okay, as long as your app does not mlock()
-it. Just prepare big enough swap partition.
-
-Q: What information is useful for debugging suspend-to-disk problems?
-
-A: Well, last messages on the screen are always useful. If something
-is broken, it is usually some kernel driver, therefore trying with as
-little as possible modules loaded helps a lot. I also prefer people to
-suspend from console, preferably without X running. Booting with
-init=/bin/bash, then swapon and starting suspend sequence manually
-usually does the trick. Then it is good idea to try with latest
-vanilla kernel.
-
-Q: How can distributions ship a swsusp-supporting kernel with modular
-disk drivers (especially SATA)?
-
-A: Well, it can be done, load the drivers, then do echo into
-/sys/power/resume file from initrd. Be sure not to mount
-anything, not even read-only mount, or you are going to lose your
-data.
-
-Q: How do I make suspend more verbose?
-
-A: If you want to see any non-error kernel messages on the virtual
-terminal the kernel switches to during suspend, you have to set the
-kernel console loglevel to at least 4 (KERN_WARNING), for example by
-doing
-
-	# save the old loglevel
-	read LOGLEVEL DUMMY < /proc/sys/kernel/printk
-	# set the loglevel so we see the progress bar.
-	# if the level is higher than needed, we leave it alone.
-	if [ $LOGLEVEL -lt 5 ]; then
-	        echo 5 > /proc/sys/kernel/printk
-		fi
-
-        IMG_SZ=0
-        read IMG_SZ < /sys/power/image_size
-        echo -n disk > /sys/power/state
-        RET=$?
-        #
-        # the logic here is:
-        # if image_size > 0 (without kernel support, IMG_SZ will be zero),
-        # then try again with image_size set to zero.
-	if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size
-                echo 0 > /sys/power/image_size
-                echo -n disk > /sys/power/state
-                RET=$?
-        fi
-
-	# restore previous loglevel
-	echo $LOGLEVEL > /proc/sys/kernel/printk
-	exit $RET
-
-Q: Is this true that if I have a mounted filesystem on a USB device and
-I suspend to disk, I can lose data unless the filesystem has been mounted
-with "sync"?
-
-A: That's right ... if you disconnect that device, you may lose data.
-In fact, even with "-o sync" you can lose data if your programs have
-information in buffers they haven't written out to a disk you disconnect,
-or if you disconnect before the device finished saving data you wrote.
-
-Software suspend normally powers down USB controllers, which is equivalent
-to disconnecting all USB devices attached to your system.
-
-Your system might well support low-power modes for its USB controllers
-while the system is asleep, maintaining the connection, using true sleep
-modes like "suspend-to-RAM" or "standby".  (Don't write "disk" to the
-/sys/power/state file; write "standby" or "mem".)  We've not seen any
-hardware that can use these modes through software suspend, although in
-theory some systems might support "platform" modes that won't break the
-USB connections.
-
-Remember that it's always a bad idea to unplug a disk drive containing a
-mounted filesystem.  That's true even when your system is asleep!  The
-safest thing is to unmount all filesystems on removable media (such USB,
-Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
-before suspending; then remount them after resuming.
-
-There is a work-around for this problem.  For more information, see
-Documentation/driver-api/usb/persist.rst.
-
-Q: Can I suspend-to-disk using a swap partition under LVM?
-
-A: Yes and No.  You can suspend successfully, but the kernel will not be able
-to resume on its own.  You need an initramfs that can recognize the resume
-situation, activate the logical volume containing the swap volume (but not
-touch any filesystems!), and eventually call
-
-echo -n "$major:$minor" > /sys/power/resume
-
-where $major and $minor are the respective major and minor device numbers of
-the swap volume.
-
-uswsusp works with LVM, too.  See http://suspend.sourceforge.net/
-
-Q: I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
-compiled with the similar configuration files. Anyway I found that
-suspend to disk (and resume) is much slower on 2.6.16 compared to
-2.6.15. Any idea for why that might happen or how can I speed it up?
-
-A: This is because the size of the suspend image is now greater than
-for 2.6.15 (by saving more data we can get more responsive system
-after resume).
-
-There's the /sys/power/image_size knob that controls the size of the
-image.  If you set it to 0 (eg. by echo 0 > /sys/power/image_size as
-root), the 2.6.15 behavior should be restored.  If it is still too
-slow, take a look at suspend.sf.net -- userland suspend is faster and
-supports LZF compression to speed it up further.
diff --git a/Documentation/power/tricks.rst b/Documentation/power/tricks.rst
new file mode 100644
index 000000000000..ca787f142c3f
--- /dev/null
+++ b/Documentation/power/tricks.rst
@@ -0,0 +1,29 @@
+================
+swsusp/S3 tricks
+================
+
+Pavel Machek <pavel@ucw.cz>
+
+If you want to trick swsusp/S3 into working, you might want to try:
+
+* go with minimal config, turn off drivers like USB, AGP you don't
+  really need
+
+* turn off APIC and preempt
+
+* use ext2. At least it has working fsck. [If something seems to go
+  wrong, force fsck when you have a chance]
+
+* turn off modules
+
+* use vga text console, shut down X. [If you really want X, you might
+  want to try vesafb later]
+
+* try running as few processes as possible, preferably go to single
+  user mode.
+
+* due to video issues, swsusp should be easier to get working than
+  S3. Try that first.
+
+When you make it work, try to find out what exactly was it that broke
+suspend, and preferably fix that.
diff --git a/Documentation/power/tricks.txt b/Documentation/power/tricks.txt
deleted file mode 100644
index a1b8f7249f4c..000000000000
--- a/Documentation/power/tricks.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-	swsusp/S3 tricks
-	~~~~~~~~~~~~~~~~
-Pavel Machek <pavel@ucw.cz>
-
-If you want to trick swsusp/S3 into working, you might want to try:
-
-* go with minimal config, turn off drivers like USB, AGP you don't
-  really need
-
-* turn off APIC and preempt
-
-* use ext2. At least it has working fsck. [If something seems to go
-  wrong, force fsck when you have a chance]
-
-* turn off modules
-
-* use vga text console, shut down X. [If you really want X, you might
-  want to try vesafb later]
-
-* try running as few processes as possible, preferably go to single
-  user mode.
-
-* due to video issues, swsusp should be easier to get working than
-  S3. Try that first.
-
-When you make it work, try to find out what exactly was it that broke
-suspend, and preferably fix that.
diff --git a/Documentation/power/userland-swsusp.rst b/Documentation/power/userland-swsusp.rst
new file mode 100644
index 000000000000..a0fa51bb1a4d
--- /dev/null
+++ b/Documentation/power/userland-swsusp.rst
@@ -0,0 +1,191 @@
+=====================================================
+Documentation for userland software suspend interface
+=====================================================
+
+	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+First, the warnings at the beginning of swsusp.txt still apply.
+
+Second, you should read the FAQ in swsusp.txt _now_ if you have not
+done it already.
+
+Now, to use the userland interface for software suspend you need special
+utilities that will read/write the system memory snapshot from/to the
+kernel.  Such utilities are available, for example, from
+<http://suspend.sourceforge.net>.  You may want to have a look at them if you
+are going to develop your own suspend/resume utilities.
+
+The interface consists of a character device providing the open(),
+release(), read(), and write() operations as well as several ioctl()
+commands defined in include/linux/suspend_ioctls.h .  The major and minor
+numbers of the device are, respectively, 10 and 231, and they can
+be read from /sys/class/misc/snapshot/dev.
+
+The device can be open either for reading or for writing.  If open for
+reading, it is considered to be in the suspend mode.  Otherwise it is
+assumed to be in the resume mode.  The device cannot be open for simultaneous
+reading and writing.  It is also impossible to have the device open more than
+once at a time.
+
+Even opening the device has side effects. Data structures are
+allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
+called.
+
+The ioctl() commands recognized by the device are:
+
+SNAPSHOT_FREEZE
+	freeze user space processes (the current process is
+	not frozen); this is required for SNAPSHOT_CREATE_IMAGE
+	and SNAPSHOT_ATOMIC_RESTORE to succeed
+
+SNAPSHOT_UNFREEZE
+	thaw user space processes frozen by SNAPSHOT_FREEZE
+
+SNAPSHOT_CREATE_IMAGE
+	create a snapshot of the system memory; the
+	last argument of ioctl() should be a pointer to an int variable,
+	the value of which will indicate whether the call returned after
+	creating the snapshot (1) or after restoring the system memory state
+	from it (0) (after resume the system finds itself finishing the
+	SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
+	has been created the read() operation can be used to transfer
+	it out of the kernel
+
+SNAPSHOT_ATOMIC_RESTORE
+	restore the system memory state from the
+	uploaded snapshot image; before calling it you should transfer
+	the system memory snapshot back to the kernel using the write()
+	operation; this call will not succeed if the snapshot
+	image is not available to the kernel
+
+SNAPSHOT_FREE
+	free memory allocated for the snapshot image
+
+SNAPSHOT_PREF_IMAGE_SIZE
+	set the preferred maximum size of the image
+	(the kernel will do its best to ensure the image size will not exceed
+	this number, but if it turns out to be impossible, the kernel will
+	create the smallest image possible)
+
+SNAPSHOT_GET_IMAGE_SIZE
+	return the actual size of the hibernation image
+
+SNAPSHOT_AVAIL_SWAP_SIZE
+	return the amount of available swap in bytes (the
+	last argument should be a pointer to an unsigned int variable that will
+	contain the result if the call is successful).
+
+SNAPSHOT_ALLOC_SWAP_PAGE
+	allocate a swap page from the resume partition
+	(the last argument should be a pointer to a loff_t variable that
+	will contain the swap page offset if the call is successful)
+
+SNAPSHOT_FREE_SWAP_PAGES
+	free all swap pages allocated by
+	SNAPSHOT_ALLOC_SWAP_PAGE
+
+SNAPSHOT_SET_SWAP_AREA
+	set the resume partition and the offset (in <PAGE_SIZE>
+	units) from the beginning of the partition at which the swap header is
+	located (the last ioctl() argument should point to a struct
+	resume_swap_area, as defined in kernel/power/suspend_ioctls.h,
+	containing the resume device specification and the offset); for swap
+	partitions the offset is always 0, but it is different from zero for
+	swap files (see Documentation/power/swsusp-and-swap-files.rst for
+	details).
+
+SNAPSHOT_PLATFORM_SUPPORT
+	enable/disable the hibernation platform support,
+	depending on the argument value (enable, if the argument is nonzero)
+
+SNAPSHOT_POWER_OFF
+	make the kernel transition the system to the hibernation
+	state (eg. ACPI S4) using the platform (eg. ACPI) driver
+
+SNAPSHOT_S2RAM
+	suspend to RAM; using this call causes the kernel to
+	immediately enter the suspend-to-RAM state, so this call must always
+	be preceded by the SNAPSHOT_FREEZE call and it is also necessary
+	to use the SNAPSHOT_UNFREEZE call after the system wakes up.  This call
+	is needed to implement the suspend-to-both mechanism in which the
+	suspend image is first created, as though the system had been suspended
+	to disk, and then the system is suspended to RAM (this makes it possible
+	to resume the system from RAM if there's enough battery power or restore
+	its state on the basis of the saved suspend image otherwise)
+
+The device's read() operation can be used to transfer the snapshot image from
+the kernel.  It has the following limitations:
+
+- you cannot read() more than one virtual memory page at a time
+- read()s across page boundaries are impossible (ie. if you read() 1/2 of
+  a page in the previous call, you will only be able to read()
+  **at most** 1/2 of the page in the next call)
+
+The device's write() operation is used for uploading the system memory snapshot
+into the kernel.  It has the same limitations as the read() operation.
+
+The release() operation frees all memory allocated for the snapshot image
+and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
+Thus it is not necessary to use either SNAPSHOT_FREE or
+SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
+unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
+still frozen when the device is being closed).
+
+Currently it is assumed that the userland utilities reading/writing the
+snapshot image from/to the kernel will use a swap partition, called the resume
+partition, or a swap file as storage space (if a swap file is used, the resume
+partition is the partition that holds this file).  However, this is not really
+required, as they can use, for example, a special (blank) suspend partition or
+a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
+mounted afterwards.
+
+These utilities MUST NOT make any assumptions regarding the ordering of
+data within the snapshot image.  The contents of the image are entirely owned
+by the kernel and its structure may be changed in future kernel releases.
+
+The snapshot image MUST be written to the kernel unaltered (ie. all of the image
+data, metadata and header MUST be written in _exactly_ the same amount, form
+and order in which they have been read).  Otherwise, the behavior of the
+resumed system may be totally unpredictable.
+
+While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
+structure of the snapshot image is consistent with the information stored
+in the image header.  If any inconsistencies are detected,
+SNAPSHOT_ATOMIC_RESTORE will not succeed.  Still, this is not a fool-proof
+mechanism and the userland utilities using the interface SHOULD use additional
+means, such as checksums, to ensure the integrity of the snapshot image.
+
+The suspending and resuming utilities MUST lock themselves in memory,
+preferably using mlockall(), before calling SNAPSHOT_FREEZE.
+
+The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
+in the memory location pointed to by the last argument of ioctl() and proceed
+in accordance with it:
+
+1. 	If the value is 1 (ie. the system memory snapshot has just been
+	created and the system is ready for saving it):
+
+	(a)	The suspending utility MUST NOT close the snapshot device
+		_unless_ the whole suspend procedure is to be cancelled, in
+		which case, if the snapshot image has already been saved, the
+		suspending utility SHOULD destroy it, preferably by zapping
+		its header.  If the suspend is not to be cancelled, the
+		system MUST be powered off or rebooted after the snapshot
+		image has been saved.
+	(b)	The suspending utility SHOULD NOT attempt to perform any
+		file system operations (including reads) on the file systems
+		that were mounted before SNAPSHOT_CREATE_IMAGE has been
+		called.  However, it MAY mount a file system that was not
+		mounted at that time and perform some operations on it (eg.
+		use it for saving the image).
+
+2.	If the value is 0 (ie. the system state has just been restored from
+	the snapshot image), the suspending utility MUST close the snapshot
+	device.  Afterwards it will be treated as a regular userland process,
+	so it need not exit.
+
+The resuming utility SHOULD NOT attempt to mount any file systems that could
+be mounted before suspend and SHOULD NOT attempt to perform any operations
+involving such file systems.
+
+For details, please refer to the source code.
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
deleted file mode 100644
index bbfcd1bbedc5..000000000000
--- a/Documentation/power/userland-swsusp.txt
+++ /dev/null
@@ -1,170 +0,0 @@
-Documentation for userland software suspend interface
-	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
-
-First, the warnings at the beginning of swsusp.txt still apply.
-
-Second, you should read the FAQ in swsusp.txt _now_ if you have not
-done it already.
-
-Now, to use the userland interface for software suspend you need special
-utilities that will read/write the system memory snapshot from/to the
-kernel.  Such utilities are available, for example, from
-<http://suspend.sourceforge.net>.  You may want to have a look at them if you
-are going to develop your own suspend/resume utilities.
-
-The interface consists of a character device providing the open(),
-release(), read(), and write() operations as well as several ioctl()
-commands defined in include/linux/suspend_ioctls.h .  The major and minor
-numbers of the device are, respectively, 10 and 231, and they can
-be read from /sys/class/misc/snapshot/dev.
-
-The device can be open either for reading or for writing.  If open for
-reading, it is considered to be in the suspend mode.  Otherwise it is
-assumed to be in the resume mode.  The device cannot be open for simultaneous
-reading and writing.  It is also impossible to have the device open more than
-once at a time.
-
-Even opening the device has side effects. Data structures are
-allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
-called.
-
-The ioctl() commands recognized by the device are:
-
-SNAPSHOT_FREEZE - freeze user space processes (the current process is
-	not frozen); this is required for SNAPSHOT_CREATE_IMAGE
-	and SNAPSHOT_ATOMIC_RESTORE to succeed
-
-SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE
-
-SNAPSHOT_CREATE_IMAGE - create a snapshot of the system memory; the
-	last argument of ioctl() should be a pointer to an int variable,
-	the value of which will indicate whether the call returned after
-	creating the snapshot (1) or after restoring the system memory state
-	from it (0) (after resume the system finds itself finishing the
-	SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
-	has been created the read() operation can be used to transfer
-	it out of the kernel
-
-SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the
-	uploaded snapshot image; before calling it you should transfer
-	the system memory snapshot back to the kernel using the write()
-	operation; this call will not succeed if the snapshot
-	image is not available to the kernel
-
-SNAPSHOT_FREE - free memory allocated for the snapshot image
-
-SNAPSHOT_PREF_IMAGE_SIZE - set the preferred maximum size of the image
-	(the kernel will do its best to ensure the image size will not exceed
-	this number, but if it turns out to be impossible, the kernel will
-	create the smallest image possible)
-
-SNAPSHOT_GET_IMAGE_SIZE - return the actual size of the hibernation image
-
-SNAPSHOT_AVAIL_SWAP_SIZE - return the amount of available swap in bytes (the
-	last argument should be a pointer to an unsigned int variable that will
-	contain the result if the call is successful).
-
-SNAPSHOT_ALLOC_SWAP_PAGE - allocate a swap page from the resume partition
-	(the last argument should be a pointer to a loff_t variable that
-	will contain the swap page offset if the call is successful)
-
-SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated by
-	SNAPSHOT_ALLOC_SWAP_PAGE
-
-SNAPSHOT_SET_SWAP_AREA - set the resume partition and the offset (in <PAGE_SIZE>
-	units) from the beginning of the partition at which the swap header is
-	located (the last ioctl() argument should point to a struct
-	resume_swap_area, as defined in kernel/power/suspend_ioctls.h,
-	containing the resume device specification and the offset); for swap
-	partitions the offset is always 0, but it is different from zero for
-	swap files (see Documentation/power/swsusp-and-swap-files.txt for
-	details).
-
-SNAPSHOT_PLATFORM_SUPPORT - enable/disable the hibernation platform support,
-	depending on the argument value (enable, if the argument is nonzero)
-
-SNAPSHOT_POWER_OFF - make the kernel transition the system to the hibernation
-	state (eg. ACPI S4) using the platform (eg. ACPI) driver
-
-SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to
-	immediately enter the suspend-to-RAM state, so this call must always
-	be preceded by the SNAPSHOT_FREEZE call and it is also necessary
-	to use the SNAPSHOT_UNFREEZE call after the system wakes up.  This call
-	is needed to implement the suspend-to-both mechanism in which the
-	suspend image is first created, as though the system had been suspended
-	to disk, and then the system is suspended to RAM (this makes it possible
-	to resume the system from RAM if there's enough battery power or restore
-	its state on the basis of the saved suspend image otherwise)
-
-The device's read() operation can be used to transfer the snapshot image from
-the kernel.  It has the following limitations:
-- you cannot read() more than one virtual memory page at a time
-- read()s across page boundaries are impossible (ie. if you read() 1/2 of
-	a page in the previous call, you will only be able to read()
-	_at_ _most_ 1/2 of the page in the next call)
-
-The device's write() operation is used for uploading the system memory snapshot
-into the kernel.  It has the same limitations as the read() operation.
-
-The release() operation frees all memory allocated for the snapshot image
-and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
-Thus it is not necessary to use either SNAPSHOT_FREE or
-SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
-unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
-still frozen when the device is being closed).
-
-Currently it is assumed that the userland utilities reading/writing the
-snapshot image from/to the kernel will use a swap partition, called the resume
-partition, or a swap file as storage space (if a swap file is used, the resume
-partition is the partition that holds this file).  However, this is not really
-required, as they can use, for example, a special (blank) suspend partition or
-a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
-mounted afterwards.
-
-These utilities MUST NOT make any assumptions regarding the ordering of
-data within the snapshot image.  The contents of the image are entirely owned
-by the kernel and its structure may be changed in future kernel releases.
-
-The snapshot image MUST be written to the kernel unaltered (ie. all of the image
-data, metadata and header MUST be written in _exactly_ the same amount, form
-and order in which they have been read).  Otherwise, the behavior of the
-resumed system may be totally unpredictable.
-
-While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
-structure of the snapshot image is consistent with the information stored
-in the image header.  If any inconsistencies are detected,
-SNAPSHOT_ATOMIC_RESTORE will not succeed.  Still, this is not a fool-proof
-mechanism and the userland utilities using the interface SHOULD use additional
-means, such as checksums, to ensure the integrity of the snapshot image.
-
-The suspending and resuming utilities MUST lock themselves in memory,
-preferably using mlockall(), before calling SNAPSHOT_FREEZE.
-
-The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
-in the memory location pointed to by the last argument of ioctl() and proceed
-in accordance with it:
-1. 	If the value is 1 (ie. the system memory snapshot has just been
-	created and the system is ready for saving it):
-	(a)	The suspending utility MUST NOT close the snapshot device
-		_unless_ the whole suspend procedure is to be cancelled, in
-		which case, if the snapshot image has already been saved, the
-		suspending utility SHOULD destroy it, preferably by zapping
-		its header.  If the suspend is not to be cancelled, the
-		system MUST be powered off or rebooted after the snapshot
-		image has been saved.
-	(b)	The suspending utility SHOULD NOT attempt to perform any
-		file system operations (including reads) on the file systems
-		that were mounted before SNAPSHOT_CREATE_IMAGE has been
-		called.  However, it MAY mount a file system that was not
-		mounted at that time and perform some operations on it (eg.
-		use it for saving the image).
-2.	If the value is 0 (ie. the system state has just been restored from
-	the snapshot image), the suspending utility MUST close the snapshot
-	device.  Afterwards it will be treated as a regular userland process,
-	so it need not exit.
-
-The resuming utility SHOULD NOT attempt to mount any file systems that could
-be mounted before suspend and SHOULD NOT attempt to perform any operations
-involving such file systems.
-
-For details, please refer to the source code.
diff --git a/Documentation/power/video.rst b/Documentation/power/video.rst
new file mode 100644
index 000000000000..337a2ba9f32f
--- /dev/null
+++ b/Documentation/power/video.rst
@@ -0,0 +1,213 @@
+===========================
+Video issues with S3 resume
+===========================
+
+2003-2006, Pavel Machek
+
+During S3 resume, hardware needs to be reinitialized. For most
+devices, this is easy, and kernel driver knows how to do
+it. Unfortunately there's one exception: video card. Those are usually
+initialized by BIOS, and kernel does not have enough information to
+boot video card. (Kernel usually does not even contain video card
+driver -- vesafb and vgacon are widely used).
+
+This is not problem for swsusp, because during swsusp resume, BIOS is
+run normally so video card is normally initialized. It should not be
+problem for S1 standby, because hardware should retain its state over
+that.
+
+We either have to run video BIOS during early resume, or interpret it
+using vbetool later, or maybe nothing is necessary on particular
+system because video state is preserved. Unfortunately different
+methods work on different systems, and no known method suits all of
+them.
+
+Userland application called s2ram has been developed; it contains long
+whitelist of systems, and automatically selects working method for a
+given system. It can be downloaded from CVS at
+www.sf.net/projects/suspend . If you get a system that is not in the
+whitelist, please try to find a working solution, and submit whitelist
+entry so that work does not need to be repeated.
+
+Currently, VBE_SAVE method (6 below) works on most
+systems. Unfortunately, vbetool only runs after userland is resumed,
+so it makes debugging of early resume problems
+hard/impossible. Methods that do not rely on userland are preferable.
+
+Details
+~~~~~~~
+
+There are a few types of systems where video works after S3 resume:
+
+(1) systems where video state is preserved over S3.
+
+(2) systems where it is possible to call the video BIOS during S3
+    resume. Unfortunately, it is not correct to call the video BIOS at
+    that point, but it happens to work on some machines. Use
+    acpi_sleep=s3_bios.
+
+(3) systems that initialize video card into vga text mode and where
+    the BIOS works well enough to be able to set video mode. Use
+    acpi_sleep=s3_mode on these.
+
+(4) on some systems s3_bios kicks video into text mode, and
+    acpi_sleep=s3_bios,s3_mode is needed.
+
+(5) radeon systems, where X can soft-boot your video card. You'll need
+    a new enough X, and a plain text console (no vesafb or radeonfb). See
+    http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information.
+    Alternatively, you should use vbetool (6) instead.
+
+(6) other radeon systems, where vbetool is enough to bring system back
+    to life. It needs text console to be working. Do vbetool vbestate
+    save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool
+    vbestate restore < /tmp/delme; setfont <whatever>, and your video
+    should work.
+
+(7) on some systems, it is possible to boot most of kernel, and then
+    POSTing bios works. Ole Rohne has patch to do just that at
+    http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2.
+
+(8) on some systems, you can use the video_post utility and or
+    do echo 3 > /sys/power/state  && /usr/sbin/video_post - which will
+    initialize the display in console mode. If you are in X, you can switch
+    to a virtual terminal and back to X using  CTRL+ALT+F1 - CTRL+ALT+F7 to get
+    the display working in graphical mode again.
+
+Now, if you pass acpi_sleep=something, and it does not work with your
+bios, you'll get a hard crash during resume. Be careful. Also it is
+safest to do your experiments with plain old VGA console. The vesafb
+and radeonfb (etc) drivers have a tendency to crash the machine during
+resume.
+
+You may have a system where none of above works. At that point you
+either invent another ugly hack that works, or write proper driver for
+your video card (good luck getting docs :-(). Maybe suspending from X
+(proper X, knowing your hardware, not XF68_FBcon) might have better
+chance of working.
+
+Table of known working notebooks:
+
+
+=============================== ===============================================
+Model                           hack (or "how to do it")
+=============================== ===============================================
+Acer Aspire 1406LC		ole's late BIOS init (7), turn off DRI
+Acer TM 230			s3_bios (2)
+Acer TM 242FX			vbetool (6)
+Acer TM C110			video_post (8)
+Acer TM C300                    vga=normal (only suspend on console, not in X),
+				vbetool (6) or video_post (8)
+Acer TM 4052LCi		        s3_bios (2)
+Acer TM 636Lci			s3_bios,s3_mode (4)
+Acer TM 650 (Radeon M7)		vga=normal plus boot-radeon (5) gets text
+				console back
+Acer TM 660			??? [#f1]_
+Acer TM 800			vga=normal, X patches, see webpage (5)
+				or vbetool (6)
+Acer TM 803			vga=normal, X patches, see webpage (5)
+				or vbetool (6)
+Acer TM 803LCi			vga=normal, vbetool (6)
+Arima W730a			vbetool needed (6)
+Asus L2400D                     s3_mode (3) [#f2]_ (S1 also works OK)
+Asus L3350M (SiS 740)           (6)
+Asus L3800C (Radeon M7)		s3_bios (2) (S1 also works OK)
+Asus M6887Ne			vga=normal, s3_bios (2), use radeon driver
+				instead of fglrx in x.org
+Athlon64 desktop prototype	s3_bios (2)
+Compal CL-50			??? [#f1]_
+Compaq Armada E500 - P3-700     none (1) (S1 also works OK)
+Compaq Evo N620c		vga=normal, s3_bios (2)
+Dell 600m, ATI R250 Lf		none (1), but needs xorg-x11-6.8.1.902-1
+Dell D600, ATI RV250            vga=normal and X, or try vbestate (6)
+Dell D610			vga=normal and X (possibly vbestate (6) too,
+				but not tested)
+Dell Inspiron 4000		??? [#f1]_
+Dell Inspiron 500m		??? [#f1]_
+Dell Inspiron 510m		???
+Dell Inspiron 5150		vbetool needed (6)
+Dell Inspiron 600m		??? [#f1]_
+Dell Inspiron 8200		??? [#f1]_
+Dell Inspiron 8500		??? [#f1]_
+Dell Inspiron 8600		??? [#f1]_
+eMachines athlon64 machines	vbetool needed (6) (someone please get
+				me model #s)
+HP NC6000			s3_bios, may not use radeonfb (2);
+				or vbetool (6)
+HP NX7000			??? [#f1]_
+HP Pavilion ZD7000		vbetool post needed, need open-source nv
+				driver for X
+HP Omnibook XE3	athlon version	none (1)
+HP Omnibook XE3GC		none (1), video is S3 Savage/IX-MV
+HP Omnibook XE3L-GF		vbetool (6)
+HP Omnibook 5150		none (1), (S1 also works OK)
+IBM TP T20, model 2647-44G	none (1), video is S3 Inc. 86C270-294
+				Savage/IX-MV, vesafb gets "interesting"
+				but X work.
+IBM TP A31 / Type 2652-M5G      s3_mode (3) [works ok with
+				BIOS 1.04 2002-08-23, but not at all with
+				BIOS 1.11 2004-11-05 :-(]
+IBM TP R32 / Type 2658-MMG      none (1)
+IBM TP R40 2722B3G		??? [#f1]_
+IBM TP R50p / Type 1832-22U     s3_bios (2)
+IBM TP R51			none (1)
+IBM TP T30	236681A		??? [#f1]_
+IBM TP T40 / Type 2373-MU4      none (1)
+IBM TP T40p			none (1)
+IBM TP R40p			s3_bios (2)
+IBM TP T41p			s3_bios (2), switch to X after resume
+IBM TP T42			s3_bios (2)
+IBM ThinkPad T42p (2373-GTG)	s3_bios (2)
+IBM TP X20			??? [#f1]_
+IBM TP X30			s3_bios, s3_mode (4)
+IBM TP X31 / Type 2672-XXH      none (1), use radeontool
+				(http://fdd.com/software/radeon/) to
+				turn off backlight.
+IBM TP X32			none (1), but backlight is on and video is
+				trashed after long suspend. s3_bios,
+				s3_mode (4) works too. Perhaps that gets
+				better results?
+IBM Thinkpad X40 Type 2371-7JG  s3_bios,s3_mode (4)
+IBM TP 600e			none(1), but a switch to console and
+				back to X is needed
+Medion MD4220			??? [#f1]_
+Samsung P35			vbetool needed (6)
+Sharp PC-AR10 (ATI rage)	none (1), backlight does not switch off
+Sony Vaio PCG-C1VRX/K		s3_bios (2)
+Sony Vaio PCG-F403		??? [#f1]_
+Sony Vaio PCG-GRT995MP		none (1), works with 'nv' X driver
+Sony Vaio PCG-GR7/K		none (1), but needs radeonfb, use
+				radeontool (http://fdd.com/software/radeon/)
+				to turn off backlight.
+Sony Vaio PCG-N505SN		??? [#f1]_
+Sony Vaio vgn-s260		X or boot-radeon can init it (5)
+Sony Vaio vgn-S580BH		vga=normal, but suspend from X. Console will
+				be blank unless you return to X.
+Sony Vaio vgn-FS115B		s3_bios (2),s3_mode (4)
+Toshiba Libretto L5		none (1)
+Toshiba Libretto 100CT/110CT    vbetool (6)
+Toshiba Portege 3020CT		s3_mode (3)
+Toshiba Satellite 4030CDT	s3_mode (3) (S1 also works OK)
+Toshiba Satellite 4080XCDT      s3_mode (3) (S1 also works OK)
+Toshiba Satellite 4090XCDT      ??? [#f1]_
+Toshiba Satellite P10-554       s3_bios,s3_mode (4)[#f3]_
+Toshiba M30                     (2) xor X with nvidia driver using internal AGP
+Uniwill 244IIO			??? [#f1]_
+=============================== ===============================================
+
+Known working desktop systems
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+=================== ============================= ========================
+Mainboard	    Graphics card                 hack (or "how to do it")
+=================== ============================= ========================
+Asus A7V8X	    nVidia RIVA TNT2 model 64	  s3_bios,s3_mode (4)
+=================== ============================= ========================
+
+
+.. [#f1] from https://wiki.ubuntu.com/HoaryPMResults, not sure
+         which options to use. If you know, please tell me.
+
+.. [#f2] To be tested with a newer kernel.
+
+.. [#f3] Not with SMP kernel, UP only.
diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt
deleted file mode 100644
index 3e6272bc4472..000000000000
--- a/Documentation/power/video.txt
+++ /dev/null
@@ -1,185 +0,0 @@
-
-		Video issues with S3 resume
-		~~~~~~~~~~~~~~~~~~~~~~~~~~~
-		  2003-2006, Pavel Machek
-
-During S3 resume, hardware needs to be reinitialized. For most
-devices, this is easy, and kernel driver knows how to do
-it. Unfortunately there's one exception: video card. Those are usually
-initialized by BIOS, and kernel does not have enough information to
-boot video card. (Kernel usually does not even contain video card
-driver -- vesafb and vgacon are widely used).
-
-This is not problem for swsusp, because during swsusp resume, BIOS is
-run normally so video card is normally initialized. It should not be
-problem for S1 standby, because hardware should retain its state over
-that.
-
-We either have to run video BIOS during early resume, or interpret it
-using vbetool later, or maybe nothing is necessary on particular
-system because video state is preserved. Unfortunately different
-methods work on different systems, and no known method suits all of
-them.
-
-Userland application called s2ram has been developed; it contains long
-whitelist of systems, and automatically selects working method for a
-given system. It can be downloaded from CVS at
-www.sf.net/projects/suspend . If you get a system that is not in the
-whitelist, please try to find a working solution, and submit whitelist
-entry so that work does not need to be repeated.
-
-Currently, VBE_SAVE method (6 below) works on most
-systems. Unfortunately, vbetool only runs after userland is resumed,
-so it makes debugging of early resume problems
-hard/impossible. Methods that do not rely on userland are preferable.
-
-Details
-~~~~~~~
-
-There are a few types of systems where video works after S3 resume:
-
-(1) systems where video state is preserved over S3.
-
-(2) systems where it is possible to call the video BIOS during S3
-  resume. Unfortunately, it is not correct to call the video BIOS at
-  that point, but it happens to work on some machines. Use
-  acpi_sleep=s3_bios.
-
-(3) systems that initialize video card into vga text mode and where
-  the BIOS works well enough to be able to set video mode. Use
-  acpi_sleep=s3_mode on these.
-
-(4) on some systems s3_bios kicks video into text mode, and
-  acpi_sleep=s3_bios,s3_mode is needed.
-
-(5) radeon systems, where X can soft-boot your video card. You'll need
-  a new enough X, and a plain text console (no vesafb or radeonfb). See
-  http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information.
-  Alternatively, you should use vbetool (6) instead.
-
-(6) other radeon systems, where vbetool is enough to bring system back
-  to life. It needs text console to be working. Do vbetool vbestate
-  save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool
-  vbestate restore < /tmp/delme; setfont <whatever>, and your video
-  should work.
-
-(7) on some systems, it is possible to boot most of kernel, and then
-  POSTing bios works. Ole Rohne has patch to do just that at
-  http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2.
-
-(8) on some systems, you can use the video_post utility and or 
-  do echo 3 > /sys/power/state  && /usr/sbin/video_post - which will 
-  initialize the display in console mode. If you are in X, you can switch
-  to a virtual terminal and back to X using  CTRL+ALT+F1 - CTRL+ALT+F7 to get
-  the display working in graphical mode again.
-
-Now, if you pass acpi_sleep=something, and it does not work with your
-bios, you'll get a hard crash during resume. Be careful. Also it is
-safest to do your experiments with plain old VGA console. The vesafb
-and radeonfb (etc) drivers have a tendency to crash the machine during
-resume.
-
-You may have a system where none of above works. At that point you
-either invent another ugly hack that works, or write proper driver for
-your video card (good luck getting docs :-(). Maybe suspending from X
-(proper X, knowing your hardware, not XF68_FBcon) might have better
-chance of working.
-
-Table of known working notebooks:
-
-Model                           hack (or "how to do it")
-------------------------------------------------------------------------------
-Acer Aspire 1406LC		ole's late BIOS init (7), turn off DRI
-Acer TM 230			s3_bios (2)
-Acer TM 242FX			vbetool (6)
-Acer TM C110			video_post (8)
-Acer TM C300                    vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8)
-Acer TM 4052LCi		        s3_bios (2)
-Acer TM 636Lci			s3_bios,s3_mode (4)
-Acer TM 650 (Radeon M7)		vga=normal plus boot-radeon (5) gets text console back
-Acer TM 660			??? (*)
-Acer TM 800			vga=normal, X patches, see webpage (5) or vbetool (6)
-Acer TM 803			vga=normal, X patches, see webpage (5) or vbetool (6)
-Acer TM 803LCi			vga=normal, vbetool (6)
-Arima W730a			vbetool needed (6)
-Asus L2400D                     s3_mode (3)(***) (S1 also works OK)
-Asus L3350M (SiS 740)           (6)
-Asus L3800C (Radeon M7)		s3_bios (2) (S1 also works OK)
-Asus M6887Ne			vga=normal, s3_bios (2), use radeon driver instead of fglrx in x.org
-Athlon64 desktop prototype	s3_bios (2)
-Compal CL-50			??? (*)
-Compaq Armada E500 - P3-700     none (1) (S1 also works OK)
-Compaq Evo N620c		vga=normal, s3_bios (2)
-Dell 600m, ATI R250 Lf		none (1), but needs xorg-x11-6.8.1.902-1
-Dell D600, ATI RV250            vga=normal and X, or try vbestate (6)
-Dell D610			vga=normal and X (possibly vbestate (6) too, but not tested)
-Dell Inspiron 4000		??? (*)
-Dell Inspiron 500m		??? (*)
-Dell Inspiron 510m		???
-Dell Inspiron 5150		vbetool needed (6)
-Dell Inspiron 600m		??? (*)
-Dell Inspiron 8200		??? (*)
-Dell Inspiron 8500		??? (*)
-Dell Inspiron 8600		??? (*)
-eMachines athlon64 machines	vbetool needed (6) (someone please get me model #s)
-HP NC6000			s3_bios, may not use radeonfb (2); or vbetool (6)
-HP NX7000			??? (*)
-HP Pavilion ZD7000		vbetool post needed, need open-source nv driver for X
-HP Omnibook XE3	athlon version	none (1)
-HP Omnibook XE3GC		none (1), video is S3 Savage/IX-MV
-HP Omnibook XE3L-GF		vbetool (6)
-HP Omnibook 5150		none (1), (S1 also works OK)
-IBM TP T20, model 2647-44G	none (1), video is S3 Inc. 86C270-294 Savage/IX-MV, vesafb gets "interesting" but X work.
-IBM TP A31 / Type 2652-M5G      s3_mode (3) [works ok with BIOS 1.04 2002-08-23, but not at all with BIOS 1.11 2004-11-05 :-(]
-IBM TP R32 / Type 2658-MMG      none (1)
-IBM TP R40 2722B3G		??? (*)
-IBM TP R50p / Type 1832-22U     s3_bios (2)
-IBM TP R51			none (1)
-IBM TP T30	236681A		??? (*)
-IBM TP T40 / Type 2373-MU4      none (1)
-IBM TP T40p			none (1)
-IBM TP R40p			s3_bios (2)
-IBM TP T41p			s3_bios (2), switch to X after resume
-IBM TP T42			s3_bios (2)
-IBM ThinkPad T42p (2373-GTG)	s3_bios (2)
-IBM TP X20			??? (*)
-IBM TP X30			s3_bios, s3_mode (4)
-IBM TP X31 / Type 2672-XXH      none (1), use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
-IBM TP X32			none (1), but backlight is on and video is trashed after long suspend. s3_bios,s3_mode (4) works too. Perhaps that gets better results?
-IBM Thinkpad X40 Type 2371-7JG  s3_bios,s3_mode (4)
-IBM TP 600e			none(1), but a switch to console and back to X is needed
-Medion MD4220			??? (*)
-Samsung P35			vbetool needed (6)
-Sharp PC-AR10 (ATI rage)	none (1), backlight does not switch off
-Sony Vaio PCG-C1VRX/K		s3_bios (2)
-Sony Vaio PCG-F403		??? (*)
-Sony Vaio PCG-GRT995MP		none (1), works with 'nv' X driver
-Sony Vaio PCG-GR7/K		none (1), but needs radeonfb, use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
-Sony Vaio PCG-N505SN		??? (*)
-Sony Vaio vgn-s260		X or boot-radeon can init it (5)
-Sony Vaio vgn-S580BH		vga=normal, but suspend from X. Console will be blank unless you return to X.
-Sony Vaio vgn-FS115B		s3_bios (2),s3_mode (4)
-Toshiba Libretto L5		none (1)
-Toshiba Libretto 100CT/110CT    vbetool (6)
-Toshiba Portege 3020CT		s3_mode (3)
-Toshiba Satellite 4030CDT	s3_mode (3) (S1 also works OK)
-Toshiba Satellite 4080XCDT      s3_mode (3) (S1 also works OK)
-Toshiba Satellite 4090XCDT      ??? (*)
-Toshiba Satellite P10-554       s3_bios,s3_mode (4)(****)
-Toshiba M30                     (2) xor X with nvidia driver using internal AGP
-Uniwill 244IIO			??? (*)
-
-Known working desktop systems
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Mainboard	    Graphics card                 hack (or "how to do it")
-------------------------------------------------------------------------------
-Asus A7V8X	    nVidia RIVA TNT2 model 64	  s3_bios,s3_mode (4)
-
-
-(*) from https://wiki.ubuntu.com/HoaryPMResults, not sure
-    which options to use. If you know, please tell me.
-
-(***) To be tested with a newer kernel.
-
-(****) Not with SMP kernel, UP only.
diff --git a/Documentation/process/submitting-drivers.rst b/Documentation/process/submitting-drivers.rst
index 58bc047e7b95..1acaa14903d6 100644
--- a/Documentation/process/submitting-drivers.rst
+++ b/Documentation/process/submitting-drivers.rst
@@ -117,7 +117,7 @@ PM support:
 		implemented") error.  You should also try to make sure that your
 		driver uses as little power as possible when it's not doing
 		anything.  For the driver testing instructions see
-		Documentation/power/drivers-testing.txt and for a relatively
+		Documentation/power/drivers-testing.rst and for a relatively
 		complete overview of the power management issues related to
 		drivers see :ref:`Documentation/driver-api/pm/devices.rst <driverapi_pm_devices>`.
 
diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt
index 197d81f4b836..d97207b9accb 100644
--- a/Documentation/scheduler/sched-energy.txt
+++ b/Documentation/scheduler/sched-energy.txt
@@ -22,7 +22,7 @@ the highest.
 
 The actual EM used by EAS is _not_ maintained by the scheduler, but by a
 dedicated framework. For details about this framework and what it provides,
-please refer to its documentation (see Documentation/power/energy-model.txt).
+please refer to its documentation (see Documentation/power/energy-model.rst).
 
 
 2. Background and Terminology
@@ -81,7 +81,7 @@ through the arch_scale_cpu_capacity() callback.
 
 The rest of platform knowledge used by EAS is directly read from the Energy
 Model (EM) framework. The EM of a platform is composed of a power cost table
-per 'performance domain' in the system (see Documentation/power/energy-model.txt
+per 'performance domain' in the system (see Documentation/power/energy-model.rst
 for futher details about performance domains).
 
 The scheduler manages references to the EM objects in the topology code when the
@@ -352,7 +352,7 @@ could be amended in the future if proven otherwise.
 EAS uses the EM of a platform to estimate the impact of scheduling decisions on
 energy. So, your platform must provide power cost tables to the EM framework in
 order to make EAS start. To do so, please refer to documentation of the
-independent EM framework in Documentation/power/energy-model.txt.
+independent EM framework in Documentation/power/energy-model.rst.
 
 Please also note that the scheduling domains need to be re-built after the
 EM has been registered in order to start EAS.
diff --git a/Documentation/trace/coresight-cpu-debug.txt b/Documentation/trace/coresight-cpu-debug.txt
index f07e38094b40..1a660a39e3c0 100644
--- a/Documentation/trace/coresight-cpu-debug.txt
+++ b/Documentation/trace/coresight-cpu-debug.txt
@@ -151,7 +151,7 @@ At the runtime you can disable idle states with below methods:
 
 It is possible to disable CPU idle states by way of the PM QoS
 subsystem, more specifically by using the "/dev/cpu_dma_latency"
-interface (see Documentation/power/pm_qos_interface.txt for more
+interface (see Documentation/power/pm_qos_interface.rst for more
 details).  As specified in the PM QoS documentation the requested
 parameter will stay in effect until the file descriptor is released.
 For example:
diff --git a/Documentation/translations/zh_CN/process/submitting-drivers.rst b/Documentation/translations/zh_CN/process/submitting-drivers.rst
index 72c6cd935821..f1c3906c69a8 100644
--- a/Documentation/translations/zh_CN/process/submitting-drivers.rst
+++ b/Documentation/translations/zh_CN/process/submitting-drivers.rst
@@ -97,7 +97,7 @@ Linux 2.6:
 		函数定义成返回 -ENOSYS（功能未实现）错误。你还应该尝试确
 		保你的驱动在什么都不干的情况下将耗电降到最低。要获得驱动
 		程序测试的指导，请参阅
-		Documentation/power/drivers-testing.txt。有关驱动程序电
+		Documentation/power/drivers-testing.rst。有关驱动程序电
 		源管理问题相对全面的概述，请参阅
 		Documentation/driver-api/pm/devices.rst。
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 9c382053ce6a..5a6137df3f0e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6446,7 +6446,7 @@ M:	"Rafael J. Wysocki" <rjw@rjwysocki.net>
 M:	Pavel Machek <pavel@ucw.cz>
 L:	linux-pm@vger.kernel.org
 S:	Supported
-F:	Documentation/power/freezing-of-tasks.txt
+F:	Documentation/power/freezing-of-tasks.rst
 F:	include/linux/freezer.h
 F:	kernel/freezer.c
 
@@ -11764,7 +11764,7 @@ S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git
 F:	drivers/opp/
 F:	include/linux/pm_opp.h
-F:	Documentation/power/opp.txt
+F:	Documentation/power/opp.rst
 F:	Documentation/devicetree/bindings/opp/
 
 OPL4 DRIVER
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2bbbd4d1ba31..77a724771dbb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2447,7 +2447,7 @@ menuconfig APM
 	  machines with more than one CPU.
 
 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/power/apm-acpi.txt>
+	  and more information, read <file:Documentation/power/apm-acpi.rst>
 	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 066fd2a12851..10d040e2e807 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1175,7 +1175,7 @@ struct skl_wm_params {
  * to be disabled. This shouldn't happen and we'll print some error messages in
  * case it happens.
  *
- * For more, read the Documentation/power/runtime_pm.txt.
+ * For more, read the Documentation/power/runtime_pm.rst.
  */
 struct i915_runtime_pm {
 	atomic_t wakeref_count;
diff --git a/drivers/opp/Kconfig b/drivers/opp/Kconfig
index a7fbb93f302c..1f64a3d46c8a 100644
--- a/drivers/opp/Kconfig
+++ b/drivers/opp/Kconfig
@@ -10,4 +10,4 @@ config PM_OPP
 	  OPP layer organizes the data internally using device pointers
 	  representing individual voltage domains and provides SOC
 	  implementations a ready to use framework to manage OPPs.
-	  For more information, read <file:Documentation/power/opp.txt>
+	  For more information, read <file:Documentation/power/opp.rst>
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index f7033ecf6d0b..11f9c875b028 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -607,7 +607,7 @@ int power_supply_get_battery_info(struct power_supply *psy,
 
 	/* The property and field names below must correspond to elements
 	 * in enum power_supply_property. For reasoning, see
-	 * Documentation/power/power_supply_class.txt.
+	 * Documentation/power/power_supply_class.rst.
 	 */
 
 	of_property_read_u32(battery_np, "energy-full-design-microwatt-hours",
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c7eef32e7739..5b8328a99b2a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -52,7 +52,7 @@
  *                irq line disabled until the threaded handler has been run.
  * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend.  Does not guarantee
  *                   that this interrupt will wake the system from a suspended
- *                   state.  See Documentation/power/suspend-and-interrupts.txt
+ *                   state.  See Documentation/power/suspend-and-interrupts.rst
  * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
  * IRQF_NO_THREAD - Interrupt cannot be threaded
  * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b74b2a4e6df2..3d9a167ca5c3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -807,7 +807,7 @@ struct module;
  * @suspend_late: Put device into low power state.
  * @resume_early: Wake device from low power state.
  * @resume:	Wake device from low power state.
- *		(Please see Documentation/power/pci.txt for descriptions
+ *		(Please see Documentation/power/pci.rst for descriptions
  *		of PCI Power Management and the related functions.)
  * @shutdown:	Hook into reboot_notifier_list (kernel/sys.c).
  *		Intended to stop any idling DMA operations.
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 66c19a65a514..c14ad8bc1a41 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -284,7 +284,7 @@ typedef struct pm_message {
  * actions to be performed by a device driver's callbacks generally depend on
  * the platform and subsystem the device belongs to.
  *
- * Refer to Documentation/power/runtime_pm.txt for more information about the
+ * Refer to Documentation/power/runtime_pm.rst for more information about the
  * role of the @runtime_suspend(), @runtime_resume() and @runtime_idle()
  * callbacks in device runtime power management.
  */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9bbaaab14b36..7a4dda9e5309 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -65,7 +65,7 @@ config HIBERNATION
 	  need to run mkswap against the swap partition used for the suspend.
 
 	  It also works with swap files to a limited extent (for details see
-	  <file:Documentation/power/swsusp-and-swap-files.txt>).
+	  <file:Documentation/power/swsusp-and-swap-files.rst>).
 
 	  Right now you may boot without resuming and resume later but in the
 	  meantime you cannot use the swap partition(s)/file(s) involved in
@@ -74,7 +74,7 @@ config HIBERNATION
 	  MOUNT any journaled filesystems mounted before the suspend or they
 	  will get corrupted in a nasty way.
 
-	  For more information take a look at <file:Documentation/power/swsusp.txt>.
+	  For more information take a look at <file:Documentation/power/swsusp.rst>.
 
 config ARCH_SAVE_PAGE_KEYS
 	bool
@@ -255,7 +255,7 @@ config APM_EMULATION
 	  notification of APM "events" (e.g. battery status change).
 
 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/power/apm-acpi.txt>
+	  and more information, read <file:Documentation/power/apm-acpi.rst>
 	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index 41722046b937..0cd26289bfbc 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -165,7 +165,7 @@ config CFG80211_DEFAULT_PS
 
 	  If this causes your applications to misbehave you should fix your
 	  applications instead -- they need to register their network
-	  latency requirement, see Documentation/power/pm_qos_interface.txt.
+	  latency requirement, see Documentation/power/pm_qos_interface.rst.
 
 config CFG80211_DEBUGFS
 	bool "cfg80211 DebugFS entries"
-- 
cgit v1.2.3


From e1714daad7cf8fe4d6dd91adcfbbdd0604b0210d Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@the-dreams.de>
Date: Mon, 3 Jun 2019 10:25:31 +0200
Subject: i2c: headers: don't use 'dev' as adapter variable

It is not a struct device, so 'dev' is confusing. Use 'adap', the most
common name.

Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index e982b8913b73..6bd199cfe61f 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -703,14 +703,14 @@ struct i2c_adapter {
 };
 #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)
 
-static inline void *i2c_get_adapdata(const struct i2c_adapter *dev)
+static inline void *i2c_get_adapdata(const struct i2c_adapter *adap)
 {
-	return dev_get_drvdata(&dev->dev);
+	return dev_get_drvdata(&adap->dev);
 }
 
-static inline void i2c_set_adapdata(struct i2c_adapter *dev, void *data)
+static inline void i2c_set_adapdata(struct i2c_adapter *adap, void *data)
 {
-	dev_set_drvdata(&dev->dev, data);
+	dev_set_drvdata(&adap->dev, data);
 }
 
 static inline struct i2c_adapter *
-- 
cgit v1.2.3


From d68222d4d6647611be5a32c80a53a145e7c80ce9 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@the-dreams.de>
Date: Mon, 3 Jun 2019 10:25:32 +0200
Subject: i2c: headers: always have a named variable in arguments

Much better to read and understand. Naming for i2c_adapter is not
consistent (yet), so use the name which is also used in core code.

Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 6bd199cfe61f..14e04fb4f46f 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -40,7 +40,8 @@ struct i2c_device_identity;
 union i2c_smbus_data;
 struct i2c_board_info;
 enum i2c_slave_event;
-typedef int (*i2c_slave_cb_t)(struct i2c_client *, enum i2c_slave_event, u8 *);
+typedef int (*i2c_slave_cb_t)(struct i2c_client *client,
+			      enum i2c_slave_event event, u8 *val);
 
 struct module;
 struct property_entry;
@@ -257,16 +258,16 @@ struct i2c_driver {
 	unsigned int class;
 
 	/* Standard driver model interfaces */
-	int (*probe)(struct i2c_client *, const struct i2c_device_id *);
-	int (*remove)(struct i2c_client *);
+	int (*probe)(struct i2c_client *client, const struct i2c_device_id *id);
+	int (*remove)(struct i2c_client *client);
 
 	/* New driver model interface to aid the seamless removal of the
 	 * current probe()'s, more commonly unused than used second parameter.
 	 */
-	int (*probe_new)(struct i2c_client *);
+	int (*probe_new)(struct i2c_client *client);
 
 	/* driver model interfaces that don't relate to enumeration  */
-	void (*shutdown)(struct i2c_client *);
+	void (*shutdown)(struct i2c_client *client);
 
 	/* Alert callback, for example for the SMBus alert protocol.
 	 * The format and meaning of the data value depends on the protocol.
@@ -275,7 +276,7 @@ struct i2c_driver {
 	 * For the SMBus Host Notify protocol, the data corresponds to the
 	 * 16-bit payload data reported by the slave device acting as master.
 	 */
-	void (*alert)(struct i2c_client *, enum i2c_alert_protocol protocol,
+	void (*alert)(struct i2c_client *client, enum i2c_alert_protocol protocol,
 		      unsigned int data);
 
 	/* a ioctl like command that can be used to perform specific functions
@@ -287,7 +288,7 @@ struct i2c_driver {
 	const struct i2c_device_id *id_table;
 
 	/* Device detection callback for automatic device creation */
-	int (*detect)(struct i2c_client *, struct i2c_board_info *);
+	int (*detect)(struct i2c_client *client, struct i2c_board_info *info);
 	const unsigned short *address_list;
 	struct list_head clients;
 
@@ -447,10 +448,10 @@ extern struct i2c_client *
 i2c_new_probed_device(struct i2c_adapter *adap,
 		      struct i2c_board_info *info,
 		      unsigned short const *addr_list,
-		      int (*probe)(struct i2c_adapter *, unsigned short addr));
+		      int (*probe)(struct i2c_adapter *adap, unsigned short addr));
 
 /* Common custom probe functions */
-extern int i2c_probe_func_quick_read(struct i2c_adapter *, unsigned short addr);
+extern int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short addr);
 
 /* For devices that use several addresses, use i2c_new_dummy() to make
  * client handles for the extra addresses.
@@ -466,7 +467,7 @@ i2c_new_secondary_device(struct i2c_client *client,
 				const char *name,
 				u16 default_addr);
 
-extern void i2c_unregister_device(struct i2c_client *);
+extern void i2c_unregister_device(struct i2c_client *client);
 #endif /* I2C */
 
 /* Mainboard arch_initcall() code should register all its I2C devices.
@@ -551,9 +552,9 @@ struct i2c_algorithm {
  * The main operations are wrapped by i2c_lock_bus and i2c_unlock_bus.
  */
 struct i2c_lock_operations {
-	void (*lock_bus)(struct i2c_adapter *, unsigned int flags);
-	int (*trylock_bus)(struct i2c_adapter *, unsigned int flags);
-	void (*unlock_bus)(struct i2c_adapter *, unsigned int flags);
+	void (*lock_bus)(struct i2c_adapter *adapter, unsigned int flags);
+	int (*trylock_bus)(struct i2c_adapter *adapter, unsigned int flags);
+	void (*unlock_bus)(struct i2c_adapter *adapter, unsigned int flags);
 };
 
 /**
@@ -726,7 +727,7 @@ i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter)
 		return NULL;
 }
 
-int i2c_for_each_dev(void *data, int (*fn)(struct device *, void *));
+int i2c_for_each_dev(void *data, int (*fn)(struct device *dev, void *data));
 
 /* Adapter locking functions, exported for shared pin cases */
 #define I2C_LOCK_ROOT_ADAPTER BIT(0)
@@ -832,12 +833,12 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap)
 /* administration...
  */
 #if IS_ENABLED(CONFIG_I2C)
-extern int i2c_add_adapter(struct i2c_adapter *);
-extern void i2c_del_adapter(struct i2c_adapter *);
-extern int i2c_add_numbered_adapter(struct i2c_adapter *);
+extern int i2c_add_adapter(struct i2c_adapter *adap);
+extern void i2c_del_adapter(struct i2c_adapter *adap);
+extern int i2c_add_numbered_adapter(struct i2c_adapter *adap);
 
-extern int i2c_register_driver(struct module *, struct i2c_driver *);
-extern void i2c_del_driver(struct i2c_driver *);
+extern int i2c_register_driver(struct module *owner, struct i2c_driver *driver);
+extern void i2c_del_driver(struct i2c_driver *driver);
 
 /* use a define to avoid include chaining to get THIS_MODULE */
 #define i2c_add_driver(driver) \
-- 
cgit v1.2.3


From 2caea56f569ac361fc854f6bf2fe94b70514c917 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@the-dreams.de>
Date: Mon, 3 Jun 2019 10:25:34 +0200
Subject: i2c: headers: update docs about I2C_CLIENT_*

Update kerneldoc for i2c client flags because they increased over time.
Also, move them to a position where they can be more easily found.

Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 14e04fb4f46f..9853fae9b505 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -298,8 +298,7 @@ struct i2c_driver {
 
 /**
  * struct i2c_client - represent an I2C slave device
- * @flags: I2C_CLIENT_TEN indicates the device uses a ten bit chip address;
- *	I2C_CLIENT_PEC indicates it uses SMBus Packet Error Checking
+ * @flags: see I2C_CLIENT_* for possible flags
  * @addr: Address used on the I2C bus connected to the parent adapter.
  * @name: Indicates the type of the device, usually a chip name that's
  *	generic enough to hide second-sourcing and compatible revisions.
@@ -317,6 +316,15 @@ struct i2c_driver {
  */
 struct i2c_client {
 	unsigned short flags;		/* div., see below		*/
+#define I2C_CLIENT_PEC		0x04	/* Use Packet Error Checking */
+#define I2C_CLIENT_TEN		0x10	/* we have a ten bit chip address */
+					/* Must equal I2C_M_TEN below */
+#define I2C_CLIENT_SLAVE	0x20	/* we are the slave */
+#define I2C_CLIENT_HOST_NOTIFY	0x40	/* We want to use I2C host notify */
+#define I2C_CLIENT_WAKE		0x80	/* for board_info; true iff can wake */
+#define I2C_CLIENT_SCCB		0x9000	/* Use Omnivision SCCB protocol */
+					/* Must match I2C_M_STOP|IGNORE_NAK */
+
 	unsigned short addr;		/* chip address - NOTE: 7bit	*/
 					/* addresses are stored in the	*/
 					/* _LOWER_ 7 bits		*/
@@ -803,16 +811,6 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap)
 	i2c_unlock_bus(adap, I2C_LOCK_ROOT_ADAPTER);
 }
 
-/*flags for the client struct: */
-#define I2C_CLIENT_PEC		0x04	/* Use Packet Error Checking */
-#define I2C_CLIENT_TEN		0x10	/* we have a ten bit chip address */
-					/* Must equal I2C_M_TEN below */
-#define I2C_CLIENT_SLAVE	0x20	/* we are the slave */
-#define I2C_CLIENT_HOST_NOTIFY	0x40	/* We want to use I2C host notify */
-#define I2C_CLIENT_WAKE		0x80	/* for board_info; true iff can wake */
-#define I2C_CLIENT_SCCB		0x9000	/* Use Omnivision SCCB protocol */
-					/* Must match I2C_M_STOP|IGNORE_NAK */
-
 /* i2c adapter classes (bitmask) */
 #define I2C_CLASS_HWMON		(1<<0)	/* lm_sensors, ... */
 #define I2C_CLASS_DDC		(1<<3)	/* DDC bus on graphics adapters */
-- 
cgit v1.2.3


From 76cc9f0efd952d376e93e79b1f19fd6fdb8291bc Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@the-dreams.de>
Date: Mon, 3 Jun 2019 10:25:35 +0200
Subject: i2c: headers: reformat header comment and update copyright

Let's stick to coding style.

Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 9853fae9b505..d8f9060179d0 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -1,16 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* ------------------------------------------------------------------------- */
-/*									     */
-/* i2c.h - definitions for the i2c-bus interface			     */
-/*									     */
-/* ------------------------------------------------------------------------- */
-/*   Copyright (C) 1995-2000 Simon G. Vogl
-
+/*
+ * i2c.h - definitions for the Linux i2c bus interface
+ * Copyright (C) 1995-2000 Simon G. Vogl
+ * Copyright (C) 2013-2019 Wolfram Sang <wsa@the-dreams.de>
+ *
+ * With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> and
+ * Frodo Looijaard <frodol@dds.nl>
  */
-/* ------------------------------------------------------------------------- */
-
-/* With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> and
-   Frodo Looijaard <frodol@dds.nl> */
 #ifndef _LINUX_I2C_H
 #define _LINUX_I2C_H
 
-- 
cgit v1.2.3


From 013e868bc9465452c7b667830712ab57de236d08 Mon Sep 17 00:00:00 2001
From: Keerthy <j-keerthy@ti.com>
Date: Wed, 15 May 2019 15:38:47 +0530
Subject: mfd: lp87565: Add support for 4-phase LP87561 combination

Add support for 4-phase LP87561 combination.

Data Sheet: https://www.ti.com/lit/ds/symlink/lp87561-q1.pdf

Signed-off-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/lp87565.c       | 4 ++++
 include/linux/mfd/lp87565.h | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/lp87565.c b/drivers/mfd/lp87565.c
index 32d2a07d4354..8ad688fe75f9 100644
--- a/drivers/mfd/lp87565.c
+++ b/drivers/mfd/lp87565.c
@@ -33,6 +33,10 @@ static const struct of_device_id of_lp87565_match_table[] = {
 		.compatible = "ti,lp87565-q1",
 		.data = (void *)LP87565_DEVICE_TYPE_LP87565_Q1,
 	},
+	{
+		.compatible = "ti,lp87561-q1",
+		.data = (void *)LP87565_DEVICE_TYPE_LP87561_Q1,
+	},
 	{}
 };
 MODULE_DEVICE_TABLE(of, of_lp87565_match_table);
diff --git a/include/linux/mfd/lp87565.h b/include/linux/mfd/lp87565.h
index d0c91ba65525..976447607ea2 100644
--- a/include/linux/mfd/lp87565.h
+++ b/include/linux/mfd/lp87565.h
@@ -17,6 +17,7 @@
 
 enum lp87565_device_type {
 	LP87565_DEVICE_TYPE_UNKNOWN	= 0,
+	LP87565_DEVICE_TYPE_LP87561_Q1,
 	LP87565_DEVICE_TYPE_LP87565_Q1,
 };
 
@@ -249,6 +250,7 @@ enum LP87565_regulator_id {
 	LP87565_BUCK_3,
 	LP87565_BUCK_10,
 	LP87565_BUCK_23,
+	LP87565_BUCK_3210,
 };
 
 /**
-- 
cgit v1.2.3


From 5ec47cda74e98ad2f723f93b4a97ba87638338aa Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 3 Jun 2019 17:12:33 +0900
Subject: memory: move jedec_ddr.h from include/memory to drivers/memory/

Now that jedec_ddr_data.c was moved from lib/ to drivers/memory/,
<memory/jedec_ddr.h> is included only from drivers/memory/.

Make it a local header of drivers/memory/.

The directory include/memory is now gone.

While I am here, I also changed #include <linux/module.h> to
<linux/export.h>. Because CONFIG_DDR is bool, jedec_ddr_data.c is
never compiled as a module.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Olof Johansson <olof@lixom.net>
---
 drivers/memory/emif.c           |   3 +-
 drivers/memory/jedec_ddr.h      | 175 ++++++++++++++++++++++++++++++++++++++++
 drivers/memory/jedec_ddr_data.c |   5 +-
 drivers/memory/of_memory.c      |   3 +-
 include/memory/jedec_ddr.h      | 175 ----------------------------------------
 5 files changed, 182 insertions(+), 179 deletions(-)
 create mode 100644 drivers/memory/jedec_ddr.h
 delete mode 100644 include/memory/jedec_ddr.h

(limited to 'include')

diff --git a/drivers/memory/emif.c b/drivers/memory/emif.c
index 2f214440008c..32cad7540d78 100644
--- a/drivers/memory/emif.c
+++ b/drivers/memory/emif.c
@@ -26,8 +26,9 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/pm.h>
-#include <memory/jedec_ddr.h>
+
 #include "emif.h"
+#include "jedec_ddr.h"
 #include "of_memory.h"
 
 /**
diff --git a/drivers/memory/jedec_ddr.h b/drivers/memory/jedec_ddr.h
new file mode 100644
index 000000000000..a2094a9a588e
--- /dev/null
+++ b/drivers/memory/jedec_ddr.h
@@ -0,0 +1,175 @@
+/*
+ * Definitions for DDR memories based on JEDEC specs
+ *
+ * Copyright (C) 2012 Texas Instruments, Inc.
+ *
+ * Aneesh V <aneesh@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __JEDEC_DDR_H
+#define __JEDEC_DDR_H
+
+#include <linux/types.h>
+
+/* DDR Densities */
+#define DDR_DENSITY_64Mb	1
+#define DDR_DENSITY_128Mb	2
+#define DDR_DENSITY_256Mb	3
+#define DDR_DENSITY_512Mb	4
+#define DDR_DENSITY_1Gb		5
+#define DDR_DENSITY_2Gb		6
+#define DDR_DENSITY_4Gb		7
+#define DDR_DENSITY_8Gb		8
+#define DDR_DENSITY_16Gb	9
+#define DDR_DENSITY_32Gb	10
+
+/* DDR type */
+#define DDR_TYPE_DDR2		1
+#define DDR_TYPE_DDR3		2
+#define DDR_TYPE_LPDDR2_S4	3
+#define DDR_TYPE_LPDDR2_S2	4
+#define DDR_TYPE_LPDDR2_NVM	5
+
+/* DDR IO width */
+#define DDR_IO_WIDTH_4		1
+#define DDR_IO_WIDTH_8		2
+#define DDR_IO_WIDTH_16		3
+#define DDR_IO_WIDTH_32		4
+
+/* Number of Row bits */
+#define R9			9
+#define R10			10
+#define R11			11
+#define R12			12
+#define R13			13
+#define R14			14
+#define R15			15
+#define R16			16
+
+/* Number of Column bits */
+#define C7			7
+#define C8			8
+#define C9			9
+#define C10			10
+#define C11			11
+#define C12			12
+
+/* Number of Banks */
+#define B1			0
+#define B2			1
+#define B4			2
+#define B8			3
+
+/* Refresh rate in nano-seconds */
+#define T_REFI_15_6		15600
+#define T_REFI_7_8		7800
+#define T_REFI_3_9		3900
+
+/* tRFC values */
+#define T_RFC_90		90000
+#define T_RFC_110		110000
+#define T_RFC_130		130000
+#define T_RFC_160		160000
+#define T_RFC_210		210000
+#define T_RFC_300		300000
+#define T_RFC_350		350000
+
+/* Mode register numbers */
+#define DDR_MR0			0
+#define DDR_MR1			1
+#define DDR_MR2			2
+#define DDR_MR3			3
+#define DDR_MR4			4
+#define DDR_MR5			5
+#define DDR_MR6			6
+#define DDR_MR7			7
+#define DDR_MR8			8
+#define DDR_MR9			9
+#define DDR_MR10		10
+#define DDR_MR11		11
+#define DDR_MR16		16
+#define DDR_MR17		17
+#define DDR_MR18		18
+
+/*
+ * LPDDR2 related defines
+ */
+
+/* MR4 register fields */
+#define MR4_SDRAM_REF_RATE_SHIFT			0
+#define MR4_SDRAM_REF_RATE_MASK				7
+#define MR4_TUF_SHIFT					7
+#define MR4_TUF_MASK					(1 << 7)
+
+/* MR4 SDRAM Refresh Rate field values */
+#define SDRAM_TEMP_NOMINAL				0x3
+#define SDRAM_TEMP_RESERVED_4				0x4
+#define SDRAM_TEMP_HIGH_DERATE_REFRESH			0x5
+#define SDRAM_TEMP_HIGH_DERATE_REFRESH_AND_TIMINGS	0x6
+#define SDRAM_TEMP_VERY_HIGH_SHUTDOWN			0x7
+
+#define NUM_DDR_ADDR_TABLE_ENTRIES			11
+#define NUM_DDR_TIMING_TABLE_ENTRIES			4
+
+/* Structure for DDR addressing info from the JEDEC spec */
+struct lpddr2_addressing {
+	u32 num_banks;
+	u32 tREFI_ns;
+	u32 tRFCab_ps;
+};
+
+/*
+ * Structure for timings from the LPDDR2 datasheet
+ * All parameters are in pico seconds(ps) unless explicitly indicated
+ * with a suffix like tRAS_max_ns below
+ */
+struct lpddr2_timings {
+	u32 max_freq;
+	u32 min_freq;
+	u32 tRPab;
+	u32 tRCD;
+	u32 tWR;
+	u32 tRAS_min;
+	u32 tRRD;
+	u32 tWTR;
+	u32 tXP;
+	u32 tRTP;
+	u32 tCKESR;
+	u32 tDQSCK_max;
+	u32 tDQSCK_max_derated;
+	u32 tFAW;
+	u32 tZQCS;
+	u32 tZQCL;
+	u32 tZQinit;
+	u32 tRAS_max_ns;
+};
+
+/*
+ * Min value for some parameters in terms of number of tCK cycles(nCK)
+ * Please set to zero parameters that are not valid for a given memory
+ * type
+ */
+struct lpddr2_min_tck {
+	u32 tRPab;
+	u32 tRCD;
+	u32 tWR;
+	u32 tRASmin;
+	u32 tRRD;
+	u32 tWTR;
+	u32 tXP;
+	u32 tRTP;
+	u32 tCKE;
+	u32 tCKESR;
+	u32 tFAW;
+};
+
+extern const struct lpddr2_addressing
+	lpddr2_jedec_addressing_table[NUM_DDR_ADDR_TABLE_ENTRIES];
+extern const struct lpddr2_timings
+	lpddr2_jedec_timings[NUM_DDR_TIMING_TABLE_ENTRIES];
+extern const struct lpddr2_min_tck lpddr2_jedec_min_tck;
+
+#endif /* __JEDEC_DDR_H */
diff --git a/drivers/memory/jedec_ddr_data.c b/drivers/memory/jedec_ddr_data.c
index 6d2cbf1d567f..1f9ca0f23407 100644
--- a/drivers/memory/jedec_ddr_data.c
+++ b/drivers/memory/jedec_ddr_data.c
@@ -10,8 +10,9 @@
  * published by the Free Software Foundation.
  */
 
-#include <memory/jedec_ddr.h>
-#include <linux/module.h>
+#include <linux/export.h>
+
+#include "jedec_ddr.h"
 
 /* LPDDR2 addressing details from JESD209-2 section 2.4 */
 const struct lpddr2_addressing
diff --git a/drivers/memory/of_memory.c b/drivers/memory/of_memory.c
index 12a61f558644..46539b27a3fb 100644
--- a/drivers/memory/of_memory.c
+++ b/drivers/memory/of_memory.c
@@ -10,8 +10,9 @@
 #include <linux/list.h>
 #include <linux/of.h>
 #include <linux/gfp.h>
-#include <memory/jedec_ddr.h>
 #include <linux/export.h>
+
+#include "jedec_ddr.h"
 #include "of_memory.h"
 
 /**
diff --git a/include/memory/jedec_ddr.h b/include/memory/jedec_ddr.h
deleted file mode 100644
index ddad0f870e5d..000000000000
--- a/include/memory/jedec_ddr.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Definitions for DDR memories based on JEDEC specs
- *
- * Copyright (C) 2012 Texas Instruments, Inc.
- *
- * Aneesh V <aneesh@ti.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __LINUX_JEDEC_DDR_H
-#define __LINUX_JEDEC_DDR_H
-
-#include <linux/types.h>
-
-/* DDR Densities */
-#define DDR_DENSITY_64Mb	1
-#define DDR_DENSITY_128Mb	2
-#define DDR_DENSITY_256Mb	3
-#define DDR_DENSITY_512Mb	4
-#define DDR_DENSITY_1Gb		5
-#define DDR_DENSITY_2Gb		6
-#define DDR_DENSITY_4Gb		7
-#define DDR_DENSITY_8Gb		8
-#define DDR_DENSITY_16Gb	9
-#define DDR_DENSITY_32Gb	10
-
-/* DDR type */
-#define DDR_TYPE_DDR2		1
-#define DDR_TYPE_DDR3		2
-#define DDR_TYPE_LPDDR2_S4	3
-#define DDR_TYPE_LPDDR2_S2	4
-#define DDR_TYPE_LPDDR2_NVM	5
-
-/* DDR IO width */
-#define DDR_IO_WIDTH_4		1
-#define DDR_IO_WIDTH_8		2
-#define DDR_IO_WIDTH_16		3
-#define DDR_IO_WIDTH_32		4
-
-/* Number of Row bits */
-#define R9			9
-#define R10			10
-#define R11			11
-#define R12			12
-#define R13			13
-#define R14			14
-#define R15			15
-#define R16			16
-
-/* Number of Column bits */
-#define C7			7
-#define C8			8
-#define C9			9
-#define C10			10
-#define C11			11
-#define C12			12
-
-/* Number of Banks */
-#define B1			0
-#define B2			1
-#define B4			2
-#define B8			3
-
-/* Refresh rate in nano-seconds */
-#define T_REFI_15_6		15600
-#define T_REFI_7_8		7800
-#define T_REFI_3_9		3900
-
-/* tRFC values */
-#define T_RFC_90		90000
-#define T_RFC_110		110000
-#define T_RFC_130		130000
-#define T_RFC_160		160000
-#define T_RFC_210		210000
-#define T_RFC_300		300000
-#define T_RFC_350		350000
-
-/* Mode register numbers */
-#define DDR_MR0			0
-#define DDR_MR1			1
-#define DDR_MR2			2
-#define DDR_MR3			3
-#define DDR_MR4			4
-#define DDR_MR5			5
-#define DDR_MR6			6
-#define DDR_MR7			7
-#define DDR_MR8			8
-#define DDR_MR9			9
-#define DDR_MR10		10
-#define DDR_MR11		11
-#define DDR_MR16		16
-#define DDR_MR17		17
-#define DDR_MR18		18
-
-/*
- * LPDDR2 related defines
- */
-
-/* MR4 register fields */
-#define MR4_SDRAM_REF_RATE_SHIFT			0
-#define MR4_SDRAM_REF_RATE_MASK				7
-#define MR4_TUF_SHIFT					7
-#define MR4_TUF_MASK					(1 << 7)
-
-/* MR4 SDRAM Refresh Rate field values */
-#define SDRAM_TEMP_NOMINAL				0x3
-#define SDRAM_TEMP_RESERVED_4				0x4
-#define SDRAM_TEMP_HIGH_DERATE_REFRESH			0x5
-#define SDRAM_TEMP_HIGH_DERATE_REFRESH_AND_TIMINGS	0x6
-#define SDRAM_TEMP_VERY_HIGH_SHUTDOWN			0x7
-
-#define NUM_DDR_ADDR_TABLE_ENTRIES			11
-#define NUM_DDR_TIMING_TABLE_ENTRIES			4
-
-/* Structure for DDR addressing info from the JEDEC spec */
-struct lpddr2_addressing {
-	u32 num_banks;
-	u32 tREFI_ns;
-	u32 tRFCab_ps;
-};
-
-/*
- * Structure for timings from the LPDDR2 datasheet
- * All parameters are in pico seconds(ps) unless explicitly indicated
- * with a suffix like tRAS_max_ns below
- */
-struct lpddr2_timings {
-	u32 max_freq;
-	u32 min_freq;
-	u32 tRPab;
-	u32 tRCD;
-	u32 tWR;
-	u32 tRAS_min;
-	u32 tRRD;
-	u32 tWTR;
-	u32 tXP;
-	u32 tRTP;
-	u32 tCKESR;
-	u32 tDQSCK_max;
-	u32 tDQSCK_max_derated;
-	u32 tFAW;
-	u32 tZQCS;
-	u32 tZQCL;
-	u32 tZQinit;
-	u32 tRAS_max_ns;
-};
-
-/*
- * Min value for some parameters in terms of number of tCK cycles(nCK)
- * Please set to zero parameters that are not valid for a given memory
- * type
- */
-struct lpddr2_min_tck {
-	u32 tRPab;
-	u32 tRCD;
-	u32 tWR;
-	u32 tRASmin;
-	u32 tRRD;
-	u32 tWTR;
-	u32 tXP;
-	u32 tRTP;
-	u32 tCKE;
-	u32 tCKESR;
-	u32 tFAW;
-};
-
-extern const struct lpddr2_addressing
-	lpddr2_jedec_addressing_table[NUM_DDR_ADDR_TABLE_ENTRIES];
-extern const struct lpddr2_timings
-	lpddr2_jedec_timings[NUM_DDR_TIMING_TABLE_ENTRIES];
-extern const struct lpddr2_min_tck lpddr2_jedec_min_tck;
-
-#endif /* __LINUX_JEDEC_DDR_H */
-- 
cgit v1.2.3


From e7488e58c7cfe4be0c52db68622a0397bb75258e Mon Sep 17 00:00:00 2001
From: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Date: Tue, 14 May 2019 20:59:01 +0200
Subject: platform/x86: wmi: Add function to get _UID of WMI device

Add a new function to acpi.h / wmi.c that returns _UID of the ACPI WMI
device. For example, it returns "ATK" for the following declaration in
DSDT:
Device (ATKD)
{
    Name (_HID, "PNP0C14" /* Windows Management Instrumentation Device */)
      // _HID: Hardware ID
    Name (_UID, "ATK")  // _UID: Unique ID
    ..

Generally, it is possible that multiple PNP0C14 ACPI devices are present in
the system as mentioned in the commit message of commit bff431e49ff5
("ACPI: WMI: Add ACPI-WMI mapping driver").

Therefore the _UID is returned for a specific ACPI device that declares the
given GUID, to which it is also mapped by other methods of wmi module.

Signed-off-by: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/platform/x86/wmi.c | 19 +++++++++++++++++++
 include/linux/acpi.h       |  1 +
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index 7b26b6ccf1a0..b08ffb769cbe 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -635,6 +635,25 @@ bool wmi_has_guid(const char *guid_string)
 }
 EXPORT_SYMBOL_GPL(wmi_has_guid);
 
+/**
+ * wmi_get_acpi_device_uid() - Get _UID name of ACPI device that defines GUID
+ * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
+ *
+ * Find the _UID of ACPI device associated with this WMI GUID.
+ *
+ * Return: The ACPI _UID field value or NULL if the WMI GUID was not found
+ */
+char *wmi_get_acpi_device_uid(const char *guid_string)
+{
+	struct wmi_block *wblock = NULL;
+
+	if (!find_guid(guid_string, &wblock))
+		return NULL;
+
+	return acpi_device_uid(wblock->acpi_device);
+}
+EXPORT_SYMBOL_GPL(wmi_get_acpi_device_uid);
+
 static struct wmi_block *dev_to_wblock(struct device *dev)
 {
 	return container_of(dev, struct wmi_block, dev.dev);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 98440df7fe42..d867a9a904f9 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -380,6 +380,7 @@ extern acpi_status wmi_install_notify_handler(const char *guid,
 extern acpi_status wmi_remove_notify_handler(const char *guid);
 extern acpi_status wmi_get_event_data(u32 event, struct acpi_buffer *out);
 extern bool wmi_has_guid(const char *guid);
+extern char *wmi_get_acpi_device_uid(const char *guid);
 
 #endif	/* CONFIG_ACPI_WMI */
 
-- 
cgit v1.2.3


From e0668f28888184f6c633110a37386f2d4a6fa00e Mon Sep 17 00:00:00 2001
From: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Date: Tue, 14 May 2019 21:00:31 +0200
Subject: platform/x86: asus-wmi: Improve DSTS WMI method ID detection

The DSTS method detection mistakenly selects DCTS instead of DSTS if
nothing is returned when the method ID is not defined in WMNB. As a result,
the control of keyboard backlight is not functional for TUF Gaming series
laptops. Implement detection based on _UID of the WMI device instead.

There is evidence that DCTS is handled by ACPI WMI devices that have _UID
ASUSWMI, whereas none of the devices without ASUSWMI respond to DCTS and
DSTS is used instead [1].

DSDT examples:

FX505GM (_UID ATK):
Method (WMNB, 3, Serialized)
{ ...
    If ((Local0 == 0x53545344))
    {
        ...
        Return (Zero)
    }
    ...
    // No return
}

K54C (_UID ATK):
Method (WMNB, 3, Serialized)
{ ...
    If ((Local0 == 0x53545344))
    {
        ...
        Return (0x02)
    }
    ...
    Return (0xFFFFFFFE)
}

[1] Link: https://lkml.org/lkml/2019/4/11/322

Signed-off-by: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Suggested-by: Daniel Drake <drake@endlessm.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/hid/hid-asus.c                     |  2 +-
 drivers/platform/x86/asus-wmi.c            | 23 ++++++++++++++++++++---
 include/linux/platform_data/x86/asus-wmi.h |  4 ++--
 3 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c
index 336aeaed1159..1d01fe23ca0c 100644
--- a/drivers/hid/hid-asus.c
+++ b/drivers/hid/hid-asus.c
@@ -396,7 +396,7 @@ static bool asus_kbd_wmi_led_control_present(struct hid_device *hdev)
 	if (!IS_ENABLED(CONFIG_ASUS_WMI))
 		return false;
 
-	ret = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS2,
+	ret = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS,
 				       ASUS_WMI_DEVID_KBD_BACKLIGHT, 0, &value);
 	hid_dbg(hdev, "WMI backlight check: rc %d value %x", ret, value);
 	if (ret)
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index c67f11e0d6e7..ef526dcfeac5 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -83,6 +83,8 @@ MODULE_LICENSE("GPL");
 #define USB_INTEL_XUSB2PR		0xD0
 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI	0x9c31
 
+#define ASUS_ACPI_UID_ASUSWMI		"ASUSWMI"
+
 static const char * const ashs_ids[] = { "ATK4001", "ATK4002", NULL };
 
 static bool ashs_present(void)
@@ -1874,6 +1876,8 @@ static int asus_wmi_sysfs_init(struct platform_device *device)
  */
 static int asus_wmi_platform_init(struct asus_wmi *asus)
 {
+	struct device *dev = &asus->platform_device->dev;
+	char *wmi_uid;
 	int rv;
 
 	/* INIT enable hotkeys on some models */
@@ -1903,11 +1907,24 @@ static int asus_wmi_platform_init(struct asus_wmi *asus)
 	 * Note, on most Eeepc, there is no way to check if a method exist
 	 * or note, while on notebooks, they returns 0xFFFFFFFE on failure,
 	 * but once again, SPEC may probably be used for that kind of things.
+	 *
+	 * Additionally at least TUF Gaming series laptops return nothing for
+	 * unknown methods, so the detection in this way is not possible.
+	 *
+	 * There is strong indication that only ACPI WMI devices that have _UID
+	 * equal to "ASUSWMI" use DCTS whereas those with "ATK" use DSTS.
 	 */
-	if (!asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, 0, 0, NULL))
+	wmi_uid = wmi_get_acpi_device_uid(ASUS_WMI_MGMT_GUID);
+	if (!wmi_uid)
+		return -ENODEV;
+
+	if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI)) {
+		dev_info(dev, "Detected ASUSWMI, use DCTS\n");
+		asus->dsts_id = ASUS_WMI_METHODID_DCTS;
+	} else {
+		dev_info(dev, "Detected %s, not ASUSWMI, use DSTS\n", wmi_uid);
 		asus->dsts_id = ASUS_WMI_METHODID_DSTS;
-	else
-		asus->dsts_id = ASUS_WMI_METHODID_DSTS2;
+	}
 
 	/* CWAP allow to define the behavior of the Fn+F2 key,
 	 * this method doesn't seems to be present on Eee PCs */
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index bfba245636a7..0668f76df921 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -18,8 +18,8 @@
 #define ASUS_WMI_METHODID_GDSP		0x50534447 /* Get DiSPlay output */
 #define ASUS_WMI_METHODID_DEVP		0x50564544 /* DEVice Policy */
 #define ASUS_WMI_METHODID_OSVR		0x5256534F /* OS VeRsion */
-#define ASUS_WMI_METHODID_DSTS		0x53544344 /* Device STatuS */
-#define ASUS_WMI_METHODID_DSTS2		0x53545344 /* Device STatuS #2*/
+#define ASUS_WMI_METHODID_DCTS		0x53544344 /* Device status (DCTS) */
+#define ASUS_WMI_METHODID_DSTS		0x53545344 /* Device status (DSTS) */
 #define ASUS_WMI_METHODID_BSTS		0x53545342 /* Bios STatuS ? */
 #define ASUS_WMI_METHODID_DEVS		0x53564544 /* DEVice Set */
 #define ASUS_WMI_METHODID_CFVS		0x53564643 /* CPU Frequency Volt Set */
-- 
cgit v1.2.3


From b096f626a6827ad2ced5ebdbdc04e62422d463f6 Mon Sep 17 00:00:00 2001
From: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Date: Tue, 14 May 2019 21:07:05 +0200
Subject: platform/x86: asus-wmi: Switch fan boost mode

The WMI exposes a write-only device ID where up to three fan modes can be
switched on some laptops (TUF Gaming FX505GM). There is a hotkey
combination Fn-F5 that does have a fan icon, which is designed to toggle
between fan modes. The DSTS of the device ID returns information about the
presence of this capability and the presence of each of the two additional
fan modes as a bitmask (0x01 - overboost present, 0x02 - silent present)
[1].

Add a SysFS entry that reads the last written value and updates value in
WMI on write and a hotkey handler that toggles the modes taking into
account their availability according to DSTS.

Modes:
* 0x00 - normal or balanced,
* 0x01 - overboost, increased fan RPM,
* 0x02 - silent, decreased fan RPM

[1] Link: https://lkml.org/lkml/2019/4/12/110

Signed-off-by: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Suggested-by: Daniel Drake <drake@endlessm.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 Documentation/ABI/testing/sysfs-platform-asus-wmi |  10 ++
 drivers/platform/x86/asus-wmi.c                   | 151 ++++++++++++++++++++--
 include/linux/platform_data/x86/asus-wmi.h        |   1 +
 3 files changed, 154 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index 019e1e29370e..87ae5cc983bf 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -36,3 +36,13 @@ KernelVersion:	3.5
 Contact:	"AceLan Kao" <acelan.kao@canonical.com>
 Description:
 		Resume on lid open. 1 means on, 0 means off.
+
+What:		/sys/devices/platform/<platform>/fan_mode
+Date:		Apr 2019
+KernelVersion:	5.2
+Contact:	"Yurii Pavlovskyi" <yurii.pavlovskyi@gmail.com>
+Description:
+		Fan boost mode:
+			* 0 - normal,
+			* 1 - overboost,
+			* 2 - silent
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index a1d85667383c..5712bc56fa10 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -70,6 +70,7 @@ MODULE_LICENSE("GPL");
 #define NOTIFY_KBD_BRTUP		0xc4
 #define NOTIFY_KBD_BRTDWN		0xc5
 #define NOTIFY_KBD_BRTTOGGLE		0xc7
+#define NOTIFY_KBD_FBM			0x99
 
 #define ASUS_WMI_FNLOCK_BIOS_DISABLED	BIT(0)
 
@@ -80,6 +81,13 @@ MODULE_LICENSE("GPL");
 #define ASUS_FAN_CTRL_MANUAL		1
 #define ASUS_FAN_CTRL_AUTO		2
 
+#define ASUS_FAN_MODE_NORMAL		0
+#define ASUS_FAN_MODE_OVERBOOST		1
+#define ASUS_FAN_MODE_OVERBOOST_MASK	0x01
+#define ASUS_FAN_MODE_SILENT		2
+#define ASUS_FAN_MODE_SILENT_MASK	0x02
+#define ASUS_FAN_MODES_MASK		0x03
+
 #define USB_INTEL_XUSB2PR		0xD0
 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI	0x9c31
 
@@ -187,6 +195,10 @@ struct asus_wmi {
 	int asus_hwmon_num_fans;
 	int asus_hwmon_pwm;
 
+	bool fan_mode_available;
+	u8 fan_mode_mask;
+	u8 fan_mode;
+
 	struct hotplug_slot hotplug_slot;
 	struct mutex hotplug_lock;
 	struct mutex wmi_lock;
@@ -1483,6 +1495,116 @@ static int asus_wmi_fan_init(struct asus_wmi *asus)
 	return 0;
 }
 
+/* Fan mode *******************************************************************/
+
+static int fan_mode_check_present(struct asus_wmi *asus)
+{
+	u32 result;
+	int err;
+
+	asus->fan_mode_available = false;
+
+	err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_MODE, &result);
+	if (err) {
+		if (err == -ENODEV)
+			return 0;
+		else
+			return err;
+	}
+
+	if ((result & ASUS_WMI_DSTS_PRESENCE_BIT) &&
+			(result & ASUS_FAN_MODES_MASK)) {
+		asus->fan_mode_available = true;
+		asus->fan_mode_mask = result & ASUS_FAN_MODES_MASK;
+	}
+
+	return 0;
+}
+
+static int fan_mode_write(struct asus_wmi *asus)
+{
+	int err;
+	u8 value;
+	u32 retval;
+
+	value = asus->fan_mode;
+
+	pr_info("Set fan mode: %u\n", value);
+	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_FAN_MODE, value, &retval);
+
+	if (err) {
+		pr_warn("Failed to set fan mode: %d\n", err);
+		return err;
+	}
+
+	if (retval != 1) {
+		pr_warn("Failed to set fan mode (retval): 0x%x\n", retval);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int fan_mode_switch_next(struct asus_wmi *asus)
+{
+	if (asus->fan_mode == ASUS_FAN_MODE_NORMAL) {
+		if (asus->fan_mode_mask & ASUS_FAN_MODE_OVERBOOST_MASK)
+			asus->fan_mode = ASUS_FAN_MODE_OVERBOOST;
+		else if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK)
+			asus->fan_mode = ASUS_FAN_MODE_SILENT;
+	} else if (asus->fan_mode == ASUS_FAN_MODE_OVERBOOST) {
+		if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK)
+			asus->fan_mode = ASUS_FAN_MODE_SILENT;
+		else
+			asus->fan_mode = ASUS_FAN_MODE_NORMAL;
+	} else {
+		asus->fan_mode = ASUS_FAN_MODE_NORMAL;
+	}
+
+	return fan_mode_write(asus);
+}
+
+static ssize_t fan_mode_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct asus_wmi *asus = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", asus->fan_mode);
+}
+
+static ssize_t fan_mode_store(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	int result;
+	u8 new_mode;
+
+	struct asus_wmi *asus = dev_get_drvdata(dev);
+
+	result = kstrtou8(buf, 10, &new_mode);
+	if (result < 0) {
+		pr_warn("Trying to store invalid value\n");
+		return result;
+	}
+
+	if (new_mode == ASUS_FAN_MODE_OVERBOOST) {
+		if (!(asus->fan_mode_mask & ASUS_FAN_MODE_OVERBOOST_MASK))
+			return -EINVAL;
+	} else if (new_mode == ASUS_FAN_MODE_SILENT) {
+		if (!(asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK))
+			return -EINVAL;
+	} else if (new_mode != ASUS_FAN_MODE_NORMAL) {
+		return -EINVAL;
+	}
+
+	asus->fan_mode = new_mode;
+	fan_mode_write(asus);
+
+	return result;
+}
+
+// Fan mode: 0 - normal, 1 - overboost, 2 - silent
+static DEVICE_ATTR_RW(fan_mode);
+
 /* Backlight ******************************************************************/
 
 static int read_backlight_power(struct asus_wmi *asus)
@@ -1761,6 +1883,11 @@ static void asus_wmi_handle_event_code(int code, struct asus_wmi *asus)
 		return;
 	}
 
+	if (asus->fan_mode_available && code == NOTIFY_KBD_FBM) {
+		fan_mode_switch_next(asus);
+		return;
+	}
+
 	if (is_display_toggle(code) && asus->driver->quirks->no_display_toggle)
 		return;
 
@@ -1917,6 +2044,7 @@ static struct attribute *platform_attributes[] = {
 	&dev_attr_touchpad.attr,
 	&dev_attr_lid_resume.attr,
 	&dev_attr_als_enable.attr,
+	&dev_attr_fan_mode.attr,
 	NULL
 };
 
@@ -1938,6 +2066,8 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj,
 		devid = ASUS_WMI_DEVID_LID_RESUME;
 	else if (attr == &dev_attr_als_enable.attr)
 		devid = ASUS_WMI_DEVID_ALS_ENABLE;
+	else if (attr == &dev_attr_fan_mode.attr)
+		ok = asus->fan_mode_available;
 
 	if (devid != -1)
 		ok = !(asus_wmi_get_devstate_simple(asus, devid) < 0);
@@ -2037,12 +2167,7 @@ static int asus_wmi_platform_init(struct asus_wmi *asus)
 		asus_wmi_set_devstate(ASUS_WMI_DEVID_CWAP,
 				      asus->driver->quirks->wapf, NULL);
 
-	return asus_wmi_sysfs_init(asus->platform_device);
-}
-
-static void asus_wmi_platform_exit(struct asus_wmi *asus)
-{
-	asus_wmi_sysfs_exit(asus->platform_device);
+	return 0;
 }
 
 /* debugfs ********************************************************************/
@@ -2200,6 +2325,14 @@ static int asus_wmi_add(struct platform_device *pdev)
 	if (err)
 		goto fail_platform;
 
+	err = fan_mode_check_present(asus);
+	if (err)
+		goto fail_fan_mode;
+
+	err = asus_wmi_sysfs_init(asus->platform_device);
+	if (err)
+		goto fail_sysfs;
+
 	err = asus_wmi_input_init(asus);
 	if (err)
 		goto fail_input;
@@ -2277,7 +2410,9 @@ fail_leds:
 fail_hwmon:
 	asus_wmi_input_exit(asus);
 fail_input:
-	asus_wmi_platform_exit(asus);
+	asus_wmi_sysfs_exit(asus->platform_device);
+fail_sysfs:
+fail_fan_mode:
 fail_platform:
 	kfree(asus);
 	return err;
@@ -2294,7 +2429,7 @@ static int asus_wmi_remove(struct platform_device *device)
 	asus_wmi_led_exit(asus);
 	asus_wmi_rfkill_exit(asus);
 	asus_wmi_debugfs_exit(asus);
-	asus_wmi_platform_exit(asus);
+	asus_wmi_sysfs_exit(asus->platform_device);
 	asus_hwmon_fan_set_auto(asus);
 
 	kfree(asus);
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 0668f76df921..8551156b8dca 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -57,6 +57,7 @@
 #define ASUS_WMI_DEVID_KBD_BACKLIGHT	0x00050021
 #define ASUS_WMI_DEVID_LIGHT_SENSOR	0x00050022 /* ?? */
 #define ASUS_WMI_DEVID_LIGHTBAR		0x00050025
+#define ASUS_WMI_DEVID_FAN_MODE		0x00110018
 
 /* Misc */
 #define ASUS_WMI_DEVID_CAMERA		0x00060013
-- 
cgit v1.2.3


From a48e23385fcf397e69e2a75d72a81c545ec8bec2 Mon Sep 17 00:00:00 2001
From: Mattias Jacobsson <2pi@mok.nu>
Date: Mon, 27 May 2019 18:21:29 +0200
Subject: platform/x86: wmi: add context pointer field to struct wmi_device_id

When using wmi_install_notify_handler() to initialize a WMI handler a
data pointer can be supplied which will be passed on to the notification
handler. No similar feature exist when handling WMI events via struct
wmi_driver.

Add a context field pointer to struct wmi_device_id and add a function
find_guid_context() to retrieve that context pointer.

Signed-off-by: Mattias Jacobsson <2pi@mok.nu>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/platform/x86/wmi.c      | 22 ++++++++++++++++++++++
 include/linux/mod_devicetable.h |  1 +
 2 files changed, 23 insertions(+)

(limited to 'include')

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index b08ffb769cbe..f3be1c008856 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -146,6 +146,28 @@ static bool find_guid(const char *guid_string, struct wmi_block **out)
 	return false;
 }
 
+static const void *find_guid_context(struct wmi_block *wblock,
+				      struct wmi_driver *wdriver)
+{
+	const struct wmi_device_id *id;
+	uuid_le guid_input;
+
+	if (wblock == NULL || wdriver == NULL)
+		return NULL;
+	if (wdriver->id_table == NULL)
+		return NULL;
+
+	id = wdriver->id_table;
+	while (*id->guid_string) {
+		if (uuid_le_to_bin(id->guid_string, &guid_input))
+			continue;
+		if (!memcmp(wblock->gblock.guid, &guid_input, 16))
+			return id->context;
+		id++;
+	}
+	return NULL;
+}
+
 static int get_subobj_info(acpi_handle handle, const char *pathname,
 			   struct acpi_device_info **info)
 {
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 448621c32e4d..09366859aac2 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -798,6 +798,7 @@ struct tee_client_device_id {
  */
 struct wmi_device_id {
 	const char guid_string[UUID_STRING_LEN+1];
+	const void *context;
 };
 
 #endif /* LINUX_MOD_DEVICETABLE_H */
-- 
cgit v1.2.3


From 440c4983de262f78033ec58f6abcd199a664327d Mon Sep 17 00:00:00 2001
From: Mattias Jacobsson <2pi@mok.nu>
Date: Mon, 27 May 2019 18:21:30 +0200
Subject: platform/x86: wmi: add context argument to the probe function

The struct wmi_device_id has a context pointer field, forward this
pointer as an argument to the probe function in struct wmi_driver.

Update existing users of the same probe function to accept this new
context argument.

Signed-off-by: Mattias Jacobsson <2pi@mok.nu>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/platform/x86/dell-smbios-wmi.c       | 2 +-
 drivers/platform/x86/dell-wmi-descriptor.c   | 3 ++-
 drivers/platform/x86/dell-wmi.c              | 2 +-
 drivers/platform/x86/huawei-wmi.c            | 2 +-
 drivers/platform/x86/intel-wmi-thunderbolt.c | 3 ++-
 drivers/platform/x86/wmi-bmof.c              | 2 +-
 drivers/platform/x86/wmi.c                   | 3 ++-
 include/linux/wmi.h                          | 2 +-
 8 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/dell-smbios-wmi.c b/drivers/platform/x86/dell-smbios-wmi.c
index c3ed3c8c17b9..add2687079f7 100644
--- a/drivers/platform/x86/dell-smbios-wmi.c
+++ b/drivers/platform/x86/dell-smbios-wmi.c
@@ -146,7 +146,7 @@ fail_smbios_cmd:
 	return ret;
 }
 
-static int dell_smbios_wmi_probe(struct wmi_device *wdev)
+static int dell_smbios_wmi_probe(struct wmi_device *wdev, const void *context)
 {
 	struct wmi_driver *wdriver =
 		container_of(wdev->dev.driver, struct wmi_driver, driver);
diff --git a/drivers/platform/x86/dell-wmi-descriptor.c b/drivers/platform/x86/dell-wmi-descriptor.c
index 14ab250b7d5a..9994fd1a5acf 100644
--- a/drivers/platform/x86/dell-wmi-descriptor.c
+++ b/drivers/platform/x86/dell-wmi-descriptor.c
@@ -106,7 +106,8 @@ EXPORT_SYMBOL_GPL(dell_wmi_get_hotfix);
  * WMI buffer length        12       4    <length>
  * WMI hotfix number        16       4    <hotfix>
  */
-static int dell_wmi_descriptor_probe(struct wmi_device *wdev)
+static int dell_wmi_descriptor_probe(struct wmi_device *wdev,
+				     const void *context)
 {
 	union acpi_object *obj = NULL;
 	struct descriptor_priv *priv;
diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c
index d118bb73fcae..72b0a69a6ed0 100644
--- a/drivers/platform/x86/dell-wmi.c
+++ b/drivers/platform/x86/dell-wmi.c
@@ -672,7 +672,7 @@ static int dell_wmi_events_set_enabled(bool enable)
 	return dell_smbios_error(ret);
 }
 
-static int dell_wmi_probe(struct wmi_device *wdev)
+static int dell_wmi_probe(struct wmi_device *wdev, const void *context)
 {
 	struct dell_wmi_priv *priv;
 	int ret;
diff --git a/drivers/platform/x86/huawei-wmi.c b/drivers/platform/x86/huawei-wmi.c
index 52fcac5b393a..195a7f3638cb 100644
--- a/drivers/platform/x86/huawei-wmi.c
+++ b/drivers/platform/x86/huawei-wmi.c
@@ -166,7 +166,7 @@ static int huawei_wmi_input_setup(struct wmi_device *wdev)
 	return input_register_device(priv->idev);
 }
 
-static int huawei_wmi_probe(struct wmi_device *wdev)
+static int huawei_wmi_probe(struct wmi_device *wdev, const void *context)
 {
 	struct huawei_wmi_priv *priv;
 	int err;
diff --git a/drivers/platform/x86/intel-wmi-thunderbolt.c b/drivers/platform/x86/intel-wmi-thunderbolt.c
index 4dfa61434a76..974c22a7ff61 100644
--- a/drivers/platform/x86/intel-wmi-thunderbolt.c
+++ b/drivers/platform/x86/intel-wmi-thunderbolt.c
@@ -56,7 +56,8 @@ static const struct attribute_group tbt_attribute_group = {
 	.attrs = tbt_attrs,
 };
 
-static int intel_wmi_thunderbolt_probe(struct wmi_device *wdev)
+static int intel_wmi_thunderbolt_probe(struct wmi_device *wdev,
+				       const void *context)
 {
 	int ret;
 
diff --git a/drivers/platform/x86/wmi-bmof.c b/drivers/platform/x86/wmi-bmof.c
index 8751a13134be..105a82b6b076 100644
--- a/drivers/platform/x86/wmi-bmof.c
+++ b/drivers/platform/x86/wmi-bmof.c
@@ -54,7 +54,7 @@ read_bmof(struct file *filp, struct kobject *kobj,
 	return count;
 }
 
-static int wmi_bmof_probe(struct wmi_device *wdev)
+static int wmi_bmof_probe(struct wmi_device *wdev, const void *context)
 {
 	struct bmof_priv *priv;
 	int ret;
diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index f3be1c008856..2163fd8bf9e1 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -945,7 +945,8 @@ static int wmi_dev_probe(struct device *dev)
 		dev_warn(dev, "failed to enable device -- probing anyway\n");
 
 	if (wdriver->probe) {
-		ret = wdriver->probe(dev_to_wdev(dev));
+		ret = wdriver->probe(dev_to_wdev(dev),
+				find_guid_context(wblock, wdriver));
 		if (ret != 0)
 			goto probe_failure;
 	}
diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 592f81afecbb..1e84c474a993 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -44,7 +44,7 @@ struct wmi_driver {
 	struct device_driver driver;
 	const struct wmi_device_id *id_table;
 
-	int (*probe)(struct wmi_device *wdev);
+	int (*probe)(struct wmi_device *wdev, const void *context);
 	int (*remove)(struct wmi_device *wdev);
 	void (*notify)(struct wmi_device *device, union acpi_object *data);
 	long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd,
-- 
cgit v1.2.3


From eb69c8a4bf5e26ae112793bb04a6cd6c27114900 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 14 Jun 2019 22:35:18 +0200
Subject: drm/gem: Unexport drm_gem_(un)pin/v(un)map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

They're purely for internal use, not for drivers.

Cc: Noralf Trønnes <noralf@tronnes.org>
Cc: Christian König <christian.koenig@amd.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190614203615.12639-3-daniel.vetter@ffwll.ch
---
 drivers/gpu/drm/drm_gem.c      | 32 --------------------------------
 drivers/gpu/drm/drm_internal.h |  5 +++++
 include/drm/drm_gem.h          |  5 -----
 3 files changed, 5 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 8a55f71325b1..a8c4468f03d9 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -1216,15 +1216,6 @@ void drm_gem_print_info(struct drm_printer *p, unsigned int indent,
 		obj->dev->driver->gem_print_info(p, indent, obj);
 }
 
-/**
- * drm_gem_pin - Pin backing buffer in memory
- * @obj: GEM object
- *
- * Make sure the backing buffer is pinned in memory.
- *
- * Returns:
- * 0 on success or a negative error code on failure.
- */
 int drm_gem_pin(struct drm_gem_object *obj)
 {
 	if (obj->funcs && obj->funcs->pin)
@@ -1234,14 +1225,7 @@ int drm_gem_pin(struct drm_gem_object *obj)
 	else
 		return 0;
 }
-EXPORT_SYMBOL(drm_gem_pin);
 
-/**
- * drm_gem_unpin - Unpin backing buffer from memory
- * @obj: GEM object
- *
- * Relax the requirement that the backing buffer is pinned in memory.
- */
 void drm_gem_unpin(struct drm_gem_object *obj)
 {
 	if (obj->funcs && obj->funcs->unpin)
@@ -1249,16 +1233,7 @@ void drm_gem_unpin(struct drm_gem_object *obj)
 	else if (obj->dev->driver->gem_prime_unpin)
 		obj->dev->driver->gem_prime_unpin(obj);
 }
-EXPORT_SYMBOL(drm_gem_unpin);
 
-/**
- * drm_gem_vmap - Map buffer into kernel virtual address space
- * @obj: GEM object
- *
- * Returns:
- * A virtual pointer to a newly created GEM object or an ERR_PTR-encoded negative
- * error code on failure.
- */
 void *drm_gem_vmap(struct drm_gem_object *obj)
 {
 	void *vaddr;
@@ -1275,13 +1250,7 @@ void *drm_gem_vmap(struct drm_gem_object *obj)
 
 	return vaddr;
 }
-EXPORT_SYMBOL(drm_gem_vmap);
 
-/**
- * drm_gem_vunmap - Remove buffer mapping from kernel virtual address space
- * @obj: GEM object
- * @vaddr: Virtual address (can be NULL)
- */
 void drm_gem_vunmap(struct drm_gem_object *obj, void *vaddr)
 {
 	if (!vaddr)
@@ -1292,7 +1261,6 @@ void drm_gem_vunmap(struct drm_gem_object *obj, void *vaddr)
 	else if (obj->dev->driver->gem_prime_vunmap)
 		obj->dev->driver->gem_prime_vunmap(obj, vaddr);
 }
-EXPORT_SYMBOL(drm_gem_vunmap);
 
 /**
  * drm_gem_lock_reservations - Sets up the ww context and acquires
diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h
index d18c7b91a1a8..51a2055c8f18 100644
--- a/drivers/gpu/drm/drm_internal.h
+++ b/drivers/gpu/drm/drm_internal.h
@@ -133,6 +133,11 @@ void drm_gem_release(struct drm_device *dev, struct drm_file *file_private);
 void drm_gem_print_info(struct drm_printer *p, unsigned int indent,
 			const struct drm_gem_object *obj);
 
+int drm_gem_pin(struct drm_gem_object *obj);
+void drm_gem_unpin(struct drm_gem_object *obj);
+void *drm_gem_vmap(struct drm_gem_object *obj);
+void drm_gem_vunmap(struct drm_gem_object *obj, void *vaddr);
+
 /* drm_debugfs.c drm_debugfs_crc.c */
 #if defined(CONFIG_DEBUG_FS)
 int drm_debugfs_init(struct drm_minor *minor, int minor_id,
diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
index 5047c7ee25f5..a9121fe66ea2 100644
--- a/include/drm/drm_gem.h
+++ b/include/drm/drm_gem.h
@@ -401,9 +401,4 @@ int drm_gem_dumb_destroy(struct drm_file *file,
 			 struct drm_device *dev,
 			 uint32_t handle);
 
-int drm_gem_pin(struct drm_gem_object *obj);
-void drm_gem_unpin(struct drm_gem_object *obj);
-void *drm_gem_vmap(struct drm_gem_object *obj);
-void drm_gem_vunmap(struct drm_gem_object *obj, void *vaddr);
-
 #endif /* __DRM_GEM_H__ */
-- 
cgit v1.2.3


From 5d60c11154116e2127374d4178e952649612b69b Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 13 Jun 2019 21:38:17 -0300
Subject: RDMA: Move rdma_node_type to uapi/

This enum is exposed over the sysfs file 'node_type' and over netlink via
RDMA_NLDEV_ATTR_DEV_NODE_TYPE, so declare it in the uapi headers.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/core/verbs.c  |  2 +-
 include/rdma/ib_verbs.h          | 13 +------------
 include/uapi/rdma/rdma_netlink.h | 12 ++++++++++++
 3 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 585e100706aa..588f1d195fd2 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -209,7 +209,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
 EXPORT_SYMBOL(ib_rate_to_mbps);
 
 __attribute_const__ enum rdma_transport_type
-rdma_node_get_transport(enum rdma_node_type node_type)
+rdma_node_get_transport(unsigned int node_type)
 {
 
 	if (node_type == RDMA_NODE_USNIC)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index f357e03a85a6..973514ea17a7 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -132,17 +132,6 @@ struct ib_gid_attr {
 	u8			port_num;
 };
 
-enum rdma_node_type {
-	/* IB values map to NodeInfo:NodeType. */
-	RDMA_NODE_IB_CA 	= 1,
-	RDMA_NODE_IB_SWITCH,
-	RDMA_NODE_IB_ROUTER,
-	RDMA_NODE_RNIC,
-	RDMA_NODE_USNIC,
-	RDMA_NODE_USNIC_UDP,
-	RDMA_NODE_UNSPECIFIED,
-};
-
 enum {
 	/* set the local administered indication */
 	IB_SA_WELL_KNOWN_GUID	= BIT_ULL(57) | 2,
@@ -164,7 +153,7 @@ enum rdma_protocol_type {
 };
 
 __attribute_const__ enum rdma_transport_type
-rdma_node_get_transport(enum rdma_node_type node_type);
+rdma_node_get_transport(unsigned int node_type);
 
 enum rdma_network_type {
 	RDMA_NETWORK_IB,
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 41db51367efa..f588e8551c6c 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -147,6 +147,18 @@ enum {
 	IWPM_NLA_HELLO_MAX
 };
 
+/* For RDMA_NLDEV_ATTR_DEV_NODE_TYPE */
+enum {
+	/* IB values map to NodeInfo:NodeType. */
+	RDMA_NODE_IB_CA = 1,
+	RDMA_NODE_IB_SWITCH,
+	RDMA_NODE_IB_ROUTER,
+	RDMA_NODE_RNIC,
+	RDMA_NODE_USNIC,
+	RDMA_NODE_USNIC_UDP,
+	RDMA_NODE_UNSPECIFIED,
+};
+
 /*
  * Local service operations:
  *   RESOLVE - The client requests the local service to resolve a path.
-- 
cgit v1.2.3


From 99600fd47eafd20b9ba6e04562bb2fcc48475344 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 22 Apr 2019 07:15:05 +0800
Subject: clk: Add CLK_HW_INIT_* macros using .parent_hws

With the new clk parenting code, struct clk_init_data was expanded to
include .parent_hws, for clk drivers to directly list parents by
pointing to their respective struct clk_hw's.

Add macros that can take either one single struct clk_hw *, or an array
of them, for drivers to use.

A special CLK_HW_INIT_HWS macro is included, which takes an array of
struct clk_hw *, but sets .num_parents to 1. This variant is to allow
the reuse of the array, instead of having a compound literal allocated
for each clk sharing the same parent.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index bb6118f79784..70aad5cefea7 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -904,6 +904,29 @@ extern struct of_device_id __clk_of_table;
 		.ops		= _ops,				\
 	})
 
+#define CLK_HW_INIT_HW(_name, _parent, _ops, _flags)			\
+	(&(struct clk_init_data) {					\
+		.flags		= _flags,				\
+		.name		= _name,				\
+		.parent_hws	= (const struct clk_hw*[]) { _parent },	\
+		.num_parents	= 1,					\
+		.ops		= _ops,					\
+	})
+
+/*
+ * This macro is intended for drivers to be able to share the otherwise
+ * individual struct clk_hw[] compound literals created by the compiler
+ * when using CLK_HW_INIT_HW. It does NOT support multiple parents.
+ */
+#define CLK_HW_INIT_HWS(_name, _parent, _ops, _flags)			\
+	(&(struct clk_init_data) {					\
+		.flags		= _flags,				\
+		.name		= _name,				\
+		.parent_hws	= _parent,				\
+		.num_parents	= 1,					\
+		.ops		= _ops,					\
+	})
+
 #define CLK_HW_INIT_PARENTS(_name, _parents, _ops, _flags)	\
 	(&(struct clk_init_data) {				\
 		.flags		= _flags,			\
@@ -913,6 +936,15 @@ extern struct of_device_id __clk_of_table;
 		.ops		= _ops,				\
 	})
 
+#define CLK_HW_INIT_PARENTS_HW(_name, _parents, _ops, _flags)	\
+	(&(struct clk_init_data) {				\
+		.flags		= _flags,			\
+		.name		= _name,			\
+		.parent_hws	= _parents,			\
+		.num_parents	= ARRAY_SIZE(_parents),		\
+		.ops		= _ops,				\
+	})
+
 #define CLK_HW_INIT_NO_PARENT(_name, _ops, _flags)	\
 	(&(struct clk_init_data) {			\
 		.flags          = _flags,		\
-- 
cgit v1.2.3


From 2d6b4f33e637bf51c50c536966a19e94a59f3212 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Fri, 3 May 2019 11:49:03 +0800
Subject: clk: Add CLK_HW_INIT_FW_NAME macro using .fw_name in .parent_data

With the new clk parenting code, clk_init_data was expanded to include
.parent_data, for clk drivers that have parents referenced using a
combination of device tree clock-names, clock indices, and/or clk_hw
pointers.

Add a CLK_HW_INIT macro for specifying a single parent from the device
tree using .fw_name in struct clk_parent_data.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 70aad5cefea7..b19063512a29 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -927,6 +927,17 @@ extern struct of_device_id __clk_of_table;
 		.ops		= _ops,					\
 	})
 
+#define CLK_HW_INIT_FW_NAME(_name, _parent, _ops, _flags)		\
+	(&(struct clk_init_data) {					\
+		.flags		= _flags,				\
+		.name		= _name,				\
+		.parent_data	= (const struct clk_parent_data[]) {	\
+					{ .fw_name = _parent },		\
+				  },					\
+		.num_parents	= 1,					\
+		.ops		= _ops,					\
+	})
+
 #define CLK_HW_INIT_PARENTS(_name, _parents, _ops, _flags)	\
 	(&(struct clk_init_data) {				\
 		.flags		= _flags,			\
-- 
cgit v1.2.3


From 13933109dff0a5abbfc3980304c6c21c90829810 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 22 Apr 2019 07:17:50 +0800
Subject: clk: Add CLK_HW_INIT_PARENT_DATA macro using .parent_data

With the new clk parenting code, struct clk_init_data was expanded to
include .parent_data, for clk drivers that have parents referenced using
a combination of device tree clock-names, clock indices, and/or struct
clk_hw pointers.

Add a new macro that can take a list of struct clk_parent_data for
drivers to use.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index b19063512a29..0fd14c4874d6 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -956,6 +956,15 @@ extern struct of_device_id __clk_of_table;
 		.ops		= _ops,				\
 	})
 
+#define CLK_HW_INIT_PARENTS_DATA(_name, _parents, _ops, _flags)	\
+	(&(struct clk_init_data) {				\
+		.flags		= _flags,			\
+		.name		= _name,			\
+		.parent_data	= _parents,			\
+		.num_parents	= ARRAY_SIZE(_parents),		\
+		.ops		= _ops,				\
+	})
+
 #define CLK_HW_INIT_NO_PARENT(_name, _ops, _flags)	\
 	(&(struct clk_init_data) {			\
 		.flags          = _flags,		\
-- 
cgit v1.2.3


From d7b15114aba956ca395ec5cc28f68fe861ffc208 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 22 Apr 2019 07:19:46 +0800
Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_HW which takes clk_hw pointer
 as parent

With the new clk parenting code, clk_init_data was expanded to include
.parent_hws, for clk drivers to directly reference parents by clk_hw.

Add a new macro, CLK_FIXED_FACTOR_HW, that can take a struct clk_hw
pointer, instead of a string, as its parent.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 0fd14c4874d6..c85e9f3809f2 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -985,6 +985,17 @@ extern struct of_device_id __clk_of_table;
 					      _flags),			\
 	}
 
+#define CLK_FIXED_FACTOR_HW(_struct, _name, _parent,			\
+			    _div, _mult, _flags)			\
+	struct clk_fixed_factor _struct = {				\
+		.div		= _div,					\
+		.mult		= _mult,				\
+		.hw.init	= CLK_HW_INIT_HW(_name,			\
+						 _parent,		\
+						 &clk_fixed_factor_ops,	\
+						 _flags),		\
+	}
+
 #ifdef CONFIG_OF
 int of_clk_add_provider(struct device_node *np,
 			struct clk *(*clk_src_get)(struct of_phandle_args *args,
-- 
cgit v1.2.3


From 1bef004e2680511ecbb6b5db3954fba430501ecb Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 6 May 2019 10:43:16 +0800
Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_HWS which takes list of
 struct clk_hw *

With the new clk parenting code, clk_init_data was expanded to include
.parent_hws, for clk drivers to directly reference parents by clk_hw.

Add a new macro, CLK_FIXED_FACTOR_HWS, that can take an array of pointers
to struct clk_hw, instead of a string, as its parent. Taking an array
instead of a direct pointer allows the reuse of the array for multiple
clks, rather than having one compound literal with the same contents
allocated for each clk declaration.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index c85e9f3809f2..146a6859969e 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -996,6 +996,21 @@ extern struct of_device_id __clk_of_table;
 						 _flags),		\
 	}
 
+/*
+ * This macro allows the driver to reuse the _parent array for multiple
+ * fixed factor clk declarations.
+ */
+#define CLK_FIXED_FACTOR_HWS(_struct, _name, _parent,			\
+			     _div, _mult, _flags)			\
+	struct clk_fixed_factor _struct = {				\
+		.div		= _div,					\
+		.mult		= _mult,				\
+		.hw.init	= CLK_HW_INIT_HWS(_name,		\
+						  _parent,		\
+						  &clk_fixed_factor_ops, \
+						  _flags),	\
+	}
+
 #ifdef CONFIG_OF
 int of_clk_add_provider(struct device_node *np,
 			struct clk *(*clk_src_get)(struct of_phandle_args *args,
-- 
cgit v1.2.3


From 8b13a48b891c7c855e9f3a401d91391a946f4ca7 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Fri, 3 May 2019 11:58:20 +0800
Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_FW_NAME for DT clock-names
 parent

With the new clk parenting code, clk_init_data was expanded to include
.parent_data, for clk drivers to specify parents using a combination of
device tree clock-names, pointers to struct clk_hw, device tree clocks,
and/or fallback global clock names.

Add a new macro, CLK_FIXED_FACTOR_FW_NAME, that takes a string to match
a clock-names entry in the device tree to specify the clock parent.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/linux/clk-provider.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 146a6859969e..e5c44f6dd897 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -1011,6 +1011,17 @@ extern struct of_device_id __clk_of_table;
 						  _flags),	\
 	}
 
+#define CLK_FIXED_FACTOR_FW_NAME(_struct, _name, _parent,		\
+				 _div, _mult, _flags)			\
+	struct clk_fixed_factor _struct = {				\
+		.div		= _div,					\
+		.mult		= _mult,				\
+		.hw.init	= CLK_HW_INIT_FW_NAME(_name,		\
+						      _parent,		\
+						      &clk_fixed_factor_ops, \
+						      _flags),		\
+	}
+
 #ifdef CONFIG_OF
 int of_clk_add_provider(struct device_node *np,
 			struct clk *(*clk_src_get)(struct of_phandle_args *args,
-- 
cgit v1.2.3


From 4eb293487d05a69862a4907ee944aa271ed49a4c Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 13 Jun 2019 10:55:32 +0900
Subject: pinctrl: make pinconf.h self-contained

This header uses 'bool', but it does not include any header by itself.

So, it could cause unknown type name error, depending on the header
include order, although probably <linux/types.h> has been included by
someone else.

Include <linux/types.h> to make it self-contained.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinconf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h
index 93c9dd133e9d..9bebc3554809 100644
--- a/include/linux/pinctrl/pinconf.h
+++ b/include/linux/pinctrl/pinconf.h
@@ -14,6 +14,8 @@
 
 #ifdef CONFIG_PINCONF
 
+#include <linux/types.h>
+
 struct pinctrl_dev;
 struct seq_file;
 
-- 
cgit v1.2.3


From 29875a52915e09abb9703722054f6443cb492ccc Mon Sep 17 00:00:00 2001
From: Thomas Hellstrom <thellstrom@vmware.com>
Date: Fri, 12 Oct 2018 17:06:06 +0200
Subject: mm: Add an apply_to_pfn_range interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is basically apply_to_page_range with added functionality:
Allocating missing parts of the page table becomes optional, which
means that the function can be guaranteed not to error if allocation
is disabled. Also passing of the closure struct and callback function
becomes different and more in line with how things are done elsewhere.

Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range

The reason for not using the page-walk code is that we want to perform
the page-walk on vmas pointing to an address space without requiring the
mmap_sem to be held rather than on vmas belonging to a process with the
mmap_sem held.

Notable changes since RFC:
Don't export apply_to_pfn range.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com> #v1
---
 include/linux/mm.h |  10 ++++
 mm/memory.c        | 135 ++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 113 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0e8834ac32b7..3d06ce2a64af 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2675,6 +2675,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
 
+struct pfn_range_apply;
+typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
+			 struct pfn_range_apply *closure);
+struct pfn_range_apply {
+	struct mm_struct *mm;
+	pter_fn_t ptefn;
+	unsigned int alloc;
+};
+extern int apply_to_pfn_range(struct pfn_range_apply *closure,
+			      unsigned long address, unsigned long size);
 
 #ifdef CONFIG_PAGE_POISONING
 extern bool page_poisoning_enabled(void);
diff --git a/mm/memory.c b/mm/memory.c
index 168f546af1ad..462aa47f8878 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2032,18 +2032,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long
 }
 EXPORT_SYMBOL(vm_iomap_memory);
 
-static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
-				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
+			      unsigned long addr, unsigned long end)
 {
 	pte_t *pte;
 	int err;
 	pgtable_t token;
 	spinlock_t *uninitialized_var(ptl);
 
-	pte = (mm == &init_mm) ?
+	pte = (closure->mm == &init_mm) ?
 		pte_alloc_kernel(pmd, addr) :
-		pte_alloc_map_lock(mm, pmd, addr, &ptl);
+		pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
 
@@ -2054,86 +2053,109 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	token = pmd_pgtable(*pmd);
 
 	do {
-		err = fn(pte++, token, addr, data);
+		err = closure->ptefn(pte++, token, addr, closure);
 		if (err)
 			break;
 	} while (addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
 
-	if (mm != &init_mm)
+	if (closure->mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
 	return err;
 }
 
-static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
-				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud,
+			      unsigned long addr, unsigned long end)
 {
 	pmd_t *pmd;
 	unsigned long next;
-	int err;
+	int err = 0;
 
 	BUG_ON(pud_huge(*pud));
 
-	pmd = pmd_alloc(mm, pud, addr);
+	pmd = pmd_alloc(closure->mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
+
 	do {
 		next = pmd_addr_end(addr, end);
-		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
+		if (!closure->alloc && pmd_none_or_clear_bad(pmd))
+			continue;
+		err = apply_to_pte_range(closure, pmd, addr, next);
 		if (err)
 			break;
 	} while (pmd++, addr = next, addr != end);
 	return err;
 }
 
-static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
-				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d,
+			      unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	pud = pud_alloc(mm, p4d, addr);
+	pud = pud_alloc(closure->mm, p4d, addr);
 	if (!pud)
 		return -ENOMEM;
+
 	do {
 		next = pud_addr_end(addr, end);
-		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
+		if (!closure->alloc && pud_none_or_clear_bad(pud))
+			continue;
+		err = apply_to_pmd_range(closure, pud, addr, next);
 		if (err)
 			break;
 	} while (pud++, addr = next, addr != end);
 	return err;
 }
 
-static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
-				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd,
+			      unsigned long addr, unsigned long end)
 {
 	p4d_t *p4d;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	p4d = p4d_alloc(mm, pgd, addr);
+	p4d = p4d_alloc(closure->mm, pgd, addr);
 	if (!p4d)
 		return -ENOMEM;
+
 	do {
 		next = p4d_addr_end(addr, end);
-		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
+		if (!closure->alloc && p4d_none_or_clear_bad(p4d))
+			continue;
+		err = apply_to_pud_range(closure, p4d, addr, next);
 		if (err)
 			break;
 	} while (p4d++, addr = next, addr != end);
 	return err;
 }
 
-/*
- * Scan a region of virtual memory, filling in page tables as necessary
- * and calling a provided function on each leaf page table.
+/**
+ * apply_to_pfn_range - Scan a region of virtual memory, calling a provided
+ * function on each leaf page table entry
+ * @closure: Details about how to scan and what function to apply
+ * @addr: Start virtual address
+ * @size: Size of the region
+ *
+ * If @closure->alloc is set to 1, the function will fill in the page table
+ * as necessary. Otherwise it will skip non-present parts.
+ * Note: The caller must ensure that the range does not contain huge pages.
+ * The caller must also assure that the proper mmu_notifier functions are
+ * called before and after the call to apply_to_pfn_range.
+ *
+ * WARNING: Do not use this function unless you know exactly what you are
+ * doing. It is lacking support for huge pages and transparent huge pages.
+ *
+ * Return: Zero on success. If the provided function returns a non-zero status,
+ * the page table walk will terminate and that status will be returned.
+ * If @closure->alloc is set to 1, then this function may also return memory
+ * allocation errors arising from allocating page table memory.
  */
-int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
-			unsigned long size, pte_fn_t fn, void *data)
+int apply_to_pfn_range(struct pfn_range_apply *closure,
+		       unsigned long addr, unsigned long size)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -2143,16 +2165,65 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 	if (WARN_ON(addr >= end))
 		return -EINVAL;
 
-	pgd = pgd_offset(mm, addr);
+	pgd = pgd_offset(closure->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
+		if (!closure->alloc && pgd_none_or_clear_bad(pgd))
+			continue;
+		err = apply_to_p4d_range(closure, pgd, addr, next);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
 
 	return err;
 }
+
+/**
+ * struct page_range_apply - Closure structure for apply_to_page_range()
+ * @pter: The base closure structure we derive from
+ * @fn: The leaf pte function to call
+ * @data: The leaf pte function closure
+ */
+struct page_range_apply {
+	struct pfn_range_apply pter;
+	pte_fn_t fn;
+	void *data;
+};
+
+/*
+ * Callback wrapper to enable use of apply_to_pfn_range for
+ * the apply_to_page_range interface
+ */
+static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
+				       unsigned long addr,
+				       struct pfn_range_apply *pter)
+{
+	struct page_range_apply *pra =
+		container_of(pter, typeof(*pra), pter);
+
+	return pra->fn(pte, token, addr, pra->data);
+}
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ *
+ * WARNING: Do not use this function unless you know exactly what you are
+ * doing. It is lacking support for huge pages and transparent huge pages.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+			unsigned long size, pte_fn_t fn, void *data)
+{
+	struct page_range_apply pra = {
+		.pter = {.mm = mm,
+			 .alloc = 1,
+			 .ptefn = apply_to_page_range_wrapper },
+		.fn = fn,
+		.data = data
+	};
+
+	return apply_to_pfn_range(&pra.pter, addr, size);
+}
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
-- 
cgit v1.2.3


From 4fe51e9e7902b5724b618dadd9527b1bbf2b55cc Mon Sep 17 00:00:00 2001
From: Thomas Hellstrom <thellstrom@vmware.com>
Date: Tue, 19 Mar 2019 13:12:30 +0100
Subject: mm: Add write-protect and clean utilities for address space ranges
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add two utilities to a) write-protect and b) clean all ptes pointing into
a range of an address space.
The utilities are intended to aid in tracking dirty pages (either
driver-allocated system memory or pci device memory).
The write-protect utility should be used in conjunction with
page_mkwrite() and pfn_mkwrite() to trigger write page-faults on page
accesses. Typically one would want to use this on sparse accesses into
large memory regions. The clean utility should be used to utilize
hardware dirtying functionality and avoid the overhead of page-faults,
typically on large accesses into small memory regions.

The added file "as_dirty_helpers.c" is initially listed as maintained by
VMware under our DRM driver. If somebody would like it elsewhere,
that's of course no problem.

Notable changes since RFC:
- Added comments to help avoid the usage of these function for VMAs
  it's not intended for. We also do advisory checks on the vm_flags and
  warn on illegal usage.
- Perform the pte modifications the same way softdirty does.
- Add mmu_notifier range invalidation calls.
- Add a config option so that this code is not unconditionally included.
- Tell the mmu_gather code about pending tlb flushes.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com> #v1
---
 MAINTAINERS           |   1 +
 include/linux/mm.h    |   9 +-
 mm/Kconfig            |   3 +
 mm/Makefile           |   1 +
 mm/as_dirty_helpers.c | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 313 insertions(+), 1 deletion(-)
 create mode 100644 mm/as_dirty_helpers.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 7a2f487ea49a..a55d4ef91b0b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5179,6 +5179,7 @@ T:	git git://people.freedesktop.org/~thomash/linux
 S:	Supported
 F:	drivers/gpu/drm/vmwgfx/
 F:	include/uapi/drm/vmwgfx_drm.h
+F:	mm/as_dirty_helpers.c
 
 DRM DRIVERS
 M:	David Airlie <airlied@linux.ie>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3d06ce2a64af..a0bc2a82917e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2685,7 +2685,14 @@ struct pfn_range_apply {
 };
 extern int apply_to_pfn_range(struct pfn_range_apply *closure,
 			      unsigned long address, unsigned long size);
-
+unsigned long apply_as_wrprotect(struct address_space *mapping,
+				 pgoff_t first_index, pgoff_t nr);
+unsigned long apply_as_clean(struct address_space *mapping,
+			     pgoff_t first_index, pgoff_t nr,
+			     pgoff_t bitmap_pgoff,
+			     unsigned long *bitmap,
+			     pgoff_t *start,
+			     pgoff_t *end);
 #ifdef CONFIG_PAGE_POISONING
 extern bool page_poisoning_enabled(void);
 extern void kernel_poison_pages(struct page *page, int numpages, int enable);
diff --git a/mm/Kconfig b/mm/Kconfig
index f0c76ba47695..5006d0e6a5c7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -765,4 +765,7 @@ config GUP_BENCHMARK
 config ARCH_HAS_PTE_SPECIAL
 	bool
 
+config AS_DIRTY_HELPERS
+        bool
+
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index ac5e5ba78874..f5d412bbc2f7 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,3 +104,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
 obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
 obj-$(CONFIG_HMM) += hmm.o
 obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_AS_DIRTY_HELPERS) += as_dirty_helpers.o
diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c
new file mode 100644
index 000000000000..f600e31534fb
--- /dev/null
+++ b/mm/as_dirty_helpers.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/hugetlb.h>
+#include <linux/bitops.h>
+#include <linux/mmu_notifier.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/**
+ * struct apply_as - Closure structure for apply_as_range
+ * @base: struct pfn_range_apply we derive from
+ * @start: Address of first modified pte
+ * @end: Address of last modified pte + 1
+ * @total: Total number of modified ptes
+ * @vma: Pointer to the struct vm_area_struct we're currently operating on
+ */
+struct apply_as {
+	struct pfn_range_apply base;
+	unsigned long start;
+	unsigned long end;
+	unsigned long total;
+	struct vm_area_struct *vma;
+};
+
+/**
+ * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
+ * @pte: Pointer to the pte
+ * @token: Page table token, see apply_to_pfn_range()
+ * @addr: The virtual page address
+ * @closure: Pointer to a struct pfn_range_apply embedded in a
+ * struct apply_as
+ *
+ * The function write-protects a pte and records the range in
+ * virtual address space of touched ptes for efficient range TLB flushes.
+ *
+ * Return: Always zero.
+ */
+static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
+			      unsigned long addr,
+			      struct pfn_range_apply *closure)
+{
+	struct apply_as *aas = container_of(closure, typeof(*aas), base);
+	pte_t ptent = *pte;
+
+	if (pte_write(ptent)) {
+		pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte);
+
+		ptent = pte_wrprotect(old_pte);
+		ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent);
+		aas->total++;
+		aas->start = min(aas->start, addr);
+		aas->end = max(aas->end, addr + PAGE_SIZE);
+	}
+
+	return 0;
+}
+
+/**
+ * struct apply_as_clean - Closure structure for apply_as_clean
+ * @base: struct apply_as we derive from
+ * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap
+ * @bitmap: Bitmap with one bit for each page offset in the address_space range
+ * covered.
+ * @start: Address_space page offset of first modified pte relative
+ * to @bitmap_pgoff
+ * @end: Address_space page offset of last modified pte relative
+ * to @bitmap_pgoff
+ */
+struct apply_as_clean {
+	struct apply_as base;
+	pgoff_t bitmap_pgoff;
+	unsigned long *bitmap;
+	pgoff_t start;
+	pgoff_t end;
+};
+
+/**
+ * apply_pt_clean - Leaf pte callback to clean a pte
+ * @pte: Pointer to the pte
+ * @token: Page table token, see apply_to_pfn_range()
+ * @addr: The virtual page address
+ * @closure: Pointer to a struct pfn_range_apply embedded in a
+ * struct apply_as_clean
+ *
+ * The function cleans a pte and records the range in
+ * virtual address space of touched ptes for efficient TLB flushes.
+ * It also records dirty ptes in a bitmap representing page offsets
+ * in the address_space, as well as the first and last of the bits
+ * touched.
+ *
+ * Return: Always zero.
+ */
+static int apply_pt_clean(pte_t *pte, pgtable_t token,
+			  unsigned long addr,
+			  struct pfn_range_apply *closure)
+{
+	struct apply_as *aas = container_of(closure, typeof(*aas), base);
+	struct apply_as_clean *clean = container_of(aas, typeof(*clean), base);
+	pte_t ptent = *pte;
+
+	if (pte_dirty(ptent)) {
+		pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) +
+			aas->vma->vm_pgoff - clean->bitmap_pgoff;
+		pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte);
+
+		ptent = pte_mkclean(old_pte);
+		ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent);
+
+		aas->total++;
+		aas->start = min(aas->start, addr);
+		aas->end = max(aas->end, addr + PAGE_SIZE);
+
+		__set_bit(pgoff, clean->bitmap);
+		clean->start = min(clean->start, pgoff);
+		clean->end = max(clean->end, pgoff + 1);
+	}
+
+	return 0;
+}
+
+/**
+ * apply_as_range - Apply a pte callback to all PTEs pointing into a range
+ * of an address_space.
+ * @mapping: Pointer to the struct address_space
+ * @aas: Closure structure
+ * @first_index: First page offset in the address_space
+ * @nr: Number of incremental page offsets to cover
+ *
+ * Return: Number of ptes touched. Note that this number might be larger
+ * than @nr if there are overlapping vmas
+ */
+static unsigned long apply_as_range(struct address_space *mapping,
+				    struct apply_as *aas,
+				    pgoff_t first_index, pgoff_t nr)
+{
+	struct vm_area_struct *vma;
+	pgoff_t vba, vea, cba, cea;
+	unsigned long start_addr, end_addr;
+	struct mmu_notifier_range range;
+
+	i_mmap_lock_read(mapping);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
+				  first_index + nr - 1) {
+		unsigned long vm_flags = READ_ONCE(vma->vm_flags);
+
+		/*
+		 * We can only do advisory flag tests below, since we can't
+		 * require the vm's mmap_sem to be held to protect the flags.
+		 * Therefore, callers that strictly depend on specific mmap
+		 * flags to remain constant throughout the operation must
+		 * either ensure those flags are immutable for all relevant
+		 * vmas or can't use this function. Fixing this properly would
+		 * require the vma::vm_flags to be protected by a separate
+		 * lock taken after the i_mmap_lock
+		 */
+
+		/* Skip non-applicable VMAs */
+		if ((vm_flags & (VM_SHARED | VM_WRITE)) !=
+		    (VM_SHARED | VM_WRITE))
+			continue;
+
+		/* Warn on and skip VMAs whose flags indicate illegal usage */
+		if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO))
+			continue;
+
+		/* Clip to the vma */
+		vba = vma->vm_pgoff;
+		vea = vba + vma_pages(vma);
+		cba = first_index;
+		cba = max(cba, vba);
+		cea = first_index + nr;
+		cea = min(cea, vea);
+
+		/* Translate to virtual address */
+		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
+		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
+		if (start_addr >= end_addr)
+			continue;
+
+		aas->base.mm = vma->vm_mm;
+		aas->vma = vma;
+		aas->start = end_addr;
+		aas->end = start_addr;
+
+		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
+					vma, vma->vm_mm, start_addr, end_addr);
+		mmu_notifier_invalidate_range_start(&range);
+
+		/* Needed when we only change protection? */
+		flush_cache_range(vma, start_addr, end_addr);
+
+		/*
+		 * We're not using tlb_gather_mmu() since typically
+		 * only a small subrange of PTEs are affected.
+		 */
+		inc_tlb_flush_pending(vma->vm_mm);
+
+		/* Should not error since aas->base.alloc == 0 */
+		WARN_ON(apply_to_pfn_range(&aas->base, start_addr,
+					   end_addr - start_addr));
+		if (aas->end > aas->start)
+			flush_tlb_range(vma, aas->start, aas->end);
+
+		mmu_notifier_invalidate_range_end(&range);
+		dec_tlb_flush_pending(vma->vm_mm);
+	}
+	i_mmap_unlock_read(mapping);
+
+	return aas->total;
+}
+
+/**
+ * apply_as_wrprotect - Write-protect all ptes in an address_space range
+ * @mapping: The address_space we want to write protect
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ *
+ * WARNING: This function should only be used for address spaces whose
+ * vmas are marked VM_IO and that do not contain huge pages.
+ * To avoid interference with COW'd pages, vmas not marked VM_SHARED are
+ * simply skipped.
+ *
+ * Return: The number of ptes actually write-protected. Note that
+ * already write-protected ptes are not counted.
+ */
+unsigned long apply_as_wrprotect(struct address_space *mapping,
+				 pgoff_t first_index, pgoff_t nr)
+{
+	struct apply_as aas = {
+		.base = {
+			.alloc = 0,
+			.ptefn = apply_pt_wrprotect,
+		},
+		.total = 0,
+	};
+
+	return apply_as_range(mapping, &aas, first_index, nr);
+}
+EXPORT_SYMBOL_GPL(apply_as_wrprotect);
+
+/**
+ * apply_as_clean - Clean all ptes in an address_space range
+ * @mapping: The address_space we want to clean
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ * @bitmap_pgoff: The page offset of the first bit in @bitmap
+ * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
+ * cover the whole range @first_index..@first_index + @nr.
+ * @start: Pointer to number of the first set bit in @bitmap.
+ * is modified as new bits are set by the function.
+ * @end: Pointer to the number of the last set bit in @bitmap.
+ * none set. The value is modified as new bits are set by the function.
+ *
+ * Note: When this function returns there is no guarantee that a CPU has
+ * not already dirtied new ptes. However it will not clean any ptes not
+ * reported in the bitmap.
+ *
+ * If a caller needs to make sure all dirty ptes are picked up and none
+ * additional are added, it first needs to write-protect the address-space
+ * range and make sure new writers are blocked in page_mkwrite() or
+ * pfn_mkwrite(). And then after a TLB flush following the write-protection
+ * pick up all dirty bits.
+ *
+ * WARNING: This function should only be used for address spaces whose
+ * vmas are marked VM_IO and that do not contain huge pages.
+ * To avoid interference with COW'd pages, vmas not marked VM_SHARED are
+ * simply skipped.
+ *
+ * Return: The number of dirty ptes actually cleaned.
+ */
+unsigned long apply_as_clean(struct address_space *mapping,
+			     pgoff_t first_index, pgoff_t nr,
+			     pgoff_t bitmap_pgoff,
+			     unsigned long *bitmap,
+			     pgoff_t *start,
+			     pgoff_t *end)
+{
+	bool none_set = (*start >= *end);
+	struct apply_as_clean clean = {
+		.base = {
+			.base = {
+				.alloc = 0,
+				.ptefn = apply_pt_clean,
+			},
+			.total = 0,
+		},
+		.bitmap_pgoff = bitmap_pgoff,
+		.bitmap = bitmap,
+		.start = none_set ? nr : *start,
+		.end = none_set ? 0 : *end,
+	};
+	unsigned long ret = apply_as_range(mapping, &clean.base, first_index,
+					   nr);
+
+	*start = clean.start;
+	*end = clean.end;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(apply_as_clean);
-- 
cgit v1.2.3


From 32d1f6985ceb6b9099bc9e02dc04e58660f28c16 Mon Sep 17 00:00:00 2001
From: Thomas Hellstrom <thellstrom@vmware.com>
Date: Fri, 12 Oct 2018 17:16:39 +0200
Subject: drm/ttm: Allow the driver to provide the ttm struct
 vm_operations_struct
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a pointer to the struct vm_operations_struct in the bo_device, and
assign that pointer to the default value currently used.

The driver can then optionally modify that pointer and the new value
can be used for each new vma created.

Cc: "Christian König" <christian.koenig@amd.com>

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c    | 1 +
 drivers/gpu/drm/ttm/ttm_bo_vm.c | 6 +++---
 include/drm/ttm/ttm_bo_driver.h | 6 ++++++
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index c7de667d482a..6953dd264172 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1739,6 +1739,7 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
 	mutex_lock(&ttm_global_mutex);
 	list_add_tail(&bdev->device_list, &glob->device_list);
 	mutex_unlock(&ttm_global_mutex);
+	bdev->vm_ops = &ttm_bo_vm_ops;
 
 	return 0;
 out_no_sys:
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 6dacff49c1cc..196e13a0adad 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -395,7 +395,7 @@ static int ttm_bo_vm_access(struct vm_area_struct *vma, unsigned long addr,
 	return ret;
 }
 
-static const struct vm_operations_struct ttm_bo_vm_ops = {
+const struct vm_operations_struct ttm_bo_vm_ops = {
 	.fault = ttm_bo_vm_fault,
 	.open = ttm_bo_vm_open,
 	.close = ttm_bo_vm_close,
@@ -448,7 +448,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
 	if (unlikely(ret != 0))
 		goto out_unref;
 
-	vma->vm_ops = &ttm_bo_vm_ops;
+	vma->vm_ops = bdev->vm_ops;
 
 	/*
 	 * Note: We're transferring the bo reference to
@@ -480,7 +480,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
 
 	ttm_bo_get(bo);
 
-	vma->vm_ops = &ttm_bo_vm_ops;
+	vma->vm_ops = bo->bdev->vm_ops;
 	vma->vm_private_data = bo;
 	vma->vm_flags |= VM_MIXEDMAP;
 	vma->vm_flags |= VM_IO | VM_DONTEXPAND;
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index c9b8ba492f24..a2d810a2504d 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -442,6 +442,9 @@ extern struct ttm_bo_global {
  * @driver: Pointer to a struct ttm_bo_driver struct setup by the driver.
  * @man: An array of mem_type_managers.
  * @vma_manager: Address space manager
+ * @vm_ops: Pointer to the struct vm_operations_struct used for this
+ * device's VM operations. The driver may override this before the first
+ * mmap() call.
  * lru_lock: Spinlock that protects the buffer+device lru lists and
  * ddestroy lists.
  * @dev_mapping: A pointer to the struct address_space representing the
@@ -460,6 +463,7 @@ struct ttm_bo_device {
 	struct ttm_bo_global *glob;
 	struct ttm_bo_driver *driver;
 	struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
+	const struct vm_operations_struct *vm_ops;
 
 	/*
 	 * Protected by internal locks.
@@ -488,6 +492,8 @@ struct ttm_bo_device {
 	bool no_retry;
 };
 
+extern const struct vm_operations_struct ttm_bo_vm_ops;
+
 /**
  * struct ttm_lru_bulk_move_pos
  *
-- 
cgit v1.2.3


From 7a39f35ce43f96ff293cc90f845416d03566b14b Mon Sep 17 00:00:00 2001
From: Thomas Hellstrom <thellstrom@vmware.com>
Date: Wed, 6 Feb 2019 12:55:08 +0100
Subject: drm/ttm: TTM fault handler helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the vmwgfx dirty tracking, the default TTM fault handler is not
completely sufficient (vmwgfx need to modify the vma->vm_flags member,
and also needs to restrict the number of prefaults).

We also want to replicate the new ttm_bo_vm_reserve() functionality

So start turning the TTM vm code into helpers: ttm_bo_vm_fault_reserved()
and ttm_bo_vm_reserve(), and provide a default TTM fault handler for other
drivers to use.

Cc: "Christian König" <christian.koenig@amd.com>

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: "Christian König" <christian.koenig@amd.com> #v1
---
 drivers/gpu/drm/ttm/ttm_bo_vm.c | 163 +++++++++++++++++++++++++---------------
 include/drm/ttm/ttm_bo_api.h    |  10 +++
 2 files changed, 111 insertions(+), 62 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 196e13a0adad..0c4576cbafcf 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -42,8 +42,6 @@
 #include <linux/uaccess.h>
 #include <linux/mem_encrypt.h>
 
-#define TTM_BO_VM_NUM_PREFAULT 16
-
 static vm_fault_t ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
 				struct vm_fault *vmf)
 {
@@ -106,25 +104,30 @@ static unsigned long ttm_bo_io_mem_pfn(struct ttm_buffer_object *bo,
 		+ page_offset;
 }
 
-static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
+/**
+ * ttm_bo_vm_reserve - Reserve a buffer object in a retryable vm callback
+ * @bo: The buffer object
+ * @vmf: The fault structure handed to the callback
+ *
+ * vm callbacks like fault() and *_mkwrite() allow for the mm_sem to be dropped
+ * during long waits, and after the wait the callback will be restarted. This
+ * is to allow other threads using the same virtual memory space concurrent
+ * access to map(), unmap() completely unrelated buffer objects. TTM buffer
+ * object reservations sometimes wait for GPU and should therefore be
+ * considered long waits. This function reserves the buffer object interruptibly
+ * taking this into account. Starvation is avoided by the vm system not
+ * allowing too many repeated restarts.
+ * This function is intended to be used in customized fault() and _mkwrite()
+ * handlers.
+ *
+ * Return:
+ *    0 on success and the bo was reserved.
+ *    VM_FAULT_RETRY if blocking wait.
+ *    VM_FAULT_NOPAGE if blocking wait and retrying was not allowed.
+ */
+vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
+			     struct vm_fault *vmf)
 {
-	struct vm_area_struct *vma = vmf->vma;
-	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
-	    vma->vm_private_data;
-	struct ttm_bo_device *bdev = bo->bdev;
-	unsigned long page_offset;
-	unsigned long page_last;
-	unsigned long pfn;
-	struct ttm_tt *ttm = NULL;
-	struct page *page;
-	int err;
-	int i;
-	vm_fault_t ret = VM_FAULT_NOPAGE;
-	unsigned long address = vmf->address;
-	struct ttm_mem_type_manager *man =
-		&bdev->man[bo->mem.mem_type];
-	struct vm_area_struct cvma;
-
 	/*
 	 * Work around locking order reversal in fault / nopfn
 	 * between mmap_sem and bo_reserve: Perform a trylock operation
@@ -151,14 +154,55 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
 		return VM_FAULT_NOPAGE;
 	}
 
+	return 0;
+}
+EXPORT_SYMBOL(ttm_bo_vm_reserve);
+
+/**
+ * ttm_bo_vm_fault_reserved - TTM fault helper
+ * @vmf: The struct vm_fault given as argument to the fault callback
+ * @prot: The page protection to be used for this memory area.
+ * @num_prefault: Maximum number of prefault pages. The caller may want to
+ * specify this based on madvice settings and the size of the GPU object
+ * backed by the memory.
+ *
+ * This function inserts one or more page table entries pointing to the
+ * memory backing the buffer object, and then returns a return code
+ * instructing the caller to retry the page access.
+ *
+ * Return:
+ *   VM_FAULT_NOPAGE on success or pending signal
+ *   VM_FAULT_SIGBUS on unspecified error
+ *   VM_FAULT_OOM on out-of-memory
+ *   VM_FAULT_RETRY if retryable wait
+ */
+vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
+				    pgprot_t prot,
+				    pgoff_t num_prefault)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct vm_area_struct cvma = *vma;
+	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
+	    vma->vm_private_data;
+	struct ttm_bo_device *bdev = bo->bdev;
+	unsigned long page_offset;
+	unsigned long page_last;
+	unsigned long pfn;
+	struct ttm_tt *ttm = NULL;
+	struct page *page;
+	int err;
+	pgoff_t i;
+	vm_fault_t ret = VM_FAULT_NOPAGE;
+	unsigned long address = vmf->address;
+	struct ttm_mem_type_manager *man =
+		&bdev->man[bo->mem.mem_type];
+
 	/*
 	 * Refuse to fault imported pages. This should be handled
 	 * (if at all) by redirecting mmap to the exporter.
 	 */
-	if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG)) {
-		ret = VM_FAULT_SIGBUS;
-		goto out_unlock;
-	}
+	if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG))
+		return VM_FAULT_SIGBUS;
 
 	if (bdev->driver->fault_reserve_notify) {
 		struct dma_fence *moving = dma_fence_get(bo->moving);
@@ -169,11 +213,9 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
 			break;
 		case -EBUSY:
 		case -ERESTARTSYS:
-			ret = VM_FAULT_NOPAGE;
-			goto out_unlock;
+			return VM_FAULT_NOPAGE;
 		default:
-			ret = VM_FAULT_SIGBUS;
-			goto out_unlock;
+			return VM_FAULT_SIGBUS;
 		}
 
 		if (bo->moving != moving) {
@@ -189,21 +231,12 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
 	 * move.
 	 */
 	ret = ttm_bo_vm_fault_idle(bo, vmf);
-	if (unlikely(ret != 0)) {
-		if (ret == VM_FAULT_RETRY &&
-		    !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
-			/* The BO has already been unreserved. */
-			return ret;
-		}
-
-		goto out_unlock;
-	}
+	if (unlikely(ret != 0))
+		return ret;
 
 	err = ttm_mem_io_lock(man, true);
-	if (unlikely(err != 0)) {
-		ret = VM_FAULT_NOPAGE;
-		goto out_unlock;
-	}
+	if (unlikely(err != 0))
+		return VM_FAULT_NOPAGE;
 	err = ttm_mem_io_reserve_vm(bo);
 	if (unlikely(err != 0)) {
 		ret = VM_FAULT_SIGBUS;
@@ -220,18 +253,8 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
 		goto out_io_unlock;
 	}
 
-	/*
-	 * Make a local vma copy to modify the page_prot member
-	 * and vm_flags if necessary. The vma parameter is protected
-	 * by mmap_sem in write mode.
-	 */
-	cvma = *vma;
-	cvma.vm_page_prot = vm_get_page_prot(cvma.vm_flags);
-
-	if (bo->mem.bus.is_iomem) {
-		cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
-						cvma.vm_page_prot);
-	} else {
+	cvma.vm_page_prot = ttm_io_prot(bo->mem.placement, prot);
+	if (!bo->mem.bus.is_iomem) {
 		struct ttm_operation_ctx ctx = {
 			.interruptible = false,
 			.no_wait_gpu = false,
@@ -240,24 +263,21 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
 		};
 
 		ttm = bo->ttm;
-		cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
-						cvma.vm_page_prot);
-
-		/* Allocate all page at once, most common usage */
-		if (ttm_tt_populate(ttm, &ctx)) {
+		if (ttm_tt_populate(bo->ttm, &ctx)) {
 			ret = VM_FAULT_OOM;
 			goto out_io_unlock;
 		}
+	} else {
+		/* Iomem should not be marked encrypted */
+		cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
 	}
 
 	/*
 	 * Speculatively prefault a number of pages. Only error on
 	 * first page.
 	 */
-	for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
+	for (i = 0; i < num_prefault; ++i) {
 		if (bo->mem.bus.is_iomem) {
-			/* Iomem should not be marked encrypted */
-			cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
 			pfn = ttm_bo_io_mem_pfn(bo, page_offset);
 		} else {
 			page = ttm->pages[page_offset];
@@ -295,7 +315,26 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
 	ret = VM_FAULT_NOPAGE;
 out_io_unlock:
 	ttm_mem_io_unlock(man);
-out_unlock:
+	return ret;
+}
+EXPORT_SYMBOL(ttm_bo_vm_fault_reserved);
+
+static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	pgprot_t prot;
+	struct ttm_buffer_object *bo = vma->vm_private_data;
+	vm_fault_t ret;
+
+	ret = ttm_bo_vm_reserve(bo, vmf);
+	if (ret)
+		return ret;
+
+	prot = vm_get_page_prot(vma->vm_flags);
+	ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
+	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
+		return ret;
+
 	reservation_object_unlock(bo->resv);
 	return ret;
 }
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 49d9cdfc58f2..435d02f719a8 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -768,4 +768,14 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
 			struct ttm_operation_ctx *ctx);
 void ttm_bo_swapout_all(struct ttm_bo_device *bdev);
 int ttm_bo_wait_unreserved(struct ttm_buffer_object *bo);
+
+/* Default number of pre-faulted pages in the TTM fault handler */
+#define TTM_BO_VM_NUM_PREFAULT 16
+
+vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
+			     struct vm_fault *vmf);
+
+vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
+				    pgprot_t prot,
+				    pgoff_t num_prefault);
 #endif
-- 
cgit v1.2.3


From 4ba397671237784a212378c271f700e99c66cf39 Mon Sep 17 00:00:00 2001
From: Thomas Hellstrom <thellstrom@vmware.com>
Date: Tue, 19 Mar 2019 13:27:50 +0100
Subject: drm/vmwgfx: Add surface dirty-tracking callbacks

Add the callbacks necessary to implement emulated coherent memory for
surfaces. Add a flag to the gb_surface_create ioctl to indicate that
surface memory should be coherent.
Also bump the drm minor version to signal the availability of coherent
surfaces.

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Deepak Rawat <drawat@vmware.com>
---
 .../drm/vmwgfx/device_include/svga3d_surfacedefs.h | 233 +++++++++++-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.h                |   4 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_surface.c            | 395 ++++++++++++++++++++-
 include/uapi/drm/vmwgfx_drm.h                      |   4 +-
 4 files changed, 629 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h b/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
index f2bfd3d80598..61414f105c67 100644
--- a/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
+++ b/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
@@ -1280,7 +1280,6 @@ svga3dsurface_get_pixel_offset(SVGA3dSurfaceFormat format,
 	return offset;
 }
 
-
 static inline u32
 svga3dsurface_get_image_offset(SVGA3dSurfaceFormat format,
 			       surf_size_struct baseLevelSize,
@@ -1375,4 +1374,236 @@ svga3dsurface_is_screen_target_format(SVGA3dSurfaceFormat format)
 	return svga3dsurface_is_dx_screen_target_format(format);
 }
 
+/**
+ * struct svga3dsurface_mip - Mimpmap level information
+ * @bytes: Bytes required in the backing store of this mipmap level.
+ * @img_stride: Byte stride per image.
+ * @row_stride: Byte stride per block row.
+ * @size: The size of the mipmap.
+ */
+struct svga3dsurface_mip {
+	size_t bytes;
+	size_t img_stride;
+	size_t row_stride;
+	struct drm_vmw_size size;
+
+};
+
+/**
+ * struct svga3dsurface_cache - Cached surface information
+ * @desc: Pointer to the surface descriptor
+ * @mip: Array of mipmap level information. Valid size is @num_mip_levels.
+ * @mip_chain_bytes: Bytes required in the backing store for the whole chain
+ * of mip levels.
+ * @sheet_bytes: Bytes required in the backing store for a sheet
+ * representing a single sample.
+ * @num_mip_levels: Valid size of the @mip array. Number of mipmap levels in
+ * a chain.
+ * @num_layers: Number of slices in an array texture or number of faces in
+ * a cubemap texture.
+ */
+struct svga3dsurface_cache {
+	const struct svga3d_surface_desc *desc;
+	struct svga3dsurface_mip mip[DRM_VMW_MAX_MIP_LEVELS];
+	size_t mip_chain_bytes;
+	size_t sheet_bytes;
+	u32 num_mip_levels;
+	u32 num_layers;
+};
+
+/**
+ * struct svga3dsurface_loc - Surface location
+ * @sub_resource: Surface subresource. Defined as layer * num_mip_levels +
+ * mip_level.
+ * @x: X coordinate.
+ * @y: Y coordinate.
+ * @z: Z coordinate.
+ */
+struct svga3dsurface_loc {
+	u32 sub_resource;
+	u32 x, y, z;
+};
+
+/**
+ * svga3dsurface_subres - Compute the subresource from layer and mipmap.
+ * @cache: Surface layout data.
+ * @mip_level: The mipmap level.
+ * @layer: The surface layer (face or array slice).
+ *
+ * Return: The subresource.
+ */
+static inline u32 svga3dsurface_subres(const struct svga3dsurface_cache *cache,
+				       u32 mip_level, u32 layer)
+{
+	return cache->num_mip_levels * layer + mip_level;
+}
+
+/**
+ * svga3dsurface_setup_cache - Build a surface cache entry
+ * @size: The surface base level dimensions.
+ * @format: The surface format.
+ * @num_mip_levels: Number of mipmap levels.
+ * @num_layers: Number of layers.
+ * @cache: Pointer to a struct svga3dsurface_cach object to be filled in.
+ *
+ * Return: Zero on success, -EINVAL on invalid surface layout.
+ */
+static inline int svga3dsurface_setup_cache(const struct drm_vmw_size *size,
+					    SVGA3dSurfaceFormat format,
+					    u32 num_mip_levels,
+					    u32 num_layers,
+					    u32 num_samples,
+					    struct svga3dsurface_cache *cache)
+{
+	const struct svga3d_surface_desc *desc;
+	u32 i;
+
+	memset(cache, 0, sizeof(*cache));
+	cache->desc = desc = svga3dsurface_get_desc(format);
+	cache->num_mip_levels = num_mip_levels;
+	cache->num_layers = num_layers;
+	for (i = 0; i < cache->num_mip_levels; i++) {
+		struct svga3dsurface_mip *mip = &cache->mip[i];
+
+		mip->size = svga3dsurface_get_mip_size(*size, i);
+		mip->bytes = svga3dsurface_get_image_buffer_size
+			(desc, &mip->size, 0);
+		mip->row_stride =
+			__KERNEL_DIV_ROUND_UP(mip->size.width,
+					      desc->block_size.width) *
+			desc->bytes_per_block * num_samples;
+		if (!mip->row_stride)
+			goto invalid_dim;
+
+		mip->img_stride =
+			__KERNEL_DIV_ROUND_UP(mip->size.height,
+					      desc->block_size.height) *
+			mip->row_stride;
+		if (!mip->img_stride)
+			goto invalid_dim;
+
+		cache->mip_chain_bytes += mip->bytes;
+	}
+	cache->sheet_bytes = cache->mip_chain_bytes * num_layers;
+	if (!cache->sheet_bytes)
+		goto invalid_dim;
+
+	return 0;
+
+invalid_dim:
+	VMW_DEBUG_USER("Invalid surface layout for dirty tracking.\n");
+	return -EINVAL;
+}
+
+/**
+ * svga3dsurface_get_loc - Get a surface location from an offset into the
+ * backing store
+ * @cache: Surface layout data.
+ * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
+ * @offset: Offset into the surface backing store.
+ */
+static inline void
+svga3dsurface_get_loc(const struct svga3dsurface_cache *cache,
+		      struct svga3dsurface_loc *loc,
+		      size_t offset)
+{
+	const struct svga3dsurface_mip *mip = &cache->mip[0];
+	const struct svga3d_surface_desc *desc = cache->desc;
+	u32 layer;
+	int i;
+
+	if (offset >= cache->sheet_bytes)
+		offset %= cache->sheet_bytes;
+
+	layer = offset / cache->mip_chain_bytes;
+	offset -= layer * cache->mip_chain_bytes;
+	for (i = 0; i < cache->num_mip_levels; ++i, ++mip) {
+		if (mip->bytes > offset)
+			break;
+		offset -= mip->bytes;
+	}
+
+	loc->sub_resource = svga3dsurface_subres(cache, i, layer);
+	loc->z = offset / mip->img_stride;
+	offset -= loc->z * mip->img_stride;
+	loc->z *= desc->block_size.depth;
+	loc->y = offset / mip->row_stride;
+	offset -= loc->y * mip->row_stride;
+	loc->y *= desc->block_size.height;
+	loc->x = offset / desc->bytes_per_block;
+	loc->x *= desc->block_size.width;
+}
+
+/**
+ * svga3dsurface_inc_loc - Clamp increment a surface location with one block
+ * size
+ * in each dimension.
+ * @loc: Pointer to a struct svga3dsurface_loc to be incremented.
+ *
+ * When computing the size of a range as size = end - start, the range does not
+ * include the end element. However a location representing the last byte
+ * of a touched region in the backing store *is* included in the range.
+ * This function modifies such a location to match the end definition
+ * given as start + size which is the one used in a SVGA3dBox.
+ */
+static inline void
+svga3dsurface_inc_loc(const struct svga3dsurface_cache *cache,
+		      struct svga3dsurface_loc *loc)
+{
+	const struct svga3d_surface_desc *desc = cache->desc;
+	u32 mip = loc->sub_resource % cache->num_mip_levels;
+	const struct drm_vmw_size *size = &cache->mip[mip].size;
+
+	loc->sub_resource++;
+	loc->x += desc->block_size.width;
+	if (loc->x > size->width)
+		loc->x = size->width;
+	loc->y += desc->block_size.height;
+	if (loc->y > size->height)
+		loc->y = size->height;
+	loc->z += desc->block_size.depth;
+	if (loc->z > size->depth)
+		loc->z = size->depth;
+}
+
+/**
+ * svga3dsurface_min_loc - The start location in a subresource
+ * @cache: Surface layout data.
+ * @sub_resource: The subresource.
+ * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
+ */
+static inline void
+svga3dsurface_min_loc(const struct svga3dsurface_cache *cache,
+		      u32 sub_resource,
+		      struct svga3dsurface_loc *loc)
+{
+	loc->sub_resource = sub_resource;
+	loc->x = loc->y = loc->z = 0;
+}
+
+/**
+ * svga3dsurface_min_loc - The end location in a subresource
+ * @cache: Surface layout data.
+ * @sub_resource: The subresource.
+ * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
+ *
+ * Following the end definition given in svga3dsurface_inc_loc(),
+ * Compute the end location of a surface subresource.
+ */
+static inline void
+svga3dsurface_max_loc(const struct svga3dsurface_cache *cache,
+		      u32 sub_resource,
+		      struct svga3dsurface_loc *loc)
+{
+	const struct drm_vmw_size *size;
+	u32 mip;
+
+	loc->sub_resource = sub_resource + 1;
+	mip = sub_resource % cache->num_mip_levels;
+	size = &cache->mip[mip].size;
+	loc->x = size->width;
+	loc->y = size->height;
+	loc->z = size->depth;
+}
+
 #endif /* _SVGA3D_SURFACEDEFS_H_ */
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
index dae3a39bf402..5971c0d47507 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
@@ -44,9 +44,9 @@
 #include <linux/sync_file.h>
 
 #define VMWGFX_DRIVER_NAME "vmwgfx"
-#define VMWGFX_DRIVER_DATE "20180704"
+#define VMWGFX_DRIVER_DATE "20190328"
 #define VMWGFX_DRIVER_MAJOR 2
-#define VMWGFX_DRIVER_MINOR 15
+#define VMWGFX_DRIVER_MINOR 16
 #define VMWGFX_DRIVER_PATCHLEVEL 0
 #define VMWGFX_FIFO_STATIC_SIZE (1024*1024)
 #define VMWGFX_MAX_RELOCATIONS 2048
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
index c40d44f4d9af..637043f1befa 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
@@ -68,6 +68,20 @@ struct vmw_surface_offset {
 	uint32_t bo_offset;
 };
 
+/**
+ * vmw_surface_dirty - Surface dirty-tracker
+ * @cache: Cached layout information of the surface.
+ * @size: Accounting size for the struct vmw_surface_dirty.
+ * @num_subres: Number of subresources.
+ * @boxes: Array of SVGA3dBoxes indicating dirty regions. One per subresource.
+ */
+struct vmw_surface_dirty {
+	struct svga3dsurface_cache cache;
+	size_t size;
+	u32 num_subres;
+	SVGA3dBox boxes[0];
+};
+
 static void vmw_user_surface_free(struct vmw_resource *res);
 static struct vmw_resource *
 vmw_user_surface_base_to_res(struct ttm_base_object *base);
@@ -96,6 +110,13 @@ vmw_gb_surface_reference_internal(struct drm_device *dev,
 				  struct drm_vmw_gb_surface_ref_ext_rep *rep,
 				  struct drm_file *file_priv);
 
+static void vmw_surface_dirty_free(struct vmw_resource *res);
+static int vmw_surface_dirty_alloc(struct vmw_resource *res);
+static int vmw_surface_dirty_sync(struct vmw_resource *res);
+static void vmw_surface_dirty_range_add(struct vmw_resource *res, size_t start,
+					size_t end);
+static int vmw_surface_clean(struct vmw_resource *res);
+
 static const struct vmw_user_resource_conv user_surface_conv = {
 	.object_type = VMW_RES_SURFACE,
 	.base_obj_to_res = vmw_user_surface_base_to_res,
@@ -133,7 +154,12 @@ static const struct vmw_res_func vmw_gb_surface_func = {
 	.create = vmw_gb_surface_create,
 	.destroy = vmw_gb_surface_destroy,
 	.bind = vmw_gb_surface_bind,
-	.unbind = vmw_gb_surface_unbind
+	.unbind = vmw_gb_surface_unbind,
+	.dirty_alloc = vmw_surface_dirty_alloc,
+	.dirty_free = vmw_surface_dirty_free,
+	.dirty_sync = vmw_surface_dirty_sync,
+	.dirty_range_add = vmw_surface_dirty_range_add,
+	.clean = vmw_surface_clean,
 };
 
 /**
@@ -641,6 +667,7 @@ static void vmw_user_surface_free(struct vmw_resource *res)
 	struct vmw_private *dev_priv = srf->res.dev_priv;
 	uint32_t size = user_srf->size;
 
+	WARN_ON_ONCE(res->dirty);
 	if (user_srf->master)
 		drm_master_put(&user_srf->master);
 	kfree(srf->offsets);
@@ -1174,10 +1201,16 @@ static int vmw_gb_surface_bind(struct vmw_resource *res,
 		cmd2->header.id = SVGA_3D_CMD_UPDATE_GB_SURFACE;
 		cmd2->header.size = sizeof(cmd2->body);
 		cmd2->body.sid = res->id;
-		res->backup_dirty = false;
 	}
 	vmw_fifo_commit(dev_priv, submit_size);
 
+	if (res->backup->dirty && res->backup_dirty) {
+		/* We've just made a full upload. Cear dirty regions. */
+		vmw_bo_dirty_clear_res(res);
+	}
+
+	res->backup_dirty = false;
+
 	return 0;
 }
 
@@ -1642,7 +1675,8 @@ vmw_gb_surface_define_internal(struct drm_device *dev,
 			}
 		}
 	} else if (req->base.drm_surface_flags &
-		   drm_vmw_surface_flag_create_buffer)
+		   (drm_vmw_surface_flag_create_buffer |
+		    drm_vmw_surface_flag_coherent))
 		ret = vmw_user_bo_alloc(dev_priv, tfile,
 					res->backup_size,
 					req->base.drm_surface_flags &
@@ -1656,6 +1690,26 @@ vmw_gb_surface_define_internal(struct drm_device *dev,
 		goto out_unlock;
 	}
 
+	if (req->base.drm_surface_flags & drm_vmw_surface_flag_coherent) {
+		struct vmw_buffer_object *backup = res->backup;
+
+		ttm_bo_reserve(&backup->base, false, false, NULL);
+		if (!res->func->dirty_alloc)
+			ret = -EINVAL;
+		if (!ret)
+			ret = vmw_bo_dirty_add(backup);
+		if (!ret) {
+			res->coherent = true;
+			ret = res->func->dirty_alloc(res);
+		}
+		ttm_bo_unreserve(&backup->base);
+		if (ret) {
+			vmw_resource_unreference(&res);
+			goto out_unlock;
+		}
+
+	}
+
 	tmp = vmw_resource_reference(res);
 	ret = ttm_prime_object_init(tfile, res->backup_size, &user_srf->prime,
 				    req->base.drm_surface_flags &
@@ -1764,3 +1818,338 @@ out_bad_resource:
 
 	return ret;
 }
+
+/**
+ * vmw_subres_dirty_add - Add a dirty region to a subresource
+ * @dirty: The surfaces's dirty tracker.
+ * @loc_start: The location corresponding to the start of the region.
+ * @loc_end: The location corresponding to the end of the region.
+ *
+ * As we are assuming that @loc_start and @loc_end represent a sequential
+ * range of backing store memory, if the region spans multiple lines then
+ * regardless of the x coordinate, the full lines are dirtied.
+ * Correspondingly if the region spans multiple z slices, then full rather
+ * than partial z slices are dirtied.
+ */
+static void vmw_subres_dirty_add(struct vmw_surface_dirty *dirty,
+				 const struct svga3dsurface_loc *loc_start,
+				 const struct svga3dsurface_loc *loc_end)
+{
+	const struct svga3dsurface_cache *cache = &dirty->cache;
+	SVGA3dBox *box = &dirty->boxes[loc_start->sub_resource];
+	u32 mip = loc_start->sub_resource % cache->num_mip_levels;
+	const struct drm_vmw_size *size = &cache->mip[mip].size;
+	u32 box_c2 = box->z + box->d;
+
+	if (WARN_ON(loc_start->sub_resource >= dirty->num_subres))
+		return;
+
+	if (box->d == 0 || box->z > loc_start->z)
+		box->z = loc_start->z;
+	if (box_c2 < loc_end->z)
+		box->d = loc_end->z - box->z;
+
+	if (loc_start->z + 1 == loc_end->z) {
+		box_c2 = box->y + box->h;
+		if (box->h == 0 || box->y > loc_start->y)
+			box->y = loc_start->y;
+		if (box_c2 < loc_end->y)
+			box->h = loc_end->y - box->y;
+
+		if (loc_start->y + 1 == loc_end->y) {
+			box_c2 = box->x + box->w;
+			if (box->w == 0 || box->x > loc_start->x)
+				box->x = loc_start->x;
+			if (box_c2 < loc_end->x)
+				box->w = loc_end->x - box->x;
+		} else {
+			box->x = 0;
+			box->w = size->width;
+		}
+	} else {
+		box->y = 0;
+		box->h = size->height;
+		box->x = 0;
+		box->w = size->width;
+	}
+}
+
+/**
+ * vmw_subres_dirty_full - Mark a full subresource as dirty
+ * @dirty: The surface's dirty tracker.
+ * @subres: The subresource
+ */
+static void vmw_subres_dirty_full(struct vmw_surface_dirty *dirty, u32 subres)
+{
+	const struct svga3dsurface_cache *cache = &dirty->cache;
+	u32 mip = subres % cache->num_mip_levels;
+	const struct drm_vmw_size *size = &cache->mip[mip].size;
+	SVGA3dBox *box = &dirty->boxes[subres];
+
+	box->x = 0;
+	box->y = 0;
+	box->z = 0;
+	box->w = size->width;
+	box->h = size->height;
+	box->d = size->depth;
+}
+
+/*
+ * vmw_surface_tex_dirty_add_range - The dirty_add_range callback for texture
+ * surfaces.
+ */
+static void vmw_surface_tex_dirty_range_add(struct vmw_resource *res,
+					    size_t start, size_t end)
+{
+	struct vmw_surface_dirty *dirty =
+		(struct vmw_surface_dirty *) res->dirty;
+	size_t backup_end = res->backup_offset + res->backup_size;
+	struct svga3dsurface_loc loc1, loc2;
+	const struct svga3dsurface_cache *cache;
+
+	start = max_t(size_t, start, res->backup_offset) - res->backup_offset;
+	end = min(end, backup_end) - res->backup_offset;
+	cache = &dirty->cache;
+	svga3dsurface_get_loc(cache, &loc1, start);
+	svga3dsurface_get_loc(cache, &loc2, end - 1);
+	svga3dsurface_inc_loc(cache, &loc2);
+
+	if (loc1.sub_resource + 1 == loc2.sub_resource) {
+		/* Dirty range covers a single sub-resource */
+		vmw_subres_dirty_add(dirty, &loc1, &loc2);
+	} else {
+		/* Dirty range covers multiple sub-resources */
+		struct svga3dsurface_loc loc_min, loc_max;
+		u32 sub_res = loc1.sub_resource;
+
+		svga3dsurface_max_loc(cache, loc1.sub_resource, &loc_max);
+		vmw_subres_dirty_add(dirty, &loc1, &loc_max);
+		svga3dsurface_min_loc(cache, loc2.sub_resource - 1, &loc_min);
+		vmw_subres_dirty_add(dirty, &loc_min, &loc2);
+		for (sub_res = loc1.sub_resource + 1;
+		     sub_res < loc2.sub_resource - 1; ++sub_res)
+			vmw_subres_dirty_full(dirty, sub_res);
+	}
+}
+
+/*
+ * vmw_surface_tex_dirty_add_range - The dirty_add_range callback for buffer
+ * surfaces.
+ */
+static void vmw_surface_buf_dirty_range_add(struct vmw_resource *res,
+					    size_t start, size_t end)
+{
+	struct vmw_surface_dirty *dirty =
+		(struct vmw_surface_dirty *) res->dirty;
+	const struct svga3dsurface_cache *cache = &dirty->cache;
+	size_t backup_end = res->backup_offset + cache->mip_chain_bytes;
+	SVGA3dBox *box = &dirty->boxes[0];
+	u32 box_c2;
+
+	box->h = box->d = 1;
+	start = max_t(size_t, start, res->backup_offset) - res->backup_offset;
+	end = min(end, backup_end) - res->backup_offset;
+	box_c2 = box->x + box->w;
+	if (box->w == 0 || box->x > start)
+		box->x = start;
+	if (box_c2 < end)
+		box->w = end - box->x;
+}
+
+/*
+ * vmw_surface_tex_dirty_add_range - The dirty_add_range callback for surfaces
+ */
+static void vmw_surface_dirty_range_add(struct vmw_resource *res, size_t start,
+					size_t end)
+{
+	struct vmw_surface *srf = vmw_res_to_srf(res);
+
+	if (WARN_ON(end <= res->backup_offset ||
+		    start >= res->backup_offset + res->backup_size))
+		return;
+
+	if (srf->format == SVGA3D_BUFFER)
+		vmw_surface_buf_dirty_range_add(res, start, end);
+	else
+		vmw_surface_tex_dirty_range_add(res, start, end);
+}
+
+/*
+ * vmw_surface_dirty_sync - The surface's dirty_sync callback.
+ */
+static int vmw_surface_dirty_sync(struct vmw_resource *res)
+{
+	struct vmw_private *dev_priv = res->dev_priv;
+	bool has_dx = 0;
+	u32 i, num_dirty;
+	struct vmw_surface_dirty *dirty =
+		(struct vmw_surface_dirty *) res->dirty;
+	size_t alloc_size;
+	const struct svga3dsurface_cache *cache = &dirty->cache;
+	struct {
+		SVGA3dCmdHeader header;
+		SVGA3dCmdDXUpdateSubResource body;
+	} *cmd1;
+	struct {
+		SVGA3dCmdHeader header;
+		SVGA3dCmdUpdateGBImage body;
+	} *cmd2;
+	void *cmd;
+
+	num_dirty = 0;
+	for (i = 0; i < dirty->num_subres; ++i) {
+		const SVGA3dBox *box = &dirty->boxes[i];
+
+		if (box->d)
+			num_dirty++;
+	}
+
+	if (!num_dirty)
+		goto out;
+
+	alloc_size = num_dirty * ((has_dx) ? sizeof(*cmd1) : sizeof(*cmd2));
+	cmd = VMW_FIFO_RESERVE(dev_priv, alloc_size);
+	if (!cmd)
+		return -ENOMEM;
+
+	cmd1 = cmd;
+	cmd2 = cmd;
+
+	for (i = 0; i < dirty->num_subres; ++i) {
+		const SVGA3dBox *box = &dirty->boxes[i];
+
+		if (!box->d)
+			continue;
+
+		/*
+		 * DX_UPDATE_SUBRESOURCE is aware of array surfaces.
+		 * UPDATE_GB_IMAGE is not.
+		 */
+		if (has_dx) {
+			cmd1->header.id = SVGA_3D_CMD_DX_UPDATE_SUBRESOURCE;
+			cmd1->header.size = sizeof(cmd1->body);
+			cmd1->body.sid = res->id;
+			cmd1->body.subResource = i;
+			cmd1->body.box = *box;
+			cmd1++;
+		} else {
+			cmd2->header.id = SVGA_3D_CMD_UPDATE_GB_IMAGE;
+			cmd2->header.size = sizeof(cmd2->body);
+			cmd2->body.image.sid = res->id;
+			cmd2->body.image.face = i / cache->num_mip_levels;
+			cmd2->body.image.mipmap = i -
+				(cache->num_mip_levels * cmd2->body.image.face);
+			cmd2->body.box = *box;
+			cmd2++;
+		}
+
+	}
+	vmw_fifo_commit(dev_priv, alloc_size);
+ out:
+	memset(&dirty->boxes[0], 0, sizeof(dirty->boxes[0]) *
+	       dirty->num_subres);
+
+	return 0;
+}
+
+/*
+ * vmw_surface_dirty_alloc - The surface's dirty_alloc callback.
+ */
+static int vmw_surface_dirty_alloc(struct vmw_resource *res)
+{
+	struct vmw_surface *srf = vmw_res_to_srf(res);
+	struct vmw_surface_dirty *dirty;
+	u32 num_layers = 1;
+	u32 num_mip;
+	u32 num_subres;
+	u32 num_samples;
+	size_t dirty_size, acc_size;
+	static struct ttm_operation_ctx ctx = {
+		.interruptible = false,
+		.no_wait_gpu = false
+	};
+	int ret;
+
+	if (srf->array_size)
+		num_layers = srf->array_size;
+	else if (srf->flags & SVGA3D_SURFACE_CUBEMAP)
+		num_layers *= SVGA3D_MAX_SURFACE_FACES;
+
+	num_mip = srf->mip_levels[0];
+	if (!num_mip)
+		num_mip = 1;
+
+	num_subres = num_layers * num_mip;
+	dirty_size = sizeof(*dirty) + num_subres * sizeof(dirty->boxes[0]);
+	acc_size = ttm_round_pot(dirty_size);
+	ret = ttm_mem_global_alloc(vmw_mem_glob(res->dev_priv),
+				   acc_size, &ctx);
+	if (ret) {
+		VMW_DEBUG_USER("Out of graphics memory for surface "
+			       "dirty tracker.\n");
+		return ret;
+	}
+
+	dirty = kvzalloc(dirty_size, GFP_KERNEL);
+	if (!dirty) {
+		ret = -ENOMEM;
+		goto out_no_dirty;
+	}
+
+	num_samples = max_t(u32, 1, srf->multisample_count);
+	ret = svga3dsurface_setup_cache(&srf->base_size, srf->format, num_mip,
+					num_layers, num_samples, &dirty->cache);
+	if (ret)
+		goto out_no_cache;
+
+	dirty->num_subres = num_subres;
+	dirty->size = acc_size;
+	res->dirty = (struct vmw_resource_dirty *) dirty;
+
+	return 0;
+
+out_no_cache:
+	kvfree(dirty);
+out_no_dirty:
+	ttm_mem_global_free(vmw_mem_glob(res->dev_priv), acc_size);
+	return ret;
+}
+
+/*
+ * vmw_surface_dirty_free - The surface's dirty_free callback
+ */
+static void vmw_surface_dirty_free(struct vmw_resource *res)
+{
+	struct vmw_surface_dirty *dirty =
+		(struct vmw_surface_dirty *) res->dirty;
+	size_t acc_size = dirty->size;
+
+	kvfree(dirty);
+	ttm_mem_global_free(vmw_mem_glob(res->dev_priv), acc_size);
+	res->dirty = NULL;
+}
+
+/*
+ * vmw_surface_clean - The surface's clean callback
+ */
+static int vmw_surface_clean(struct vmw_resource *res)
+{
+	struct vmw_private *dev_priv = res->dev_priv;
+	size_t alloc_size;
+	struct {
+		SVGA3dCmdHeader header;
+		SVGA3dCmdReadbackGBSurface body;
+	} *cmd;
+
+	alloc_size = sizeof(*cmd);
+	cmd = VMW_FIFO_RESERVE(dev_priv, alloc_size);
+	if (!cmd)
+		return -ENOMEM;
+
+	cmd->header.id = SVGA_3D_CMD_READBACK_GB_SURFACE;
+	cmd->header.size = sizeof(cmd->body);
+	cmd->body.sid = res->id;
+	vmw_fifo_commit(dev_priv, alloc_size);
+
+	return 0;
+}
diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h
index 399f58317cff..02cab33f2f25 100644
--- a/include/uapi/drm/vmwgfx_drm.h
+++ b/include/uapi/drm/vmwgfx_drm.h
@@ -891,11 +891,13 @@ struct drm_vmw_shader_arg {
  *                                      surface.
  * @drm_vmw_surface_flag_create_buffer: Create a backup buffer if none is
  *                                      given.
+ * @drm_vmw_surface_flag_coherent:      Back surface with coherent memory.
  */
 enum drm_vmw_surface_flags {
 	drm_vmw_surface_flag_shareable = (1 << 0),
 	drm_vmw_surface_flag_scanout = (1 << 1),
-	drm_vmw_surface_flag_create_buffer = (1 << 2)
+	drm_vmw_surface_flag_create_buffer = (1 << 2),
+	drm_vmw_surface_flag_coherent = (1 << 3),
 };
 
 /**
-- 
cgit v1.2.3


From 378a60406415bd20ec6e845a3d6883d460656537 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 23 May 2019 11:17:22 -0300
Subject: mm/hmm: Remove duplicate condition test before wait_event_timeout

The wait_event_timeout macro already tests the condition as its first
action, so there is no reason to open code another version of this, all
that does is skip the might_sleep() debugging in common cases, which is
not helpful.

Further, based on prior patches, we can now simplify the required condition
test:
 - If range is valid memory then so is range->hmm
 - If hmm_release() has run then range->valid is set to false
   at the same time as dead, so no reason to check both.
 - A valid hmm has a valid hmm->mm.

Allowing the return value of wait_event_timeout() (along with its internal
barriers) to compute the result of the function.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 include/linux/hmm.h | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 1d97b6d62c5b..26e7c477490c 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -209,17 +209,8 @@ static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
 static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
 					      unsigned long timeout)
 {
-	/* Check if mm is dead ? */
-	if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
-		range->valid = false;
-		return false;
-	}
-	if (range->valid)
-		return true;
-	wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
-			   msecs_to_jiffies(timeout));
-	/* Return current valid status just in case we get lucky */
-	return range->valid;
+	return wait_event_timeout(range->hmm->wq, range->valid,
+				  msecs_to_jiffies(timeout)) != 0;
 }
 
 /*
-- 
cgit v1.2.3


From 47f245985a4f3e270b1e4f28aa49f4c939527981 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 23 May 2019 11:08:28 -0300
Subject: mm/hmm: Hold on to the mmget for the lifetime of the range

Range functions like hmm_range_snapshot() and hmm_range_fault() call
find_vma, which requires hodling the mmget() and the mmap_sem for the mm.

Make this simpler for the callers by holding the mmget() inside the range
for the lifetime of the range. Other functions that accept a range should
only be called if the range is registered.

This has the side effect of directly preventing hmm_release() from
happening while a range is registered. That means range->dead cannot be
false during the lifetime of the range, so remove dead and
hmm_mirror_mm_is_alive() entirely.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 include/linux/hmm.h | 26 --------------------------
 mm/hmm.c            | 32 +++++++++++---------------------
 2 files changed, 11 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 26e7c477490c..bf013e965257 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -82,7 +82,6 @@
  * @mirrors_sem: read/write semaphore protecting the mirrors list
  * @wq: wait queue for user waiting on a range invalidation
  * @notifiers: count of active mmu notifiers
- * @dead: is the mm dead ?
  */
 struct hmm {
 	struct mm_struct	*mm;
@@ -95,7 +94,6 @@ struct hmm {
 	wait_queue_head_t	wq;
 	struct rcu_head		rcu;
 	long			notifiers;
-	bool			dead;
 };
 
 /*
@@ -459,30 +457,6 @@ struct hmm_mirror {
 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
 
-/*
- * hmm_mirror_mm_is_alive() - test if mm is still alive
- * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
- * Return: false if the mm is dead, true otherwise
- *
- * This is an optimization, it will not always accurately return false if the
- * mm is dead; i.e., there can be false negatives (process is being killed but
- * HMM is not yet informed of that). It is only intended to be used to optimize
- * out cases where the driver is about to do something time consuming and it
- * would be better to skip it if the mm is dead.
- */
-static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
-{
-	struct mm_struct *mm;
-
-	if (!mirror || !mirror->hmm)
-		return false;
-	mm = READ_ONCE(mirror->hmm->mm);
-	if (mirror->hmm->dead || !mm)
-		return false;
-
-	return true;
-}
-
 /*
  * Please see Documentation/vm/hmm.rst for how to use the range API.
  */
diff --git a/mm/hmm.c b/mm/hmm.c
index 73c8af4827fe..1eddda45cefa 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -67,7 +67,6 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
 	mutex_init(&hmm->lock);
 	kref_init(&hmm->kref);
 	hmm->notifiers = 0;
-	hmm->dead = false;
 	hmm->mm = mm;
 
 	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
@@ -120,21 +119,16 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
 	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 	struct hmm_mirror *mirror;
-	struct hmm_range *range;
 
 	/* Bail out if hmm is in the process of being freed */
 	if (!kref_get_unless_zero(&hmm->kref))
 		return;
 
-	/* Report this HMM as dying. */
-	hmm->dead = true;
-
-	/* Wake-up everyone waiting on any range. */
-	mutex_lock(&hmm->lock);
-	list_for_each_entry(range, &hmm->ranges, list)
-		range->valid = false;
-	wake_up_all(&hmm->wq);
-	mutex_unlock(&hmm->lock);
+	/*
+	 * Since hmm_range_register() holds the mmget() lock hmm_release() is
+	 * prevented as long as a range exists.
+	 */
+	WARN_ON(!list_empty_careful(&hmm->ranges));
 
 	down_write(&hmm->mirrors_sem);
 	mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
@@ -903,8 +897,8 @@ int hmm_range_register(struct hmm_range *range,
 	range->start = start;
 	range->end = end;
 
-	/* Check if hmm_mm_destroy() was call. */
-	if (hmm->mm == NULL || hmm->dead)
+	/* Prevent hmm_release() from running while the range is valid */
+	if (!mmget_not_zero(hmm->mm))
 		return -EFAULT;
 
 	/* Initialize range to track CPU page table updates. */
@@ -942,11 +936,12 @@ void hmm_range_unregister(struct hmm_range *range)
 		return;
 
 	mutex_lock(&hmm->lock);
-	list_del(&range->list);
+	list_del_init(&range->list);
 	mutex_unlock(&hmm->lock);
 
 	/* Drop reference taken by hmm_range_register() */
 	range->valid = false;
+	mmput(hmm->mm);
 	hmm_put(hmm);
 	range->hmm = NULL;
 }
@@ -974,10 +969,7 @@ long hmm_range_snapshot(struct hmm_range *range)
 	struct vm_area_struct *vma;
 	struct mm_walk mm_walk;
 
-	/* Check if hmm_mm_destroy() was call. */
-	if (hmm->mm == NULL || hmm->dead)
-		return -EFAULT;
-
+	lockdep_assert_held(&hmm->mm->mmap_sem);
 	do {
 		/* If range is no longer valid force retry. */
 		if (!range->valid)
@@ -1072,9 +1064,7 @@ long hmm_range_fault(struct hmm_range *range, bool block)
 	struct mm_walk mm_walk;
 	int ret;
 
-	/* Check if hmm_mm_destroy() was call. */
-	if (hmm->mm == NULL || hmm->dead)
-		return -EFAULT;
+	lockdep_assert_held(&hmm->mm->mmap_sem);
 
 	do {
 		/* If range is no longer valid force retry. */
-- 
cgit v1.2.3


From 7786fd1087774c6090775932290f33ac73044f41 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@collabora.com>
Date: Tue, 18 Jun 2019 10:16:48 +0200
Subject: drm/panfrost: Expose performance counters through unstable ioctls

Expose performance counters through 2 driver specific ioctls: one to
enable/disable the perfcnt block, and one to dump the counter values.

There are discussions to expose global performance monitors (those
counters that can't be retrieved on a per-job basis) in a consistent
way, but this is likely to take time to settle on something that works
for various HW/users.
The ioctls are marked unstable so we can get rid of them when the time
comes. We initally went for a debugfs-based interface, but this was
making the transition to per-FD address space more complicated (we need
to specify the namespace the GPU has to use when dumping the perf
counters), hence the decision to switch back to driver specific ioctls
which are passed the FD they operate on and thus will have a dedicated
address space attached to them.

Other than that, the implementation is pretty simple: it basically dumps
all counters and copy the values to a userspace buffer. The parsing is
left to userspace which has to know the specific layout that's used
by the GPU (layout differs on a per-revision basis).

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Acked-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190618081648.17297-5-boris.brezillon@collabora.com
---
 drivers/gpu/drm/panfrost/Makefile           |   3 +-
 drivers/gpu/drm/panfrost/panfrost_device.c  |   8 +
 drivers/gpu/drm/panfrost/panfrost_device.h  |   3 +
 drivers/gpu/drm/panfrost/panfrost_drv.c     |   4 +
 drivers/gpu/drm/panfrost/panfrost_gpu.c     |   7 +
 drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 329 ++++++++++++++++++++++++++++
 drivers/gpu/drm/panfrost/panfrost_perfcnt.h |  18 ++
 drivers/gpu/drm/panfrost/panfrost_regs.h    |  19 ++
 include/uapi/drm/panfrost_drm.h             |  24 ++
 9 files changed, 414 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.c
 create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.h

(limited to 'include')

diff --git a/drivers/gpu/drm/panfrost/Makefile b/drivers/gpu/drm/panfrost/Makefile
index 6de72d13c58f..ecf0864cb515 100644
--- a/drivers/gpu/drm/panfrost/Makefile
+++ b/drivers/gpu/drm/panfrost/Makefile
@@ -7,6 +7,7 @@ panfrost-y := \
 	panfrost_gem.o \
 	panfrost_gpu.o \
 	panfrost_job.o \
-	panfrost_mmu.o
+	panfrost_mmu.o \
+	panfrost_perfcnt.o
 
 obj-$(CONFIG_DRM_PANFROST) += panfrost.o
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index ccb8eb2a518c..8a111d7c0200 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -14,6 +14,7 @@
 #include "panfrost_gpu.h"
 #include "panfrost_job.h"
 #include "panfrost_mmu.h"
+#include "panfrost_perfcnt.h"
 
 static int panfrost_reset_init(struct panfrost_device *pfdev)
 {
@@ -171,7 +172,13 @@ int panfrost_device_init(struct panfrost_device *pfdev)
 	pm_runtime_mark_last_busy(pfdev->dev);
 	pm_runtime_put_autosuspend(pfdev->dev);
 
+	err = panfrost_perfcnt_init(pfdev);
+	if (err)
+		goto err_out5;
+
 	return 0;
+err_out5:
+	panfrost_job_fini(pfdev);
 err_out4:
 	panfrost_mmu_fini(pfdev);
 err_out3:
@@ -187,6 +194,7 @@ err_out0:
 
 void panfrost_device_fini(struct panfrost_device *pfdev)
 {
+	panfrost_perfcnt_fini(pfdev);
 	panfrost_job_fini(pfdev);
 	panfrost_mmu_fini(pfdev);
 	panfrost_gpu_fini(pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index e86581c4af7b..83cc01cafde1 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -14,6 +14,7 @@ struct panfrost_device;
 struct panfrost_mmu;
 struct panfrost_job_slot;
 struct panfrost_job;
+struct panfrost_perfcnt;
 
 #define NUM_JOB_SLOTS 3
 
@@ -78,6 +79,8 @@ struct panfrost_device {
 	struct panfrost_job *jobs[NUM_JOB_SLOTS];
 	struct list_head scheduled_jobs;
 
+	struct panfrost_perfcnt *perfcnt;
+
 	struct mutex sched_lock;
 	struct mutex reset_lock;
 
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index 754881ece8d7..e34e86a7378a 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -19,6 +19,7 @@
 #include "panfrost_mmu.h"
 #include "panfrost_job.h"
 #include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
 
 static bool unstable_ioctls;
 module_param_unsafe(unstable_ioctls, bool, 0600);
@@ -329,6 +330,7 @@ panfrost_postclose(struct drm_device *dev, struct drm_file *file)
 {
 	struct panfrost_file_priv *panfrost_priv = file->driver_priv;
 
+	panfrost_perfcnt_close(panfrost_priv);
 	panfrost_job_close(panfrost_priv);
 
 	kfree(panfrost_priv);
@@ -348,6 +350,8 @@ static const struct drm_ioctl_desc panfrost_drm_driver_ioctls[] = {
 	PANFROST_IOCTL(MMAP_BO,		mmap_bo,	DRM_RENDER_ALLOW),
 	PANFROST_IOCTL(GET_PARAM,	get_param,	DRM_RENDER_ALLOW),
 	PANFROST_IOCTL(GET_BO_OFFSET,	get_bo_offset,	DRM_RENDER_ALLOW),
+	PANFROST_IOCTL(PERFCNT_ENABLE,	perfcnt_enable,	DRM_RENDER_ALLOW),
+	PANFROST_IOCTL(PERFCNT_DUMP,	perfcnt_dump,	DRM_RENDER_ALLOW),
 };
 
 DEFINE_DRM_GEM_SHMEM_FOPS(panfrost_drm_driver_fops);
diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c
index 6e68a100291c..20ab333fc925 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gpu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c
@@ -15,6 +15,7 @@
 #include "panfrost_features.h"
 #include "panfrost_issues.h"
 #include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
 #include "panfrost_regs.h"
 
 static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
@@ -40,6 +41,12 @@ static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
 		gpu_write(pfdev, GPU_INT_MASK, 0);
 	}
 
+	if (state & GPU_IRQ_PERFCNT_SAMPLE_COMPLETED)
+		panfrost_perfcnt_sample_done(pfdev);
+
+	if (state & GPU_IRQ_CLEAN_CACHES_COMPLETED)
+		panfrost_perfcnt_clean_cache_done(pfdev);
+
 	gpu_write(pfdev, GPU_INT_CLEAR, state);
 
 	return IRQ_HANDLED;
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
new file mode 100644
index 000000000000..83c57d325ca8
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 Collabora Ltd */
+
+#include <drm/drm_file.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/panfrost_drm.h>
+#include <linux/completion.h>
+#include <linux/iopoll.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "panfrost_device.h"
+#include "panfrost_features.h"
+#include "panfrost_gem.h"
+#include "panfrost_issues.h"
+#include "panfrost_job.h"
+#include "panfrost_mmu.h"
+#include "panfrost_regs.h"
+
+#define COUNTERS_PER_BLOCK		64
+#define BYTES_PER_COUNTER		4
+#define BLOCKS_PER_COREGROUP		8
+#define V4_SHADERS_PER_COREGROUP	4
+
+struct panfrost_perfcnt {
+	struct panfrost_gem_object *bo;
+	size_t bosize;
+	void *buf;
+	struct panfrost_file_priv *user;
+	struct mutex lock;
+	struct completion dump_comp;
+};
+
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev)
+{
+	complete(&pfdev->perfcnt->dump_comp);
+}
+
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev)
+{
+	gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_CACHES);
+}
+
+static int panfrost_perfcnt_dump_locked(struct panfrost_device *pfdev)
+{
+	u64 gpuva;
+	int ret;
+
+	reinit_completion(&pfdev->perfcnt->dump_comp);
+	gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT;
+	gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva);
+	gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32);
+	gpu_write(pfdev, GPU_INT_CLEAR,
+		  GPU_IRQ_CLEAN_CACHES_COMPLETED |
+		  GPU_IRQ_PERFCNT_SAMPLE_COMPLETED);
+	gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE);
+	ret = wait_for_completion_interruptible_timeout(&pfdev->perfcnt->dump_comp,
+							msecs_to_jiffies(1000));
+	if (!ret)
+		ret = -ETIMEDOUT;
+	else if (ret > 0)
+		ret = 0;
+
+	return ret;
+}
+
+static int panfrost_perfcnt_enable_locked(struct panfrost_device *pfdev,
+					  struct panfrost_file_priv *user,
+					  unsigned int counterset)
+{
+	struct panfrost_perfcnt *perfcnt = pfdev->perfcnt;
+	struct drm_gem_shmem_object *bo;
+	u32 cfg;
+	int ret;
+
+	if (user == perfcnt->user)
+		return 0;
+	else if (perfcnt->user)
+		return -EBUSY;
+
+	ret = pm_runtime_get_sync(pfdev->dev);
+	if (ret < 0)
+		return ret;
+
+	bo = drm_gem_shmem_create(pfdev->ddev, perfcnt->bosize);
+	if (IS_ERR(bo))
+		return PTR_ERR(bo);
+
+	perfcnt->bo = to_panfrost_bo(&bo->base);
+
+	/* Map the perfcnt buf in the address space attached to file_priv. */
+	ret = panfrost_mmu_map(perfcnt->bo);
+	if (ret)
+		goto err_put_bo;
+
+	perfcnt->buf = drm_gem_shmem_vmap(&bo->base);
+	if (IS_ERR(perfcnt->buf)) {
+		ret = PTR_ERR(perfcnt->buf);
+		goto err_put_bo;
+	}
+
+	/*
+	 * Invalidate the cache and clear the counters to start from a fresh
+	 * state.
+	 */
+	reinit_completion(&pfdev->perfcnt->dump_comp);
+	gpu_write(pfdev, GPU_INT_CLEAR,
+		  GPU_IRQ_CLEAN_CACHES_COMPLETED |
+		  GPU_IRQ_PERFCNT_SAMPLE_COMPLETED);
+	gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_CLEAR);
+	gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_INV_CACHES);
+	ret = wait_for_completion_timeout(&pfdev->perfcnt->dump_comp,
+					  msecs_to_jiffies(1000));
+	if (!ret) {
+		ret = -ETIMEDOUT;
+		goto err_vunmap;
+	}
+
+	perfcnt->user = user;
+
+	/*
+	 * Always use address space 0 for now.
+	 * FIXME: this needs to be updated when we start using different
+	 * address space.
+	 */
+	cfg = GPU_PERFCNT_CFG_AS(0) |
+	      GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL);
+
+	/*
+	 * Bifrost GPUs have 2 set of counters, but we're only interested by
+	 * the first one for now.
+	 */
+	if (panfrost_model_is_bifrost(pfdev))
+		cfg |= GPU_PERFCNT_CFG_SETSEL(counterset);
+
+	gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0xffffffff);
+	gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0xffffffff);
+	gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0xffffffff);
+
+	/*
+	 * Due to PRLAM-8186 we need to disable the Tiler before we enable HW
+	 * counters.
+	 */
+	if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+		gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+	else
+		gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);
+
+	gpu_write(pfdev, GPU_PERFCNT_CFG, cfg);
+
+	if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+		gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);
+
+	return 0;
+
+err_vunmap:
+	drm_gem_shmem_vunmap(&perfcnt->bo->base.base, perfcnt->buf);
+err_put_bo:
+	drm_gem_object_put_unlocked(&bo->base);
+	return ret;
+}
+
+static int panfrost_perfcnt_disable_locked(struct panfrost_device *pfdev,
+					   struct panfrost_file_priv *user)
+{
+	struct panfrost_perfcnt *perfcnt = pfdev->perfcnt;
+
+	if (user != perfcnt->user)
+		return -EINVAL;
+
+	gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0x0);
+	gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0x0);
+	gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0x0);
+	gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+	gpu_write(pfdev, GPU_PERFCNT_CFG,
+		  GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
+
+	perfcnt->user = NULL;
+	drm_gem_shmem_vunmap(&perfcnt->bo->base.base, perfcnt->buf);
+	perfcnt->buf = NULL;
+	drm_gem_object_put_unlocked(&perfcnt->bo->base.base);
+	perfcnt->bo = NULL;
+	pm_runtime_mark_last_busy(pfdev->dev);
+	pm_runtime_put_autosuspend(pfdev->dev);
+
+	return 0;
+}
+
+int panfrost_ioctl_perfcnt_enable(struct drm_device *dev, void *data,
+				  struct drm_file *file_priv)
+{
+	struct panfrost_file_priv *pfile = file_priv->driver_priv;
+	struct panfrost_device *pfdev = dev->dev_private;
+	struct panfrost_perfcnt *perfcnt = pfdev->perfcnt;
+	struct drm_panfrost_perfcnt_enable *req = data;
+	int ret;
+
+	ret = panfrost_unstable_ioctl_check();
+	if (ret)
+		return ret;
+
+	/* Only Bifrost GPUs have 2 set of counters. */
+	if (req->counterset > (panfrost_model_is_bifrost(pfdev) ? 1 : 0))
+		return -EINVAL;
+
+	mutex_lock(&perfcnt->lock);
+	if (req->enable)
+		ret = panfrost_perfcnt_enable_locked(pfdev, pfile,
+						     req->counterset);
+	else
+		ret = panfrost_perfcnt_disable_locked(pfdev, pfile);
+	mutex_unlock(&perfcnt->lock);
+
+	return ret;
+}
+
+int panfrost_ioctl_perfcnt_dump(struct drm_device *dev, void *data,
+				struct drm_file *file_priv)
+{
+	struct panfrost_device *pfdev = dev->dev_private;
+	struct panfrost_perfcnt *perfcnt = pfdev->perfcnt;
+	struct drm_panfrost_perfcnt_dump *req = data;
+	void __user *user_ptr = (void __user *)(uintptr_t)req->buf_ptr;
+	int ret;
+
+	ret = panfrost_unstable_ioctl_check();
+	if (ret)
+		return ret;
+
+	mutex_lock(&perfcnt->lock);
+	if (perfcnt->user != file_priv->driver_priv) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = panfrost_perfcnt_dump_locked(pfdev);
+	if (ret)
+		goto out;
+
+	if (copy_to_user(user_ptr, perfcnt->buf, perfcnt->bosize))
+		ret = -EFAULT;
+
+out:
+	mutex_unlock(&perfcnt->lock);
+
+	return ret;
+}
+
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile)
+{
+	struct panfrost_device *pfdev = pfile->pfdev;
+	struct panfrost_perfcnt *perfcnt = pfdev->perfcnt;
+
+	pm_runtime_get_sync(pfdev->dev);
+	mutex_lock(&perfcnt->lock);
+	if (perfcnt->user == pfile)
+		panfrost_perfcnt_disable_locked(pfdev, pfile);
+	mutex_unlock(&perfcnt->lock);
+	pm_runtime_mark_last_busy(pfdev->dev);
+	pm_runtime_put_autosuspend(pfdev->dev);
+}
+
+int panfrost_perfcnt_init(struct panfrost_device *pfdev)
+{
+	struct panfrost_perfcnt *perfcnt;
+	size_t size;
+
+	if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
+		unsigned int ncoregroups;
+
+		ncoregroups = hweight64(pfdev->features.l2_present);
+		size = ncoregroups * BLOCKS_PER_COREGROUP *
+		       COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+	} else {
+		unsigned int nl2c, ncores;
+
+		/*
+		 * TODO: define a macro to extract the number of l2 caches from
+		 * mem_features.
+		 */
+		nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+
+		/*
+		 * shader_present might be sparse, but the counters layout
+		 * forces to dump unused regions too, hence the fls64() call
+		 * instead of hweight64().
+		 */
+		ncores = fls64(pfdev->features.shader_present);
+
+		/*
+		 * There's always one JM and one Tiler block, hence the '+ 2'
+		 * here.
+		 */
+		size = (nl2c + ncores + 2) *
+		       COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+	}
+
+	perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL);
+	if (!perfcnt)
+		return -ENOMEM;
+
+	perfcnt->bosize = size;
+
+	/* Start with everything disabled. */
+	gpu_write(pfdev, GPU_PERFCNT_CFG,
+		  GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
+	gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+
+	init_completion(&perfcnt->dump_comp);
+	mutex_init(&perfcnt->lock);
+	pfdev->perfcnt = perfcnt;
+
+	return 0;
+}
+
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev)
+{
+	/* Disable everything before leaving. */
+	gpu_write(pfdev, GPU_PERFCNT_CFG,
+		  GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
+	gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+}
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.h b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
new file mode 100644
index 000000000000..13b8fdaa1b43
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2019 Collabora Ltd */
+#ifndef __PANFROST_PERFCNT_H__
+#define __PANFROST_PERFCNT_H__
+
+#include "panfrost_device.h"
+
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev);
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev);
+int panfrost_perfcnt_init(struct panfrost_device *pfdev);
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev);
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile);
+int panfrost_ioctl_perfcnt_enable(struct drm_device *dev, void *data,
+				  struct drm_file *file_priv);
+int panfrost_ioctl_perfcnt_dump(struct drm_device *dev, void *data,
+				struct drm_file *file_priv);
+
+#endif
diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h
index 42d08860fd76..ea38ac60581c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_regs.h
+++ b/drivers/gpu/drm/panfrost/panfrost_regs.h
@@ -44,12 +44,31 @@
 	 GPU_IRQ_MULTIPLE_FAULT)
 #define GPU_CMD				0x30
 #define   GPU_CMD_SOFT_RESET		0x01
+#define   GPU_CMD_PERFCNT_CLEAR		0x03
+#define   GPU_CMD_PERFCNT_SAMPLE	0x04
+#define   GPU_CMD_CLEAN_CACHES		0x07
+#define   GPU_CMD_CLEAN_INV_CACHES	0x08
 #define GPU_STATUS			0x34
+#define   GPU_STATUS_PRFCNT_ACTIVE	BIT(2)
 #define GPU_LATEST_FLUSH_ID		0x38
 #define GPU_FAULT_STATUS		0x3C
 #define GPU_FAULT_ADDRESS_LO		0x40
 #define GPU_FAULT_ADDRESS_HI		0x44
 
+#define GPU_PERFCNT_BASE_LO		0x60
+#define GPU_PERFCNT_BASE_HI		0x64
+#define GPU_PERFCNT_CFG			0x68
+#define   GPU_PERFCNT_CFG_MODE(x)	(x)
+#define   GPU_PERFCNT_CFG_MODE_OFF	0
+#define   GPU_PERFCNT_CFG_MODE_MANUAL	1
+#define   GPU_PERFCNT_CFG_MODE_TILE	2
+#define   GPU_PERFCNT_CFG_AS(x)		((x) << 4)
+#define   GPU_PERFCNT_CFG_SETSEL(x)	((x) << 8)
+#define GPU_PRFCNT_JM_EN		0x6c
+#define GPU_PRFCNT_SHADER_EN		0x70
+#define GPU_PRFCNT_TILER_EN		0x74
+#define GPU_PRFCNT_MMU_L2_EN		0x7c
+
 #define GPU_THREAD_MAX_THREADS		0x0A0	/* (RO) Maximum number of threads per core */
 #define GPU_THREAD_MAX_WORKGROUP_SIZE	0x0A4	/* (RO) Maximum workgroup size */
 #define GPU_THREAD_MAX_BARRIER_SIZE	0x0A8	/* (RO) Maximum threads waiting at a barrier */
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h
index a52e0283b90d..b5d370638846 100644
--- a/include/uapi/drm/panfrost_drm.h
+++ b/include/uapi/drm/panfrost_drm.h
@@ -18,6 +18,8 @@ extern "C" {
 #define DRM_PANFROST_MMAP_BO			0x03
 #define DRM_PANFROST_GET_PARAM			0x04
 #define DRM_PANFROST_GET_BO_OFFSET		0x05
+#define DRM_PANFROST_PERFCNT_ENABLE		0x06
+#define DRM_PANFROST_PERFCNT_DUMP		0x07
 
 #define DRM_IOCTL_PANFROST_SUBMIT		DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_SUBMIT, struct drm_panfrost_submit)
 #define DRM_IOCTL_PANFROST_WAIT_BO		DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_WAIT_BO, struct drm_panfrost_wait_bo)
@@ -26,6 +28,15 @@ extern "C" {
 #define DRM_IOCTL_PANFROST_GET_PARAM		DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PARAM, struct drm_panfrost_get_param)
 #define DRM_IOCTL_PANFROST_GET_BO_OFFSET	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_BO_OFFSET, struct drm_panfrost_get_bo_offset)
 
+/*
+ * Unstable ioctl(s): only exposed when the unsafe unstable_ioctls module
+ * param is set to true.
+ * All these ioctl(s) are subject to deprecation, so please don't rely on
+ * them for anything but debugging purpose.
+ */
+#define DRM_IOCTL_PANFROST_PERFCNT_ENABLE	DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_PERFCNT_ENABLE, struct drm_panfrost_perfcnt_enable)
+#define DRM_IOCTL_PANFROST_PERFCNT_DUMP		DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_PERFCNT_DUMP, struct drm_panfrost_perfcnt_dump)
+
 #define PANFROST_JD_REQ_FS (1 << 0)
 /**
  * struct drm_panfrost_submit - ioctl argument for submitting commands to the 3D
@@ -135,6 +146,19 @@ struct drm_panfrost_get_bo_offset {
 	__u64 offset;
 };
 
+struct drm_panfrost_perfcnt_enable {
+	__u32 enable;
+	/*
+	 * On bifrost we have 2 sets of counters, this parameter defines the
+	 * one to track.
+	 */
+	__u32 counterset;
+};
+
+struct drm_panfrost_perfcnt_dump {
+	__u64 buf_ptr;
+};
+
 #if defined(__cplusplus)
 }
 #endif
-- 
cgit v1.2.3


From 0e2d00eb6fd45f2a645f4874286bdc5b4b53782b Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 13 Jun 2019 21:38:18 -0300
Subject: RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload

Allow userspace to issue a netlink query against the ib_device for
something like "uverbs" and get back the char dev name, inode major/minor,
and interface ABI information for "uverbs0".

Since we are now in netlink this can also trigger a module autoload to
make the uverbs device come into existence.

Largely this will let us replace searching and reading inside sysfs to
setup devices, and provides an alternative (using driver_id) to device
name based provider binding for things like rxe.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/core/core_priv.h |  9 ++++
 drivers/infiniband/core/device.c    | 98 +++++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/nldev.c     | 94 +++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h             |  4 ++
 include/rdma/rdma_netlink.h         |  2 +
 include/uapi/rdma/rdma_netlink.h    | 14 ++++++
 6 files changed, 221 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index ff40a450b5d2..a953c2fa2e78 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -88,6 +88,15 @@ typedef int (*nldev_callback)(struct ib_device *device,
 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
 		     struct netlink_callback *cb);
 
+struct ib_client_nl_info {
+	struct sk_buff *nl_msg;
+	struct device *cdev;
+	unsigned int port;
+	u64 abi;
+};
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
+			  struct ib_client_nl_info *res);
+
 enum ib_cache_gid_default_mode {
 	IB_CACHE_GID_DEFAULT_MODE_SET,
 	IB_CACHE_GID_DEFAULT_MODE_DELETE
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index abb169f31d0f..7db8566cdb89 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1726,6 +1726,104 @@ void ib_unregister_client(struct ib_client *client)
 }
 EXPORT_SYMBOL(ib_unregister_client);
 
+static int __ib_get_global_client_nl_info(const char *client_name,
+					  struct ib_client_nl_info *res)
+{
+	struct ib_client *client;
+	unsigned long index;
+	int ret = -ENOENT;
+
+	down_read(&clients_rwsem);
+	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
+		if (strcmp(client->name, client_name) != 0)
+			continue;
+		if (!client->get_global_nl_info) {
+			ret = -EOPNOTSUPP;
+			break;
+		}
+		ret = client->get_global_nl_info(res);
+		if (WARN_ON(ret == -ENOENT))
+			ret = -EINVAL;
+		if (!ret && res->cdev)
+			get_device(res->cdev);
+		break;
+	}
+	up_read(&clients_rwsem);
+	return ret;
+}
+
+static int __ib_get_client_nl_info(struct ib_device *ibdev,
+				   const char *client_name,
+				   struct ib_client_nl_info *res)
+{
+	unsigned long index;
+	void *client_data;
+	int ret = -ENOENT;
+
+	down_read(&ibdev->client_data_rwsem);
+	xan_for_each_marked (&ibdev->client_data, index, client_data,
+			     CLIENT_DATA_REGISTERED) {
+		struct ib_client *client = xa_load(&clients, index);
+
+		if (!client || strcmp(client->name, client_name) != 0)
+			continue;
+		if (!client->get_nl_info) {
+			ret = -EOPNOTSUPP;
+			break;
+		}
+		ret = client->get_nl_info(ibdev, client_data, res);
+		if (WARN_ON(ret == -ENOENT))
+			ret = -EINVAL;
+
+		/*
+		 * The cdev is guaranteed valid as long as we are inside the
+		 * client_data_rwsem as remove_one can't be called. Keep it
+		 * valid for the caller.
+		 */
+		if (!ret && res->cdev)
+			get_device(res->cdev);
+		break;
+	}
+	up_read(&ibdev->client_data_rwsem);
+
+	return ret;
+}
+
+/**
+ * ib_get_client_nl_info - Fetch the nl_info from a client
+ * @device - IB device
+ * @client_name - Name of the client
+ * @res - Result of the query
+ */
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
+			  struct ib_client_nl_info *res)
+{
+	int ret;
+
+	if (ibdev)
+		ret = __ib_get_client_nl_info(ibdev, client_name, res);
+	else
+		ret = __ib_get_global_client_nl_info(client_name, res);
+#ifdef CONFIG_MODULES
+	if (ret == -ENOENT) {
+		request_module("rdma-client-%s", client_name);
+		if (ibdev)
+			ret = __ib_get_client_nl_info(ibdev, client_name, res);
+		else
+			ret = __ib_get_global_client_nl_info(client_name, res);
+	}
+#endif
+	if (ret) {
+		if (ret == -ENOENT)
+			return -EOPNOTSUPP;
+		return ret;
+	}
+
+	if (WARN_ON(!res->cdev))
+		return -EINVAL;
+	return 0;
+}
+
 /**
  * ib_set_client_data - Set IB client context
  * @device:Device to set context for
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 69188cbbd99b..16b5d6d4dd1c 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -120,6 +120,12 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_DEV_PROTOCOL]		= { .type = NLA_NUL_STRING,
 				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
 	[RDMA_NLDEV_NET_NS_FD]			= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_CHARDEV]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_CHARDEV_ABI]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_CHARDEV_TYPE]		= { .type = NLA_NUL_STRING,
+				    .len = 128 },
+	[RDMA_NLDEV_ATTR_CHARDEV_NAME]		= { .type = NLA_NUL_STRING,
+				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -1347,6 +1353,91 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return 0;
 }
 
+static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh,
+			     struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	char client_name[IB_DEVICE_NAME_MAX];
+	struct ib_client_nl_info data = {};
+	struct ib_device *ibdev = NULL;
+	struct sk_buff *msg;
+	u32 index;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+			  extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE])
+		return -EINVAL;
+
+	if (nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE],
+			sizeof(client_name)) >= sizeof(client_name))
+		return -EINVAL;
+
+	if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) {
+		index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+		ibdev = ib_device_get_by_index(sock_net(skb->sk), index);
+		if (!ibdev)
+			return -EINVAL;
+
+		if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+			data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+			if (!rdma_is_port_valid(ibdev, data.port)) {
+				err = -EINVAL;
+				goto out_put;
+			}
+		} else {
+			data.port = -1;
+		}
+	} else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+		return -EINVAL;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		err = -ENOMEM;
+		goto out_put;
+	}
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NLDEV_CMD_GET_CHARDEV),
+			0, 0);
+
+	data.nl_msg = msg;
+	err = ib_get_client_nl_info(ibdev, client_name, &data);
+	if (err)
+		goto out_nlmsg;
+
+	err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV,
+				huge_encode_dev(data.cdev->devt),
+				RDMA_NLDEV_ATTR_PAD);
+	if (err)
+		goto out_data;
+	err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi,
+				RDMA_NLDEV_ATTR_PAD);
+	if (err)
+		goto out_data;
+	if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME,
+			   dev_name(data.cdev))) {
+		err = -EMSGSIZE;
+		goto out_data;
+	}
+
+	nlmsg_end(msg, nlh);
+	put_device(data.cdev);
+	if (ibdev)
+		ib_device_put(ibdev);
+	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+out_data:
+	put_device(data.cdev);
+out_nlmsg:
+	nlmsg_free(msg);
+out_put:
+	if (ibdev)
+		ib_device_put(ibdev);
+	return err;
+}
+
 static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 			      struct netlink_ext_ack *extack)
 {
@@ -1404,6 +1495,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 		.doit = nldev_get_doit,
 		.dump = nldev_get_dumpit,
 	},
+	[RDMA_NLDEV_CMD_GET_CHARDEV] = {
+		.doit = nldev_get_chardev,
+	},
 	[RDMA_NLDEV_CMD_SET] = {
 		.doit = nldev_set_doit,
 		.flags = RDMA_NL_ADMIN_PERM,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 973514ea17a7..a1265e9ce2d1 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2684,10 +2684,14 @@ struct ib_device {
 	u32 iw_driver_flags;
 };
 
+struct ib_client_nl_info;
 struct ib_client {
 	const char *name;
 	void (*add)   (struct ib_device *);
 	void (*remove)(struct ib_device *, void *client_data);
+	int (*get_nl_info)(struct ib_device *ibdev, void *client_data,
+			   struct ib_client_nl_info *res);
+	int (*get_global_nl_info)(struct ib_client_nl_info *res);
 
 	/* Returns the net_dev belonging to this ib_client and matching the
 	 * given parameters.
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
index 10732ab31ba2..c7acbe083428 100644
--- a/include/rdma/rdma_netlink.h
+++ b/include/rdma/rdma_netlink.h
@@ -110,4 +110,6 @@ void rdma_link_register(struct rdma_link_ops *ops);
 void rdma_link_unregister(struct rdma_link_ops *ops);
 
 #define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type)
+#define MODULE_ALIAS_RDMA_CLIENT(type) MODULE_ALIAS("rdma-client-" type)
+
 #endif /* _RDMA_NETLINK_H */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index f588e8551c6c..9903db21a42c 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -279,6 +279,8 @@ enum rdma_nldev_command {
 
 	RDMA_NLDEV_CMD_RES_PD_GET, /* can dump */
 
+	RDMA_NLDEV_CMD_GET_CHARDEV,
+
 	RDMA_NLDEV_NUM_OPS
 };
 
@@ -491,6 +493,18 @@ enum rdma_nldev_attr {
 	 */
 	RDMA_NLDEV_NET_NS_FD,			/* u32 */
 
+	/*
+	 * Information about a chardev.
+	 * CHARDEV_TYPE is the name of the chardev ABI (ie uverbs, umad, etc)
+	 * CHARDEV_ABI signals the ABI revision (historical)
+	 * CHARDEV_NAME is the kernel name for the /dev/ file (no directory)
+	 * CHARDEV is the 64 bit dev_t for the inode
+	 */
+	RDMA_NLDEV_ATTR_CHARDEV_TYPE,		/* string */
+	RDMA_NLDEV_ATTR_CHARDEV_NAME,		/* string */
+	RDMA_NLDEV_ATTR_CHARDEV_ABI,		/* u64 */
+	RDMA_NLDEV_ATTR_CHARDEV,		/* u64 */
+
 	/*
 	 * Always the end
 	 */
-- 
cgit v1.2.3


From 8f71bb0030b8816f57be142f95b3c7189c6eaf4c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 13 Jun 2019 21:38:19 -0300
Subject: RDMA: Report available cdevs through RDMA_NLDEV_CMD_GET_CHARDEV

Update the struct ib_client for all modules exporting cdevs related to the
ibdevice to also implement RDMA_NLDEV_CMD_GET_CHARDEV. All cdevs are now
autoloadable and discoverable by userspace over netlink instead of relying
on sysfs.

uverbs also exposes the DRIVER_ID for drivers that are able to support
driver id binding in rdma-core.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/core/device.c             |  3 ++
 drivers/infiniband/core/nldev.c              |  1 +
 drivers/infiniband/core/ucma.c               | 23 +++++++++++++
 drivers/infiniband/core/user_mad.c           | 51 +++++++++++++++++++++++++---
 drivers/infiniband/core/uverbs_main.c        | 32 ++++++++++++++++-
 drivers/infiniband/hw/cxgb3/iwch_provider.c  |  1 +
 drivers/infiniband/hw/hns/hns_roce_main.c    |  1 +
 drivers/infiniband/hw/mthca/mthca_provider.c |  1 +
 include/rdma/ib_verbs.h                      |  1 +
 include/uapi/rdma/rdma_netlink.h             |  1 +
 10 files changed, 110 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 7db8566cdb89..1de4ae5d5e0e 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2428,6 +2428,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	if (ops->uverbs_abi_ver)
 		dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
 
+	dev_ops->uverbs_no_driver_id_binding |=
+		ops->uverbs_no_driver_id_binding;
+
 	SET_DEVICE_OP(dev_ops, add_gid);
 	SET_DEVICE_OP(dev_ops, advise_mr);
 	SET_DEVICE_OP(dev_ops, alloc_dm);
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 16b5d6d4dd1c..3cad72a609ff 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -126,6 +126,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 				    .len = 128 },
 	[RDMA_NLDEV_ATTR_CHARDEV_NAME]		= { .type = NLA_NUL_STRING,
 				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+	[RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID]	= { .type = NLA_U32 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 39823c842202..0274e9b704be 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -52,6 +52,8 @@
 #include <rdma/rdma_cm_ib.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib.h>
+#include <rdma/rdma_netlink.h>
+#include "core_priv.h"
 
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
@@ -1788,6 +1790,19 @@ static struct miscdevice ucma_misc = {
 	.fops		= &ucma_fops,
 };
 
+static int ucma_get_global_nl_info(struct ib_client_nl_info *res)
+{
+	res->abi = RDMA_USER_CM_ABI_VERSION;
+	res->cdev = ucma_misc.this_device;
+	return 0;
+}
+
+static struct ib_client rdma_cma_client = {
+	.name = "rdma_cm",
+	.get_global_nl_info = ucma_get_global_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("rdma_cm");
+
 static ssize_t show_abi_version(struct device *dev,
 				struct device_attribute *attr,
 				char *buf)
@@ -1816,7 +1831,14 @@ static int __init ucma_init(void)
 		ret = -ENOMEM;
 		goto err2;
 	}
+
+	ret = ib_register_client(&rdma_cma_client);
+	if (ret)
+		goto err3;
+
 	return 0;
+err3:
+	unregister_net_sysctl_table(ucma_ctl_table_hdr);
 err2:
 	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
 err1:
@@ -1826,6 +1848,7 @@ err1:
 
 static void __exit ucma_cleanup(void)
 {
+	ib_unregister_client(&rdma_cma_client);
 	unregister_net_sysctl_table(ucma_ctl_table_hdr);
 	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
 	misc_deregister(&ucma_misc);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 671f07ba1fad..547090b41cfb 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -54,6 +54,7 @@
 
 #include <rdma/ib_mad.h>
 #include <rdma/ib_user_mad.h>
+#include <rdma/rdma_netlink.h>
 
 #include "core_priv.h"
 
@@ -1124,11 +1125,48 @@ static const struct file_operations umad_sm_fops = {
 	.llseek	 = no_llseek,
 };
 
+static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data,
+			       struct ib_client_nl_info *res)
+{
+	struct ib_umad_device *umad_dev = client_data;
+
+	if (!rdma_is_port_valid(ibdev, res->port))
+		return -EINVAL;
+
+	res->abi = IB_USER_MAD_ABI_VERSION;
+	res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].dev;
+
+	return 0;
+}
+
 static struct ib_client umad_client = {
 	.name   = "umad",
 	.add    = ib_umad_add_one,
-	.remove = ib_umad_remove_one
+	.remove = ib_umad_remove_one,
+	.get_nl_info = ib_umad_get_nl_info,
 };
+MODULE_ALIAS_RDMA_CLIENT("umad");
+
+static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data,
+			       struct ib_client_nl_info *res)
+{
+	struct ib_umad_device *umad_dev =
+		ib_get_client_data(ibdev, &umad_client);
+
+	if (!rdma_is_port_valid(ibdev, res->port))
+		return -EINVAL;
+
+	res->abi = IB_USER_MAD_ABI_VERSION;
+	res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].sm_dev;
+
+	return 0;
+}
+
+static struct ib_client issm_client = {
+	.name = "issm",
+	.get_nl_info = ib_issm_get_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("issm");
 
 static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr,
 			  char *buf)
@@ -1387,13 +1425,17 @@ static int __init ib_umad_init(void)
 	}
 
 	ret = ib_register_client(&umad_client);
-	if (ret) {
-		pr_err("couldn't register ib_umad client\n");
+	if (ret)
 		goto out_class;
-	}
+
+	ret = ib_register_client(&issm_client);
+	if (ret)
+		goto out_client;
 
 	return 0;
 
+out_client:
+	ib_unregister_client(&umad_client);
 out_class:
 	class_unregister(&umad_class);
 
@@ -1411,6 +1453,7 @@ out:
 
 static void __exit ib_umad_cleanup(void)
 {
+	ib_unregister_client(&issm_client);
 	ib_unregister_client(&umad_client);
 	class_unregister(&umad_class);
 	unregister_chrdev_region(base_umad_dev,
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 870b3dd35aac..11c13c1381cf 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -51,6 +51,7 @@
 
 #include <rdma/ib.h>
 #include <rdma/uverbs_std_types.h>
+#include <rdma/rdma_netlink.h>
 
 #include "uverbs.h"
 #include "core_priv.h"
@@ -1148,12 +1149,41 @@ static const struct file_operations uverbs_mmap_fops = {
 	.compat_ioctl = ib_uverbs_ioctl,
 };
 
+static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data,
+				 struct ib_client_nl_info *res)
+{
+	struct ib_uverbs_device *uverbs_dev = client_data;
+	int ret;
+
+	if (res->port != -1)
+		return -EINVAL;
+
+	res->abi = ibdev->ops.uverbs_abi_ver;
+	res->cdev = &uverbs_dev->dev;
+
+	/*
+	 * To support DRIVER_ID binding in userspace some of the driver need
+	 * upgrading to expose their PCI dependent revision information
+	 * through get_context instead of relying on modalias matching. When
+	 * the drivers are fixed they can drop this flag.
+	 */
+	if (!ibdev->ops.uverbs_no_driver_id_binding) {
+		ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID,
+				  ibdev->ops.driver_id);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 static struct ib_client uverbs_client = {
 	.name   = "uverbs",
 	.no_kverbs_req = true,
 	.add    = ib_uverbs_add_one,
-	.remove = ib_uverbs_remove_one
+	.remove = ib_uverbs_remove_one,
+	.get_nl_info = ib_uverbs_get_nl_info,
 };
+MODULE_ALIAS_RDMA_CLIENT("uverbs");
 
 static ssize_t ibdev_show(struct device *device, struct device_attribute *attr,
 			  char *buf)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index acba96f289cc..810fa96af2e9 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1230,6 +1230,7 @@ static const struct ib_device_ops iwch_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_CXGB3,
 	.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION,
+	.uverbs_no_driver_id_binding = 1,
 
 	.alloc_hw_stats	= iwch_alloc_stats,
 	.alloc_mr = iwch_alloc_mr,
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 3e45b119b0eb..c0e819ed8c9b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -417,6 +417,7 @@ static const struct ib_device_ops hns_roce_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_HNS,
 	.uverbs_abi_ver = 1,
+	.uverbs_no_driver_id_binding = 1,
 
 	.add_gid = hns_roce_add_gid,
 	.alloc_pd = hns_roce_alloc_pd,
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index efd4e3d13ae2..d97124bee703 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -1147,6 +1147,7 @@ static const struct ib_device_ops mthca_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_MTHCA,
 	.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION,
+	.uverbs_no_driver_id_binding = 1,
 
 	.alloc_pd = mthca_alloc_pd,
 	.alloc_ucontext = mthca_alloc_ucontext,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index a1265e9ce2d1..6f09fcc21d7a 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2321,6 +2321,7 @@ struct ib_device_ops {
 	struct module *owner;
 	enum rdma_driver_id driver_id;
 	u32 uverbs_abi_ver;
+	unsigned int uverbs_no_driver_id_binding:1;
 
 	int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr,
 			 const struct ib_send_wr **bad_send_wr);
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 9903db21a42c..b27c02185dcc 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -504,6 +504,7 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_CHARDEV_NAME,		/* string */
 	RDMA_NLDEV_ATTR_CHARDEV_ABI,		/* u64 */
 	RDMA_NLDEV_ATTR_CHARDEV,		/* u64 */
+	RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID,       /* u64 */
 
 	/*
 	 * Always the end
-- 
cgit v1.2.3


From 7ef91224c4864202958b018cd5612db5cc9dc67d Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Tue, 18 Jun 2019 21:05:26 +0200
Subject: clk: samsung: Add bus clock for GPU/G3D on Exynos4412

Add ID and gate for bus clock for GPU (Mali 400) on Exynos4412.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
---
 drivers/clk/samsung/clk-exynos4.c   | 1 +
 include/dt-bindings/clock/exynos4.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/drivers/clk/samsung/clk-exynos4.c b/drivers/clk/samsung/clk-exynos4.c
index d2a68a792a21..ed4af7da9c4f 100644
--- a/drivers/clk/samsung/clk-exynos4.c
+++ b/drivers/clk/samsung/clk-exynos4.c
@@ -961,6 +961,7 @@ static const struct samsung_gate_clock exynos4210_gate_clks[] __initconst = {
 
 /* list of gate clocks supported in exynos4x12 soc */
 static const struct samsung_gate_clock exynos4x12_gate_clks[] __initconst = {
+	GATE(CLK_ASYNC_G3D, "async_g3d", "aclk200", GATE_IP_LEFTBUS, 6, 0, 0),
 	GATE(CLK_AUDSS, "audss", "sclk_epll", E4X12_GATE_IP_MAUDIO, 0, 0, 0),
 	GATE(CLK_MDNIE0, "mdnie0", "aclk160", GATE_IP_LCD0, 2, 0, 0),
 	GATE(CLK_ROTATOR, "rotator", "aclk200", E4X12_GATE_IP_IMAGE, 1, 0, 0),
diff --git a/include/dt-bindings/clock/exynos4.h b/include/dt-bindings/clock/exynos4.h
index a0439ce8e8d3..88ec3968b90a 100644
--- a/include/dt-bindings/clock/exynos4.h
+++ b/include/dt-bindings/clock/exynos4.h
@@ -187,6 +187,7 @@
 #define CLK_MIPI_HSI		349 /* Exynos4210 only */
 #define CLK_PIXELASYNCM0	351
 #define CLK_PIXELASYNCM1	352
+#define CLK_ASYNC_G3D		353 /* Exynos4x12 only */
 #define CLK_PWM_ISP_SCLK	379 /* Exynos4x12 only */
 #define CLK_SPI0_ISP_SCLK	380 /* Exynos4x12 only */
 #define CLK_SPI1_ISP_SCLK	381 /* Exynos4x12 only */
-- 
cgit v1.2.3


From 772cd52c5574b04b00a97d638b2cfe94c0c1a9b6 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Wed, 19 Jun 2019 12:17:48 +0200
Subject: drm/connector: Add documentation for drm_cmdline_mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The struct drm_cmdline_mode holds the result of the command line parsers.
However, it wasn't documented so far, so let's do that.

Reviewed-by: Noralf Trønnes <noralf@tronnes.org>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/963c893c16c6a25fc469b53c726f493d99bdc578.1560783090.git-series.maxime.ripard@bootlin.com
---
 include/drm/drm_connector.h | 86 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index c6f8486d8b8f..c802780b0bfc 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -923,18 +923,100 @@ struct drm_connector_funcs {
 				   const struct drm_connector_state *state);
 };
 
-/* mode specified on the command line */
+/**
+ * struct drm_cmdline_mode - DRM Mode passed through the kernel command-line
+ *
+ * Each connector can have an initial mode with additional options
+ * passed through the kernel command line. This structure allows to
+ * express those parameters and will be filled by the command-line
+ * parser.
+ */
 struct drm_cmdline_mode {
+	/**
+	 * @specified:
+	 *
+	 * Has a mode been read from the command-line?
+	 */
 	bool specified;
+
+	/**
+	 * @refresh_specified:
+	 *
+	 * Did the mode have a preferred refresh rate?
+	 */
 	bool refresh_specified;
+
+	/**
+	 * @bpp_specified:
+	 *
+	 * Did the mode have a preferred BPP?
+	 */
 	bool bpp_specified;
-	int xres, yres;
+
+	/**
+	 * @xres:
+	 *
+	 * Active resolution on the X axis, in pixels.
+	 */
+	int xres;
+
+	/**
+	 * @yres:
+	 *
+	 * Active resolution on the Y axis, in pixels.
+	 */
+	int yres;
+
+	/**
+	 * @bpp:
+	 *
+	 * Bits per pixels for the mode.
+	 */
 	int bpp;
+
+	/**
+	 * @refresh:
+	 *
+	 * Refresh rate, in Hertz.
+	 */
 	int refresh;
+
+	/**
+	 * @rb:
+	 *
+	 * Do we need to use reduced blanking?
+	 */
 	bool rb;
+
+	/**
+	 * @interlace:
+	 *
+	 * The mode is interlaced.
+	 */
 	bool interlace;
+
+	/**
+	 * @cvt:
+	 *
+	 * The timings will be calculated using the VESA Coordinated
+	 * Video Timings instead of looking up the mode from a table.
+	 */
 	bool cvt;
+
+	/**
+	 * @margins:
+	 *
+	 * Add margins to the mode calculation (1.8% of xres rounded
+	 * down to 8 pixels and 1.8% of yres).
+	 */
 	bool margins;
+
+	/**
+	 * @force:
+	 *
+	 * Ignore the hotplug state of the connector, and force its
+	 * state to one of the DRM_FORCE_* values.
+	 */
 	enum drm_connector_force force;
 };
 
-- 
cgit v1.2.3


From a99076e87e0644088b09182bba147754ba7238ad Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Wed, 19 Jun 2019 12:17:49 +0200
Subject: drm/client: Change drm_client_panel_rotation name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The drm_client_panel_rotation function has been used so far to set the
default rotation based on the panel orientation.

However, we can have more sources of information to make that decision,
starting with the command line that we will introduce later in this series.

Change the name to remove the panel mention.

Reviewed-by: Noralf Trønnes <noralf@tronnes.org>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/8cb0f0d9569d41685bbf30a1538da6578cd2769b.1560783090.git-series.maxime.ripard@bootlin.com
---
 drivers/gpu/drm/drm_client_modeset.c | 12 ++++++------
 drivers/gpu/drm/drm_fb_helper.c      |  2 +-
 include/drm/drm_client.h             |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c
index b4e5fb0a17cf..4869a0170bec 100644
--- a/drivers/gpu/drm/drm_client_modeset.c
+++ b/drivers/gpu/drm/drm_client_modeset.c
@@ -804,19 +804,19 @@ free_connectors:
 EXPORT_SYMBOL(drm_client_modeset_probe);
 
 /**
- * drm_client_panel_rotation() - Check panel orientation
+ * drm_client_rotation() - Check the initial rotation value
  * @modeset: DRM modeset
  * @rotation: Returned rotation value
  *
- * This function checks if the primary plane in @modeset can hw rotate to match
- * the panel orientation on its connector.
+ * This function checks if the primary plane in @modeset can hw rotate
+ * to match the rotation needed on its connector.
  *
  * Note: Currently only 0 and 180 degrees are supported.
  *
  * Return:
  * True if the plane can do the rotation, false otherwise.
  */
-bool drm_client_panel_rotation(struct drm_mode_set *modeset, unsigned int *rotation)
+bool drm_client_rotation(struct drm_mode_set *modeset, unsigned int *rotation)
 {
 	struct drm_connector *connector = modeset->connectors[0];
 	struct drm_plane *plane = modeset->crtc->primary;
@@ -857,7 +857,7 @@ bool drm_client_panel_rotation(struct drm_mode_set *modeset, unsigned int *rotat
 
 	return true;
 }
-EXPORT_SYMBOL(drm_client_panel_rotation);
+EXPORT_SYMBOL(drm_client_rotation);
 
 static int drm_client_modeset_commit_atomic(struct drm_client_dev *client, bool active)
 {
@@ -902,7 +902,7 @@ retry:
 		struct drm_plane *primary = mode_set->crtc->primary;
 		unsigned int rotation;
 
-		if (drm_client_panel_rotation(mode_set, &rotation)) {
+		if (drm_client_rotation(mode_set, &rotation)) {
 			struct drm_plane_state *plane_state;
 
 			/* Cannot fail as we've already gotten the plane state above */
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 42852cae749b..1984e5c54d58 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -1722,7 +1722,7 @@ static void drm_setup_crtcs_fb(struct drm_fb_helper *fb_helper)
 
 		modeset->fb = fb_helper->fb;
 
-		if (drm_client_panel_rotation(modeset, &rotation))
+		if (drm_client_rotation(modeset, &rotation))
 			/* Rotating in hardware, fbcon should not rotate */
 			sw_rotations |= DRM_MODE_ROTATE_0;
 		else
diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h
index f2d5ed745733..72d51d1e9dd9 100644
--- a/include/drm/drm_client.h
+++ b/include/drm/drm_client.h
@@ -153,7 +153,7 @@ void drm_client_framebuffer_delete(struct drm_client_buffer *buffer);
 int drm_client_modeset_create(struct drm_client_dev *client);
 void drm_client_modeset_free(struct drm_client_dev *client);
 int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int height);
-bool drm_client_panel_rotation(struct drm_mode_set *modeset, unsigned int *rotation);
+bool drm_client_rotation(struct drm_mode_set *modeset, unsigned int *rotation);
 int drm_client_modeset_commit_force(struct drm_client_dev *client);
 int drm_client_modeset_commit(struct drm_client_dev *client);
 int drm_client_modeset_dpms(struct drm_client_dev *client, int mode);
-- 
cgit v1.2.3


From 3aeeb13d899627fe2b86bdbdcd0927cf7192234f Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Wed, 19 Jun 2019 12:17:50 +0200
Subject: drm/modes: Support modes names on the command line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The drm subsystem also uses the video= kernel parameter, and in the
documentation refers to the fbdev documentation for that parameter.

However, that documentation also says that instead of giving the mode using
its resolution we can also give a name. However, DRM doesn't handle that
case at the moment. Even though in most case it shouldn't make any
difference, it might be useful for analog modes, where different standards
might have the same resolution, but still have a few different parameters
that are not encoded in the modes (NTSC vs NTSC-J vs PAL-M for example).

Reviewed-by: Noralf Trønnes <noralf@tronnes.org>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/18443e0c3bdbbd16cea4ec63bc7f2079b820b43b.1560783090.git-series.maxime.ripard@bootlin.com
---
 drivers/gpu/drm/drm_client_modeset.c |  4 +++
 drivers/gpu/drm/drm_connector.c      |  3 +-
 drivers/gpu/drm/drm_modes.c          | 62 ++++++++++++++++++++++++++----------
 include/drm/drm_connector.h          |  7 ++++
 4 files changed, 59 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c
index 4869a0170bec..33d4988f22ae 100644
--- a/drivers/gpu/drm/drm_client_modeset.c
+++ b/drivers/gpu/drm/drm_client_modeset.c
@@ -149,6 +149,10 @@ drm_connector_pick_cmdline_mode(struct drm_connector *connector)
 	prefer_non_interlace = !cmdline_mode->interlace;
 again:
 	list_for_each_entry(mode, &connector->modes, head) {
+		/* Check (optional) mode name first */
+		if (!strcmp(mode->name, cmdline_mode->name))
+			return mode;
+
 		/* check width/height */
 		if (mode->hdisplay != cmdline_mode->xres ||
 		    mode->vdisplay != cmdline_mode->yres)
diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index 3ccdcf3dfcde..3afed5677946 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -139,8 +139,9 @@ static void drm_connector_get_cmdline_mode(struct drm_connector *connector)
 		connector->force = mode->force;
 	}
 
-	DRM_DEBUG_KMS("cmdline mode for connector %s %dx%d@%dHz%s%s%s\n",
+	DRM_DEBUG_KMS("cmdline mode for connector %s %s %dx%d@%dHz%s%s%s\n",
 		      connector->name,
+		      mode->name ? mode->name : "",
 		      mode->xres, mode->yres,
 		      mode->refresh_specified ? mode->refresh : 60,
 		      mode->rb ? " reduced blanking" : "",
diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
index 6debbd6c1763..429d3be17800 100644
--- a/drivers/gpu/drm/drm_modes.c
+++ b/drivers/gpu/drm/drm_modes.c
@@ -1580,7 +1580,7 @@ bool drm_mode_parse_command_line_for_connector(const char *mode_option,
 					       struct drm_cmdline_mode *mode)
 {
 	const char *name;
-	bool parse_extras = false;
+	bool named_mode = false, parse_extras = false;
 	unsigned int bpp_off = 0, refresh_off = 0;
 	unsigned int mode_end = 0;
 	char *bpp_ptr = NULL, *refresh_ptr = NULL, *extra_ptr = NULL;
@@ -1599,8 +1599,22 @@ bool drm_mode_parse_command_line_for_connector(const char *mode_option,
 
 	name = mode_option;
 
-	if (!isdigit(name[0]))
-		return false;
+	/*
+	 * This is a bit convoluted. To differentiate between the
+	 * named modes and poorly formatted resolutions, we need a
+	 * bunch of things:
+	 *   - We need to make sure that the first character (which
+	 *     would be our resolution in X) is a digit.
+	 *   - However, if the X resolution is missing, then we end up
+	 *     with something like x<yres>, with our first character
+	 *     being an alpha-numerical character, which would be
+	 *     considered a named mode.
+	 *
+	 * If this isn't enough, we should add more heuristics here,
+	 * and matching unit-tests.
+	 */
+	if (!isdigit(name[0]) && name[0] != 'x')
+		named_mode = true;
 
 	/* Try to locate the bpp and refresh specifiers, if any */
 	bpp_ptr = strchr(name, '-');
@@ -1611,6 +1625,9 @@ bool drm_mode_parse_command_line_for_connector(const char *mode_option,
 
 	refresh_ptr = strchr(name, '@');
 	if (refresh_ptr) {
+		if (named_mode)
+			return false;
+
 		refresh_off = refresh_ptr - name;
 		mode->refresh_specified = true;
 	}
@@ -1627,12 +1644,16 @@ bool drm_mode_parse_command_line_for_connector(const char *mode_option,
 		parse_extras = true;
 	}
 
-	ret = drm_mode_parse_cmdline_res_mode(name, mode_end,
-					      parse_extras,
-					      connector,
-					      mode);
-	if (ret)
-		return false;
+	if (named_mode) {
+		strncpy(mode->name, name, mode_end);
+	} else {
+		ret = drm_mode_parse_cmdline_res_mode(name, mode_end,
+						      parse_extras,
+						      connector,
+						      mode);
+		if (ret)
+			return false;
+	}
 	mode->specified = true;
 
 	if (bpp_ptr) {
@@ -1660,14 +1681,23 @@ bool drm_mode_parse_command_line_for_connector(const char *mode_option,
 		extra_ptr = refresh_end_ptr;
 
 	if (extra_ptr) {
-		int remaining = strlen(name) - (extra_ptr - name);
+		if (!named_mode) {
+			int len = strlen(name) - (extra_ptr - name);
 
-		/*
-		 * We still have characters to process, while
-		 * we shouldn't have any
-		 */
-		if (remaining > 0)
-			return false;
+			ret = drm_mode_parse_cmdline_extra(extra_ptr, len,
+							   connector, mode);
+			if (ret)
+				return false;
+		} else {
+			int remaining = strlen(name) - (extra_ptr - name);
+
+			/*
+			 * We still have characters to process, while
+			 * we shouldn't have any
+			 */
+			if (remaining > 0)
+				return false;
+		}
 	}
 
 	return true;
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index c802780b0bfc..cdf2fb910010 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -932,6 +932,13 @@ struct drm_connector_funcs {
  * parser.
  */
 struct drm_cmdline_mode {
+	/**
+	 * @name:
+	 *
+	 * Name of the mode.
+	 */
+	char name[DRM_DISPLAY_MODE_LEN];
+
 	/**
 	 * @specified:
 	 *
-- 
cgit v1.2.3


From 1bf4e09227c345e246062285eba4b8fe660e512e Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Wed, 19 Jun 2019 12:17:51 +0200
Subject: drm/modes: Allow to specify rotation and reflection on the
 commandline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rotations and reflections setup are needed in some scenarios to initialise
properly the initial framebuffer. Some drivers already had a bunch of
quirks to deal with this, such as either a private kernel command line
parameter (omapdss) or on the device tree (various panels).

In order to accomodate this, let's create a video mode parameter to deal
with the rotation and reflexion.

Reviewed-by: Noralf Trønnes <noralf@tronnes.org>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/777da16e42db757c1f5b414b5ca34507097fed5c.1560783090.git-series.maxime.ripard@bootlin.com
---
 Documentation/fb/modedb.txt          |  12 ++++
 drivers/gpu/drm/drm_client_modeset.c |  30 +++++++++
 drivers/gpu/drm/drm_modes.c          | 114 +++++++++++++++++++++++++++++------
 include/drm/drm_connector.h          |  10 +++
 4 files changed, 146 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/Documentation/fb/modedb.txt b/Documentation/fb/modedb.txt
index 16aa08453911..52418c6dbfc4 100644
--- a/Documentation/fb/modedb.txt
+++ b/Documentation/fb/modedb.txt
@@ -51,6 +51,18 @@ To force the VGA output to be enabled and drive a specific mode say:
 Specifying the option multiple times for different ports is possible, e.g.:
     video=LVDS-1:d video=HDMI-1:D
 
+Options can also be passed after the mode, using commas as separator.
+
+       Sample usage: 720x480,rotate=180 - 720x480 mode, rotated by 180 degrees
+
+Valid options are:
+
+  - reflect_x (boolean): Perform an axial symmetry on the X axis
+  - reflect_y (boolean): Perform an axial symmetry on the Y axis
+  - rotate (integer): Rotate the initial framebuffer by x
+    degrees. Valid values are 0, 90, 180 and 270.
+
+
 ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo *****
 
 What is the VESA(TM) Coordinated Video Timings (CVT)?
diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c
index 33d4988f22ae..e95fceac8f8b 100644
--- a/drivers/gpu/drm/drm_client_modeset.c
+++ b/drivers/gpu/drm/drm_client_modeset.c
@@ -824,6 +824,7 @@ bool drm_client_rotation(struct drm_mode_set *modeset, unsigned int *rotation)
 {
 	struct drm_connector *connector = modeset->connectors[0];
 	struct drm_plane *plane = modeset->crtc->primary;
+	struct drm_cmdline_mode *cmdline;
 	u64 valid_mask = 0;
 	unsigned int i;
 
@@ -844,6 +845,35 @@ bool drm_client_rotation(struct drm_mode_set *modeset, unsigned int *rotation)
 		*rotation = DRM_MODE_ROTATE_0;
 	}
 
+	/**
+	 * The panel already defined the default rotation
+	 * through its orientation. Whatever has been provided
+	 * on the command line needs to be added to that.
+	 *
+	 * Unfortunately, the rotations are at different bit
+	 * indices, so the math to add them up are not as
+	 * trivial as they could.
+	 *
+	 * Reflections on the other hand are pretty trivial to deal with, a
+	 * simple XOR between the two handle the addition nicely.
+	 */
+	cmdline = &connector->cmdline_mode;
+	if (cmdline->specified) {
+		unsigned int cmdline_rest, panel_rest;
+		unsigned int cmdline_rot, panel_rot;
+		unsigned int sum_rot, sum_rest;
+
+		panel_rot = ilog2(*rotation & DRM_MODE_ROTATE_MASK);
+		cmdline_rot = ilog2(cmdline->rotation_reflection & DRM_MODE_ROTATE_MASK);
+		sum_rot = (panel_rot + cmdline_rot) % 4;
+
+		panel_rest = *rotation & ~DRM_MODE_ROTATE_MASK;
+		cmdline_rest = cmdline->rotation_reflection & ~DRM_MODE_ROTATE_MASK;
+		sum_rest = panel_rest ^ cmdline_rest;
+
+		*rotation = (1 << sum_rot) | sum_rest;
+	}
+
 	/*
 	 * TODO: support 90 / 270 degree hardware rotation,
 	 * depending on the hardware this may require the framebuffer
diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
index 429d3be17800..dc6d11292685 100644
--- a/drivers/gpu/drm/drm_modes.c
+++ b/drivers/gpu/drm/drm_modes.c
@@ -1554,6 +1554,71 @@ static int drm_mode_parse_cmdline_res_mode(const char *str, unsigned int length,
 	return 0;
 }
 
+static int drm_mode_parse_cmdline_options(char *str, size_t len,
+					  struct drm_connector *connector,
+					  struct drm_cmdline_mode *mode)
+{
+	unsigned int rotation = 0;
+	char *sep = str;
+
+	while ((sep = strchr(sep, ','))) {
+		char *delim, *option;
+
+		option = sep + 1;
+		delim = strchr(option, '=');
+		if (!delim) {
+			delim = strchr(option, ',');
+
+			if (!delim)
+				delim = str + len;
+		}
+
+		if (!strncmp(option, "rotate", delim - option)) {
+			const char *value = delim + 1;
+			unsigned int deg;
+
+			deg = simple_strtol(value, &sep, 10);
+
+			/* Make sure we have parsed something */
+			if (sep == value)
+				return -EINVAL;
+
+			switch (deg) {
+			case 0:
+				rotation |= DRM_MODE_ROTATE_0;
+				break;
+
+			case 90:
+				rotation |= DRM_MODE_ROTATE_90;
+				break;
+
+			case 180:
+				rotation |= DRM_MODE_ROTATE_180;
+				break;
+
+			case 270:
+				rotation |= DRM_MODE_ROTATE_270;
+				break;
+
+			default:
+				return -EINVAL;
+			}
+		} else if (!strncmp(option, "reflect_x", delim - option)) {
+			rotation |= DRM_MODE_REFLECT_X;
+			sep = delim;
+		} else if (!strncmp(option, "reflect_y", delim - option)) {
+			rotation |= DRM_MODE_REFLECT_Y;
+			sep = delim;
+		} else {
+			return -EINVAL;
+		}
+	}
+
+	mode->rotation_reflection = rotation;
+
+	return 0;
+}
+
 /**
  * drm_mode_parse_command_line_for_connector - parse command line modeline for connector
  * @mode_option: optional per connector mode option
@@ -1569,6 +1634,10 @@ static int drm_mode_parse_cmdline_res_mode(const char *str, unsigned int length,
  *
  *	<xres>x<yres>[M][R][-<bpp>][@<refresh>][i][m][eDd]
  *
+ * Additionals options can be provided following the mode, using a comma to
+ * separate each option. Valid options can be found in
+ * Documentation/fb/modedb.txt.
+ *
  * The intermediate drm_cmdline_mode structure is required to store additional
  * options from the command line modline like the force-enable/disable flag.
  *
@@ -1581,9 +1650,10 @@ bool drm_mode_parse_command_line_for_connector(const char *mode_option,
 {
 	const char *name;
 	bool named_mode = false, parse_extras = false;
-	unsigned int bpp_off = 0, refresh_off = 0;
+	unsigned int bpp_off = 0, refresh_off = 0, options_off = 0;
 	unsigned int mode_end = 0;
 	char *bpp_ptr = NULL, *refresh_ptr = NULL, *extra_ptr = NULL;
+	char *options_ptr = NULL;
 	char *bpp_end_ptr = NULL, *refresh_end_ptr = NULL;
 	int ret;
 
@@ -1632,13 +1702,18 @@ bool drm_mode_parse_command_line_for_connector(const char *mode_option,
 		mode->refresh_specified = true;
 	}
 
+	/* Locate the start of named options */
+	options_ptr = strchr(name, ',');
+	if (options_ptr)
+		options_off = options_ptr - name;
+
 	/* Locate the end of the name / resolution, and parse it */
-	if (bpp_ptr && refresh_ptr) {
-		mode_end = min(bpp_off, refresh_off);
-	} else if (bpp_ptr) {
+	if (bpp_ptr) {
 		mode_end = bpp_off;
 	} else if (refresh_ptr) {
 		mode_end = refresh_off;
+	} else if (options_ptr) {
+		mode_end = options_off;
 	} else {
 		mode_end = strlen(name);
 		parse_extras = true;
@@ -1680,24 +1755,23 @@ bool drm_mode_parse_command_line_for_connector(const char *mode_option,
 	else if (refresh_ptr)
 		extra_ptr = refresh_end_ptr;
 
-	if (extra_ptr) {
-		if (!named_mode) {
-			int len = strlen(name) - (extra_ptr - name);
+	if (extra_ptr &&
+	    extra_ptr != options_ptr) {
+		int len = strlen(name) - (extra_ptr - name);
 
-			ret = drm_mode_parse_cmdline_extra(extra_ptr, len,
-							   connector, mode);
-			if (ret)
-				return false;
-		} else {
-			int remaining = strlen(name) - (extra_ptr - name);
+		ret = drm_mode_parse_cmdline_extra(extra_ptr, len,
+						   connector, mode);
+		if (ret)
+			return false;
+	}
 
-			/*
-			 * We still have characters to process, while
-			 * we shouldn't have any
-			 */
-			if (remaining > 0)
-				return false;
-		}
+	if (options_ptr) {
+		int len = strlen(name) - (options_ptr - name);
+
+		ret = drm_mode_parse_cmdline_options(options_ptr, len,
+						     connector, mode);
+		if (ret)
+			return false;
 	}
 
 	return true;
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index cdf2fb910010..8eebe0432c73 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -1025,6 +1025,16 @@ struct drm_cmdline_mode {
 	 * state to one of the DRM_FORCE_* values.
 	 */
 	enum drm_connector_force force;
+
+	/**
+	 * @rotation_reflection:
+	 *
+	 * Initial rotation and reflection of the mode setup from the
+	 * command line. See DRM_MODE_ROTATE_* and
+	 * DRM_MODE_REFLECT_*. The only rotations supported are
+	 * DRM_MODE_ROTATE_0 and DRM_MODE_ROTATE_180.
+	 */
+	unsigned int rotation_reflection;
 };
 
 /**
-- 
cgit v1.2.3


From 22045e8e52bd802f743f0471242782fc3b479707 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Wed, 19 Jun 2019 12:17:51 +0200
Subject: drm/connector: Introduce a TV margins structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TV margins has been defined as a structure inside the
drm_connector_state structure so far. However, we will need it in other
structures as well, so let's move that structure definition so that it can
be reused.

Reviewed-by: Noralf Trønnes <noralf@tronnes.org>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/38b773b03f15ec7a135cdf8f7db669e5ada20cf2.1560783090.git-series.maxime.ripard@bootlin.com
---
 include/drm/drm_connector.h | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index 8eebe0432c73..b22e3150e33d 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -463,14 +463,38 @@ int drm_display_info_set_bus_formats(struct drm_display_info *info,
 				     const u32 *formats,
 				     unsigned int num_formats);
 
+/**
+ * struct drm_connector_tv_margins - TV connector related margins
+ *
+ * Describes the margins in pixels to put around the image on TV
+ * connectors to deal with overscan.
+ */
+struct drm_connector_tv_margins {
+	/**
+	 * @bottom: Bottom margin in pixels.
+	 */
+	unsigned int bottom;
+
+	/**
+	 * @left: Left margin in pixels.
+	 */
+	unsigned int left;
+
+	/**
+	 * @right: Right margin in pixels.
+	 */
+	unsigned int right;
+
+	/**
+	 * @top: Top margin in pixels.
+	 */
+	unsigned int top;
+};
+
 /**
  * struct drm_tv_connector_state - TV connector related states
  * @subconnector: selected subconnector
- * @margins: margins (all margins are expressed in pixels)
- * @margins.left: left margin
- * @margins.right: right margin
- * @margins.top: top margin
- * @margins.bottom: bottom margin
+ * @margins: TV margins
  * @mode: TV mode
  * @brightness: brightness in percent
  * @contrast: contrast in percent
@@ -481,12 +505,7 @@ int drm_display_info_set_bus_formats(struct drm_display_info *info,
  */
 struct drm_tv_connector_state {
 	enum drm_mode_subconnector subconnector;
-	struct {
-		unsigned int left;
-		unsigned int right;
-		unsigned int top;
-		unsigned int bottom;
-	} margins;
+	struct drm_connector_tv_margins margins;
 	unsigned int mode;
 	unsigned int brightness;
 	unsigned int contrast;
-- 
cgit v1.2.3


From 3d46a3007cd8f73bae502bf5c171977b91d7aacc Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Wed, 19 Jun 2019 12:17:51 +0200
Subject: drm/modes: Parse overscan properties
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Properly configuring the overscan properties might be needed for the
initial setup of the framebuffer for display that still have overscan.
Let's allow for more properties on the kernel command line to setup each
margin.

Reviewed-by: Noralf Trønnes <noralf@tronnes.org>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/e481f1628e3768ca49226ec2115cfa4dfcbd5e4c.1560783090.git-series.maxime.ripard@bootlin.com
---
 Documentation/fb/modedb.txt |  2 ++
 drivers/gpu/drm/drm_modes.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 include/drm/drm_connector.h |  5 +++++
 3 files changed, 51 insertions(+)

(limited to 'include')

diff --git a/Documentation/fb/modedb.txt b/Documentation/fb/modedb.txt
index 52418c6dbfc4..1dd5a52f9390 100644
--- a/Documentation/fb/modedb.txt
+++ b/Documentation/fb/modedb.txt
@@ -57,6 +57,8 @@ Options can also be passed after the mode, using commas as separator.
 
 Valid options are:
 
+  - margin_top, margin_bottom, margin_left, margin_right (integer):
+    Number of pixels in the margins, typically to deal with overscan on TVs
   - reflect_x (boolean): Perform an axial symmetry on the X axis
   - reflect_y (boolean): Perform an axial symmetry on the Y axis
   - rotate (integer): Rotate the initial framebuffer by x
diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
index dc6d11292685..57e6408288c8 100644
--- a/drivers/gpu/drm/drm_modes.c
+++ b/drivers/gpu/drm/drm_modes.c
@@ -1609,6 +1609,50 @@ static int drm_mode_parse_cmdline_options(char *str, size_t len,
 		} else if (!strncmp(option, "reflect_y", delim - option)) {
 			rotation |= DRM_MODE_REFLECT_Y;
 			sep = delim;
+		} else if (!strncmp(option, "margin_right", delim - option)) {
+			const char *value = delim + 1;
+			unsigned int margin;
+
+			margin = simple_strtol(value, &sep, 10);
+
+			/* Make sure we have parsed something */
+			if (sep == value)
+				return -EINVAL;
+
+			mode->tv_margins.right = margin;
+		} else if (!strncmp(option, "margin_left", delim - option)) {
+			const char *value = delim + 1;
+			unsigned int margin;
+
+			margin = simple_strtol(value, &sep, 10);
+
+			/* Make sure we have parsed something */
+			if (sep == value)
+				return -EINVAL;
+
+			mode->tv_margins.left = margin;
+		} else if (!strncmp(option, "margin_top", delim - option)) {
+			const char *value = delim + 1;
+			unsigned int margin;
+
+			margin = simple_strtol(value, &sep, 10);
+
+			/* Make sure we have parsed something */
+			if (sep == value)
+				return -EINVAL;
+
+			mode->tv_margins.top = margin;
+		} else if (!strncmp(option, "margin_bottom", delim - option)) {
+			const char *value = delim + 1;
+			unsigned int margin;
+
+			margin = simple_strtol(value, &sep, 10);
+
+			/* Make sure we have parsed something */
+			if (sep == value)
+				return -EINVAL;
+
+			mode->tv_margins.bottom = margin;
 		} else {
 			return -EINVAL;
 		}
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index b22e3150e33d..ca745d9feaf5 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -1054,6 +1054,11 @@ struct drm_cmdline_mode {
 	 * DRM_MODE_ROTATE_0 and DRM_MODE_ROTATE_180.
 	 */
 	unsigned int rotation_reflection;
+
+	/**
+	 * @tv_margins: TV margins to apply to the mode.
+	 */
+	struct drm_connector_tv_margins tv_margins;
 };
 
 /**
-- 
cgit v1.2.3


From 731514b446fe6748d5a55a3dff202efb45c7d8df Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Wed, 19 Jun 2019 12:17:52 +0200
Subject: drm/atomic: Add a function to reset connector TV properties
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During the connector reset, if that connector has a TV property, it needs
to be reset to the value provided on the command line.

Provide a helper to do that.

Reviewed-by: Noralf Trønnes <noralf@tronnes.org>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/84a7b657f09303a2850e1cc79e68f623547f3fdd.1560783090.git-series.maxime.ripard@bootlin.com
---
 drivers/gpu/drm/drm_atomic_state_helper.c | 18 ++++++++++++++++++
 include/drm/drm_atomic_state_helper.h     |  1 +
 2 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c
index 7d7347a6f194..46dc264a248b 100644
--- a/drivers/gpu/drm/drm_atomic_state_helper.c
+++ b/drivers/gpu/drm/drm_atomic_state_helper.c
@@ -379,6 +379,24 @@ void drm_atomic_helper_connector_reset(struct drm_connector *connector)
 }
 EXPORT_SYMBOL(drm_atomic_helper_connector_reset);
 
+/**
+ * drm_atomic_helper_connector_tv_reset - Resets TV connector properties
+ * @connector: DRM connector
+ *
+ * Resets the TV-related properties attached to a connector.
+ */
+void drm_atomic_helper_connector_tv_reset(struct drm_connector *connector)
+{
+	struct drm_cmdline_mode *cmdline = &connector->cmdline_mode;
+	struct drm_connector_state *state = connector->state;
+
+	state->tv.margins.left = cmdline->tv_margins.left;
+	state->tv.margins.right = cmdline->tv_margins.right;
+	state->tv.margins.top = cmdline->tv_margins.top;
+	state->tv.margins.bottom = cmdline->tv_margins.bottom;
+}
+EXPORT_SYMBOL(drm_atomic_helper_connector_tv_reset);
+
 /**
  * __drm_atomic_helper_connector_duplicate_state - copy atomic connector state
  * @connector: connector object
diff --git a/include/drm/drm_atomic_state_helper.h b/include/drm/drm_atomic_state_helper.h
index 4e6d2e7a40b8..e4577cc11689 100644
--- a/include/drm/drm_atomic_state_helper.h
+++ b/include/drm/drm_atomic_state_helper.h
@@ -62,6 +62,7 @@ void drm_atomic_helper_plane_destroy_state(struct drm_plane *plane,
 void __drm_atomic_helper_connector_reset(struct drm_connector *connector,
 					 struct drm_connector_state *conn_state);
 void drm_atomic_helper_connector_reset(struct drm_connector *connector);
+void drm_atomic_helper_connector_tv_reset(struct drm_connector *connector);
 void
 __drm_atomic_helper_connector_duplicate_state(struct drm_connector *connector,
 					   struct drm_connector_state *state);
-- 
cgit v1.2.3


From a49b1dc7ae447d7085360cd587fc1c8b9ec6c871 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Wed, 12 Jun 2019 15:27:41 +0300
Subject: RDMA: Convert destroy_wq to be void

All callers of destroy WQ are always success and there is no need
to check their return value, so convert destroy_wq to be void.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/core/verbs.c      | 12 +++++-------
 drivers/infiniband/hw/mlx4/mlx4_ib.h |  2 +-
 drivers/infiniband/hw/mlx4/qp.c      |  4 +---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  2 +-
 drivers/infiniband/hw/mlx5/qp.c      |  4 +---
 include/rdma/ib_verbs.h              |  2 +-
 6 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 588f1d195fd2..16ef8a9bda4c 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -2235,19 +2235,17 @@ EXPORT_SYMBOL(ib_create_wq);
  */
 int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 {
-	int err;
 	struct ib_cq *cq = wq->cq;
 	struct ib_pd *pd = wq->pd;
 
 	if (atomic_read(&wq->usecnt))
 		return -EBUSY;
 
-	err = wq->device->ops.destroy_wq(wq, udata);
-	if (!err) {
-		atomic_dec(&pd->usecnt);
-		atomic_dec(&cq->usecnt);
-	}
-	return err;
+	wq->device->ops.destroy_wq(wq, udata);
+	atomic_dec(&pd->usecnt);
+	atomic_dec(&cq->usecnt);
+
+	return 0;
 }
 EXPORT_SYMBOL(ib_destroy_wq);
 
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 81b3d85e5167..eb53bb4c0c91 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -906,7 +906,7 @@ void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port);
 struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
 				struct ib_wq_init_attr *init_attr,
 				struct ib_udata *udata);
-int mlx4_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
+void mlx4_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
 		      u32 wq_attr_mask, struct ib_udata *udata);
 
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 5221c0794d1d..520364defa28 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -4248,7 +4248,7 @@ int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr,
 	return err;
 }
 
-int mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
+void mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibwq->device);
 	struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
@@ -4259,8 +4259,6 @@ int mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
 	destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, udata);
 
 	kfree(qp);
-
-	return 0;
 }
 
 struct ib_rwq_ind_table
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 35e2c8f5ae78..82cfe86087b6 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1201,7 +1201,7 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
 				struct ib_wq_init_attr *init_attr,
 				struct ib_udata *udata);
-int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
+void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
 		      u32 wq_attr_mask, struct ib_udata *udata);
 struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index f6623c77443a..ae847709b3d3 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -6047,7 +6047,7 @@ err:
 	return ERR_PTR(err);
 }
 
-int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
+void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(wq->device);
 	struct mlx5_ib_rwq *rwq = to_mrwq(wq);
@@ -6055,8 +6055,6 @@ int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 	mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
 	destroy_user_rq(dev, wq->pd, rwq, udata);
 	kfree(rwq);
-
-	return 0;
 }
 
 struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6f09fcc21d7a..805148a12660 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2509,7 +2509,7 @@ struct ib_device_ops {
 	struct ib_wq *(*create_wq)(struct ib_pd *pd,
 				   struct ib_wq_init_attr *init_attr,
 				   struct ib_udata *udata);
-	int (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata);
+	void (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata);
 	int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr,
 			 u32 wq_attr_mask, struct ib_udata *udata);
 	struct ib_rwq_ind_table *(*create_rwq_ind_table)(
-- 
cgit v1.2.3


From 852a6626d5fdd5dd442e6c6ab51ce0cb022d75b4 Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Tue, 18 Jul 2017 19:27:55 +0800
Subject: drm/amdgpu: add navi10 asic type

Signed-off-by: Huang Rui <ray.huang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 +
 include/drm/amd_asic_type.h                | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ddd30f31a6..4c40de13dd29 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -95,6 +95,7 @@ static const char *amdgpu_asic_name[] = {
 	"VEGA12",
 	"VEGA20",
 	"RAVEN",
+	"NAVI10",
 	"LAST",
 };
 
diff --git a/include/drm/amd_asic_type.h b/include/drm/amd_asic_type.h
index dd63d08cc54e..bcc2bcf32886 100644
--- a/include/drm/amd_asic_type.h
+++ b/include/drm/amd_asic_type.h
@@ -49,6 +49,7 @@ enum amd_asic_type {
 	CHIP_VEGA12,
 	CHIP_VEGA20,
 	CHIP_RAVEN,
+	CHIP_NAVI10,
 	CHIP_LAST,
 };
 
-- 
cgit v1.2.3


From 107c34bcbf4737b457ca44156aa11e788cbb6b83 Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Wed, 19 Jul 2017 09:43:26 +0800
Subject: drm/amdgpu: add NV series gpu family id

Signed-off-by: Huang Rui <ray.huang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/uapi/drm/amdgpu_drm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 4788730dbe78..571bf550859f 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -1044,6 +1044,7 @@ struct drm_amdgpu_info_vce_clock_table {
 #define AMDGPU_FAMILY_CZ			135 /* Carrizo, Stoney */
 #define AMDGPU_FAMILY_AI			141 /* Vega10 */
 #define AMDGPU_FAMILY_RV			142 /* Raven */
+#define AMDGPU_FAMILY_NV			143 /* Navi10 */
 
 #if defined(__cplusplus)
 }
-- 
cgit v1.2.3


From d67383e6b764957b39667af2754920e38a5eb43e Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Tue, 18 Jul 2017 18:59:24 +0800
Subject: drm/amdgpu: add GDDR6 vram type

New vram type.

Signed-off-by: Huang Rui <ray.huang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/uapi/drm/amdgpu_drm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 571bf550859f..e1e5e7860570 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -912,6 +912,7 @@ struct drm_amdgpu_info_firmware {
 #define AMDGPU_VRAM_TYPE_HBM   6
 #define AMDGPU_VRAM_TYPE_DDR3  7
 #define AMDGPU_VRAM_TYPE_DDR4  8
+#define AMDGPU_VRAM_TYPE_GDDR6 9
 
 struct drm_amdgpu_info_device {
 	/** PCI Device ID */
-- 
cgit v1.2.3


From f56044d686c82bd31713fc0398d68e322813dc62 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn <mike.marciniszyn@intel.com>
Date: Thu, 13 Jun 2019 08:30:44 -0400
Subject: IB/rdmavt: Add new completion inline

There is opencoded send completion logic all over all
the drivers.

We need to convert to this routine to enforce ordering
issues for completions.  This routine fixes an ordering
issue where the read of the SWQE fields necessary for creating
the completion can race with a post send if the post send catches
a send queue at the edge of being full.  Is is possible in that situation
to read SWQE fields that are being written.

This new routine insures that SWQE fields are read prior to advancing
the index that post send uses to determine queue fullness.

Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 include/rdma/rdmavt_qp.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

(limited to 'include')

diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 68e38c20afc0..6014f1766907 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -737,6 +737,78 @@ static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
 		atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
 }
 
+/**
+ * rvt_qp_sqwe_incr - increment ring index
+ * @qp: the qp
+ * @val: the starting value
+ *
+ * Return: the new value wrapping as appropriate
+ */
+static inline u32
+rvt_qp_swqe_incr(struct rvt_qp *qp, u32 val)
+{
+	if (++val >= qp->s_size)
+		val = 0;
+	return val;
+}
+
+/**
+ * rvt_qp_complete_swqe - insert send completion
+ * @qp - the qp
+ * @wqe - the send wqe
+ * @opcode - wc operation (driver dependent)
+ * @status - completion status
+ *
+ * Update the s_last information, and then insert a send
+ * completion into the completion
+ * queue if the qp indicates it should be done.
+ *
+ * See IBTA 10.7.3.1 for info on completion
+ * control.
+ *
+ * Return: new last
+ */
+static inline u32
+rvt_qp_complete_swqe(struct rvt_qp *qp,
+		     struct rvt_swqe *wqe,
+		     enum ib_wc_opcode opcode,
+		     enum ib_wc_status status)
+{
+	bool need_completion;
+	u64 wr_id;
+	u32 byte_len, last;
+	int flags = wqe->wr.send_flags;
+
+	rvt_put_qp_swqe(qp, wqe);
+
+	need_completion =
+		!(flags & RVT_SEND_RESERVE_USED) &&
+		(!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+		(flags & IB_SEND_SIGNALED) ||
+		status != IB_WC_SUCCESS);
+	if (need_completion) {
+		wr_id = wqe->wr.wr_id;
+		byte_len = wqe->length;
+		/* above fields required before writing s_last */
+	}
+	last = rvt_qp_swqe_incr(qp, qp->s_last);
+	/* see rvt_qp_is_avail() */
+	smp_store_release(&qp->s_last, last);
+	if (need_completion) {
+		struct ib_wc w = {
+			.wr_id = wr_id,
+			.status = status,
+			.opcode = opcode,
+			.qp = &qp->ibqp,
+			.byte_len = byte_len,
+		};
+
+		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &w,
+			     status != IB_WC_SUCCESS);
+	}
+	return last;
+}
+
 extern const int  ib_rvt_state_ops[];
 
 struct rvt_dev_info;
-- 
cgit v1.2.3


From 4a9ceb7dbadf9e1435644b1f49720ee87431ce26 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn <mike.marciniszyn@intel.com>
Date: Thu, 13 Jun 2019 08:30:52 -0400
Subject: IB/{rdmavt, qib, hfi1}: Convert to new completion API

Convert all completions to use the new completion routine that
fixes a race between post send and completion where fields from
a SWQE can be read after SWQE has been freed.

This patch also addresses issues reported in
https://marc.info/?l=linux-kernel&m=155656897409107&w=2.

The reserved operation path has no need for any barrier.

The barrier for the other path is addressed by the
smp_load_acquire() barrier.

Cc: Andrea Parri <andrea.parri@amarulasolutions.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hfi1/rc.c    | 26 ++++----------------------
 drivers/infiniband/hw/qib/qib_rc.c | 26 ++++----------------------
 drivers/infiniband/sw/rdmavt/qp.c  | 31 +++++++++----------------------
 include/rdma/rdmavt_qp.h           | 36 ------------------------------------
 4 files changed, 17 insertions(+), 102 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index a922edcf23d6..84b51cc36dbd 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -1819,23 +1819,14 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
 	}
 
 	while (qp->s_last != qp->s_acked) {
-		u32 s_last;
-
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
 		if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
 		    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
 			break;
 		trdma_clean_swqe(qp, wqe);
 		rvt_qp_wqe_unreserve(qp, wqe);
-		s_last = qp->s_last;
-		trace_hfi1_qp_send_completion(qp, wqe, s_last);
-		if (++s_last >= qp->s_size)
-			s_last = 0;
-		qp->s_last = s_last;
-		/* see post_send() */
-		barrier();
-		rvt_put_qp_swqe(qp, wqe);
-		rvt_qp_swqe_complete(qp,
+		trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
+		rvt_qp_complete_swqe(qp,
 				     wqe,
 				     ib_hfi1_wc_opcode[wqe->wr.opcode],
 				     IB_WC_SUCCESS);
@@ -1879,19 +1870,10 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
 	trace_hfi1_rc_completion(qp, wqe->lpsn);
 	if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
 	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
-		u32 s_last;
-
 		trdma_clean_swqe(qp, wqe);
-		rvt_put_qp_swqe(qp, wqe);
 		rvt_qp_wqe_unreserve(qp, wqe);
-		s_last = qp->s_last;
-		trace_hfi1_qp_send_completion(qp, wqe, s_last);
-		if (++s_last >= qp->s_size)
-			s_last = 0;
-		qp->s_last = s_last;
-		/* see post_send() */
-		barrier();
-		rvt_qp_swqe_complete(qp,
+		trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
+		rvt_qp_complete_swqe(qp,
 				     wqe,
 				     ib_hfi1_wc_opcode[wqe->wr.opcode],
 				     IB_WC_SUCCESS);
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
index 2ac4c67f5ba1..8d9a94d6f685 100644
--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -921,20 +921,11 @@ void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
 		rvt_add_retry_timer(qp);
 
 	while (qp->s_last != qp->s_acked) {
-		u32 s_last;
-
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
 		if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
 		    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
 			break;
-		s_last = qp->s_last;
-		if (++s_last >= qp->s_size)
-			s_last = 0;
-		qp->s_last = s_last;
-		/* see post_send() */
-		barrier();
-		rvt_put_qp_swqe(qp, wqe);
-		rvt_qp_swqe_complete(qp,
+		rvt_qp_complete_swqe(qp,
 				     wqe,
 				     ib_qib_wc_opcode[wqe->wr.opcode],
 				     IB_WC_SUCCESS);
@@ -972,21 +963,12 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
 	 * is finished.
 	 */
 	if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
-	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
-		u32 s_last;
-
-		rvt_put_qp_swqe(qp, wqe);
-		s_last = qp->s_last;
-		if (++s_last >= qp->s_size)
-			s_last = 0;
-		qp->s_last = s_last;
-		/* see post_send() */
-		barrier();
-		rvt_qp_swqe_complete(qp,
+	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0)
+		rvt_qp_complete_swqe(qp,
 				     wqe,
 				     ib_qib_wc_opcode[wqe->wr.opcode],
 				     IB_WC_SUCCESS);
-	} else
+	else
 		this_cpu_inc(*ibp->rvp.rc_delayed_comp);
 
 	qp->s_retry = qp->s_retry_cnt;
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index a60f5faea198..dfbc7d8640fb 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1853,10 +1853,9 @@ static inline int rvt_qp_is_avail(
 
 	/* see rvt_qp_wqe_unreserve() */
 	smp_mb__before_atomic();
-	reserved_used = atomic_read(&qp->s_reserved_used);
 	if (unlikely(reserved_op)) {
 		/* see rvt_qp_wqe_unreserve() */
-		smp_mb__before_atomic();
+		reserved_used = atomic_read(&qp->s_reserved_used);
 		if (reserved_used >= rdi->dparms.reserved_operations)
 			return -ENOMEM;
 		return 0;
@@ -1864,14 +1863,13 @@ static inline int rvt_qp_is_avail(
 	/* non-reserved operations */
 	if (likely(qp->s_avail))
 		return 0;
-	slast = READ_ONCE(qp->s_last);
+	/* See rvt_qp_complete_swqe() */
+	slast = smp_load_acquire(&qp->s_last);
 	if (qp->s_head >= slast)
 		avail = qp->s_size - (qp->s_head - slast);
 	else
 		avail = slast - qp->s_head;
 
-	/* see rvt_qp_wqe_unreserve() */
-	smp_mb__before_atomic();
 	reserved_used = atomic_read(&qp->s_reserved_used);
 	avail =  avail - 1 -
 		(rdi->dparms.reserved_operations - reserved_used);
@@ -2664,27 +2662,16 @@ void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
 		       enum ib_wc_status status)
 {
 	u32 old_last, last;
-	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+	struct rvt_dev_info *rdi;
 
 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
 		return;
+	rdi = ib_to_rvt(qp->ibqp.device);
 
-	last = qp->s_last;
-	old_last = last;
-	trace_rvt_qp_send_completion(qp, wqe, last);
-	if (++last >= qp->s_size)
-		last = 0;
-	trace_rvt_qp_send_completion(qp, wqe, last);
-	qp->s_last = last;
-	/* See post_send() */
-	barrier();
-	rvt_put_qp_swqe(qp, wqe);
-
-	rvt_qp_swqe_complete(qp,
-			     wqe,
-			     rdi->wc_opcode[wqe->wr.opcode],
-			     status);
-
+	old_last = qp->s_last;
+	trace_rvt_qp_send_completion(qp, wqe, old_last);
+	last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode],
+				    status);
 	if (qp->s_acked == old_last)
 		qp->s_acked = last;
 	if (qp->s_cur == old_last)
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 6014f1766907..84d0f36afc2f 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -565,42 +565,6 @@ static inline void rvt_qp_wqe_unreserve(
 
 extern const enum ib_wc_opcode ib_rvt_wc_opcode[];
 
-/**
- * rvt_qp_swqe_complete() - insert send completion
- * @qp - the qp
- * @wqe - the send wqe
- * @status - completion status
- *
- * Insert a send completion into the completion
- * queue if the qp indicates it should be done.
- *
- * See IBTA 10.7.3.1 for info on completion
- * control.
- */
-static inline void rvt_qp_swqe_complete(
-	struct rvt_qp *qp,
-	struct rvt_swqe *wqe,
-	enum ib_wc_opcode opcode,
-	enum ib_wc_status status)
-{
-	if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED))
-		return;
-	if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
-	     status != IB_WC_SUCCESS) {
-		struct ib_wc wc;
-
-		memset(&wc, 0, sizeof(wc));
-		wc.wr_id = wqe->wr.wr_id;
-		wc.status = status;
-		wc.opcode = opcode;
-		wc.qp = &qp->ibqp;
-		wc.byte_len = wqe->length;
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc,
-			     status != IB_WC_SUCCESS);
-	}
-}
-
 /*
  * Compare the lower 24 bits of the msn values.
  * Returns an integer <, ==, or > than zero.
-- 
cgit v1.2.3


From a78cf9657ba5426f54aa93a067c10d097944c082 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sat, 15 Jun 2019 10:23:57 +1000
Subject: PCI/ACPI: Evaluate PCI Boot Configuration _DSM

Evaluate _DSM Function #5, the "PCI Boot Configuration" function.  If the
result is 0, the OS should preserve any resource assignments made by the
firmware.

Link: https://lore.kernel.org/r/20190615002359.29577-2-benh@kernel.crashing.org
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[bhelgaas: commit log]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/acpi/pci_root.c  | 12 ++++++++++++
 include/linux/pci-acpi.h |  7 ++++---
 include/linux/pci.h      |  2 ++
 3 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index c36781a9b493..0d57f817ef1e 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -894,6 +894,7 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root,
 	int node = acpi_get_node(device->handle);
 	struct pci_bus *bus;
 	struct pci_host_bridge *host_bridge;
+	union acpi_object *obj;
 
 	info->root = root;
 	info->bridge = device;
@@ -930,6 +931,17 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root,
 	if (!(root->osc_control_set & OSC_PCI_EXPRESS_LTR_CONTROL))
 		host_bridge->native_ltr = 0;
 
+	/*
+	 * Evaluate the "PCI Boot Configuration" _DSM Function.  If it
+	 * exists and returns 0, we must preserve any PCI resource
+	 * assignments made by firmware for this host bridge.
+	 */
+	obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 1,
+	                        IGNORE_PCI_BOOT_CONFIG_DSM, NULL);
+	if (obj && obj->type == ACPI_TYPE_INTEGER && obj->integer.value == 0)
+		host_bridge->preserve_config = 1;
+	ACPI_FREE(obj);
+
 	pci_scan_child_bus(bus);
 	pci_set_host_bridge_release(host_bridge, acpi_pci_root_release_info,
 				    info);
diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index 8082b612f561..62b7fdcc661c 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -107,9 +107,10 @@ static inline void acpiphp_check_host_bridge(struct acpi_device *adev) { }
 #endif
 
 extern const guid_t pci_acpi_dsm_guid;
-#define DEVICE_LABEL_DSM	0x07
-#define RESET_DELAY_DSM		0x08
-#define FUNCTION_DELAY_DSM	0x09
+#define IGNORE_PCI_BOOT_CONFIG_DSM	0x05
+#define DEVICE_LABEL_DSM		0x07
+#define RESET_DELAY_DSM			0x08
+#define FUNCTION_DELAY_DSM		0x09
 
 #else	/* CONFIG_ACPI */
 static inline void acpi_pci_add_bus(struct pci_bus *bus) { }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4a5a84d7bdd4..5e2b309363a3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -505,6 +505,8 @@ struct pci_host_bridge {
 	unsigned int	native_shpc_hotplug:1;	/* OS may use SHPC hotplug */
 	unsigned int	native_pme:1;		/* OS may use PCIe PME */
 	unsigned int	native_ltr:1;		/* OS may use PCIe LTR */
+	unsigned int	preserve_config:1;	/* Preserve FW resource setup */
+
 	/* Resource alignment requirements */
 	resource_size_t (*align_resource)(struct pci_dev *dev,
 			const struct resource *res,
-- 
cgit v1.2.3


From 22e96fa62ec66ca9d21a76b74601fc200908fec3 Mon Sep 17 00:00:00 2001
From: Hawking Zhang <Hawking.Zhang@amd.com>
Date: Tue, 12 Jun 2018 18:30:04 +0800
Subject: drm/amdgpu: add pa_sc_tile_steering_override to
 drm_amdgpu_info_device

the initial/default value of pa_sc_tile_steering_override need to
be exposed to user mode driver

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/uapi/drm/amdgpu_drm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index e1e5e7860570..d799858b9e53 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -992,6 +992,8 @@ struct drm_amdgpu_info_device {
 	__u64 high_va_offset;
 	/** The maximum high virtual address */
 	__u64 high_va_max;
+	/* gfx10 pa_sc_tile_steering_override */
+	__u32 pa_sc_tile_steering_override;
 };
 
 struct drm_amdgpu_info_hw_ip {
-- 
cgit v1.2.3


From d308dfbf62eff897d71968d764f21a78678ee0a5 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 18 Jun 2019 12:58:33 +0200
Subject: i2c: mux/i801: Switch to use descriptor passing

This switches the i801 GPIO mux to use GPIO descriptors for
handling the GPIO lines. The previous hack which was reaching
inside the GPIO chips etc cannot live on. We pass descriptors
along with the GPIO mux device at creation instead.

The GPIO mux was only used by way of platform data with a
platform device from one place in the kernel: the i801 i2c bus
driver. Let's just associate the GPIO descriptor table with
the actual device like everyone else and dynamically create
a descriptor table passed along with the GPIO i2c mux.

This enables simplification of the GPIO i2c mux driver to
use only the descriptor API and the OF probe path gets
simplified in the process.

The i801 driver was registering the GPIO i2c mux with
PLATFORM_DEVID_AUTO which would make it hard to predict the
device name and assign the descriptor table properly, but
this seems to be a mistake to begin with: all of the
GPIO mux devices are hardcoded to look up GPIO lines from
the "gpio_ich" GPIO chip. If there are more than one mux,
there is certainly more than one gpio chip as well, and
then we have more serious problems. Switch to
PLATFORM_DEVID_NONE instead. There can be only one.

Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Peter Rosin <peda@axentia.se>
Cc: Jean Delvare <jdelvare@suse.com>
Signed-off-by: Serge Semin <fancer.lancer@gmail.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
[Removed a newline, suggested by Andy. /Peter]
Signed-off-by: Peter Rosin <peda@axentia.se>
---
 drivers/i2c/busses/i2c-i801.c              |  37 +++++++--
 drivers/i2c/muxes/i2c-mux-gpio.c           | 116 ++++++++---------------------
 include/linux/platform_data/i2c-mux-gpio.h |   7 --
 3 files changed, 60 insertions(+), 100 deletions(-)

(limited to 'include')

diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c
index 679c6c41f64b..bf484cd775ec 100644
--- a/drivers/i2c/busses/i2c-i801.c
+++ b/drivers/i2c/busses/i2c-i801.c
@@ -107,7 +107,7 @@
 #include <linux/pm_runtime.h>
 
 #if IS_ENABLED(CONFIG_I2C_MUX_GPIO) && defined CONFIG_DMI
-#include <linux/gpio.h>
+#include <linux/gpio/machine.h>
 #include <linux/platform_data/i2c-mux-gpio.h>
 #endif
 
@@ -274,6 +274,7 @@ struct i801_priv {
 #if IS_ENABLED(CONFIG_I2C_MUX_GPIO) && defined CONFIG_DMI
 	const struct i801_mux_config *mux_drvdata;
 	struct platform_device *mux_pdev;
+	struct gpiod_lookup_table *lookup;
 #endif
 	struct platform_device *tco_pdev;
 
@@ -1258,7 +1259,8 @@ static int i801_add_mux(struct i801_priv *priv)
 	struct device *dev = &priv->adapter.dev;
 	const struct i801_mux_config *mux_config;
 	struct i2c_mux_gpio_platform_data gpio_data;
-	int err;
+	struct gpiod_lookup_table *lookup;
+	int err, i;
 
 	if (!priv->mux_drvdata)
 		return 0;
@@ -1270,17 +1272,36 @@ static int i801_add_mux(struct i801_priv *priv)
 	gpio_data.values = mux_config->values;
 	gpio_data.n_values = mux_config->n_values;
 	gpio_data.classes = mux_config->classes;
-	gpio_data.gpio_chip = mux_config->gpio_chip;
-	gpio_data.gpios = mux_config->gpios;
-	gpio_data.n_gpios = mux_config->n_gpios;
 	gpio_data.idle = I2C_MUX_GPIO_NO_IDLE;
 
-	/* Register the mux device */
+	/* Register GPIO descriptor lookup table */
+	lookup = devm_kzalloc(dev,
+			      struct_size(lookup, table, mux_config->n_gpios),
+			      GFP_KERNEL);
+	if (!lookup)
+		return -ENOMEM;
+	lookup->dev_id = "i2c-mux-gpio";
+	for (i = 0; i < mux_config->n_gpios; i++) {
+		lookup->table[i].chip_label = mux_config->gpio_chip;
+		lookup->table[i].chip_hwnum = mux_config->gpios[i];
+		lookup->table[i].con_id = "mux";
+	}
+	gpiod_add_lookup_table(lookup);
+	priv->lookup = lookup;
+
+	/*
+	 * Register the mux device, we use PLATFORM_DEVID_NONE here
+	 * because since we are referring to the GPIO chip by name we are
+	 * anyways in deep trouble if there is more than one of these
+	 * devices, and there should likely only be one platform controller
+	 * hub.
+	 */
 	priv->mux_pdev = platform_device_register_data(dev, "i2c-mux-gpio",
-				PLATFORM_DEVID_AUTO, &gpio_data,
+				PLATFORM_DEVID_NONE, &gpio_data,
 				sizeof(struct i2c_mux_gpio_platform_data));
 	if (IS_ERR(priv->mux_pdev)) {
 		err = PTR_ERR(priv->mux_pdev);
+		gpiod_remove_lookup_table(lookup);
 		priv->mux_pdev = NULL;
 		dev_err(dev, "Failed to register i2c-mux-gpio device\n");
 		return err;
@@ -1293,6 +1314,8 @@ static void i801_del_mux(struct i801_priv *priv)
 {
 	if (priv->mux_pdev)
 		platform_device_unregister(priv->mux_pdev);
+	if (priv->lookup)
+		gpiod_remove_lookup_table(priv->lookup);
 }
 
 static unsigned int i801_get_adapter_class(struct i801_priv *priv)
diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c
index 13882a2a4f60..fd482feafb19 100644
--- a/drivers/i2c/muxes/i2c-mux-gpio.c
+++ b/drivers/i2c/muxes/i2c-mux-gpio.c
@@ -14,13 +14,14 @@
 #include <linux/platform_device.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/gpio.h>
+#include <linux/bits.h>
+#include <linux/gpio/consumer.h>
+/* FIXME: stop poking around inside gpiolib */
 #include "../../gpio/gpiolib.h"
-#include <linux/of_gpio.h>
 
 struct gpiomux {
 	struct i2c_mux_gpio_platform_data data;
-	unsigned gpio_base;
+	int ngpios;
 	struct gpio_desc **gpios;
 };
 
@@ -30,8 +31,7 @@ static void i2c_mux_gpio_set(const struct gpiomux *mux, unsigned val)
 
 	values[0] = val;
 
-	gpiod_set_array_value_cansleep(mux->data.n_gpios, mux->gpios, NULL,
-				       values);
+	gpiod_set_array_value_cansleep(mux->ngpios, mux->gpios, NULL, values);
 }
 
 static int i2c_mux_gpio_select(struct i2c_mux_core *muxc, u32 chan)
@@ -52,12 +52,6 @@ static int i2c_mux_gpio_deselect(struct i2c_mux_core *muxc, u32 chan)
 	return 0;
 }
 
-static int match_gpio_chip_by_label(struct gpio_chip *chip,
-					      void *data)
-{
-	return !strcmp(chip->label, data);
-}
-
 #ifdef CONFIG_OF
 static int i2c_mux_gpio_probe_dt(struct gpiomux *mux,
 					struct platform_device *pdev)
@@ -65,8 +59,8 @@ static int i2c_mux_gpio_probe_dt(struct gpiomux *mux,
 	struct device_node *np = pdev->dev.of_node;
 	struct device_node *adapter_np, *child;
 	struct i2c_adapter *adapter;
-	unsigned *values, *gpios;
-	int i = 0, ret;
+	unsigned *values;
+	int i = 0;
 
 	if (!np)
 		return -ENODEV;
@@ -103,29 +97,6 @@ static int i2c_mux_gpio_probe_dt(struct gpiomux *mux,
 	if (of_property_read_u32(np, "idle-state", &mux->data.idle))
 		mux->data.idle = I2C_MUX_GPIO_NO_IDLE;
 
-	mux->data.n_gpios = of_gpio_named_count(np, "mux-gpios");
-	if (mux->data.n_gpios < 0) {
-		dev_err(&pdev->dev, "Missing mux-gpios property in the DT.\n");
-		return -EINVAL;
-	}
-
-	gpios = devm_kcalloc(&pdev->dev,
-			     mux->data.n_gpios, sizeof(*mux->data.gpios),
-			     GFP_KERNEL);
-	if (!gpios) {
-		dev_err(&pdev->dev, "Cannot allocate gpios array");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < mux->data.n_gpios; i++) {
-		ret = of_get_named_gpio(np, "mux-gpios", i);
-		if (ret < 0)
-			return ret;
-		gpios[i] = ret;
-	}
-
-	mux->data.gpios = gpios;
-
 	return 0;
 }
 #else
@@ -142,8 +113,8 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 	struct gpiomux *mux;
 	struct i2c_adapter *parent;
 	struct i2c_adapter *root;
-	unsigned initial_state, gpio_base;
-	int i, ret;
+	unsigned initial_state;
+	int i, ngpios, ret;
 
 	mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL);
 	if (!mux)
@@ -158,29 +129,19 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 			sizeof(mux->data));
 	}
 
-	/*
-	 * If a GPIO chip name is provided, the GPIO pin numbers provided are
-	 * relative to its base GPIO number. Otherwise they are absolute.
-	 */
-	if (mux->data.gpio_chip) {
-		struct gpio_chip *gpio;
-
-		gpio = gpiochip_find(mux->data.gpio_chip,
-				     match_gpio_chip_by_label);
-		if (!gpio)
-			return -EPROBE_DEFER;
-
-		gpio_base = gpio->base;
-	} else {
-		gpio_base = 0;
+	ngpios = gpiod_count(&pdev->dev, "mux");
+	if (ngpios <= 0) {
+		dev_err(&pdev->dev, "no valid gpios provided\n");
+		return ngpios ?: -EINVAL;
 	}
+	mux->ngpios = ngpios;
 
 	parent = i2c_get_adapter(mux->data.parent);
 	if (!parent)
 		return -EPROBE_DEFER;
 
 	muxc = i2c_mux_alloc(parent, &pdev->dev, mux->data.n_values,
-			     mux->data.n_gpios * sizeof(*mux->gpios), 0,
+			     ngpios * sizeof(*mux->gpios), 0,
 			     i2c_mux_gpio_select, NULL);
 	if (!muxc) {
 		ret = -ENOMEM;
@@ -194,7 +155,6 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 	root = i2c_root_adapter(&parent->dev);
 
 	muxc->mux_locked = true;
-	mux->gpio_base = gpio_base;
 
 	if (mux->data.idle != I2C_MUX_GPIO_NO_IDLE) {
 		initial_state = mux->data.idle;
@@ -203,34 +163,28 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 		initial_state = mux->data.values[0];
 	}
 
-	for (i = 0; i < mux->data.n_gpios; i++) {
+	for (i = 0; i < ngpios; i++) {
 		struct device *gpio_dev;
-		struct gpio_desc *gpio_desc;
-
-		ret = gpio_request(gpio_base + mux->data.gpios[i], "i2c-mux-gpio");
-		if (ret) {
-			dev_err(&pdev->dev, "Failed to request GPIO %d\n",
-				mux->data.gpios[i]);
-			goto err_request_gpio;
+		struct gpio_desc *gpiod;
+		enum gpiod_flags flag;
+
+		if (initial_state & BIT(i))
+			flag = GPIOD_OUT_HIGH;
+		else
+			flag = GPIOD_OUT_LOW;
+		gpiod = devm_gpiod_get_index(&pdev->dev, "mux", i, flag);
+		if (IS_ERR(gpiod)) {
+			ret = PTR_ERR(gpiod);
+			goto alloc_failed;
 		}
 
-		ret = gpio_direction_output(gpio_base + mux->data.gpios[i],
-					    initial_state & (1 << i));
-		if (ret) {
-			dev_err(&pdev->dev,
-				"Failed to set direction of GPIO %d to output\n",
-				mux->data.gpios[i]);
-			i++;	/* gpio_request above succeeded, so must free */
-			goto err_request_gpio;
-		}
-
-		gpio_desc = gpio_to_desc(gpio_base + mux->data.gpios[i]);
-		mux->gpios[i] = gpio_desc;
+		mux->gpios[i] = gpiod;
 
 		if (!muxc->mux_locked)
 			continue;
 
-		gpio_dev = &gpio_desc->gdev->dev;
+		/* FIXME: find a proper way to access the GPIO device */
+		gpio_dev = &gpiod->gdev->dev;
 		muxc->mux_locked = i2c_root_adapter(gpio_dev) == root;
 	}
 
@@ -253,10 +207,6 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 
 add_adapter_failed:
 	i2c_mux_del_adapters(muxc);
-	i = mux->data.n_gpios;
-err_request_gpio:
-	for (; i > 0; i--)
-		gpio_free(gpio_base + mux->data.gpios[i - 1]);
 alloc_failed:
 	i2c_put_adapter(parent);
 
@@ -266,14 +216,8 @@ alloc_failed:
 static int i2c_mux_gpio_remove(struct platform_device *pdev)
 {
 	struct i2c_mux_core *muxc = platform_get_drvdata(pdev);
-	struct gpiomux *mux = i2c_mux_priv(muxc);
-	int i;
 
 	i2c_mux_del_adapters(muxc);
-
-	for (i = 0; i < mux->data.n_gpios; i++)
-		gpio_free(mux->gpio_base + mux->data.gpios[i]);
-
 	i2c_put_adapter(muxc->parent);
 
 	return 0;
diff --git a/include/linux/platform_data/i2c-mux-gpio.h b/include/linux/platform_data/i2c-mux-gpio.h
index 4406108201fe..28f288eed652 100644
--- a/include/linux/platform_data/i2c-mux-gpio.h
+++ b/include/linux/platform_data/i2c-mux-gpio.h
@@ -22,10 +22,6 @@
  *	position
  * @n_values: Number of multiplexer positions (busses to instantiate)
  * @classes: Optional I2C auto-detection classes
- * @gpio_chip: Optional GPIO chip name; if set, GPIO pin numbers are given
- *	relative to the base GPIO number of that chip
- * @gpios: Array of GPIO numbers used to control MUX
- * @n_gpios: Number of GPIOs used to control MUX
  * @idle: Bitmask to write to MUX when idle or GPIO_I2CMUX_NO_IDLE if not used
  */
 struct i2c_mux_gpio_platform_data {
@@ -34,9 +30,6 @@ struct i2c_mux_gpio_platform_data {
 	const unsigned *values;
 	int n_values;
 	const unsigned *classes;
-	char *gpio_chip;
-	const unsigned *gpios;
-	int n_gpios;
 	unsigned idle;
 };
 
-- 
cgit v1.2.3


From 62de37da9f382455b983f2f92b10012109005278 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Thu, 20 Jun 2019 15:26:29 +0300
Subject: mtd: spi-nor: intel-spi: Convert to use SPDX identifier

This gets rid of the license boilerplate duplicated in each file.

No functional changes intended.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Tudor Ambarus <tudor.ambarus@microchip.com>
---
 drivers/mtd/spi-nor/intel-spi-pci.c      | 5 +----
 drivers/mtd/spi-nor/intel-spi-platform.c | 5 +----
 drivers/mtd/spi-nor/intel-spi.c          | 5 +----
 drivers/mtd/spi-nor/intel-spi.h          | 5 +----
 include/linux/platform_data/intel-spi.h  | 5 +----
 5 files changed, 5 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/spi-nor/intel-spi-pci.c b/drivers/mtd/spi-nor/intel-spi-pci.c
index 578f0c74e536..1b9c2d99ba38 100644
--- a/drivers/mtd/spi-nor/intel-spi-pci.c
+++ b/drivers/mtd/spi-nor/intel-spi-pci.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Intel PCH/PCU SPI flash PCI driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/ioport.h>
diff --git a/drivers/mtd/spi-nor/intel-spi-platform.c b/drivers/mtd/spi-nor/intel-spi-platform.c
index 5c943df9398f..25b18804e9bb 100644
--- a/drivers/mtd/spi-nor/intel-spi-platform.c
+++ b/drivers/mtd/spi-nor/intel-spi-platform.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Intel PCH/PCU SPI flash platform driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/ioport.h>
diff --git a/drivers/mtd/spi-nor/intel-spi.c b/drivers/mtd/spi-nor/intel-spi.c
index d60cbf23d9aa..021cef930f9f 100644
--- a/drivers/mtd/spi-nor/intel-spi.c
+++ b/drivers/mtd/spi-nor/intel-spi.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Intel PCH/PCU SPI flash driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/err.h>
diff --git a/drivers/mtd/spi-nor/intel-spi.h b/drivers/mtd/spi-nor/intel-spi.h
index 5ab7dc250050..b03bf296fda3 100644
--- a/drivers/mtd/spi-nor/intel-spi.h
+++ b/drivers/mtd/spi-nor/intel-spi.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Intel PCH/PCU SPI flash driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #ifndef INTEL_SPI_H
diff --git a/include/linux/platform_data/intel-spi.h b/include/linux/platform_data/intel-spi.h
index 942b0c3f8f08..001f377fb5ef 100644
--- a/include/linux/platform_data/intel-spi.h
+++ b/include/linux/platform_data/intel-spi.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Intel PCH/PCU SPI flash driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #ifndef INTEL_SPI_PDATA_H
-- 
cgit v1.2.3


From d7cd0e053b17dfa0dd4669dfb388c100be272823 Mon Sep 17 00:00:00 2001
From: Nikola Cornij <nikola.cornij@amd.com>
Date: Mon, 15 Apr 2019 17:31:44 -0400
Subject: drm/amd/display: Add 170Mpix/sec DSC throughput support

[why]
It was missing, although defined in DP spec

[how]
- Add handling of this value to DSC code
- Also remove unused file dsc_helpers.c

Signed-off-by: Nikola Cornij <nikola.cornij@amd.com>
Reviewed-by: Joshua Aberback <Joshua.Aberback@amd.com>
Acked-by: Bhawanpreet Lakha <Bhawanpreet.Lakha@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c      |  10 +-
 drivers/gpu/drm/amd/display/dc/dsc/dsc_helpers.c | 243 -----------------------
 include/drm/drm_dp_helper.h                      |   4 +
 3 files changed, 12 insertions(+), 245 deletions(-)
 delete mode 100644 drivers/gpu/drm/amd/display/dc/dsc/dsc_helpers.c

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c b/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c
index d58d718171b5..f09f23707a94 100644
--- a/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c
+++ b/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c
@@ -74,6 +74,12 @@ static bool dsc_line_buff_depth_from_dpcd(int dpcd_line_buff_bit_depth, int *lin
 static bool dsc_throughput_from_dpcd(int dpcd_throughput, int *throughput)
 {
 	switch (dpcd_throughput) {
+	case DP_DSC_THROUGHPUT_MODE_0_UPSUPPORTED:
+		*throughput = 0;
+		break;
+	case DP_DSC_THROUGHPUT_MODE_0_170:
+		*throughput = 170;
+		break;
 	case DP_DSC_THROUGHPUT_MODE_0_340:
 		*throughput = 340;
 		break;
@@ -170,7 +176,7 @@ static void get_dsc_enc_caps(
 /* Returns 'false' if no intersection was found for at least one capablity.
  * It also implicitly validates some sink caps against invalid value of zero.
  */
-static bool dc_intersect_dsc_caps(
+static bool intersect_dsc_caps(
 	const struct dsc_dec_dpcd_caps *dsc_sink_caps,
 	const struct dsc_enc_caps *dsc_enc_caps,
 	enum dc_pixel_encoding pixel_encoding,
@@ -537,7 +543,7 @@ static bool setup_dsc_config(
 		goto done;
 
 	// Intersect decoder with encoder DSC caps and validate DSC settings
-	is_dsc_possible = dc_intersect_dsc_caps(dsc_sink_caps, dsc_enc_caps, timing->pixel_encoding, &dsc_common_caps);
+	is_dsc_possible = intersect_dsc_caps(dsc_sink_caps, dsc_enc_caps, timing->pixel_encoding, &dsc_common_caps);
 	if (!is_dsc_possible)
 		goto done;
 
diff --git a/drivers/gpu/drm/amd/display/dc/dsc/dsc_helpers.c b/drivers/gpu/drm/amd/display/dc/dsc/dsc_helpers.c
deleted file mode 100644
index 0ecd5065d120..000000000000
--- a/drivers/gpu/drm/amd/display/dc/dsc/dsc_helpers.c
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Copyright 2018 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors: AMD
- *
- */
-
-#ifdef CONFIG_DRM_AMD_DC_DSC_SUPPORT
-
-#include "dc.h"
-#include "dsc.h"
-#include "dc_hw_types.h"
-#include <drm/drm_dp_helper.h>
-
-#define DC_LOGGER \
-	dsc->ctx->logger
-
-static bool dsc_buff_block_size_from_dpcd(int dpcd_buff_block_size, int *buff_block_size);
-static bool dsc_line_buff_depth_from_dpcd(int dpcd_line_buff_bit_depth, int *line_buff_bit_depth);
-static bool dsc_throughput_from_dpcd(int dpcd_throughput, int *throughput);
-static bool dsc_bpp_increment_div_from_dpcd(int bpp_increment_dpcd, uint32_t *bpp_increment_div);
-
-void dsc_optc_config_log(struct display_stream_compressor *dsc,
-		struct dsc_optc_config *config)
-{
-	DC_LOG_DSC("Setting optc DSC config at DSC inst %d", dsc->inst);
-	DC_LOG_DSC("\n\tbytes_per_pixel %d\n\tis_pixel_format_444 %d\n\tslice_width %d",
-			config->bytes_per_pixel,
-			config->is_pixel_format_444, config->slice_width);
-}
-
-void dsc_config_log(struct display_stream_compressor *dsc,
-		const struct dsc_config *config)
-{
-	DC_LOG_DSC("Setting DSC Config at DSC inst %d", dsc->inst);
-	DC_LOG_DSC("\n\tnum_slices_h %d\n\tnum_slices_v %d\n\tbits_per_pixel %d\n\tcolor_depth %d",
-		config->dc_dsc_cfg.num_slices_h,
-		config->dc_dsc_cfg.num_slices_v,
-		config->dc_dsc_cfg.bits_per_pixel,
-		config->color_depth);
-}
-
-
-bool dsc_parse_dsc_dpcd(const uint8_t *dpcd_dsc_data, struct dsc_dec_dpcd_caps *dsc_sink_caps)
-{
-	dsc_sink_caps->is_dsc_supported = (dpcd_dsc_data[DP_DSC_SUPPORT - DP_DSC_SUPPORT] & DP_DSC_DECOMPRESSION_IS_SUPPORTED) != 0;
-	if (!dsc_sink_caps->is_dsc_supported)
-		return true;
-
-	dsc_sink_caps->dsc_version = dpcd_dsc_data[DP_DSC_REV - DP_DSC_SUPPORT];
-
-	{
-		int buff_block_size;
-		int buff_size;
-
-		if (!dsc_buff_block_size_from_dpcd(dpcd_dsc_data[DP_DSC_RC_BUF_BLK_SIZE - DP_DSC_SUPPORT], &buff_block_size))
-			return false;
-
-		buff_size = dpcd_dsc_data[DP_DSC_RC_BUF_SIZE - DP_DSC_SUPPORT] + 1;
-		dsc_sink_caps->rc_buffer_size = buff_size * buff_block_size;
-	}
-
-	dsc_sink_caps->slice_caps1.raw = dpcd_dsc_data[DP_DSC_SLICE_CAP_1 - DP_DSC_SUPPORT];
-	if (!dsc_line_buff_depth_from_dpcd(dpcd_dsc_data[DP_DSC_LINE_BUF_BIT_DEPTH - DP_DSC_SUPPORT], &dsc_sink_caps->lb_bit_depth))
-		return false;
-
-	dsc_sink_caps->is_block_pred_supported =
-		(dpcd_dsc_data[DP_DSC_BLK_PREDICTION_SUPPORT - DP_DSC_SUPPORT] & DP_DSC_BLK_PREDICTION_IS_SUPPORTED) != 0;
-
-	dsc_sink_caps->edp_max_bits_per_pixel =
-		dpcd_dsc_data[DP_DSC_MAX_BITS_PER_PIXEL_LOW - DP_DSC_SUPPORT] |
-		dpcd_dsc_data[DP_DSC_MAX_BITS_PER_PIXEL_HI - DP_DSC_SUPPORT] << 8;
-
-	dsc_sink_caps->color_formats.raw = dpcd_dsc_data[DP_DSC_DEC_COLOR_FORMAT_CAP - DP_DSC_SUPPORT];
-	dsc_sink_caps->color_depth.raw = dpcd_dsc_data[DP_DSC_DEC_COLOR_DEPTH_CAP - DP_DSC_SUPPORT];
-
-	{
-		int dpcd_throughput = dpcd_dsc_data[DP_DSC_PEAK_THROUGHPUT - DP_DSC_SUPPORT];
-
-		if (!dsc_throughput_from_dpcd(dpcd_throughput & DP_DSC_THROUGHPUT_MODE_0_MASK, &dsc_sink_caps->throughput_mode_0_mps))
-			return false;
-
-		dpcd_throughput = (dpcd_throughput & DP_DSC_THROUGHPUT_MODE_1_MASK) >> DP_DSC_THROUGHPUT_MODE_1_SHIFT;
-		if (!dsc_throughput_from_dpcd(dpcd_throughput, &dsc_sink_caps->throughput_mode_1_mps))
-			return false;
-	}
-
-	dsc_sink_caps->max_slice_width = dpcd_dsc_data[DP_DSC_MAX_SLICE_WIDTH - DP_DSC_SUPPORT] * 320;
-	dsc_sink_caps->slice_caps2.raw = dpcd_dsc_data[DP_DSC_SLICE_CAP_2 - DP_DSC_SUPPORT];
-
-	if (!dsc_bpp_increment_div_from_dpcd(dpcd_dsc_data[DP_DSC_BITS_PER_PIXEL_INC - DP_DSC_SUPPORT], &dsc_sink_caps->bpp_increment_div))
-		return false;
-
-	return true;
-}
-
-
-/* This module's internal functions */
-
-static bool dsc_buff_block_size_from_dpcd(int dpcd_buff_block_size, int *buff_block_size)
-{
-
-	switch (dpcd_buff_block_size) {
-	case DP_DSC_RC_BUF_BLK_SIZE_1:
-		*buff_block_size = 1024;
-		break;
-	case DP_DSC_RC_BUF_BLK_SIZE_4:
-		*buff_block_size = 4 * 1024;
-		break;
-	case DP_DSC_RC_BUF_BLK_SIZE_16:
-		*buff_block_size = 16 * 1024;
-		break;
-	case DP_DSC_RC_BUF_BLK_SIZE_64:
-		*buff_block_size = 64 * 1024;
-		break;
-	default: {
-			dm_error("%s: DPCD DSC buffer size not recoginzed.\n", __func__);
-			return false;
-		}
-	}
-
-	return true;
-}
-
-
-static bool dsc_line_buff_depth_from_dpcd(int dpcd_line_buff_bit_depth, int *line_buff_bit_depth)
-{
-	if (0 <= dpcd_line_buff_bit_depth && dpcd_line_buff_bit_depth <= 7)
-		*line_buff_bit_depth = dpcd_line_buff_bit_depth + 9;
-	else if (dpcd_line_buff_bit_depth == 8)
-		*line_buff_bit_depth = 8;
-	else {
-		dm_error("%s: DPCD DSC buffer depth not recoginzed.\n", __func__);
-		return false;
-	}
-
-	return true;
-}
-
-
-static bool dsc_throughput_from_dpcd(int dpcd_throughput, int *throughput)
-{
-	switch (dpcd_throughput) {
-	case DP_DSC_THROUGHPUT_MODE_0_340:
-		*throughput = 340;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_400:
-		*throughput = 400;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_450:
-		*throughput = 450;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_500:
-		*throughput = 500;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_550:
-		*throughput = 550;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_600:
-		*throughput = 600;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_650:
-		*throughput = 650;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_700:
-		*throughput = 700;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_750:
-		*throughput = 750;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_800:
-		*throughput = 800;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_850:
-		*throughput = 850;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_900:
-		*throughput = 900;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_950:
-		*throughput = 950;
-		break;
-	case DP_DSC_THROUGHPUT_MODE_0_1000:
-		*throughput = 1000;
-		break;
-	default: {
-			dm_error("%s: DPCD DSC througput mode not recoginzed.\n", __func__);
-			return false;
-		}
-	}
-
-	return true;
-}
-
-
-static bool dsc_bpp_increment_div_from_dpcd(int bpp_increment_dpcd, uint32_t *bpp_increment_div)
-{
-
-	switch (bpp_increment_dpcd) {
-	case 0:
-		*bpp_increment_div = 16;
-		break;
-	case 1:
-		*bpp_increment_div = 8;
-		break;
-	case 2:
-		*bpp_increment_div = 4;
-		break;
-	case 3:
-		*bpp_increment_div = 2;
-		break;
-	case 4:
-		*bpp_increment_div = 1;
-		break;
-	default: {
-		dm_error("%s: DPCD DSC bits-per-pixel increment not recoginzed.\n", __func__);
-		return false;
-	}
-	}
-
-	return true;
-}
-
-
-#endif // CONFIG_DRM_AMD_DC_DSC_SUPPORT
diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
index 3fc534ee8174..c0c947a42a59 100644
--- a/include/drm/drm_dp_helper.h
+++ b/include/drm/drm_dp_helper.h
@@ -249,6 +249,7 @@
 #define DP_DSC_PEAK_THROUGHPUT              0x06B
 # define DP_DSC_THROUGHPUT_MODE_0_MASK      (0xf << 0)
 # define DP_DSC_THROUGHPUT_MODE_0_SHIFT     0
+# define DP_DSC_THROUGHPUT_MODE_0_UPSUPPORTED 0
 # define DP_DSC_THROUGHPUT_MODE_0_340       (1 << 0)
 # define DP_DSC_THROUGHPUT_MODE_0_400       (2 << 0)
 # define DP_DSC_THROUGHPUT_MODE_0_450       (3 << 0)
@@ -263,8 +264,10 @@
 # define DP_DSC_THROUGHPUT_MODE_0_900       (12 << 0)
 # define DP_DSC_THROUGHPUT_MODE_0_950       (13 << 0)
 # define DP_DSC_THROUGHPUT_MODE_0_1000      (14 << 0)
+# define DP_DSC_THROUGHPUT_MODE_0_170       (15 << 4)
 # define DP_DSC_THROUGHPUT_MODE_1_MASK      (0xf << 4)
 # define DP_DSC_THROUGHPUT_MODE_1_SHIFT     4
+# define DP_DSC_THROUGHPUT_MODE_1_UPSUPPORTED 0
 # define DP_DSC_THROUGHPUT_MODE_1_340       (1 << 4)
 # define DP_DSC_THROUGHPUT_MODE_1_400       (2 << 4)
 # define DP_DSC_THROUGHPUT_MODE_1_450       (3 << 4)
@@ -279,6 +282,7 @@
 # define DP_DSC_THROUGHPUT_MODE_1_900       (12 << 4)
 # define DP_DSC_THROUGHPUT_MODE_1_950       (13 << 4)
 # define DP_DSC_THROUGHPUT_MODE_1_1000      (14 << 4)
+# define DP_DSC_THROUGHPUT_MODE_1_170       (15 << 4)
 
 #define DP_DSC_MAX_SLICE_WIDTH              0x06C
 #define DP_DSC_MIN_SLICE_WIDTH_VALUE        2560
-- 
cgit v1.2.3


From f446489adcbc9c4833ae724a985166731c577bcd Mon Sep 17 00:00:00 2001
From: Nikola Cornij <nikola.cornij@amd.com>
Date: Wed, 17 Apr 2019 19:07:08 -0400
Subject: drm/amd/display: Add support for extended DSC DPCD caps

[why]
A few of the new DSC DPCD caps were introduced by a DP 1.4a SCR in order
to give DSC branch decoders a chance to expose their maximum throughput
and maximum line width limitations.

Signed-off-by: Nikola Cornij <nikola.cornij@amd.com>
Reviewed-by: Wenjing Liu <Wenjing.Liu@amd.com>
Acked-by: Bhawanpreet Lakha <Bhawanpreet.Lakha@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c   |  50 +++---
 drivers/gpu/drm/amd/display/dc/dc_dsc.h            |   3 +-
 drivers/gpu/drm/amd/display/dc/dc_types.h          |   5 +
 drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c        | 118 ++++++++++-----
 drivers/gpu/drm/amd/display/include/dpcd_structs.h | 168 +++++++++++++++++++++
 include/drm/drm_dp_helper.h                        |   5 +
 6 files changed, 289 insertions(+), 60 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/display/include/dpcd_structs.h

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
index effc36745671..017f88c9f2e4 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
@@ -2383,8 +2383,8 @@ static bool retrieve_link_cap(struct dc_link *link)
 	int i;
 	struct dp_sink_hw_fw_revision dp_hw_fw_revision;
 #ifdef CONFIG_DRM_AMD_DC_DSC_SUPPORT
-	uint8_t dsc_data[16];
-	struct dsc_dec_dpcd_caps *dsc_caps;
+	uint8_t dsc_data[16]; /* DP_DSC_BITS_PER_PIXEL_INC - DP_DSC_SUPPORT + 1 == 16 */
+	struct dsc_dec_dpcd_caps *dsc_dec_caps;
 #endif
 
 	memset(dpcd_data, '\0', sizeof(dpcd_data));
@@ -2558,8 +2558,8 @@ static bool retrieve_link_cap(struct dc_link *link)
 		sizeof(dp_hw_fw_revision.ieee_fw_rev));
 
 #ifdef CONFIG_DRM_AMD_DC_DSC_SUPPORT
-	dsc_caps = &link->dpcd_caps.dsc_sink_caps;
-	memset(dsc_caps, '\0', sizeof(*dsc_caps));
+	dsc_dec_caps = &link->dpcd_caps.dsc_sink_caps;
+	memset(dsc_dec_caps, '\0', sizeof(*dsc_dec_caps));
 	memset(&link->dpcd_caps.dsc_sink_caps, '\0',
 			sizeof(link->dpcd_caps.dsc_sink_caps));
 	memset(&link->dpcd_caps.fec_cap, '\0', sizeof(link->dpcd_caps.fec_cap));
@@ -2571,7 +2571,7 @@ static bool retrieve_link_cap(struct dc_link *link)
 				dsc_data,
 				sizeof(dsc_data));
 		if (status == DC_OK) {
-			DC_LOG_DSC("DSC capability read at link %d:",
+			DC_LOG_DSC("DSC DPCD capability read at link %d:",
 					link->link_index);
 			DC_LOG_DSC("\t%02x %02x %02x %02x",
 					dsc_data[0], dsc_data[1],
@@ -2590,37 +2590,43 @@ static bool retrieve_link_cap(struct dc_link *link)
 			return false;
 		}
 
-		if (dc_dsc_parse_dsc_dpcd(dsc_data,
-				dsc_caps)) {
-			DC_LOG_DSC("DSC capability parsed at link %d:",
+		if (dc_dsc_parse_dsc_dpcd(dsc_data, NULL,
+				dsc_dec_caps)) {
+			DC_LOG_DSC("DSC DPCD capabilities parsed at link %d:",
 					link->link_index);
 			DC_LOG_DSC("\tis_dsc_supported:\t%d",
-					dsc_caps->is_dsc_supported);
-			DC_LOG_DSC("\tdsc_version:\t%d", dsc_caps->dsc_version);
+					dsc_dec_caps->is_dsc_supported);
+			DC_LOG_DSC("\tdsc_version:\t%d", dsc_dec_caps->dsc_version);
 			DC_LOG_DSC("\trc_buffer_size:\t%d",
-					dsc_caps->rc_buffer_size);
+					dsc_dec_caps->rc_buffer_size);
 			DC_LOG_DSC("\tslice_caps1:\t0x%x20",
-					dsc_caps->slice_caps1.raw);
+					dsc_dec_caps->slice_caps1.raw);
 			DC_LOG_DSC("\tslice_caps2:\t0x%x20",
-					dsc_caps->slice_caps2.raw);
+					dsc_dec_caps->slice_caps2.raw);
 			DC_LOG_DSC("\tlb_bit_depth:\t%d",
-					dsc_caps->lb_bit_depth);
+					dsc_dec_caps->lb_bit_depth);
 			DC_LOG_DSC("\tis_block_pred_supported:\t%d",
-					dsc_caps->is_block_pred_supported);
+					dsc_dec_caps->is_block_pred_supported);
 			DC_LOG_DSC("\tedp_max_bits_per_pixel:\t%d",
-					dsc_caps->edp_max_bits_per_pixel);
+					dsc_dec_caps->edp_max_bits_per_pixel);
 			DC_LOG_DSC("\tcolor_formats:\t%d",
-					dsc_caps->color_formats.raw);
+					dsc_dec_caps->color_formats.raw);
 			DC_LOG_DSC("\tcolor_depth:\t%d",
-					dsc_caps->color_depth.raw);
+					dsc_dec_caps->color_depth.raw);
 			DC_LOG_DSC("\tthroughput_mode_0_mps:\t%d",
-					dsc_caps->throughput_mode_0_mps);
+					dsc_dec_caps->throughput_mode_0_mps);
 			DC_LOG_DSC("\tthroughput_mode_1_mps:\t%d",
-					dsc_caps->throughput_mode_1_mps);
+					dsc_dec_caps->throughput_mode_1_mps);
 			DC_LOG_DSC("\tmax_slice_width:\t%d",
-					dsc_caps->max_slice_width);
+					dsc_dec_caps->max_slice_width);
 			DC_LOG_DSC("\tbpp_increment_div:\t%d",
-					dsc_caps->bpp_increment_div);
+					dsc_dec_caps->bpp_increment_div);
+			DC_LOG_DSC("\tbranch_overall_throughput_0_mps:\t%d",
+					dsc_dec_caps->branch_overall_throughput_0_mps);
+			DC_LOG_DSC("\tbranch_overall_throughput_1_mps:\t%d",
+					dsc_dec_caps->branch_overall_throughput_1_mps);
+			DC_LOG_DSC("\tbranch_max_line_width:\t%d",
+					dsc_dec_caps->branch_max_line_width);
 		} else {
 			/* Some sinks return bogus DSC DPCD data
 			 * when they don't support DSC.
diff --git a/drivers/gpu/drm/amd/display/dc/dc_dsc.h b/drivers/gpu/drm/amd/display/dc/dc_dsc.h
index be0f7b09086a..6de3bc9162ea 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_dsc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc_dsc.h
@@ -34,7 +34,8 @@ struct dc_dsc_bw_range {
 };
 
 
-bool dc_dsc_parse_dsc_dpcd(const uint8_t *dpcd_dsc_data,
+bool dc_dsc_parse_dsc_dpcd(const uint8_t *dpcd_dsc_basic_data,
+		const uint8_t *dpcd_dsc_ext_data,
 		struct dsc_dec_dpcd_caps *dsc_sink_caps);
 
 bool dc_dsc_compute_bandwidth_range(
diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h
index b7e2c6f767aa..5984be3cdf0c 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_types.h
+++ b/drivers/gpu/drm/amd/display/dc/dc_types.h
@@ -773,6 +773,11 @@ struct dsc_dec_dpcd_caps {
 	int32_t throughput_mode_1_mps; /* In MPs */
 	int32_t max_slice_width;
 	uint32_t bpp_increment_div; /* bpp increment divisor, e.g. if 16, it's 1/16th of a bit */
+
+	/* Extended DSC caps */
+	uint32_t branch_overall_throughput_0_mps; /* In MPs */
+	uint32_t branch_overall_throughput_1_mps; /* In MPs */
+	uint32_t branch_max_line_width;
 };
 #endif
 #endif /* DC_TYPES_H_ */
diff --git a/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c b/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c
index f09f23707a94..94a623dc37f4 100644
--- a/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c
+++ b/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c
@@ -252,7 +252,7 @@ struct dc_dsc_policy {
 	int min_target_bpp; // Minimum target bits per pixel
 };
 
-static inline uint32_t dsc_round_up(uint32_t value)
+static inline uint32_t dsc_div_by_10_round_up(uint32_t value)
 {
 	return (value + 9) / 10;
 }
@@ -304,7 +304,7 @@ static void get_dsc_bandwidth_range(
 	range->stream_kbps = dc_bandwidth_in_kbps_from_timing(timing);
 
 	/* max dsc target bpp */
-	range->max_kbps = dsc_round_up(max_bpp * timing->pix_clk_100hz);
+	range->max_kbps = dsc_div_by_10_round_up(max_bpp * timing->pix_clk_100hz);
 	range->max_target_bpp_x16 = max_bpp * 16;
 	if (range->max_kbps > range->stream_kbps) {
 		/* max dsc target bpp is capped to native bandwidth */
@@ -313,7 +313,7 @@ static void get_dsc_bandwidth_range(
 	}
 
 	/* min dsc target bpp */
-	range->min_kbps = dsc_round_up(min_bpp * timing->pix_clk_100hz);
+	range->min_kbps = dsc_div_by_10_round_up(min_bpp * timing->pix_clk_100hz);
 	range->min_target_bpp_x16 = min_bpp * 16;
 	if (range->min_kbps > range->max_kbps) {
 		/* min dsc target bpp is capped to max dsc bandwidth*/
@@ -532,16 +532,23 @@ static bool setup_dsc_config(
 	int pic_width;
 	int slice_width;
 	int target_bpp;
-	int sink_per_slice_throughput;
+	int sink_per_slice_throughput_mps;
+	int branch_max_throughput_mps = 0;
 	bool is_dsc_possible = false;
 	int num_slices_v;
 	int pic_height;
 
 	memset(dsc_cfg, 0, sizeof(struct dc_dsc_config));
 
+	pic_width = timing->h_addressable + timing->h_border_left + timing->h_border_right;
+	pic_height = timing->v_addressable + timing->v_border_top + timing->v_border_bottom;
+
 	if (!dsc_sink_caps->is_dsc_supported)
 		goto done;
 
+	if (dsc_sink_caps->branch_max_line_width && dsc_sink_caps->branch_max_line_width < pic_width)
+		goto done;
+
 	// Intersect decoder with encoder DSC caps and validate DSC settings
 	is_dsc_possible = intersect_dsc_caps(dsc_sink_caps, dsc_enc_caps, timing->pixel_encoding, &dsc_common_caps);
 	if (!is_dsc_possible)
@@ -554,39 +561,46 @@ static bool setup_dsc_config(
 	if (!is_dsc_possible)
 		goto done;
 
-	sink_per_slice_throughput = 0;
+	sink_per_slice_throughput_mps = 0;
 
 	// Validate available DSC settings against the mode timing
 
-	// Color format
+	// Validate color format (and pick up the throughput values)
 	dsc_cfg->ycbcr422_simple = false;
 	switch (timing->pixel_encoding)	{
 	case PIXEL_ENCODING_RGB:
 		is_dsc_possible = (bool)dsc_common_caps.color_formats.bits.RGB;
-		sink_per_slice_throughput = dsc_sink_caps->throughput_mode_0_mps;
+		sink_per_slice_throughput_mps = dsc_sink_caps->throughput_mode_0_mps;
+		branch_max_throughput_mps = dsc_sink_caps->branch_overall_throughput_0_mps;
 		break;
 	case PIXEL_ENCODING_YCBCR444:
 		is_dsc_possible = (bool)dsc_common_caps.color_formats.bits.YCBCR_444;
-		sink_per_slice_throughput = dsc_sink_caps->throughput_mode_0_mps;
-		break;
-	case PIXEL_ENCODING_YCBCR422: {
-			is_dsc_possible = (bool)dsc_common_caps.color_formats.bits.YCBCR_NATIVE_422;
-			sink_per_slice_throughput = dsc_sink_caps->throughput_mode_1_mps;
-			if (!is_dsc_possible) {
-				is_dsc_possible = (bool)dsc_common_caps.color_formats.bits.YCBCR_SIMPLE_422;
-				dsc_cfg->ycbcr422_simple = is_dsc_possible;
-				sink_per_slice_throughput = dsc_sink_caps->throughput_mode_0_mps;
-			}
+		sink_per_slice_throughput_mps = dsc_sink_caps->throughput_mode_0_mps;
+		branch_max_throughput_mps = dsc_sink_caps->branch_overall_throughput_0_mps;
+		break;
+	case PIXEL_ENCODING_YCBCR422:
+		is_dsc_possible = (bool)dsc_common_caps.color_formats.bits.YCBCR_NATIVE_422;
+		sink_per_slice_throughput_mps = dsc_sink_caps->throughput_mode_1_mps;
+		branch_max_throughput_mps = dsc_sink_caps->branch_overall_throughput_1_mps;
+		if (!is_dsc_possible) {
+			is_dsc_possible = (bool)dsc_common_caps.color_formats.bits.YCBCR_SIMPLE_422;
+			dsc_cfg->ycbcr422_simple = is_dsc_possible;
+			sink_per_slice_throughput_mps = dsc_sink_caps->throughput_mode_0_mps;
 		}
 		break;
 	case PIXEL_ENCODING_YCBCR420:
 		is_dsc_possible = (bool)dsc_common_caps.color_formats.bits.YCBCR_NATIVE_420;
-		sink_per_slice_throughput = dsc_sink_caps->throughput_mode_1_mps;
+		sink_per_slice_throughput_mps = dsc_sink_caps->throughput_mode_1_mps;
+		branch_max_throughput_mps = dsc_sink_caps->branch_overall_throughput_1_mps;
 		break;
 	default:
 		is_dsc_possible = false;
 	}
 
+	// Validate branch's maximum throughput
+	if (branch_max_throughput_mps && dsc_div_by_10_round_up(timing->pix_clk_100hz) > branch_max_throughput_mps * 1000)
+		is_dsc_possible = false;
+
 	if (!is_dsc_possible)
 		goto done;
 
@@ -611,7 +625,6 @@ static bool setup_dsc_config(
 	// DSC slicing
 	max_slices_h = get_max_dsc_slices(dsc_common_caps.slice_caps);
 
-	pic_width = timing->h_addressable + timing->h_border_left + timing->h_border_right;
 	while (max_slices_h > 0) {
 		if (pic_width % max_slices_h == 0)
 			break;
@@ -630,7 +643,8 @@ static bool setup_dsc_config(
 	min_slices_h = fit_num_slices_up(dsc_common_caps.slice_caps, min_slices_h);
 
 	while (min_slices_h <= max_slices_h) {
-		if (dsc_round_up(timing->pix_clk_100hz) / (min_slices_h) <= sink_per_slice_throughput * 1000)
+		int pix_clk_per_slice_khz = dsc_div_by_10_round_up(timing->pix_clk_100hz) / min_slices_h;
+		if (pix_clk_per_slice_khz <= sink_per_slice_throughput_mps * 1000)
 			break;
 
 		min_slices_h = inc_num_slices(dsc_common_caps.slice_caps, min_slices_h);
@@ -673,7 +687,6 @@ static bool setup_dsc_config(
 
 	// Vertical number of slices: start from policy and pick the first one that height is divisible by.
 	// For 4:2:0 make sure the slice height is divisible by 2 as well.
-	pic_height = timing->v_addressable + timing->v_border_top + timing->v_border_bottom;
 	num_slices_v = dsc_policy.num_slices_v;
 	if (num_slices_v < 1)
 		num_slices_v = 1;
@@ -710,41 +723,41 @@ done:
 	return is_dsc_possible;
 }
 
-bool dc_dsc_parse_dsc_dpcd(const uint8_t *dpcd_dsc_data, struct dsc_dec_dpcd_caps *dsc_sink_caps)
+bool dc_dsc_parse_dsc_dpcd(const uint8_t *dpcd_dsc_basic_data, const uint8_t *dpcd_dsc_ext_data, struct dsc_dec_dpcd_caps *dsc_sink_caps)
 {
-	dsc_sink_caps->is_dsc_supported = (dpcd_dsc_data[DP_DSC_SUPPORT - DP_DSC_SUPPORT] & DP_DSC_DECOMPRESSION_IS_SUPPORTED) != 0;
+	dsc_sink_caps->is_dsc_supported = (dpcd_dsc_basic_data[DP_DSC_SUPPORT - DP_DSC_SUPPORT] & DP_DSC_DECOMPRESSION_IS_SUPPORTED) != 0;
 	if (!dsc_sink_caps->is_dsc_supported)
 		return true;
 
-	dsc_sink_caps->dsc_version = dpcd_dsc_data[DP_DSC_REV - DP_DSC_SUPPORT];
+	dsc_sink_caps->dsc_version = dpcd_dsc_basic_data[DP_DSC_REV - DP_DSC_SUPPORT];
 
 	{
 		int buff_block_size;
 		int buff_size;
 
-		if (!dsc_buff_block_size_from_dpcd(dpcd_dsc_data[DP_DSC_RC_BUF_BLK_SIZE - DP_DSC_SUPPORT], &buff_block_size))
+		if (!dsc_buff_block_size_from_dpcd(dpcd_dsc_basic_data[DP_DSC_RC_BUF_BLK_SIZE - DP_DSC_SUPPORT], &buff_block_size))
 			return false;
 
-		buff_size = dpcd_dsc_data[DP_DSC_RC_BUF_SIZE - DP_DSC_SUPPORT] + 1;
+		buff_size = dpcd_dsc_basic_data[DP_DSC_RC_BUF_SIZE - DP_DSC_SUPPORT] + 1;
 		dsc_sink_caps->rc_buffer_size = buff_size * buff_block_size;
 	}
 
-	dsc_sink_caps->slice_caps1.raw = dpcd_dsc_data[DP_DSC_SLICE_CAP_1 - DP_DSC_SUPPORT];
-	if (!dsc_line_buff_depth_from_dpcd(dpcd_dsc_data[DP_DSC_LINE_BUF_BIT_DEPTH - DP_DSC_SUPPORT], &dsc_sink_caps->lb_bit_depth))
+	dsc_sink_caps->slice_caps1.raw = dpcd_dsc_basic_data[DP_DSC_SLICE_CAP_1 - DP_DSC_SUPPORT];
+	if (!dsc_line_buff_depth_from_dpcd(dpcd_dsc_basic_data[DP_DSC_LINE_BUF_BIT_DEPTH - DP_DSC_SUPPORT], &dsc_sink_caps->lb_bit_depth))
 		return false;
 
 	dsc_sink_caps->is_block_pred_supported =
-		(dpcd_dsc_data[DP_DSC_BLK_PREDICTION_SUPPORT - DP_DSC_SUPPORT] & DP_DSC_BLK_PREDICTION_IS_SUPPORTED) != 0;
+		(dpcd_dsc_basic_data[DP_DSC_BLK_PREDICTION_SUPPORT - DP_DSC_SUPPORT] & DP_DSC_BLK_PREDICTION_IS_SUPPORTED) != 0;
 
 	dsc_sink_caps->edp_max_bits_per_pixel =
-		dpcd_dsc_data[DP_DSC_MAX_BITS_PER_PIXEL_LOW - DP_DSC_SUPPORT] |
-		dpcd_dsc_data[DP_DSC_MAX_BITS_PER_PIXEL_HI - DP_DSC_SUPPORT] << 8;
+		dpcd_dsc_basic_data[DP_DSC_MAX_BITS_PER_PIXEL_LOW - DP_DSC_SUPPORT] |
+		dpcd_dsc_basic_data[DP_DSC_MAX_BITS_PER_PIXEL_HI - DP_DSC_SUPPORT] << 8;
 
-	dsc_sink_caps->color_formats.raw = dpcd_dsc_data[DP_DSC_DEC_COLOR_FORMAT_CAP - DP_DSC_SUPPORT];
-	dsc_sink_caps->color_depth.raw = dpcd_dsc_data[DP_DSC_DEC_COLOR_DEPTH_CAP - DP_DSC_SUPPORT];
+	dsc_sink_caps->color_formats.raw = dpcd_dsc_basic_data[DP_DSC_DEC_COLOR_FORMAT_CAP - DP_DSC_SUPPORT];
+	dsc_sink_caps->color_depth.raw = dpcd_dsc_basic_data[DP_DSC_DEC_COLOR_DEPTH_CAP - DP_DSC_SUPPORT];
 
 	{
-		int dpcd_throughput = dpcd_dsc_data[DP_DSC_PEAK_THROUGHPUT - DP_DSC_SUPPORT];
+		int dpcd_throughput = dpcd_dsc_basic_data[DP_DSC_PEAK_THROUGHPUT - DP_DSC_SUPPORT];
 
 		if (!dsc_throughput_from_dpcd(dpcd_throughput & DP_DSC_THROUGHPUT_MODE_0_MASK, &dsc_sink_caps->throughput_mode_0_mps))
 			return false;
@@ -754,12 +767,43 @@ bool dc_dsc_parse_dsc_dpcd(const uint8_t *dpcd_dsc_data, struct dsc_dec_dpcd_cap
 			return false;
 	}
 
-	dsc_sink_caps->max_slice_width = dpcd_dsc_data[DP_DSC_MAX_SLICE_WIDTH - DP_DSC_SUPPORT] * 320;
-	dsc_sink_caps->slice_caps2.raw = dpcd_dsc_data[DP_DSC_SLICE_CAP_2 - DP_DSC_SUPPORT];
+	dsc_sink_caps->max_slice_width = dpcd_dsc_basic_data[DP_DSC_MAX_SLICE_WIDTH - DP_DSC_SUPPORT] * 320;
+	dsc_sink_caps->slice_caps2.raw = dpcd_dsc_basic_data[DP_DSC_SLICE_CAP_2 - DP_DSC_SUPPORT];
 
-	if (!dsc_bpp_increment_div_from_dpcd(dpcd_dsc_data[DP_DSC_BITS_PER_PIXEL_INC - DP_DSC_SUPPORT], &dsc_sink_caps->bpp_increment_div))
+	if (!dsc_bpp_increment_div_from_dpcd(dpcd_dsc_basic_data[DP_DSC_BITS_PER_PIXEL_INC - DP_DSC_SUPPORT], &dsc_sink_caps->bpp_increment_div))
 		return false;
 
+	/* Extended caps */
+	if (dpcd_dsc_ext_data == NULL) { // Extended DPCD DSC data can be null, e.g. because it doesn't apply to SST
+		dsc_sink_caps->branch_overall_throughput_0_mps = 0;
+		dsc_sink_caps->branch_overall_throughput_1_mps = 0;
+		dsc_sink_caps->branch_max_line_width = 0;
+		return true;
+	}
+
+	dsc_sink_caps->branch_overall_throughput_0_mps = dpcd_dsc_ext_data[DP_DSC_BRANCH_OVERALL_THROUGHPUT_0 - DP_DSC_BRANCH_OVERALL_THROUGHPUT_0];
+	if (dsc_sink_caps->branch_overall_throughput_0_mps == 0)
+		dsc_sink_caps->branch_overall_throughput_0_mps = 0;
+	else if (dsc_sink_caps->branch_overall_throughput_0_mps == 1)
+		dsc_sink_caps->branch_overall_throughput_0_mps = 680;
+	else {
+		dsc_sink_caps->branch_overall_throughput_0_mps *= 50;
+		dsc_sink_caps->branch_overall_throughput_0_mps += 600;
+	}
+
+	dsc_sink_caps->branch_overall_throughput_1_mps = dpcd_dsc_ext_data[DP_DSC_BRANCH_OVERALL_THROUGHPUT_1 - DP_DSC_BRANCH_OVERALL_THROUGHPUT_0];
+	if (dsc_sink_caps->branch_overall_throughput_1_mps == 0)
+		dsc_sink_caps->branch_overall_throughput_1_mps = 0;
+	else if (dsc_sink_caps->branch_overall_throughput_1_mps == 1)
+		dsc_sink_caps->branch_overall_throughput_1_mps = 680;
+	else {
+		dsc_sink_caps->branch_overall_throughput_1_mps *= 50;
+		dsc_sink_caps->branch_overall_throughput_1_mps += 600;
+	}
+
+	dsc_sink_caps->branch_max_line_width = dpcd_dsc_ext_data[DP_DSC_BRANCH_MAX_LINE_WIDTH - DP_DSC_BRANCH_OVERALL_THROUGHPUT_0] * 320;
+	ASSERT(dsc_sink_caps->branch_max_line_width == 0 || dsc_sink_caps->branch_max_line_width >= 5120);
+
 	return true;
 }
 
diff --git a/drivers/gpu/drm/amd/display/include/dpcd_structs.h b/drivers/gpu/drm/amd/display/include/dpcd_structs.h
new file mode 100644
index 000000000000..6f417e0480e6
--- /dev/null
+++ b/drivers/gpu/drm/amd/display/include/dpcd_structs.h
@@ -0,0 +1,168 @@
+/*
+ * dpcd_structs.h
+ *
+ *  Created on: Oct 31, 2018
+ *      Author: jlei
+ */
+
+#ifndef DAL_INCLUDE_DPCD_STRUCTS_H_
+#define DAL_INCLUDE_DPCD_STRUCTS_H_
+
+struct dpcd_receive_port0_cap01 {
+	union {
+		struct {
+			// Byte 0
+			unsigned char reserved0				:1; // Bit0
+			unsigned char local_edid_present		:1;
+			unsigned char associated_to_preceding_port	:1;
+			unsigned char hblank_expansion_capable		:1;
+			unsigned char buffer_size_unit			:1; // Bit4
+			unsigned char buffer_size_per_port		:1;
+			unsigned char reserved1				:2;
+
+			// Byte 1
+			unsigned char buffer_size			:8;
+		} fields;
+		unsigned char raw[2];
+	};
+};
+
+#ifdef CONFIG_DRM_AMD_DC_DSC_SUPPORT
+
+struct dpcd_dsc_basic_capabilities {
+	union {
+		struct {
+			// Byte 0
+			struct {
+
+				unsigned char dsc_support				:1; // Bit0
+				unsigned char reserved					:7;
+			} dsc_support;
+
+			// Byte 1
+			struct {
+				unsigned char dsc_version_major	:4;
+				unsigned char dsc_version_minor	:4;
+			} dsc_algorithm_revision;
+
+			// Byte 2
+			struct {
+				unsigned char rc_block_buffer_size	:2;
+				unsigned char reserved	:6;
+			} dsc_rc_buffer_block_size;
+
+			// Byte 3
+			unsigned char dsc_rc_buffer_size;
+
+			// Byte 4
+			struct {
+				unsigned char one_slice_per_dp_dsc_sink_device		:1; // Bit0
+				unsigned char two_slices_per_dp_dsc_sink_device		:1;
+				unsigned char reserved					:1;
+				unsigned char four_slices_per_dp_dsc_sink_device	:1;
+				unsigned char six_slices_per_dp_dsc_sink_device		:1; // Bit 4
+				unsigned char eight_slices_per_dp_dsc_sink_device	:1;
+				unsigned char ten_slices_per_dp_dsc_sink_device		:1;
+				unsigned char twelve_slices_per_dp_dsc_sink_device	:1;
+			} dsc_slice_capabilities_1;
+
+			// Byte 5
+			struct {
+				unsigned char line_buffer_bit_depth	:4;
+				unsigned char reserved			:4;
+			} dsc_line_buffer_bit_depth;
+
+			// Byte 6
+			struct {
+				unsigned char block_prediction_support	:1;
+				unsigned char reserved			:7;
+			} dsc_block_prediction_support;
+
+			// Byte 7,8
+			struct {
+				unsigned char maximum_bits_per_pixel_supported_by_the_decompressor_low	:7;
+				unsigned char maximum_bits_per_pixel_supported_by_the_decompressor_high	:7;
+			} maximum_bits_per_pixel_supported_by_the_decompressor;
+
+			// Byte 9
+			struct {
+				unsigned char rgb_support			:1; // Bit0
+				unsigned char y_cb_cr_444_support		:1;
+				unsigned char y_cb_cr_simple_422_support	:1;
+				unsigned char y_cb_cr_native_422_support	:1;
+				unsigned char y_cb_cr_native_420_support	:1; // Bit 4
+				unsigned char reserved				:3;
+			} dsc_decoder_color_format_capabilities;
+
+			// Byte 10
+			struct {
+				unsigned char reserved0				:1; // Bit0
+				unsigned char eight_bits_per_color_support	:1;
+				unsigned char ten_bits_per_color_support	:1;
+				unsigned char twelve_bits_per_color_support	:1;
+				unsigned char reserved1				:4; // Bit 4
+			} dsc_decoder_color_depth_capabilities;
+
+			// Byte 11
+			struct {
+				unsigned char throughput_mode_0			:4;
+				unsigned char throughput_mode_1			:4;
+			} peak_dsc_throughput_dsc_sink;
+
+			// Byte 12
+			unsigned char dsc_maximum_slice_width;
+
+			// Byte 13
+			struct {
+				unsigned char sixteen_slices_per_dsc_sink_device	:1;
+				unsigned char twenty_slices_per_dsc_sink_device		:1;
+				unsigned char twentyfour_slices_per_dsc_sink_device	:1;
+				unsigned char reserved					:5;
+			} dsc_slice_capabilities_2;
+
+			// Byte 14
+			unsigned char reserved;
+
+			// Byte 15
+			struct {
+				unsigned char increment_of_bits_per_pixel_supported	:3;
+				unsigned char reserved					:5;
+			} bits_per_pixel_increment;
+		} fields;
+		unsigned char raw[16];
+	};
+};
+
+struct dpcd_dsc_ext_capabilities {
+	union {
+		struct {
+			unsigned char branch_overall_throughput_0; // Byte 0
+			unsigned char branch_overall_throughput_1; // Byte 1
+			unsigned char branch_max_line_width; // Byte 2
+		} fields;
+		unsigned char raw[3];
+	};
+};
+
+struct dpcd_dsc_capabilities {
+	struct dpcd_dsc_basic_capabilities dsc_basic_caps;
+	struct dpcd_dsc_ext_capabilities dsc_ext_caps;
+};
+
+struct dpcd_fec_capability {
+	union {
+		struct {
+			// Byte 0
+			unsigned char fec_capable				:1; // Bit0
+			unsigned char uncorrected_block_error_count_capable	:1;
+			unsigned char corrected_block_error_count_capable	:1;
+			unsigned char bit_error_count_capable			:1;
+			unsigned char reserved					:4; // Bit4
+		} fields;
+		unsigned char raw[1];
+	};
+};
+
+#endif
+
+#endif /* DAL_INCLUDE_DPCD_STRUCTS_H_ */
diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
index c0c947a42a59..6d10b455b80a 100644
--- a/include/drm/drm_dp_helper.h
+++ b/include/drm/drm_dp_helper.h
@@ -356,6 +356,11 @@
 # define DP_FEC_CORR_BLK_ERROR_COUNT_CAP    (1 << 2)
 # define DP_FEC_BIT_ERROR_COUNT_CAP	    (1 << 3)
 
+/* DP Extended DSC Capabilities */
+#define DP_DSC_BRANCH_OVERALL_THROUGHPUT_0  0x0a0   /* DP 1.4a SCR */
+#define DP_DSC_BRANCH_OVERALL_THROUGHPUT_1  0x0a1
+#define DP_DSC_BRANCH_MAX_LINE_WIDTH        0x0a2
+
 /* link configuration */
 #define	DP_LINK_BW_SET		            0x100
 # define DP_LINK_RATE_TABLE		    0x00    /* eDP 1.4 */
-- 
cgit v1.2.3


From e67d4dfc9ff19dbe74b29617cf2592ccc50c3920 Mon Sep 17 00:00:00 2001
From: Andrey Smirnov <andrew.smirnov@gmail.com>
Date: Wed, 12 Jun 2019 01:44:04 -0700
Subject: power: supply: Add HWMON compatibility layer

Add code implementing HWMON adapter/compatibility layer to allow
expositing various sensors present on power supply devices via HWMON
subsystem. This is done in order to allow userspace to use single
ABI/library(libsensors) to access/manipulate all of the sensors of the
system.

Signed-off-by: Andrey Smirnov <andrew.smirnov@gmail.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Chris Healy <cphealy@gmail.com>
Cc: Chris Healy <cphealy@gmail.com>
Cc: Cory Tusar <cory.tusar@zii.aero>
Cc: Lucas Stach <l.stach@pengutronix.de>
Cc: Fabio Estevam <fabio.estevam@nxp.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: linux-kernel@vger.kernel.org
Cc: linux-pm@vger.kernel.org
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/Kconfig              |  14 ++
 drivers/power/supply/Makefile             |   1 +
 drivers/power/supply/power_supply_core.c  |   7 +
 drivers/power/supply/power_supply_hwmon.c | 355 ++++++++++++++++++++++++++++++
 include/linux/power_supply.h              |  13 ++
 5 files changed, 390 insertions(+)
 create mode 100644 drivers/power/supply/power_supply_hwmon.c

(limited to 'include')

diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig
index 26dacdab03cc..1f2252cb95fd 100644
--- a/drivers/power/supply/Kconfig
+++ b/drivers/power/supply/Kconfig
@@ -14,6 +14,20 @@ config POWER_SUPPLY_DEBUG
 	  Say Y here to enable debugging messages for power supply class
 	  and drivers.
 
+config POWER_SUPPLY_HWMON
+	bool
+	prompt "Expose power supply sensors as hwmon device"
+	depends on HWMON=y || HWMON=POWER_SUPPLY
+	default y
+	help
+	  This options enables API that allows sensors found on a
+	  power supply device (current, voltage, temperature) to be
+	  exposed as a hwmon device.
+
+	  Say 'Y' here if you want power supplies to
+	  have hwmon sysfs interface too.
+
+
 config PDA_POWER
 	tristate "Generic PDA/phone power driver"
 	depends on !S390
diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile
index f208273f9686..c47e88ba16b9 100644
--- a/drivers/power/supply/Makefile
+++ b/drivers/power/supply/Makefile
@@ -6,6 +6,7 @@ power_supply-$(CONFIG_SYSFS)		+= power_supply_sysfs.o
 power_supply-$(CONFIG_LEDS_TRIGGERS)	+= power_supply_leds.o
 
 obj-$(CONFIG_POWER_SUPPLY)	+= power_supply.o
+obj-$(CONFIG_POWER_SUPPLY_HWMON) += power_supply_hwmon.o
 obj-$(CONFIG_GENERIC_ADC_BATTERY)	+= generic-adc-battery.o
 
 obj-$(CONFIG_PDA_POWER)		+= pda_power.o
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index f7033ecf6d0b..35624193a346 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -1072,6 +1072,10 @@ __power_supply_register(struct device *parent,
 	if (rc)
 		goto create_triggers_failed;
 
+	rc = power_supply_add_hwmon_sysfs(psy);
+	if (rc)
+		goto add_hwmon_sysfs_failed;
+
 	/*
 	 * Update use_cnt after any uevents (most notably from device_add()).
 	 * We are here still during driver's probe but
@@ -1090,6 +1094,8 @@ __power_supply_register(struct device *parent,
 
 	return psy;
 
+add_hwmon_sysfs_failed:
+	power_supply_remove_triggers(psy);
 create_triggers_failed:
 	psy_unregister_cooler(psy);
 register_cooler_failed:
@@ -1242,6 +1248,7 @@ void power_supply_unregister(struct power_supply *psy)
 	cancel_work_sync(&psy->changed_work);
 	cancel_delayed_work_sync(&psy->deferred_register_work);
 	sysfs_remove_link(&psy->dev.kobj, "powers");
+	power_supply_remove_hwmon_sysfs(psy);
 	power_supply_remove_triggers(psy);
 	psy_unregister_cooler(psy);
 	psy_unregister_thermal(psy);
diff --git a/drivers/power/supply/power_supply_hwmon.c b/drivers/power/supply/power_supply_hwmon.c
new file mode 100644
index 000000000000..51fe60440d12
--- /dev/null
+++ b/drivers/power/supply/power_supply_hwmon.c
@@ -0,0 +1,355 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  power_supply_hwmon.c - power supply hwmon support.
+ */
+
+#include <linux/err.h>
+#include <linux/hwmon.h>
+#include <linux/power_supply.h>
+#include <linux/slab.h>
+
+struct power_supply_hwmon {
+	struct power_supply *psy;
+	unsigned long *props;
+};
+
+static int power_supply_hwmon_in_to_property(u32 attr)
+{
+	switch (attr) {
+	case hwmon_in_average:
+		return POWER_SUPPLY_PROP_VOLTAGE_AVG;
+	case hwmon_in_min:
+		return POWER_SUPPLY_PROP_VOLTAGE_MIN;
+	case hwmon_in_max:
+		return POWER_SUPPLY_PROP_VOLTAGE_MAX;
+	case hwmon_in_input:
+		return POWER_SUPPLY_PROP_VOLTAGE_NOW;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int power_supply_hwmon_curr_to_property(u32 attr)
+{
+	switch (attr) {
+	case hwmon_curr_average:
+		return POWER_SUPPLY_PROP_CURRENT_AVG;
+	case hwmon_curr_max:
+		return POWER_SUPPLY_PROP_CURRENT_MAX;
+	case hwmon_curr_input:
+		return POWER_SUPPLY_PROP_CURRENT_NOW;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int power_supply_hwmon_temp_to_property(u32 attr, int channel)
+{
+	if (channel) {
+		switch (attr) {
+		case hwmon_temp_input:
+			return POWER_SUPPLY_PROP_TEMP_AMBIENT;
+		case hwmon_temp_min_alarm:
+			return POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN;
+		case hwmon_temp_max_alarm:
+			return POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX;
+		default:
+			break;
+		}
+	} else {
+		switch (attr) {
+		case hwmon_temp_input:
+			return POWER_SUPPLY_PROP_TEMP;
+		case hwmon_temp_max:
+			return POWER_SUPPLY_PROP_TEMP_MAX;
+		case hwmon_temp_min:
+			return POWER_SUPPLY_PROP_TEMP_MIN;
+		case hwmon_temp_min_alarm:
+			return POWER_SUPPLY_PROP_TEMP_ALERT_MIN;
+		case hwmon_temp_max_alarm:
+			return POWER_SUPPLY_PROP_TEMP_ALERT_MAX;
+		default:
+			break;
+		}
+	}
+
+	return -EINVAL;
+}
+
+static int
+power_supply_hwmon_to_property(enum hwmon_sensor_types type,
+			       u32 attr, int channel)
+{
+	switch (type) {
+	case hwmon_in:
+		return power_supply_hwmon_in_to_property(attr);
+	case hwmon_curr:
+		return power_supply_hwmon_curr_to_property(attr);
+	case hwmon_temp:
+		return power_supply_hwmon_temp_to_property(attr, channel);
+	default:
+		return -EINVAL;
+	}
+}
+
+static bool power_supply_hwmon_is_a_label(enum hwmon_sensor_types type,
+					   u32 attr)
+{
+	return type == hwmon_temp && attr == hwmon_temp_label;
+}
+
+static bool power_supply_hwmon_is_writable(enum hwmon_sensor_types type,
+					   u32 attr)
+{
+	switch (type) {
+	case hwmon_in:
+		return attr == hwmon_in_min ||
+		       attr == hwmon_in_max;
+	case hwmon_curr:
+		return attr == hwmon_curr_max;
+	case hwmon_temp:
+		return attr == hwmon_temp_max ||
+		       attr == hwmon_temp_min ||
+		       attr == hwmon_temp_min_alarm ||
+		       attr == hwmon_temp_max_alarm;
+	default:
+		return false;
+	}
+}
+
+static umode_t power_supply_hwmon_is_visible(const void *data,
+					     enum hwmon_sensor_types type,
+					     u32 attr, int channel)
+{
+	const struct power_supply_hwmon *psyhw = data;
+	int prop;
+
+
+	if (power_supply_hwmon_is_a_label(type, attr))
+		return 0444;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0 || !test_bit(prop, psyhw->props))
+		return 0;
+
+	if (power_supply_property_is_writeable(psyhw->psy, prop) > 0 &&
+	    power_supply_hwmon_is_writable(type, attr))
+		return 0644;
+
+	return 0444;
+}
+
+static int power_supply_hwmon_read_string(struct device *dev,
+					  enum hwmon_sensor_types type,
+					  u32 attr, int channel,
+					  const char **str)
+{
+	*str = channel ? "temp" : "temp ambient";
+	return 0;
+}
+
+static int
+power_supply_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+			u32 attr, int channel, long *val)
+{
+	struct power_supply_hwmon *psyhw = dev_get_drvdata(dev);
+	struct power_supply *psy = psyhw->psy;
+	union power_supply_propval pspval;
+	int ret, prop;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0)
+		return prop;
+
+	ret  = power_supply_get_property(psy, prop, &pspval);
+	if (ret)
+		return ret;
+
+	switch (type) {
+	/*
+	 * Both voltage and current is reported in units of
+	 * microvolts/microamps, so we need to adjust it to
+	 * milliamps(volts)
+	 */
+	case hwmon_curr:
+	case hwmon_in:
+		pspval.intval = DIV_ROUND_CLOSEST(pspval.intval, 1000);
+		break;
+	/*
+	 * Temp needs to be converted from 1/10 C to milli-C
+	 */
+	case hwmon_temp:
+		if (check_mul_overflow(pspval.intval, 100,
+				       &pspval.intval))
+			return -EOVERFLOW;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	*val = pspval.intval;
+
+	return 0;
+}
+
+static int
+power_supply_hwmon_write(struct device *dev, enum hwmon_sensor_types type,
+			 u32 attr, int channel, long val)
+{
+	struct power_supply_hwmon *psyhw = dev_get_drvdata(dev);
+	struct power_supply *psy = psyhw->psy;
+	union power_supply_propval pspval;
+	int prop;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0)
+		return prop;
+
+	pspval.intval = val;
+
+	switch (type) {
+	/*
+	 * Both voltage and current is reported in units of
+	 * microvolts/microamps, so we need to adjust it to
+	 * milliamps(volts)
+	 */
+	case hwmon_curr:
+	case hwmon_in:
+		if (check_mul_overflow(pspval.intval, 1000,
+				       &pspval.intval))
+			return -EOVERFLOW;
+		break;
+	/*
+	 * Temp needs to be converted from 1/10 C to milli-C
+	 */
+	case hwmon_temp:
+		pspval.intval = DIV_ROUND_CLOSEST(pspval.intval, 100);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return power_supply_set_property(psy, prop, &pspval);
+}
+
+static const struct hwmon_ops power_supply_hwmon_ops = {
+	.is_visible	= power_supply_hwmon_is_visible,
+	.read		= power_supply_hwmon_read,
+	.write		= power_supply_hwmon_write,
+	.read_string	= power_supply_hwmon_read_string,
+};
+
+static const struct hwmon_channel_info *power_supply_hwmon_info[] = {
+	HWMON_CHANNEL_INFO(temp,
+			   HWMON_T_LABEL     |
+			   HWMON_T_INPUT     |
+			   HWMON_T_MAX       |
+			   HWMON_T_MIN       |
+			   HWMON_T_MIN_ALARM |
+			   HWMON_T_MIN_ALARM,
+
+			   HWMON_T_LABEL     |
+			   HWMON_T_INPUT     |
+			   HWMON_T_MIN_ALARM |
+			   HWMON_T_LABEL     |
+			   HWMON_T_MAX_ALARM),
+
+	HWMON_CHANNEL_INFO(curr,
+			   HWMON_C_AVERAGE |
+			   HWMON_C_MAX     |
+			   HWMON_C_INPUT),
+
+	HWMON_CHANNEL_INFO(in,
+			   HWMON_I_AVERAGE |
+			   HWMON_I_MIN     |
+			   HWMON_I_MAX     |
+			   HWMON_I_INPUT),
+	NULL
+};
+
+static const struct hwmon_chip_info power_supply_hwmon_chip_info = {
+	.ops = &power_supply_hwmon_ops,
+	.info = power_supply_hwmon_info,
+};
+
+static void power_supply_hwmon_bitmap_free(void *data)
+{
+	bitmap_free(data);
+}
+
+int power_supply_add_hwmon_sysfs(struct power_supply *psy)
+{
+	const struct power_supply_desc *desc = psy->desc;
+	struct power_supply_hwmon *psyhw;
+	struct device *dev = &psy->dev;
+	struct device *hwmon;
+	int ret, i;
+
+	if (!devres_open_group(dev, power_supply_add_hwmon_sysfs,
+			       GFP_KERNEL))
+		return -ENOMEM;
+
+	psyhw = devm_kzalloc(dev, sizeof(*psyhw), GFP_KERNEL);
+	if (!psyhw) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	psyhw->psy = psy;
+	psyhw->props = bitmap_zalloc(POWER_SUPPLY_PROP_TIME_TO_FULL_AVG + 1,
+				     GFP_KERNEL);
+	if (!psyhw->props) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	ret = devm_add_action(dev, power_supply_hwmon_bitmap_free,
+			      psyhw->props);
+	if (ret)
+		goto error;
+
+	for (i = 0; i < desc->num_properties; i++) {
+		const enum power_supply_property prop = desc->properties[i];
+
+		switch (prop) {
+		case POWER_SUPPLY_PROP_CURRENT_AVG:
+		case POWER_SUPPLY_PROP_CURRENT_MAX:
+		case POWER_SUPPLY_PROP_CURRENT_NOW:
+		case POWER_SUPPLY_PROP_TEMP:
+		case POWER_SUPPLY_PROP_TEMP_MAX:
+		case POWER_SUPPLY_PROP_TEMP_MIN:
+		case POWER_SUPPLY_PROP_TEMP_ALERT_MIN:
+		case POWER_SUPPLY_PROP_TEMP_ALERT_MAX:
+		case POWER_SUPPLY_PROP_TEMP_AMBIENT:
+		case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN:
+		case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX:
+		case POWER_SUPPLY_PROP_VOLTAGE_AVG:
+		case POWER_SUPPLY_PROP_VOLTAGE_MIN:
+		case POWER_SUPPLY_PROP_VOLTAGE_MAX:
+		case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+			set_bit(prop, psyhw->props);
+			break;
+		default:
+			break;
+		}
+	}
+
+	hwmon = devm_hwmon_device_register_with_info(dev, psy->desc->name,
+						psyhw,
+						&power_supply_hwmon_chip_info,
+						NULL);
+	ret = PTR_ERR_OR_ZERO(hwmon);
+	if (ret)
+		goto error;
+
+	devres_close_group(dev, power_supply_add_hwmon_sysfs);
+	return 0;
+error:
+	devres_release_group(dev, NULL);
+	return ret;
+}
+
+void power_supply_remove_hwmon_sysfs(struct power_supply *psy)
+{
+	devres_release_group(&psy->dev, power_supply_add_hwmon_sysfs);
+}
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index d9c0c094f8a0..d5b15e039f4f 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -481,4 +481,17 @@ static inline bool power_supply_is_watt_property(enum power_supply_property psp)
 	return 0;
 }
 
+#ifdef CONFIG_POWER_SUPPLY_HWMON
+int power_supply_add_hwmon_sysfs(struct power_supply *psy);
+void power_supply_remove_hwmon_sysfs(struct power_supply *psy);
+#else
+static inline int power_supply_add_hwmon_sysfs(struct power_supply *psy)
+{
+	return 0;
+}
+
+static inline
+void power_supply_remove_hwmon_sysfs(struct power_supply *psy) {}
+#endif
+
 #endif /* __LINUX_POWER_SUPPLY_H__ */
-- 
cgit v1.2.3


From 38b37d631aec80da0c65ac03a7ef680b468c7857 Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
Date: Fri, 7 Jun 2019 12:49:11 +0200
Subject: module: allow arch overrides for .exit section names

Some archs like ARM store unwind information for .exit.text in sections
with unusual names. As this unwind information refers to .exit.text, it
must not be loaded when .exit.text is not loaded (when CONFIG_MODULE_UNLOAD
is unset); otherwise, loading a module can fail due to relocation failures.

Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/moduleloader.h | 5 +++++
 kernel/module.c              | 7 ++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 31013c2effd3..5229c18025e9 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -29,6 +29,11 @@ void *module_alloc(unsigned long size);
 /* Free memory returned from module_alloc. */
 void module_memfree(void *module_region);
 
+/* Determines if the section name is an exit section (that is only used during
+ * module unloading)
+ */
+bool module_exit_section(const char *name);
+
 /*
  * Apply the given relocation to the (simplified) ELF.  Return -error
  * or 0.
diff --git a/kernel/module.c b/kernel/module.c
index 41258bab24f1..537c456ce3ee 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2748,6 +2748,11 @@ void * __weak module_alloc(unsigned long size)
 	return vmalloc_exec(size);
 }
 
+bool __weak module_exit_section(const char *name)
+{
+	return strstarts(name, ".exit");
+}
+
 #ifdef CONFIG_DEBUG_KMEMLEAK
 static void kmemleak_load_module(const struct module *mod,
 				 const struct load_info *info)
@@ -2937,7 +2942,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
 
 #ifndef CONFIG_MODULE_UNLOAD
 		/* Don't load .exit sections */
-		if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
+		if (module_exit_section(info->secstrings+shdr->sh_name))
 			shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
 #endif
 	}
-- 
cgit v1.2.3


From 9e645e1105ca60fbbc6bddf2fd5ef7e57ed3dca8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 10 May 2019 16:07:28 -0600
Subject: io_uring: add support for sqe links

With SQE links, we can create chains of dependent SQEs. One example
would be queueing an SQE that's a read from one file descriptor, with
the linked SQE being a write to another with the same set of buffers.

An SQE link will not stall the pipeline, it'll just ensure that
dependent SQEs aren't issued before the previous link has completed.

Any error at submission or completion time will break the chain of SQEs.
For completions, this also includes short reads or writes, as the next
SQE could depend on the previous one being fully completed.

Any SQE in a chain that gets canceled due to any of the above errors,
will get an CQE fill with -ECANCELED as the error value.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 241 +++++++++++++++++++++++++++++++++---------
 include/uapi/linux/io_uring.h |   1 +
 2 files changed, 194 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 92debd8be535..9f0ef4956f87 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -322,6 +322,7 @@ struct io_kiocb {
 
 	struct io_ring_ctx	*ctx;
 	struct list_head	list;
+	struct list_head	link_list;
 	unsigned int		flags;
 	refcount_t		refs;
 #define REQ_F_NOWAIT		1	/* must not punt to workers */
@@ -330,8 +331,10 @@ struct io_kiocb {
 #define REQ_F_SEQ_PREV		8	/* sequential with previous */
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
+#define REQ_F_LINK		64	/* linked sqes */
+#define REQ_F_FAIL_LINK		128	/* fail rest of links */
 	u64			user_data;
-	u32			error;	/* iopoll result from callback */
+	u32			result;
 	u32			sequence;
 
 	struct work_struct	work;
@@ -583,6 +586,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	req->flags = 0;
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
+	req->result = 0;
 	return req;
 out:
 	io_ring_drop_ctx_refs(ctx, 1);
@@ -598,7 +602,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 	}
 }
 
-static void io_free_req(struct io_kiocb *req)
+static void __io_free_req(struct io_kiocb *req)
 {
 	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 		fput(req->file);
@@ -606,6 +610,63 @@ static void io_free_req(struct io_kiocb *req)
 	kmem_cache_free(req_cachep, req);
 }
 
+static void io_req_link_next(struct io_kiocb *req)
+{
+	struct io_kiocb *nxt;
+
+	/*
+	 * The list should never be empty when we are called here. But could
+	 * potentially happen if the chain is messed up, check to be on the
+	 * safe side.
+	 */
+	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
+	if (nxt) {
+		list_del(&nxt->list);
+		if (!list_empty(&req->link_list)) {
+			INIT_LIST_HEAD(&nxt->link_list);
+			list_splice(&req->link_list, &nxt->link_list);
+			nxt->flags |= REQ_F_LINK;
+		}
+
+		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
+		queue_work(req->ctx->sqo_wq, &nxt->work);
+	}
+}
+
+/*
+ * Called if REQ_F_LINK is set, and we fail the head request
+ */
+static void io_fail_links(struct io_kiocb *req)
+{
+	struct io_kiocb *link;
+
+	while (!list_empty(&req->link_list)) {
+		link = list_first_entry(&req->link_list, struct io_kiocb, list);
+		list_del(&link->list);
+
+		io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
+		__io_free_req(link);
+	}
+}
+
+static void io_free_req(struct io_kiocb *req)
+{
+	/*
+	 * If LINK is set, we have dependent requests in this chain. If we
+	 * didn't fail this request, queue the first one up, moving any other
+	 * dependencies to the next request. In case of failure, fail the rest
+	 * of the chain.
+	 */
+	if (req->flags & REQ_F_LINK) {
+		if (req->flags & REQ_F_FAIL_LINK)
+			io_fail_links(req);
+		else
+			io_req_link_next(req);
+	}
+
+	__io_free_req(req);
+}
+
 static void io_put_req(struct io_kiocb *req)
 {
 	if (refcount_dec_and_test(&req->refs))
@@ -627,16 +688,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
 
-		io_cqring_fill_event(ctx, req->user_data, req->error);
+		io_cqring_fill_event(ctx, req->user_data, req->result);
 		(*nr_events)++;
 
 		if (refcount_dec_and_test(&req->refs)) {
 			/* If we're not using fixed files, we have to pair the
 			 * completion part with the file put. Use regular
 			 * completions for those, only batch free for fixed
-			 * file.
+			 * file and non-linked commands.
 			 */
-			if (req->flags & REQ_F_FIXED_FILE) {
+			if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+			    REQ_F_FIXED_FILE) {
 				reqs[to_free++] = req;
 				if (to_free == ARRAY_SIZE(reqs))
 					io_free_req_many(ctx, reqs, &to_free);
@@ -775,6 +837,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 
 	kiocb_end_write(kiocb);
 
+	if ((req->flags & REQ_F_LINK) && res != req->result)
+		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, req->user_data, res);
 	io_put_req(req);
 }
@@ -785,7 +849,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 
 	kiocb_end_write(kiocb);
 
-	req->error = res;
+	if ((req->flags & REQ_F_LINK) && res != req->result)
+		req->flags |= REQ_F_FAIL_LINK;
+	req->result = res;
 	if (res != -EAGAIN)
 		req->flags |= REQ_F_IOPOLL_COMPLETED;
 }
@@ -928,7 +994,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 		    !kiocb->ki_filp->f_op->iopoll)
 			return -EOPNOTSUPP;
 
-		req->error = 0;
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 	} else {
@@ -1106,6 +1171,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		return ret;
 
 	read_size = ret;
+	if (req->flags & REQ_F_LINK)
+		req->result = read_size;
+
 	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
 	if (!ret) {
@@ -1163,6 +1231,9 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	if (ret < 0)
 		return ret;
 
+	if (req->flags & REQ_F_LINK)
+		req->result = ret;
+
 	iov_count = iov_iter_count(&iter);
 
 	ret = -EAGAIN;
@@ -1266,6 +1337,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
 
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
 	io_put_req(req);
 	return 0;
@@ -1310,6 +1383,8 @@ static int io_sync_file_range(struct io_kiocb *req,
 
 	ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
 
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
 	io_put_req(req);
 	return 0;
@@ -1562,9 +1637,10 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 {
 	int ret, opcode;
 
+	req->user_data = READ_ONCE(s->sqe->user_data);
+
 	if (unlikely(s->index >= ctx->sq_entries))
 		return -EINVAL;
-	req->user_data = READ_ONCE(s->sqe->user_data);
 
 	opcode = READ_ONCE(s->sqe->opcode);
 	switch (opcode) {
@@ -1608,7 +1684,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		return ret;
 
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-		if (req->error == -EAGAIN)
+		if (req->result == -EAGAIN)
 			return -EAGAIN;
 
 		/* workqueue context doesn't hold uring_lock, grab it now */
@@ -1834,31 +1910,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 	return 0;
 }
 
-static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
-			 struct io_submit_state *state)
+static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			struct sqe_submit *s)
 {
-	struct io_kiocb *req;
 	int ret;
 
-	/* enforce forwards compatibility on users */
-	if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)))
-		return -EINVAL;
-
-	req = io_get_req(ctx, state);
-	if (unlikely(!req))
-		return -EAGAIN;
-
-	ret = io_req_set_file(ctx, s, state, req);
-	if (unlikely(ret))
-		goto out;
-
-	ret = io_req_defer(ctx, req, s->sqe);
-	if (ret) {
-		if (ret == -EIOCBQUEUED)
-			ret = 0;
-		return ret;
-	}
-
 	ret = __io_submit_sqe(ctx, req, s, true);
 	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 		struct io_uring_sqe *sqe_copy;
@@ -1881,24 +1937,93 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 
 			/*
 			 * Queued up for async execution, worker will release
-			 * submit reference when the iocb is actually
-			 * submitted.
+			 * submit reference when the iocb is actually submitted.
 			 */
 			return 0;
 		}
 	}
 
-out:
 	/* drop submission reference */
 	io_put_req(req);
 
 	/* and drop final reference, if we failed */
-	if (ret)
+	if (ret) {
+		io_cqring_add_event(ctx, req->user_data, ret);
+		if (req->flags & REQ_F_LINK)
+			req->flags |= REQ_F_FAIL_LINK;
 		io_put_req(req);
+	}
 
 	return ret;
 }
 
+#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
+
+static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
+			  struct io_submit_state *state, struct io_kiocb **link)
+{
+	struct io_uring_sqe *sqe_copy;
+	struct io_kiocb *req;
+	int ret;
+
+	/* enforce forwards compatibility on users */
+	if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	req = io_get_req(ctx, state);
+	if (unlikely(!req)) {
+		ret = -EAGAIN;
+		goto err;
+	}
+
+	ret = io_req_set_file(ctx, s, state, req);
+	if (unlikely(ret)) {
+err_req:
+		io_free_req(req);
+err:
+		io_cqring_add_event(ctx, s->sqe->user_data, ret);
+		return;
+	}
+
+	ret = io_req_defer(ctx, req, s->sqe);
+	if (ret) {
+		if (ret != -EIOCBQUEUED)
+			goto err_req;
+		return;
+	}
+
+	/*
+	 * If we already have a head request, queue this one for async
+	 * submittal once the head completes. If we don't have a head but
+	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
+	 * submitted sync once the chain is complete. If none of those
+	 * conditions are true (normal request), then just queue it.
+	 */
+	if (*link) {
+		struct io_kiocb *prev = *link;
+
+		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+		if (!sqe_copy) {
+			ret = -EAGAIN;
+			goto err_req;
+		}
+
+		s->sqe = sqe_copy;
+		memcpy(&req->submit, s, sizeof(*s));
+		list_add_tail(&req->list, &prev->link_list);
+	} else if (s->sqe->flags & IOSQE_IO_LINK) {
+		req->flags |= REQ_F_LINK;
+
+		memcpy(&req->submit, s, sizeof(*s));
+		INIT_LIST_HEAD(&req->link_list);
+		*link = req;
+	} else {
+		io_queue_sqe(ctx, req, s);
+	}
+}
+
 /*
  * Batched submission is done, ensure local IO is flushed out.
  */
@@ -1981,7 +2106,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
 			  unsigned int nr, bool has_user, bool mm_fault)
 {
 	struct io_submit_state state, *statep = NULL;
-	int ret, i, submitted = 0;
+	struct io_kiocb *link = NULL;
+	bool prev_was_link = false;
+	int i, submitted = 0;
 
 	if (nr > IO_PLUG_THRESHOLD) {
 		io_submit_state_start(&state, ctx, nr);
@@ -1989,22 +2116,30 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
 	}
 
 	for (i = 0; i < nr; i++) {
+		/*
+		 * If previous wasn't linked and we have a linked command,
+		 * that's the end of the chain. Submit the previous link.
+		 */
+		if (!prev_was_link && link) {
+			io_queue_sqe(ctx, link, &link->submit);
+			link = NULL;
+		}
+		prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
+
 		if (unlikely(mm_fault)) {
-			ret = -EFAULT;
+			io_cqring_add_event(ctx, sqes[i].sqe->user_data,
+						-EFAULT);
 		} else {
 			sqes[i].has_user = has_user;
 			sqes[i].needs_lock = true;
 			sqes[i].needs_fixed_file = true;
-			ret = io_submit_sqe(ctx, &sqes[i], statep);
-		}
-		if (!ret) {
+			io_submit_sqe(ctx, &sqes[i], statep, &link);
 			submitted++;
-			continue;
 		}
-
-		io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret);
 	}
 
+	if (link)
+		io_queue_sqe(ctx, link, &link->submit);
 	if (statep)
 		io_submit_state_end(&state);
 
@@ -2145,6 +2280,8 @@ static int io_sq_thread(void *data)
 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 {
 	struct io_submit_state state, *statep = NULL;
+	struct io_kiocb *link = NULL;
+	bool prev_was_link = false;
 	int i, submit = 0;
 
 	if (to_submit > IO_PLUG_THRESHOLD) {
@@ -2154,22 +2291,30 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 
 	for (i = 0; i < to_submit; i++) {
 		struct sqe_submit s;
-		int ret;
 
 		if (!io_get_sqring(ctx, &s))
 			break;
 
+		/*
+		 * If previous wasn't linked and we have a linked command,
+		 * that's the end of the chain. Submit the previous link.
+		 */
+		if (!prev_was_link && link) {
+			io_queue_sqe(ctx, link, &link->submit);
+			link = NULL;
+		}
+		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
+
 		s.has_user = true;
 		s.needs_lock = false;
 		s.needs_fixed_file = false;
 		submit++;
-
-		ret = io_submit_sqe(ctx, &s, statep);
-		if (ret)
-			io_cqring_add_event(ctx, s.sqe->user_data, ret);
+		io_submit_sqe(ctx, &s, statep, &link);
 	}
 	io_commit_sqring(ctx);
 
+	if (link)
+		io_queue_sqe(ctx, link, &link->submit);
 	if (statep)
 		io_submit_state_end(statep);
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a0c460025036..10b7c45f6d57 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -40,6 +40,7 @@ struct io_uring_sqe {
  */
 #define IOSQE_FIXED_FILE	(1U << 0)	/* use fixed fileset */
 #define IOSQE_IO_DRAIN		(1U << 1)	/* issue after inflight IO */
+#define IOSQE_IO_LINK		(1U << 2)	/* links next sqe */
 
 /*
  * io_uring_setup() flags
-- 
cgit v1.2.3


From 36b1e47ff0c196a95d5e55a05b3f988f827cce7e Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:37 +0300
Subject: RDMA/core: Introduce new header file for signature operations

Ease the exhausted ib_verbs.h file and make the code more readable.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/rdma/ib_verbs.h  | 112 +------------------------------------------
 include/rdma/signature.h | 120 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 111 deletions(-)
 create mode 100644 include/rdma/signature.h

(limited to 'include')

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 805148a12660..dc59fa12669a 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -63,6 +63,7 @@
 #include <linux/preempt.h>
 #include <uapi/rdma/ib_user_verbs.h>
 #include <rdma/restrack.h>
+#include <rdma/signature.h>
 #include <uapi/rdma/rdma_user_ioctl.h>
 #include <uapi/rdma/ib_user_ioctl_verbs.h>
 
@@ -264,17 +265,6 @@ enum ib_device_cap_flags {
 	IB_DEVICE_ALLOW_USER_UNREG		= (1ULL << 37),
 };
 
-enum ib_signature_prot_cap {
-	IB_PROT_T10DIF_TYPE_1 = 1,
-	IB_PROT_T10DIF_TYPE_2 = 1 << 1,
-	IB_PROT_T10DIF_TYPE_3 = 1 << 2,
-};
-
-enum ib_signature_guard_cap {
-	IB_GUARD_T10DIF_CRC	= 1,
-	IB_GUARD_T10DIF_CSUM	= 1 << 1,
-};
-
 enum ib_atomic_cap {
 	IB_ATOMIC_NONE,
 	IB_ATOMIC_HCA,
@@ -799,106 +789,6 @@ enum ib_mr_type {
 	IB_MR_TYPE_SG_GAPS,
 };
 
-/**
- * Signature types
- * IB_SIG_TYPE_NONE: Unprotected.
- * IB_SIG_TYPE_T10_DIF: Type T10-DIF
- */
-enum ib_signature_type {
-	IB_SIG_TYPE_NONE,
-	IB_SIG_TYPE_T10_DIF,
-};
-
-/**
- * Signature T10-DIF block-guard types
- * IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules.
- * IB_T10DIF_CSUM: Corresponds to IP checksum rules.
- */
-enum ib_t10_dif_bg_type {
-	IB_T10DIF_CRC,
-	IB_T10DIF_CSUM
-};
-
-/**
- * struct ib_t10_dif_domain - Parameters specific for T10-DIF
- *     domain.
- * @bg_type: T10-DIF block guard type (CRC|CSUM)
- * @pi_interval: protection information interval.
- * @bg: seed of guard computation.
- * @app_tag: application tag of guard block
- * @ref_tag: initial guard block reference tag.
- * @ref_remap: Indicate wethear the reftag increments each block
- * @app_escape: Indicate to skip block check if apptag=0xffff
- * @ref_escape: Indicate to skip block check if reftag=0xffffffff
- * @apptag_check_mask: check bitmask of application tag.
- */
-struct ib_t10_dif_domain {
-	enum ib_t10_dif_bg_type bg_type;
-	u16			pi_interval;
-	u16			bg;
-	u16			app_tag;
-	u32			ref_tag;
-	bool			ref_remap;
-	bool			app_escape;
-	bool			ref_escape;
-	u16			apptag_check_mask;
-};
-
-/**
- * struct ib_sig_domain - Parameters for signature domain
- * @sig_type: specific signauture type
- * @sig: union of all signature domain attributes that may
- *     be used to set domain layout.
- */
-struct ib_sig_domain {
-	enum ib_signature_type sig_type;
-	union {
-		struct ib_t10_dif_domain dif;
-	} sig;
-};
-
-/**
- * struct ib_sig_attrs - Parameters for signature handover operation
- * @check_mask: bitmask for signature byte check (8 bytes)
- * @mem: memory domain layout desciptor.
- * @wire: wire domain layout desciptor.
- */
-struct ib_sig_attrs {
-	u8			check_mask;
-	struct ib_sig_domain	mem;
-	struct ib_sig_domain	wire;
-};
-
-enum ib_sig_err_type {
-	IB_SIG_BAD_GUARD,
-	IB_SIG_BAD_REFTAG,
-	IB_SIG_BAD_APPTAG,
-};
-
-/**
- * Signature check masks (8 bytes in total) according to the T10-PI standard:
- *  -------- -------- ------------
- * | GUARD  | APPTAG |   REFTAG   |
- * |  2B    |  2B    |    4B      |
- *  -------- -------- ------------
- */
-enum {
-	IB_SIG_CHECK_GUARD	= 0xc0,
-	IB_SIG_CHECK_APPTAG	= 0x30,
-	IB_SIG_CHECK_REFTAG	= 0x0f,
-};
-
-/**
- * struct ib_sig_err - signature error descriptor
- */
-struct ib_sig_err {
-	enum ib_sig_err_type	err_type;
-	u32			expected;
-	u32			actual;
-	u64			sig_err_offset;
-	u32			key;
-};
-
 enum ib_mr_status_check {
 	IB_MR_CHECK_SIG_STATUS = 1,
 };
diff --git a/include/rdma/signature.h b/include/rdma/signature.h
new file mode 100644
index 000000000000..5998fe94dfd4
--- /dev/null
+++ b/include/rdma/signature.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) */
+/*
+ * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_SIGNATURE_H_
+#define _RDMA_SIGNATURE_H_
+
+enum ib_signature_prot_cap {
+	IB_PROT_T10DIF_TYPE_1 = 1,
+	IB_PROT_T10DIF_TYPE_2 = 1 << 1,
+	IB_PROT_T10DIF_TYPE_3 = 1 << 2,
+};
+
+enum ib_signature_guard_cap {
+	IB_GUARD_T10DIF_CRC	= 1,
+	IB_GUARD_T10DIF_CSUM	= 1 << 1,
+};
+
+/**
+ * enum ib_signature_type - Signature types
+ * @IB_SIG_TYPE_NONE: Unprotected.
+ * @IB_SIG_TYPE_T10_DIF: Type T10-DIF
+ */
+enum ib_signature_type {
+	IB_SIG_TYPE_NONE,
+	IB_SIG_TYPE_T10_DIF,
+};
+
+/**
+ * enum ib_t10_dif_bg_type - Signature T10-DIF block-guard types
+ * @IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules.
+ * @IB_T10DIF_CSUM: Corresponds to IP checksum rules.
+ */
+enum ib_t10_dif_bg_type {
+	IB_T10DIF_CRC,
+	IB_T10DIF_CSUM,
+};
+
+/**
+ * struct ib_t10_dif_domain - Parameters specific for T10-DIF
+ *     domain.
+ * @bg_type: T10-DIF block guard type (CRC|CSUM)
+ * @pi_interval: protection information interval.
+ * @bg: seed of guard computation.
+ * @app_tag: application tag of guard block
+ * @ref_tag: initial guard block reference tag.
+ * @ref_remap: Indicate wethear the reftag increments each block
+ * @app_escape: Indicate to skip block check if apptag=0xffff
+ * @ref_escape: Indicate to skip block check if reftag=0xffffffff
+ * @apptag_check_mask: check bitmask of application tag.
+ */
+struct ib_t10_dif_domain {
+	enum ib_t10_dif_bg_type bg_type;
+	u16			pi_interval;
+	u16			bg;
+	u16			app_tag;
+	u32			ref_tag;
+	bool			ref_remap;
+	bool			app_escape;
+	bool			ref_escape;
+	u16			apptag_check_mask;
+};
+
+/**
+ * struct ib_sig_domain - Parameters for signature domain
+ * @sig_type: specific signauture type
+ * @sig: union of all signature domain attributes that may
+ *     be used to set domain layout.
+ */
+struct ib_sig_domain {
+	enum ib_signature_type sig_type;
+	union {
+		struct ib_t10_dif_domain dif;
+	} sig;
+};
+
+/**
+ * struct ib_sig_attrs - Parameters for signature handover operation
+ * @check_mask: bitmask for signature byte check (8 bytes)
+ * @mem: memory domain layout descriptor.
+ * @wire: wire domain layout descriptor.
+ */
+struct ib_sig_attrs {
+	u8			check_mask;
+	struct ib_sig_domain	mem;
+	struct ib_sig_domain	wire;
+};
+
+enum ib_sig_err_type {
+	IB_SIG_BAD_GUARD,
+	IB_SIG_BAD_REFTAG,
+	IB_SIG_BAD_APPTAG,
+};
+
+/*
+ * Signature check masks (8 bytes in total) according to the T10-PI standard:
+ *  -------- -------- ------------
+ * | GUARD  | APPTAG |   REFTAG   |
+ * |  2B    |  2B    |    4B      |
+ *  -------- -------- ------------
+ */
+enum {
+	IB_SIG_CHECK_GUARD = 0xc0,
+	IB_SIG_CHECK_APPTAG = 0x30,
+	IB_SIG_CHECK_REFTAG = 0x0f,
+};
+
+/*
+ * struct ib_sig_err - signature error descriptor
+ */
+struct ib_sig_err {
+	enum ib_sig_err_type	err_type;
+	u32			expected;
+	u32			actual;
+	u64			sig_err_offset;
+	u32			key;
+};
+
+#endif /* _RDMA_SIGNATURE_H_ */
-- 
cgit v1.2.3


From a0bc099abf7b45b16cb18459f3516af8c2fea781 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:38 +0300
Subject: RDMA/core: Save the MR type in the ib_mr structure

This is a preparation for the signature verbs API change. This change is
needed since the MR type will define, in the upcoming patches, the need
for allocating internal resources in LLD for signature handover related
operations. It will also help to make sure that signature related
functions are called with an appropriate MR type and fail otherwise.
Also introduce new mr types IB_MR_TYPE_USER, IB_MR_TYPE_DMA and
IB_MR_TYPE_DM for correctness.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs_cmd.c          |  1 +
 drivers/infiniband/core/uverbs_std_types_mr.c |  1 +
 drivers/infiniband/core/verbs.c               |  2 ++
 include/rdma/ib_verbs.h                       | 10 ++++++++++
 4 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index d13b87d27ce5..689275c2894f 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -745,6 +745,7 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
 
 	mr->device  = pd->device;
 	mr->pd      = pd;
+	mr->type    = IB_MR_TYPE_USER;
 	mr->dm	    = NULL;
 	mr->uobject = uobj;
 	atomic_inc(&pd->usecnt);
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index 610d3b9f7654..7ca79bfa3487 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -128,6 +128,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
 
 	mr->device  = pd->device;
 	mr->pd      = pd;
+	mr->type    = IB_MR_TYPE_DM;
 	mr->dm      = dm;
 	mr->uobject = uobj;
 	atomic_inc(&pd->usecnt);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 16ef8a9bda4c..10ff85b79d25 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -299,6 +299,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 
 		mr->device	= pd->device;
 		mr->pd		= pd;
+		mr->type        = IB_MR_TYPE_DMA;
 		mr->uobject	= NULL;
 		mr->need_inval	= false;
 
@@ -2020,6 +2021,7 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
 		mr->need_inval = false;
 		mr->res.type = RDMA_RESTRACK_MR;
 		rdma_restrack_kadd(&mr->res);
+		mr->type = mr_type;
 	}
 
 	return mr;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index dc59fa12669a..b6ec71ee4d3e 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -782,11 +782,20 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
  *                            register any arbitrary sg lists (without
  *                            the normal mr constraints - see
  *                            ib_map_mr_sg)
+ * @IB_MR_TYPE_DM:            memory region that is used for device
+ *                            memory registration
+ * @IB_MR_TYPE_USER:          memory region that is used for the user-space
+ *                            application
+ * @IB_MR_TYPE_DMA:           memory region that is used for DMA operations
+ *                            without address translations (VA=PA)
  */
 enum ib_mr_type {
 	IB_MR_TYPE_MEM_REG,
 	IB_MR_TYPE_SIGNATURE,
 	IB_MR_TYPE_SG_GAPS,
+	IB_MR_TYPE_DM,
+	IB_MR_TYPE_USER,
+	IB_MR_TYPE_DMA,
 };
 
 enum ib_mr_status_check {
@@ -1719,6 +1728,7 @@ struct ib_mr {
 	u64		   iova;
 	u64		   length;
 	unsigned int	   page_size;
+	enum ib_mr_type	   type;
 	bool		   need_inval;
 	union {
 		struct ib_uobject	*uobject;	/* user */
-- 
cgit v1.2.3


From 26bc7eaee94fd904d1817fee4d864f8526807465 Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:39 +0300
Subject: RDMA/core: Introduce IB_MR_TYPE_INTEGRITY and ib_alloc_mr_integrity
 API

This is a preparation for signature verbs API re-design. In the new
design a single MR with IB_MR_TYPE_INTEGRITY type will be used to perform
the needed mapping for data integrity operations.

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c |  1 +
 drivers/infiniband/core/verbs.c  | 46 ++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h          | 10 +++++++++
 3 files changed, 57 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 1de4ae5d5e0e..dba385410715 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2437,6 +2437,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, alloc_fmr);
 	SET_DEVICE_OP(dev_ops, alloc_hw_stats);
 	SET_DEVICE_OP(dev_ops, alloc_mr);
+	SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
 	SET_DEVICE_OP(dev_ops, alloc_mw);
 	SET_DEVICE_OP(dev_ops, alloc_pd);
 	SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 10ff85b79d25..82d62bc7af65 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -2011,6 +2011,9 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
 	if (!pd->device->ops.alloc_mr)
 		return ERR_PTR(-EOPNOTSUPP);
 
+	if (WARN_ON_ONCE(mr_type == IB_MR_TYPE_INTEGRITY))
+		return ERR_PTR(-EINVAL);
+
 	mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata);
 	if (!IS_ERR(mr)) {
 		mr->device  = pd->device;
@@ -2028,6 +2031,49 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
 }
 EXPORT_SYMBOL(ib_alloc_mr_user);
 
+/**
+ * ib_alloc_mr_integrity() - Allocates an integrity memory region
+ * @pd:                      protection domain associated with the region
+ * @max_num_data_sg:         maximum data sg entries available for registration
+ * @max_num_meta_sg:         maximum metadata sg entries available for
+ *                           registration
+ *
+ * Notes:
+ * Memory registration page/sg lists must not exceed max_num_sg,
+ * also the integrity page/sg lists must not exceed max_num_meta_sg.
+ *
+ */
+struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
+				    u32 max_num_data_sg,
+				    u32 max_num_meta_sg)
+{
+	struct ib_mr *mr;
+
+	if (!pd->device->ops.alloc_mr_integrity)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (!max_num_meta_sg)
+		return ERR_PTR(-EINVAL);
+
+	mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg,
+						max_num_meta_sg);
+	if (IS_ERR(mr))
+		return mr;
+
+	mr->device = pd->device;
+	mr->pd = pd;
+	mr->dm = NULL;
+	mr->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+	mr->need_inval = false;
+	mr->res.type = RDMA_RESTRACK_MR;
+	rdma_restrack_kadd(&mr->res);
+	mr->type = IB_MR_TYPE_INTEGRITY;
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_alloc_mr_integrity);
+
 /* "Fast" memory regions */
 
 struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index b6ec71ee4d3e..01bc04c8e220 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -788,6 +788,8 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
  *                            application
  * @IB_MR_TYPE_DMA:           memory region that is used for DMA operations
  *                            without address translations (VA=PA)
+ * @IB_MR_TYPE_INTEGRITY:     memory region that is used for
+ *                            data integrity operations
  */
 enum ib_mr_type {
 	IB_MR_TYPE_MEM_REG,
@@ -796,6 +798,7 @@ enum ib_mr_type {
 	IB_MR_TYPE_DM,
 	IB_MR_TYPE_USER,
 	IB_MR_TYPE_DMA,
+	IB_MR_TYPE_INTEGRITY,
 };
 
 enum ib_mr_status_check {
@@ -2363,6 +2366,9 @@ struct ib_device_ops {
 	int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata);
 	struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type,
 				  u32 max_num_sg, struct ib_udata *udata);
+	struct ib_mr *(*alloc_mr_integrity)(struct ib_pd *pd,
+					    u32 max_num_data_sg,
+					    u32 max_num_meta_sg);
 	int (*advise_mr)(struct ib_pd *pd,
 			 enum ib_uverbs_advise_mr_advice advice, u32 flags,
 			 struct ib_sge *sg_list, u32 num_sge,
@@ -4042,6 +4048,10 @@ static inline struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
 	return ib_alloc_mr_user(pd, mr_type, max_num_sg, NULL);
 }
 
+struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
+				    u32 max_num_data_sg,
+				    u32 max_num_meta_sg);
+
 /**
  * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
  *   R_Key and L_Key.
-- 
cgit v1.2.3


From 2cdfcdd8677b277b32d32ef8976802dc5d5f883f Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:40 +0300
Subject: RDMA/core: Introduce ib_map_mr_sg_pi to map data/protection sgl's

This function will map the previously dma mapped SG lists for PI
(protection information) and data to an appropriate memory region for
future registration.
The given MR must be allocated as IB_MR_TYPE_INTEGRITY.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c |  1 +
 drivers/infiniband/core/verbs.c  | 40 +++++++++++++++++++++++++++++++++++++++-
 include/rdma/ib_verbs.h          |  9 +++++++++
 3 files changed, 49 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index dba385410715..669c2d58e695 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2497,6 +2497,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, iw_reject);
 	SET_DEVICE_OP(dev_ops, iw_rem_ref);
 	SET_DEVICE_OP(dev_ops, map_mr_sg);
+	SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
 	SET_DEVICE_OP(dev_ops, map_phys_fmr);
 	SET_DEVICE_OP(dev_ops, mmap);
 	SET_DEVICE_OP(dev_ops, modify_ah);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 82d62bc7af65..c892022aa8ea 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -2049,7 +2049,8 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
 {
 	struct ib_mr *mr;
 
-	if (!pd->device->ops.alloc_mr_integrity)
+	if (!pd->device->ops.alloc_mr_integrity ||
+	    !pd->device->ops.map_mr_sg_pi)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	if (!max_num_meta_sg)
@@ -2430,6 +2431,43 @@ int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
 }
 EXPORT_SYMBOL(ib_set_vf_guid);
 
+/**
+ * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection
+ *     information) and set an appropriate memory region for registration.
+ * @mr:             memory region
+ * @data_sg:        dma mapped scatterlist for data
+ * @data_sg_nents:  number of entries in data_sg
+ * @data_sg_offset: offset in bytes into data_sg
+ * @meta_sg:        dma mapped scatterlist for metadata
+ * @meta_sg_nents:  number of entries in meta_sg
+ * @meta_sg_offset: offset in bytes into meta_sg
+ * @page_size:      page vector desired page size
+ *
+ * Constraints:
+ * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY.
+ *
+ * Return: 0 on success.
+ *
+ * After this completes successfully, the  memory region
+ * is ready for registration.
+ */
+int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg,
+		    int data_sg_nents, unsigned int *data_sg_offset,
+		    struct scatterlist *meta_sg, int meta_sg_nents,
+		    unsigned int *meta_sg_offset, unsigned int page_size)
+{
+	if (unlikely(!mr->device->ops.map_mr_sg_pi ||
+		     WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY)))
+		return -EOPNOTSUPP;
+
+	mr->page_size = page_size;
+
+	return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents,
+					    data_sg_offset, meta_sg,
+					    meta_sg_nents, meta_sg_offset);
+}
+EXPORT_SYMBOL(ib_map_mr_sg_pi);
+
 /**
  * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list
  *     and set it the memory region.
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 01bc04c8e220..632e133e7a59 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2437,6 +2437,11 @@ struct ib_device_ops {
 	int (*read_counters)(struct ib_counters *counters,
 			     struct ib_counters_read_attr *counters_read_attr,
 			     struct uverbs_attr_bundle *attrs);
+	int (*map_mr_sg_pi)(struct ib_mr *mr, struct scatterlist *data_sg,
+			    int data_sg_nents, unsigned int *data_sg_offset,
+			    struct scatterlist *meta_sg, int meta_sg_nents,
+			    unsigned int *meta_sg_offset);
+
 	/**
 	 * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the
 	 *   driver initialized data.  The struct is kfree()'ed by the sysfs
@@ -4236,6 +4241,10 @@ int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
 
 int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
 		 unsigned int *sg_offset, unsigned int page_size);
+int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg,
+		    int data_sg_nents, unsigned int *data_sg_offset,
+		    struct scatterlist *meta_sg, int meta_sg_nents,
+		    unsigned int *meta_sg_offset, unsigned int page_size);
 
 static inline int
 ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
-- 
cgit v1.2.3


From 7c717d3aeeaabbfddd0fe949b501595a2e3469e4 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:41 +0300
Subject: RDMA/core: Add signature attrs element for ib_mr structure

This element will describe the needed characteristics for the signature
operation per signature enabled memory region (type IB_MR_TYPE_INTEGRITY).
Also add meta_length attribute to ib_sig_attrs structure for saving the
mapped metadata length (needed for the new API implementation).

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs_cmd.c |  1 +
 drivers/infiniband/core/verbs.c      | 13 ++++++++++++-
 include/rdma/ib_verbs.h              |  2 +-
 include/rdma/signature.h             |  2 ++
 4 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 689275c2894f..911533081db5 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -747,6 +747,7 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
 	mr->pd      = pd;
 	mr->type    = IB_MR_TYPE_USER;
 	mr->dm	    = NULL;
+	mr->sig_attrs = NULL;
 	mr->uobject = uobj;
 	atomic_inc(&pd->usecnt);
 	mr->res.type = RDMA_RESTRACK_MR;
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index c892022aa8ea..399c0d17b2b9 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1976,6 +1976,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
 {
 	struct ib_pd *pd = mr->pd;
 	struct ib_dm *dm = mr->dm;
+	struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
 	int ret;
 
 	rdma_restrack_del(&mr->res);
@@ -1984,6 +1985,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
 		atomic_dec(&pd->usecnt);
 		if (dm)
 			atomic_dec(&dm->usecnt);
+		kfree(sig_attrs);
 	}
 
 	return ret;
@@ -2025,6 +2027,7 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
 		mr->res.type = RDMA_RESTRACK_MR;
 		rdma_restrack_kadd(&mr->res);
 		mr->type = mr_type;
+		mr->sig_attrs = NULL;
 	}
 
 	return mr;
@@ -2048,6 +2051,7 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
 				    u32 max_num_meta_sg)
 {
 	struct ib_mr *mr;
+	struct ib_sig_attrs *sig_attrs;
 
 	if (!pd->device->ops.alloc_mr_integrity ||
 	    !pd->device->ops.map_mr_sg_pi)
@@ -2056,10 +2060,16 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
 	if (!max_num_meta_sg)
 		return ERR_PTR(-EINVAL);
 
+	sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL);
+	if (!sig_attrs)
+		return ERR_PTR(-ENOMEM);
+
 	mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg,
 						max_num_meta_sg);
-	if (IS_ERR(mr))
+	if (IS_ERR(mr)) {
+		kfree(sig_attrs);
 		return mr;
+	}
 
 	mr->device = pd->device;
 	mr->pd = pd;
@@ -2070,6 +2080,7 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
 	mr->res.type = RDMA_RESTRACK_MR;
 	rdma_restrack_kadd(&mr->res);
 	mr->type = IB_MR_TYPE_INTEGRITY;
+	mr->sig_attrs = sig_attrs;
 
 	return mr;
 }
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 632e133e7a59..995b217a1940 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1739,7 +1739,7 @@ struct ib_mr {
 	};
 
 	struct ib_dm      *dm;
-
+	struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */
 	/*
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
diff --git a/include/rdma/signature.h b/include/rdma/signature.h
index 5998fe94dfd4..f24cc2a1d3c5 100644
--- a/include/rdma/signature.h
+++ b/include/rdma/signature.h
@@ -80,11 +80,13 @@ struct ib_sig_domain {
  * @check_mask: bitmask for signature byte check (8 bytes)
  * @mem: memory domain layout descriptor.
  * @wire: wire domain layout descriptor.
+ * @meta_length: metadata length
  */
 struct ib_sig_attrs {
 	u8			check_mask;
 	struct ib_sig_domain	mem;
 	struct ib_sig_domain	wire;
+	int			meta_length;
 };
 
 enum ib_sig_err_type {
-- 
cgit v1.2.3


From 62e3c379d4d713dbcb70adc7c65443fd8722b33f Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:43 +0300
Subject: RDMA/mlx5: Add attr for max number page list length for PI operation

PI offload (protection information) is a feature that each RDMA provider
can implement differently. Thus, introduce new device attribute to define
the maximal length of the page list for PI fast registration operation. For
example, mlx5 driver uses a single internal MR to map both data and
protection SGL's, so it's equal to max_fast_reg_page_list_len / 2.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c | 2 ++
 include/rdma/ib_verbs.h           | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 23fedff9f080..bd0322b61362 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1009,6 +1009,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 	props->max_srq_sge	   = max_rq_sg - 1;
 	props->max_fast_reg_page_list_len =
 		1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
+	props->max_pi_fast_reg_page_list_len =
+		props->max_fast_reg_page_list_len / 2;
 	get_atomic_caps_qp(dev, props);
 	props->masked_atomic_cap   = IB_ATOMIC_NONE;
 	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 995b217a1940..9169e798334f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -390,6 +390,7 @@ struct ib_device_attr {
 	int			max_srq_wr;
 	int			max_srq_sge;
 	unsigned int		max_fast_reg_page_list_len;
+	unsigned int		max_pi_fast_reg_page_list_len;
 	u16			max_pkeys;
 	u8			local_ca_ack_delay;
 	int			sig_prot_cap;
-- 
cgit v1.2.3


From 38ca87c6f1e514686d4a385246d1afe1e1f2e482 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:46 +0300
Subject: RDMA/mlx5: Introduce and implement new IB_WR_REG_MR_INTEGRITY work
 request

This new WR will be used to perform PI (protection information) handover
using the new API. Using the new API, the user will post a single WR that
will internally perform all the needed actions to complete PI operation.
This new WR will use a memory region that was allocated as
IB_MR_TYPE_INTEGRITY and was mapped using ib_map_mr_sg_pi to perform the
registration. In the old API, in order to perform a signature handover
operation, each ULP should perform the following:
1. Map and register the data buffers.
2. Map and register the protection buffers.
3. Post a special reg WR to configure the signature handover operation
   layout.
4. Invalidate the signature memory key.
5. Invalidate protection buffers memory key.
6. Invalidate data buffers memory key.

In the new API, the mapping of both data and protection buffers is
performed using a single call to ib_map_mr_sg_pi function. Also the
registration of the buffers and the configuration of the signature
operation layout is done by a single new work request called
IB_WR_REG_MR_INTEGRITY.
This patch implements this operation for mlx5 devices that are capable to
offload data integrity generation/validation while performing the actual
buffer transfer.
This patch will not remove the old signature API that is used by the iSER
initiator and target drivers. This will be done in the future.

In the internal implementation, for each IB_WR_REG_MR_INTEGRITY work
request, we are using a single UMR operation to register both data and
protection buffers using KLM's.
Afterwards, another UMR operation will describe the strided block format.
These will be followed by 2 SET_PSV operations to set the memory/wire
domains initial signature parameters passed by the user.
In the end of the whole transaction, only the signature memory key
(the one that exposed for the RDMA operation) will be invalidated.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/qp.c | 218 ++++++++++++++++++++++++++++++++++++----
 include/linux/mlx5/qp.h         |   3 +-
 include/rdma/ib_verbs.h         |   1 +
 3 files changed, 201 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index ce8fccb04c3c..f6651b93e469 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -4169,7 +4169,7 @@ static __be64 sig_mkey_mask(void)
 static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr,
 			    struct mlx5_ib_mr *mr, u8 flags)
 {
-	int size = mr->ndescs * mr->desc_size;
+	int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
 
 	memset(umr, 0, sizeof(*umr));
 
@@ -4300,7 +4300,7 @@ static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg,
 			     struct mlx5_ib_mr *mr,
 			     u32 key, int access)
 {
-	int ndescs = ALIGN(mr->ndescs, 8) >> 1;
+	int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1;
 
 	memset(seg, 0, sizeof(*seg));
 
@@ -4351,7 +4351,7 @@ static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg,
 			     struct mlx5_ib_mr *mr,
 			     struct mlx5_ib_pd *pd)
 {
-	int bcount = mr->desc_size * mr->ndescs;
+	int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs);
 
 	dseg->addr = cpu_to_be64(mr->desc_map);
 	dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64));
@@ -4544,23 +4544,52 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr,
 	return 0;
 }
 
-static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
-				struct mlx5_ib_qp *qp, void **seg,
-				int *size, void **cur_edge)
+static int set_sig_data_segment(const struct ib_send_wr *send_wr,
+				struct ib_mr *sig_mr,
+				struct ib_sig_attrs *sig_attrs,
+				struct mlx5_ib_qp *qp, void **seg, int *size,
+				void **cur_edge)
 {
-	struct ib_sig_attrs *sig_attrs = wr->sig_attrs;
-	struct ib_mr *sig_mr = wr->sig_mr;
 	struct mlx5_bsf *bsf;
-	u32 data_len = wr->wr.sg_list->length;
-	u32 data_key = wr->wr.sg_list->lkey;
-	u64 data_va = wr->wr.sg_list->addr;
+	u32 data_len;
+	u32 data_key;
+	u64 data_va;
+	u32 prot_len = 0;
+	u32 prot_key = 0;
+	u64 prot_va = 0;
+	bool prot = false;
 	int ret;
 	int wqe_size;
 
-	if (!wr->prot ||
-	    (data_key == wr->prot->lkey &&
-	     data_va == wr->prot->addr &&
-	     data_len == wr->prot->length)) {
+	if (send_wr->opcode == IB_WR_REG_SIG_MR) {
+		const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr);
+
+		data_len = wr->wr.sg_list->length;
+		data_key = wr->wr.sg_list->lkey;
+		data_va = wr->wr.sg_list->addr;
+		if (wr->prot) {
+			prot_len = wr->prot->length;
+			prot_key = wr->prot->lkey;
+			prot_va = wr->prot->addr;
+			prot = true;
+		}
+	} else {
+		struct mlx5_ib_mr *mr = to_mmr(sig_mr);
+		struct mlx5_ib_mr *pi_mr = mr->pi_mr;
+
+		data_len = pi_mr->data_length;
+		data_key = pi_mr->ibmr.lkey;
+		data_va = pi_mr->ibmr.iova;
+		if (pi_mr->meta_ndescs) {
+			prot_len = pi_mr->meta_length;
+			prot_key = pi_mr->ibmr.lkey;
+			prot_va = pi_mr->ibmr.iova + data_len;
+			prot = true;
+		}
+	}
+
+	if (!prot || (data_key == prot_key && data_va == prot_va &&
+		      data_len == prot_len)) {
 		/**
 		 * Source domain doesn't contain signature information
 		 * or data and protection are interleaved in memory.
@@ -4594,8 +4623,6 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
 		struct mlx5_stride_block_ctrl_seg *sblock_ctrl;
 		struct mlx5_stride_block_entry *data_sentry;
 		struct mlx5_stride_block_entry *prot_sentry;
-		u32 prot_key = wr->prot->lkey;
-		u64 prot_va = wr->prot->addr;
 		u16 block_size = sig_attrs->mem.sig.dif.pi_interval;
 		int prot_size;
 
@@ -4673,6 +4700,56 @@ static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
 	umr->mkey_mask = sig_mkey_mask();
 }
 
+static int set_pi_umr_wr(const struct ib_send_wr *send_wr,
+			 struct mlx5_ib_qp *qp, void **seg, int *size,
+			 void **cur_edge)
+{
+	const struct ib_reg_wr *wr = reg_wr(send_wr);
+	struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr);
+	struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr;
+	struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs;
+	u32 pdn = get_pd(qp)->pdn;
+	u32 xlt_size;
+	int region_len, ret;
+
+	if (unlikely(send_wr->num_sge != 0) ||
+	    unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) ||
+	    unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) ||
+	    unlikely(!sig_mr->sig->sig_status_checked))
+		return -EINVAL;
+
+	/* length of the protected region, data + protection */
+	region_len = pi_mr->ibmr.length;
+
+	/**
+	 * KLM octoword size - if protection was provided
+	 * then we use strided block format (3 octowords),
+	 * else we use single KLM (1 octoword)
+	 **/
+	if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE)
+		xlt_size = 0x30;
+	else
+		xlt_size = sizeof(struct mlx5_klm);
+
+	set_sig_umr_segment(*seg, xlt_size);
+	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+
+	set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len,
+			     pdn);
+	*seg += sizeof(struct mlx5_mkey_seg);
+	*size += sizeof(struct mlx5_mkey_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+
+	ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size,
+				   cur_edge);
+	if (ret)
+		return ret;
+
+	sig_mr->sig->sig_status_checked = false;
+	return 0;
+}
 
 static int set_sig_umr_wr(const struct ib_send_wr *send_wr,
 			  struct mlx5_ib_qp *qp, void **seg, int *size,
@@ -4716,7 +4793,8 @@ static int set_sig_umr_wr(const struct ib_send_wr *send_wr,
 	*size += sizeof(struct mlx5_mkey_seg) / 16;
 	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 
-	ret = set_sig_data_segment(wr, qp, seg, size, cur_edge);
+	ret = set_sig_data_segment(send_wr, wr->sig_mr, wr->sig_attrs, qp, seg,
+				   size, cur_edge);
 	if (ret)
 		return ret;
 
@@ -4758,7 +4836,7 @@ static int set_reg_wr(struct mlx5_ib_qp *qp,
 {
 	struct mlx5_ib_mr *mr = to_mmr(wr->mr);
 	struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd);
-	size_t mr_list_size = mr->ndescs * mr->desc_size;
+	int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
 	bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD;
 	u8 flags = 0;
 
@@ -4899,8 +4977,11 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 	struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx5_core_dev *mdev = dev->mdev;
+	struct ib_reg_wr reg_pi_wr;
 	struct mlx5_ib_qp *qp;
 	struct mlx5_ib_mr *mr;
+	struct mlx5_ib_mr *pi_mr;
+	struct ib_sig_attrs *sig_attrs;
 	struct mlx5_wqe_xrc_seg *xrc;
 	struct mlx5_bf *bf;
 	void *cur_edge;
@@ -4954,7 +5035,8 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 			goto out;
 		}
 
-		if (wr->opcode == IB_WR_REG_MR) {
+		if (wr->opcode == IB_WR_REG_MR ||
+		    wr->opcode == IB_WR_REG_MR_INTEGRITY) {
 			fence = dev->umr_fence;
 			next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
 		} else  {
@@ -5012,6 +5094,102 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 				num_sge = 0;
 				break;
 
+			case IB_WR_REG_MR_INTEGRITY:
+				memset(&reg_pi_wr, 0, sizeof(struct ib_reg_wr));
+
+				mr = to_mmr(reg_wr(wr)->mr);
+				pi_mr = mr->pi_mr;
+
+				reg_pi_wr.mr = &pi_mr->ibmr;
+				reg_pi_wr.access = reg_wr(wr)->access;
+				reg_pi_wr.key = pi_mr->ibmr.rkey;
+
+				qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY;
+				ctrl->imm = cpu_to_be32(reg_pi_wr.key);
+				/* UMR for data + protection registration */
+				err = set_reg_wr(qp, &reg_pi_wr, &seg, &size,
+						 &cur_edge, false);
+				if (err) {
+					*bad_wr = wr;
+					goto out;
+				}
+				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+					   wr->wr_id, nreq, fence,
+					   MLX5_OPCODE_UMR);
+
+				err = begin_wqe(qp, &seg, &ctrl, wr, &idx,
+						&size, &cur_edge, nreq);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					err = -ENOMEM;
+					*bad_wr = wr;
+					goto out;
+				}
+				ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
+				/* UMR for sig MR */
+				err = set_pi_umr_wr(wr, qp, &seg, &size,
+						    &cur_edge);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+					   wr->wr_id, nreq, fence,
+					   MLX5_OPCODE_UMR);
+
+				/*
+				 * SET_PSV WQEs are not signaled and solicited
+				 * on error
+				 */
+				sig_attrs = mr->ibmr.sig_attrs;
+				err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
+						  &size, &cur_edge, nreq, false,
+						  true);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					err = -ENOMEM;
+					*bad_wr = wr;
+					goto out;
+				}
+				err = set_psv_wr(&sig_attrs->mem,
+						 mr->sig->psv_memory.psv_idx,
+						 &seg, &size);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+					   wr->wr_id, nreq, next_fence,
+					   MLX5_OPCODE_SET_PSV);
+
+				err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
+						  &size, &cur_edge, nreq, false,
+						  true);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					err = -ENOMEM;
+					*bad_wr = wr;
+					goto out;
+				}
+				err = set_psv_wr(&sig_attrs->wire,
+						 mr->sig->psv_wire.psv_idx,
+						 &seg, &size);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+					   wr->wr_id, nreq, next_fence,
+					   MLX5_OPCODE_SET_PSV);
+
+				qp->next_fence =
+					MLX5_FENCE_MODE_INITIATOR_SMALL;
+				num_sge = 0;
+				goto skip_psv;
+
 			case IB_WR_REG_SIG_MR:
 				qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR;
 				mr = to_mmr(sig_handover_wr(wr)->sig_mr);
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 3ba4edbd17a6..08e43cd9e742 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -37,7 +37,8 @@
 #include <linux/mlx5/driver.h>
 
 #define MLX5_INVALID_LKEY	0x100
-#define MLX5_SIG_WQE_SIZE	(MLX5_SEND_WQE_BB * 5)
+/* UMR (3 WQE_BB's) + SIG (3 WQE_BB's) + PSV (mem) + PSV (wire) */
+#define MLX5_SIG_WQE_SIZE	(MLX5_SEND_WQE_BB * 8)
 #define MLX5_DIF_SIZE		8
 #define MLX5_STRIDE_BLOCK_OP	0x400
 #define MLX5_CPY_GRD_MASK	0xc0
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9169e798334f..28db256cbdb9 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1236,6 +1236,7 @@ enum ib_wr_opcode {
 	/* These are kernel only and can not be issued by userspace */
 	IB_WR_REG_MR = 0x20,
 	IB_WR_REG_SIG_MR,
+	IB_WR_REG_MR_INTEGRITY,
 
 	/* reserve values for low level drivers' internal use.
 	 * These values will not be used at all in the ib core layer.
-- 
cgit v1.2.3


From 5a6781a558cc3909851c04a0d44e3a87a35aad94 Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:49 +0300
Subject: RDMA/core: Add an integrity MR pool support

This is a preparation for adding new signature API to the rw-API.

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/mr_pool.c | 8 ++++++--
 drivers/infiniband/core/rw.c      | 4 ++--
 drivers/nvme/host/rdma.c          | 2 +-
 include/rdma/mr_pool.h            | 2 +-
 4 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c
index 49d478b2ea94..132ff92626e1 100644
--- a/drivers/infiniband/core/mr_pool.c
+++ b/drivers/infiniband/core/mr_pool.c
@@ -42,14 +42,18 @@ void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr)
 EXPORT_SYMBOL(ib_mr_pool_put);
 
 int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
-		enum ib_mr_type type, u32 max_num_sg)
+		enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg)
 {
 	struct ib_mr *mr;
 	unsigned long flags;
 	int ret, i;
 
 	for (i = 0; i < nr; i++) {
-		mr = ib_alloc_mr(qp->pd, type, max_num_sg);
+		if (type == IB_MR_TYPE_INTEGRITY)
+			mr = ib_alloc_mr_integrity(qp->pd, max_num_sg,
+						   max_num_meta_sg);
+		else
+			mr = ib_alloc_mr(qp->pd, type, max_num_sg);
 		if (IS_ERR(mr)) {
 			ret = PTR_ERR(mr);
 			goto out;
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index acf9ea625811..f825990bacfa 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -719,7 +719,7 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 	if (nr_mrs) {
 		ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
 				IB_MR_TYPE_MEM_REG,
-				rdma_rw_fr_page_list_len(dev));
+				rdma_rw_fr_page_list_len(dev), 0);
 		if (ret) {
 			pr_err("%s: failed to allocated %d MRs\n",
 				__func__, nr_mrs);
@@ -729,7 +729,7 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 
 	if (nr_sig_mrs) {
 		ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
-				IB_MR_TYPE_SIGNATURE, 2);
+				IB_MR_TYPE_SIGNATURE, 2, 0);
 		if (ret) {
 			pr_err("%s: failed to allocated %d SIG MRs\n",
 				__func__, nr_sig_mrs);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f383146e7d0f..0e033b621daf 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -486,7 +486,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 	ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
 			      queue->queue_size,
 			      IB_MR_TYPE_MEM_REG,
-			      nvme_rdma_get_max_fr_pages(ibdev));
+			      nvme_rdma_get_max_fr_pages(ibdev), 0);
 	if (ret) {
 		dev_err(queue->ctrl->ctrl.device,
 			"failed to initialize MR pool sized %d for QID %d\n",
diff --git a/include/rdma/mr_pool.h b/include/rdma/mr_pool.h
index 986010b812eb..2c042e6046d1 100644
--- a/include/rdma/mr_pool.h
+++ b/include/rdma/mr_pool.h
@@ -19,7 +19,7 @@ struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list);
 void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr);
 
 int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
-		enum ib_mr_type type, u32 max_num_sg);
+		enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg);
 void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list);
 
 #endif /* _RDMA_MR_POOL_H */
-- 
cgit v1.2.3


From c0a6cbb9cbccffc249743afa16e64f16c46c80b2 Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:50 +0300
Subject: RDMA/core: Rename signature qp create flag and signature device
 capability

Rename IB_QP_CREATE_SIGNATURE_EN to IB_QP_CREATE_INTEGRITY_EN
and IB_DEVICE_SIGNATURE_HANDOVER to IB_DEVICE_INTEGRITY_HANDOVER.

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/rw.c             |  4 ++--
 drivers/infiniband/hw/mlx5/main.c        |  2 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h     |  3 +--
 drivers/infiniband/hw/mlx5/qp.c          | 14 +++++++-------
 drivers/infiniband/ulp/iser/iser_verbs.c |  4 ++--
 drivers/infiniband/ulp/isert/ib_isert.c  |  4 ++--
 include/rdma/ib_verbs.h                  |  4 ++--
 7 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index f825990bacfa..b45b49a2ccfc 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -688,7 +688,7 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
 	 * we'll need two additional MRs for the registrations and the
 	 * invalidation.
 	 */
-	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
+	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN)
 		factor += 6;	/* (inv + reg) * (data + prot + sig) */
 	else if (rdma_rw_can_use_mr(dev, attr->port_num))
 		factor += 2;	/* inv + reg */
@@ -709,7 +709,7 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 	u32 nr_mrs = 0, nr_sig_mrs = 0;
 	int ret = 0;
 
-	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) {
+	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) {
 		nr_sig_mrs = attr->cap.max_rdma_ctxs;
 		nr_mrs = attr->cap.max_rdma_ctxs * 2;
 	} else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index bd0322b61362..9aa10cfbc064 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -889,7 +889,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 	}
 	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
 	if (MLX5_CAP_GEN(mdev, sho)) {
-		props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
+		props->device_cap_flags |= IB_DEVICE_INTEGRITY_HANDOVER;
 		/* At this stage no support for signature handover */
 		props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
 				      IB_PROT_T10DIF_TYPE_2 |
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 7980814f355d..5999792b5698 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -431,8 +431,7 @@ struct mlx5_ib_qp {
 
 	int			create_type;
 
-	/* Store signature errors */
-	bool			signature_en;
+	bool			integrity_en;
 
 	struct list_head	qps_list;
 	struct list_head	cq_recv_list;
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index f6651b93e469..879162da63e3 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -442,9 +442,9 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr)
 	}
 
 	size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg);
-	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN &&
+	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN &&
 	    ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE)
-			return MLX5_SIG_WQE_SIZE;
+		return MLX5_SIG_WQE_SIZE;
 	else
 		return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB);
 }
@@ -496,8 +496,8 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
 			      sizeof(struct mlx5_wqe_inline_seg);
 	attr->cap.max_inline_data = qp->max_inline_data;
 
-	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
-		qp->signature_en = true;
+	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN)
+		qp->integrity_en = true;
 
 	wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size);
 	qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
@@ -1039,7 +1039,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
 	void *qpc;
 	int err;
 
-	if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN |
+	if (init_attr->create_flags & ~(IB_QP_CREATE_INTEGRITY_EN |
 					IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
 					IB_QP_CREATE_IPOIB_UD_LSO |
 					IB_QP_CREATE_NETIF_QP |
@@ -4714,7 +4714,7 @@ static int set_pi_umr_wr(const struct ib_send_wr *send_wr,
 
 	if (unlikely(send_wr->num_sge != 0) ||
 	    unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) ||
-	    unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) ||
+	    unlikely(!sig_mr->sig) || unlikely(!qp->integrity_en) ||
 	    unlikely(!sig_mr->sig->sig_status_checked))
 		return -EINVAL;
 
@@ -4763,7 +4763,7 @@ static int set_sig_umr_wr(const struct ib_send_wr *send_wr,
 
 	if (unlikely(wr->wr.num_sge != 1) ||
 	    unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) ||
-	    unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) ||
+	    unlikely(!sig_mr->sig) || unlikely(!qp->integrity_en) ||
 	    unlikely(!sig_mr->sig->sig_status_checked))
 		return -EINVAL;
 
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index ea9cf04ad002..a6548de0e218 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -399,7 +399,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
 	init_attr.qp_type	= IB_QPT_RC;
 	if (ib_conn->pi_support) {
 		init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
-		init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
+		init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
 		iser_conn->max_cmds =
 			ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS);
 	} else {
@@ -712,7 +712,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
 	/* connection T10-PI support */
 	if (iser_pi_enable) {
 		if (!(device->ib_device->attrs.device_cap_flags &
-		      IB_DEVICE_SIGNATURE_HANDOVER)) {
+		      IB_DEVICE_INTEGRITY_HANDOVER)) {
 			iser_warn("T10-PI requested but not supported on %s, "
 				  "continue without T10-PI\n",
 				  dev_name(&ib_conn->device->ib_device->dev));
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index ffef4ac152ca..53bc505f5292 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -141,7 +141,7 @@ isert_create_qp(struct isert_conn *isert_conn,
 	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 	attr.qp_type = IB_QPT_RC;
 	if (device->pi_capable)
-		attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
+		attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
 
 	ret = rdma_create_qp(cma_id, device->pd, &attr);
 	if (ret) {
@@ -317,7 +317,7 @@ isert_create_device_ib_res(struct isert_device *device)
 
 	/* Check signature cap */
 	device->pi_capable = ib_dev->attrs.device_cap_flags &
-			     IB_DEVICE_SIGNATURE_HANDOVER ? true : false;
+			     IB_DEVICE_INTEGRITY_HANDOVER ? true : false;
 
 	return 0;
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 28db256cbdb9..6de0ea1aafc3 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -253,7 +253,7 @@ enum ib_device_cap_flags {
 	 */
 	IB_DEVICE_CROSS_CHANNEL			= (1 << 27),
 	IB_DEVICE_MANAGED_FLOW_STEERING		= (1 << 29),
-	IB_DEVICE_SIGNATURE_HANDOVER		= (1 << 30),
+	IB_DEVICE_INTEGRITY_HANDOVER		= (1 << 30),
 	IB_DEVICE_ON_DEMAND_PAGING		= (1ULL << 31),
 	IB_DEVICE_SG_GAPS_REG			= (1ULL << 32),
 	IB_DEVICE_VIRTUAL_FUNCTION		= (1ULL << 33),
@@ -1056,7 +1056,7 @@ enum ib_qp_create_flags {
 	IB_QP_CREATE_MANAGED_SEND               = 1 << 3,
 	IB_QP_CREATE_MANAGED_RECV               = 1 << 4,
 	IB_QP_CREATE_NETIF_QP			= 1 << 5,
-	IB_QP_CREATE_SIGNATURE_EN		= 1 << 6,
+	IB_QP_CREATE_INTEGRITY_EN		= 1 << 6,
 	/* FREE					= 1 << 7, */
 	IB_QP_CREATE_SCATTER_FCS		= 1 << 8,
 	IB_QP_CREATE_CVLAN_STRIPPING		= 1 << 9,
-- 
cgit v1.2.3


From 185eddc45798b9f73e5470964948d79b4c8df4b7 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:51 +0300
Subject: RDMA/core: Validate integrity handover device cap

Protect the case that a ULP tries to allocate a QP with signature
enabled flag while the LLD doesn't support this feature.
While we're here, also move integrity_en attribute from mlx5_qp to
ib_qp as a preparation for adding new integrity API to the rw-API
(that is part of ib_core module).

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/verbs.c      | 6 ++++++
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 --
 drivers/infiniband/hw/mlx5/qp.c      | 7 ++-----
 include/rdma/ib_verbs.h              | 1 +
 4 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 399c0d17b2b9..4a04e94a72db 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1158,6 +1158,10 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
 	    qp_init_attr->cap.max_recv_sge))
 		return ERR_PTR(-EINVAL);
 
+	if ((qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) &&
+	    !(device->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER))
+		return ERR_PTR(-EINVAL);
+
 	/*
 	 * If the callers is using the RDMA API calculate the resources
 	 * needed for the RDMA READ/WRITE operations.
@@ -1233,6 +1237,8 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
 	qp->max_write_sge = qp_init_attr->cap.max_send_sge;
 	qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
 				 device->attrs.max_sge_rd);
+	if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN)
+		qp->integrity_en = true;
 
 	return qp;
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 5999792b5698..d418219e68c6 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -431,8 +431,6 @@ struct mlx5_ib_qp {
 
 	int			create_type;
 
-	bool			integrity_en;
-
 	struct list_head	qps_list;
 	struct list_head	cq_recv_list;
 	struct list_head	cq_send_list;
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 879162da63e3..d77a64c551ea 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -496,9 +496,6 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
 			      sizeof(struct mlx5_wqe_inline_seg);
 	attr->cap.max_inline_data = qp->max_inline_data;
 
-	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN)
-		qp->integrity_en = true;
-
 	wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size);
 	qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
 	if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) {
@@ -4714,7 +4711,7 @@ static int set_pi_umr_wr(const struct ib_send_wr *send_wr,
 
 	if (unlikely(send_wr->num_sge != 0) ||
 	    unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) ||
-	    unlikely(!sig_mr->sig) || unlikely(!qp->integrity_en) ||
+	    unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) ||
 	    unlikely(!sig_mr->sig->sig_status_checked))
 		return -EINVAL;
 
@@ -4763,7 +4760,7 @@ static int set_sig_umr_wr(const struct ib_send_wr *send_wr,
 
 	if (unlikely(wr->wr.num_sge != 1) ||
 	    unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) ||
-	    unlikely(!sig_mr->sig) || unlikely(!qp->integrity_en) ||
+	    unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) ||
 	    unlikely(!sig_mr->sig->sig_status_checked))
 		return -EINVAL;
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6de0ea1aafc3..14b5eab76ed8 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1711,6 +1711,7 @@ struct ib_qp {
 	struct ib_qp_security  *qp_sec;
 	u8			port;
 
+	bool			integrity_en;
 	/*
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
-- 
cgit v1.2.3


From e9a53e73a2507f3a1680538bd167b2e49533659a Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:53 +0300
Subject: RDMA/rw: Use IB_WR_REG_MR_INTEGRITY for PI handover

Replace the old signature handover API with the new one. The new API
simplifes PI handover code complexity for ULPs and improve performance.
For RW API it will reduce the maximum number of work requests per task
and the need of dealing with multiple MRs (and their registrations and
invalidations) per task. All the mappings and registration of the data
and the protection buffers is done by the LLD using a single WR and a
special MR type (IB_MR_TYPE_INTEGRITY) for the PI handover operation.

The setup of the tested benchmark (using iSER ULP):
 - 2 servers with 24 cores (1 initiator and 1 target)
 - ConnectX-4/ConnectX-5 adapters
 - 24 target sessions with 1 LUN each
 - ramdisk backstore
 - PI active

Performance results running fio (24 jobs, 128 iodepth) using
write_generate=1 and read_verify=1 (w/w.o patch):

bs      IOPS(read)        IOPS(write)
----    ----------        ----------
512   1243.3K/1182.3K    1725.1K/1680.2K
4k    571233/528835      743293/748259
32k   72388/71086        71789/93573

Using write_generate=0 and read_verify=0 (w/w.o patch):
bs      IOPS(read)        IOPS(write)
----    ----------        ----------
512   1572.1K/1427.2K    1823.5K/1724.3K
4k    921992/916194      753772/768267
32k   75052/73960        73180/95484

There is a performance degradation when writing big block sizes.
Degradation is caused by the complexity of combining multiple
indirections and perform RDMA READ operation from it. This will be
fixed in the following patches by reducing the indirections if
possible.

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/rw.c            | 167 +++++++++++++-------------------
 drivers/infiniband/ulp/isert/ib_isert.c |   4 +-
 include/rdma/rw.h                       |   9 --
 3 files changed, 72 insertions(+), 108 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index f87f0de82e24..cc5feb301c05 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -59,10 +59,18 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
 	return false;
 }
 
-static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
+static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev,
+					   bool pi_support)
 {
+	u32 max_pages;
+
+	if (pi_support)
+		max_pages = dev->attrs.max_pi_fast_reg_page_list_len;
+	else
+		max_pages = dev->attrs.max_fast_reg_page_list_len;
+
 	/* arbitrary limit to avoid allocating gigantic resources */
-	return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
+	return min_t(u32, max_pages, 256);
 }
 
 static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg)
@@ -86,7 +94,8 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
 		struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
 		u32 sg_cnt, u32 offset)
 {
-	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+						    qp->integrity_en);
 	u32 nents = min(sg_cnt, pages_per_mr);
 	int count = 0, ret;
 
@@ -119,7 +128,8 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
 	struct rdma_rw_reg_ctx *prev = NULL;
-	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+						    qp->integrity_en);
 	int i, j, ret = 0, count = 0;
 
 	ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr;
@@ -360,9 +370,9 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
 	struct ib_device *dev = qp->pd->device;
-	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+						    qp->integrity_en);
 	struct ib_rdma_wr *rdma_wr;
-	struct ib_send_wr *prev_wr = NULL;
 	int count = 0, ret;
 
 	if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
@@ -376,75 +386,58 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		return -ENOMEM;
 	sg_cnt = ret;
 
-	ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
-	if (!ret) {
-		ret = -ENOMEM;
-		goto out_unmap_sg;
+	if (prot_sg_cnt) {
+		ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
+		if (!ret) {
+			ret = -ENOMEM;
+			goto out_unmap_sg;
+		}
+		prot_sg_cnt = ret;
 	}
-	prot_sg_cnt = ret;
 
 	ctx->type = RDMA_RW_SIG_MR;
 	ctx->nr_ops = 1;
-	ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL);
-	if (!ctx->sig) {
+	ctx->reg = kcalloc(1, sizeof(*ctx->reg), GFP_KERNEL);
+	if (!ctx->reg) {
 		ret = -ENOMEM;
 		goto out_unmap_prot_sg;
 	}
 
-	ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0);
-	if (ret < 0)
-		goto out_free_ctx;
-	count += ret;
-	prev_wr = &ctx->sig->data.reg_wr.wr;
-
-	ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot,
-				  prot_sg, prot_sg_cnt, 0);
-	if (ret < 0)
-		goto out_destroy_data_mr;
-	count += ret;
-
-	if (ctx->sig->prot.inv_wr.next)
-		prev_wr->next = &ctx->sig->prot.inv_wr;
-	else
-		prev_wr->next = &ctx->sig->prot.reg_wr.wr;
-	prev_wr = &ctx->sig->prot.reg_wr.wr;
-
-	ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs);
-	if (!ctx->sig->sig_mr) {
+	ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs);
+	if (!ctx->reg->mr) {
 		ret = -EAGAIN;
-		goto out_destroy_prot_mr;
+		goto out_free_ctx;
 	}
 
-	if (ctx->sig->sig_mr->need_inval) {
-		memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr));
+	count += rdma_rw_inv_key(ctx->reg);
 
-		ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV;
-		ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey;
+	memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));
 
-		prev_wr->next = &ctx->sig->sig_inv_wr;
-		prev_wr = &ctx->sig->sig_inv_wr;
+	ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sg_cnt, NULL, prot_sg,
+			      prot_sg_cnt, NULL, SZ_4K);
+	if (unlikely(ret)) {
+		pr_err("failed to map PI sg (%d)\n", sg_cnt + prot_sg_cnt);
+		goto out_destroy_sig_mr;
 	}
 
-	ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
-	ctx->sig->sig_wr.wr.wr_cqe = NULL;
-	ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge;
-	ctx->sig->sig_wr.wr.num_sge = 1;
-	ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
-	ctx->sig->sig_wr.sig_attrs = sig_attrs;
-	ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr;
-	if (prot_sg_cnt)
-		ctx->sig->sig_wr.prot = &ctx->sig->prot.sge;
-	prev_wr->next = &ctx->sig->sig_wr.wr;
-	prev_wr = &ctx->sig->sig_wr.wr;
+	ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
+	ctx->reg->reg_wr.wr.wr_cqe = NULL;
+	ctx->reg->reg_wr.wr.num_sge = 0;
+	ctx->reg->reg_wr.wr.send_flags = 0;
+	ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+	if (rdma_protocol_iwarp(qp->device, port_num))
+		ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+	ctx->reg->reg_wr.mr = ctx->reg->mr;
+	ctx->reg->reg_wr.key = ctx->reg->mr->lkey;
 	count++;
 
-	ctx->sig->sig_sge.addr = 0;
-	ctx->sig->sig_sge.length = ctx->sig->data.sge.length;
-	if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE)
-		ctx->sig->sig_sge.length += ctx->sig->prot.sge.length;
+	ctx->reg->sge.addr = ctx->reg->mr->iova;
+	ctx->reg->sge.length = ctx->reg->mr->length;
+	if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE)
+		ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length;
 
-	rdma_wr = &ctx->sig->data.wr;
-	rdma_wr->wr.sg_list = &ctx->sig->sig_sge;
+	rdma_wr = &ctx->reg->wr;
+	rdma_wr->wr.sg_list = &ctx->reg->sge;
 	rdma_wr->wr.num_sge = 1;
 	rdma_wr->remote_addr = remote_addr;
 	rdma_wr->rkey = rkey;
@@ -452,21 +445,18 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
 	else
 		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
-	prev_wr->next = &rdma_wr->wr;
-	prev_wr = &rdma_wr->wr;
+	ctx->reg->reg_wr.wr.next = &rdma_wr->wr;
 	count++;
 
 	return count;
 
-out_destroy_prot_mr:
-	if (prot_sg_cnt)
-		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
-out_destroy_data_mr:
-	ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+out_destroy_sig_mr:
+	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
 out_free_ctx:
-	kfree(ctx->sig);
+	kfree(ctx->reg);
 out_unmap_prot_sg:
-	ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+	if (prot_sg_cnt)
+		ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
 out_unmap_sg:
 	ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
 	return ret;
@@ -509,22 +499,8 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 
 	switch (ctx->type) {
 	case RDMA_RW_SIG_MR:
-		rdma_rw_update_lkey(&ctx->sig->data, true);
-		if (ctx->sig->prot.mr)
-			rdma_rw_update_lkey(&ctx->sig->prot, true);
-	
-		ctx->sig->sig_mr->need_inval = true;
-		ib_update_fast_reg_key(ctx->sig->sig_mr,
-			ib_inc_rkey(ctx->sig->sig_mr->lkey));
-		ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey;
-
-		if (ctx->sig->data.inv_wr.next)
-			first_wr = &ctx->sig->data.inv_wr;
-		else
-			first_wr = &ctx->sig->data.reg_wr.wr;
-		last_wr = &ctx->sig->data.wr.wr;
-		break;
 	case RDMA_RW_MR:
+		/* fallthrough */
 		for (i = 0; i < ctx->nr_ops; i++) {
 			rdma_rw_update_lkey(&ctx->reg[i],
 				ctx->reg[i].wr.wr.opcode !=
@@ -641,16 +617,12 @@ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 	if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
 		return;
 
-	ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
-	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
+	kfree(ctx->reg);
 
-	if (ctx->sig->prot.mr) {
-		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+	if (prot_sg_cnt)
 		ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
-	}
-
-	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr);
-	kfree(ctx->sig);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
 
@@ -671,7 +643,7 @@ unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num,
 	unsigned int mr_pages;
 
 	if (rdma_rw_can_use_mr(device, port_num))
-		mr_pages = rdma_rw_fr_page_list_len(device);
+		mr_pages = rdma_rw_fr_page_list_len(device, false);
 	else
 		mr_pages = device->attrs.max_sge_rd;
 	return DIV_ROUND_UP(maxpages, mr_pages);
@@ -697,9 +669,8 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
 	 * we'll need two additional MRs for the registrations and the
 	 * invalidation.
 	 */
-	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN)
-		factor += 6;	/* (inv + reg) * (data + prot + sig) */
-	else if (rdma_rw_can_use_mr(dev, attr->port_num))
+	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN ||
+	    rdma_rw_can_use_mr(dev, attr->port_num))
 		factor += 2;	/* inv + reg */
 
 	attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
@@ -715,20 +686,22 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
 int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 {
 	struct ib_device *dev = qp->pd->device;
-	u32 nr_mrs = 0, nr_sig_mrs = 0;
+	u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0;
 	int ret = 0;
 
 	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) {
 		nr_sig_mrs = attr->cap.max_rdma_ctxs;
-		nr_mrs = attr->cap.max_rdma_ctxs * 2;
+		nr_mrs = attr->cap.max_rdma_ctxs;
+		max_num_sg = rdma_rw_fr_page_list_len(dev, true);
 	} else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
 		nr_mrs = attr->cap.max_rdma_ctxs;
+		max_num_sg = rdma_rw_fr_page_list_len(dev, false);
 	}
 
 	if (nr_mrs) {
 		ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
 				IB_MR_TYPE_MEM_REG,
-				rdma_rw_fr_page_list_len(dev), 0);
+				max_num_sg, 0);
 		if (ret) {
 			pr_err("%s: failed to allocated %d MRs\n",
 				__func__, nr_mrs);
@@ -738,7 +711,7 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 
 	if (nr_sig_mrs) {
 		ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
-				IB_MR_TYPE_SIGNATURE, 2, 0);
+				IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg);
 		if (ret) {
 			pr_err("%s: failed to allocated %d SIG MRs\n",
 				__func__, nr_sig_mrs);
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 53bc505f5292..4b4998e888b9 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -1677,7 +1677,7 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
 
 	isert_dbg("Cmd %p\n", isert_cmd);
 
-	ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr);
+	ret = isert_check_pi_status(cmd, isert_cmd->rw.reg->mr);
 	isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
 
 	if (ret) {
@@ -1723,7 +1723,7 @@ isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
 	iscsit_stop_dataout_timer(cmd);
 
 	if (isert_prot_cmd(isert_conn, se_cmd))
-		ret = isert_check_pi_status(se_cmd, isert_cmd->rw.sig->sig_mr);
+		ret = isert_check_pi_status(se_cmd, isert_cmd->rw.reg->mr);
 	isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
 	cmd->write_data_done = 0;
 
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
index a3cbbc7b6417..bcb221241b5d 100644
--- a/include/rdma/rw.h
+++ b/include/rdma/rw.h
@@ -47,15 +47,6 @@ struct rdma_rw_ctx {
 			struct ib_send_wr	inv_wr;
 			struct ib_mr		*mr;
 		} *reg;
-
-		struct {
-			struct rdma_rw_reg_ctx	data;
-			struct rdma_rw_reg_ctx	prot;
-			struct ib_send_wr	sig_inv_wr;
-			struct ib_mr		*sig_mr;
-			struct ib_sge		sig_sge;
-			struct ib_sig_handover_wr sig_wr;
-		} *sig;
 	};
 };
 
-- 
cgit v1.2.3


From 5c171cbe3ab3d1390290eaa85e7b371cc26b1122 Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@mellanox.com>
Date: Tue, 11 Jun 2019 18:52:54 +0300
Subject: RDMA/mlx5: Remove unused IB_WR_REG_SIG_MR code

IB_WR_REG_SIG_MR is not needed after IB_WR_REG_MR_INTEGRITY
was used.

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mr.c           |  15 ++-
 drivers/infiniband/hw/mlx5/qp.c           | 154 ++----------------------------
 drivers/infiniband/hw/vmw_pvrdma/pvrdma.h |   2 +-
 include/rdma/ib_verbs.h                   |  19 ----
 4 files changed, 17 insertions(+), 173 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index af8ae1e76fd4..36d1d6f8bb47 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1760,8 +1760,7 @@ static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
 			goto err_free_in;
 		mr->desc_size = sizeof(struct mlx5_klm);
 		mr->max_descs = ndescs;
-	} else if (mr_type == IB_MR_TYPE_SIGNATURE ||
-		   mr_type == IB_MR_TYPE_INTEGRITY) {
+	} else if (mr_type == IB_MR_TYPE_INTEGRITY) {
 		u32 psv_index[2];
 
 		MLX5_SET(mkc, mkc, bsf_en, 1);
@@ -1787,13 +1786,11 @@ static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
 		mr->sig->sig_err_exists = false;
 		/* Next UMR, Arm SIGERR */
 		++mr->sig->sigerr_count;
-		if (mr_type == IB_MR_TYPE_INTEGRITY) {
-			mr->pi_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg,
-							max_num_meta_sg);
-			if (IS_ERR(mr->pi_mr)) {
-				err = PTR_ERR(mr->pi_mr);
-				goto err_destroy_psv;
-			}
+		mr->pi_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg,
+						max_num_meta_sg);
+		if (IS_ERR(mr->pi_mr)) {
+			err = PTR_ERR(mr->pi_mr);
+			goto err_destroy_psv;
 		}
 	} else {
 		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index d77a64c551ea..60536c9c008f 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -4557,32 +4557,17 @@ static int set_sig_data_segment(const struct ib_send_wr *send_wr,
 	bool prot = false;
 	int ret;
 	int wqe_size;
+	struct mlx5_ib_mr *mr = to_mmr(sig_mr);
+	struct mlx5_ib_mr *pi_mr = mr->pi_mr;
 
-	if (send_wr->opcode == IB_WR_REG_SIG_MR) {
-		const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr);
-
-		data_len = wr->wr.sg_list->length;
-		data_key = wr->wr.sg_list->lkey;
-		data_va = wr->wr.sg_list->addr;
-		if (wr->prot) {
-			prot_len = wr->prot->length;
-			prot_key = wr->prot->lkey;
-			prot_va = wr->prot->addr;
-			prot = true;
-		}
-	} else {
-		struct mlx5_ib_mr *mr = to_mmr(sig_mr);
-		struct mlx5_ib_mr *pi_mr = mr->pi_mr;
-
-		data_len = pi_mr->data_length;
-		data_key = pi_mr->ibmr.lkey;
-		data_va = pi_mr->ibmr.iova;
-		if (pi_mr->meta_ndescs) {
-			prot_len = pi_mr->meta_length;
-			prot_key = pi_mr->ibmr.lkey;
-			prot_va = pi_mr->ibmr.iova + data_len;
-			prot = true;
-		}
+	data_len = pi_mr->data_length;
+	data_key = pi_mr->ibmr.lkey;
+	data_va = pi_mr->ibmr.iova;
+	if (pi_mr->meta_ndescs) {
+		prot_len = pi_mr->meta_length;
+		prot_key = pi_mr->ibmr.lkey;
+		prot_va = pi_mr->ibmr.iova + data_len;
+		prot = true;
 	}
 
 	if (!prot || (data_key == prot_key && data_va == prot_va &&
@@ -4748,57 +4733,6 @@ static int set_pi_umr_wr(const struct ib_send_wr *send_wr,
 	return 0;
 }
 
-static int set_sig_umr_wr(const struct ib_send_wr *send_wr,
-			  struct mlx5_ib_qp *qp, void **seg, int *size,
-			  void **cur_edge)
-{
-	const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr);
-	struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr);
-	u32 pdn = get_pd(qp)->pdn;
-	u32 xlt_size;
-	int region_len, ret;
-
-	if (unlikely(wr->wr.num_sge != 1) ||
-	    unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) ||
-	    unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) ||
-	    unlikely(!sig_mr->sig->sig_status_checked))
-		return -EINVAL;
-
-	/* length of the protected region, data + protection */
-	region_len = wr->wr.sg_list->length;
-	if (wr->prot &&
-	    (wr->prot->lkey != wr->wr.sg_list->lkey  ||
-	     wr->prot->addr != wr->wr.sg_list->addr  ||
-	     wr->prot->length != wr->wr.sg_list->length))
-		region_len += wr->prot->length;
-
-	/**
-	 * KLM octoword size - if protection was provided
-	 * then we use strided block format (3 octowords),
-	 * else we use single KLM (1 octoword)
-	 **/
-	xlt_size = wr->prot ? 0x30 : sizeof(struct mlx5_klm);
-
-	set_sig_umr_segment(*seg, xlt_size);
-	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
-	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
-	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
-
-	set_sig_mkey_segment(*seg, wr->sig_mr, wr->access_flags, xlt_size,
-			     region_len, pdn);
-	*seg += sizeof(struct mlx5_mkey_seg);
-	*size += sizeof(struct mlx5_mkey_seg) / 16;
-	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
-
-	ret = set_sig_data_segment(send_wr, wr->sig_mr, wr->sig_attrs, qp, seg,
-				   size, cur_edge);
-	if (ret)
-		return ret;
-
-	sig_mr->sig->sig_status_checked = false;
-	return 0;
-}
-
 static int set_psv_wr(struct ib_sig_domain *domain,
 		      u32 psv_idx, void **seg, int *size)
 {
@@ -5187,74 +5121,6 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 				num_sge = 0;
 				goto skip_psv;
 
-			case IB_WR_REG_SIG_MR:
-				qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR;
-				mr = to_mmr(sig_handover_wr(wr)->sig_mr);
-
-				ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
-				err = set_sig_umr_wr(wr, qp, &seg, &size,
-						     &cur_edge);
-				if (err) {
-					mlx5_ib_warn(dev, "\n");
-					*bad_wr = wr;
-					goto out;
-				}
-
-				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
-					   wr->wr_id, nreq, fence,
-					   MLX5_OPCODE_UMR);
-				/*
-				 * SET_PSV WQEs are not signaled and solicited
-				 * on error
-				 */
-				err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
-						  &size, &cur_edge, nreq, false,
-						  true);
-				if (err) {
-					mlx5_ib_warn(dev, "\n");
-					err = -ENOMEM;
-					*bad_wr = wr;
-					goto out;
-				}
-
-				err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->mem,
-						 mr->sig->psv_memory.psv_idx, &seg,
-						 &size);
-				if (err) {
-					mlx5_ib_warn(dev, "\n");
-					*bad_wr = wr;
-					goto out;
-				}
-
-				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
-					   wr->wr_id, nreq, fence,
-					   MLX5_OPCODE_SET_PSV);
-				err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
-						  &size, &cur_edge, nreq, false,
-						  true);
-				if (err) {
-					mlx5_ib_warn(dev, "\n");
-					err = -ENOMEM;
-					*bad_wr = wr;
-					goto out;
-				}
-
-				err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->wire,
-						 mr->sig->psv_wire.psv_idx, &seg,
-						 &size);
-				if (err) {
-					mlx5_ib_warn(dev, "\n");
-					*bad_wr = wr;
-					goto out;
-				}
-
-				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
-					   wr->wr_id, nreq, fence,
-					   MLX5_OPCODE_SET_PSV);
-				qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
-				num_sge = 0;
-				goto skip_psv;
-
 			default:
 				break;
 			}
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
index 3c633ab58052..c142f5e7f25f 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
@@ -456,7 +456,7 @@ static inline enum pvrdma_wr_opcode ib_wr_opcode_to_pvrdma(enum ib_wr_opcode op)
 		return PVRDMA_WR_MASKED_ATOMIC_CMP_AND_SWP;
 	case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
 		return PVRDMA_WR_MASKED_ATOMIC_FETCH_AND_ADD;
-	case IB_WR_REG_SIG_MR:
+	case IB_WR_REG_MR_INTEGRITY:
 		return PVRDMA_WR_REG_SIG_MR;
 	default:
 		return PVRDMA_WR_ERROR;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 14b5eab76ed8..e2478b74551d 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -776,9 +776,6 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
  * enum ib_mr_type - memory region type
  * @IB_MR_TYPE_MEM_REG:       memory region that is used for
  *                            normal registration
- * @IB_MR_TYPE_SIGNATURE:     memory region that is used for
- *                            signature operations (data-integrity
- *                            capable regions)
  * @IB_MR_TYPE_SG_GAPS:       memory region that is capable to
  *                            register any arbitrary sg lists (without
  *                            the normal mr constraints - see
@@ -794,7 +791,6 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
  */
 enum ib_mr_type {
 	IB_MR_TYPE_MEM_REG,
-	IB_MR_TYPE_SIGNATURE,
 	IB_MR_TYPE_SG_GAPS,
 	IB_MR_TYPE_DM,
 	IB_MR_TYPE_USER,
@@ -1235,7 +1231,6 @@ enum ib_wr_opcode {
 
 	/* These are kernel only and can not be issued by userspace */
 	IB_WR_REG_MR = 0x20,
-	IB_WR_REG_SIG_MR,
 	IB_WR_REG_MR_INTEGRITY,
 
 	/* reserve values for low level drivers' internal use.
@@ -1346,20 +1341,6 @@ static inline const struct ib_reg_wr *reg_wr(const struct ib_send_wr *wr)
 	return container_of(wr, struct ib_reg_wr, wr);
 }
 
-struct ib_sig_handover_wr {
-	struct ib_send_wr	wr;
-	struct ib_sig_attrs    *sig_attrs;
-	struct ib_mr	       *sig_mr;
-	int			access_flags;
-	struct ib_sge	       *prot;
-};
-
-static inline const struct ib_sig_handover_wr *
-sig_handover_wr(const struct ib_send_wr *wr)
-{
-	return container_of(wr, struct ib_sig_handover_wr, wr);
-}
-
 struct ib_recv_wr {
 	struct ib_recv_wr      *next;
 	union {
-- 
cgit v1.2.3


From e28ad544f462231d3fd081a7316339359efbb481 Mon Sep 17 00:00:00 2001
From: Andres Rodriguez <andresx7@gmail.com>
Date: Wed, 19 Jun 2019 14:09:01 -0400
Subject: drm/edid: parse CEA blocks embedded in DisplayID

DisplayID blocks allow embedding of CEA blocks. The payloads are
identical to traditional top level CEA extension blocks, but the header
is slightly different.

This change allows the CEA parser to find a CEA block inside a DisplayID
block. Additionally, it adds support for parsing the embedded CTA
header. No further changes are necessary due to payload parity.

This change fixes audio support for the Valve Index HMD.

Signed-off-by: Andres Rodriguez <andresx7@gmail.com>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: <stable@vger.kernel.org> # v4.15
Signed-off-by: Dave Airlie <airlied@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190619180901.17901-1-andresx7@gmail.com
---
 drivers/gpu/drm/drm_edid.c  | 81 +++++++++++++++++++++++++++++++++++++++------
 include/drm/drm_displayid.h | 10 ++++++
 2 files changed, 80 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 9d8f2b952004..df635afe41b7 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -1342,6 +1342,7 @@ MODULE_PARM_DESC(edid_fixup,
 
 static void drm_get_displayid(struct drm_connector *connector,
 			      struct edid *edid);
+static int validate_displayid(u8 *displayid, int length, int idx);
 
 static int drm_edid_block_checksum(const u8 *raw_edid)
 {
@@ -2926,16 +2927,46 @@ static u8 *drm_find_edid_extension(const struct edid *edid, int ext_id)
 	return edid_ext;
 }
 
-static u8 *drm_find_cea_extension(const struct edid *edid)
-{
-	return drm_find_edid_extension(edid, CEA_EXT);
-}
 
 static u8 *drm_find_displayid_extension(const struct edid *edid)
 {
 	return drm_find_edid_extension(edid, DISPLAYID_EXT);
 }
 
+static u8 *drm_find_cea_extension(const struct edid *edid)
+{
+	int ret;
+	int idx = 1;
+	int length = EDID_LENGTH;
+	struct displayid_block *block;
+	u8 *cea;
+	u8 *displayid;
+
+	/* Look for a top level CEA extension block */
+	cea = drm_find_edid_extension(edid, CEA_EXT);
+	if (cea)
+		return cea;
+
+	/* CEA blocks can also be found embedded in a DisplayID block */
+	displayid = drm_find_displayid_extension(edid);
+	if (!displayid)
+		return NULL;
+
+	ret = validate_displayid(displayid, length, idx);
+	if (ret)
+		return NULL;
+
+	idx += sizeof(struct displayid_hdr);
+	for_each_displayid_db(displayid, block, idx, length) {
+		if (block->tag == DATA_BLOCK_CTA) {
+			cea = (u8 *)block;
+			break;
+		}
+	}
+
+	return cea;
+}
+
 /*
  * Calculate the alternate clock for the CEA mode
  * (60Hz vs. 59.94Hz etc.)
@@ -3659,13 +3690,38 @@ cea_revision(const u8 *cea)
 static int
 cea_db_offsets(const u8 *cea, int *start, int *end)
 {
-	/* Data block offset in CEA extension block */
-	*start = 4;
-	*end = cea[2];
-	if (*end == 0)
-		*end = 127;
-	if (*end < 4 || *end > 127)
-		return -ERANGE;
+	/* DisplayID CTA extension blocks and top-level CEA EDID
+	 * block header definitions differ in the following bytes:
+	 *   1) Byte 2 of the header specifies length differently,
+	 *   2) Byte 3 is only present in the CEA top level block.
+	 *
+	 * The different definitions for byte 2 follow.
+	 *
+	 * DisplayID CTA extension block defines byte 2 as:
+	 *   Number of payload bytes
+	 *
+	 * CEA EDID block defines byte 2 as:
+	 *   Byte number (decimal) within this block where the 18-byte
+	 *   DTDs begin. If no non-DTD data is present in this extension
+	 *   block, the value should be set to 04h (the byte after next).
+	 *   If set to 00h, there are no DTDs present in this block and
+	 *   no non-DTD data.
+	 */
+	if (cea[0] == DATA_BLOCK_CTA) {
+		*start = 3;
+		*end = *start + cea[2];
+	} else if (cea[0] == CEA_EXT) {
+		/* Data block offset in CEA extension block */
+		*start = 4;
+		*end = cea[2];
+		if (*end == 0)
+			*end = 127;
+		if (*end < 4 || *end > 127)
+			return -ERANGE;
+	} else {
+		return -ENOTSUPP;
+	}
+
 	return 0;
 }
 
@@ -5406,6 +5462,9 @@ static int drm_parse_display_id(struct drm_connector *connector,
 		case DATA_BLOCK_TYPE_1_DETAILED_TIMING:
 			/* handled in mode gathering code. */
 			break;
+		case DATA_BLOCK_CTA:
+			/* handled in the cea parser code. */
+			break;
 		default:
 			DRM_DEBUG_KMS("found DisplayID tag 0x%x, unhandled\n", block->tag);
 			break;
diff --git a/include/drm/drm_displayid.h b/include/drm/drm_displayid.h
index c0d4df6a606f..9d3b745c3107 100644
--- a/include/drm/drm_displayid.h
+++ b/include/drm/drm_displayid.h
@@ -40,6 +40,7 @@
 #define DATA_BLOCK_DISPLAY_INTERFACE 0x0f
 #define DATA_BLOCK_STEREO_DISPLAY_INTERFACE 0x10
 #define DATA_BLOCK_TILED_DISPLAY 0x12
+#define DATA_BLOCK_CTA 0x81
 
 #define DATA_BLOCK_VENDOR_SPECIFIC 0x7f
 
@@ -90,4 +91,13 @@ struct displayid_detailed_timing_block {
 	struct displayid_block base;
 	struct displayid_detailed_timings_1 timings[0];
 };
+
+#define for_each_displayid_db(displayid, block, idx, length) \
+	for ((block) = (struct displayid_block *)&(displayid)[idx]; \
+	     (idx) + sizeof(struct displayid_block) <= (length) && \
+	     (idx) + sizeof(struct displayid_block) + (block)->num_bytes <= (length) && \
+	     (block)->num_bytes > 0; \
+	     (idx) += (block)->num_bytes + sizeof(struct displayid_block), \
+	     (block) = (struct displayid_block *)&(displayid)[idx])
+
 #endif
-- 
cgit v1.2.3


From ec6516bfbaf72e7c81811162b6de96322e32a027 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 13 Jun 2019 10:55:31 +0900
Subject: pinctrl: remove unneeded #ifdef around declarations

What is the point in surrounding the whole of declarations with
ifdef like this?

  #ifdef CONFIG_FOO
  int foo(void);
  #endif

If CONFIG_FOO is not defined, all callers of foo() will fail
with implicit declaration errors since the top Makefile adds
-Werror-implicit-function-declaration to KBUILD_CFLAGS.

This breaks the build earlier when you are doing something wrong.
That's it.

Anyway, it will fail to link since the definition of foo() is not
compiled.

In summary, these ifdef are unneeded.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinconf-generic.h | 20 ++++++--------------
 include/linux/pinctrl/pinconf.h         |  4 ----
 include/linux/pinctrl/pinctrl.h         |  4 ----
 include/linux/pinctrl/pinmux.h          |  4 ----
 4 files changed, 6 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 72d06d6a3099..673828a52294 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -12,6 +12,12 @@
 #ifndef __LINUX_PINCTRL_PINCONF_GENERIC_H
 #define __LINUX_PINCTRL_PINCONF_GENERIC_H
 
+#include <linux/device.h>
+#include <linux/pinctrl/machine.h>
+
+struct pinctrl_dev;
+struct pinctrl_map;
+
 /**
  * enum pin_config_param - possible pin configuration parameters
  * @PIN_CONFIG_BIAS_BUS_HOLD: the pin will be set to weakly latch so that it
@@ -159,9 +165,6 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param
 	return PIN_CONF_PACKED(param, argument);
 }
 
-#ifdef CONFIG_GENERIC_PINCONF
-
-#ifdef CONFIG_DEBUG_FS
 #define PCONFDUMP(a, b, c, d) {					\
 	.param = a, .display = b, .format = c, .has_arg = d	\
 	}
@@ -172,14 +175,6 @@ struct pin_config_item {
 	const char * const format;
 	bool has_arg;
 };
-#endif /* CONFIG_DEBUG_FS */
-
-#ifdef CONFIG_OF
-
-#include <linux/device.h>
-#include <linux/pinctrl/machine.h>
-struct pinctrl_dev;
-struct pinctrl_map;
 
 struct pinconf_generic_params {
 	const char * const property;
@@ -224,8 +219,5 @@ static inline int pinconf_generic_dt_node_to_map_all(
 	return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps,
 			PIN_MAP_TYPE_INVALID);
 }
-#endif
-
-#endif /* CONFIG_GENERIC_PINCONF */
 
 #endif /* __LINUX_PINCTRL_PINCONF_GENERIC_H */
diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h
index 9bebc3554809..513883dcc5a9 100644
--- a/include/linux/pinctrl/pinconf.h
+++ b/include/linux/pinctrl/pinconf.h
@@ -12,8 +12,6 @@
 #ifndef __LINUX_PINCTRL_PINCONF_H
 #define __LINUX_PINCTRL_PINCONF_H
 
-#ifdef CONFIG_PINCONF
-
 #include <linux/types.h>
 
 struct pinctrl_dev;
@@ -67,6 +65,4 @@ struct pinconf_ops {
 					    unsigned long config);
 };
 
-#endif
-
 #endif /* __LINUX_PINCTRL_PINCONF_H */
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 36a79fe7b84f..27738164daa7 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -12,8 +12,6 @@
 #ifndef __LINUX_PINCTRL_PINCTRL_H
 #define __LINUX_PINCTRL_PINCTRL_H
 
-#ifdef CONFIG_PINCTRL
-
 #include <linux/radix-tree.h>
 #include <linux/list.h>
 #include <linux/seq_file.h>
@@ -203,6 +201,4 @@ extern const char *pinctrl_dev_get_name(struct pinctrl_dev *pctldev);
 extern const char *pinctrl_dev_get_devname(struct pinctrl_dev *pctldev);
 extern void *pinctrl_dev_get_drvdata(struct pinctrl_dev *pctldev);
 
-#endif /* !CONFIG_PINCTRL */
-
 #endif /* __LINUX_PINCTRL_PINCTRL_H */
diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h
index ace60d775b20..566a5fe8eab5 100644
--- a/include/linux/pinctrl/pinmux.h
+++ b/include/linux/pinctrl/pinmux.h
@@ -16,8 +16,6 @@
 #include <linux/seq_file.h>
 #include <linux/pinctrl/pinctrl.h>
 
-#ifdef CONFIG_PINMUX
-
 struct pinctrl_dev;
 
 /**
@@ -85,6 +83,4 @@ struct pinmux_ops {
 	bool strict;
 };
 
-#endif /* CONFIG_PINMUX */
-
 #endif /* __LINUX_PINCTRL_PINMUX_H */
-- 
cgit v1.2.3


From 34d65cd837d0c77fac0c0da632c616030b2927e3 Mon Sep 17 00:00:00 2001
From: Doug Ledford <dledford@redhat.com>
Date: Fri, 21 Jun 2019 17:00:44 -0400
Subject: RDMA/netlink: Audit policy settings for netlink attributes

For all string attributes for which we don't currently accept the element
as input, we only use it as output, set the string length to
RDMA_NLDEV_ATTR_EMPTY_STRING which is defined as 1.  That way we will only
accept a null string for that element.  This will prevent someone from
writing a new input routine that uses the element without also updating
the policy to have a valid value.

Also while there, make sure the existing entries that are valid have the
correct policy, if not, correct the policy.  Remove unnecessary checks
for nla_strlcpy() overflow once the policy has been set correctly.

Signed-off-by: Doug Ledford <dledford@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c  | 25 ++++++++++++-------------
 include/rdma/rdma_netlink.h      |  6 ++++++
 include/uapi/rdma/rdma_netlink.h |  4 ----
 3 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 6006d23d0410..5499f5629dc2 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -49,29 +49,29 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_CHARDEV]		= { .type = NLA_U64 },
 	[RDMA_NLDEV_ATTR_CHARDEV_ABI]		= { .type = NLA_U64 },
 	[RDMA_NLDEV_ATTR_CHARDEV_NAME]		= { .type = NLA_NUL_STRING,
-					.len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
 	[RDMA_NLDEV_ATTR_CHARDEV_TYPE]		= { .type = NLA_NUL_STRING,
-					.len = 128 },
+					.len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE },
 	[RDMA_NLDEV_ATTR_DEV_INDEX]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_DEV_NAME]		= { .type = NLA_NUL_STRING,
-					.len = IB_DEVICE_NAME_MAX - 1},
+					.len = IB_DEVICE_NAME_MAX },
 	[RDMA_NLDEV_ATTR_DEV_NODE_TYPE]		= { .type = NLA_U8 },
 	[RDMA_NLDEV_ATTR_DEV_PROTOCOL]		= { .type = NLA_NUL_STRING,
-					.len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
 	[RDMA_NLDEV_ATTR_DRIVER]		= { .type = NLA_NESTED },
 	[RDMA_NLDEV_ATTR_DRIVER_ENTRY]		= { .type = NLA_NESTED },
 	[RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE]	= { .type = NLA_U8 },
 	[RDMA_NLDEV_ATTR_DRIVER_STRING]		= { .type = NLA_NUL_STRING,
-					.len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
 	[RDMA_NLDEV_ATTR_DRIVER_S32]		= { .type = NLA_S32 },
 	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
 	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
 	[RDMA_NLDEV_ATTR_FW_VERSION]		= { .type = NLA_NUL_STRING,
-					.len = IB_FW_VERSION_NAME_MAX - 1},
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
 	[RDMA_NLDEV_ATTR_LID]			= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type = NLA_NUL_STRING,
-					.len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+					.len = IFNAMSIZ },
 	[RDMA_NLDEV_ATTR_LMC]			= { .type = NLA_U8 },
 	[RDMA_NLDEV_ATTR_NDEV_INDEX]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_NDEV_NAME]		= { .type = NLA_NUL_STRING,
@@ -92,7 +92,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 			.len = sizeof(struct __kernel_sockaddr_storage) },
 	[RDMA_NLDEV_ATTR_RES_IOVA]		= { .type = NLA_U64 },
 	[RDMA_NLDEV_ATTR_RES_KERN_NAME]		= { .type = NLA_NUL_STRING,
-					.len = TASK_COMM_LEN },
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
 	[RDMA_NLDEV_ATTR_RES_LKEY]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY]	= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_LQPN]		= { .type = NLA_U32 },
@@ -120,7 +120,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY]	= { .type = NLA_NESTED },
 	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 },
 	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING,
-					.len = 16 },
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
 	[RDMA_NLDEV_ATTR_RES_TYPE]		= { .type = NLA_U8 },
 	[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_USECNT]		= { .type = NLA_U64 },
@@ -1361,7 +1361,7 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
-	char client_name[IB_DEVICE_NAME_MAX];
+	char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE];
 	struct ib_client_nl_info data = {};
 	struct ib_device *ibdev = NULL;
 	struct sk_buff *msg;
@@ -1373,9 +1373,8 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE])
 		return -EINVAL;
 
-	if (nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE],
-			sizeof(client_name)) >= sizeof(client_name))
-		return -EINVAL;
+	nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE],
+		    sizeof(client_name));
 
 	if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) {
 		index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
index c7acbe083428..6631624e4d7c 100644
--- a/include/rdma/rdma_netlink.h
+++ b/include/rdma/rdma_netlink.h
@@ -6,6 +6,12 @@
 #include <linux/netlink.h>
 #include <uapi/rdma/rdma_netlink.h>
 
+enum {
+	RDMA_NLDEV_ATTR_EMPTY_STRING = 1,
+	RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16,
+	RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE = 32,
+};
+
 struct rdma_nl_cbs {
 	int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh,
 		    struct netlink_ext_ack *extack);
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index b27c02185dcc..650cee8c4bf1 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -284,10 +284,6 @@ enum rdma_nldev_command {
 	RDMA_NLDEV_NUM_OPS
 };
 
-enum {
-	RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16,
-};
-
 enum rdma_nldev_print_type {
 	RDMA_NLDEV_PRINT_TYPE_UNSPEC,
 	RDMA_NLDEV_PRINT_TYPE_HEX,
-- 
cgit v1.2.3


From 2f25528e4edddc6eddd42c8d41c9c9e341c8b9da Mon Sep 17 00:00:00 2001
From: Sylwester Nawrocki <s.nawrocki@samsung.com>
Date: Wed, 19 Jun 2019 11:39:25 +0200
Subject: clk: Add clk_bulk_get_optional() function

clk_bulk_get_optional() allows to get a group of clocks where one
or more is optional.  For a not available clock, e.g. not specifed
in the clock consumer node in DT, its respective struct clk pointer
will be NULL.  This allows for operating on a group of returned
clocks (struct clk_bulk_data array) with existing clk_bulk* APIs.

Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-bulk.c | 23 ++++++++++++++++++++---
 include/linux/clk.h    | 19 +++++++++++++++++++
 2 files changed, 39 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/clk-bulk.c b/drivers/clk/clk-bulk.c
index 06499568cf07..524bf9a53098 100644
--- a/drivers/clk/clk-bulk.c
+++ b/drivers/clk/clk-bulk.c
@@ -75,8 +75,8 @@ void clk_bulk_put(int num_clks, struct clk_bulk_data *clks)
 }
 EXPORT_SYMBOL_GPL(clk_bulk_put);
 
-int __must_check clk_bulk_get(struct device *dev, int num_clks,
-			      struct clk_bulk_data *clks)
+static int __clk_bulk_get(struct device *dev, int num_clks,
+			  struct clk_bulk_data *clks, bool optional)
 {
 	int ret;
 	int i;
@@ -88,10 +88,14 @@ int __must_check clk_bulk_get(struct device *dev, int num_clks,
 		clks[i].clk = clk_get(dev, clks[i].id);
 		if (IS_ERR(clks[i].clk)) {
 			ret = PTR_ERR(clks[i].clk);
+			clks[i].clk = NULL;
+
+			if (ret == -ENOENT && optional)
+				continue;
+
 			if (ret != -EPROBE_DEFER)
 				dev_err(dev, "Failed to get clk '%s': %d\n",
 					clks[i].id, ret);
-			clks[i].clk = NULL;
 			goto err;
 		}
 	}
@@ -103,8 +107,21 @@ err:
 
 	return ret;
 }
+
+int __must_check clk_bulk_get(struct device *dev, int num_clks,
+			      struct clk_bulk_data *clks)
+{
+	return __clk_bulk_get(dev, num_clks, clks, false);
+}
 EXPORT_SYMBOL(clk_bulk_get);
 
+int __must_check clk_bulk_get_optional(struct device *dev, int num_clks,
+				       struct clk_bulk_data *clks)
+{
+	return __clk_bulk_get(dev, num_clks, clks, true);
+}
+EXPORT_SYMBOL_GPL(clk_bulk_get_optional);
+
 void clk_bulk_put_all(int num_clks, struct clk_bulk_data *clks)
 {
 	if (IS_ERR_OR_NULL(clks))
diff --git a/include/linux/clk.h b/include/linux/clk.h
index f689fc58d7be..1b50e7d1675c 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -332,6 +332,19 @@ int __must_check clk_bulk_get(struct device *dev, int num_clks,
  */
 int __must_check clk_bulk_get_all(struct device *dev,
 				  struct clk_bulk_data **clks);
+
+/**
+ * clk_bulk_get_optional - lookup and obtain a number of references to clock producer
+ * @dev: device for clock "consumer"
+ * @num_clks: the number of clk_bulk_data
+ * @clks: the clk_bulk_data table of consumer
+ *
+ * Behaves the same as clk_bulk_get() except where there is no clock producer.
+ * In this case, instead of returning -ENOENT, the function returns 0 and
+ * NULL for a clk for which a clock producer could not be determined.
+ */
+int __must_check clk_bulk_get_optional(struct device *dev, int num_clks,
+				       struct clk_bulk_data *clks);
 /**
  * devm_clk_bulk_get - managed get multiple clk consumers
  * @dev: device for clock "consumer"
@@ -718,6 +731,12 @@ static inline int __must_check clk_bulk_get(struct device *dev, int num_clks,
 	return 0;
 }
 
+static inline int __must_check clk_bulk_get_optional(struct device *dev,
+				int num_clks, struct clk_bulk_data *clks)
+{
+	return 0;
+}
+
 static inline int __must_check clk_bulk_get_all(struct device *dev,
 					 struct clk_bulk_data **clks)
 {
-- 
cgit v1.2.3


From 9bd5ef0bd8743700d9adffb6fbb1baa346575457 Mon Sep 17 00:00:00 2001
From: Sylwester Nawrocki <s.nawrocki@samsung.com>
Date: Wed, 19 Jun 2019 11:39:26 +0200
Subject: clk: Add devm_clk_bulk_get_optional() function

Add managed version of the clk_bulk_get_optional() helper function.

Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
[sboyd@kernel.org: Mark __devm_clk_bulk_get() static]
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-devres.c | 22 +++++++++++++++++++---
 include/linux/clk.h      | 28 ++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
index daa1fc8fba53..be160764911b 100644
--- a/drivers/clk/clk-devres.c
+++ b/drivers/clk/clk-devres.c
@@ -52,8 +52,8 @@ static void devm_clk_bulk_release(struct device *dev, void *res)
 	clk_bulk_put(devres->num_clks, devres->clks);
 }
 
-int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
-		      struct clk_bulk_data *clks)
+static int __devm_clk_bulk_get(struct device *dev, int num_clks,
+			       struct clk_bulk_data *clks, bool optional)
 {
 	struct clk_bulk_devres *devres;
 	int ret;
@@ -63,7 +63,10 @@ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 	if (!devres)
 		return -ENOMEM;
 
-	ret = clk_bulk_get(dev, num_clks, clks);
+	if (optional)
+		ret = clk_bulk_get_optional(dev, num_clks, clks);
+	else
+		ret = clk_bulk_get(dev, num_clks, clks);
 	if (!ret) {
 		devres->clks = clks;
 		devres->num_clks = num_clks;
@@ -74,8 +77,21 @@ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 
 	return ret;
 }
+
+int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
+		      struct clk_bulk_data *clks)
+{
+	return __devm_clk_bulk_get(dev, num_clks, clks, false);
+}
 EXPORT_SYMBOL_GPL(devm_clk_bulk_get);
 
+int __must_check devm_clk_bulk_get_optional(struct device *dev, int num_clks,
+		      struct clk_bulk_data *clks)
+{
+	return __devm_clk_bulk_get(dev, num_clks, clks, true);
+}
+EXPORT_SYMBOL_GPL(devm_clk_bulk_get_optional);
+
 int __must_check devm_clk_bulk_get_all(struct device *dev,
 				       struct clk_bulk_data **clks)
 {
diff --git a/include/linux/clk.h b/include/linux/clk.h
index 1b50e7d1675c..5e7b2dd84965 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -359,6 +359,28 @@ int __must_check clk_bulk_get_optional(struct device *dev, int num_clks,
  */
 int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 				   struct clk_bulk_data *clks);
+/**
+ * devm_clk_bulk_get_optional - managed get multiple optional consumer clocks
+ * @dev: device for clock "consumer"
+ * @clks: pointer to the clk_bulk_data table of consumer
+ *
+ * Behaves the same as devm_clk_bulk_get() except where there is no clock
+ * producer.  In this case, instead of returning -ENOENT, the function returns
+ * NULL for given clk. It is assumed all clocks in clk_bulk_data are optional.
+ *
+ * Returns 0 if all clocks specified in clk_bulk_data table are obtained
+ * successfully or for any clk there was no clk provider available, otherwise
+ * returns valid IS_ERR() condition containing errno.
+ * The implementation uses @dev and @clk_bulk_data.id to determine the
+ * clock consumer, and thereby the clock producer.
+ * The clock returned is stored in each @clk_bulk_data.clk field.
+ *
+ * Drivers must assume that the clock source is not enabled.
+ *
+ * clk_bulk_get should not be called from within interrupt context.
+ */
+int __must_check devm_clk_bulk_get_optional(struct device *dev, int num_clks,
+					    struct clk_bulk_data *clks);
 /**
  * devm_clk_bulk_get_all - managed get multiple clk consumers
  * @dev: device for clock "consumer"
@@ -760,6 +782,12 @@ static inline int __must_check devm_clk_bulk_get(struct device *dev, int num_clk
 	return 0;
 }
 
+static inline int __must_check devm_clk_bulk_get_optional(struct device *dev,
+				int num_clks, struct clk_bulk_data *clks)
+{
+	return 0;
+}
+
 static inline int __must_check devm_clk_bulk_get_all(struct device *dev,
 						     struct clk_bulk_data **clks)
 {
-- 
cgit v1.2.3


From 3b5015c4d834a70233c4419b07375254e4675e0b Mon Sep 17 00:00:00 2001
From: Dinh Nguyen <dinguyen@kernel.org>
Date: Mon, 24 Jun 2019 16:47:10 -0500
Subject: clk: socfpga: stratix10: add additional clocks needed for the NAND IP

The nand_clk is actually called the nand_x_clk and the parent is the
l4_mp_clk, not the l4_main_clk. The nand_clk is a child of the
nand_x_clk and has a fixed divider of 4. The same is true for the
nand_ecc_clk.

Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/socfpga/clk-s10.c               | 6 +++++-
 include/dt-bindings/clock/stratix10-clock.h | 4 +++-
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/socfpga/clk-s10.c b/drivers/clk/socfpga/clk-s10.c
index 8281dfbf38c2..609dd722675e 100644
--- a/drivers/clk/socfpga/clk-s10.c
+++ b/drivers/clk/socfpga/clk-s10.c
@@ -161,8 +161,12 @@ static const struct stratix10_gate_clock s10_gate_clks[] = {
 	  8, 0, 0, 0, 0, 0, 0},
 	{ STRATIX10_SPI_M_CLK, "spi_m_clk", "l4_mp_clk", NULL, 1, 0, 0xA4,
 	  9, 0, 0, 0, 0, 0, 0},
-	{ STRATIX10_NAND_CLK, "nand_clk", "l4_main_clk", NULL, 1, 0, 0xA4,
+	{ STRATIX10_NAND_X_CLK, "nand_x_clk", "l4_mp_clk", NULL, 1, 0, 0xA4,
 	  10, 0, 0, 0, 0, 0, 0},
+	{ STRATIX10_NAND_CLK, "nand_clk", "nand_x_clk", NULL, 1, 0, 0xA4,
+	  10, 0, 0, 0, 0, 0, 4},
+	{ STRATIX10_NAND_ECC_CLK, "nand_ecc_clk", "nand_x_clk", NULL, 1, 0, 0xA4,
+	  10, 0, 0, 0, 0, 0, 4},
 };
 
 static int s10_clk_register_c_perip(const struct stratix10_perip_c_clock *clks,
diff --git a/include/dt-bindings/clock/stratix10-clock.h b/include/dt-bindings/clock/stratix10-clock.h
index 0ac1c90a18bf..08b98e20b7cc 100644
--- a/include/dt-bindings/clock/stratix10-clock.h
+++ b/include/dt-bindings/clock/stratix10-clock.h
@@ -79,6 +79,8 @@
 #define STRATIX10_USB_CLK		59
 #define STRATIX10_SPI_M_CLK		60
 #define STRATIX10_NAND_CLK		61
-#define STRATIX10_NUM_CLKS		62
+#define STRATIX10_NAND_X_CLK		62
+#define STRATIX10_NAND_ECC_CLK		63
+#define STRATIX10_NUM_CLKS		64
 
 #endif	/* __STRATIX10_CLOCK_H */
-- 
cgit v1.2.3


From 550113d4e9f5c7b62be760fc01178c9e0139c1f4 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Mon, 24 Jun 2019 19:04:02 +0200
Subject: i2c: add newly exported functions to the header, too

Nobody (including me) noticed that these functions were exported but not
added to the header :/

Fixes: 7159dbdae3c5 ("i2c: core: improve return value handling of i2c_new_device and i2c_new_dummy")
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Reviewed-by: Kieran Bingham <kieran.bingham+renesas@ideasonboard.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core-base.c | 5 ++---
 include/linux/i2c.h         | 6 ++++++
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index e7d5ada40d48..f1949d1e2b54 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -729,7 +729,7 @@ static int i2c_dev_irq_from_resources(const struct resource *resources,
  * This returns the new i2c client, which may be saved for later use with
  * i2c_unregister_device(); or an ERR_PTR to describe the error.
  */
-static struct i2c_client *
+struct i2c_client *
 i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info)
 {
 	struct i2c_client	*client;
@@ -895,8 +895,7 @@ static struct i2c_driver dummy_driver = {
  * This returns the new i2c client, which should be saved for later use with
  * i2c_unregister_device(); or an ERR_PTR to describe the error.
  */
-static struct i2c_client *
-i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address)
+struct i2c_client *i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address)
 {
 	struct i2c_board_info info = {
 		I2C_BOARD_INFO("dummy", address),
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index d8f9060179d0..fa5552c2307b 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -442,6 +442,9 @@ struct i2c_board_info {
 extern struct i2c_client *
 i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info);
 
+extern struct i2c_client *
+i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info);
+
 /* If you don't know the exact address of an I2C device, use this variant
  * instead, which can probe for device presence in a list of possible
  * addresses. The "probe" callback function is optional. If it is provided,
@@ -463,6 +466,9 @@ extern int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short ad
 extern struct i2c_client *
 i2c_new_dummy(struct i2c_adapter *adap, u16 address);
 
+extern struct i2c_client *
+i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address);
+
 extern struct i2c_client *
 devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adap, u16 address);
 
-- 
cgit v1.2.3


From 3ae762a09cd72a08ef620c80fbb263693c3fb204 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 4 Jun 2019 14:49:25 +0100
Subject: fs/adfs: correct disc record structure

Fill in some padding in the disc record structure, and add GCC
packed and aligned attributes to ensure that it is correctly
laid out.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/uapi/linux/adfs_fs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/adfs_fs.h b/include/uapi/linux/adfs_fs.h
index 151d93e27ed4..f1a7d67a7323 100644
--- a/include/uapi/linux/adfs_fs.h
+++ b/include/uapi/linux/adfs_fs.h
@@ -29,17 +29,17 @@ struct adfs_discrecord {
     __u8  log2sharesize:4;
     __u8  unused40:4;
     __u8  big_flag:1;
-    __u8  unused41:1;
+    __u8  unused41:7;
     __u8  nzones_high;
+    __u8  reserved43;
     __le32 format_version;
     __le32 root_size;
     __u8  unused52[60 - 52];
-};
+} __attribute__((packed, aligned(4)));
 
 #define ADFS_DISCRECORD		(0xc00)
 #define ADFS_DR_OFFSET		(0x1c0)
 #define ADFS_DR_SIZE		 60
 #define ADFS_DR_SIZE_BITS	(ADFS_DR_SIZE << 3)
 
-
 #endif /* _UAPI_ADFS_FS_H */
-- 
cgit v1.2.3


From 0dc14b013f7982de6e81b5b2931a2131d20cbb6d Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Sat, 15 Jun 2019 14:18:17 +0200
Subject: clk: rockchip: add clock id for watchdog pclk on rk3328

Needed to export that added clock.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 include/dt-bindings/clock/rk3328-cru.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/rk3328-cru.h b/include/dt-bindings/clock/rk3328-cru.h
index bcaa4559ab1b..6ad54c39f8da 100644
--- a/include/dt-bindings/clock/rk3328-cru.h
+++ b/include/dt-bindings/clock/rk3328-cru.h
@@ -173,6 +173,7 @@
 #define PCLK_DCF		233
 #define PCLK_SARADC		234
 #define PCLK_ACODECPHY		235
+#define PCLK_WDT		236
 
 /* hclk gates */
 #define HCLK_PERI		308
-- 
cgit v1.2.3


From dbc08f18ea49bd1952fb1158a56d400b77117403 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Fri, 14 Jun 2019 10:58:04 +0200
Subject: clk: rockchip: add clock id for hdmi_phy special clock on rk3228

Add the needed clock id to enable clock settings from devicetree.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Tested-by: Justin Swartz <justin.swartz@risingedge.co.za>
---
 include/dt-bindings/clock/rk3228-cru.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/rk3228-cru.h b/include/dt-bindings/clock/rk3228-cru.h
index 55655ab0a4c4..a0422f62c040 100644
--- a/include/dt-bindings/clock/rk3228-cru.h
+++ b/include/dt-bindings/clock/rk3228-cru.h
@@ -73,6 +73,7 @@
 #define SCLK_WIFI		141
 #define SCLK_OTGPHY0		142
 #define SCLK_OTGPHY1		143
+#define SCLK_HDMI_PHY		144
 
 /* dclk gates */
 #define DCLK_VOP		190
-- 
cgit v1.2.3


From 2a6a7aacd4e557a4c7007f8858bcc9654b098fea Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Mon, 3 Jun 2019 10:24:32 +0300
Subject: mfd: regulator: clk: Split rohm-bd718x7.h

Split the bd718x7.h to ROHM common and bd718x7 specific parts
so that we do not need to add same things in every new ROHM
PMIC header. Please note that this change requires changes also
in bd718x7 sub-device drivers for regulators and clk.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/clk/clk-bd718x7.c             |  6 +++---
 drivers/mfd/rohm-bd718x7.c            | 23 ++++++++++++-----------
 drivers/regulator/bd718x7-regulator.c | 25 +++++++++++++------------
 include/linux/mfd/rohm-bd718x7.h      | 22 ++++++++--------------
 include/linux/mfd/rohm-generic.h      | 20 ++++++++++++++++++++
 5 files changed, 56 insertions(+), 40 deletions(-)
 create mode 100644 include/linux/mfd/rohm-generic.h

(limited to 'include')

diff --git a/drivers/clk/clk-bd718x7.c b/drivers/clk/clk-bd718x7.c
index 60422c72d142..461228ebf703 100644
--- a/drivers/clk/clk-bd718x7.c
+++ b/drivers/clk/clk-bd718x7.c
@@ -17,7 +17,7 @@ struct bd718xx_clk {
 	u8 reg;
 	u8 mask;
 	struct platform_device *pdev;
-	struct bd718xx *mfd;
+	struct rohm_regmap_dev *mfd;
 };
 
 static int bd71837_clk_set(struct clk_hw *hw, int status)
@@ -68,7 +68,7 @@ static int bd71837_clk_probe(struct platform_device *pdev)
 	int rval = -ENOMEM;
 	const char *parent_clk;
 	struct device *parent = pdev->dev.parent;
-	struct bd718xx *mfd = dev_get_drvdata(parent);
+	struct rohm_regmap_dev *mfd = dev_get_drvdata(parent);
 	struct clk_init_data init = {
 		.name = "bd718xx-32k-out",
 		.ops = &bd71837_clk_ops,
@@ -119,5 +119,5 @@ static struct platform_driver bd71837_clk = {
 module_platform_driver(bd71837_clk);
 
 MODULE_AUTHOR("Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>");
-MODULE_DESCRIPTION("BD71837 chip clk driver");
+MODULE_DESCRIPTION("BD71837/BD71847 chip clk driver");
 MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/rohm-bd718x7.c b/drivers/mfd/rohm-bd718x7.c
index a29d529a96f4..7beb444a57cb 100644
--- a/drivers/mfd/rohm-bd718x7.c
+++ b/drivers/mfd/rohm-bd718x7.c
@@ -98,18 +98,19 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 		return -ENOMEM;
 
 	bd718xx->chip_irq = i2c->irq;
-	bd718xx->chip_type = (unsigned int)(uintptr_t)
+	bd718xx->chip.chip_type = (unsigned int)(uintptr_t)
 				of_device_get_match_data(&i2c->dev);
-	bd718xx->dev = &i2c->dev;
+	bd718xx->chip.dev = &i2c->dev;
 	dev_set_drvdata(&i2c->dev, bd718xx);
 
-	bd718xx->regmap = devm_regmap_init_i2c(i2c, &bd718xx_regmap_config);
-	if (IS_ERR(bd718xx->regmap)) {
+	bd718xx->chip.regmap = devm_regmap_init_i2c(i2c,
+						    &bd718xx_regmap_config);
+	if (IS_ERR(bd718xx->chip.regmap)) {
 		dev_err(&i2c->dev, "regmap initialization failed\n");
-		return PTR_ERR(bd718xx->regmap);
+		return PTR_ERR(bd718xx->chip.regmap);
 	}
 
-	ret = devm_regmap_add_irq_chip(&i2c->dev, bd718xx->regmap,
+	ret = devm_regmap_add_irq_chip(&i2c->dev, bd718xx->chip.regmap,
 				       bd718xx->chip_irq, IRQF_ONESHOT, 0,
 				       &bd718xx_irq_chip, &bd718xx->irq_data);
 	if (ret) {
@@ -118,7 +119,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 	}
 
 	/* Configure short press to 10 milliseconds */
-	ret = regmap_update_bits(bd718xx->regmap,
+	ret = regmap_update_bits(bd718xx->chip.regmap,
 				 BD718XX_REG_PWRONCONFIG0,
 				 BD718XX_PWRBTN_PRESS_DURATION_MASK,
 				 BD718XX_PWRBTN_SHORT_PRESS_10MS);
@@ -129,7 +130,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 	}
 
 	/* Configure long press to 10 seconds */
-	ret = regmap_update_bits(bd718xx->regmap,
+	ret = regmap_update_bits(bd718xx->chip.regmap,
 				 BD718XX_REG_PWRONCONFIG1,
 				 BD718XX_PWRBTN_PRESS_DURATION_MASK,
 				 BD718XX_PWRBTN_LONG_PRESS_10S);
@@ -149,7 +150,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 
 	button.irq = ret;
 
-	ret = devm_mfd_add_devices(bd718xx->dev, PLATFORM_DEVID_AUTO,
+	ret = devm_mfd_add_devices(bd718xx->chip.dev, PLATFORM_DEVID_AUTO,
 				   bd718xx_mfd_cells,
 				   ARRAY_SIZE(bd718xx_mfd_cells), NULL, 0,
 				   regmap_irq_get_domain(bd718xx->irq_data));
@@ -162,11 +163,11 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 static const struct of_device_id bd718xx_of_match[] = {
 	{
 		.compatible = "rohm,bd71837",
-		.data = (void *)BD718XX_TYPE_BD71837,
+		.data = (void *)ROHM_CHIP_TYPE_BD71837,
 	},
 	{
 		.compatible = "rohm,bd71847",
-		.data = (void *)BD718XX_TYPE_BD71847,
+		.data = (void *)ROHM_CHIP_TYPE_BD71847,
 	},
 	{ }
 };
diff --git a/drivers/regulator/bd718x7-regulator.c b/drivers/regulator/bd718x7-regulator.c
index fde4264da6ff..ef2fc175a9ae 100644
--- a/drivers/regulator/bd718x7-regulator.c
+++ b/drivers/regulator/bd718x7-regulator.c
@@ -1152,12 +1152,12 @@ static int bd718xx_probe(struct platform_device *pdev)
 {
 	struct bd718xx *mfd;
 	struct regulator_config config = { 0 };
-	struct bd718xx_pmic_inits pmic_regulators[] = {
-		[BD718XX_TYPE_BD71837] = {
+	struct bd718xx_pmic_inits pmic_regulators[ROHM_CHIP_TYPE_AMOUNT] = {
+		[ROHM_CHIP_TYPE_BD71837] = {
 			.r_datas = bd71837_regulators,
 			.r_amount = ARRAY_SIZE(bd71837_regulators),
 		},
-		[BD718XX_TYPE_BD71847] = {
+		[ROHM_CHIP_TYPE_BD71847] = {
 			.r_datas = bd71847_regulators,
 			.r_amount = ARRAY_SIZE(bd71847_regulators),
 		},
@@ -1173,15 +1173,15 @@ static int bd718xx_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	if (mfd->chip_type >= BD718XX_TYPE_AMOUNT ||
-	    !pmic_regulators[mfd->chip_type].r_datas) {
+	if (mfd->chip.chip_type >= ROHM_CHIP_TYPE_AMOUNT ||
+	    !pmic_regulators[mfd->chip.chip_type].r_datas) {
 		dev_err(&pdev->dev, "Unsupported chip type\n");
 		err = -EINVAL;
 		goto err;
 	}
 
 	/* Register LOCK release */
-	err = regmap_update_bits(mfd->regmap, BD718XX_REG_REGLOCK,
+	err = regmap_update_bits(mfd->chip.regmap, BD718XX_REG_REGLOCK,
 				 (REGLOCK_PWRSEQ | REGLOCK_VREG), 0);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to unlock PMIC (%d)\n", err);
@@ -1200,7 +1200,8 @@ static int bd718xx_probe(struct platform_device *pdev)
 	 * bit allowing HW defaults for power rails to be used
 	 */
 	if (!use_snvs) {
-		err = regmap_update_bits(mfd->regmap, BD718XX_REG_TRANS_COND1,
+		err = regmap_update_bits(mfd->chip.regmap,
+					 BD718XX_REG_TRANS_COND1,
 					 BD718XX_ON_REQ_POWEROFF_MASK |
 					 BD718XX_SWRESET_POWEROFF_MASK |
 					 BD718XX_WDOG_POWEROFF_MASK |
@@ -1215,17 +1216,17 @@ static int bd718xx_probe(struct platform_device *pdev)
 		}
 	}
 
-	for (i = 0; i < pmic_regulators[mfd->chip_type].r_amount; i++) {
+	for (i = 0; i < pmic_regulators[mfd->chip.chip_type].r_amount; i++) {
 
 		const struct regulator_desc *desc;
 		struct regulator_dev *rdev;
 		const struct bd718xx_regulator_data *r;
 
-		r = &pmic_regulators[mfd->chip_type].r_datas[i];
+		r = &pmic_regulators[mfd->chip.chip_type].r_datas[i];
 		desc = &r->desc;
 
 		config.dev = pdev->dev.parent;
-		config.regmap = mfd->regmap;
+		config.regmap = mfd->chip.regmap;
 
 		rdev = devm_regulator_register(&pdev->dev, desc, &config);
 		if (IS_ERR(rdev)) {
@@ -1254,7 +1255,7 @@ static int bd718xx_probe(struct platform_device *pdev)
 		 */
 		if (!use_snvs || !rdev->constraints->always_on ||
 		    !rdev->constraints->boot_on) {
-			err = regmap_update_bits(mfd->regmap, r->init.reg,
+			err = regmap_update_bits(mfd->chip.regmap, r->init.reg,
 						 r->init.mask, r->init.val);
 			if (err) {
 				dev_err(&pdev->dev,
@@ -1264,7 +1265,7 @@ static int bd718xx_probe(struct platform_device *pdev)
 			}
 		}
 		for (j = 0; j < r->additional_init_amnt; j++) {
-			err = regmap_update_bits(mfd->regmap,
+			err = regmap_update_bits(mfd->chip.regmap,
 						 r->additional_inits[j].reg,
 						 r->additional_inits[j].mask,
 						 r->additional_inits[j].val);
diff --git a/include/linux/mfd/rohm-bd718x7.h b/include/linux/mfd/rohm-bd718x7.h
index fd194bfc836f..7f2dbde402a1 100644
--- a/include/linux/mfd/rohm-bd718x7.h
+++ b/include/linux/mfd/rohm-bd718x7.h
@@ -4,14 +4,9 @@
 #ifndef __LINUX_MFD_BD718XX_H__
 #define __LINUX_MFD_BD718XX_H__
 
+#include <linux/mfd/rohm-generic.h>
 #include <linux/regmap.h>
 
-enum {
-	BD718XX_TYPE_BD71837 = 0,
-	BD718XX_TYPE_BD71847,
-	BD718XX_TYPE_AMOUNT
-};
-
 enum {
 	BD718XX_BUCK1 = 0,
 	BD718XX_BUCK2,
@@ -321,18 +316,17 @@ enum {
 	BD718XX_PWRBTN_LONG_PRESS_15S
 };
 
-struct bd718xx_clk;
-
 struct bd718xx {
-	unsigned int chip_type;
-	struct device *dev;
-	struct regmap *regmap;
-	unsigned long int id;
+	/*
+	 * Please keep this as the first member here as some
+	 * drivers (clk) supporting more than one chip may only know this
+	 * generic struct 'struct rohm_regmap_dev' and assume it is
+	 * the first chunk of parent device's private data.
+	 */
+	struct rohm_regmap_dev chip;
 
 	int chip_irq;
 	struct regmap_irq_chip_data *irq_data;
-
-	struct bd718xx_clk *clk;
 };
 
 #endif /* __LINUX_MFD_BD718XX_H__ */
diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
new file mode 100644
index 000000000000..bff15ac26f2c
--- /dev/null
+++ b/include/linux/mfd/rohm-generic.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (C) 2018 ROHM Semiconductors */
+
+#ifndef __LINUX_MFD_ROHM_H__
+#define __LINUX_MFD_ROHM_H__
+
+enum {
+	ROHM_CHIP_TYPE_BD71837 = 0,
+	ROHM_CHIP_TYPE_BD71847,
+	ROHM_CHIP_TYPE_BD70528,
+	ROHM_CHIP_TYPE_AMOUNT
+};
+
+struct rohm_regmap_dev {
+	unsigned int chip_type;
+	struct device *dev;
+	struct regmap *regmap;
+};
+
+#endif
-- 
cgit v1.2.3


From 21b7c58fc1943f3aa8c18a994ab9bed4ae5aa72d Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Mon, 3 Jun 2019 10:25:08 +0300
Subject: mfd: bd70528: Support ROHM bd70528 PMIC core

ROHM BD70528MWV is an ultra-low quiescent current general
purpose single-chip power management IC for battery-powered
portable devices.

Add MFD core which enables chip access for following subdevices:
	- regulators/LED drivers
	- battery-charger
	- gpios
	- 32.768kHz clk
	- RTC
	- watchdog

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig              |  17 ++
 drivers/mfd/Makefile             |   2 +
 drivers/mfd/rohm-bd70528.c       | 316 ++++++++++++++++++++++++++++++
 include/linux/mfd/rohm-bd70528.h | 408 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 743 insertions(+)
 create mode 100644 drivers/mfd/rohm-bd70528.c
 create mode 100644 include/linux/mfd/rohm-bd70528.h

(limited to 'include')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 294d9567cc71..11fc53d78c5f 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1890,6 +1890,23 @@ config MFD_ROHM_BD718XX
 	  NXP i.MX8. It contains 8 BUCK outputs and 7 LDOs, voltage monitoring
 	  and emergency shut down as well as 32,768KHz clock output.
 
+config MFD_ROHM_BD70528
+	tristate "ROHM BD70528 Power Management IC"
+	depends on I2C=y
+	depends on OF
+	select REGMAP_I2C
+	select REGMAP_IRQ
+	select MFD_CORE
+	help
+	  Select this option to get support for the ROHM BD70528 Power
+	  Management IC. BD71837 is general purpose single-chip power
+	  management IC for battery-powered portable devices. It contains
+	  3 ultra-low current consumption buck converters, 3 LDOs and 2 LED
+	  drivers. Also included are 4 GPIOs, a real-time clock (RTC), a 32kHz
+	  crystal oscillator, high-accuracy VREF for use with an external ADC,
+	  10 bits SAR ADC for battery temperature monitor and 1S battery
+	  charger.
+
 config MFD_STM32_LPTIMER
 	tristate "Support for STM32 Low-Power Timer"
 	depends on (ARCH_STM32 && OF) || COMPILE_TEST
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 52b1a90ff515..643d65bcb6ea 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -247,5 +247,7 @@ obj-$(CONFIG_MFD_STM32_TIMERS) 	+= stm32-timers.o
 obj-$(CONFIG_MFD_MXS_LRADC)     += mxs-lradc.o
 obj-$(CONFIG_MFD_SC27XX_PMIC)	+= sprd-sc27xx-spi.o
 obj-$(CONFIG_RAVE_SP_CORE)	+= rave-sp.o
+obj-$(CONFIG_MFD_ROHM_BD70528)	+= rohm-bd70528.o
 obj-$(CONFIG_MFD_ROHM_BD718XX)	+= rohm-bd718x7.o
 obj-$(CONFIG_MFD_STMFX) 	+= stmfx.o
+
diff --git a/drivers/mfd/rohm-bd70528.c b/drivers/mfd/rohm-bd70528.c
new file mode 100644
index 000000000000..55599d5c5c86
--- /dev/null
+++ b/drivers/mfd/rohm-bd70528.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+//
+// Copyright (C) 2019 ROHM Semiconductors
+//
+// ROHM BD70528 PMIC driver
+
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/irq.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/rohm-bd70528.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/types.h>
+
+#define BD70528_NUM_OF_GPIOS 4
+
+static const struct resource rtc_irqs[] = {
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_RTC_ALARM, "bd70528-rtc-alm"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_ELPS_TIM, "bd70528-elapsed-timer"),
+};
+
+static const struct resource charger_irqs[] = {
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_OV_RES, "bd70528-bat-ov-res"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_OV_DET, "bd70528-bat-ov-det"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DBAT_DET, "bd70528-bat-dead"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_COLD_RES, "bd70528-bat-warmed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_COLD_DET, "bd70528-bat-cold"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_HOT_RES, "bd70528-bat-cooled"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_HOT_DET, "bd70528-bat-hot"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_CHG_TSD, "bd70528-chg-tshd"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_RMV, "bd70528-bat-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_DET, "bd70528-bat-detected"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_OV_RES, "bd70528-dcin2-ov-res"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_OV_DET, "bd70528-dcin2-ov-det"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_RMV, "bd70528-dcin2-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_DET, "bd70528-dcin2-detected"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN1_RMV, "bd70528-dcin1-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN1_DET, "bd70528-dcin1-detected"),
+};
+
+static struct mfd_cell bd70528_mfd_cells[] = {
+	{ .name = "bd70528-pmic", },
+	{ .name = "bd70528-gpio", },
+	/*
+	 * We use BD71837 driver to drive the clock block. Only differences to
+	 * BD70528 clock gate are the register address and mask.
+	 */
+	{ .name = "bd718xx-clk", },
+	{ .name = "bd70528-wdt", },
+	{
+		.name = "bd70528-power",
+		.resources = charger_irqs,
+		.num_resources = ARRAY_SIZE(charger_irqs),
+	}, {
+		.name = "bd70528-rtc",
+		.resources = rtc_irqs,
+		.num_resources = ARRAY_SIZE(rtc_irqs),
+	},
+};
+
+static const struct regmap_range volatile_ranges[] = {
+	{
+		.range_min = BD70528_REG_INT_MAIN,
+		.range_max = BD70528_REG_INT_OP_FAIL,
+	}, {
+		.range_min = BD70528_REG_RTC_COUNT_H,
+		.range_max = BD70528_REG_RTC_ALM_REPEAT,
+	}, {
+		/*
+		 * WDT control reg is special. Magic values must be written to
+		 * it in order to change the control. Should not be cached.
+		 */
+		.range_min = BD70528_REG_WDT_CTRL,
+		.range_max = BD70528_REG_WDT_CTRL,
+	}, {
+		/*
+		 * BD70528 also contains a few other registers which require
+		 * magic sequences to be written in order to update the value.
+		 * At least SHIPMODE, HWRESET, WARMRESET,and STANDBY
+		 */
+		.range_min = BD70528_REG_SHIPMODE,
+		.range_max = BD70528_REG_STANDBY,
+	},
+};
+
+static const struct regmap_access_table volatile_regs = {
+	.yes_ranges = &volatile_ranges[0],
+	.n_yes_ranges = ARRAY_SIZE(volatile_ranges),
+};
+
+static struct regmap_config bd70528_regmap = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.volatile_table = &volatile_regs,
+	.max_register = BD70528_MAX_REGISTER,
+	.cache_type = REGCACHE_RBTREE,
+};
+
+/*
+ * Mapping of main IRQ register bits to sub-IRQ register offsets so that we can
+ * access corect sub-IRQ registers based on bits that are set in main IRQ
+ * register.
+ */
+
+/* bit [0] - Shutdown register */
+unsigned int bit0_offsets[] = {0};	/* Shutdown register */
+unsigned int bit1_offsets[] = {1};	/* Power failure register */
+unsigned int bit2_offsets[] = {2};	/* VR FAULT register */
+unsigned int bit3_offsets[] = {3};	/* PMU register interrupts */
+unsigned int bit4_offsets[] = {4, 5};	/* Charger 1 and Charger 2 registers */
+unsigned int bit5_offsets[] = {6};	/* RTC register */
+unsigned int bit6_offsets[] = {7};	/* GPIO register */
+unsigned int bit7_offsets[] = {8};	/* Invalid operation register */
+
+static struct regmap_irq_sub_irq_map bd70528_sub_irq_offsets[] = {
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit0_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit1_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit2_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit3_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit4_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit5_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit6_offsets),
+	REGMAP_IRQ_MAIN_REG_OFFSET(bit7_offsets),
+};
+
+static struct regmap_irq bd70528_irqs[] = {
+	REGMAP_IRQ_REG(BD70528_INT_LONGPUSH, 0, BD70528_INT_LONGPUSH_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_WDT, 0, BD70528_INT_WDT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_HWRESET, 0, BD70528_INT_HWRESET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_RSTB_FAULT, 0, BD70528_INT_RSTB_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_VBAT_UVLO, 0, BD70528_INT_VBAT_UVLO_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_TSD, 0, BD70528_INT_TSD_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_RSTIN, 0, BD70528_INT_RSTIN_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK1_FAULT, 1,
+		       BD70528_INT_BUCK1_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK2_FAULT, 1,
+		       BD70528_INT_BUCK2_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK3_FAULT, 1,
+		       BD70528_INT_BUCK3_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LDO1_FAULT, 1, BD70528_INT_LDO1_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LDO2_FAULT, 1, BD70528_INT_LDO2_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LDO3_FAULT, 1, BD70528_INT_LDO3_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED1_FAULT, 1, BD70528_INT_LED1_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED2_FAULT, 1, BD70528_INT_LED2_FAULT_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK1_OCP, 2, BD70528_INT_BUCK1_OCP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK2_OCP, 2, BD70528_INT_BUCK2_OCP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK3_OCP, 2, BD70528_INT_BUCK3_OCP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED1_OCP, 2, BD70528_INT_LED1_OCP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED2_OCP, 2, BD70528_INT_LED2_OCP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK1_FULLON, 2,
+		       BD70528_INT_BUCK1_FULLON_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK2_FULLON, 2,
+		       BD70528_INT_BUCK2_FULLON_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_SHORTPUSH, 3, BD70528_INT_SHORTPUSH_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_AUTO_WAKEUP, 3,
+		       BD70528_INT_AUTO_WAKEUP_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_STATE_CHANGE, 3,
+		       BD70528_INT_STATE_CHANGE_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BAT_OV_RES, 4, BD70528_INT_BAT_OV_RES_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BAT_OV_DET, 4, BD70528_INT_BAT_OV_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DBAT_DET, 4, BD70528_INT_DBAT_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BATTSD_COLD_RES, 4,
+		       BD70528_INT_BATTSD_COLD_RES_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BATTSD_COLD_DET, 4,
+		       BD70528_INT_BATTSD_COLD_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BATTSD_HOT_RES, 4,
+		       BD70528_INT_BATTSD_HOT_RES_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BATTSD_HOT_DET, 4,
+		       BD70528_INT_BATTSD_HOT_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_CHG_TSD, 4, BD70528_INT_CHG_TSD_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BAT_RMV, 5, BD70528_INT_BAT_RMV_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BAT_DET, 5, BD70528_INT_BAT_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN2_OV_RES, 5,
+		       BD70528_INT_DCIN2_OV_RES_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN2_OV_DET, 5,
+		       BD70528_INT_DCIN2_OV_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN2_RMV, 5, BD70528_INT_DCIN2_RMV_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN2_DET, 5, BD70528_INT_DCIN2_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN1_RMV, 5, BD70528_INT_DCIN1_RMV_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_DCIN1_DET, 5, BD70528_INT_DCIN1_DET_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_RTC_ALARM, 6, BD70528_INT_RTC_ALARM_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_ELPS_TIM, 6, BD70528_INT_ELPS_TIM_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_GPIO0, 7, BD70528_INT_GPIO0_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_GPIO1, 7, BD70528_INT_GPIO1_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_GPIO2, 7, BD70528_INT_GPIO2_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_GPIO3, 7, BD70528_INT_GPIO3_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK1_DVS_OPFAIL, 8,
+		       BD70528_INT_BUCK1_DVS_OPFAIL_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK2_DVS_OPFAIL, 8,
+		       BD70528_INT_BUCK2_DVS_OPFAIL_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_BUCK3_DVS_OPFAIL, 8,
+		       BD70528_INT_BUCK3_DVS_OPFAIL_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED1_VOLT_OPFAIL, 8,
+		       BD70528_INT_LED1_VOLT_OPFAIL_MASK),
+	REGMAP_IRQ_REG(BD70528_INT_LED2_VOLT_OPFAIL, 8,
+		       BD70528_INT_LED2_VOLT_OPFAIL_MASK),
+};
+
+static struct regmap_irq_chip bd70528_irq_chip = {
+	.name = "bd70528_irq",
+	.main_status = BD70528_REG_INT_MAIN,
+	.irqs = &bd70528_irqs[0],
+	.num_irqs = ARRAY_SIZE(bd70528_irqs),
+	.status_base = BD70528_REG_INT_SHDN,
+	.mask_base = BD70528_REG_INT_SHDN_MASK,
+	.ack_base = BD70528_REG_INT_SHDN,
+	.type_base = BD70528_REG_GPIO1_IN,
+	.init_ack_masked = true,
+	.num_regs = 9,
+	.num_main_regs = 1,
+	.num_type_reg = 4,
+	.sub_reg_offsets = &bd70528_sub_irq_offsets[0],
+	.num_main_status_bits = 8,
+	.irq_reg_stride = 1,
+};
+
+static int bd70528_i2c_probe(struct i2c_client *i2c,
+			     const struct i2c_device_id *id)
+{
+	struct bd70528_data *bd70528;
+	struct regmap_irq_chip_data *irq_data;
+	int ret, i;
+
+	if (!i2c->irq) {
+		dev_err(&i2c->dev, "No IRQ configured\n");
+		return -EINVAL;
+	}
+
+	bd70528 = devm_kzalloc(&i2c->dev, sizeof(*bd70528), GFP_KERNEL);
+	if (!bd70528)
+		return -ENOMEM;
+
+	mutex_init(&bd70528->rtc_timer_lock);
+
+	dev_set_drvdata(&i2c->dev, &bd70528->chip);
+
+	bd70528->chip.chip_type = ROHM_CHIP_TYPE_BD70528;
+	bd70528->chip.regmap = devm_regmap_init_i2c(i2c, &bd70528_regmap);
+	if (IS_ERR(bd70528->chip.regmap)) {
+		dev_err(&i2c->dev, "Failed to initialize Regmap\n");
+		return PTR_ERR(bd70528->chip.regmap);
+	}
+
+	/*
+	 * Disallow type setting for all IRQs by default as most of them do not
+	 * support setting type.
+	 */
+	for (i = 0; i < ARRAY_SIZE(bd70528_irqs); i++)
+		bd70528_irqs[i].type.types_supported = 0;
+
+	/* Set IRQ typesetting information for GPIO pins 0 - 3 */
+	for (i = 0; i < BD70528_NUM_OF_GPIOS; i++) {
+		struct regmap_irq_type *type;
+
+		type = &bd70528_irqs[BD70528_INT_GPIO0 + i].type;
+		type->type_reg_offset = 2 * i;
+		type->type_rising_val = 0x20;
+		type->type_falling_val = 0x10;
+		type->type_level_high_val = 0x40;
+		type->type_level_low_val = 0x50;
+		type->types_supported = (IRQ_TYPE_EDGE_BOTH |
+				IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW);
+	}
+
+	ret = devm_regmap_add_irq_chip(&i2c->dev, bd70528->chip.regmap,
+				       i2c->irq, IRQF_ONESHOT, 0,
+				       &bd70528_irq_chip, &irq_data);
+	if (ret) {
+		dev_err(&i2c->dev, "Failed to add IRQ chip\n");
+		return ret;
+	}
+	dev_dbg(&i2c->dev, "Registered %d IRQs for chip\n",
+		bd70528_irq_chip.num_irqs);
+
+	/*
+	 * BD70528 IRQ controller is not touching the main mask register.
+	 * So enable the GPIO block interrupts at main level. We can just leave
+	 * them enabled as the IRQ controller should disable IRQs from
+	 * sub-registers when IRQ is disabled or freed.
+	 */
+	ret = regmap_update_bits(bd70528->chip.regmap,
+				 BD70528_REG_INT_MAIN_MASK,
+				 BD70528_INT_GPIO_MASK, 0);
+
+	ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_AUTO,
+				   bd70528_mfd_cells,
+				   ARRAY_SIZE(bd70528_mfd_cells), NULL, 0,
+				   regmap_irq_get_domain(irq_data));
+	if (ret)
+		dev_err(&i2c->dev, "Failed to create subdevices\n");
+
+	return ret;
+}
+
+static const struct of_device_id bd70528_of_match[] = {
+	{ .compatible = "rohm,bd70528", },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, bd70528_of_match);
+
+static struct i2c_driver bd70528_drv = {
+	.driver = {
+		.name = "rohm-bd70528",
+		.of_match_table = bd70528_of_match,
+	},
+	.probe = &bd70528_i2c_probe,
+};
+
+module_i2c_driver(bd70528_drv);
+
+MODULE_AUTHOR("Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>");
+MODULE_DESCRIPTION("ROHM BD70528 Power Management IC driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/rohm-bd70528.h b/include/linux/mfd/rohm-bd70528.h
new file mode 100644
index 000000000000..1013e60c5b25
--- /dev/null
+++ b/include/linux/mfd/rohm-bd70528.h
@@ -0,0 +1,408 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (C) 2018 ROHM Semiconductors */
+
+#ifndef __LINUX_MFD_BD70528_H__
+#define __LINUX_MFD_BD70528_H__
+
+#include <linux/bits.h>
+#include <linux/device.h>
+#include <linux/mfd/rohm-generic.h>
+#include <linux/regmap.h>
+
+enum {
+	BD70528_BUCK1,
+	BD70528_BUCK2,
+	BD70528_BUCK3,
+	BD70528_LDO1,
+	BD70528_LDO2,
+	BD70528_LDO3,
+	BD70528_LED1,
+	BD70528_LED2,
+};
+
+struct bd70528_data {
+	struct rohm_regmap_dev chip;
+	struct mutex rtc_timer_lock;
+};
+
+#define BD70528_BUCK_VOLTS 17
+#define BD70528_BUCK_VOLTS 17
+#define BD70528_BUCK_VOLTS 17
+#define BD70528_LDO_VOLTS 0x20
+
+#define BD70528_REG_BUCK1_EN	0x0F
+#define BD70528_REG_BUCK1_VOLT	0x15
+#define BD70528_REG_BUCK2_EN	0x10
+#define BD70528_REG_BUCK2_VOLT	0x16
+#define BD70528_REG_BUCK3_EN	0x11
+#define BD70528_REG_BUCK3_VOLT	0x17
+#define BD70528_REG_LDO1_EN	0x1b
+#define BD70528_REG_LDO1_VOLT	0x1e
+#define BD70528_REG_LDO2_EN	0x1c
+#define BD70528_REG_LDO2_VOLT	0x1f
+#define BD70528_REG_LDO3_EN	0x1d
+#define BD70528_REG_LDO3_VOLT	0x20
+#define BD70528_REG_LED_CTRL	0x2b
+#define BD70528_REG_LED_VOLT	0x29
+#define BD70528_REG_LED_EN	0x2a
+
+/* main irq registers */
+#define BD70528_REG_INT_MAIN	0x7E
+#define BD70528_REG_INT_MAIN_MASK 0x74
+
+/* 'sub irq' registers */
+#define BD70528_REG_INT_SHDN	0x7F
+#define BD70528_REG_INT_PWR_FLT	0x80
+#define BD70528_REG_INT_VR_FLT	0x81
+#define BD70528_REG_INT_MISC	0x82
+#define BD70528_REG_INT_BAT1	0x83
+#define BD70528_REG_INT_BAT2	0x84
+#define BD70528_REG_INT_RTC	0x85
+#define BD70528_REG_INT_GPIO	0x86
+#define BD70528_REG_INT_OP_FAIL	0x87
+
+#define BD70528_REG_INT_SHDN_MASK	0x75
+#define BD70528_REG_INT_PWR_FLT_MASK	0x76
+#define BD70528_REG_INT_VR_FLT_MASK	0x77
+#define BD70528_REG_INT_MISC_MASK	0x78
+#define BD70528_REG_INT_BAT1_MASK	0x79
+#define BD70528_REG_INT_BAT2_MASK	0x7a
+#define BD70528_REG_INT_RTC_MASK	0x7b
+#define BD70528_REG_INT_GPIO_MASK	0x7c
+#define BD70528_REG_INT_OP_FAIL_MASK	0x7d
+
+/* Reset related 'magic' registers */
+#define BD70528_REG_SHIPMODE	0x03
+#define BD70528_REG_HWRESET	0x04
+#define BD70528_REG_WARMRESET	0x05
+#define BD70528_REG_STANDBY	0x06
+
+/* GPIO registers */
+#define BD70528_REG_GPIO_STATE	0x8F
+
+#define BD70528_REG_GPIO1_IN	0x4d
+#define BD70528_REG_GPIO2_IN	0x4f
+#define BD70528_REG_GPIO3_IN	0x51
+#define BD70528_REG_GPIO4_IN	0x53
+#define BD70528_REG_GPIO1_OUT	0x4e
+#define BD70528_REG_GPIO2_OUT	0x50
+#define BD70528_REG_GPIO3_OUT	0x52
+#define BD70528_REG_GPIO4_OUT	0x54
+
+/* clk control */
+
+#define BD70528_REG_CLK_OUT	0x2c
+
+/* RTC */
+
+#define BD70528_REG_RTC_COUNT_H		0x2d
+#define BD70528_REG_RTC_COUNT_L		0x2e
+#define BD70528_REG_RTC_SEC		0x2f
+#define BD70528_REG_RTC_MINUTE		0x30
+#define BD70528_REG_RTC_HOUR		0x31
+#define BD70528_REG_RTC_WEEK		0x32
+#define BD70528_REG_RTC_DAY		0x33
+#define BD70528_REG_RTC_MONTH		0x34
+#define BD70528_REG_RTC_YEAR		0x35
+
+#define BD70528_REG_RTC_ALM_SEC		0x36
+#define BD70528_REG_RTC_ALM_START	BD70528_REG_RTC_ALM_SEC
+#define BD70528_REG_RTC_ALM_MINUTE	0x37
+#define BD70528_REG_RTC_ALM_HOUR	0x38
+#define BD70528_REG_RTC_ALM_WEEK	0x39
+#define BD70528_REG_RTC_ALM_DAY		0x3a
+#define BD70528_REG_RTC_ALM_MONTH	0x3b
+#define BD70528_REG_RTC_ALM_YEAR	0x3c
+#define BD70528_REG_RTC_ALM_MASK	0x3d
+#define BD70528_REG_RTC_ALM_REPEAT	0x3e
+#define BD70528_REG_RTC_START		BD70528_REG_RTC_SEC
+
+#define BD70528_REG_RTC_WAKE_SEC	0x43
+#define BD70528_REG_RTC_WAKE_START	BD70528_REG_RTC_WAKE_SEC
+#define BD70528_REG_RTC_WAKE_MIN	0x44
+#define BD70528_REG_RTC_WAKE_HOUR	0x45
+#define BD70528_REG_RTC_WAKE_CTRL	0x46
+
+#define BD70528_REG_ELAPSED_TIMER_EN	0x42
+#define BD70528_REG_WAKE_EN		0x46
+
+/* WDT registers */
+#define BD70528_REG_WDT_CTRL		0x4A
+#define BD70528_REG_WDT_HOUR		0x49
+#define BD70528_REG_WDT_MINUTE		0x48
+#define BD70528_REG_WDT_SEC		0x47
+
+/* Charger / Battery */
+#define BD70528_REG_CHG_CURR_STAT	0x59
+#define BD70528_REG_CHG_BAT_STAT	0x57
+#define BD70528_REG_CHG_BAT_TEMP	0x58
+#define BD70528_REG_CHG_IN_STAT		0x56
+#define BD70528_REG_CHG_DCIN_ILIM	0x5d
+#define BD70528_REG_CHG_CHG_CURR_WARM	0x61
+#define BD70528_REG_CHG_CHG_CURR_COLD	0x62
+
+/* Masks for main IRQ register bits */
+enum {
+	BD70528_INT_SHDN,
+#define BD70528_INT_SHDN_MASK BIT(BD70528_INT_SHDN)
+	BD70528_INT_PWR_FLT,
+#define BD70528_INT_PWR_FLT_MASK BIT(BD70528_INT_PWR_FLT)
+	BD70528_INT_VR_FLT,
+#define BD70528_INT_VR_FLT_MASK BIT(BD70528_INT_VR_FLT)
+	BD70528_INT_MISC,
+#define BD70528_INT_MISC_MASK BIT(BD70528_INT_MISC)
+	BD70528_INT_BAT1,
+#define BD70528_INT_BAT1_MASK BIT(BD70528_INT_BAT1)
+	BD70528_INT_RTC,
+#define BD70528_INT_RTC_MASK BIT(BD70528_INT_RTC)
+	BD70528_INT_GPIO,
+#define BD70528_INT_GPIO_MASK BIT(BD70528_INT_GPIO)
+	BD70528_INT_OP_FAIL,
+#define BD70528_INT_OP_FAIL_MASK BIT(BD70528_INT_OP_FAIL)
+};
+
+/* IRQs */
+enum {
+	/* Shutdown register IRQs */
+	BD70528_INT_LONGPUSH,
+	BD70528_INT_WDT,
+	BD70528_INT_HWRESET,
+	BD70528_INT_RSTB_FAULT,
+	BD70528_INT_VBAT_UVLO,
+	BD70528_INT_TSD,
+	BD70528_INT_RSTIN,
+	/* Power failure register IRQs */
+	BD70528_INT_BUCK1_FAULT,
+	BD70528_INT_BUCK2_FAULT,
+	BD70528_INT_BUCK3_FAULT,
+	BD70528_INT_LDO1_FAULT,
+	BD70528_INT_LDO2_FAULT,
+	BD70528_INT_LDO3_FAULT,
+	BD70528_INT_LED1_FAULT,
+	BD70528_INT_LED2_FAULT,
+	/* VR FAULT register IRQs */
+	BD70528_INT_BUCK1_OCP,
+	BD70528_INT_BUCK2_OCP,
+	BD70528_INT_BUCK3_OCP,
+	BD70528_INT_LED1_OCP,
+	BD70528_INT_LED2_OCP,
+	BD70528_INT_BUCK1_FULLON,
+	BD70528_INT_BUCK2_FULLON,
+	/* PMU register interrupts */
+	BD70528_INT_SHORTPUSH,
+	BD70528_INT_AUTO_WAKEUP,
+	BD70528_INT_STATE_CHANGE,
+	/* Charger 1 register IRQs */
+	BD70528_INT_BAT_OV_RES,
+	BD70528_INT_BAT_OV_DET,
+	BD70528_INT_DBAT_DET,
+	BD70528_INT_BATTSD_COLD_RES,
+	BD70528_INT_BATTSD_COLD_DET,
+	BD70528_INT_BATTSD_HOT_RES,
+	BD70528_INT_BATTSD_HOT_DET,
+	BD70528_INT_CHG_TSD,
+	/* Charger 2 register IRQs */
+	BD70528_INT_BAT_RMV,
+	BD70528_INT_BAT_DET,
+	BD70528_INT_DCIN2_OV_RES,
+	BD70528_INT_DCIN2_OV_DET,
+	BD70528_INT_DCIN2_RMV,
+	BD70528_INT_DCIN2_DET,
+	BD70528_INT_DCIN1_RMV,
+	BD70528_INT_DCIN1_DET,
+	/* RTC register IRQs */
+	BD70528_INT_RTC_ALARM,
+	BD70528_INT_ELPS_TIM,
+	/* GPIO register IRQs */
+	BD70528_INT_GPIO0,
+	BD70528_INT_GPIO1,
+	BD70528_INT_GPIO2,
+	BD70528_INT_GPIO3,
+	/* Invalid operation register IRQs */
+	BD70528_INT_BUCK1_DVS_OPFAIL,
+	BD70528_INT_BUCK2_DVS_OPFAIL,
+	BD70528_INT_BUCK3_DVS_OPFAIL,
+	BD70528_INT_LED1_VOLT_OPFAIL,
+	BD70528_INT_LED2_VOLT_OPFAIL,
+};
+
+/* Masks */
+#define BD70528_INT_LONGPUSH_MASK 0x1
+#define BD70528_INT_WDT_MASK 0x2
+#define BD70528_INT_HWRESET_MASK 0x4
+#define BD70528_INT_RSTB_FAULT_MASK 0x8
+#define BD70528_INT_VBAT_UVLO_MASK 0x10
+#define BD70528_INT_TSD_MASK 0x20
+#define BD70528_INT_RSTIN_MASK 0x40
+
+#define BD70528_INT_BUCK1_FAULT_MASK 0x1
+#define BD70528_INT_BUCK2_FAULT_MASK 0x2
+#define BD70528_INT_BUCK3_FAULT_MASK 0x4
+#define BD70528_INT_LDO1_FAULT_MASK 0x8
+#define BD70528_INT_LDO2_FAULT_MASK 0x10
+#define BD70528_INT_LDO3_FAULT_MASK 0x20
+#define BD70528_INT_LED1_FAULT_MASK 0x40
+#define BD70528_INT_LED2_FAULT_MASK 0x80
+
+#define BD70528_INT_BUCK1_OCP_MASK 0x1
+#define BD70528_INT_BUCK2_OCP_MASK 0x2
+#define BD70528_INT_BUCK3_OCP_MASK 0x4
+#define BD70528_INT_LED1_OCP_MASK 0x8
+#define BD70528_INT_LED2_OCP_MASK 0x10
+#define BD70528_INT_BUCK1_FULLON_MASK 0x20
+#define BD70528_INT_BUCK2_FULLON_MASK 0x40
+
+#define BD70528_INT_SHORTPUSH_MASK 0x1
+#define BD70528_INT_AUTO_WAKEUP_MASK 0x2
+#define BD70528_INT_STATE_CHANGE_MASK 0x10
+
+#define BD70528_INT_BAT_OV_RES_MASK 0x1
+#define BD70528_INT_BAT_OV_DET_MASK 0x2
+#define BD70528_INT_DBAT_DET_MASK 0x4
+#define BD70528_INT_BATTSD_COLD_RES_MASK 0x8
+#define BD70528_INT_BATTSD_COLD_DET_MASK 0x10
+#define BD70528_INT_BATTSD_HOT_RES_MASK 0x20
+#define BD70528_INT_BATTSD_HOT_DET_MASK 0x40
+#define BD70528_INT_CHG_TSD_MASK 0x80
+
+#define BD70528_INT_BAT_RMV_MASK 0x1
+#define BD70528_INT_BAT_DET_MASK 0x2
+#define BD70528_INT_DCIN2_OV_RES_MASK 0x4
+#define BD70528_INT_DCIN2_OV_DET_MASK 0x8
+#define BD70528_INT_DCIN2_RMV_MASK 0x10
+#define BD70528_INT_DCIN2_DET_MASK 0x20
+#define BD70528_INT_DCIN1_RMV_MASK 0x40
+#define BD70528_INT_DCIN1_DET_MASK 0x80
+
+#define BD70528_INT_RTC_ALARM_MASK 0x1
+#define BD70528_INT_ELPS_TIM_MASK 0x2
+
+#define BD70528_INT_GPIO0_MASK 0x1
+#define BD70528_INT_GPIO1_MASK 0x2
+#define BD70528_INT_GPIO2_MASK 0x4
+#define BD70528_INT_GPIO3_MASK 0x8
+
+#define BD70528_INT_BUCK1_DVS_OPFAIL_MASK 0x1
+#define BD70528_INT_BUCK2_DVS_OPFAIL_MASK 0x2
+#define BD70528_INT_BUCK3_DVS_OPFAIL_MASK 0x4
+#define BD70528_INT_LED1_VOLT_OPFAIL_MASK 0x10
+#define BD70528_INT_LED2_VOLT_OPFAIL_MASK 0x20
+
+#define BD70528_DEBOUNCE_MASK 0x3
+
+#define BD70528_DEBOUNCE_DISABLE 0
+#define BD70528_DEBOUNCE_15MS 1
+#define BD70528_DEBOUNCE_30MS 2
+#define BD70528_DEBOUNCE_50MS 3
+
+#define BD70528_GPIO_DRIVE_MASK 0x2
+#define BD70528_GPIO_PUSH_PULL 0x0
+#define BD70528_GPIO_OPEN_DRAIN 0x2
+
+#define BD70528_GPIO_OUT_EN_MASK 0x80
+#define BD70528_GPIO_OUT_ENABLE 0x80
+#define BD70528_GPIO_OUT_DISABLE 0x0
+
+#define BD70528_GPIO_OUT_HI 0x1
+#define BD70528_GPIO_OUT_LO 0x0
+#define BD70528_GPIO_OUT_MASK 0x1
+
+#define BD70528_GPIO_IN_STATE_BASE 1
+
+#define BD70528_CLK_OUT_EN_MASK 0x1
+
+/* RTC masks to mask out reserved bits */
+
+#define BD70528_MASK_RTC_SEC		0x7f
+#define BD70528_MASK_RTC_MINUTE		0x7f
+#define BD70528_MASK_RTC_HOUR_24H	0x80
+#define BD70528_MASK_RTC_HOUR_PM	0x20
+#define BD70528_MASK_RTC_HOUR		0x1f
+#define BD70528_MASK_RTC_DAY		0x3f
+#define BD70528_MASK_RTC_WEEK		0x07
+#define BD70528_MASK_RTC_MONTH		0x1f
+#define BD70528_MASK_RTC_YEAR		0xff
+#define BD70528_MASK_RTC_COUNT_L	0x7f
+
+#define BD70528_MASK_ELAPSED_TIMER_EN	0x1
+/* Mask second, min and hour fields
+ * HW would support ALM irq for over 24h
+ * (by setting day, month and year too)
+ * but as we wish to keep this same as for
+ * wake-up we limit ALM to 24H and only
+ * unmask sec, min and hour
+ */
+#define BD70528_MASK_ALM_EN		0x7
+#define BD70528_MASK_WAKE_EN		0x1
+
+/* WDT masks */
+#define BD70528_MASK_WDT_EN		0x1
+#define BD70528_MASK_WDT_HOUR		0x1
+#define BD70528_MASK_WDT_MINUTE		0x7f
+#define BD70528_MASK_WDT_SEC		0x7f
+
+#define BD70528_WDT_STATE_BIT		0x1
+#define BD70528_ELAPSED_STATE_BIT	0x2
+#define BD70528_WAKE_STATE_BIT		0x4
+
+/* Charger masks */
+#define BD70528_MASK_CHG_STAT		0x7f
+#define BD70528_MASK_CHG_BAT_TIMER	0x20
+#define BD70528_MASK_CHG_BAT_OVERVOLT	0x10
+#define BD70528_MASK_CHG_BAT_DETECT	0x1
+#define BD70528_MASK_CHG_DCIN1_UVLO	0x1
+#define BD70528_MASK_CHG_DCIN_ILIM	0x3f
+#define BD70528_MASK_CHG_CHG_CURR	0x1f
+#define BD70528_MASK_CHG_TRICKLE_CURR	0x10
+
+/*
+ * Note, external battery register is the lonely rider at
+ * address 0xc5. See how to stuff that in the regmap
+ */
+#define BD70528_MAX_REGISTER 0x94
+
+/* Buck control masks */
+#define BD70528_MASK_RUN_EN	0x4
+#define BD70528_MASK_STBY_EN	0x2
+#define BD70528_MASK_IDLE_EN	0x1
+#define BD70528_MASK_LED1_EN	0x1
+#define BD70528_MASK_LED2_EN	0x10
+
+#define BD70528_MASK_BUCK_VOLT	0xf
+#define BD70528_MASK_LDO_VOLT	0x1f
+#define BD70528_MASK_LED1_VOLT	0x1
+#define BD70528_MASK_LED2_VOLT	0x10
+
+/* Misc irq masks */
+#define BD70528_INT_MASK_SHORT_PUSH	1
+#define BD70528_INT_MASK_AUTO_WAKE	2
+#define BD70528_INT_MASK_POWER_STATE	4
+
+#define BD70528_MASK_BUCK_RAMP 0x10
+#define BD70528_SIFT_BUCK_RAMP 4
+
+#if IS_ENABLED(CONFIG_BD70528_WATCHDOG)
+
+int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable, int *old_state);
+void bd70528_wdt_lock(struct rohm_regmap_dev *data);
+void bd70528_wdt_unlock(struct rohm_regmap_dev *data);
+
+#else /* CONFIG_BD70528_WATCHDOG */
+
+static inline int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable,
+				  int *old_state)
+{
+	return 0;
+}
+
+static inline void bd70528_wdt_lock(struct rohm_regmap_dev *data)
+{
+}
+
+static inline void bd70528_wdt_unlock(struct rohm_regmap_dev *data)
+{
+}
+
+#endif /* CONFIG_BD70528_WATCHDOG */
+
+#endif /* __LINUX_MFD_BD70528_H__ */
-- 
cgit v1.2.3


From 980af75ede4f36107b98aa5c247359b87c6afc30 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Wed, 12 Jun 2019 22:13:24 +0200
Subject: thermal/drivers/core: Add init section table for self-encapsulation

Currently the governors are declared in their respective files but they
export their [un]register functions which in turn call the [un]register
governors core's functions. That implies a cyclic dependency which is
not desirable. There is a way to self-encapsulate the governors by letting
them to declare themselves in a __init section table.

Define the table in the asm generic linker description like the other
tables and provide the specific macros to deal with.

Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 drivers/thermal/thermal_core.h    | 15 +++++++++++++++
 include/asm-generic/vmlinux.lds.h | 11 +++++++++++
 2 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 0df190ed82a7..06778cec8416 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -15,6 +15,21 @@
 /* Initial state of a cooling device during binding */
 #define THERMAL_NO_TARGET -1UL
 
+/* Init section thermal table */
+extern struct thermal_governor *__governor_thermal_table[];
+extern struct thermal_governor *__governor_thermal_table_end[];
+
+#define THERMAL_TABLE_ENTRY(table, name)			\
+	static typeof(name) *__thermal_table_entry_##name	\
+	__used __section(__##table##_thermal_table) = &name
+
+#define THERMAL_GOVERNOR_DECLARE(name)	THERMAL_TABLE_ENTRY(governor, name)
+
+#define for_each_governor_table(__governor)		\
+	for (__governor = __governor_thermal_table;	\
+	     __governor < __governor_thermal_table_end;	\
+	     __governor++)
+
 /*
  * This structure is used to describe the behavior of
  * a certain cooling device on a certain trip point
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 088987e9a3ea..5a06822fcd6c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -239,6 +239,16 @@
 #define ACPI_PROBE_TABLE(name)
 #endif
 
+#ifdef CONFIG_THERMAL
+#define THERMAL_TABLE(name)						\
+	. = ALIGN(8);							\
+	__##name##_thermal_table = .;					\
+	KEEP(*(__##name##_thermal_table))				\
+	__##name##_thermal_table_end = .;
+#else
+#define THERMAL_TABLE(name)
+#endif
+
 #define KERNEL_DTB()							\
 	STRUCT_ALIGN();							\
 	__dtb_start = .;						\
@@ -608,6 +618,7 @@
 	IRQCHIP_OF_MATCH_TABLE()					\
 	ACPI_PROBE_TABLE(irqchip)					\
 	ACPI_PROBE_TABLE(timer)						\
+	THERMAL_TABLE(governor)						\
 	EARLYCON_TABLE()						\
 	LSM_TABLE()
 
-- 
cgit v1.2.3


From 6bbe6f5732faeabb4bb583726ec2d7f9739532bd Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Tue, 18 Jun 2019 18:05:28 -0300
Subject: docs: thermal: convert to ReST

Rename the thermal documentation files to ReST, add an
index for them and adjust in order to produce a nice html
output via the Sphinx build system.

At its new index.rst, let's add a :orphan: while this is not linked to
the main index.rst file, in order to avoid build warnings.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 Documentation/thermal/cpu-cooling-api.rst          | 107 +++
 Documentation/thermal/cpu-cooling-api.txt          |  92 ---
 Documentation/thermal/exynos_thermal               |  77 --
 Documentation/thermal/exynos_thermal.rst           |  90 +++
 Documentation/thermal/exynos_thermal_emulation     |  53 --
 Documentation/thermal/exynos_thermal_emulation.rst |  61 ++
 Documentation/thermal/index.rst                    |  18 +
 Documentation/thermal/intel_powerclamp.rst         | 320 +++++++++
 Documentation/thermal/intel_powerclamp.txt         | 317 --------
 Documentation/thermal/nouveau_thermal              |  82 ---
 Documentation/thermal/nouveau_thermal.rst          |  96 +++
 Documentation/thermal/power_allocator.rst          | 271 +++++++
 Documentation/thermal/power_allocator.txt          | 247 -------
 Documentation/thermal/sysfs-api.rst                | 798 +++++++++++++++++++++
 Documentation/thermal/sysfs-api.txt                | 636 ----------------
 Documentation/thermal/x86_pkg_temperature_thermal  |  47 --
 .../thermal/x86_pkg_temperature_thermal.rst        |  55 ++
 MAINTAINERS                                        |   2 +-
 include/linux/thermal.h                            |   4 +-
 19 files changed, 1819 insertions(+), 1554 deletions(-)
 create mode 100644 Documentation/thermal/cpu-cooling-api.rst
 delete mode 100644 Documentation/thermal/cpu-cooling-api.txt
 delete mode 100644 Documentation/thermal/exynos_thermal
 create mode 100644 Documentation/thermal/exynos_thermal.rst
 delete mode 100644 Documentation/thermal/exynos_thermal_emulation
 create mode 100644 Documentation/thermal/exynos_thermal_emulation.rst
 create mode 100644 Documentation/thermal/index.rst
 create mode 100644 Documentation/thermal/intel_powerclamp.rst
 delete mode 100644 Documentation/thermal/intel_powerclamp.txt
 delete mode 100644 Documentation/thermal/nouveau_thermal
 create mode 100644 Documentation/thermal/nouveau_thermal.rst
 create mode 100644 Documentation/thermal/power_allocator.rst
 delete mode 100644 Documentation/thermal/power_allocator.txt
 create mode 100644 Documentation/thermal/sysfs-api.rst
 delete mode 100644 Documentation/thermal/sysfs-api.txt
 delete mode 100644 Documentation/thermal/x86_pkg_temperature_thermal
 create mode 100644 Documentation/thermal/x86_pkg_temperature_thermal.rst

(limited to 'include')

diff --git a/Documentation/thermal/cpu-cooling-api.rst b/Documentation/thermal/cpu-cooling-api.rst
new file mode 100644
index 000000000000..645d914c45a6
--- /dev/null
+++ b/Documentation/thermal/cpu-cooling-api.rst
@@ -0,0 +1,107 @@
+=======================
+CPU cooling APIs How To
+=======================
+
+Written by Amit Daniel Kachhap <amit.kachhap@linaro.org>
+
+Updated: 6 Jan 2015
+
+Copyright (c)  2012 Samsung Electronics Co., Ltd(http://www.samsung.com)
+
+0. Introduction
+===============
+
+The generic cpu cooling(freq clipping) provides registration/unregistration APIs
+to the caller. The binding of the cooling devices to the trip point is left for
+the user. The registration APIs returns the cooling device pointer.
+
+1. cpu cooling APIs
+===================
+
+1.1 cpufreq registration/unregistration APIs
+--------------------------------------------
+
+    ::
+
+	struct thermal_cooling_device
+	*cpufreq_cooling_register(struct cpumask *clip_cpus)
+
+    This interface function registers the cpufreq cooling device with the name
+    "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
+    cooling devices.
+
+   clip_cpus:
+	cpumask of cpus where the frequency constraints will happen.
+
+    ::
+
+	struct thermal_cooling_device
+	*of_cpufreq_cooling_register(struct cpufreq_policy *policy)
+
+    This interface function registers the cpufreq cooling device with
+    the name "thermal-cpufreq-%x" linking it with a device tree node, in
+    order to bind it via the thermal DT code. This api can support multiple
+    instances of cpufreq cooling devices.
+
+    policy:
+	CPUFreq policy.
+
+
+    ::
+
+	void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
+
+    This interface function unregisters the "thermal-cpufreq-%x" cooling device.
+
+    cdev: Cooling device pointer which has to be unregistered.
+
+2. Power models
+===============
+
+The power API registration functions provide a simple power model for
+CPUs.  The current power is calculated as dynamic power (static power isn't
+supported currently).  This power model requires that the operating-points of
+the CPUs are registered using the kernel's opp library and the
+`cpufreq_frequency_table` is assigned to the `struct device` of the
+cpu.  If you are using CONFIG_CPUFREQ_DT then the
+`cpufreq_frequency_table` should already be assigned to the cpu
+device.
+
+The dynamic power consumption of a processor depends on many factors.
+For a given processor implementation the primary factors are:
+
+- The time the processor spends running, consuming dynamic power, as
+  compared to the time in idle states where dynamic consumption is
+  negligible.  Herein we refer to this as 'utilisation'.
+- The voltage and frequency levels as a result of DVFS.  The DVFS
+  level is a dominant factor governing power consumption.
+- In running time the 'execution' behaviour (instruction types, memory
+  access patterns and so forth) causes, in most cases, a second order
+  variation.  In pathological cases this variation can be significant,
+  but typically it is of a much lesser impact than the factors above.
+
+A high level dynamic power consumption model may then be represented as::
+
+	Pdyn = f(run) * Voltage^2 * Frequency * Utilisation
+
+f(run) here represents the described execution behaviour and its
+result has a units of Watts/Hz/Volt^2 (this often expressed in
+mW/MHz/uVolt^2)
+
+The detailed behaviour for f(run) could be modelled on-line.  However,
+in practice, such an on-line model has dependencies on a number of
+implementation specific processor support and characterisation
+factors.  Therefore, in initial implementation that contribution is
+represented as a constant coefficient.  This is a simplification
+consistent with the relative contribution to overall power variation.
+
+In this simplified representation our model becomes::
+
+	Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation
+
+Where `capacitance` is a constant that represents an indicative
+running time dynamic power coefficient in fundamental units of
+mW/MHz/uVolt^2.  Typical values for mobile CPUs might lie in range
+from 100 to 500.  For reference, the approximate values for the SoC in
+ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and
+140 for the Cortex-A53 cluster.
diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt
deleted file mode 100644
index 7df567eaea1a..000000000000
--- a/Documentation/thermal/cpu-cooling-api.txt
+++ /dev/null
@@ -1,92 +0,0 @@
-CPU cooling APIs How To
-===================================
-
-Written by Amit Daniel Kachhap <amit.kachhap@linaro.org>
-
-Updated: 6 Jan 2015
-
-Copyright (c)  2012 Samsung Electronics Co., Ltd(http://www.samsung.com)
-
-0. Introduction
-
-The generic cpu cooling(freq clipping) provides registration/unregistration APIs
-to the caller. The binding of the cooling devices to the trip point is left for
-the user. The registration APIs returns the cooling device pointer.
-
-1. cpu cooling APIs
-
-1.1 cpufreq registration/unregistration APIs
-1.1.1 struct thermal_cooling_device *cpufreq_cooling_register(
-	struct cpumask *clip_cpus)
-
-    This interface function registers the cpufreq cooling device with the name
-    "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
-    cooling devices.
-
-   clip_cpus: cpumask of cpus where the frequency constraints will happen.
-
-1.1.2 struct thermal_cooling_device *of_cpufreq_cooling_register(
-					struct cpufreq_policy *policy)
-
-    This interface function registers the cpufreq cooling device with
-    the name "thermal-cpufreq-%x" linking it with a device tree node, in
-    order to bind it via the thermal DT code. This api can support multiple
-    instances of cpufreq cooling devices.
-
-    policy: CPUFreq policy.
-
-1.1.3 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
-
-    This interface function unregisters the "thermal-cpufreq-%x" cooling device.
-
-    cdev: Cooling device pointer which has to be unregistered.
-
-2. Power models
-
-The power API registration functions provide a simple power model for
-CPUs.  The current power is calculated as dynamic power (static power isn't
-supported currently).  This power model requires that the operating-points of
-the CPUs are registered using the kernel's opp library and the
-`cpufreq_frequency_table` is assigned to the `struct device` of the
-cpu.  If you are using CONFIG_CPUFREQ_DT then the
-`cpufreq_frequency_table` should already be assigned to the cpu
-device.
-
-The dynamic power consumption of a processor depends on many factors.
-For a given processor implementation the primary factors are:
-
-- The time the processor spends running, consuming dynamic power, as
-  compared to the time in idle states where dynamic consumption is
-  negligible.  Herein we refer to this as 'utilisation'.
-- The voltage and frequency levels as a result of DVFS.  The DVFS
-  level is a dominant factor governing power consumption.
-- In running time the 'execution' behaviour (instruction types, memory
-  access patterns and so forth) causes, in most cases, a second order
-  variation.  In pathological cases this variation can be significant,
-  but typically it is of a much lesser impact than the factors above.
-
-A high level dynamic power consumption model may then be represented as:
-
-Pdyn = f(run) * Voltage^2 * Frequency * Utilisation
-
-f(run) here represents the described execution behaviour and its
-result has a units of Watts/Hz/Volt^2 (this often expressed in
-mW/MHz/uVolt^2)
-
-The detailed behaviour for f(run) could be modelled on-line.  However,
-in practice, such an on-line model has dependencies on a number of
-implementation specific processor support and characterisation
-factors.  Therefore, in initial implementation that contribution is
-represented as a constant coefficient.  This is a simplification
-consistent with the relative contribution to overall power variation.
-
-In this simplified representation our model becomes:
-
-Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation
-
-Where `capacitance` is a constant that represents an indicative
-running time dynamic power coefficient in fundamental units of
-mW/MHz/uVolt^2.  Typical values for mobile CPUs might lie in range
-from 100 to 500.  For reference, the approximate values for the SoC in
-ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and
-140 for the Cortex-A53 cluster.
diff --git a/Documentation/thermal/exynos_thermal b/Documentation/thermal/exynos_thermal
deleted file mode 100644
index 9010c4416967..000000000000
--- a/Documentation/thermal/exynos_thermal
+++ /dev/null
@@ -1,77 +0,0 @@
-Kernel driver exynos_tmu
-=================
-
-Supported chips:
-* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC
-  Datasheet: Not publicly available
-
-Authors: Donggeun Kim <dg77.kim@samsung.com>
-Authors: Amit Daniel <amit.daniel@samsung.com>
-
-TMU controller Description:
----------------------------
-
-This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC.
-
-The chip only exposes the measured 8-bit temperature code value
-through a register.
-Temperature can be taken from the temperature code.
-There are three equations converting from temperature to temperature code.
-
-The three equations are:
-  1. Two point trimming
-	Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1
-
-  2. One point trimming
-	Tc = T + TI1 - 25
-
-  3. No trimming
-	Tc = T + 50
-
-  Tc: Temperature code, T: Temperature,
-  TI1: Trimming info for 25 degree Celsius (stored at TRIMINFO register)
-       Temperature code measured at 25 degree Celsius which is unchanged
-  TI2: Trimming info for 85 degree Celsius (stored at TRIMINFO register)
-       Temperature code measured at 85 degree Celsius which is unchanged
-
-TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt
-when temperature exceeds pre-defined levels.
-The maximum number of configurable threshold is five.
-The threshold levels are defined as follows:
-  Level_0: current temperature > trigger_level_0 + threshold
-  Level_1: current temperature > trigger_level_1 + threshold
-  Level_2: current temperature > trigger_level_2 + threshold
-  Level_3: current temperature > trigger_level_3 + threshold
-
-  The threshold and each trigger_level are set
-  through the corresponding registers.
-
-When an interrupt occurs, this driver notify kernel thermal framework
-with the function exynos_report_trigger.
-Although an interrupt condition for level_0 can be set,
-it can be used to synchronize the cooling action.
-
-TMU driver description:
------------------------
-
-The exynos thermal driver is structured as,
-
-					Kernel Core thermal framework
-				(thermal_core.c, step_wise.c, cpu_cooling.c)
-								^
-								|
-								|
-TMU configuration data -------> TMU Driver  <------> Exynos Core thermal wrapper
-(exynos_tmu_data.c)	      (exynos_tmu.c)	   (exynos_thermal_common.c)
-(exynos_tmu_data.h)	      (exynos_tmu.h)	   (exynos_thermal_common.h)
-
-a) TMU configuration data: This consist of TMU register offsets/bitfields
-		described through structure exynos_tmu_registers. Also several
-		other platform data (struct exynos_tmu_platform_data) members
-		are used to configure the TMU.
-b) TMU driver: This component initialises the TMU controller and sets different
-		thresholds. It invokes core thermal implementation with the call
-		exynos_report_trigger.
-c) Exynos Core thermal wrapper: This provides 3 wrapper function to use the
-		Kernel core thermal framework. They are exynos_unregister_thermal,
-		exynos_register_thermal and exynos_report_trigger.
diff --git a/Documentation/thermal/exynos_thermal.rst b/Documentation/thermal/exynos_thermal.rst
new file mode 100644
index 000000000000..5bd556566c70
--- /dev/null
+++ b/Documentation/thermal/exynos_thermal.rst
@@ -0,0 +1,90 @@
+========================
+Kernel driver exynos_tmu
+========================
+
+Supported chips:
+
+* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC
+
+  Datasheet: Not publicly available
+
+Authors: Donggeun Kim <dg77.kim@samsung.com>
+Authors: Amit Daniel <amit.daniel@samsung.com>
+
+TMU controller Description:
+---------------------------
+
+This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC.
+
+The chip only exposes the measured 8-bit temperature code value
+through a register.
+Temperature can be taken from the temperature code.
+There are three equations converting from temperature to temperature code.
+
+The three equations are:
+  1. Two point trimming::
+
+	Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1
+
+  2. One point trimming::
+
+	Tc = T + TI1 - 25
+
+  3. No trimming::
+
+	Tc = T + 50
+
+  Tc:
+       Temperature code, T: Temperature,
+  TI1:
+       Trimming info for 25 degree Celsius (stored at TRIMINFO register)
+       Temperature code measured at 25 degree Celsius which is unchanged
+  TI2:
+       Trimming info for 85 degree Celsius (stored at TRIMINFO register)
+       Temperature code measured at 85 degree Celsius which is unchanged
+
+TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt
+when temperature exceeds pre-defined levels.
+The maximum number of configurable threshold is five.
+The threshold levels are defined as follows::
+
+  Level_0: current temperature > trigger_level_0 + threshold
+  Level_1: current temperature > trigger_level_1 + threshold
+  Level_2: current temperature > trigger_level_2 + threshold
+  Level_3: current temperature > trigger_level_3 + threshold
+
+The threshold and each trigger_level are set
+through the corresponding registers.
+
+When an interrupt occurs, this driver notify kernel thermal framework
+with the function exynos_report_trigger.
+Although an interrupt condition for level_0 can be set,
+it can be used to synchronize the cooling action.
+
+TMU driver description:
+-----------------------
+
+The exynos thermal driver is structured as::
+
+					Kernel Core thermal framework
+				(thermal_core.c, step_wise.c, cpu_cooling.c)
+								^
+								|
+								|
+  TMU configuration data -----> TMU Driver  <----> Exynos Core thermal wrapper
+  (exynos_tmu_data.c)	      (exynos_tmu.c)	   (exynos_thermal_common.c)
+  (exynos_tmu_data.h)	      (exynos_tmu.h)	   (exynos_thermal_common.h)
+
+a) TMU configuration data:
+		This consist of TMU register offsets/bitfields
+		described through structure exynos_tmu_registers. Also several
+		other platform data (struct exynos_tmu_platform_data) members
+		are used to configure the TMU.
+b) TMU driver:
+		This component initialises the TMU controller and sets different
+		thresholds. It invokes core thermal implementation with the call
+		exynos_report_trigger.
+c) Exynos Core thermal wrapper:
+		This provides 3 wrapper function to use the
+		Kernel core thermal framework. They are exynos_unregister_thermal,
+		exynos_register_thermal and exynos_report_trigger.
diff --git a/Documentation/thermal/exynos_thermal_emulation b/Documentation/thermal/exynos_thermal_emulation
deleted file mode 100644
index b15efec6ca28..000000000000
--- a/Documentation/thermal/exynos_thermal_emulation
+++ /dev/null
@@ -1,53 +0,0 @@
-EXYNOS EMULATION MODE
-========================
-
-Copyright (C) 2012 Samsung Electronics
-
-Written by Jonghwa Lee <jonghwa3.lee@samsung.com>
-
-Description
------------
-
-Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal management unit.
-Thermal emulation mode supports software debug for TMU's operation. User can set temperature
-manually with software code and TMU will read current temperature from user value not from
-sensor's value.
-
-Enabling CONFIG_THERMAL_EMULATION option will make this support available.
-When it's enabled, sysfs node will be created as
-/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp.
-
-The sysfs node, 'emul_node', will contain value 0 for the initial state. When you input any
-temperature you want to update to sysfs node, it automatically enable emulation mode and
-current temperature will be changed into it.
-(Exynos also supports user changeable delay time which would be used to delay of
- changing temperature. However, this node only uses same delay of real sensing time, 938us.)
-
-Exynos emulation mode requires synchronous of value changing and enabling. It means when you
-want to update the any value of delay or next temperature, then you have to enable emulation
-mode at the same time. (Or you have to keep the mode enabling.) If you don't, it fails to
-change the value to updated one and just use last succeessful value repeatedly. That's why
-this node gives users the right to change termerpature only. Just one interface makes it more
-simply to use.
-
-Disabling emulation mode only requires writing value 0 to sysfs node.
-
-
-TEMP	120 |
-	    |
-	100 |
-	    |
-	 80 |
-	    |		     	 	 +-----------
-	 60 |      		     	 |	    |
-	    |	           +-------------|          |
-	 40 |              |         	 |          |
-	    |		   |	     	 |          |
-	 20 |		   |	     	 |          +----------
-	    |	 	   |	     	 |          |          |
-	  0 |______________|_____________|__________|__________|_________
-		   A	    	 A	    A	   	       A     TIME
-		   |<----->|	 |<----->|  |<----->|	       |
-		   | 938us |  	 |	 |  |       |          |
-emulation    :  0  50	   |  	 70      |  20      |          0
-current temp :   sensor   50		 70         20	      sensor
diff --git a/Documentation/thermal/exynos_thermal_emulation.rst b/Documentation/thermal/exynos_thermal_emulation.rst
new file mode 100644
index 000000000000..c21d10838bc5
--- /dev/null
+++ b/Documentation/thermal/exynos_thermal_emulation.rst
@@ -0,0 +1,61 @@
+=====================
+Exynos Emulation Mode
+=====================
+
+Copyright (C) 2012 Samsung Electronics
+
+Written by Jonghwa Lee <jonghwa3.lee@samsung.com>
+
+Description
+-----------
+
+Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal
+management unit. Thermal emulation mode supports software debug for
+TMU's operation. User can set temperature manually with software code
+and TMU will read current temperature from user value not from sensor's
+value.
+
+Enabling CONFIG_THERMAL_EMULATION option will make this support
+available. When it's enabled, sysfs node will be created as
+/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp.
+
+The sysfs node, 'emul_node', will contain value 0 for the initial state.
+When you input any temperature you want to update to sysfs node, it
+automatically enable emulation mode and current temperature will be
+changed into it.
+
+(Exynos also supports user changeable delay time which would be used to
+delay of changing temperature. However, this node only uses same delay
+of real sensing time, 938us.)
+
+Exynos emulation mode requires synchronous of value changing and
+enabling. It means when you want to update the any value of delay or
+next temperature, then you have to enable emulation mode at the same
+time. (Or you have to keep the mode enabling.) If you don't, it fails to
+change the value to updated one and just use last succeessful value
+repeatedly. That's why this node gives users the right to change
+termerpature only. Just one interface makes it more simply to use.
+
+Disabling emulation mode only requires writing value 0 to sysfs node.
+
+::
+
+
+  TEMP	120 |
+	    |
+	100 |
+	    |
+	 80 |
+	    |				 +-----------
+	 60 |      			 |	    |
+	    |		   +-------------|          |
+	 40 |              |         	 |          |
+	    |		   |		 |          |
+	 20 |		   |		 |          +----------
+	    |		   |		 |          |          |
+	  0 |______________|_____________|__________|__________|_________
+		   A		 A	    A		       A     TIME
+		   |<----->|	 |<----->|  |<----->|	       |
+		   | 938us |  	 |	 |  |       |          |
+  emulation   : 0  50	   |  	 70      |  20      |          0
+  current temp:   sensor   50		 70         20	      sensor
diff --git a/Documentation/thermal/index.rst b/Documentation/thermal/index.rst
new file mode 100644
index 000000000000..8c1c00146cad
--- /dev/null
+++ b/Documentation/thermal/index.rst
@@ -0,0 +1,18 @@
+:orphan:
+
+=======
+Thermal
+=======
+
+.. toctree::
+   :maxdepth: 1
+
+   cpu-cooling-api
+   sysfs-api
+   power_allocator
+
+   exynos_thermal
+   exynos_thermal_emulation
+   intel_powerclamp
+   nouveau_thermal
+   x86_pkg_temperature_thermal
diff --git a/Documentation/thermal/intel_powerclamp.rst b/Documentation/thermal/intel_powerclamp.rst
new file mode 100644
index 000000000000..3f6dfb0b3ea6
--- /dev/null
+++ b/Documentation/thermal/intel_powerclamp.rst
@@ -0,0 +1,320 @@
+=======================
+Intel Powerclamp Driver
+=======================
+
+By:
+  - Arjan van de Ven <arjan@linux.intel.com>
+  - Jacob Pan <jacob.jun.pan@linux.intel.com>
+
+.. Contents:
+
+	(*) Introduction
+	    - Goals and Objectives
+
+	(*) Theory of Operation
+	    - Idle Injection
+	    - Calibration
+
+	(*) Performance Analysis
+	    - Effectiveness and Limitations
+	    - Power vs Performance
+	    - Scalability
+	    - Calibration
+	    - Comparison with Alternative Techniques
+
+	(*) Usage and Interfaces
+	    - Generic Thermal Layer (sysfs)
+	    - Kernel APIs (TBD)
+
+INTRODUCTION
+============
+
+Consider the situation where a system’s power consumption must be
+reduced at runtime, due to power budget, thermal constraint, or noise
+level, and where active cooling is not preferred. Software managed
+passive power reduction must be performed to prevent the hardware
+actions that are designed for catastrophic scenarios.
+
+Currently, P-states, T-states (clock modulation), and CPU offlining
+are used for CPU throttling.
+
+On Intel CPUs, C-states provide effective power reduction, but so far
+they’re only used opportunistically, based on workload. With the
+development of intel_powerclamp driver, the method of synchronizing
+idle injection across all online CPU threads was introduced. The goal
+is to achieve forced and controllable C-state residency.
+
+Test/Analysis has been made in the areas of power, performance,
+scalability, and user experience. In many cases, clear advantage is
+shown over taking the CPU offline or modulating the CPU clock.
+
+
+THEORY OF OPERATION
+===================
+
+Idle Injection
+--------------
+
+On modern Intel processors (Nehalem or later), package level C-state
+residency is available in MSRs, thus also available to the kernel.
+
+These MSRs are::
+
+      #define MSR_PKG_C2_RESIDENCY      0x60D
+      #define MSR_PKG_C3_RESIDENCY      0x3F8
+      #define MSR_PKG_C6_RESIDENCY      0x3F9
+      #define MSR_PKG_C7_RESIDENCY      0x3FA
+
+If the kernel can also inject idle time to the system, then a
+closed-loop control system can be established that manages package
+level C-state. The intel_powerclamp driver is conceived as such a
+control system, where the target set point is a user-selected idle
+ratio (based on power reduction), and the error is the difference
+between the actual package level C-state residency ratio and the target idle
+ratio.
+
+Injection is controlled by high priority kernel threads, spawned for
+each online CPU.
+
+These kernel threads, with SCHED_FIFO class, are created to perform
+clamping actions of controlled duty ratio and duration. Each per-CPU
+thread synchronizes its idle time and duration, based on the rounding
+of jiffies, so accumulated errors can be prevented to avoid a jittery
+effect. Threads are also bound to the CPU such that they cannot be
+migrated, unless the CPU is taken offline. In this case, threads
+belong to the offlined CPUs will be terminated immediately.
+
+Running as SCHED_FIFO and relatively high priority, also allows such
+scheme to work for both preemptable and non-preemptable kernels.
+Alignment of idle time around jiffies ensures scalability for HZ
+values. This effect can be better visualized using a Perf timechart.
+The following diagram shows the behavior of kernel thread
+kidle_inject/cpu. During idle injection, it runs monitor/mwait idle
+for a given "duration", then relinquishes the CPU to other tasks,
+until the next time interval.
+
+The NOHZ schedule tick is disabled during idle time, but interrupts
+are not masked. Tests show that the extra wakeups from scheduler tick
+have a dramatic impact on the effectiveness of the powerclamp driver
+on large scale systems (Westmere system with 80 processors).
+
+::
+
+  CPU0
+		    ____________          ____________
+  kidle_inject/0   |   sleep    |  mwait |  sleep     |
+	  _________|            |________|            |_______
+				 duration
+  CPU1
+		    ____________          ____________
+  kidle_inject/1   |   sleep    |  mwait |  sleep     |
+	  _________|            |________|            |_______
+				^
+				|
+				|
+				roundup(jiffies, interval)
+
+Only one CPU is allowed to collect statistics and update global
+control parameters. This CPU is referred to as the controlling CPU in
+this document. The controlling CPU is elected at runtime, with a
+policy that favors BSP, taking into account the possibility of a CPU
+hot-plug.
+
+In terms of dynamics of the idle control system, package level idle
+time is considered largely as a non-causal system where its behavior
+cannot be based on the past or current input. Therefore, the
+intel_powerclamp driver attempts to enforce the desired idle time
+instantly as given input (target idle ratio). After injection,
+powerclamp monitors the actual idle for a given time window and adjust
+the next injection accordingly to avoid over/under correction.
+
+When used in a causal control system, such as a temperature control,
+it is up to the user of this driver to implement algorithms where
+past samples and outputs are included in the feedback. For example, a
+PID-based thermal controller can use the powerclamp driver to
+maintain a desired target temperature, based on integral and
+derivative gains of the past samples.
+
+
+
+Calibration
+-----------
+During scalability testing, it is observed that synchronized actions
+among CPUs become challenging as the number of cores grows. This is
+also true for the ability of a system to enter package level C-states.
+
+To make sure the intel_powerclamp driver scales well, online
+calibration is implemented. The goals for doing such a calibration
+are:
+
+a) determine the effective range of idle injection ratio
+b) determine the amount of compensation needed at each target ratio
+
+Compensation to each target ratio consists of two parts:
+
+	a) steady state error compensation
+	This is to offset the error occurring when the system can
+	enter idle without extra wakeups (such as external interrupts).
+
+	b) dynamic error compensation
+	When an excessive amount of wakeups occurs during idle, an
+	additional idle ratio can be added to quiet interrupts, by
+	slowing down CPU activities.
+
+A debugfs file is provided for the user to examine compensation
+progress and results, such as on a Westmere system::
+
+  [jacob@nex01 ~]$ cat
+  /sys/kernel/debug/intel_powerclamp/powerclamp_calib
+  controlling cpu: 0
+  pct confidence steady dynamic (compensation)
+  0       0       0       0
+  1       1       0       0
+  2       1       1       0
+  3       3       1       0
+  4       3       1       0
+  5       3       1       0
+  6       3       1       0
+  7       3       1       0
+  8       3       1       0
+  ...
+  30      3       2       0
+  31      3       2       0
+  32      3       1       0
+  33      3       2       0
+  34      3       1       0
+  35      3       2       0
+  36      3       1       0
+  37      3       2       0
+  38      3       1       0
+  39      3       2       0
+  40      3       3       0
+  41      3       1       0
+  42      3       2       0
+  43      3       1       0
+  44      3       1       0
+  45      3       2       0
+  46      3       3       0
+  47      3       0       0
+  48      3       2       0
+  49      3       3       0
+
+Calibration occurs during runtime. No offline method is available.
+Steady state compensation is used only when confidence levels of all
+adjacent ratios have reached satisfactory level. A confidence level
+is accumulated based on clean data collected at runtime. Data
+collected during a period without extra interrupts is considered
+clean.
+
+To compensate for excessive amounts of wakeup during idle, additional
+idle time is injected when such a condition is detected. Currently,
+we have a simple algorithm to double the injection ratio. A possible
+enhancement might be to throttle the offending IRQ, such as delaying
+EOI for level triggered interrupts. But it is a challenge to be
+non-intrusive to the scheduler or the IRQ core code.
+
+
+CPU Online/Offline
+------------------
+Per-CPU kernel threads are started/stopped upon receiving
+notifications of CPU hotplug activities. The intel_powerclamp driver
+keeps track of clamping kernel threads, even after they are migrated
+to other CPUs, after a CPU offline event.
+
+
+Performance Analysis
+====================
+This section describes the general performance data collected on
+multiple systems, including Westmere (80P) and Ivy Bridge (4P, 8P).
+
+Effectiveness and Limitations
+-----------------------------
+The maximum range that idle injection is allowed is capped at 50
+percent. As mentioned earlier, since interrupts are allowed during
+forced idle time, excessive interrupts could result in less
+effectiveness. The extreme case would be doing a ping -f to generated
+flooded network interrupts without much CPU acknowledgement. In this
+case, little can be done from the idle injection threads. In most
+normal cases, such as scp a large file, applications can be throttled
+by the powerclamp driver, since slowing down the CPU also slows down
+network protocol processing, which in turn reduces interrupts.
+
+When control parameters change at runtime by the controlling CPU, it
+may take an additional period for the rest of the CPUs to catch up
+with the changes. During this time, idle injection is out of sync,
+thus not able to enter package C- states at the expected ratio. But
+this effect is minor, in that in most cases change to the target
+ratio is updated much less frequently than the idle injection
+frequency.
+
+Scalability
+-----------
+Tests also show a minor, but measurable, difference between the 4P/8P
+Ivy Bridge system and the 80P Westmere server under 50% idle ratio.
+More compensation is needed on Westmere for the same amount of
+target idle ratio. The compensation also increases as the idle ratio
+gets larger. The above reason constitutes the need for the
+calibration code.
+
+On the IVB 8P system, compared to an offline CPU, powerclamp can
+achieve up to 40% better performance per watt. (measured by a spin
+counter summed over per CPU counting threads spawned for all running
+CPUs).
+
+Usage and Interfaces
+====================
+The powerclamp driver is registered to the generic thermal layer as a
+cooling device. Currently, it’s not bound to any thermal zones::
+
+  jacob@chromoly:/sys/class/thermal/cooling_device14$ grep . *
+  cur_state:0
+  max_state:50
+  type:intel_powerclamp
+
+cur_state allows user to set the desired idle percentage. Writing 0 to
+cur_state will stop idle injection. Writing a value between 1 and
+max_state will start the idle injection. Reading cur_state returns the
+actual and current idle percentage. This may not be the same value
+set by the user in that current idle percentage depends on workload
+and includes natural idle. When idle injection is disabled, reading
+cur_state returns value -1 instead of 0 which is to avoid confusing
+100% busy state with the disabled state.
+
+Example usage:
+- To inject 25% idle time::
+
+	$ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state
+
+If the system is not busy and has more than 25% idle time already,
+then the powerclamp driver will not start idle injection. Using Top
+will not show idle injection kernel threads.
+
+If the system is busy (spin test below) and has less than 25% natural
+idle time, powerclamp kernel threads will do idle injection. Forced
+idle time is accounted as normal idle in that common code path is
+taken as the idle task.
+
+In this example, 24.1% idle is shown. This helps the system admin or
+user determine the cause of slowdown, when a powerclamp driver is in action::
+
+
+  Tasks: 197 total,   1 running, 196 sleeping,   0 stopped,   0 zombie
+  Cpu(s): 71.2%us,  4.7%sy,  0.0%ni, 24.1%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
+  Mem:   3943228k total,  1689632k used,  2253596k free,    74960k buffers
+  Swap:  4087804k total,        0k used,  4087804k free,   945336k cached
+
+    PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
+   3352 jacob     20   0  262m  644  428 S  286  0.0   0:17.16 spin
+   3341 root     -51   0     0    0    0 D   25  0.0   0:01.62 kidle_inject/0
+   3344 root     -51   0     0    0    0 D   25  0.0   0:01.60 kidle_inject/3
+   3342 root     -51   0     0    0    0 D   25  0.0   0:01.61 kidle_inject/1
+   3343 root     -51   0     0    0    0 D   25  0.0   0:01.60 kidle_inject/2
+   2935 jacob     20   0  696m 125m  35m S    5  3.3   0:31.11 firefox
+   1546 root      20   0  158m  20m 6640 S    3  0.5   0:26.97 Xorg
+   2100 jacob     20   0 1223m  88m  30m S    3  2.3   0:23.68 compiz
+
+Tests have shown that by using the powerclamp driver as a cooling
+device, a PID based userspace thermal controller can manage to
+control CPU temperature effectively, when no other thermal influence
+is added. For example, a UltraBook user can compile the kernel under
+certain temperature (below most active trip points).
diff --git a/Documentation/thermal/intel_powerclamp.txt b/Documentation/thermal/intel_powerclamp.txt
deleted file mode 100644
index b5df21168fbc..000000000000
--- a/Documentation/thermal/intel_powerclamp.txt
+++ /dev/null
@@ -1,317 +0,0 @@
-			 =======================
-			 INTEL POWERCLAMP DRIVER
-			 =======================
-By: Arjan van de Ven <arjan@linux.intel.com>
-    Jacob Pan <jacob.jun.pan@linux.intel.com>
-
-Contents:
-	(*) Introduction
-	    - Goals and Objectives
-
-	(*) Theory of Operation
-	    - Idle Injection
-	    - Calibration
-
-	(*) Performance Analysis
-	    - Effectiveness and Limitations
-	    - Power vs Performance
-	    - Scalability
-	    - Calibration
-	    - Comparison with Alternative Techniques
-
-	(*) Usage and Interfaces
-	    - Generic Thermal Layer (sysfs)
-	    - Kernel APIs (TBD)
-
-============
-INTRODUCTION
-============
-
-Consider the situation where a system’s power consumption must be
-reduced at runtime, due to power budget, thermal constraint, or noise
-level, and where active cooling is not preferred. Software managed
-passive power reduction must be performed to prevent the hardware
-actions that are designed for catastrophic scenarios.
-
-Currently, P-states, T-states (clock modulation), and CPU offlining
-are used for CPU throttling.
-
-On Intel CPUs, C-states provide effective power reduction, but so far
-they’re only used opportunistically, based on workload. With the
-development of intel_powerclamp driver, the method of synchronizing
-idle injection across all online CPU threads was introduced. The goal
-is to achieve forced and controllable C-state residency.
-
-Test/Analysis has been made in the areas of power, performance,
-scalability, and user experience. In many cases, clear advantage is
-shown over taking the CPU offline or modulating the CPU clock.
-
-
-===================
-THEORY OF OPERATION
-===================
-
-Idle Injection
---------------
-
-On modern Intel processors (Nehalem or later), package level C-state
-residency is available in MSRs, thus also available to the kernel.
-
-These MSRs are:
-      #define MSR_PKG_C2_RESIDENCY	0x60D
-      #define MSR_PKG_C3_RESIDENCY	0x3F8
-      #define MSR_PKG_C6_RESIDENCY	0x3F9
-      #define MSR_PKG_C7_RESIDENCY	0x3FA
-
-If the kernel can also inject idle time to the system, then a
-closed-loop control system can be established that manages package
-level C-state. The intel_powerclamp driver is conceived as such a
-control system, where the target set point is a user-selected idle
-ratio (based on power reduction), and the error is the difference
-between the actual package level C-state residency ratio and the target idle
-ratio.
-
-Injection is controlled by high priority kernel threads, spawned for
-each online CPU.
-
-These kernel threads, with SCHED_FIFO class, are created to perform
-clamping actions of controlled duty ratio and duration. Each per-CPU
-thread synchronizes its idle time and duration, based on the rounding
-of jiffies, so accumulated errors can be prevented to avoid a jittery
-effect. Threads are also bound to the CPU such that they cannot be
-migrated, unless the CPU is taken offline. In this case, threads
-belong to the offlined CPUs will be terminated immediately.
-
-Running as SCHED_FIFO and relatively high priority, also allows such
-scheme to work for both preemptable and non-preemptable kernels.
-Alignment of idle time around jiffies ensures scalability for HZ
-values. This effect can be better visualized using a Perf timechart.
-The following diagram shows the behavior of kernel thread
-kidle_inject/cpu. During idle injection, it runs monitor/mwait idle
-for a given "duration", then relinquishes the CPU to other tasks,
-until the next time interval.
-
-The NOHZ schedule tick is disabled during idle time, but interrupts
-are not masked. Tests show that the extra wakeups from scheduler tick
-have a dramatic impact on the effectiveness of the powerclamp driver
-on large scale systems (Westmere system with 80 processors).
-
-CPU0
-		  ____________          ____________
-kidle_inject/0   |   sleep    |  mwait |  sleep     |
-	_________|            |________|            |_______
-			       duration
-CPU1
-		  ____________          ____________
-kidle_inject/1   |   sleep    |  mwait |  sleep     |
-	_________|            |________|            |_______
-			      ^
-			      |
-			      |
-			      roundup(jiffies, interval)
-
-Only one CPU is allowed to collect statistics and update global
-control parameters. This CPU is referred to as the controlling CPU in
-this document. The controlling CPU is elected at runtime, with a
-policy that favors BSP, taking into account the possibility of a CPU
-hot-plug.
-
-In terms of dynamics of the idle control system, package level idle
-time is considered largely as a non-causal system where its behavior
-cannot be based on the past or current input. Therefore, the
-intel_powerclamp driver attempts to enforce the desired idle time
-instantly as given input (target idle ratio). After injection,
-powerclamp monitors the actual idle for a given time window and adjust
-the next injection accordingly to avoid over/under correction.
-
-When used in a causal control system, such as a temperature control,
-it is up to the user of this driver to implement algorithms where
-past samples and outputs are included in the feedback. For example, a
-PID-based thermal controller can use the powerclamp driver to
-maintain a desired target temperature, based on integral and
-derivative gains of the past samples.
-
-
-
-Calibration
------------
-During scalability testing, it is observed that synchronized actions
-among CPUs become challenging as the number of cores grows. This is
-also true for the ability of a system to enter package level C-states.
-
-To make sure the intel_powerclamp driver scales well, online
-calibration is implemented. The goals for doing such a calibration
-are:
-
-a) determine the effective range of idle injection ratio
-b) determine the amount of compensation needed at each target ratio
-
-Compensation to each target ratio consists of two parts:
-
-        a) steady state error compensation
-	This is to offset the error occurring when the system can
-	enter idle without extra wakeups (such as external interrupts).
-
-	b) dynamic error compensation
-	When an excessive amount of wakeups occurs during idle, an
-	additional idle ratio can be added to quiet interrupts, by
-	slowing down CPU activities.
-
-A debugfs file is provided for the user to examine compensation
-progress and results, such as on a Westmere system.
-[jacob@nex01 ~]$ cat
-/sys/kernel/debug/intel_powerclamp/powerclamp_calib
-controlling cpu: 0
-pct confidence steady dynamic (compensation)
-0	0	0	0
-1	1	0	0
-2	1	1	0
-3	3	1	0
-4	3	1	0
-5	3	1	0
-6	3	1	0
-7	3	1	0
-8	3	1	0
-...
-30	3	2	0
-31	3	2	0
-32	3	1	0
-33	3	2	0
-34	3	1	0
-35	3	2	0
-36	3	1	0
-37	3	2	0
-38	3	1	0
-39	3	2	0
-40	3	3	0
-41	3	1	0
-42	3	2	0
-43	3	1	0
-44	3	1	0
-45	3	2	0
-46	3	3	0
-47	3	0	0
-48	3	2	0
-49	3	3	0
-
-Calibration occurs during runtime. No offline method is available.
-Steady state compensation is used only when confidence levels of all
-adjacent ratios have reached satisfactory level. A confidence level
-is accumulated based on clean data collected at runtime. Data
-collected during a period without extra interrupts is considered
-clean.
-
-To compensate for excessive amounts of wakeup during idle, additional
-idle time is injected when such a condition is detected. Currently,
-we have a simple algorithm to double the injection ratio. A possible
-enhancement might be to throttle the offending IRQ, such as delaying
-EOI for level triggered interrupts. But it is a challenge to be
-non-intrusive to the scheduler or the IRQ core code.
-
-
-CPU Online/Offline
-------------------
-Per-CPU kernel threads are started/stopped upon receiving
-notifications of CPU hotplug activities. The intel_powerclamp driver
-keeps track of clamping kernel threads, even after they are migrated
-to other CPUs, after a CPU offline event.
-
-
-=====================
-Performance Analysis
-=====================
-This section describes the general performance data collected on
-multiple systems, including Westmere (80P) and Ivy Bridge (4P, 8P).
-
-Effectiveness and Limitations
------------------------------
-The maximum range that idle injection is allowed is capped at 50
-percent. As mentioned earlier, since interrupts are allowed during
-forced idle time, excessive interrupts could result in less
-effectiveness. The extreme case would be doing a ping -f to generated
-flooded network interrupts without much CPU acknowledgement. In this
-case, little can be done from the idle injection threads. In most
-normal cases, such as scp a large file, applications can be throttled
-by the powerclamp driver, since slowing down the CPU also slows down
-network protocol processing, which in turn reduces interrupts.
-
-When control parameters change at runtime by the controlling CPU, it
-may take an additional period for the rest of the CPUs to catch up
-with the changes. During this time, idle injection is out of sync,
-thus not able to enter package C- states at the expected ratio. But
-this effect is minor, in that in most cases change to the target
-ratio is updated much less frequently than the idle injection
-frequency.
-
-Scalability
------------
-Tests also show a minor, but measurable, difference between the 4P/8P
-Ivy Bridge system and the 80P Westmere server under 50% idle ratio.
-More compensation is needed on Westmere for the same amount of
-target idle ratio. The compensation also increases as the idle ratio
-gets larger. The above reason constitutes the need for the
-calibration code.
-
-On the IVB 8P system, compared to an offline CPU, powerclamp can
-achieve up to 40% better performance per watt. (measured by a spin
-counter summed over per CPU counting threads spawned for all running
-CPUs).
-
-====================
-Usage and Interfaces
-====================
-The powerclamp driver is registered to the generic thermal layer as a
-cooling device. Currently, it’s not bound to any thermal zones.
-
-jacob@chromoly:/sys/class/thermal/cooling_device14$ grep . *
-cur_state:0
-max_state:50
-type:intel_powerclamp
-
-cur_state allows user to set the desired idle percentage. Writing 0 to
-cur_state will stop idle injection. Writing a value between 1 and
-max_state will start the idle injection. Reading cur_state returns the
-actual and current idle percentage. This may not be the same value
-set by the user in that current idle percentage depends on workload
-and includes natural idle. When idle injection is disabled, reading
-cur_state returns value -1 instead of 0 which is to avoid confusing
-100% busy state with the disabled state.
-
-Example usage:
-- To inject 25% idle time
-$ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state
-"
-
-If the system is not busy and has more than 25% idle time already,
-then the powerclamp driver will not start idle injection. Using Top
-will not show idle injection kernel threads.
-
-If the system is busy (spin test below) and has less than 25% natural
-idle time, powerclamp kernel threads will do idle injection. Forced
-idle time is accounted as normal idle in that common code path is
-taken as the idle task.
-
-In this example, 24.1% idle is shown. This helps the system admin or
-user determine the cause of slowdown, when a powerclamp driver is in action.
-
-
-Tasks: 197 total,   1 running, 196 sleeping,   0 stopped,   0 zombie
-Cpu(s): 71.2%us,  4.7%sy,  0.0%ni, 24.1%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
-Mem:   3943228k total,  1689632k used,  2253596k free,    74960k buffers
-Swap:  4087804k total,        0k used,  4087804k free,   945336k cached
-
-  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
- 3352 jacob     20   0  262m  644  428 S  286  0.0   0:17.16 spin
- 3341 root     -51   0     0    0    0 D   25  0.0   0:01.62 kidle_inject/0
- 3344 root     -51   0     0    0    0 D   25  0.0   0:01.60 kidle_inject/3
- 3342 root     -51   0     0    0    0 D   25  0.0   0:01.61 kidle_inject/1
- 3343 root     -51   0     0    0    0 D   25  0.0   0:01.60 kidle_inject/2
- 2935 jacob     20   0  696m 125m  35m S    5  3.3   0:31.11 firefox
- 1546 root      20   0  158m  20m 6640 S    3  0.5   0:26.97 Xorg
- 2100 jacob     20   0 1223m  88m  30m S    3  2.3   0:23.68 compiz
-
-Tests have shown that by using the powerclamp driver as a cooling
-device, a PID based userspace thermal controller can manage to
-control CPU temperature effectively, when no other thermal influence
-is added. For example, a UltraBook user can compile the kernel under
-certain temperature (below most active trip points).
diff --git a/Documentation/thermal/nouveau_thermal b/Documentation/thermal/nouveau_thermal
deleted file mode 100644
index 6e17a11efcb0..000000000000
--- a/Documentation/thermal/nouveau_thermal
+++ /dev/null
@@ -1,82 +0,0 @@
-Kernel driver nouveau
-===================
-
-Supported chips:
-* NV43+
-
-Authors: Martin Peres (mupuf) <martin.peres@free.fr>
-
-Description
----------
-
-This driver allows to read the GPU core temperature, drive the GPU fan and
-set temperature alarms.
-
-Currently, due to the absence of in-kernel API to access HWMON drivers, Nouveau
-cannot access any of the i2c external monitoring chips it may find. If you
-have one of those, temperature and/or fan management through Nouveau's HWMON
-interface is likely not to work. This document may then not cover your situation
-entirely.
-
-Temperature management
---------------------
-
-Temperature is exposed under as a read-only HWMON attribute temp1_input.
-
-In order to protect the GPU from overheating, Nouveau supports 4 configurable
-temperature thresholds:
-
- * Fan_boost: Fan speed is set to 100% when reaching this temperature;
- * Downclock: The GPU will be downclocked to reduce its power dissipation;
- * Critical: The GPU is put on hold to further lower power dissipation;
- * Shutdown: Shut the computer down to protect your GPU.
-
-WARNING: Some of these thresholds may not be used by Nouveau depending
-on your chipset.
-
-The default value for these thresholds comes from the GPU's vbios. These
-thresholds can be configured thanks to the following HWMON attributes:
-
- * Fan_boost: temp1_auto_point1_temp and temp1_auto_point1_temp_hyst;
- * Downclock: temp1_max and temp1_max_hyst;
- * Critical: temp1_crit and temp1_crit_hyst;
- * Shutdown: temp1_emergency and temp1_emergency_hyst.
-
-NOTE: Remember that the values are stored as milli degrees Celsius. Don't forget
-to multiply!
-
-Fan management
-------------
-
-Not all cards have a drivable fan. If you do, then the following HWMON
-attributes should be available:
-
- * pwm1_enable: Current fan management mode (NONE, MANUAL or AUTO);
- * pwm1: Current PWM value (power percentage);
- * pwm1_min: The minimum PWM speed allowed;
- * pwm1_max: The maximum PWM speed allowed (bypassed when hitting Fan_boost);
-
-You may also have the following attribute:
-
- * fan1_input: Speed in RPM of your fan.
-
-Your fan can be driven in different modes:
-
- * 0: The fan is left untouched;
- * 1: The fan can be driven in manual (use pwm1 to change the speed);
- * 2; The fan is driven automatically depending on the temperature.
-
-NOTE: Be sure to use the manual mode if you want to drive the fan speed manually
-
-NOTE2: When operating in manual mode outside the vbios-defined
-[PWM_min, PWM_max] range, the reported fan speed (RPM) may not be accurate
-depending on your hardware.
-
-Bug reports
----------
-
-Thermal management on Nouveau is new and may not work on all cards. If you have
-inquiries, please ping mupuf on IRC (#nouveau, freenode).
-
-Bug reports should be filled on Freedesktop's bug tracker. Please follow
-http://nouveau.freedesktop.org/wiki/Bugs
diff --git a/Documentation/thermal/nouveau_thermal.rst b/Documentation/thermal/nouveau_thermal.rst
new file mode 100644
index 000000000000..37255fd6735d
--- /dev/null
+++ b/Documentation/thermal/nouveau_thermal.rst
@@ -0,0 +1,96 @@
+=====================
+Kernel driver nouveau
+=====================
+
+Supported chips:
+
+* NV43+
+
+Authors: Martin Peres (mupuf) <martin.peres@free.fr>
+
+Description
+-----------
+
+This driver allows to read the GPU core temperature, drive the GPU fan and
+set temperature alarms.
+
+Currently, due to the absence of in-kernel API to access HWMON drivers, Nouveau
+cannot access any of the i2c external monitoring chips it may find. If you
+have one of those, temperature and/or fan management through Nouveau's HWMON
+interface is likely not to work. This document may then not cover your situation
+entirely.
+
+Temperature management
+----------------------
+
+Temperature is exposed under as a read-only HWMON attribute temp1_input.
+
+In order to protect the GPU from overheating, Nouveau supports 4 configurable
+temperature thresholds:
+
+ * Fan_boost:
+	Fan speed is set to 100% when reaching this temperature;
+ * Downclock:
+	The GPU will be downclocked to reduce its power dissipation;
+ * Critical:
+	The GPU is put on hold to further lower power dissipation;
+ * Shutdown:
+	Shut the computer down to protect your GPU.
+
+WARNING:
+	Some of these thresholds may not be used by Nouveau depending
+	on your chipset.
+
+The default value for these thresholds comes from the GPU's vbios. These
+thresholds can be configured thanks to the following HWMON attributes:
+
+ * Fan_boost: temp1_auto_point1_temp and temp1_auto_point1_temp_hyst;
+ * Downclock: temp1_max and temp1_max_hyst;
+ * Critical: temp1_crit and temp1_crit_hyst;
+ * Shutdown: temp1_emergency and temp1_emergency_hyst.
+
+NOTE: Remember that the values are stored as milli degrees Celsius. Don't forget
+to multiply!
+
+Fan management
+--------------
+
+Not all cards have a drivable fan. If you do, then the following HWMON
+attributes should be available:
+
+ * pwm1_enable:
+	Current fan management mode (NONE, MANUAL or AUTO);
+ * pwm1:
+	Current PWM value (power percentage);
+ * pwm1_min:
+	The minimum PWM speed allowed;
+ * pwm1_max:
+	The maximum PWM speed allowed (bypassed when hitting Fan_boost);
+
+You may also have the following attribute:
+
+ * fan1_input:
+	Speed in RPM of your fan.
+
+Your fan can be driven in different modes:
+
+ * 0: The fan is left untouched;
+ * 1: The fan can be driven in manual (use pwm1 to change the speed);
+ * 2; The fan is driven automatically depending on the temperature.
+
+NOTE:
+  Be sure to use the manual mode if you want to drive the fan speed manually
+
+NOTE2:
+  When operating in manual mode outside the vbios-defined
+  [PWM_min, PWM_max] range, the reported fan speed (RPM) may not be accurate
+  depending on your hardware.
+
+Bug reports
+-----------
+
+Thermal management on Nouveau is new and may not work on all cards. If you have
+inquiries, please ping mupuf on IRC (#nouveau, freenode).
+
+Bug reports should be filled on Freedesktop's bug tracker. Please follow
+http://nouveau.freedesktop.org/wiki/Bugs
diff --git a/Documentation/thermal/power_allocator.rst b/Documentation/thermal/power_allocator.rst
new file mode 100644
index 000000000000..67b6a3297238
--- /dev/null
+++ b/Documentation/thermal/power_allocator.rst
@@ -0,0 +1,271 @@
+=================================
+Power allocator governor tunables
+=================================
+
+Trip points
+-----------
+
+The governor works optimally with the following two passive trip points:
+
+1.  "switch on" trip point: temperature above which the governor
+    control loop starts operating.  This is the first passive trip
+    point of the thermal zone.
+
+2.  "desired temperature" trip point: it should be higher than the
+    "switch on" trip point.  This the target temperature the governor
+    is controlling for.  This is the last passive trip point of the
+    thermal zone.
+
+PID Controller
+--------------
+
+The power allocator governor implements a
+Proportional-Integral-Derivative controller (PID controller) with
+temperature as the control input and power as the controlled output:
+
+    P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power
+
+where
+   -  e = desired_temperature - current_temperature
+   -  err_integral is the sum of previous errors
+   -  diff_err = e - previous_error
+
+It is similar to the one depicted below::
+
+				      k_d
+				       |
+  current_temp                         |
+       |                               v
+       |              +----------+   +---+
+       |       +----->| diff_err |-->| X |------+
+       |       |      +----------+   +---+      |
+       |       |                                |      tdp        actor
+       |       |                      k_i       |       |  get_requested_power()
+       |       |                       |        |       |        |     |
+       |       |                       |        |       |        |     | ...
+       v       |                       v        v       v        v     v
+     +---+     |      +-------+      +---+    +---+   +---+   +----------+
+     | S |-----+----->| sum e |----->| X |--->| S |-->| S |-->|power     |
+     +---+     |      +-------+      +---+    +---+   +---+   |allocation|
+       ^       |                                ^             +----------+
+       |       |                                |                |     |
+       |       |        +---+                   |                |     |
+       |       +------->| X |-------------------+                v     v
+       |                +---+                               granted performance
+  desired_temperature     ^
+			  |
+			  |
+		      k_po/k_pu
+
+Sustainable power
+-----------------
+
+An estimate of the sustainable dissipatable power (in mW) should be
+provided while registering the thermal zone.  This estimates the
+sustained power that can be dissipated at the desired control
+temperature.  This is the maximum sustained power for allocation at
+the desired maximum temperature.  The actual sustained power can vary
+for a number of reasons.  The closed loop controller will take care of
+variations such as environmental conditions, and some factors related
+to the speed-grade of the silicon.  `sustainable_power` is therefore
+simply an estimate, and may be tuned to affect the aggressiveness of
+the thermal ramp. For reference, the sustainable power of a 4" phone
+is typically 2000mW, while on a 10" tablet is around 4500mW (may vary
+depending on screen size).
+
+If you are using device tree, do add it as a property of the
+thermal-zone.  For example::
+
+	thermal-zones {
+		soc_thermal {
+			polling-delay = <1000>;
+			polling-delay-passive = <100>;
+			sustainable-power = <2500>;
+			...
+
+Instead, if the thermal zone is registered from the platform code, pass a
+`thermal_zone_params` that has a `sustainable_power`.  If no
+`thermal_zone_params` were being passed, then something like below
+will suffice::
+
+	static const struct thermal_zone_params tz_params = {
+		.sustainable_power = 3500,
+	};
+
+and then pass `tz_params` as the 5th parameter to
+`thermal_zone_device_register()`
+
+k_po and k_pu
+-------------
+
+The implementation of the PID controller in the power allocator
+thermal governor allows the configuration of two proportional term
+constants: `k_po` and `k_pu`.  `k_po` is the proportional term
+constant during temperature overshoot periods (current temperature is
+above "desired temperature" trip point).  Conversely, `k_pu` is the
+proportional term constant during temperature undershoot periods
+(current temperature below "desired temperature" trip point).
+
+These controls are intended as the primary mechanism for configuring
+the permitted thermal "ramp" of the system.  For instance, a lower
+`k_pu` value will provide a slower ramp, at the cost of capping
+available capacity at a low temperature.  On the other hand, a high
+value of `k_pu` will result in the governor granting very high power
+while temperature is low, and may lead to temperature overshooting.
+
+The default value for `k_pu` is::
+
+    2 * sustainable_power / (desired_temperature - switch_on_temp)
+
+This means that at `switch_on_temp` the output of the controller's
+proportional term will be 2 * `sustainable_power`.  The default value
+for `k_po` is::
+
+    sustainable_power / (desired_temperature - switch_on_temp)
+
+Focusing on the proportional and feed forward values of the PID
+controller equation we have::
+
+    P_max = k_p * e + sustainable_power
+
+The proportional term is proportional to the difference between the
+desired temperature and the current one.  When the current temperature
+is the desired one, then the proportional component is zero and
+`P_max` = `sustainable_power`.  That is, the system should operate in
+thermal equilibrium under constant load.  `sustainable_power` is only
+an estimate, which is the reason for closed-loop control such as this.
+
+Expanding `k_pu` we get::
+
+    P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) +
+	sustainable_power
+
+where:
+
+    - T_set is the desired temperature
+    - T is the current temperature
+    - T_on is the switch on temperature
+
+When the current temperature is the switch_on temperature, the above
+formula becomes::
+
+    P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) +
+	sustainable_power = 2 * sustainable_power + sustainable_power =
+	3 * sustainable_power
+
+Therefore, the proportional term alone linearly decreases power from
+3 * `sustainable_power` to `sustainable_power` as the temperature
+rises from the switch on temperature to the desired temperature.
+
+k_i and integral_cutoff
+-----------------------
+
+`k_i` configures the PID loop's integral term constant.  This term
+allows the PID controller to compensate for long term drift and for
+the quantized nature of the output control: cooling devices can't set
+the exact power that the governor requests.  When the temperature
+error is below `integral_cutoff`, errors are accumulated in the
+integral term.  This term is then multiplied by `k_i` and the result
+added to the output of the controller.  Typically `k_i` is set low (1
+or 2) and `integral_cutoff` is 0.
+
+k_d
+---
+
+`k_d` configures the PID loop's derivative term constant.  It's
+recommended to leave it as the default: 0.
+
+Cooling device power API
+========================
+
+Cooling devices controlled by this governor must supply the additional
+"power" API in their `cooling_device_ops`.  It consists on three ops:
+
+1. ::
+
+    int get_requested_power(struct thermal_cooling_device *cdev,
+			    struct thermal_zone_device *tz, u32 *power);
+
+
+@cdev:
+	The `struct thermal_cooling_device` pointer
+@tz:
+	thermal zone in which we are currently operating
+@power:
+	pointer in which to store the calculated power
+
+`get_requested_power()` calculates the power requested by the device
+in milliwatts and stores it in @power .  It should return 0 on
+success, -E* on failure.  This is currently used by the power
+allocator governor to calculate how much power to give to each cooling
+device.
+
+2. ::
+
+	int state2power(struct thermal_cooling_device *cdev, struct
+			thermal_zone_device *tz, unsigned long state,
+			u32 *power);
+
+@cdev:
+	The `struct thermal_cooling_device` pointer
+@tz:
+	thermal zone in which we are currently operating
+@state:
+	A cooling device state
+@power:
+	pointer in which to store the equivalent power
+
+Convert cooling device state @state into power consumption in
+milliwatts and store it in @power.  It should return 0 on success, -E*
+on failure.  This is currently used by thermal core to calculate the
+maximum power that an actor can consume.
+
+3. ::
+
+	int power2state(struct thermal_cooling_device *cdev, u32 power,
+			unsigned long *state);
+
+@cdev:
+	The `struct thermal_cooling_device` pointer
+@power:
+	power in milliwatts
+@state:
+	pointer in which to store the resulting state
+
+Calculate a cooling device state that would make the device consume at
+most @power mW and store it in @state.  It should return 0 on success,
+-E* on failure.  This is currently used by the thermal core to convert
+a given power set by the power allocator governor to a state that the
+cooling device can set.  It is a function because this conversion may
+depend on external factors that may change so this function should the
+best conversion given "current circumstances".
+
+Cooling device weights
+----------------------
+
+Weights are a mechanism to bias the allocation among cooling
+devices.  They express the relative power efficiency of different
+cooling devices.  Higher weight can be used to express higher power
+efficiency.  Weighting is relative such that if each cooling device
+has a weight of one they are considered equal.  This is particularly
+useful in heterogeneous systems where two cooling devices may perform
+the same kind of compute, but with different efficiency.  For example,
+a system with two different types of processors.
+
+If the thermal zone is registered using
+`thermal_zone_device_register()` (i.e., platform code), then weights
+are passed as part of the thermal zone's `thermal_bind_parameters`.
+If the platform is registered using device tree, then they are passed
+as the `contribution` property of each map in the `cooling-maps` node.
+
+Limitations of the power allocator governor
+===========================================
+
+The power allocator governor's PID controller works best if there is a
+periodic tick.  If you have a driver that calls
+`thermal_zone_device_update()` (or anything that ends up calling the
+governor's `throttle()` function) repetitively, the governor response
+won't be very good.  Note that this is not particular to this
+governor, step-wise will also misbehave if you call its throttle()
+faster than the normal thermal framework tick (due to interrupts for
+example) as it will overreact.
diff --git a/Documentation/thermal/power_allocator.txt b/Documentation/thermal/power_allocator.txt
deleted file mode 100644
index 9fb0ff06dca9..000000000000
--- a/Documentation/thermal/power_allocator.txt
+++ /dev/null
@@ -1,247 +0,0 @@
-Power allocator governor tunables
-=================================
-
-Trip points
------------
-
-The governor works optimally with the following two passive trip points:
-
-1.  "switch on" trip point: temperature above which the governor
-    control loop starts operating.  This is the first passive trip
-    point of the thermal zone.
-
-2.  "desired temperature" trip point: it should be higher than the
-    "switch on" trip point.  This the target temperature the governor
-    is controlling for.  This is the last passive trip point of the
-    thermal zone.
-
-PID Controller
---------------
-
-The power allocator governor implements a
-Proportional-Integral-Derivative controller (PID controller) with
-temperature as the control input and power as the controlled output:
-
-    P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power
-
-where
-    e = desired_temperature - current_temperature
-    err_integral is the sum of previous errors
-    diff_err = e - previous_error
-
-It is similar to the one depicted below:
-
-                                      k_d
-                                       |
-current_temp                           |
-     |                                 v
-     |                +----------+   +---+
-     |         +----->| diff_err |-->| X |------+
-     |         |      +----------+   +---+      |
-     |         |                                |      tdp        actor
-     |         |                      k_i       |       |  get_requested_power()
-     |         |                       |        |       |        |     |
-     |         |                       |        |       |        |     | ...
-     v         |                       v        v       v        v     v
-   +---+       |      +-------+      +---+    +---+   +---+   +----------+
-   | S |-------+----->| sum e |----->| X |--->| S |-->| S |-->|power     |
-   +---+       |      +-------+      +---+    +---+   +---+   |allocation|
-     ^         |                                ^             +----------+
-     |         |                                |                |     |
-     |         |        +---+                   |                |     |
-     |         +------->| X |-------------------+                v     v
-     |                  +---+                               granted performance
-desired_temperature       ^
-                          |
-                          |
-                      k_po/k_pu
-
-Sustainable power
------------------
-
-An estimate of the sustainable dissipatable power (in mW) should be
-provided while registering the thermal zone.  This estimates the
-sustained power that can be dissipated at the desired control
-temperature.  This is the maximum sustained power for allocation at
-the desired maximum temperature.  The actual sustained power can vary
-for a number of reasons.  The closed loop controller will take care of
-variations such as environmental conditions, and some factors related
-to the speed-grade of the silicon.  `sustainable_power` is therefore
-simply an estimate, and may be tuned to affect the aggressiveness of
-the thermal ramp. For reference, the sustainable power of a 4" phone
-is typically 2000mW, while on a 10" tablet is around 4500mW (may vary
-depending on screen size).
-
-If you are using device tree, do add it as a property of the
-thermal-zone.  For example:
-
-	thermal-zones {
-		soc_thermal {
-			polling-delay = <1000>;
-			polling-delay-passive = <100>;
-			sustainable-power = <2500>;
-			...
-
-Instead, if the thermal zone is registered from the platform code, pass a
-`thermal_zone_params` that has a `sustainable_power`.  If no
-`thermal_zone_params` were being passed, then something like below
-will suffice:
-
-	static const struct thermal_zone_params tz_params = {
-		.sustainable_power = 3500,
-	};
-
-and then pass `tz_params` as the 5th parameter to
-`thermal_zone_device_register()`
-
-k_po and k_pu
--------------
-
-The implementation of the PID controller in the power allocator
-thermal governor allows the configuration of two proportional term
-constants: `k_po` and `k_pu`.  `k_po` is the proportional term
-constant during temperature overshoot periods (current temperature is
-above "desired temperature" trip point).  Conversely, `k_pu` is the
-proportional term constant during temperature undershoot periods
-(current temperature below "desired temperature" trip point).
-
-These controls are intended as the primary mechanism for configuring
-the permitted thermal "ramp" of the system.  For instance, a lower
-`k_pu` value will provide a slower ramp, at the cost of capping
-available capacity at a low temperature.  On the other hand, a high
-value of `k_pu` will result in the governor granting very high power
-while temperature is low, and may lead to temperature overshooting.
-
-The default value for `k_pu` is:
-
-    2 * sustainable_power / (desired_temperature - switch_on_temp)
-
-This means that at `switch_on_temp` the output of the controller's
-proportional term will be 2 * `sustainable_power`.  The default value
-for `k_po` is:
-
-    sustainable_power / (desired_temperature - switch_on_temp)
-
-Focusing on the proportional and feed forward values of the PID
-controller equation we have:
-
-    P_max = k_p * e + sustainable_power
-
-The proportional term is proportional to the difference between the
-desired temperature and the current one.  When the current temperature
-is the desired one, then the proportional component is zero and
-`P_max` = `sustainable_power`.  That is, the system should operate in
-thermal equilibrium under constant load.  `sustainable_power` is only
-an estimate, which is the reason for closed-loop control such as this.
-
-Expanding `k_pu` we get:
-    P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) +
-        sustainable_power
-
-where
-    T_set is the desired temperature
-    T is the current temperature
-    T_on is the switch on temperature
-
-When the current temperature is the switch_on temperature, the above
-formula becomes:
-
-    P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) +
-        sustainable_power = 2 * sustainable_power + sustainable_power =
-        3 * sustainable_power
-
-Therefore, the proportional term alone linearly decreases power from
-3 * `sustainable_power` to `sustainable_power` as the temperature
-rises from the switch on temperature to the desired temperature.
-
-k_i and integral_cutoff
------------------------
-
-`k_i` configures the PID loop's integral term constant.  This term
-allows the PID controller to compensate for long term drift and for
-the quantized nature of the output control: cooling devices can't set
-the exact power that the governor requests.  When the temperature
-error is below `integral_cutoff`, errors are accumulated in the
-integral term.  This term is then multiplied by `k_i` and the result
-added to the output of the controller.  Typically `k_i` is set low (1
-or 2) and `integral_cutoff` is 0.
-
-k_d
----
-
-`k_d` configures the PID loop's derivative term constant.  It's
-recommended to leave it as the default: 0.
-
-Cooling device power API
-========================
-
-Cooling devices controlled by this governor must supply the additional
-"power" API in their `cooling_device_ops`.  It consists on three ops:
-
-1. int get_requested_power(struct thermal_cooling_device *cdev,
-	struct thermal_zone_device *tz, u32 *power);
-@cdev: The `struct thermal_cooling_device` pointer
-@tz: thermal zone in which we are currently operating
-@power: pointer in which to store the calculated power
-
-`get_requested_power()` calculates the power requested by the device
-in milliwatts and stores it in @power .  It should return 0 on
-success, -E* on failure.  This is currently used by the power
-allocator governor to calculate how much power to give to each cooling
-device.
-
-2. int state2power(struct thermal_cooling_device *cdev, struct
-        thermal_zone_device *tz, unsigned long state, u32 *power);
-@cdev: The `struct thermal_cooling_device` pointer
-@tz: thermal zone in which we are currently operating
-@state: A cooling device state
-@power: pointer in which to store the equivalent power
-
-Convert cooling device state @state into power consumption in
-milliwatts and store it in @power.  It should return 0 on success, -E*
-on failure.  This is currently used by thermal core to calculate the
-maximum power that an actor can consume.
-
-3. int power2state(struct thermal_cooling_device *cdev, u32 power,
-	unsigned long *state);
-@cdev: The `struct thermal_cooling_device` pointer
-@power: power in milliwatts
-@state: pointer in which to store the resulting state
-
-Calculate a cooling device state that would make the device consume at
-most @power mW and store it in @state.  It should return 0 on success,
--E* on failure.  This is currently used by the thermal core to convert
-a given power set by the power allocator governor to a state that the
-cooling device can set.  It is a function because this conversion may
-depend on external factors that may change so this function should the
-best conversion given "current circumstances".
-
-Cooling device weights
-----------------------
-
-Weights are a mechanism to bias the allocation among cooling
-devices.  They express the relative power efficiency of different
-cooling devices.  Higher weight can be used to express higher power
-efficiency.  Weighting is relative such that if each cooling device
-has a weight of one they are considered equal.  This is particularly
-useful in heterogeneous systems where two cooling devices may perform
-the same kind of compute, but with different efficiency.  For example,
-a system with two different types of processors.
-
-If the thermal zone is registered using
-`thermal_zone_device_register()` (i.e., platform code), then weights
-are passed as part of the thermal zone's `thermal_bind_parameters`.
-If the platform is registered using device tree, then they are passed
-as the `contribution` property of each map in the `cooling-maps` node.
-
-Limitations of the power allocator governor
-===========================================
-
-The power allocator governor's PID controller works best if there is a
-periodic tick.  If you have a driver that calls
-`thermal_zone_device_update()` (or anything that ends up calling the
-governor's `throttle()` function) repetitively, the governor response
-won't be very good.  Note that this is not particular to this
-governor, step-wise will also misbehave if you call its throttle()
-faster than the normal thermal framework tick (due to interrupts for
-example) as it will overreact.
diff --git a/Documentation/thermal/sysfs-api.rst b/Documentation/thermal/sysfs-api.rst
new file mode 100644
index 000000000000..e4930761d3e5
--- /dev/null
+++ b/Documentation/thermal/sysfs-api.rst
@@ -0,0 +1,798 @@
+===================================
+Generic Thermal Sysfs driver How To
+===================================
+
+Written by Sujith Thomas <sujith.thomas@intel.com>, Zhang Rui <rui.zhang@intel.com>
+
+Updated: 2 January 2008
+
+Copyright (c)  2008 Intel Corporation
+
+
+0. Introduction
+===============
+
+The generic thermal sysfs provides a set of interfaces for thermal zone
+devices (sensors) and thermal cooling devices (fan, processor...) to register
+with the thermal management solution and to be a part of it.
+
+This how-to focuses on enabling new thermal zone and cooling devices to
+participate in thermal management.
+This solution is platform independent and any type of thermal zone devices
+and cooling devices should be able to make use of the infrastructure.
+
+The main task of the thermal sysfs driver is to expose thermal zone attributes
+as well as cooling device attributes to the user space.
+An intelligent thermal management application can make decisions based on
+inputs from thermal zone attributes (the current temperature and trip point
+temperature) and throttle appropriate devices.
+
+- `[0-*]`	denotes any positive number starting from 0
+- `[1-*]`	denotes any positive number starting from 1
+
+1. thermal sysfs driver interface functions
+===========================================
+
+1.1 thermal zone device interface
+---------------------------------
+
+    ::
+
+	struct thermal_zone_device
+	*thermal_zone_device_register(char *type,
+				      int trips, int mask, void *devdata,
+				      struct thermal_zone_device_ops *ops,
+				      const struct thermal_zone_params *tzp,
+				      int passive_delay, int polling_delay))
+
+    This interface function adds a new thermal zone device (sensor) to
+    /sys/class/thermal folder as `thermal_zone[0-*]`. It tries to bind all the
+    thermal cooling devices registered at the same time.
+
+    type:
+	the thermal zone type.
+    trips:
+	the total number of trip points this thermal zone supports.
+    mask:
+	Bit string: If 'n'th bit is set, then trip point 'n' is writeable.
+    devdata:
+	device private data
+    ops:
+	thermal zone device call-backs.
+
+	.bind:
+		bind the thermal zone device with a thermal cooling device.
+	.unbind:
+		unbind the thermal zone device with a thermal cooling device.
+	.get_temp:
+		get the current temperature of the thermal zone.
+	.set_trips:
+		    set the trip points window. Whenever the current temperature
+		    is updated, the trip points immediately below and above the
+		    current temperature are found.
+	.get_mode:
+		   get the current mode (enabled/disabled) of the thermal zone.
+
+			- "enabled" means the kernel thermal management is
+			  enabled.
+			- "disabled" will prevent kernel thermal driver action
+			  upon trip points so that user applications can take
+			  charge of thermal management.
+	.set_mode:
+		set the mode (enabled/disabled) of the thermal zone.
+	.get_trip_type:
+		get the type of certain trip point.
+	.get_trip_temp:
+			get the temperature above which the certain trip point
+			will be fired.
+	.set_emul_temp:
+			set the emulation temperature which helps in debugging
+			different threshold temperature points.
+    tzp:
+	thermal zone platform parameters.
+    passive_delay:
+	number of milliseconds to wait between polls when
+	performing passive cooling.
+    polling_delay:
+	number of milliseconds to wait between polls when checking
+	whether trip points have been crossed (0 for interrupt driven systems).
+
+    ::
+
+	void thermal_zone_device_unregister(struct thermal_zone_device *tz)
+
+    This interface function removes the thermal zone device.
+    It deletes the corresponding entry from /sys/class/thermal folder and
+    unbinds all the thermal cooling devices it uses.
+
+	::
+
+	   struct thermal_zone_device
+	   *thermal_zone_of_sensor_register(struct device *dev, int sensor_id,
+				void *data,
+				const struct thermal_zone_of_device_ops *ops)
+
+	This interface adds a new sensor to a DT thermal zone.
+	This function will search the list of thermal zones described in
+	device tree and look for the zone that refer to the sensor device
+	pointed by dev->of_node as temperature providers. For the zone
+	pointing to the sensor node, the sensor will be added to the DT
+	thermal zone device.
+
+	The parameters for this interface are:
+
+	dev:
+			Device node of sensor containing valid node pointer in
+			dev->of_node.
+	sensor_id:
+			a sensor identifier, in case the sensor IP has more
+			than one sensors
+	data:
+			a private pointer (owned by the caller) that will be
+			passed back, when a temperature reading is needed.
+	ops:
+			`struct thermal_zone_of_device_ops *`.
+
+			==============  =======================================
+			get_temp	a pointer to a function that reads the
+					sensor temperature. This is mandatory
+					callback provided by sensor driver.
+			set_trips	a pointer to a function that sets a
+					temperature window. When this window is
+					left the driver must inform the thermal
+					core via thermal_zone_device_update.
+			get_trend 	a pointer to a function that reads the
+					sensor temperature trend.
+			set_emul_temp	a pointer to a function that sets
+					sensor emulated temperature.
+			==============  =======================================
+
+	The thermal zone temperature is provided by the get_temp() function
+	pointer of thermal_zone_of_device_ops. When called, it will
+	have the private pointer @data back.
+
+	It returns error pointer if fails otherwise valid thermal zone device
+	handle. Caller should check the return handle with IS_ERR() for finding
+	whether success or not.
+
+	::
+
+	    void thermal_zone_of_sensor_unregister(struct device *dev,
+						   struct thermal_zone_device *tzd)
+
+	This interface unregisters a sensor from a DT thermal zone which was
+	successfully added by interface thermal_zone_of_sensor_register().
+	This function removes the sensor callbacks and private data from the
+	thermal zone device registered with thermal_zone_of_sensor_register()
+	interface. It will also silent the zone by remove the .get_temp() and
+	get_trend() thermal zone device callbacks.
+
+	::
+
+	  struct thermal_zone_device
+	  *devm_thermal_zone_of_sensor_register(struct device *dev,
+				int sensor_id,
+				void *data,
+				const struct thermal_zone_of_device_ops *ops)
+
+	This interface is resource managed version of
+	thermal_zone_of_sensor_register().
+
+	All details of thermal_zone_of_sensor_register() described in
+	section 1.1.3 is applicable here.
+
+	The benefit of using this interface to register sensor is that it
+	is not require to explicitly call thermal_zone_of_sensor_unregister()
+	in error path or during driver unbinding as this is done by driver
+	resource manager.
+
+	::
+
+		void devm_thermal_zone_of_sensor_unregister(struct device *dev,
+						struct thermal_zone_device *tzd)
+
+	This interface is resource managed version of
+	thermal_zone_of_sensor_unregister().
+	All details of thermal_zone_of_sensor_unregister() described in
+	section 1.1.4 is applicable here.
+	Normally this function will not need to be called and the resource
+	management code will ensure that the resource is freed.
+
+	::
+
+		int thermal_zone_get_slope(struct thermal_zone_device *tz)
+
+	This interface is used to read the slope attribute value
+	for the thermal zone device, which might be useful for platform
+	drivers for temperature calculations.
+
+	::
+
+		int thermal_zone_get_offset(struct thermal_zone_device *tz)
+
+	This interface is used to read the offset attribute value
+	for the thermal zone device, which might be useful for platform
+	drivers for temperature calculations.
+
+1.2 thermal cooling device interface
+------------------------------------
+
+
+    ::
+
+	struct thermal_cooling_device
+	*thermal_cooling_device_register(char *name,
+			void *devdata, struct thermal_cooling_device_ops *)
+
+    This interface function adds a new thermal cooling device (fan/processor/...)
+    to /sys/class/thermal/ folder as `cooling_device[0-*]`. It tries to bind itself
+    to all the thermal zone devices registered at the same time.
+
+    name:
+	the cooling device name.
+    devdata:
+	device private data.
+    ops:
+	thermal cooling devices call-backs.
+
+	.get_max_state:
+		get the Maximum throttle state of the cooling device.
+	.get_cur_state:
+		get the Currently requested throttle state of the
+		cooling device.
+	.set_cur_state:
+		set the Current throttle state of the cooling device.
+
+    ::
+
+	void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
+
+    This interface function removes the thermal cooling device.
+    It deletes the corresponding entry from /sys/class/thermal folder and
+    unbinds itself from all the thermal zone devices using it.
+
+1.3 interface for binding a thermal zone device with a thermal cooling device
+-----------------------------------------------------------------------------
+
+    ::
+
+	int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
+		int trip, struct thermal_cooling_device *cdev,
+		unsigned long upper, unsigned long lower, unsigned int weight);
+
+    This interface function binds a thermal cooling device to a particular trip
+    point of a thermal zone device.
+
+    This function is usually called in the thermal zone device .bind callback.
+
+    tz:
+	  the thermal zone device
+    cdev:
+	  thermal cooling device
+    trip:
+	  indicates which trip point in this thermal zone the cooling device
+	  is associated with.
+    upper:
+	  the Maximum cooling state for this trip point.
+	  THERMAL_NO_LIMIT means no upper limit,
+	  and the cooling device can be in max_state.
+    lower:
+	  the Minimum cooling state can be used for this trip point.
+	  THERMAL_NO_LIMIT means no lower limit,
+	  and the cooling device can be in cooling state 0.
+    weight:
+	  the influence of this cooling device in this thermal
+	  zone.  See 1.4.1 below for more information.
+
+    ::
+
+	int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
+				int trip, struct thermal_cooling_device *cdev);
+
+    This interface function unbinds a thermal cooling device from a particular
+    trip point of a thermal zone device. This function is usually called in
+    the thermal zone device .unbind callback.
+
+    tz:
+	the thermal zone device
+    cdev:
+	thermal cooling device
+    trip:
+	indicates which trip point in this thermal zone the cooling device
+	is associated with.
+
+1.4 Thermal Zone Parameters
+---------------------------
+
+    ::
+
+	struct thermal_bind_params
+
+    This structure defines the following parameters that are used to bind
+    a zone with a cooling device for a particular trip point.
+
+    .cdev:
+	     The cooling device pointer
+    .weight:
+	     The 'influence' of a particular cooling device on this
+	     zone. This is relative to the rest of the cooling
+	     devices. For example, if all cooling devices have a
+	     weight of 1, then they all contribute the same. You can
+	     use percentages if you want, but it's not mandatory. A
+	     weight of 0 means that this cooling device doesn't
+	     contribute to the cooling of this zone unless all cooling
+	     devices have a weight of 0. If all weights are 0, then
+	     they all contribute the same.
+    .trip_mask:
+	       This is a bit mask that gives the binding relation between
+	       this thermal zone and cdev, for a particular trip point.
+	       If nth bit is set, then the cdev and thermal zone are bound
+	       for trip point n.
+    .binding_limits:
+		     This is an array of cooling state limits. Must have
+		     exactly 2 * thermal_zone.number_of_trip_points. It is an
+		     array consisting of tuples <lower-state upper-state> of
+		     state limits. Each trip will be associated with one state
+		     limit tuple when binding. A NULL pointer means
+		     <THERMAL_NO_LIMITS THERMAL_NO_LIMITS> on all trips.
+		     These limits are used when binding a cdev to a trip point.
+    .match:
+	    This call back returns success(0) if the 'tz and cdev' need to
+	    be bound, as per platform data.
+
+    ::
+
+	struct thermal_zone_params
+
+    This structure defines the platform level parameters for a thermal zone.
+    This data, for each thermal zone should come from the platform layer.
+    This is an optional feature where some platforms can choose not to
+    provide this data.
+
+    .governor_name:
+	       Name of the thermal governor used for this zone
+    .no_hwmon:
+	       a boolean to indicate if the thermal to hwmon sysfs interface
+	       is required. when no_hwmon == false, a hwmon sysfs interface
+	       will be created. when no_hwmon == true, nothing will be done.
+	       In case the thermal_zone_params is NULL, the hwmon interface
+	       will be created (for backward compatibility).
+    .num_tbps:
+	       Number of thermal_bind_params entries for this zone
+    .tbp:
+	       thermal_bind_params entries
+
+2. sysfs attributes structure
+=============================
+
+==	================
+RO	read only value
+WO	write only value
+RW	read/write value
+==	================
+
+Thermal sysfs attributes will be represented under /sys/class/thermal.
+Hwmon sysfs I/F extension is also available under /sys/class/hwmon
+if hwmon is compiled in or built as a module.
+
+Thermal zone device sys I/F, created once it's registered::
+
+  /sys/class/thermal/thermal_zone[0-*]:
+    |---type:			Type of the thermal zone
+    |---temp:			Current temperature
+    |---mode:			Working mode of the thermal zone
+    |---policy:			Thermal governor used for this zone
+    |---available_policies:	Available thermal governors for this zone
+    |---trip_point_[0-*]_temp:	Trip point temperature
+    |---trip_point_[0-*]_type:	Trip point type
+    |---trip_point_[0-*]_hyst:	Hysteresis value for this trip point
+    |---emul_temp:		Emulated temperature set node
+    |---sustainable_power:      Sustainable dissipatable power
+    |---k_po:                   Proportional term during temperature overshoot
+    |---k_pu:                   Proportional term during temperature undershoot
+    |---k_i:                    PID's integral term in the power allocator gov
+    |---k_d:                    PID's derivative term in the power allocator
+    |---integral_cutoff:        Offset above which errors are accumulated
+    |---slope:                  Slope constant applied as linear extrapolation
+    |---offset:                 Offset constant applied as linear extrapolation
+
+Thermal cooling device sys I/F, created once it's registered::
+
+  /sys/class/thermal/cooling_device[0-*]:
+    |---type:			Type of the cooling device(processor/fan/...)
+    |---max_state:		Maximum cooling state of the cooling device
+    |---cur_state:		Current cooling state of the cooling device
+    |---stats:			Directory containing cooling device's statistics
+    |---stats/reset:		Writing any value resets the statistics
+    |---stats/time_in_state_ms:	Time (msec) spent in various cooling states
+    |---stats/total_trans:	Total number of times cooling state is changed
+    |---stats/trans_table:	Cooing state transition table
+
+
+Then next two dynamic attributes are created/removed in pairs. They represent
+the relationship between a thermal zone and its associated cooling device.
+They are created/removed for each successful execution of
+thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device.
+
+::
+
+  /sys/class/thermal/thermal_zone[0-*]:
+    |---cdev[0-*]:		[0-*]th cooling device in current thermal zone
+    |---cdev[0-*]_trip_point:	Trip point that cdev[0-*] is associated with
+    |---cdev[0-*]_weight:       Influence of the cooling device in
+				this thermal zone
+
+Besides the thermal zone device sysfs I/F and cooling device sysfs I/F,
+the generic thermal driver also creates a hwmon sysfs I/F for each _type_
+of thermal zone device. E.g. the generic thermal driver registers one hwmon
+class device and build the associated hwmon sysfs I/F for all the registered
+ACPI thermal zones.
+
+::
+
+  /sys/class/hwmon/hwmon[0-*]:
+    |---name:			The type of the thermal zone devices
+    |---temp[1-*]_input:	The current temperature of thermal zone [1-*]
+    |---temp[1-*]_critical:	The critical trip point of thermal zone [1-*]
+
+Please read Documentation/hwmon/sysfs-interface.rst for additional information.
+
+Thermal zone attributes
+-----------------------
+
+type
+	Strings which represent the thermal zone type.
+	This is given by thermal zone driver as part of registration.
+	E.g: "acpitz" indicates it's an ACPI thermal device.
+	In order to keep it consistent with hwmon sys attribute; this should
+	be a short, lowercase string, not containing spaces nor dashes.
+	RO, Required
+
+temp
+	Current temperature as reported by thermal zone (sensor).
+	Unit: millidegree Celsius
+	RO, Required
+
+mode
+	One of the predefined values in [enabled, disabled].
+	This file gives information about the algorithm that is currently
+	managing the thermal zone. It can be either default kernel based
+	algorithm or user space application.
+
+	enabled
+			  enable Kernel Thermal management.
+	disabled
+			  Preventing kernel thermal zone driver actions upon
+			  trip points so that user application can take full
+			  charge of the thermal management.
+
+	RW, Optional
+
+policy
+	One of the various thermal governors used for a particular zone.
+
+	RW, Required
+
+available_policies
+	Available thermal governors which can be used for a particular zone.
+
+	RO, Required
+
+`trip_point_[0-*]_temp`
+	The temperature above which trip point will be fired.
+
+	Unit: millidegree Celsius
+
+	RO, Optional
+
+`trip_point_[0-*]_type`
+	Strings which indicate the type of the trip point.
+
+	E.g. it can be one of critical, hot, passive, `active[0-*]` for ACPI
+	thermal zone.
+
+	RO, Optional
+
+`trip_point_[0-*]_hyst`
+	The hysteresis value for a trip point, represented as an integer
+	Unit: Celsius
+	RW, Optional
+
+`cdev[0-*]`
+	Sysfs link to the thermal cooling device node where the sys I/F
+	for cooling device throttling control represents.
+
+	RO, Optional
+
+`cdev[0-*]_trip_point`
+	The trip point in this thermal zone which `cdev[0-*]` is associated
+	with; -1 means the cooling device is not associated with any trip
+	point.
+
+	RO, Optional
+
+`cdev[0-*]_weight`
+	The influence of `cdev[0-*]` in this thermal zone. This value
+	is relative to the rest of cooling devices in the thermal
+	zone. For example, if a cooling device has a weight double
+	than that of other, it's twice as effective in cooling the
+	thermal zone.
+
+	RW, Optional
+
+passive
+	Attribute is only present for zones in which the passive cooling
+	policy is not supported by native thermal driver. Default is zero
+	and can be set to a temperature (in millidegrees) to enable a
+	passive trip point for the zone. Activation is done by polling with
+	an interval of 1 second.
+
+	Unit: millidegrees Celsius
+
+	Valid values: 0 (disabled) or greater than 1000
+
+	RW, Optional
+
+emul_temp
+	Interface to set the emulated temperature method in thermal zone
+	(sensor). After setting this temperature, the thermal zone may pass
+	this temperature to platform emulation function if registered or
+	cache it locally. This is useful in debugging different temperature
+	threshold and its associated cooling action. This is write only node
+	and writing 0 on this node should disable emulation.
+	Unit: millidegree Celsius
+
+	WO, Optional
+
+	  WARNING:
+	    Be careful while enabling this option on production systems,
+	    because userland can easily disable the thermal policy by simply
+	    flooding this sysfs node with low temperature values.
+
+sustainable_power
+	An estimate of the sustained power that can be dissipated by
+	the thermal zone. Used by the power allocator governor. For
+	more information see Documentation/thermal/power_allocator.rst
+
+	Unit: milliwatts
+
+	RW, Optional
+
+k_po
+	The proportional term of the power allocator governor's PID
+	controller during temperature overshoot. Temperature overshoot
+	is when the current temperature is above the "desired
+	temperature" trip point. For more information see
+	Documentation/thermal/power_allocator.rst
+
+	RW, Optional
+
+k_pu
+	The proportional term of the power allocator governor's PID
+	controller during temperature undershoot. Temperature undershoot
+	is when the current temperature is below the "desired
+	temperature" trip point. For more information see
+	Documentation/thermal/power_allocator.rst
+
+	RW, Optional
+
+k_i
+	The integral term of the power allocator governor's PID
+	controller. This term allows the PID controller to compensate
+	for long term drift. For more information see
+	Documentation/thermal/power_allocator.rst
+
+	RW, Optional
+
+k_d
+	The derivative term of the power allocator governor's PID
+	controller. For more information see
+	Documentation/thermal/power_allocator.rst
+
+	RW, Optional
+
+integral_cutoff
+	Temperature offset from the desired temperature trip point
+	above which the integral term of the power allocator
+	governor's PID controller starts accumulating errors. For
+	example, if integral_cutoff is 0, then the integral term only
+	accumulates error when temperature is above the desired
+	temperature trip point. For more information see
+	Documentation/thermal/power_allocator.rst
+
+	Unit: millidegree Celsius
+
+	RW, Optional
+
+slope
+	The slope constant used in a linear extrapolation model
+	to determine a hotspot temperature based off the sensor's
+	raw readings. It is up to the device driver to determine
+	the usage of these values.
+
+	RW, Optional
+
+offset
+	The offset constant used in a linear extrapolation model
+	to determine a hotspot temperature based off the sensor's
+	raw readings. It is up to the device driver to determine
+	the usage of these values.
+
+	RW, Optional
+
+Cooling device attributes
+-------------------------
+
+type
+	String which represents the type of device, e.g:
+
+	- for generic ACPI: should be "Fan", "Processor" or "LCD"
+	- for memory controller device on intel_menlow platform:
+	  should be "Memory controller".
+
+	RO, Required
+
+max_state
+	The maximum permissible cooling state of this cooling device.
+
+	RO, Required
+
+cur_state
+	The current cooling state of this cooling device.
+	The value can any integer numbers between 0 and max_state:
+
+	- cur_state == 0 means no cooling
+	- cur_state == max_state means the maximum cooling.
+
+	RW, Required
+
+stats/reset
+	Writing any value resets the cooling device's statistics.
+	WO, Required
+
+stats/time_in_state_ms:
+	The amount of time spent by the cooling device in various cooling
+	states. The output will have "<state> <time>" pair in each line, which
+	will mean this cooling device spent <time> msec of time at <state>.
+	Output will have one line for each of the supported states.  usertime
+	units here is 10mS (similar to other time exported in /proc).
+	RO, Required
+
+
+stats/total_trans:
+	A single positive value showing the total number of times the state of a
+	cooling device is changed.
+
+	RO, Required
+
+stats/trans_table:
+	This gives fine grained information about all the cooling state
+	transitions. The cat output here is a two dimensional matrix, where an
+	entry <i,j> (row i, column j) represents the number of transitions from
+	State_i to State_j. If the transition table is bigger than PAGE_SIZE,
+	reading this will return an -EFBIG error.
+	RO, Required
+
+3. A simple implementation
+==========================
+
+ACPI thermal zone may support multiple trip points like critical, hot,
+passive, active. If an ACPI thermal zone supports critical, passive,
+active[0] and active[1] at the same time, it may register itself as a
+thermal_zone_device (thermal_zone1) with 4 trip points in all.
+It has one processor and one fan, which are both registered as
+thermal_cooling_device. Both are considered to have the same
+effectiveness in cooling the thermal zone.
+
+If the processor is listed in _PSL method, and the fan is listed in _AL0
+method, the sys I/F structure will be built like this::
+
+ /sys/class/thermal:
+  |thermal_zone1:
+    |---type:			acpitz
+    |---temp:			37000
+    |---mode:			enabled
+    |---policy:			step_wise
+    |---available_policies:	step_wise fair_share
+    |---trip_point_0_temp:	100000
+    |---trip_point_0_type:	critical
+    |---trip_point_1_temp:	80000
+    |---trip_point_1_type:	passive
+    |---trip_point_2_temp:	70000
+    |---trip_point_2_type:	active0
+    |---trip_point_3_temp:	60000
+    |---trip_point_3_type:	active1
+    |---cdev0:			--->/sys/class/thermal/cooling_device0
+    |---cdev0_trip_point:	1	/* cdev0 can be used for passive */
+    |---cdev0_weight:           1024
+    |---cdev1:			--->/sys/class/thermal/cooling_device3
+    |---cdev1_trip_point:	2	/* cdev1 can be used for active[0]*/
+    |---cdev1_weight:           1024
+
+  |cooling_device0:
+    |---type:			Processor
+    |---max_state:		8
+    |---cur_state:		0
+
+  |cooling_device3:
+    |---type:			Fan
+    |---max_state:		2
+    |---cur_state:		0
+
+ /sys/class/hwmon:
+  |hwmon0:
+    |---name:			acpitz
+    |---temp1_input:		37000
+    |---temp1_crit:		100000
+
+4. Event Notification
+=====================
+
+The framework includes a simple notification mechanism, in the form of a
+netlink event. Netlink socket initialization is done during the _init_
+of the framework. Drivers which intend to use the notification mechanism
+just need to call thermal_generate_netlink_event() with two arguments viz
+(originator, event). The originator is a pointer to struct thermal_zone_device
+from where the event has been originated. An integer which represents the
+thermal zone device will be used in the message to identify the zone. The
+event will be one of:{THERMAL_AUX0, THERMAL_AUX1, THERMAL_CRITICAL,
+THERMAL_DEV_FAULT}. Notification can be sent when the current temperature
+crosses any of the configured thresholds.
+
+5. Export Symbol APIs
+=====================
+
+5.1. get_tz_trend
+-----------------
+
+This function returns the trend of a thermal zone, i.e the rate of change
+of temperature of the thermal zone. Ideally, the thermal sensor drivers
+are supposed to implement the callback. If they don't, the thermal
+framework calculated the trend by comparing the previous and the current
+temperature values.
+
+5.2. get_thermal_instance
+-------------------------
+
+This function returns the thermal_instance corresponding to a given
+{thermal_zone, cooling_device, trip_point} combination. Returns NULL
+if such an instance does not exist.
+
+5.3. thermal_notify_framework
+-----------------------------
+
+This function handles the trip events from sensor drivers. It starts
+throttling the cooling devices according to the policy configured.
+For CRITICAL and HOT trip points, this notifies the respective drivers,
+and does actual throttling for other trip points i.e ACTIVE and PASSIVE.
+The throttling policy is based on the configured platform data; if no
+platform data is provided, this uses the step_wise throttling policy.
+
+5.4. thermal_cdev_update
+------------------------
+
+This function serves as an arbitrator to set the state of a cooling
+device. It sets the cooling device to the deepest cooling state if
+possible.
+
+6. thermal_emergency_poweroff
+=============================
+
+On an event of critical trip temperature crossing. Thermal framework
+allows the system to shutdown gracefully by calling orderly_poweroff().
+In the event of a failure of orderly_poweroff() to shut down the system
+we are in danger of keeping the system alive at undesirably high
+temperatures. To mitigate this high risk scenario we program a work
+queue to fire after a pre-determined number of seconds to start
+an emergency shutdown of the device using the kernel_power_off()
+function. In case kernel_power_off() fails then finally
+emergency_restart() is called in the worst case.
+
+The delay should be carefully profiled so as to give adequate time for
+orderly_poweroff(). In case of failure of an orderly_poweroff() the
+emergency poweroff kicks in after the delay has elapsed and shuts down
+the system.
+
+If set to 0 emergency poweroff will not be supported. So a carefully
+profiled non-zero positive value is a must for emergerncy poweroff to be
+triggered.
diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
deleted file mode 100644
index c3fa500df92c..000000000000
--- a/Documentation/thermal/sysfs-api.txt
+++ /dev/null
@@ -1,636 +0,0 @@
-Generic Thermal Sysfs driver How To
-===================================
-
-Written by Sujith Thomas <sujith.thomas@intel.com>, Zhang Rui <rui.zhang@intel.com>
-
-Updated: 2 January 2008
-
-Copyright (c)  2008 Intel Corporation
-
-
-0. Introduction
-
-The generic thermal sysfs provides a set of interfaces for thermal zone
-devices (sensors) and thermal cooling devices (fan, processor...) to register
-with the thermal management solution and to be a part of it.
-
-This how-to focuses on enabling new thermal zone and cooling devices to
-participate in thermal management.
-This solution is platform independent and any type of thermal zone devices
-and cooling devices should be able to make use of the infrastructure.
-
-The main task of the thermal sysfs driver is to expose thermal zone attributes
-as well as cooling device attributes to the user space.
-An intelligent thermal management application can make decisions based on
-inputs from thermal zone attributes (the current temperature and trip point
-temperature) and throttle appropriate devices.
-
-[0-*]	denotes any positive number starting from 0
-[1-*]	denotes any positive number starting from 1
-
-1. thermal sysfs driver interface functions
-
-1.1 thermal zone device interface
-1.1.1 struct thermal_zone_device *thermal_zone_device_register(char *type,
-		int trips, int mask, void *devdata,
-		struct thermal_zone_device_ops *ops,
-		const struct thermal_zone_params *tzp,
-		int passive_delay, int polling_delay))
-
-    This interface function adds a new thermal zone device (sensor) to
-    /sys/class/thermal folder as thermal_zone[0-*]. It tries to bind all the
-    thermal cooling devices registered at the same time.
-
-    type: the thermal zone type.
-    trips: the total number of trip points this thermal zone supports.
-    mask: Bit string: If 'n'th bit is set, then trip point 'n' is writeable.
-    devdata: device private data
-    ops: thermal zone device call-backs.
-	.bind: bind the thermal zone device with a thermal cooling device.
-	.unbind: unbind the thermal zone device with a thermal cooling device.
-	.get_temp: get the current temperature of the thermal zone.
-	.set_trips: set the trip points window. Whenever the current temperature
-		    is updated, the trip points immediately below and above the
-		    current temperature are found.
-	.get_mode: get the current mode (enabled/disabled) of the thermal zone.
-	    - "enabled" means the kernel thermal management is enabled.
-	    - "disabled" will prevent kernel thermal driver action upon trip points
-	      so that user applications can take charge of thermal management.
-	.set_mode: set the mode (enabled/disabled) of the thermal zone.
-	.get_trip_type: get the type of certain trip point.
-	.get_trip_temp: get the temperature above which the certain trip point
-			will be fired.
-	.set_emul_temp: set the emulation temperature which helps in debugging
-			different threshold temperature points.
-    tzp: thermal zone platform parameters.
-    passive_delay: number of milliseconds to wait between polls when
-	performing passive cooling.
-    polling_delay: number of milliseconds to wait between polls when checking
-	whether trip points have been crossed (0 for interrupt driven systems).
-
-
-1.1.2 void thermal_zone_device_unregister(struct thermal_zone_device *tz)
-
-    This interface function removes the thermal zone device.
-    It deletes the corresponding entry from /sys/class/thermal folder and
-    unbinds all the thermal cooling devices it uses.
-
-1.1.3 struct thermal_zone_device *thermal_zone_of_sensor_register(
-		struct device *dev, int sensor_id, void *data,
-		const struct thermal_zone_of_device_ops *ops)
-
-	This interface adds a new sensor to a DT thermal zone.
-	This function will search the list of thermal zones described in
-	device tree and look for the zone that refer to the sensor device
-	pointed by dev->of_node as temperature providers. For the zone
-	pointing to the sensor node, the sensor will be added to the DT
-	thermal zone device.
-
-	The parameters for this interface are:
-	dev:		Device node of sensor containing valid node pointer in
-			dev->of_node.
-	sensor_id:	a sensor identifier, in case the sensor IP has more
-			than one sensors
-	data:		a private pointer (owned by the caller) that will be
-			passed back, when a temperature reading is needed.
-	ops:		struct thermal_zone_of_device_ops *.
-
-			get_temp:	a pointer to a function that reads the
-					sensor temperature. This is mandatory
-					callback provided by sensor driver.
-			set_trips:      a pointer to a function that sets a
-					temperature window. When this window is
-					left the driver must inform the thermal
-					core via thermal_zone_device_update.
-			get_trend: 	a pointer to a function that reads the
-					sensor temperature trend.
-			set_emul_temp:	a pointer to a function that sets
-					sensor emulated temperature.
-	The thermal zone temperature is provided by the get_temp() function
-	pointer of thermal_zone_of_device_ops. When called, it will
-	have the private pointer @data back.
-
-	It returns error pointer if fails otherwise valid thermal zone device
-	handle. Caller should check the return handle with IS_ERR() for finding
-	whether success or not.
-
-1.1.4 void thermal_zone_of_sensor_unregister(struct device *dev,
-		struct thermal_zone_device *tzd)
-
-	This interface unregisters a sensor from a DT thermal zone which was
-	successfully added by interface thermal_zone_of_sensor_register().
-	This function removes the sensor callbacks and private data from the
-	thermal zone device registered with thermal_zone_of_sensor_register()
-	interface. It will also silent the zone by remove the .get_temp() and
-	get_trend() thermal zone device callbacks.
-
-1.1.5 struct thermal_zone_device *devm_thermal_zone_of_sensor_register(
-		struct device *dev, int sensor_id,
-		void *data, const struct thermal_zone_of_device_ops *ops)
-
-	This interface is resource managed version of
-	thermal_zone_of_sensor_register().
-	All details of thermal_zone_of_sensor_register() described in
-	section 1.1.3 is applicable here.
-	The benefit of using this interface to register sensor is that it
-	is not require to explicitly call thermal_zone_of_sensor_unregister()
-	in error path or during driver unbinding as this is done by driver
-	resource manager.
-
-1.1.6 void devm_thermal_zone_of_sensor_unregister(struct device *dev,
-		struct thermal_zone_device *tzd)
-
-	This interface is resource managed version of
-	thermal_zone_of_sensor_unregister().
-	All details of thermal_zone_of_sensor_unregister() described in
-	section 1.1.4 is applicable here.
-	Normally this function will not need to be called and the resource
-	management code will ensure that the resource is freed.
-
-1.1.7 int thermal_zone_get_slope(struct thermal_zone_device *tz)
-
-	This interface is used to read the slope attribute value
-	for the thermal zone device, which might be useful for platform
-	drivers for temperature calculations.
-
-1.1.8 int thermal_zone_get_offset(struct thermal_zone_device *tz)
-
-	This interface is used to read the offset attribute value
-	for the thermal zone device, which might be useful for platform
-	drivers for temperature calculations.
-
-1.2 thermal cooling device interface
-1.2.1 struct thermal_cooling_device *thermal_cooling_device_register(char *name,
-		void *devdata, struct thermal_cooling_device_ops *)
-
-    This interface function adds a new thermal cooling device (fan/processor/...)
-    to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself
-    to all the thermal zone devices registered at the same time.
-    name: the cooling device name.
-    devdata: device private data.
-    ops: thermal cooling devices call-backs.
-	.get_max_state: get the Maximum throttle state of the cooling device.
-	.get_cur_state: get the Currently requested throttle state of the cooling device.
-	.set_cur_state: set the Current throttle state of the cooling device.
-
-1.2.2 void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
-
-    This interface function removes the thermal cooling device.
-    It deletes the corresponding entry from /sys/class/thermal folder and
-    unbinds itself from all the thermal zone devices using it.
-
-1.3 interface for binding a thermal zone device with a thermal cooling device
-1.3.1 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
-	int trip, struct thermal_cooling_device *cdev,
-	unsigned long upper, unsigned long lower, unsigned int weight);
-
-    This interface function binds a thermal cooling device to a particular trip
-    point of a thermal zone device.
-    This function is usually called in the thermal zone device .bind callback.
-    tz: the thermal zone device
-    cdev: thermal cooling device
-    trip: indicates which trip point in this thermal zone the cooling device
-          is associated with.
-    upper:the Maximum cooling state for this trip point.
-          THERMAL_NO_LIMIT means no upper limit,
-	  and the cooling device can be in max_state.
-    lower:the Minimum cooling state can be used for this trip point.
-          THERMAL_NO_LIMIT means no lower limit,
-	  and the cooling device can be in cooling state 0.
-    weight: the influence of this cooling device in this thermal
-            zone.  See 1.4.1 below for more information.
-
-1.3.2 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
-		int trip, struct thermal_cooling_device *cdev);
-
-    This interface function unbinds a thermal cooling device from a particular
-    trip point of a thermal zone device. This function is usually called in
-    the thermal zone device .unbind callback.
-    tz: the thermal zone device
-    cdev: thermal cooling device
-    trip: indicates which trip point in this thermal zone the cooling device
-          is associated with.
-
-1.4 Thermal Zone Parameters
-1.4.1 struct thermal_bind_params
-    This structure defines the following parameters that are used to bind
-    a zone with a cooling device for a particular trip point.
-    .cdev: The cooling device pointer
-    .weight: The 'influence' of a particular cooling device on this
-             zone. This is relative to the rest of the cooling
-             devices. For example, if all cooling devices have a
-             weight of 1, then they all contribute the same. You can
-             use percentages if you want, but it's not mandatory. A
-             weight of 0 means that this cooling device doesn't
-             contribute to the cooling of this zone unless all cooling
-             devices have a weight of 0. If all weights are 0, then
-             they all contribute the same.
-    .trip_mask:This is a bit mask that gives the binding relation between
-               this thermal zone and cdev, for a particular trip point.
-               If nth bit is set, then the cdev and thermal zone are bound
-               for trip point n.
-    .binding_limits: This is an array of cooling state limits. Must have
-                     exactly 2 * thermal_zone.number_of_trip_points. It is an
-                     array consisting of tuples <lower-state upper-state> of
-                     state limits. Each trip will be associated with one state
-                     limit tuple when binding. A NULL pointer means
-                     <THERMAL_NO_LIMITS THERMAL_NO_LIMITS> on all trips.
-                     These limits are used when binding a cdev to a trip point.
-    .match: This call back returns success(0) if the 'tz and cdev' need to
-	    be bound, as per platform data.
-1.4.2 struct thermal_zone_params
-    This structure defines the platform level parameters for a thermal zone.
-    This data, for each thermal zone should come from the platform layer.
-    This is an optional feature where some platforms can choose not to
-    provide this data.
-    .governor_name: Name of the thermal governor used for this zone
-    .no_hwmon: a boolean to indicate if the thermal to hwmon sysfs interface
-               is required. when no_hwmon == false, a hwmon sysfs interface
-               will be created. when no_hwmon == true, nothing will be done.
-               In case the thermal_zone_params is NULL, the hwmon interface
-               will be created (for backward compatibility).
-    .num_tbps: Number of thermal_bind_params entries for this zone
-    .tbp: thermal_bind_params entries
-
-2. sysfs attributes structure
-
-RO	read only value
-WO	write only value
-RW	read/write value
-
-Thermal sysfs attributes will be represented under /sys/class/thermal.
-Hwmon sysfs I/F extension is also available under /sys/class/hwmon
-if hwmon is compiled in or built as a module.
-
-Thermal zone device sys I/F, created once it's registered:
-/sys/class/thermal/thermal_zone[0-*]:
-    |---type:			Type of the thermal zone
-    |---temp:			Current temperature
-    |---mode:			Working mode of the thermal zone
-    |---policy:			Thermal governor used for this zone
-    |---available_policies:	Available thermal governors for this zone
-    |---trip_point_[0-*]_temp:	Trip point temperature
-    |---trip_point_[0-*]_type:	Trip point type
-    |---trip_point_[0-*]_hyst:	Hysteresis value for this trip point
-    |---emul_temp:		Emulated temperature set node
-    |---sustainable_power:      Sustainable dissipatable power
-    |---k_po:                   Proportional term during temperature overshoot
-    |---k_pu:                   Proportional term during temperature undershoot
-    |---k_i:                    PID's integral term in the power allocator gov
-    |---k_d:                    PID's derivative term in the power allocator
-    |---integral_cutoff:        Offset above which errors are accumulated
-    |---slope:                  Slope constant applied as linear extrapolation
-    |---offset:                 Offset constant applied as linear extrapolation
-
-Thermal cooling device sys I/F, created once it's registered:
-/sys/class/thermal/cooling_device[0-*]:
-    |---type:			Type of the cooling device(processor/fan/...)
-    |---max_state:		Maximum cooling state of the cooling device
-    |---cur_state:		Current cooling state of the cooling device
-    |---stats:			Directory containing cooling device's statistics
-    |---stats/reset:		Writing any value resets the statistics
-    |---stats/time_in_state_ms:	Time (msec) spent in various cooling states
-    |---stats/total_trans:	Total number of times cooling state is changed
-    |---stats/trans_table:	Cooing state transition table
-
-
-Then next two dynamic attributes are created/removed in pairs. They represent
-the relationship between a thermal zone and its associated cooling device.
-They are created/removed for each successful execution of
-thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device.
-
-/sys/class/thermal/thermal_zone[0-*]:
-    |---cdev[0-*]:		[0-*]th cooling device in current thermal zone
-    |---cdev[0-*]_trip_point:	Trip point that cdev[0-*] is associated with
-    |---cdev[0-*]_weight:       Influence of the cooling device in
-                                this thermal zone
-
-Besides the thermal zone device sysfs I/F and cooling device sysfs I/F,
-the generic thermal driver also creates a hwmon sysfs I/F for each _type_
-of thermal zone device. E.g. the generic thermal driver registers one hwmon
-class device and build the associated hwmon sysfs I/F for all the registered
-ACPI thermal zones.
-
-/sys/class/hwmon/hwmon[0-*]:
-    |---name:			The type of the thermal zone devices
-    |---temp[1-*]_input:	The current temperature of thermal zone [1-*]
-    |---temp[1-*]_critical:	The critical trip point of thermal zone [1-*]
-
-Please read Documentation/hwmon/sysfs-interface.rst for additional information.
-
-***************************
-* Thermal zone attributes *
-***************************
-
-type
-	Strings which represent the thermal zone type.
-	This is given by thermal zone driver as part of registration.
-	E.g: "acpitz" indicates it's an ACPI thermal device.
-	In order to keep it consistent with hwmon sys attribute; this should
-	be a short, lowercase string, not containing spaces nor dashes.
-	RO, Required
-
-temp
-	Current temperature as reported by thermal zone (sensor).
-	Unit: millidegree Celsius
-	RO, Required
-
-mode
-	One of the predefined values in [enabled, disabled].
-	This file gives information about the algorithm that is currently
-	managing the thermal zone. It can be either default kernel based
-	algorithm or user space application.
-	enabled		= enable Kernel Thermal management.
-	disabled	= Preventing kernel thermal zone driver actions upon
-			  trip points so that user application can take full
-			  charge of the thermal management.
-	RW, Optional
-
-policy
-	One of the various thermal governors used for a particular zone.
-	RW, Required
-
-available_policies
-	Available thermal governors which can be used for a particular zone.
-	RO, Required
-
-trip_point_[0-*]_temp
-	The temperature above which trip point will be fired.
-	Unit: millidegree Celsius
-	RO, Optional
-
-trip_point_[0-*]_type
-	Strings which indicate the type of the trip point.
-	E.g. it can be one of critical, hot, passive, active[0-*] for ACPI
-	thermal zone.
-	RO, Optional
-
-trip_point_[0-*]_hyst
-	The hysteresis value for a trip point, represented as an integer
-	Unit: Celsius
-	RW, Optional
-
-cdev[0-*]
-	Sysfs link to the thermal cooling device node where the sys I/F
-	for cooling device throttling control represents.
-	RO, Optional
-
-cdev[0-*]_trip_point
-	The trip point in this thermal zone which cdev[0-*] is associated
-	with; -1 means the cooling device is not associated with any trip
-	point.
-	RO, Optional
-
-cdev[0-*]_weight
-        The influence of cdev[0-*] in this thermal zone. This value
-        is relative to the rest of cooling devices in the thermal
-        zone. For example, if a cooling device has a weight double
-        than that of other, it's twice as effective in cooling the
-        thermal zone.
-        RW, Optional
-
-passive
-	Attribute is only present for zones in which the passive cooling
-	policy is not supported by native thermal driver. Default is zero
-	and can be set to a temperature (in millidegrees) to enable a
-	passive trip point for the zone. Activation is done by polling with
-	an interval of 1 second.
-	Unit: millidegrees Celsius
-	Valid values: 0 (disabled) or greater than 1000
-	RW, Optional
-
-emul_temp
-	Interface to set the emulated temperature method in thermal zone
-	(sensor). After setting this temperature, the thermal zone may pass
-	this temperature to platform emulation function if registered or
-	cache it locally. This is useful in debugging different temperature
-	threshold and its associated cooling action. This is write only node
-	and writing 0 on this node should disable emulation.
-	Unit: millidegree Celsius
-	WO, Optional
-
-	  WARNING: Be careful while enabling this option on production systems,
-	  because userland can easily disable the thermal policy by simply
-	  flooding this sysfs node with low temperature values.
-
-sustainable_power
-	An estimate of the sustained power that can be dissipated by
-	the thermal zone. Used by the power allocator governor. For
-	more information see Documentation/thermal/power_allocator.txt
-	Unit: milliwatts
-	RW, Optional
-
-k_po
-	The proportional term of the power allocator governor's PID
-	controller during temperature overshoot. Temperature overshoot
-	is when the current temperature is above the "desired
-	temperature" trip point. For more information see
-	Documentation/thermal/power_allocator.txt
-	RW, Optional
-
-k_pu
-	The proportional term of the power allocator governor's PID
-	controller during temperature undershoot. Temperature undershoot
-	is when the current temperature is below the "desired
-	temperature" trip point. For more information see
-	Documentation/thermal/power_allocator.txt
-	RW, Optional
-
-k_i
-	The integral term of the power allocator governor's PID
-	controller. This term allows the PID controller to compensate
-	for long term drift. For more information see
-	Documentation/thermal/power_allocator.txt
-	RW, Optional
-
-k_d
-	The derivative term of the power allocator governor's PID
-	controller. For more information see
-	Documentation/thermal/power_allocator.txt
-	RW, Optional
-
-integral_cutoff
-	Temperature offset from the desired temperature trip point
-	above which the integral term of the power allocator
-	governor's PID controller starts accumulating errors. For
-	example, if integral_cutoff is 0, then the integral term only
-	accumulates error when temperature is above the desired
-	temperature trip point. For more information see
-	Documentation/thermal/power_allocator.txt
-	Unit: millidegree Celsius
-	RW, Optional
-
-slope
-	The slope constant used in a linear extrapolation model
-	to determine a hotspot temperature based off the sensor's
-	raw readings. It is up to the device driver to determine
-	the usage of these values.
-	RW, Optional
-
-offset
-	The offset constant used in a linear extrapolation model
-	to determine a hotspot temperature based off the sensor's
-	raw readings. It is up to the device driver to determine
-	the usage of these values.
-	RW, Optional
-
-*****************************
-* Cooling device attributes *
-*****************************
-
-type
-	String which represents the type of device, e.g:
-	- for generic ACPI: should be "Fan", "Processor" or "LCD"
-	- for memory controller device on intel_menlow platform:
-	  should be "Memory controller".
-	RO, Required
-
-max_state
-	The maximum permissible cooling state of this cooling device.
-	RO, Required
-
-cur_state
-	The current cooling state of this cooling device.
-	The value can any integer numbers between 0 and max_state:
-	- cur_state == 0 means no cooling
-	- cur_state == max_state means the maximum cooling.
-	RW, Required
-
-stats/reset
-	Writing any value resets the cooling device's statistics.
-	WO, Required
-
-stats/time_in_state_ms:
-	The amount of time spent by the cooling device in various cooling
-	states. The output will have "<state> <time>" pair in each line, which
-	will mean this cooling device spent <time> msec of time at <state>.
-	Output will have one line for each of the supported states.  usertime
-	units here is 10mS (similar to other time exported in /proc).
-	RO, Required
-
-stats/total_trans:
-	A single positive value showing the total number of times the state of a
-	cooling device is changed.
-	RO, Required
-
-stats/trans_table:
-	This gives fine grained information about all the cooling state
-	transitions. The cat output here is a two dimensional matrix, where an
-	entry <i,j> (row i, column j) represents the number of transitions from
-	State_i to State_j. If the transition table is bigger than PAGE_SIZE,
-	reading this will return an -EFBIG error.
-	RO, Required
-
-3. A simple implementation
-
-ACPI thermal zone may support multiple trip points like critical, hot,
-passive, active. If an ACPI thermal zone supports critical, passive,
-active[0] and active[1] at the same time, it may register itself as a
-thermal_zone_device (thermal_zone1) with 4 trip points in all.
-It has one processor and one fan, which are both registered as
-thermal_cooling_device. Both are considered to have the same
-effectiveness in cooling the thermal zone.
-
-If the processor is listed in _PSL method, and the fan is listed in _AL0
-method, the sys I/F structure will be built like this:
-
-/sys/class/thermal:
-
-|thermal_zone1:
-    |---type:			acpitz
-    |---temp:			37000
-    |---mode:			enabled
-    |---policy:			step_wise
-    |---available_policies:	step_wise fair_share
-    |---trip_point_0_temp:	100000
-    |---trip_point_0_type:	critical
-    |---trip_point_1_temp:	80000
-    |---trip_point_1_type:	passive
-    |---trip_point_2_temp:	70000
-    |---trip_point_2_type:	active0
-    |---trip_point_3_temp:	60000
-    |---trip_point_3_type:	active1
-    |---cdev0:			--->/sys/class/thermal/cooling_device0
-    |---cdev0_trip_point:	1	/* cdev0 can be used for passive */
-    |---cdev0_weight:           1024
-    |---cdev1:			--->/sys/class/thermal/cooling_device3
-    |---cdev1_trip_point:	2	/* cdev1 can be used for active[0]*/
-    |---cdev1_weight:           1024
-
-|cooling_device0:
-    |---type:			Processor
-    |---max_state:		8
-    |---cur_state:		0
-
-|cooling_device3:
-    |---type:			Fan
-    |---max_state:		2
-    |---cur_state:		0
-
-/sys/class/hwmon:
-
-|hwmon0:
-    |---name:			acpitz
-    |---temp1_input:		37000
-    |---temp1_crit:		100000
-
-4. Event Notification
-
-The framework includes a simple notification mechanism, in the form of a
-netlink event. Netlink socket initialization is done during the _init_
-of the framework. Drivers which intend to use the notification mechanism
-just need to call thermal_generate_netlink_event() with two arguments viz
-(originator, event). The originator is a pointer to struct thermal_zone_device
-from where the event has been originated. An integer which represents the
-thermal zone device will be used in the message to identify the zone. The
-event will be one of:{THERMAL_AUX0, THERMAL_AUX1, THERMAL_CRITICAL,
-THERMAL_DEV_FAULT}. Notification can be sent when the current temperature
-crosses any of the configured thresholds.
-
-5. Export Symbol APIs:
-
-5.1: get_tz_trend:
-This function returns the trend of a thermal zone, i.e the rate of change
-of temperature of the thermal zone. Ideally, the thermal sensor drivers
-are supposed to implement the callback. If they don't, the thermal
-framework calculated the trend by comparing the previous and the current
-temperature values.
-
-5.2:get_thermal_instance:
-This function returns the thermal_instance corresponding to a given
-{thermal_zone, cooling_device, trip_point} combination. Returns NULL
-if such an instance does not exist.
-
-5.3:thermal_notify_framework:
-This function handles the trip events from sensor drivers. It starts
-throttling the cooling devices according to the policy configured.
-For CRITICAL and HOT trip points, this notifies the respective drivers,
-and does actual throttling for other trip points i.e ACTIVE and PASSIVE.
-The throttling policy is based on the configured platform data; if no
-platform data is provided, this uses the step_wise throttling policy.
-
-5.4:thermal_cdev_update:
-This function serves as an arbitrator to set the state of a cooling
-device. It sets the cooling device to the deepest cooling state if
-possible.
-
-6. thermal_emergency_poweroff:
-
-On an event of critical trip temperature crossing. Thermal framework
-allows the system to shutdown gracefully by calling orderly_poweroff().
-In the event of a failure of orderly_poweroff() to shut down the system
-we are in danger of keeping the system alive at undesirably high
-temperatures. To mitigate this high risk scenario we program a work
-queue to fire after a pre-determined number of seconds to start
-an emergency shutdown of the device using the kernel_power_off()
-function. In case kernel_power_off() fails then finally
-emergency_restart() is called in the worst case.
-
-The delay should be carefully profiled so as to give adequate time for
-orderly_poweroff(). In case of failure of an orderly_poweroff() the
-emergency poweroff kicks in after the delay has elapsed and shuts down
-the system.
-
-If set to 0 emergency poweroff will not be supported. So a carefully
-profiled non-zero positive value is a must for emergerncy poweroff to be
-triggered.
diff --git a/Documentation/thermal/x86_pkg_temperature_thermal b/Documentation/thermal/x86_pkg_temperature_thermal
deleted file mode 100644
index 17a3a4c0a0ca..000000000000
--- a/Documentation/thermal/x86_pkg_temperature_thermal
+++ /dev/null
@@ -1,47 +0,0 @@
-Kernel driver: x86_pkg_temp_thermal
-===================
-
-Supported chips:
-* x86: with package level thermal management
-(Verify using: CPUID.06H:EAX[bit 6] =1)
-
-Authors: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
-
-Reference
----
-Intel® 64 and IA-32 Architectures Software Developer’s Manual (Jan, 2013):
-Chapter 14.6: PACKAGE LEVEL THERMAL MANAGEMENT
-
-Description
----------
-
-This driver register CPU digital temperature package level sensor as a thermal
-zone with maximum two user mode configurable trip points. Number of trip points
-depends on the capability of the package. Once the trip point is violated,
-user mode can receive notification via thermal notification mechanism and can
-take any action to control temperature.
-
-
-Threshold management
---------------------
-Each package will register as a thermal zone under /sys/class/thermal.
-Example:
-/sys/class/thermal/thermal_zone1
-
-This contains two trip points:
-- trip_point_0_temp
-- trip_point_1_temp
-
-User can set any temperature between 0 to TJ-Max temperature. Temperature units
-are in milli-degree Celsius. Refer to "Documentation/thermal/sysfs-api.txt" for
-thermal sys-fs details.
-
-Any value other than 0 in these trip points, can trigger thermal notifications.
-Setting 0, stops sending thermal notifications.
-
-Thermal notifications: To get kobject-uevent notifications, set the thermal zone
-policy to "user_space". For example: echo -n "user_space" > policy
-
-
-
-
diff --git a/Documentation/thermal/x86_pkg_temperature_thermal.rst b/Documentation/thermal/x86_pkg_temperature_thermal.rst
new file mode 100644
index 000000000000..f134dbd3f5a9
--- /dev/null
+++ b/Documentation/thermal/x86_pkg_temperature_thermal.rst
@@ -0,0 +1,55 @@
+===================================
+Kernel driver: x86_pkg_temp_thermal
+===================================
+
+Supported chips:
+
+* x86: with package level thermal management
+
+(Verify using: CPUID.06H:EAX[bit 6] =1)
+
+Authors: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+
+Reference
+---------
+
+Intel® 64 and IA-32 Architectures Software Developer’s Manual (Jan, 2013):
+Chapter 14.6: PACKAGE LEVEL THERMAL MANAGEMENT
+
+Description
+-----------
+
+This driver register CPU digital temperature package level sensor as a thermal
+zone with maximum two user mode configurable trip points. Number of trip points
+depends on the capability of the package. Once the trip point is violated,
+user mode can receive notification via thermal notification mechanism and can
+take any action to control temperature.
+
+
+Threshold management
+--------------------
+Each package will register as a thermal zone under /sys/class/thermal.
+
+Example::
+
+	/sys/class/thermal/thermal_zone1
+
+This contains two trip points:
+
+- trip_point_0_temp
+- trip_point_1_temp
+
+User can set any temperature between 0 to TJ-Max temperature. Temperature units
+are in milli-degree Celsius. Refer to "Documentation/thermal/sysfs-api.rst" for
+thermal sys-fs details.
+
+Any value other than 0 in these trip points, can trigger thermal notifications.
+Setting 0, stops sending thermal notifications.
+
+Thermal notifications:
+To get kobject-uevent notifications, set the thermal zone
+policy to "user_space".
+
+For example::
+
+	echo -n "user_space" > policy
diff --git a/MAINTAINERS b/MAINTAINERS
index d0ed735994a5..693f2aebbc83 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15618,7 +15618,7 @@ M:	Viresh Kumar <viresh.kumar@linaro.org>
 M:	Javi Merino <javi.merino@kernel.org>
 L:	linux-pm@vger.kernel.org
 S:	Supported
-F:	Documentation/thermal/cpu-cooling-api.txt
+F:	Documentation/thermal/cpu-cooling-api.rst
 F:	drivers/thermal/cpu_cooling.c
 F:	include/linux/cpu_cooling.h
 
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 15a4ca5d7099..681047f8cc05 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -251,7 +251,7 @@ struct thermal_bind_params {
 	 * platform characterization. This value is relative to the
 	 * rest of the weights so a cooling device whose weight is
 	 * double that of another cooling device is twice as
-	 * effective. See Documentation/thermal/sysfs-api.txt for more
+	 * effective. See Documentation/thermal/sysfs-api.rst for more
 	 * information.
 	 */
 	int weight;
@@ -259,7 +259,7 @@ struct thermal_bind_params {
 	/*
 	 * This is a bit mask that gives the binding relation between this
 	 * thermal zone and cdev, for a particular trip point.
-	 * See Documentation/thermal/sysfs-api.txt for more information.
+	 * See Documentation/thermal/sysfs-api.rst for more information.
 	 */
 	int trip_mask;
 
-- 
cgit v1.2.3


From 586c1b4125b3c7bf5b482fcafab5d568b8a3c285 Mon Sep 17 00:00:00 2001
From: Tony Xie <tony.xie@rock-chips.com>
Date: Fri, 21 Jun 2019 06:32:54 -0400
Subject: mfd: rk808: Add RK817 and RK809 support

The RK809 and RK817 are a Power Management IC (PMIC) for multimedia
and handheld devices. They contains the following components:
  - Regulators
  - RTC
  - Clocking

Both RK809 and RK817 chips are using a similar register map,
so we can reuse the RTC and Clocking functionality.
Most of regulators have a some implementation also.

Signed-off-by: Tony Xie <tony.xie@rock-chips.com>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig       |   6 +-
 drivers/mfd/rk808.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/mfd/rk808.h | 172 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 364 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 294d9567cc71..0b7db542e478 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1030,14 +1030,14 @@ config MFD_RC5T583
 	  different functionality of the device.
 
 config MFD_RK808
-	tristate "Rockchip RK805/RK808/RK818 Power Management Chip"
+	tristate "Rockchip RK805/RK808/RK809/RK817/RK818 Power Management Chip"
 	depends on I2C && OF
 	select MFD_CORE
 	select REGMAP_I2C
 	select REGMAP_IRQ
 	help
-	  If you say yes here you get support for the RK805, RK808 and RK818
-	  Power Management chips.
+	  If you say yes here you get support for the RK805, RK808, RK809,
+	  RK817 and RK818 Power Management chips.
 	  This driver provides common support for accessing the device
 	  through I2C interface. The device supports multiple sub-devices
 	  including interrupts, RTC, LDO & DCDC regulators, and onkey.
diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c
index 94377782d208..6ee1c461a3bb 100644
--- a/drivers/mfd/rk808.c
+++ b/drivers/mfd/rk808.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/of_device.h>
 #include <linux/regmap.h>
+#include <linux/syscore_ops.h>
 
 struct rk808_reg_data {
 	int addr;
@@ -62,6 +63,27 @@ static bool rk808_is_volatile_reg(struct device *dev, unsigned int reg)
 	return false;
 }
 
+static bool rk817_is_volatile_reg(struct device *dev, unsigned int reg)
+{
+	/*
+	 * Notes:
+	 * - Technically the ROUND_30s bit makes RTC_CTRL_REG volatile, but
+	 *   we don't use that feature.  It's better to cache.
+	 */
+
+	switch (reg) {
+	case RK817_SECONDS_REG ... RK817_WEEKS_REG:
+	case RK817_RTC_STATUS_REG:
+	case RK817_INT_STS_REG0:
+	case RK817_INT_STS_REG1:
+	case RK817_INT_STS_REG2:
+	case RK817_SYS_STS:
+		return true;
+	}
+
+	return true;
+}
+
 static const struct regmap_config rk818_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 8,
@@ -86,6 +108,14 @@ static const struct regmap_config rk808_regmap_config = {
 	.volatile_reg = rk808_is_volatile_reg,
 };
 
+static const struct regmap_config rk817_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = RK817_GPIO_INT_CFG,
+	.cache_type = REGCACHE_NONE,
+	.volatile_reg = rk817_is_volatile_reg,
+};
+
 static struct resource rtc_resources[] = {
 	{
 		.start  = RK808_IRQ_RTC_ALARM,
@@ -94,6 +124,10 @@ static struct resource rtc_resources[] = {
 	}
 };
 
+static struct resource rk817_rtc_resources[] = {
+	DEFINE_RES_IRQ(RK817_IRQ_RTC_ALARM),
+};
+
 static struct resource rk805_key_resources[] = {
 	{
 		.start  = RK805_IRQ_PWRON_FALL,
@@ -107,6 +141,11 @@ static struct resource rk805_key_resources[] = {
 	}
 };
 
+static struct resource rk817_pwrkey_resources[] = {
+	DEFINE_RES_IRQ(RK817_IRQ_PWRON_RISE),
+	DEFINE_RES_IRQ(RK817_IRQ_PWRON_FALL),
+};
+
 static const struct mfd_cell rk805s[] = {
 	{ .name = "rk808-clkout", },
 	{ .name = "rk808-regulator", },
@@ -132,6 +171,21 @@ static const struct mfd_cell rk808s[] = {
 	},
 };
 
+static const struct mfd_cell rk817s[] = {
+	{ .name = "rk808-clkout",},
+	{ .name = "rk808-regulator",},
+	{
+		.name = "rk8xx-pwrkey",
+		.num_resources = ARRAY_SIZE(rk817_pwrkey_resources),
+		.resources = &rk817_pwrkey_resources[0],
+	},
+	{
+		.name = "rk808-rtc",
+		.num_resources = ARRAY_SIZE(rk817_rtc_resources),
+		.resources = &rk817_rtc_resources[0],
+	},
+};
+
 static const struct mfd_cell rk818s[] = {
 	{ .name = "rk808-clkout", },
 	{ .name = "rk808-regulator", },
@@ -167,6 +221,13 @@ static const struct rk808_reg_data rk808_pre_init_reg[] = {
 						    VB_LO_SEL_3500MV },
 };
 
+static const struct rk808_reg_data rk817_pre_init_reg[] = {
+	{RK817_RTC_CTRL_REG, RTC_STOP, RTC_STOP},
+	{RK817_GPIO_INT_CFG, RK817_INT_POL_MSK, RK817_INT_POL_H},
+	{RK817_SYS_CFG(1), RK817_HOTDIE_TEMP_MSK | RK817_TSD_TEMP_MSK,
+					   RK817_HOTDIE_105 | RK817_TSD_140},
+};
+
 static const struct rk808_reg_data rk818_pre_init_reg[] = {
 	/* improve efficiency */
 	{ RK818_BUCK2_CONFIG_REG, BUCK2_RATE_MASK,  BUCK_ILMIN_250MA },
@@ -332,6 +393,33 @@ static const struct regmap_irq rk818_irqs[] = {
 	},
 };
 
+static const struct regmap_irq rk817_irqs[RK817_IRQ_END] = {
+	REGMAP_IRQ_REG_LINE(0, 8),
+	REGMAP_IRQ_REG_LINE(1, 8),
+	REGMAP_IRQ_REG_LINE(2, 8),
+	REGMAP_IRQ_REG_LINE(3, 8),
+	REGMAP_IRQ_REG_LINE(4, 8),
+	REGMAP_IRQ_REG_LINE(5, 8),
+	REGMAP_IRQ_REG_LINE(6, 8),
+	REGMAP_IRQ_REG_LINE(7, 8),
+	REGMAP_IRQ_REG_LINE(8, 8),
+	REGMAP_IRQ_REG_LINE(9, 8),
+	REGMAP_IRQ_REG_LINE(10, 8),
+	REGMAP_IRQ_REG_LINE(11, 8),
+	REGMAP_IRQ_REG_LINE(12, 8),
+	REGMAP_IRQ_REG_LINE(13, 8),
+	REGMAP_IRQ_REG_LINE(14, 8),
+	REGMAP_IRQ_REG_LINE(15, 8),
+	REGMAP_IRQ_REG_LINE(16, 8),
+	REGMAP_IRQ_REG_LINE(17, 8),
+	REGMAP_IRQ_REG_LINE(18, 8),
+	REGMAP_IRQ_REG_LINE(19, 8),
+	REGMAP_IRQ_REG_LINE(20, 8),
+	REGMAP_IRQ_REG_LINE(21, 8),
+	REGMAP_IRQ_REG_LINE(22, 8),
+	REGMAP_IRQ_REG_LINE(23, 8)
+};
+
 static struct regmap_irq_chip rk805_irq_chip = {
 	.name = "rk805",
 	.irqs = rk805_irqs,
@@ -355,6 +443,18 @@ static const struct regmap_irq_chip rk808_irq_chip = {
 	.init_ack_masked = true,
 };
 
+static struct regmap_irq_chip rk817_irq_chip = {
+	.name = "rk817",
+	.irqs = rk817_irqs,
+	.num_irqs = ARRAY_SIZE(rk817_irqs),
+	.num_regs = 3,
+	.irq_reg_stride = 2,
+	.status_base = RK817_INT_STS_REG0,
+	.mask_base = RK817_INT_STS_MSK_REG0,
+	.ack_base = RK817_INT_STS_REG0,
+	.init_ack_masked = true,
+};
+
 static const struct regmap_irq_chip rk818_irq_chip = {
 	.name = "rk818",
 	.irqs = rk818_irqs,
@@ -423,9 +523,33 @@ static void rk818_device_shutdown(void)
 		dev_err(&rk808_i2c_client->dev, "power off error!\n");
 }
 
+static void rk8xx_syscore_shutdown(void)
+{
+	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+	int ret;
+
+	if (system_state == SYSTEM_POWER_OFF &&
+	    (rk808->variant == RK809_ID || rk808->variant == RK817_ID)) {
+		ret = regmap_update_bits(rk808->regmap,
+					 RK817_SYS_CFG(3),
+					 RK817_SLPPIN_FUNC_MSK,
+					 SLPPIN_DN_FUN);
+		if (ret) {
+			dev_warn(&rk808_i2c_client->dev,
+				 "Cannot switch to power down function\n");
+		}
+	}
+}
+
+static struct syscore_ops rk808_syscore_ops = {
+	.shutdown = rk8xx_syscore_shutdown,
+};
+
 static const struct of_device_id rk808_of_match[] = {
 	{ .compatible = "rockchip,rk805" },
 	{ .compatible = "rockchip,rk808" },
+	{ .compatible = "rockchip,rk809" },
+	{ .compatible = "rockchip,rk817" },
 	{ .compatible = "rockchip,rk818" },
 	{ },
 };
@@ -438,10 +562,11 @@ static int rk808_probe(struct i2c_client *client,
 	struct rk808 *rk808;
 	const struct rk808_reg_data *pre_init_reg;
 	const struct mfd_cell *cells;
-	void (*pm_pwroff_fn)(void);
+	void (*pm_pwroff_fn)(void) = NULL;
 	int nr_pre_init_regs;
 	int nr_cells;
 	int pm_off = 0, msb, lsb;
+	unsigned char pmic_id_msb, pmic_id_lsb;
 	int ret;
 	int i;
 
@@ -449,15 +574,24 @@ static int rk808_probe(struct i2c_client *client,
 	if (!rk808)
 		return -ENOMEM;
 
+	if (of_device_is_compatible(np, "rockchip,rk817") ||
+	    of_device_is_compatible(np, "rockchip,rk809")) {
+		pmic_id_msb = RK817_ID_MSB;
+		pmic_id_lsb = RK817_ID_LSB;
+	} else {
+		pmic_id_msb = RK808_ID_MSB;
+		pmic_id_lsb = RK808_ID_LSB;
+	}
+
 	/* Read chip variant */
-	msb = i2c_smbus_read_byte_data(client, RK808_ID_MSB);
+	msb = i2c_smbus_read_byte_data(client, pmic_id_msb);
 	if (msb < 0) {
 		dev_err(&client->dev, "failed to read the chip id at 0x%x\n",
 			RK808_ID_MSB);
 		return msb;
 	}
 
-	lsb = i2c_smbus_read_byte_data(client, RK808_ID_LSB);
+	lsb = i2c_smbus_read_byte_data(client, pmic_id_lsb);
 	if (lsb < 0) {
 		dev_err(&client->dev, "failed to read the chip id at 0x%x\n",
 			RK808_ID_LSB);
@@ -495,6 +629,16 @@ static int rk808_probe(struct i2c_client *client,
 		nr_cells = ARRAY_SIZE(rk818s);
 		pm_pwroff_fn = rk818_device_shutdown;
 		break;
+	case RK809_ID:
+	case RK817_ID:
+		rk808->regmap_cfg = &rk817_regmap_config;
+		rk808->regmap_irq_chip = &rk817_irq_chip;
+		pre_init_reg = rk817_pre_init_reg;
+		nr_pre_init_regs = ARRAY_SIZE(rk817_pre_init_reg);
+		cells = rk817s;
+		nr_cells = ARRAY_SIZE(rk817s);
+		register_syscore_ops(&rk808_syscore_ops);
+		break;
 	default:
 		dev_err(&client->dev, "Unsupported RK8XX ID %lu\n",
 			rk808->variant);
@@ -568,10 +712,52 @@ static int rk808_remove(struct i2c_client *client)
 	return 0;
 }
 
+static int rk8xx_suspend(struct device *dev)
+{
+	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+	int ret = 0;
+
+	switch (rk808->variant) {
+	case RK809_ID:
+	case RK817_ID:
+		ret = regmap_update_bits(rk808->regmap,
+					 RK817_SYS_CFG(3),
+					 RK817_SLPPIN_FUNC_MSK,
+					 SLPPIN_SLP_FUN);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+static int rk8xx_resume(struct device *dev)
+{
+	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+	int ret = 0;
+
+	switch (rk808->variant) {
+	case RK809_ID:
+	case RK817_ID:
+		ret = regmap_update_bits(rk808->regmap,
+					 RK817_SYS_CFG(3),
+					 RK817_SLPPIN_FUNC_MSK,
+					 SLPPIN_NULL_FUN);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+SIMPLE_DEV_PM_OPS(rk8xx_pm_ops, rk8xx_suspend, rk8xx_resume);
+
 static struct i2c_driver rk808_i2c_driver = {
 	.driver = {
 		.name = "rk808",
 		.of_match_table = rk808_of_match,
+		.pm = &rk8xx_pm_ops,
 	},
 	.probe    = rk808_probe,
 	.remove   = rk808_remove,
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index d3156594674c..0fd9eedf3c20 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -382,6 +382,7 @@ enum rk805_reg {
 #define SWITCH1_EN	BIT(5)
 #define DEV_OFF_RST	BIT(3)
 #define DEV_OFF		BIT(0)
+#define RTC_STOP	BIT(0)
 
 #define VB_LO_ACT		BIT(4)
 #define VB_LO_SEL_3500MV	(7 << 0)
@@ -396,6 +397,175 @@ enum rk805_reg {
 #define SLEEP_FUN			(0x1 << 2)
 #define RK8XX_ID_MSK			0xfff0
 #define FPWM_MODE			BIT(7)
+enum rk817_reg_id {
+	RK817_ID_DCDC1 = 0,
+	RK817_ID_DCDC2,
+	RK817_ID_DCDC3,
+	RK817_ID_DCDC4,
+	RK817_ID_LDO1,
+	RK817_ID_LDO2,
+	RK817_ID_LDO3,
+	RK817_ID_LDO4,
+	RK817_ID_LDO5,
+	RK817_ID_LDO6,
+	RK817_ID_LDO7,
+	RK817_ID_LDO8,
+	RK817_ID_LDO9,
+	RK817_ID_BOOST,
+	RK817_ID_BOOST_OTG_SW,
+	RK817_NUM_REGULATORS
+};
+
+enum rk809_reg_id {
+	RK809_ID_DCDC5 = RK817_ID_BOOST,
+	RK809_ID_SW1,
+	RK809_ID_SW2,
+	RK809_NUM_REGULATORS
+};
+
+#define RK817_SECONDS_REG		0x00
+#define RK817_MINUTES_REG		0x01
+#define RK817_HOURS_REG			0x02
+#define RK817_DAYS_REG			0x03
+#define RK817_MONTHS_REG		0x04
+#define RK817_YEARS_REG			0x05
+#define RK817_WEEKS_REG			0x06
+#define RK817_ALARM_SECONDS_REG		0x07
+#define RK817_ALARM_MINUTES_REG		0x08
+#define RK817_ALARM_HOURS_REG		0x09
+#define RK817_ALARM_DAYS_REG		0x0a
+#define RK817_ALARM_MONTHS_REG		0x0b
+#define RK817_ALARM_YEARS_REG		0x0c
+#define RK817_RTC_CTRL_REG		0xd
+#define RK817_RTC_STATUS_REG		0xe
+#define RK817_RTC_INT_REG		0xf
+#define RK817_RTC_COMP_LSB_REG		0x10
+#define RK817_RTC_COMP_MSB_REG		0x11
+
+#define RK817_POWER_EN_REG(i)		(0xb1 + (i))
+#define RK817_POWER_SLP_EN_REG(i)	(0xb5 + (i))
+
+#define RK817_POWER_CONFIG		(0xb9)
+
+#define RK817_BUCK_CONFIG_REG(i)	(0xba + (i) * 3)
+
+#define RK817_BUCK1_ON_VSEL_REG		0xBB
+#define RK817_BUCK1_SLP_VSEL_REG	0xBC
+
+#define RK817_BUCK2_CONFIG_REG		0xBD
+#define RK817_BUCK2_ON_VSEL_REG		0xBE
+#define RK817_BUCK2_SLP_VSEL_REG	0xBF
+
+#define RK817_BUCK3_CONFIG_REG		0xC0
+#define RK817_BUCK3_ON_VSEL_REG		0xC1
+#define RK817_BUCK3_SLP_VSEL_REG	0xC2
+
+#define RK817_BUCK4_CONFIG_REG		0xC3
+#define RK817_BUCK4_ON_VSEL_REG		0xC4
+#define RK817_BUCK4_SLP_VSEL_REG	0xC5
+
+#define RK817_LDO_ON_VSEL_REG(idx)	(0xcc + (idx) * 2)
+#define RK817_BOOST_OTG_CFG		(0xde)
+
+#define RK817_ID_MSB			0xed
+#define RK817_ID_LSB			0xee
+
+#define RK817_SYS_STS			0xf0
+#define RK817_SYS_CFG(i)		(0xf1 + (i))
+
+#define RK817_ON_SOURCE_REG		0xf5
+#define RK817_OFF_SOURCE_REG		0xf6
+
+/* INTERRUPT REGISTER */
+#define RK817_INT_STS_REG0		0xf8
+#define RK817_INT_STS_MSK_REG0		0xf9
+#define RK817_INT_STS_REG1		0xfa
+#define RK817_INT_STS_MSK_REG1		0xfb
+#define RK817_INT_STS_REG2		0xfc
+#define RK817_INT_STS_MSK_REG2		0xfd
+#define RK817_GPIO_INT_CFG		0xfe
+
+/* IRQ Definitions */
+#define RK817_IRQ_PWRON_FALL		0
+#define RK817_IRQ_PWRON_RISE		1
+#define RK817_IRQ_PWRON			2
+#define RK817_IRQ_PWMON_LP		3
+#define RK817_IRQ_HOTDIE		4
+#define RK817_IRQ_RTC_ALARM		5
+#define RK817_IRQ_RTC_PERIOD		6
+#define RK817_IRQ_VB_LO			7
+#define RK817_IRQ_PLUG_IN		8
+#define RK817_IRQ_PLUG_OUT		9
+#define RK817_IRQ_CHRG_TERM		10
+#define RK817_IRQ_CHRG_TIME		11
+#define RK817_IRQ_CHRG_TS		12
+#define RK817_IRQ_USB_OV		13
+#define RK817_IRQ_CHRG_IN_CLMP		14
+#define RK817_IRQ_BAT_DIS_ILIM		15
+#define RK817_IRQ_GATE_GPIO		16
+#define RK817_IRQ_TS_GPIO		17
+#define RK817_IRQ_CODEC_PD		18
+#define RK817_IRQ_CODEC_PO		19
+#define RK817_IRQ_CLASSD_MUTE_DONE	20
+#define RK817_IRQ_CLASSD_OCP		21
+#define RK817_IRQ_BAT_OVP               22
+#define RK817_IRQ_CHRG_BAT_HI		23
+#define RK817_IRQ_END			(RK817_IRQ_CHRG_BAT_HI + 1)
+
+/*
+ * rtc_ctrl 0xd
+ * same as 808, except bit4
+ */
+#define RK817_RTC_CTRL_RSV4		BIT(4)
+
+/* power config 0xb9 */
+#define RK817_BUCK3_FB_RES_MSK		BIT(6)
+#define RK817_BUCK3_FB_RES_INTER	BIT(6)
+#define RK817_BUCK3_FB_RES_EXT		0
+
+/* buck config 0xba */
+#define RK817_RAMP_RATE_OFFSET		6
+#define RK817_RAMP_RATE_MASK		(0x3 << RK817_RAMP_RATE_OFFSET)
+#define RK817_RAMP_RATE_3MV_PER_US	(0x0 << RK817_RAMP_RATE_OFFSET)
+#define RK817_RAMP_RATE_6_3MV_PER_US	(0x1 << RK817_RAMP_RATE_OFFSET)
+#define RK817_RAMP_RATE_12_5MV_PER_US	(0x2 << RK817_RAMP_RATE_OFFSET)
+#define RK817_RAMP_RATE_25MV_PER_US	(0x3 << RK817_RAMP_RATE_OFFSET)
+
+/* sys_cfg1 0xf2 */
+#define RK817_HOTDIE_TEMP_MSK		(0x3 << 4)
+#define RK817_HOTDIE_85			(0x0 << 4)
+#define RK817_HOTDIE_95			(0x1 << 4)
+#define RK817_HOTDIE_105		(0x2 << 4)
+#define RK817_HOTDIE_115		(0x3 << 4)
+
+#define RK817_TSD_TEMP_MSK		BIT(6)
+#define RK817_TSD_140			0
+#define RK817_TSD_160			BIT(6)
+
+#define RK817_CLK32KOUT2_EN		BIT(7)
+
+/* sys_cfg3 0xf4 */
+#define RK817_SLPPIN_FUNC_MSK		(0x3 << 3)
+#define SLPPIN_NULL_FUN			(0x0 << 3)
+#define SLPPIN_SLP_FUN			(0x1 << 3)
+#define SLPPIN_DN_FUN			(0x2 << 3)
+#define SLPPIN_RST_FUN			(0x3 << 3)
+
+#define RK817_RST_FUNC_MSK		(0x3 << 6)
+#define RK817_RST_FUNC_SFT		(6)
+#define RK817_RST_FUNC_CNT		(3)
+#define RK817_RST_FUNC_DEV		(0) /* reset the dev */
+#define RK817_RST_FUNC_REG		(0x1 << 6) /* reset the reg only */
+
+#define RK817_SLPPOL_MSK		BIT(5)
+#define RK817_SLPPOL_H			BIT(5)
+#define RK817_SLPPOL_L			(0)
+
+/* gpio&int 0xfe */
+#define RK817_INT_POL_MSK		BIT(1)
+#define RK817_INT_POL_H			BIT(1)
+#define RK817_INT_POL_L			0
+#define RK809_BUCK5_CONFIG(i)		(RK817_BOOST_OTG_CFG + (i) * 1)
 
 enum {
 	BUCK_ILMIN_50MA,
@@ -443,6 +613,8 @@ enum {
 enum {
 	RK805_ID = 0x8050,
 	RK808_ID = 0x0000,
+	RK809_ID = 0x8090,
+	RK817_ID = 0x8170,
 	RK818_ID = 0x8181,
 };
 
-- 
cgit v1.2.3


From e444f6d68c07bc01a3a3d5905409dbe1ca391d26 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Wed, 26 Jun 2019 14:29:18 +0200
Subject: regulator: rk808: Add RK809 and RK817 support.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

    Add support for the rk809 and rk817 regulator driver.
    Their specifications are as follows：
    1. The RK809 and RK809 consist of 5 DCDCs, 9 LDOs
       and have the same registers for these components except dcdc5.
    2. The dcdc5 is a boost dcdc for RK817 and is a buck for RK809.
    3. The RK817 has one switch but The Rk809 has two.

    The output voltages are configurable and are meant to supply power
    to the main processor and other components.

Signed-off-by: Tony Xie <tony.xie@rock-chips.com>
Acked-by: Mark Brown <broonie@kernel.org>
[rebased on top of 5.2-rc1]
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/regulator/Kconfig           |   4 +-
 drivers/regulator/rk808-regulator.c | 646 ++++++++++++++++++++++++++++++++++--
 include/linux/mfd/rk808.h           |   3 +
 3 files changed, 625 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 6c37f0df9323..214a958ff3e5 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -762,11 +762,11 @@ config REGULATOR_RC5T583
 	  outputs which can be controlled by i2c communication.
 
 config REGULATOR_RK808
-	tristate "Rockchip RK805/RK808/RK818 Power regulators"
+	tristate "Rockchip RK805/RK808/RK809/RK817/RK818 Power regulators"
 	depends on MFD_RK808
 	help
 	  Select this option to enable the power regulator of ROCKCHIP
-	  PMIC RK805,RK808 and RK818.
+	  PMIC RK805,RK809&RK817,RK808 and RK818.
 	  This driver supports the control of different power rails of device
 	  through regulator interface. The device supports multiple DCDC/LDO
 	  outputs which can be controlled by i2c communication.
diff --git a/drivers/regulator/rk808-regulator.c b/drivers/regulator/rk808-regulator.c
index 23713e16c286..e9b0bb996fc4 100644
--- a/drivers/regulator/rk808-regulator.c
+++ b/drivers/regulator/rk808-regulator.c
@@ -36,6 +36,12 @@
 #define RK808_BUCK4_VSEL_MASK	0xf
 #define RK808_LDO_VSEL_MASK	0x1f
 
+#define RK809_BUCK5_VSEL_MASK		0x7
+
+#define RK817_LDO_VSEL_MASK		0x7f
+#define RK817_BOOST_VSEL_MASK		0x7
+#define RK817_BUCK_VSEL_MASK		0x7f
+
 #define RK818_BUCK_VSEL_MASK		0x3f
 #define RK818_BUCK4_VSEL_MASK		0x1f
 #define RK818_LDO_VSEL_MASK		0x1f
@@ -65,30 +71,36 @@
 /* max steps for increase voltage of Buck1/2, equal 100mv*/
 #define MAX_STEPS_ONE_TIME 8
 
-#define RK805_DESC(_id, _match, _supply, _min, _max, _step, _vreg,      \
-	_vmask, _ereg, _emask, _etime)                                  \
-	[_id] = {                                                       \
-		.name           = (_match),                             \
-		.supply_name    = (_supply),                            \
-		.of_match       = of_match_ptr(_match),                 \
-		.regulators_node = of_match_ptr("regulators"),          \
-		.type           = REGULATOR_VOLTAGE,                    \
-		.id             = (_id),                                \
-		.n_voltages     = (((_max) - (_min)) / (_step) + 1),    \
-		.owner          = THIS_MODULE,                          \
-		.min_uV         = (_min) * 1000,                        \
-		.uV_step        = (_step) * 1000,                       \
-		.vsel_reg       = (_vreg),                              \
-		.vsel_mask      = (_vmask),                             \
-		.enable_reg     = (_ereg),                              \
-		.enable_mask    = (_emask),                             \
-		.enable_time    = (_etime),                             \
-		.ops            = &rk805_reg_ops,                       \
+#define ENABLE_MASK(id)			(BIT(id) | BIT(4 + (id)))
+#define DISABLE_VAL(id)			(BIT(4 + (id)))
+
+#define RK817_BOOST_DESC(_id, _match, _supply, _min, _max, _step, _vreg,\
+	_vmask, _ereg, _emask, _enval, _disval, _etime, m_drop)		\
+	{							\
+		.name		= (_match),				\
+		.supply_name	= (_supply),				\
+		.of_match	= of_match_ptr(_match),			\
+		.regulators_node = of_match_ptr("regulators"),		\
+		.type		= REGULATOR_VOLTAGE,			\
+		.id		= (_id),				\
+		.n_voltages	= (((_max) - (_min)) / (_step) + 1),	\
+		.owner		= THIS_MODULE,				\
+		.min_uV		= (_min) * 1000,			\
+		.uV_step	= (_step) * 1000,			\
+		.vsel_reg	= (_vreg),				\
+		.vsel_mask	= (_vmask),				\
+		.enable_reg	= (_ereg),				\
+		.enable_mask	= (_emask),				\
+		.enable_val     = (_enval),				\
+		.disable_val     = (_disval),				\
+		.enable_time	= (_etime),				\
+		.min_dropout_uV = (m_drop) * 1000,			\
+		.ops		= &rk817_boost_ops,			\
 	}
 
-#define RK8XX_DESC(_id, _match, _supply, _min, _max, _step, _vreg,	\
-	_vmask, _ereg, _emask, _etime)					\
-	[_id] = {							\
+#define RK8XX_DESC_COM(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, _enval, _disval, _etime, _ops)		\
+	{								\
 		.name		= (_match),				\
 		.supply_name	= (_supply),				\
 		.of_match	= of_match_ptr(_match),			\
@@ -103,12 +115,30 @@
 		.vsel_mask	= (_vmask),				\
 		.enable_reg	= (_ereg),				\
 		.enable_mask	= (_emask),				\
+		.enable_val     = (_enval),				\
+		.disable_val     = (_disval),				\
 		.enable_time	= (_etime),				\
-		.ops		= &rk808_reg_ops,			\
+		.ops		= _ops,			\
 	}
 
-#define RK8XX_DESC_SWITCH(_id, _match, _supply, _ereg, _emask)		\
-	[_id] = {							\
+#define RK805_DESC(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, _etime)					\
+	RK8XX_DESC_COM(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, 0, 0, _etime, &rk805_reg_ops)
+
+#define RK8XX_DESC(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, _etime)					\
+	RK8XX_DESC_COM(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, 0, 0, _etime, &rk808_reg_ops)
+
+#define RK817_DESC(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, _disval, _etime)				\
+	RK8XX_DESC_COM(_id, _match, _supply, _min, _max, _step, _vreg,	\
+	_vmask, _ereg, _emask, _emask, _disval, _etime, &rk817_reg_ops)
+
+#define RKXX_DESC_SWITCH_COM(_id, _match, _supply, _ereg, _emask,	\
+	_enval, _disval, _ops)						\
+	{								\
 		.name		= (_match),				\
 		.supply_name	= (_supply),				\
 		.of_match	= of_match_ptr(_match),			\
@@ -117,10 +147,20 @@
 		.id		= (_id),				\
 		.enable_reg	= (_ereg),				\
 		.enable_mask	= (_emask),				\
+		.enable_val     = (_enval),				\
+		.disable_val     = (_disval),				\
 		.owner		= THIS_MODULE,				\
-		.ops		= &rk808_switch_ops			\
+		.ops		= _ops					\
 	}
 
+#define RK817_DESC_SWITCH(_id, _match, _supply, _ereg, _emask,		\
+	_disval)							\
+	RKXX_DESC_SWITCH_COM(_id, _match, _supply, _ereg, _emask,	\
+	_emask, _disval, &rk817_switch_ops)
+
+#define RK8XX_DESC_SWITCH(_id, _match, _supply, _ereg, _emask)		\
+	RKXX_DESC_SWITCH_COM(_id, _match, _supply, _ereg, _emask,	\
+	0, 0, &rk808_switch_ops)
 
 struct rk808_regulator_data {
 	struct gpio_desc *dvs_gpio[2];
@@ -138,6 +178,51 @@ static const struct regulator_linear_range rk808_ldo3_voltage_ranges[] = {
 	REGULATOR_LINEAR_RANGE(2500000, 15, 15, 0),
 };
 
+#define RK809_BUCK5_SEL_CNT		(8)
+
+static const struct regulator_linear_range rk809_buck5_voltage_ranges[] = {
+	REGULATOR_LINEAR_RANGE(1500000, 0, 0, 0),
+	REGULATOR_LINEAR_RANGE(1800000, 1, 3, 200000),
+	REGULATOR_LINEAR_RANGE(2800000, 4, 5, 200000),
+	REGULATOR_LINEAR_RANGE(3300000, 6, 7, 300000),
+};
+
+#define RK817_BUCK1_MIN0 500000
+#define RK817_BUCK1_MAX0 1500000
+
+#define RK817_BUCK1_MIN1 1600000
+#define RK817_BUCK1_MAX1 2400000
+
+#define RK817_BUCK3_MAX1 3400000
+
+#define RK817_BUCK1_STP0 12500
+#define RK817_BUCK1_STP1 100000
+
+#define RK817_BUCK1_SEL0 ((RK817_BUCK1_MAX0 - RK817_BUCK1_MIN0) /\
+						  RK817_BUCK1_STP0)
+#define RK817_BUCK1_SEL1 ((RK817_BUCK1_MAX1 - RK817_BUCK1_MIN1) /\
+						  RK817_BUCK1_STP1)
+
+#define RK817_BUCK3_SEL1 ((RK817_BUCK3_MAX1 - RK817_BUCK1_MIN1) /\
+						  RK817_BUCK1_STP1)
+
+#define RK817_BUCK1_SEL_CNT (RK817_BUCK1_SEL0 + RK817_BUCK1_SEL1 + 1)
+#define RK817_BUCK3_SEL_CNT (RK817_BUCK1_SEL0 + RK817_BUCK3_SEL1 + 1)
+
+static const struct regulator_linear_range rk817_buck1_voltage_ranges[] = {
+	REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN0, 0,
+			       RK817_BUCK1_SEL0, RK817_BUCK1_STP0),
+	REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN1, RK817_BUCK1_SEL0 + 1,
+			       RK817_BUCK1_SEL_CNT, RK817_BUCK1_STP1),
+};
+
+static const struct regulator_linear_range rk817_buck3_voltage_ranges[] = {
+	REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN0, 0,
+			       RK817_BUCK1_SEL0, RK817_BUCK1_STP0),
+	REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN1, RK817_BUCK1_SEL0 + 1,
+			       RK817_BUCK3_SEL_CNT, RK817_BUCK1_STP1),
+};
+
 static int rk808_buck1_2_get_voltage_sel_regmap(struct regulator_dev *rdev)
 {
 	struct rk808_regulator_data *pdata = rdev_get_drvdata(rdev);
@@ -289,6 +374,36 @@ static int rk808_set_ramp_delay(struct regulator_dev *rdev, int ramp_delay)
 				  RK808_RAMP_RATE_MASK, ramp_value);
 }
 
+/*
+ * RK817 RK809
+ */
+static int rk817_set_ramp_delay(struct regulator_dev *rdev, int ramp_delay)
+{
+	unsigned int ramp_value = RK817_RAMP_RATE_25MV_PER_US;
+	unsigned int reg = RK817_BUCK_CONFIG_REG(rdev_get_id(rdev));
+
+	switch (ramp_delay) {
+	case 0 ... 3000:
+		ramp_value = RK817_RAMP_RATE_3MV_PER_US;
+		break;
+	case 3001 ... 6300:
+		ramp_value = RK817_RAMP_RATE_6_3MV_PER_US;
+		break;
+	case 6301 ... 12500:
+		ramp_value = RK817_RAMP_RATE_12_5MV_PER_US;
+		break;
+	case 12501 ... 25000:
+		break;
+	default:
+		dev_warn(&rdev->dev,
+			 "%s ramp_delay: %d not supported, setting 10000\n",
+			 rdev->desc->name, ramp_delay);
+	}
+
+	return regmap_update_bits(rdev->regmap, reg,
+				  RK817_RAMP_RATE_MASK, ramp_value);
+}
+
 static int rk808_set_suspend_voltage(struct regulator_dev *rdev, int uv)
 {
 	unsigned int reg;
@@ -304,6 +419,21 @@ static int rk808_set_suspend_voltage(struct regulator_dev *rdev, int uv)
 				  sel);
 }
 
+static int rk817_set_suspend_voltage(struct regulator_dev *rdev, int uv)
+{
+	unsigned int reg;
+	int sel = regulator_map_voltage_linear(rdev, uv, uv);
+	/* only ldo1~ldo9 */
+	if (sel < 0)
+		return -EINVAL;
+
+	reg = rdev->desc->vsel_reg + RK808_SLP_REG_OFFSET;
+
+	return regmap_update_bits(rdev->regmap, reg,
+				  rdev->desc->vsel_mask,
+				  sel);
+}
+
 static int rk808_set_suspend_voltage_range(struct regulator_dev *rdev, int uv)
 {
 	unsigned int reg;
@@ -363,6 +493,131 @@ static int rk808_set_suspend_disable(struct regulator_dev *rdev)
 				  rdev->desc->enable_mask);
 }
 
+static int rk817_set_suspend_enable_ctrl(struct regulator_dev *rdev,
+					 unsigned int en)
+{
+	unsigned int reg;
+	int id = rdev_get_id(rdev);
+	unsigned int id_slp, msk, val;
+
+	if (id >= RK817_ID_DCDC1 && id <= RK817_ID_DCDC4)
+		id_slp = id;
+	else if (id >= RK817_ID_LDO1 && id <= RK817_ID_LDO8)
+		id_slp = 8 + (id - RK817_ID_LDO1);
+	else if (id >= RK817_ID_LDO9 && id <= RK809_ID_SW2)
+		id_slp = 4 + (id - RK817_ID_LDO9);
+	else
+		return -EINVAL;
+
+	reg = RK817_POWER_SLP_EN_REG(id_slp / 8);
+
+	msk = BIT(id_slp % 8);
+	if (en)
+		val = msk;
+	else
+		val = 0;
+
+	return regmap_update_bits(rdev->regmap, reg, msk, val);
+}
+
+static int rk817_set_suspend_enable(struct regulator_dev *rdev)
+{
+	return rk817_set_suspend_enable_ctrl(rdev, 1);
+}
+
+static int rk817_set_suspend_disable(struct regulator_dev *rdev)
+{
+	return rk817_set_suspend_enable_ctrl(rdev, 0);
+}
+
+static int rk8xx_set_suspend_mode(struct regulator_dev *rdev, unsigned int mode)
+{
+	unsigned int reg;
+
+	reg = rdev->desc->vsel_reg + RK808_SLP_REG_OFFSET;
+
+	switch (mode) {
+	case REGULATOR_MODE_FAST:
+		return regmap_update_bits(rdev->regmap, reg,
+					  PWM_MODE_MSK, FPWM_MODE);
+	case REGULATOR_MODE_NORMAL:
+		return regmap_update_bits(rdev->regmap, reg,
+					  PWM_MODE_MSK, AUTO_PWM_MODE);
+	default:
+		dev_err(&rdev->dev, "do not support this mode\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int rk8xx_set_mode(struct regulator_dev *rdev, unsigned int mode)
+{
+	switch (mode) {
+	case REGULATOR_MODE_FAST:
+		return regmap_update_bits(rdev->regmap, rdev->desc->vsel_reg,
+					  PWM_MODE_MSK, FPWM_MODE);
+	case REGULATOR_MODE_NORMAL:
+		return regmap_update_bits(rdev->regmap, rdev->desc->vsel_reg,
+					  PWM_MODE_MSK, AUTO_PWM_MODE);
+	default:
+		dev_err(&rdev->dev, "do not support this mode\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static unsigned int rk8xx_get_mode(struct regulator_dev *rdev)
+{
+	unsigned int val;
+	int err;
+
+	err = regmap_read(rdev->regmap, rdev->desc->vsel_reg, &val);
+	if (err)
+		return err;
+
+	if (val & FPWM_MODE)
+		return REGULATOR_MODE_FAST;
+	else
+		return REGULATOR_MODE_NORMAL;
+}
+
+static int rk8xx_is_enabled_wmsk_regmap(struct regulator_dev *rdev)
+{
+	unsigned int val;
+	int ret;
+
+	ret = regmap_read(rdev->regmap, rdev->desc->enable_reg, &val);
+	if (ret != 0)
+		return ret;
+
+	/* add write mask bit */
+	val |= (rdev->desc->enable_mask & 0xf0);
+	val &= rdev->desc->enable_mask;
+
+	if (rdev->desc->enable_is_inverted) {
+		if (rdev->desc->enable_val)
+			return val != rdev->desc->enable_val;
+		return (val == 0);
+	}
+	if (rdev->desc->enable_val)
+		return val == rdev->desc->enable_val;
+	return val != 0;
+}
+
+static unsigned int rk8xx_regulator_of_map_mode(unsigned int mode)
+{
+	switch (mode) {
+	case 1:
+		return REGULATOR_MODE_FAST;
+	case 2:
+		return REGULATOR_MODE_NORMAL;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct regulator_ops rk805_reg_ops = {
 	.list_voltage           = regulator_list_voltage_linear,
 	.map_voltage            = regulator_map_voltage_linear,
@@ -439,6 +694,71 @@ static const struct regulator_linear_range rk805_buck_1_2_voltage_ranges[] = {
 	REGULATOR_LINEAR_RANGE(2300000, 63, 63, 0),
 };
 
+static struct regulator_ops rk809_buck5_ops_range = {
+	.list_voltage		= regulator_list_voltage_linear_range,
+	.map_voltage		= regulator_map_voltage_linear_range,
+	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
+	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
+	.set_voltage_time_sel	= regulator_set_voltage_time_sel,
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.is_enabled		= rk8xx_is_enabled_wmsk_regmap,
+	.set_suspend_voltage	= rk808_set_suspend_voltage_range,
+	.set_suspend_enable	= rk817_set_suspend_enable,
+	.set_suspend_disable	= rk817_set_suspend_disable,
+};
+
+static struct regulator_ops rk817_reg_ops = {
+	.list_voltage		= regulator_list_voltage_linear,
+	.map_voltage		= regulator_map_voltage_linear,
+	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
+	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.is_enabled		= rk8xx_is_enabled_wmsk_regmap,
+	.set_suspend_voltage	= rk817_set_suspend_voltage,
+	.set_suspend_enable	= rk817_set_suspend_enable,
+	.set_suspend_disable	= rk817_set_suspend_disable,
+};
+
+static struct regulator_ops rk817_boost_ops = {
+	.list_voltage		= regulator_list_voltage_linear,
+	.map_voltage		= regulator_map_voltage_linear,
+	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
+	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.is_enabled		= rk8xx_is_enabled_wmsk_regmap,
+	.set_suspend_enable	= rk817_set_suspend_enable,
+	.set_suspend_disable	= rk817_set_suspend_disable,
+};
+
+static struct regulator_ops rk817_buck_ops_range = {
+	.list_voltage		= regulator_list_voltage_linear_range,
+	.map_voltage		= regulator_map_voltage_linear_range,
+	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
+	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
+	.set_voltage_time_sel	= regulator_set_voltage_time_sel,
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.is_enabled		= rk8xx_is_enabled_wmsk_regmap,
+	.set_mode		= rk8xx_set_mode,
+	.get_mode		= rk8xx_get_mode,
+	.set_suspend_mode	= rk8xx_set_suspend_mode,
+	.set_ramp_delay		= rk817_set_ramp_delay,
+	.set_suspend_voltage	= rk808_set_suspend_voltage_range,
+	.set_suspend_enable	= rk817_set_suspend_enable,
+	.set_suspend_disable	= rk817_set_suspend_disable,
+};
+
+static struct regulator_ops rk817_switch_ops = {
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.is_enabled		= rk8xx_is_enabled_wmsk_regmap,
+	.set_suspend_enable	= rk817_set_suspend_enable,
+	.set_suspend_disable	= rk817_set_suspend_disable,
+};
+
 static const struct regulator_desc rk805_reg[] = {
 	{
 		.name = "DCDC_REG1",
@@ -595,6 +915,271 @@ static const struct regulator_desc rk808_reg[] = {
 		RK808_DCDC_EN_REG, BIT(6)),
 };
 
+static const struct regulator_desc rk809_reg[] = {
+	{
+		.name = "DCDC_REG1",
+		.supply_name = "vcc1",
+		.of_match = of_match_ptr("DCDC_REG1"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC1,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK1_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC1),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC1),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC1),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG2",
+		.supply_name = "vcc2",
+		.of_match = of_match_ptr("DCDC_REG2"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC2,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK2_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC2),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC2),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC2),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG3",
+		.supply_name = "vcc3",
+		.of_match = of_match_ptr("DCDC_REG3"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC3,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK3_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC3),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC3),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC3),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG4",
+		.supply_name = "vcc4",
+		.of_match = of_match_ptr("DCDC_REG4"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC4,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK3_SEL_CNT + 1,
+		.linear_ranges = rk817_buck3_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck3_voltage_ranges),
+		.vsel_reg = RK817_BUCK4_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC4),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC4),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC4),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	},
+	{
+		.name = "DCDC_REG5",
+		.supply_name = "vcc9",
+		.of_match = of_match_ptr("DCDC_REG5"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK809_ID_DCDC5,
+		.ops = &rk809_buck5_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK809_BUCK5_SEL_CNT,
+		.linear_ranges = rk809_buck5_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk809_buck5_voltage_ranges),
+		.vsel_reg = RK809_BUCK5_CONFIG(0),
+		.vsel_mask = RK809_BUCK5_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(3),
+		.enable_mask = ENABLE_MASK(1),
+		.enable_val = ENABLE_MASK(1),
+		.disable_val = DISABLE_VAL(1),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	},
+	RK817_DESC(RK817_ID_LDO1, "LDO_REG1", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(0), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_DESC(RK817_ID_LDO2, "LDO_REG2", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(1), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(1),
+		   DISABLE_VAL(1), 400),
+	RK817_DESC(RK817_ID_LDO3, "LDO_REG3", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(2), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(2),
+		   DISABLE_VAL(2), 400),
+	RK817_DESC(RK817_ID_LDO4, "LDO_REG4", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(3), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(3),
+		   DISABLE_VAL(3), 400),
+	RK817_DESC(RK817_ID_LDO5, "LDO_REG5", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(4), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_DESC(RK817_ID_LDO6, "LDO_REG6", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(5), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(1),
+		   DISABLE_VAL(1), 400),
+	RK817_DESC(RK817_ID_LDO7, "LDO_REG7", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(6), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(2),
+		   DISABLE_VAL(2), 400),
+	RK817_DESC(RK817_ID_LDO8, "LDO_REG8", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(7), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(3),
+		   DISABLE_VAL(3), 400),
+	RK817_DESC(RK817_ID_LDO9, "LDO_REG9", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(8), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(3), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_DESC_SWITCH(RK809_ID_SW1, "SWITCH_REG1", "vcc9",
+			  RK817_POWER_EN_REG(3), ENABLE_MASK(2),
+			  DISABLE_VAL(2)),
+	RK817_DESC_SWITCH(RK809_ID_SW2, "SWITCH_REG2", "vcc8",
+			  RK817_POWER_EN_REG(3), ENABLE_MASK(3),
+			  DISABLE_VAL(3)),
+};
+
+static const struct regulator_desc rk817_reg[] = {
+	{
+		.name = "DCDC_REG1",
+		.supply_name = "vcc1",
+		.of_match = of_match_ptr("DCDC_REG1"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC1,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK1_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC1),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC1),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC1),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG2",
+		.supply_name = "vcc2",
+		.of_match = of_match_ptr("DCDC_REG2"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC2,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK2_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC2),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC2),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC2),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG3",
+		.supply_name = "vcc3",
+		.of_match = of_match_ptr("DCDC_REG3"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC3,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK1_SEL_CNT + 1,
+		.linear_ranges = rk817_buck1_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck1_voltage_ranges),
+		.vsel_reg = RK817_BUCK3_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC3),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC3),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC3),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG4",
+		.supply_name = "vcc4",
+		.of_match = of_match_ptr("DCDC_REG4"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK817_ID_DCDC4,
+		.ops = &rk817_buck_ops_range,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = RK817_BUCK3_SEL_CNT + 1,
+		.linear_ranges = rk817_buck3_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk817_buck3_voltage_ranges),
+		.vsel_reg = RK817_BUCK4_ON_VSEL_REG,
+		.vsel_mask = RK817_BUCK_VSEL_MASK,
+		.enable_reg = RK817_POWER_EN_REG(0),
+		.enable_mask = ENABLE_MASK(RK817_ID_DCDC4),
+		.enable_val = ENABLE_MASK(RK817_ID_DCDC4),
+		.disable_val = DISABLE_VAL(RK817_ID_DCDC4),
+		.of_map_mode = rk8xx_regulator_of_map_mode,
+		.owner = THIS_MODULE,
+	},
+	RK817_DESC(RK817_ID_LDO1, "LDO_REG1", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(0), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_DESC(RK817_ID_LDO2, "LDO_REG2", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(1), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(1),
+		   DISABLE_VAL(1), 400),
+	RK817_DESC(RK817_ID_LDO3, "LDO_REG3", "vcc5", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(2), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(2),
+		   DISABLE_VAL(2), 400),
+	RK817_DESC(RK817_ID_LDO4, "LDO_REG4", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(3), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(1), ENABLE_MASK(3),
+		   DISABLE_VAL(3), 400),
+	RK817_DESC(RK817_ID_LDO5, "LDO_REG5", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(4), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_DESC(RK817_ID_LDO6, "LDO_REG6", "vcc6", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(5), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(1),
+		   DISABLE_VAL(1), 400),
+	RK817_DESC(RK817_ID_LDO7, "LDO_REG7", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(6), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(2),
+		   DISABLE_VAL(2), 400),
+	RK817_DESC(RK817_ID_LDO8, "LDO_REG8", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(7), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(2), ENABLE_MASK(3),
+		   DISABLE_VAL(3), 400),
+	RK817_DESC(RK817_ID_LDO9, "LDO_REG9", "vcc7", 600, 3400, 25,
+		   RK817_LDO_ON_VSEL_REG(8), RK817_LDO_VSEL_MASK,
+		   RK817_POWER_EN_REG(3), ENABLE_MASK(0),
+		   DISABLE_VAL(0), 400),
+	RK817_BOOST_DESC(RK817_ID_BOOST, "BOOST", "vcc8", 4700, 5400, 100,
+			 RK817_BOOST_OTG_CFG, RK817_BOOST_VSEL_MASK,
+			 RK817_POWER_EN_REG(3), ENABLE_MASK(1), ENABLE_MASK(1),
+		   DISABLE_VAL(1), 400, 3500 - 5400),
+	RK817_DESC_SWITCH(RK817_ID_BOOST_OTG_SW, "OTG_SWITCH", "vcc9",
+			  RK817_POWER_EN_REG(3), ENABLE_MASK(2),
+			  DISABLE_VAL(2)),
+};
+
 static const struct regulator_desc rk818_reg[] = {
 	{
 		.name = "DCDC_REG1",
@@ -765,6 +1350,14 @@ static int rk808_regulator_probe(struct platform_device *pdev)
 		regulators = rk808_reg;
 		nregulators = RK808_NUM_REGULATORS;
 		break;
+	case RK809_ID:
+		regulators = rk809_reg;
+		nregulators = RK809_NUM_REGULATORS;
+		break;
+	case RK817_ID:
+		regulators = rk817_reg;
+		nregulators = RK817_NUM_REGULATORS;
+		break;
 	case RK818_ID:
 		regulators = rk818_reg;
 		nregulators = RK818_NUM_REGULATORS;
@@ -803,6 +1396,7 @@ static struct platform_driver rk808_regulator_driver = {
 module_platform_driver(rk808_regulator_driver);
 
 MODULE_DESCRIPTION("regulator driver for the RK805/RK808/RK818 series PMICs");
+MODULE_AUTHOR("Tony xie <tony.xie@rock-chips.com>");
 MODULE_AUTHOR("Chris Zhong <zyw@rock-chips.com>");
 MODULE_AUTHOR("Zhang Qing <zhangqing@rock-chips.com>");
 MODULE_AUTHOR("Wadim Egorov <w.egorov@phytec.de>");
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index 0fd9eedf3c20..2a9cd01691b2 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -396,7 +396,10 @@ enum rk805_reg {
 #define SHUTDOWN_FUN			(0x2 << 2)
 #define SLEEP_FUN			(0x1 << 2)
 #define RK8XX_ID_MSK			0xfff0
+#define PWM_MODE_MSK			BIT(7)
 #define FPWM_MODE			BIT(7)
+#define AUTO_PWM_MODE			0
+
 enum rk817_reg_id {
 	RK817_ID_DCDC1 = 0,
 	RK817_ID_DCDC2,
-- 
cgit v1.2.3


From 5a136b4ae327e7f6be9c984a010df8d7ea5a4f83 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 7 Jun 2019 12:10:33 -0300
Subject: mm/hmm: Fix error flows in hmm_invalidate_range_start

If the trylock on the hmm->mirrors_sem fails the function will return
without decrementing the notifiers that were previously incremented. Since
the caller will not call invalidate_range_end() on EAGAIN this will result
in notifiers becoming permanently incremented and deadlock.

If the sync_cpu_device_pagetables() required blocking the function will
not return EAGAIN even though the device continues to touch the
pages. This is a violation of the mmu notifier contract.

Switch, and rename, the ranges_lock to a spin lock so we can reliably
obtain it without blocking during error unwind.

The error unwind is necessary since the notifiers count must be held
incremented across the call to sync_cpu_device_pagetables() as we cannot
allow the range to become marked valid by a parallel
invalidate_start/end() pair while doing sync_cpu_device_pagetables().

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Philip Yang <Philip.Yang@amd.com>
---
 include/linux/hmm.h |  2 +-
 mm/hmm.c            | 69 +++++++++++++++++++++++++++++++----------------------
 2 files changed, 41 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index bf013e965257..0fa8ea34ccef 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -86,7 +86,7 @@
 struct hmm {
 	struct mm_struct	*mm;
 	struct kref		kref;
-	struct mutex		lock;
+	spinlock_t		ranges_lock;
 	struct list_head	ranges;
 	struct list_head	mirrors;
 	struct mmu_notifier	mmu_notifier;
diff --git a/mm/hmm.c b/mm/hmm.c
index b224ea635a77..de35289df20d 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -64,7 +64,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
 	init_rwsem(&hmm->mirrors_sem);
 	hmm->mmu_notifier.ops = NULL;
 	INIT_LIST_HEAD(&hmm->ranges);
-	mutex_init(&hmm->lock);
+	spin_lock_init(&hmm->ranges_lock);
 	kref_init(&hmm->kref);
 	hmm->notifiers = 0;
 	hmm->mm = mm;
@@ -144,6 +144,25 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 	hmm_put(hmm);
 }
 
+static void notifiers_decrement(struct hmm *hmm)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&hmm->ranges_lock, flags);
+	hmm->notifiers--;
+	if (!hmm->notifiers) {
+		struct hmm_range *range;
+
+		list_for_each_entry(range, &hmm->ranges, list) {
+			if (range->valid)
+				continue;
+			range->valid = true;
+		}
+		wake_up_all(&hmm->wq);
+	}
+	spin_unlock_irqrestore(&hmm->ranges_lock, flags);
+}
+
 static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 			const struct mmu_notifier_range *nrange)
 {
@@ -151,6 +170,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 	struct hmm_mirror *mirror;
 	struct hmm_update update;
 	struct hmm_range *range;
+	unsigned long flags;
 	int ret = 0;
 
 	if (!kref_get_unless_zero(&hmm->kref))
@@ -161,12 +181,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 	update.event = HMM_UPDATE_INVALIDATE;
 	update.blockable = mmu_notifier_range_blockable(nrange);
 
-	if (mmu_notifier_range_blockable(nrange))
-		mutex_lock(&hmm->lock);
-	else if (!mutex_trylock(&hmm->lock)) {
-		ret = -EAGAIN;
-		goto out;
-	}
+	spin_lock_irqsave(&hmm->ranges_lock, flags);
 	hmm->notifiers++;
 	list_for_each_entry(range, &hmm->ranges, list) {
 		if (update.end < range->start || update.start >= range->end)
@@ -174,7 +189,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 
 		range->valid = false;
 	}
-	mutex_unlock(&hmm->lock);
+	spin_unlock_irqrestore(&hmm->ranges_lock, flags);
 
 	if (mmu_notifier_range_blockable(nrange))
 		down_read(&hmm->mirrors_sem);
@@ -182,16 +197,23 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 		ret = -EAGAIN;
 		goto out;
 	}
+
 	list_for_each_entry(mirror, &hmm->mirrors, list) {
-		int ret;
+		int rc;
 
-		ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
-		if (!update.blockable && ret == -EAGAIN)
+		rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
+		if (rc) {
+			if (WARN_ON(update.blockable || rc != -EAGAIN))
+				continue;
+			ret = -EAGAIN;
 			break;
+		}
 	}
 	up_read(&hmm->mirrors_sem);
 
 out:
+	if (ret)
+		notifiers_decrement(hmm);
 	hmm_put(hmm);
 	return ret;
 }
@@ -204,20 +226,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
 	if (!kref_get_unless_zero(&hmm->kref))
 		return;
 
-	mutex_lock(&hmm->lock);
-	hmm->notifiers--;
-	if (!hmm->notifiers) {
-		struct hmm_range *range;
-
-		list_for_each_entry(range, &hmm->ranges, list) {
-			if (range->valid)
-				continue;
-			range->valid = true;
-		}
-		wake_up_all(&hmm->wq);
-	}
-	mutex_unlock(&hmm->lock);
-
+	notifiers_decrement(hmm);
 	hmm_put(hmm);
 }
 
@@ -868,6 +877,7 @@ int hmm_range_register(struct hmm_range *range,
 {
 	unsigned long mask = ((1UL << page_shift) - 1UL);
 	struct hmm *hmm = mirror->hmm;
+	unsigned long flags;
 
 	range->valid = false;
 	range->hmm = NULL;
@@ -886,7 +896,7 @@ int hmm_range_register(struct hmm_range *range,
 		return -EFAULT;
 
 	/* Initialize range to track CPU page table updates. */
-	mutex_lock(&hmm->lock);
+	spin_lock_irqsave(&hmm->ranges_lock, flags);
 
 	range->hmm = hmm;
 	kref_get(&hmm->kref);
@@ -898,7 +908,7 @@ int hmm_range_register(struct hmm_range *range,
 	 */
 	if (!hmm->notifiers)
 		range->valid = true;
-	mutex_unlock(&hmm->lock);
+	spin_unlock_irqrestore(&hmm->ranges_lock, flags);
 
 	return 0;
 }
@@ -914,10 +924,11 @@ EXPORT_SYMBOL(hmm_range_register);
 void hmm_range_unregister(struct hmm_range *range)
 {
 	struct hmm *hmm = range->hmm;
+	unsigned long flags;
 
-	mutex_lock(&hmm->lock);
+	spin_lock_irqsave(&hmm->ranges_lock, flags);
 	list_del_init(&range->list);
-	mutex_unlock(&hmm->lock);
+	spin_unlock_irqrestore(&hmm->ranges_lock, flags);
 
 	/* Drop reference taken by hmm_range_register() */
 	mmput(hmm->mm);
-- 
cgit v1.2.3


From 4844ef80305d0180051d0787cd91c63573255dc2 Mon Sep 17 00:00:00 2001
From: Vignesh Raghavendra <vigneshr@ti.com>
Date: Tue, 25 Jun 2019 13:27:42 +0530
Subject: mtd: cfi_cmdset_0002: Add support for polling status register

HyperFlash devices are compliant with CFI AMD/Fujitsu Extended Command
Set (0x0002) for flash operations, therefore
drivers/mtd/chips/cfi_cmdset_0002.c can be used as is. But these devices
do not support DQ polling method of determining chip ready/good status.
These flashes provide Status Register whose bits can be polled to know
status of flash operation.

Cypress HyperFlash datasheet here[1], talks about CFI Amd/Fujitsu
Extended Query version 1.5. Bit 0 of "Software Features supported" field
of CFI Primary Vendor-Specific Extended Query table indicates
presence/absence of status register and Bit 1 indicates whether or not
DQ polling is supported. Using these bits, its possible to determine
whether flash supports DQ polling or need to use Status Register.

Add support for polling Status Register to know device ready/status of
erase/write operations when DQ polling is not supported.
Print error messages on erase/program failure by looking at related
Status Register bits.

[1] https://www.cypress.com/file/213346/download

Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
Reviewed-by: Tokunori Ikegami <ikegami.t@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/chips/cfi_cmdset_0002.c | 130 +++++++++++++++++++++++++++++++-----
 include/linux/mtd/cfi.h             |   7 ++
 2 files changed, 120 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index a1a7d334aa82..f4da7bd552e9 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -49,6 +49,16 @@
 #define SST49LF008A		0x005a
 #define AT49BV6416		0x00d6
 
+/*
+ * Status Register bit description. Used by flash devices that don't
+ * support DQ polling (e.g. HyperFlash)
+ */
+#define CFI_SR_DRB		BIT(7)
+#define CFI_SR_ESB		BIT(5)
+#define CFI_SR_PSB		BIT(4)
+#define CFI_SR_WBASB		BIT(3)
+#define CFI_SR_SLSB		BIT(1)
+
 static int cfi_amdstd_read (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
 static int cfi_amdstd_write_words(struct mtd_info *, loff_t, size_t, size_t *, const u_char *);
 static int cfi_amdstd_write_buffers(struct mtd_info *, loff_t, size_t, size_t *, const u_char *);
@@ -97,6 +107,50 @@ static struct mtd_chip_driver cfi_amdstd_chipdrv = {
 	.module		= THIS_MODULE
 };
 
+/*
+ * Use status register to poll for Erase/write completion when DQ is not
+ * supported. This is indicated by Bit[1:0] of SoftwareFeatures field in
+ * CFI Primary Vendor-Specific Extended Query table 1.5
+ */
+static int cfi_use_status_reg(struct cfi_private *cfi)
+{
+	struct cfi_pri_amdstd *extp = cfi->cmdset_priv;
+	u8 poll_mask = CFI_POLL_STATUS_REG | CFI_POLL_DQ;
+
+	return extp->MinorVersion >= '5' &&
+		(extp->SoftwareFeatures & poll_mask) == CFI_POLL_STATUS_REG;
+}
+
+static void cfi_check_err_status(struct map_info *map, struct flchip *chip,
+				 unsigned long adr)
+{
+	struct cfi_private *cfi = map->fldrv_priv;
+	map_word status;
+
+	if (!cfi_use_status_reg(cfi))
+		return;
+
+	cfi_send_gen_cmd(0x70, cfi->addr_unlock1, chip->start, map, cfi,
+			 cfi->device_type, NULL);
+	status = map_read(map, adr);
+
+	if (map_word_bitsset(map, status, CMD(0x3a))) {
+		unsigned long chipstatus = MERGESTATUS(status);
+
+		if (chipstatus & CFI_SR_ESB)
+			pr_err("%s erase operation failed, status %lx\n",
+			       map->name, chipstatus);
+		if (chipstatus & CFI_SR_PSB)
+			pr_err("%s program operation failed, status %lx\n",
+			       map->name, chipstatus);
+		if (chipstatus & CFI_SR_WBASB)
+			pr_err("%s buffer program command aborted, status %lx\n",
+			       map->name, chipstatus);
+		if (chipstatus & CFI_SR_SLSB)
+			pr_err("%s sector write protected, status %lx\n",
+			       map->name, chipstatus);
+	}
+}
 
 /* #define DEBUG_CFI_FEATURES */
 
@@ -742,10 +796,25 @@ static struct mtd_info *cfi_amdstd_setup(struct mtd_info *mtd)
  * correctly and is therefore not done	(particularly with interleaved chips
  * as each chip must be checked independently of the others).
  */
-static int __xipram chip_ready(struct map_info *map, unsigned long addr)
+static int __xipram chip_ready(struct map_info *map, struct flchip *chip,
+			       unsigned long addr)
 {
+	struct cfi_private *cfi = map->fldrv_priv;
 	map_word d, t;
 
+	if (cfi_use_status_reg(cfi)) {
+		map_word ready = CMD(CFI_SR_DRB);
+		/*
+		 * For chips that support status register, check device
+		 * ready bit
+		 */
+		cfi_send_gen_cmd(0x70, cfi->addr_unlock1, chip->start, map, cfi,
+				 cfi->device_type, NULL);
+		d = map_read(map, addr);
+
+		return map_word_andequal(map, d, ready, ready);
+	}
+
 	d = map_read(map, addr);
 	t = map_read(map, addr);
 
@@ -767,10 +836,30 @@ static int __xipram chip_ready(struct map_info *map, unsigned long addr)
  * as each chip must be checked independently of the others).
  *
  */
-static int __xipram chip_good(struct map_info *map, unsigned long addr, map_word expected)
+static int __xipram chip_good(struct map_info *map, struct flchip *chip,
+			      unsigned long addr, map_word expected)
 {
+	struct cfi_private *cfi = map->fldrv_priv;
 	map_word oldd, curd;
 
+	if (cfi_use_status_reg(cfi)) {
+		map_word ready = CMD(CFI_SR_DRB);
+		map_word err = CMD(CFI_SR_PSB | CFI_SR_ESB);
+		/*
+		 * For chips that support status register, check device
+		 * ready bit and Erase/Program status bit to know if
+		 * operation succeeded.
+		 */
+		cfi_send_gen_cmd(0x70, cfi->addr_unlock1, chip->start, map, cfi,
+				 cfi->device_type, NULL);
+		curd = map_read(map, addr);
+
+		if (map_word_andequal(map, curd, ready, ready))
+			return !map_word_bitsset(map, curd, err);
+
+		return 0;
+	}
+
 	oldd = map_read(map, addr);
 	curd = map_read(map, addr);
 
@@ -792,7 +881,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 
 	case FL_STATUS:
 		for (;;) {
-			if (chip_ready(map, adr))
+			if (chip_ready(map, chip, adr))
 				break;
 
 			if (time_after(jiffies, timeo)) {
@@ -830,7 +919,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 		chip->state = FL_ERASE_SUSPENDING;
 		chip->erase_suspended = 1;
 		for (;;) {
-			if (chip_ready(map, adr))
+			if (chip_ready(map, chip, adr))
 				break;
 
 			if (time_after(jiffies, timeo)) {
@@ -1362,7 +1451,7 @@ static int do_otp_lock(struct map_info *map, struct flchip *chip, loff_t adr,
 	/* wait for chip to become ready */
 	timeo = jiffies + msecs_to_jiffies(2);
 	for (;;) {
-		if (chip_ready(map, adr))
+		if (chip_ready(map, chip, adr))
 			break;
 
 		if (time_after(jiffies, timeo)) {
@@ -1628,22 +1717,24 @@ static int __xipram do_write_oneword(struct map_info *map, struct flchip *chip,
 			continue;
 		}
 
-		if (time_after(jiffies, timeo) && !chip_ready(map, adr)){
+		if (time_after(jiffies, timeo) &&
+		    !chip_ready(map, chip, adr)) {
 			xip_enable(map, chip, adr);
 			printk(KERN_WARNING "MTD %s(): software timeout\n", __func__);
 			xip_disable(map, chip, adr);
 			break;
 		}
 
-		if (chip_ready(map, adr))
+		if (chip_ready(map, chip, adr))
 			break;
 
 		/* Latency issues. Drop the lock, wait a while and retry */
 		UDELAY(map, chip, adr, 1);
 	}
 	/* Did we succeed? */
-	if (!chip_good(map, adr, datum)) {
+	if (!chip_good(map, chip, adr, datum)) {
 		/* reset on all failures. */
+		cfi_check_err_status(map, chip, adr);
 		map_write(map, CMD(0xF0), chip->start);
 		/* FIXME - should have reset delay before continuing */
 
@@ -1881,10 +1972,11 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip,
 		 * We check "time_after" and "!chip_good" before checking "chip_good" to avoid
 		 * the failure due to scheduling.
 		 */
-		if (time_after(jiffies, timeo) && !chip_good(map, adr, datum))
+		if (time_after(jiffies, timeo) &&
+		    !chip_good(map, chip, adr, datum))
 			break;
 
-		if (chip_good(map, adr, datum)) {
+		if (chip_good(map, chip, adr, datum)) {
 			xip_enable(map, chip, adr);
 			goto op_done;
 		}
@@ -1901,6 +1993,7 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip,
 	 * See e.g.
 	 * http://www.spansion.com/Support/Application%20Notes/MirrorBit_Write_Buffer_Prog_Page_Buffer_Read_AN.pdf
 	 */
+	cfi_check_err_status(map, chip, adr);
 	cfi_send_gen_cmd(0xAA, cfi->addr_unlock1, chip->start, map, cfi,
 			 cfi->device_type, NULL);
 	cfi_send_gen_cmd(0x55, cfi->addr_unlock2, chip->start, map, cfi,
@@ -2018,7 +2111,7 @@ static int cfi_amdstd_panic_wait(struct map_info *map, struct flchip *chip,
 	 * If the driver thinks the chip is idle, and no toggle bits
 	 * are changing, then the chip is actually idle for sure.
 	 */
-	if (chip->state == FL_READY && chip_ready(map, adr))
+	if (chip->state == FL_READY && chip_ready(map, chip, adr))
 		return 0;
 
 	/*
@@ -2035,7 +2128,7 @@ static int cfi_amdstd_panic_wait(struct map_info *map, struct flchip *chip,
 
 		/* wait for the chip to become ready */
 		for (i = 0; i < jiffies_to_usecs(timeo); i++) {
-			if (chip_ready(map, adr))
+			if (chip_ready(map, chip, adr))
 				return 0;
 
 			udelay(1);
@@ -2099,14 +2192,15 @@ retry:
 	map_write(map, datum, adr);
 
 	for (i = 0; i < jiffies_to_usecs(uWriteTimeout); i++) {
-		if (chip_ready(map, adr))
+		if (chip_ready(map, chip, adr))
 			break;
 
 		udelay(1);
 	}
 
-	if (!chip_good(map, adr, datum)) {
+	if (!chip_good(map, chip, adr, datum)) {
 		/* reset on all failures. */
+		cfi_check_err_status(map, chip, adr);
 		map_write(map, CMD(0xF0), chip->start);
 		/* FIXME - should have reset delay before continuing */
 
@@ -2300,7 +2394,7 @@ static int __xipram do_erase_chip(struct map_info *map, struct flchip *chip)
 			chip->erase_suspended = 0;
 		}
 
-		if (chip_good(map, adr, map_word_ff(map)))
+		if (chip_good(map, chip, adr, map_word_ff(map)))
 			break;
 
 		if (time_after(jiffies, timeo)) {
@@ -2316,6 +2410,7 @@ static int __xipram do_erase_chip(struct map_info *map, struct flchip *chip)
 	/* Did we succeed? */
 	if (ret) {
 		/* reset on all failures. */
+		cfi_check_err_status(map, chip, adr);
 		map_write(map, CMD(0xF0), chip->start);
 		/* FIXME - should have reset delay before continuing */
 
@@ -2396,7 +2491,7 @@ static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip,
 			chip->erase_suspended = 0;
 		}
 
-		if (chip_good(map, adr, map_word_ff(map)))
+		if (chip_good(map, chip, adr, map_word_ff(map)))
 			break;
 
 		if (time_after(jiffies, timeo)) {
@@ -2412,6 +2507,7 @@ static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip,
 	/* Did we succeed? */
 	if (ret) {
 		/* reset on all failures. */
+		cfi_check_err_status(map, chip, adr);
 		map_write(map, CMD(0xF0), chip->start);
 		/* FIXME - should have reset delay before continuing */
 
@@ -2587,7 +2683,7 @@ static int __maybe_unused do_ppb_xxlock(struct map_info *map,
 	 */
 	timeo = jiffies + msecs_to_jiffies(2000);	/* 2s max (un)locking */
 	for (;;) {
-		if (chip_ready(map, adr))
+		if (chip_ready(map, chip, adr))
 			break;
 
 		if (time_after(jiffies, timeo)) {
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index cbf77168658c..7fdbc1ff6527 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -233,6 +233,13 @@ struct cfi_pri_amdstd {
 	uint8_t  VppMin;
 	uint8_t  VppMax;
 	uint8_t  TopBottom;
+	/* Below field are added from version 1.5 */
+	uint8_t  ProgramSuspend;
+	uint8_t  UnlockBypass;
+	uint8_t  SecureSiliconSector;
+	uint8_t  SoftwareFeatures;
+#define CFI_POLL_STATUS_REG	BIT(0)
+#define CFI_POLL_DQ		BIT(1)
 } __packed;
 
 /* Vendor-Specific PRI for Atmel chips (command set 0x0002) */
-- 
cgit v1.2.3


From dcc7d3446a0fa19bd7e8074920b8f9ef3b7ec00c Mon Sep 17 00:00:00 2001
From: Vignesh Raghavendra <vigneshr@ti.com>
Date: Tue, 25 Jun 2019 13:27:44 +0530
Subject: mtd: Add support for HyperBus memory devices

Cypress' HyperBus is Low Signal Count, High Performance Double Data Rate
Bus interface between a host system master and one or more slave
interfaces. HyperBus is used to connect microprocessor, microcontroller,
or ASIC devices with random access NOR flash memory (called HyperFlash)
or self refresh DRAM (called HyperRAM).

Its a 8-bit data bus (DQ[7:0]) with  Read-Write Data Strobe (RWDS)
signal and either Single-ended clock(3.0V parts) or Differential clock
(1.8V parts). It uses ChipSelect lines to select b/w multiple slaves.
At bus level, it follows a separate protocol described in HyperBus
specification[1].

HyperFlash follows CFI AMD/Fujitsu Extended Command Set (0x0002) similar
to that of existing parallel NORs. Since HyperBus is x8 DDR bus,
its equivalent to x16 parallel NOR flash with respect to bits per clock
cycle. But HyperBus operates at >166MHz frequencies.
HyperRAM provides direct random read/write access to flash memory
array.

But, HyperBus memory controllers seem to abstract implementation details
and expose a simple MMIO interface to access connected flash.

Add support for registering HyperFlash devices with MTD framework. MTD
maps framework along with CFI chip support framework are used to support
communicating with flash.

Framework is modelled along the lines of spi-nor framework. HyperBus
memory controller (HBMC) drivers calls hyperbus_register_device() to
register a single HyperFlash device. HyperFlash core parses MMIO access
information from DT, sets up the map_info struct, probes CFI flash and
registers it with MTD framework.

Some HBMC masters need calibration/training sequence[3] to be carried
out, in order for DLL inside the controller to lock, by reading a known
string/pattern. This is done by repeatedly reading CFI Query
Identification String. Calibration needs to be done before trying to detect
flash as part of CFI flash probe.

HyperRAM is not supported at the moment.

HyperBus specification can be found at[1]
HyperFlash datasheet can be found at[2]

[1] https://www.cypress.com/file/213356/download
[2] https://www.cypress.com/file/213346/download
[3] http://www.ti.com/lit/ug/spruid7b/spruid7b.pdf
    Table 12-5741. HyperFlash Access Sequence

Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 MAINTAINERS                          |   7 ++
 drivers/mtd/Kconfig                  |   2 +
 drivers/mtd/Makefile                 |   1 +
 drivers/mtd/hyperbus/Kconfig         |  11 +++
 drivers/mtd/hyperbus/Makefile        |   3 +
 drivers/mtd/hyperbus/hyperbus-core.c | 153 +++++++++++++++++++++++++++++++++++
 include/linux/mtd/hyperbus.h         |  84 +++++++++++++++++++
 7 files changed, 261 insertions(+)
 create mode 100644 drivers/mtd/hyperbus/Kconfig
 create mode 100644 drivers/mtd/hyperbus/Makefile
 create mode 100644 drivers/mtd/hyperbus/hyperbus-core.c
 create mode 100644 include/linux/mtd/hyperbus.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 5cfbea4ce575..f1253adb8cf6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7305,6 +7305,13 @@ F:	include/uapi/linux/hyperv.h
 F:	tools/hv/
 F:	Documentation/ABI/stable/sysfs-bus-vmbus
 
+HYPERBUS SUPPORT
+M:	Vignesh Raghavendra <vigneshr@ti.com>
+S:	Supported
+F:	drivers/mtd/hyperbus/
+F:	include/linux/mtd/hyperbus.h
+F:	Documentation/devicetree/bindings/mtd/cypress,hyperflash.txt
+
 HYPERVISOR VIRTUAL CONSOLE DRIVER
 L:	linuxppc-dev@lists.ozlabs.org
 S:	Odd Fixes
diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig
index fb31a7f649a3..80a6e2dcd085 100644
--- a/drivers/mtd/Kconfig
+++ b/drivers/mtd/Kconfig
@@ -274,4 +274,6 @@ source "drivers/mtd/spi-nor/Kconfig"
 
 source "drivers/mtd/ubi/Kconfig"
 
+source "drivers/mtd/hyperbus/Kconfig"
+
 endif # MTD
diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile
index 806287e80e84..62d649a959e2 100644
--- a/drivers/mtd/Makefile
+++ b/drivers/mtd/Makefile
@@ -34,3 +34,4 @@ obj-y		+= chips/ lpddr/ maps/ devices/ nand/ tests/
 
 obj-$(CONFIG_MTD_SPI_NOR)	+= spi-nor/
 obj-$(CONFIG_MTD_UBI)		+= ubi/
+obj-$(CONFIG_MTD_HYPERBUS)	+= hyperbus/
diff --git a/drivers/mtd/hyperbus/Kconfig b/drivers/mtd/hyperbus/Kconfig
new file mode 100644
index 000000000000..98147e28caa0
--- /dev/null
+++ b/drivers/mtd/hyperbus/Kconfig
@@ -0,0 +1,11 @@
+menuconfig MTD_HYPERBUS
+	tristate "HyperBus support"
+	select MTD_CFI
+	select MTD_MAP_BANK_WIDTH_2
+	select MTD_CFI_AMDSTD
+	select MTD_COMPLEX_MAPPINGS
+	help
+	  This is the framework for the HyperBus which can be used by
+	  the HyperBus Controller driver to communicate with
+	  HyperFlash. See Cypress HyperBus specification for more
+	  details
diff --git a/drivers/mtd/hyperbus/Makefile b/drivers/mtd/hyperbus/Makefile
new file mode 100644
index 000000000000..ca61dedd730d
--- /dev/null
+++ b/drivers/mtd/hyperbus/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_MTD_HYPERBUS)	+= hyperbus-core.o
diff --git a/drivers/mtd/hyperbus/hyperbus-core.c b/drivers/mtd/hyperbus/hyperbus-core.c
new file mode 100644
index 000000000000..6af9ea34117d
--- /dev/null
+++ b/drivers/mtd/hyperbus/hyperbus-core.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com/
+// Author: Vignesh Raghavendra <vigneshr@ti.com>
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mtd/hyperbus.h>
+#include <linux/mtd/map.h>
+#include <linux/mtd/mtd.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/types.h>
+
+static struct hyperbus_device *map_to_hbdev(struct map_info *map)
+{
+	return container_of(map, struct hyperbus_device, map);
+}
+
+static map_word hyperbus_read16(struct map_info *map, unsigned long addr)
+{
+	struct hyperbus_device *hbdev = map_to_hbdev(map);
+	struct hyperbus_ctlr *ctlr = hbdev->ctlr;
+	map_word read_data;
+
+	read_data.x[0] = ctlr->ops->read16(hbdev, addr);
+
+	return read_data;
+}
+
+static void hyperbus_write16(struct map_info *map, map_word d,
+			     unsigned long addr)
+{
+	struct hyperbus_device *hbdev = map_to_hbdev(map);
+	struct hyperbus_ctlr *ctlr = hbdev->ctlr;
+
+	ctlr->ops->write16(hbdev, addr, d.x[0]);
+}
+
+static void hyperbus_copy_from(struct map_info *map, void *to,
+			       unsigned long from, ssize_t len)
+{
+	struct hyperbus_device *hbdev = map_to_hbdev(map);
+	struct hyperbus_ctlr *ctlr = hbdev->ctlr;
+
+	ctlr->ops->copy_from(hbdev, to, from, len);
+}
+
+static void hyperbus_copy_to(struct map_info *map, unsigned long to,
+			     const void *from, ssize_t len)
+{
+	struct hyperbus_device *hbdev = map_to_hbdev(map);
+	struct hyperbus_ctlr *ctlr = hbdev->ctlr;
+
+	ctlr->ops->copy_to(hbdev, to, from, len);
+}
+
+int hyperbus_register_device(struct hyperbus_device *hbdev)
+{
+	const struct hyperbus_ops *ops;
+	struct hyperbus_ctlr *ctlr;
+	struct device_node *np;
+	struct map_info *map;
+	struct resource res;
+	struct device *dev;
+	int ret;
+
+	if (!hbdev || !hbdev->np || !hbdev->ctlr || !hbdev->ctlr->dev) {
+		pr_err("hyperbus: please fill all the necessary fields!\n");
+		return -EINVAL;
+	}
+
+	np = hbdev->np;
+	ctlr = hbdev->ctlr;
+	if (!of_device_is_compatible(np, "cypress,hyperflash"))
+		return -ENODEV;
+
+	hbdev->memtype = HYPERFLASH;
+
+	ret = of_address_to_resource(np, 0, &res);
+	if (ret)
+		return ret;
+
+	dev = ctlr->dev;
+	map = &hbdev->map;
+	map->size = resource_size(&res);
+	map->virt = devm_ioremap_resource(dev, &res);
+	if (IS_ERR(map->virt))
+		return PTR_ERR(map->virt);
+
+	map->name = dev_name(dev);
+	map->bankwidth = 2;
+	map->device_node = np;
+
+	simple_map_init(map);
+	ops = ctlr->ops;
+	if (ops) {
+		if (ops->read16)
+			map->read = hyperbus_read16;
+		if (ops->write16)
+			map->write = hyperbus_write16;
+		if (ops->copy_to)
+			map->copy_to = hyperbus_copy_to;
+		if (ops->copy_from)
+			map->copy_from = hyperbus_copy_from;
+
+		if (ops->calibrate && !ctlr->calibrated) {
+			ret = ops->calibrate(hbdev);
+			if (!ret) {
+				dev_err(dev, "Calibration failed\n");
+				return -ENODEV;
+			}
+			ctlr->calibrated = true;
+		}
+	}
+
+	hbdev->mtd = do_map_probe("cfi_probe", map);
+	if (!hbdev->mtd) {
+		dev_err(dev, "probing of hyperbus device failed\n");
+		return -ENODEV;
+	}
+
+	hbdev->mtd->dev.parent = dev;
+	mtd_set_of_node(hbdev->mtd, np);
+
+	ret = mtd_device_register(hbdev->mtd, NULL, 0);
+	if (ret) {
+		dev_err(dev, "failed to register mtd device\n");
+		map_destroy(hbdev->mtd);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(hyperbus_register_device);
+
+int hyperbus_unregister_device(struct hyperbus_device *hbdev)
+{
+	int ret = 0;
+
+	if (hbdev && hbdev->mtd) {
+		ret = mtd_device_unregister(hbdev->mtd);
+		map_destroy(hbdev->mtd);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(hyperbus_unregister_device);
+
+MODULE_DESCRIPTION("HyperBus Framework");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Vignesh Raghavendra <vigneshr@ti.com>");
diff --git a/include/linux/mtd/hyperbus.h b/include/linux/mtd/hyperbus.h
new file mode 100644
index 000000000000..2dfe65964f6e
--- /dev/null
+++ b/include/linux/mtd/hyperbus.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com/
+ */
+
+#ifndef __LINUX_MTD_HYPERBUS_H__
+#define __LINUX_MTD_HYPERBUS_H__
+
+#include <linux/mtd/map.h>
+
+enum hyperbus_memtype {
+	HYPERFLASH,
+	HYPERRAM,
+};
+
+/**
+ * struct hyperbus_device - struct representing HyperBus slave device
+ * @map: map_info struct for accessing MMIO HyperBus flash memory
+ * @np: pointer to HyperBus slave device node
+ * @mtd: pointer to MTD struct
+ * @ctlr: pointer to HyperBus controller struct
+ * @memtype: type of memory device: HyperFlash or HyperRAM
+ */
+
+struct hyperbus_device {
+	struct map_info map;
+	struct device_node *np;
+	struct mtd_info *mtd;
+	struct hyperbus_ctlr *ctlr;
+	enum hyperbus_memtype memtype;
+};
+
+/**
+ * struct hyperbus_ops - struct representing custom HyperBus operations
+ * @read16: read 16 bit of data from flash in a single burst. Used to read
+ *          from non default address space, such as ID/CFI space
+ * @write16: write 16 bit of data to flash in a single burst. Used to
+ *           send cmd to flash or write single 16 bit word at a time.
+ * @copy_from: copy data from flash memory
+ * @copy_to: copy data to flash memory
+ * @calibrate: calibrate HyperBus controller
+ */
+
+struct hyperbus_ops {
+	u16 (*read16)(struct hyperbus_device *hbdev, unsigned long addr);
+	void (*write16)(struct hyperbus_device *hbdev,
+			unsigned long addr, u16 val);
+	void (*copy_from)(struct hyperbus_device *hbdev, void *to,
+			  unsigned long from, ssize_t len);
+	void (*copy_to)(struct hyperbus_device *dev, unsigned long to,
+			const void *from, ssize_t len);
+	int (*calibrate)(struct hyperbus_device *dev);
+};
+
+/**
+ * struct hyperbus_ctlr - struct representing HyperBus controller
+ * @dev: pointer to HyperBus controller device
+ * @calibrated: flag to indicate ctlr calibration sequence is complete
+ * @ops: HyperBus controller ops
+ */
+struct hyperbus_ctlr {
+	struct device *dev;
+	bool calibrated;
+
+	const struct hyperbus_ops *ops;
+};
+
+/**
+ * hyperbus_register_device - probe and register a HyperBus slave memory device
+ * @hbdev: hyperbus_device struct with dev, np and ctlr field populated
+ *
+ * Return: 0 for success, others for failure.
+ */
+int hyperbus_register_device(struct hyperbus_device *hbdev);
+
+/**
+ * hyperbus_unregister_device - deregister HyperBus slave memory device
+ * @hbdev: hyperbus_device to be unregistered
+ *
+ * Return: 0 for success, others for failure.
+ */
+int hyperbus_unregister_device(struct hyperbus_device *hbdev);
+
+#endif /* __LINUX_MTD_HYPERBUS_H__ */
-- 
cgit v1.2.3


From 855eff216a97afa4a2233b792cb3c812b5ebd876 Mon Sep 17 00:00:00 2001
From: Jonathan Bakker <xc-racer2@live.ca>
Date: Fri, 26 Apr 2019 17:06:34 +0200
Subject: mtd: onenand: Add support for 8Gb datasize onenand
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Used in several S5PV210-based Galaxy S devices, among them SGH-T959V,
SGH-T959P, SGH-T839, and SPH-D700.

Signed-off-by: Jonathan Bakker <xc-racer2@live.ca>
Signed-off-by: Paweł Chmiel <pawel.mikolaj.chmiel@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/onenand/onenand_base.c | 2 ++
 include/linux/mtd/onenand_regs.h        | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/drivers/mtd/nand/onenand/onenand_base.c b/drivers/mtd/nand/onenand/onenand_base.c
index f41d76248550..492c0059673d 100644
--- a/drivers/mtd/nand/onenand/onenand_base.c
+++ b/drivers/mtd/nand/onenand/onenand_base.c
@@ -3260,6 +3260,8 @@ static void onenand_check_features(struct mtd_info *mtd)
 
 	/* Lock scheme */
 	switch (density) {
+	case ONENAND_DEVICE_DENSITY_8Gb:
+		this->options |= ONENAND_HAS_NOP_1;
 	case ONENAND_DEVICE_DENSITY_4Gb:
 		if (ONENAND_IS_DDP(this))
 			this->options |= ONENAND_HAS_2PLANE;
diff --git a/include/linux/mtd/onenand_regs.h b/include/linux/mtd/onenand_regs.h
index d60130f88eed..9640d707cbf8 100644
--- a/include/linux/mtd/onenand_regs.h
+++ b/include/linux/mtd/onenand_regs.h
@@ -80,6 +80,7 @@
 #define ONENAND_DEVICE_DENSITY_1Gb	(0x003)
 #define ONENAND_DEVICE_DENSITY_2Gb	(0x004)
 #define ONENAND_DEVICE_DENSITY_4Gb	(0x005)
+#define ONENAND_DEVICE_DENSITY_8Gb	(0x006)
 
 /*
  * Version ID Register F002h (R)
-- 
cgit v1.2.3


From 14a82ea7e1682645d942d9fb41fcb6126fd1645e Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Tue, 21 May 2019 09:06:30 +0200
Subject: mtd: rawnand: export NAND operation tracer

The NAND core has a NAND operation tracing function, but it can only
be used by drivers using the generic option parser from the NAND core.
Export the tracing function as a static inline function in rawnand.h
so that drivers implementing exec_op directly do not have to write their
own operation tracing.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c | 30 +-----------------------------
 include/linux/mtd/rawnand.h      | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index d565b4a9dce1..6ecd1c496ce3 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -2115,35 +2115,7 @@ static void nand_op_parser_trace(const struct nand_op_parser_ctx *ctx)
 		if (instr == &ctx->subop.instrs[0])
 			prefix = "    ->";
 
-		switch (instr->type) {
-		case NAND_OP_CMD_INSTR:
-			pr_debug("%sCMD      [0x%02x]\n", prefix,
-				 instr->ctx.cmd.opcode);
-			break;
-		case NAND_OP_ADDR_INSTR:
-			pr_debug("%sADDR     [%d cyc: %*ph]\n", prefix,
-				 instr->ctx.addr.naddrs,
-				 instr->ctx.addr.naddrs < 64 ?
-				 instr->ctx.addr.naddrs : 64,
-				 instr->ctx.addr.addrs);
-			break;
-		case NAND_OP_DATA_IN_INSTR:
-			pr_debug("%sDATA_IN  [%d B%s]\n", prefix,
-				 instr->ctx.data.len,
-				 instr->ctx.data.force_8bit ?
-				 ", force 8-bit" : "");
-			break;
-		case NAND_OP_DATA_OUT_INSTR:
-			pr_debug("%sDATA_OUT [%d B%s]\n", prefix,
-				 instr->ctx.data.len,
-				 instr->ctx.data.force_8bit ?
-				 ", force 8-bit" : "");
-			break;
-		case NAND_OP_WAITRDY_INSTR:
-			pr_debug("%sWAITRDY  [max %d ms]\n", prefix,
-				 instr->ctx.waitrdy.timeout_ms);
-			break;
-		}
+		nand_op_trace(prefix, instr);
 
 		if (instr == &ctx->subop.instrs[ctx->subop.ninstrs - 1])
 			prefix = "      ";
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index dbfffa5bec7b..f5bb6f11c36b 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -877,6 +877,42 @@ int nand_op_parser_exec_op(struct nand_chip *chip,
 			   const struct nand_op_parser *parser,
 			   const struct nand_operation *op, bool check_only);
 
+static inline void nand_op_trace(const char *prefix,
+				 const struct nand_op_instr *instr)
+{
+#if IS_ENABLED(CONFIG_DYNAMIC_DEBUG) || defined(DEBUG)
+	switch (instr->type) {
+	case NAND_OP_CMD_INSTR:
+		pr_debug("%sCMD      [0x%02x]\n", prefix,
+			 instr->ctx.cmd.opcode);
+		break;
+	case NAND_OP_ADDR_INSTR:
+		pr_debug("%sADDR     [%d cyc: %*ph]\n", prefix,
+			 instr->ctx.addr.naddrs,
+			 instr->ctx.addr.naddrs < 64 ?
+			 instr->ctx.addr.naddrs : 64,
+			 instr->ctx.addr.addrs);
+		break;
+	case NAND_OP_DATA_IN_INSTR:
+		pr_debug("%sDATA_IN  [%d B%s]\n", prefix,
+			 instr->ctx.data.len,
+			 instr->ctx.data.force_8bit ?
+			 ", force 8-bit" : "");
+		break;
+	case NAND_OP_DATA_OUT_INSTR:
+		pr_debug("%sDATA_OUT [%d B%s]\n", prefix,
+			 instr->ctx.data.len,
+			 instr->ctx.data.force_8bit ?
+			 ", force 8-bit" : "");
+		break;
+	case NAND_OP_WAITRDY_INSTR:
+		pr_debug("%sWAITRDY  [max %d ms]\n", prefix,
+			 instr->ctx.waitrdy.timeout_ms);
+		break;
+	}
+#endif
+}
+
 /**
  * struct nand_controller_ops - Controller operations
  *
-- 
cgit v1.2.3


From e0ddaab76802d3179013f4864535043e2aea6c69 Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Tue, 21 May 2019 09:06:41 +0200
Subject: dmaengine: mxs: Add header file to be shared with gpmi nand driver

The mxs dma driver can do PIO transfers. A pointer to the PIO words
to transfer is passed in the struct scatterlist * argument of
dmaengine_prep_slave_sg(). It's quite ugly and non obvious to cast
u32 * to struct scatterlist * each time when calling
dmaengine_prep_slave_sg(), so add a static inline wrapper function
to be called by the user along with a description what is going on.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/dma/mxs-dma.c                      |  1 +
 drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c | 36 ++++++++++++------------------
 include/linux/dma/mxs-dma.h                | 21 +++++++++++++++++
 3 files changed, 36 insertions(+), 22 deletions(-)
 create mode 100644 include/linux/dma/mxs-dma.h

(limited to 'include')

diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c
index ce92a3626ea4..62ee9328aea1 100644
--- a/drivers/dma/mxs-dma.c
+++ b/drivers/dma/mxs-dma.c
@@ -24,6 +24,7 @@
 #include <linux/of_device.h>
 #include <linux/of_dma.h>
 #include <linux/list.h>
+#include <linux/dma/mxs-dma.h>
 
 #include <asm/irq.h>
 
diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
index 74c4de0b1a3d..45c7b91aae23 100644
--- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
+++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/pm_runtime.h>
+#include <linux/dma/mxs-dma.h>
 #include "gpmi-nand.h"
 #include "gpmi-regs.h"
 #include "bch-regs.h"
@@ -914,9 +915,8 @@ static int gpmi_send_command(struct gpmi_nand_data *this)
 		| BM_GPMI_CTRL0_ADDRESS_INCREMENT
 		| BF_GPMI_CTRL0_XFER_COUNT(this->command_length);
 	pio[1] = pio[2] = 0;
-	desc = dmaengine_prep_slave_sg(channel,
-					(struct scatterlist *)pio,
-					ARRAY_SIZE(pio), DMA_TRANS_NONE, 0);
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, 0);
 	if (!desc)
 		return -EINVAL;
 
@@ -988,8 +988,8 @@ static int gpmi_send_data(struct gpmi_nand_data *this, const void *buf, int len)
 		| BF_GPMI_CTRL0_ADDRESS(address)
 		| BF_GPMI_CTRL0_XFER_COUNT(len);
 	pio[1] = 0;
-	desc = dmaengine_prep_slave_sg(channel, (struct scatterlist *)pio,
-					ARRAY_SIZE(pio), DMA_TRANS_NONE, 0);
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, 0);
 	if (!desc)
 		return -EINVAL;
 
@@ -1025,9 +1025,8 @@ static int gpmi_read_data(struct gpmi_nand_data *this, void *buf, int len)
 		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_DATA)
 		| BF_GPMI_CTRL0_XFER_COUNT(len);
 	pio[1] = 0;
-	desc = dmaengine_prep_slave_sg(channel,
-					(struct scatterlist *)pio,
-					ARRAY_SIZE(pio), DMA_TRANS_NONE, 0);
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, 0);
 	if (!desc)
 		return -EINVAL;
 
@@ -1083,10 +1082,8 @@ static int gpmi_send_page(struct gpmi_nand_data *this, dma_addr_t payload,
 	pio[4] = payload;
 	pio[5] = auxiliary;
 
-	desc = dmaengine_prep_slave_sg(channel,
-					(struct scatterlist *)pio,
-					ARRAY_SIZE(pio), DMA_TRANS_NONE,
-					DMA_CTRL_ACK);
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, DMA_CTRL_ACK);
 	if (!desc)
 		return -EINVAL;
 
@@ -1117,9 +1114,7 @@ static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
 		| BF_GPMI_CTRL0_ADDRESS(address)
 		| BF_GPMI_CTRL0_XFER_COUNT(0);
 	pio[1] = 0;
-	desc = dmaengine_prep_slave_sg(channel,
-				(struct scatterlist *)pio, 2,
-				DMA_TRANS_NONE, 0);
+	desc = mxs_dmaengine_prep_pio(channel, pio, 2, DMA_TRANS_NONE, 0);
 	if (!desc)
 		return -EINVAL;
 
@@ -1144,10 +1139,8 @@ static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
 	pio[3] = geo->page_size;
 	pio[4] = payload;
 	pio[5] = auxiliary;
-	desc = dmaengine_prep_slave_sg(channel,
-					(struct scatterlist *)pio,
-					ARRAY_SIZE(pio), DMA_TRANS_NONE,
-					DMA_CTRL_ACK);
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, DMA_CTRL_ACK);
 	if (!desc)
 		return -EINVAL;
 
@@ -1163,9 +1156,8 @@ static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
 		| BF_GPMI_CTRL0_XFER_COUNT(geo->page_size);
 	pio[1] = 0;
 	pio[2] = 0; /* clear GPMI_HW_GPMI_ECCCTRL, disable the BCH. */
-	desc = dmaengine_prep_slave_sg(channel,
-				(struct scatterlist *)pio, 3,
-				DMA_TRANS_NONE, DMA_CTRL_ACK);
+	desc = mxs_dmaengine_prep_pio(channel, pio, 3, DMA_TRANS_NONE,
+				      DMA_CTRL_ACK);
 	if (!desc)
 		return -EINVAL;
 
diff --git a/include/linux/dma/mxs-dma.h b/include/linux/dma/mxs-dma.h
new file mode 100644
index 000000000000..092b2a7b92ac
--- /dev/null
+++ b/include/linux/dma/mxs-dma.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MXS_DMA_H_
+#define _MXS_DMA_H_
+
+#include <linux/dmaengine.h>
+
+/*
+ * The mxs dmaengine can do PIO transfers. We pass a pointer to the PIO words
+ * in the second argument to dmaengine_prep_slave_sg when the direction is
+ * set to DMA_TRANS_NONE. To make this clear and to prevent users from doing
+ * the error prone casting we have this wrapper function
+ */
+static inline struct dma_async_tx_descriptor *mxs_dmaengine_prep_pio(
+        struct dma_chan *chan, u32 *pio, unsigned int npio,
+        enum dma_transfer_direction dir, unsigned long flags)
+{
+	return dmaengine_prep_slave_sg(chan, (struct scatterlist *)pio, npio,
+				       dir, flags);
+}
+
+#endif /* _MXS_DMA_H_ */
-- 
cgit v1.2.3


From ceeeb99cd821a2f7493e1e0e1eca5afc7a205213 Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Tue, 21 May 2019 09:06:42 +0200
Subject: dmaengine: mxs: rename custom flag

The mxs dma driver uses the flags parameter in dmaengine_prep_slave_sg() for
custom flags, but still uses the dmaengine specific names of the flags.
Do a little bit better and at least give the flag a custom name.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/dma/mxs-dma.c                      |  4 ++--
 drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c | 17 +++++++++++------
 include/linux/dma/mxs-dma.h                |  2 ++
 3 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c
index 62ee9328aea1..c622bee7eb12 100644
--- a/drivers/dma/mxs-dma.c
+++ b/drivers/dma/mxs-dma.c
@@ -541,7 +541,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg(
 		ccw->bits = 0;
 		ccw->bits |= CCW_IRQ;
 		ccw->bits |= CCW_DEC_SEM;
-		if (flags & DMA_CTRL_ACK)
+		if (flags & MXS_DMA_CTRL_WAIT4END)
 			ccw->bits |= CCW_WAIT4END;
 		ccw->bits |= CCW_HALT_ON_TERM;
 		ccw->bits |= CCW_TERM_FLUSH;
@@ -573,7 +573,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg(
 				ccw->bits &= ~CCW_CHAIN;
 				ccw->bits |= CCW_IRQ;
 				ccw->bits |= CCW_DEC_SEM;
-				if (flags & DMA_CTRL_ACK)
+				if (flags & MXS_DMA_CTRL_WAIT4END)
 					ccw->bits |= CCW_WAIT4END;
 			}
 		}
diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
index 45c7b91aae23..d088b3e77fef 100644
--- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
+++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
@@ -926,7 +926,8 @@ static int gpmi_send_command(struct gpmi_nand_data *this)
 	sg_init_one(sgl, this->cmd_buffer, this->command_length);
 	dma_map_sg(this->dev, sgl, 1, DMA_TO_DEVICE);
 	desc = dmaengine_prep_slave_sg(channel,
-				sgl, 1, DMA_MEM_TO_DEV, DMA_CTRL_ACK);
+				sgl, 1, DMA_MEM_TO_DEV,
+				MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
@@ -996,7 +997,8 @@ static int gpmi_send_data(struct gpmi_nand_data *this, const void *buf, int len)
 	/* [2] send DMA request */
 	prepare_data_dma(this, buf, len, DMA_TO_DEVICE);
 	desc = dmaengine_prep_slave_sg(channel, &this->data_sgl,
-					1, DMA_MEM_TO_DEV, DMA_CTRL_ACK);
+					1, DMA_MEM_TO_DEV,
+					MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
@@ -1033,7 +1035,8 @@ static int gpmi_read_data(struct gpmi_nand_data *this, void *buf, int len)
 	/* [2] : send DMA request */
 	direct = prepare_data_dma(this, buf, len, DMA_FROM_DEVICE);
 	desc = dmaengine_prep_slave_sg(channel, &this->data_sgl,
-					1, DMA_DEV_TO_MEM, DMA_CTRL_ACK);
+					1, DMA_DEV_TO_MEM,
+					MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
@@ -1083,7 +1086,8 @@ static int gpmi_send_page(struct gpmi_nand_data *this, dma_addr_t payload,
 	pio[5] = auxiliary;
 
 	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE, DMA_CTRL_ACK);
+				      DMA_TRANS_NONE,
+				      MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
@@ -1140,7 +1144,8 @@ static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
 	pio[4] = payload;
 	pio[5] = auxiliary;
 	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE, DMA_CTRL_ACK);
+				      DMA_TRANS_NONE,
+				      MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
@@ -1157,7 +1162,7 @@ static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
 	pio[1] = 0;
 	pio[2] = 0; /* clear GPMI_HW_GPMI_ECCCTRL, disable the BCH. */
 	desc = mxs_dmaengine_prep_pio(channel, pio, 3, DMA_TRANS_NONE,
-				      DMA_CTRL_ACK);
+				      MXS_DMA_CTRL_WAIT4END);
 	if (!desc)
 		return -EINVAL;
 
diff --git a/include/linux/dma/mxs-dma.h b/include/linux/dma/mxs-dma.h
index 092b2a7b92ac..4a33f2c8a682 100644
--- a/include/linux/dma/mxs-dma.h
+++ b/include/linux/dma/mxs-dma.h
@@ -4,6 +4,8 @@
 
 #include <linux/dmaengine.h>
 
+#define MXS_DMA_CTRL_WAIT4END	BIT(31)
+
 /*
  * The mxs dmaengine can do PIO transfers. We pass a pointer to the PIO words
  * in the second argument to dmaengine_prep_slave_sg when the direction is
-- 
cgit v1.2.3


From ef347c0cfd619a9251e5a2f9ff72e33650a9bccb Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Tue, 21 May 2019 09:06:43 +0200
Subject: mtd: rawnand: gpmi: Implement exec_op

The gpmi driver performance suffers from NAND operations being split
in multiple small DMA transfers. This has been forced by the NAND layer
in the former days, but now with exec_op we can use the controller as
intended.

With this patch gpmi_nfc_exec_op becomes the main entry point to NAND
operations. Here all instructions are collected and chained as separate
DMA transfers. In the end whole chain is fired and waited to be
finished. gpmi_nfc_exec_op only does the hardware operations, bad block
marker swapping and buffer scrambling is done by the callers. It's worth
noting that the nand_*_op functions always take the buffer lengths for
the data that the NAND chip actually transfers. When doing BCH we have
to calculate the net data size from the raw data size in some places.

This patch has been tested with 2048/64 and 2048/128 byte NAND on
i.MX6q. mtd_oobtest, mtd_subpagetest and mtd_speedtest run without
errors. nandbiterrs, nandpagetest and nandsubpagetest userspace tests
from mtdutils run without errors and UBIFS can successfully be mounted.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/dma/mxs-dma.c                      |    3 +
 drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c | 1105 +++++++++++-----------------
 drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h |   25 +-
 include/linux/dma/mxs-dma.h                |    1 +
 4 files changed, 444 insertions(+), 690 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c
index c622bee7eb12..20a9cb7cb6d3 100644
--- a/drivers/dma/mxs-dma.c
+++ b/drivers/dma/mxs-dma.c
@@ -78,6 +78,7 @@
 #define BM_CCW_COMMAND		(3 << 0)
 #define CCW_CHAIN		(1 << 2)
 #define CCW_IRQ			(1 << 3)
+#define CCW_WAIT4RDY		(1 << 5)
 #define CCW_DEC_SEM		(1 << 6)
 #define CCW_WAIT4END		(1 << 7)
 #define CCW_HALT_ON_TERM	(1 << 8)
@@ -547,6 +548,8 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg(
 		ccw->bits |= CCW_TERM_FLUSH;
 		ccw->bits |= BF_CCW(sg_len, PIO_NUM);
 		ccw->bits |= BF_CCW(MXS_DMA_CMD_NO_XFER, COMMAND);
+		if (flags & MXS_DMA_CTRL_WAIT4RDY)
+			ccw->bits |= CCW_WAIT4RDY;
 	} else {
 		for_each_sg(sgl, sg, sg_len, i) {
 			if (sg_dma_len(sg) > MAX_XFER_BYTES) {
diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
index d088b3e77fef..5db84178edff 100644
--- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
+++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
@@ -528,26 +528,12 @@ static int common_nfc_set_geometry(struct gpmi_nand_data *this)
 static int bch_set_geometry(struct gpmi_nand_data *this)
 {
 	struct resources *r = &this->resources;
-	struct bch_geometry *bch_geo = &this->bch_geometry;
-	unsigned int block_count;
-	unsigned int block_size;
-	unsigned int metadata_size;
-	unsigned int ecc_strength;
-	unsigned int page_size;
-	unsigned int gf_len;
 	int ret;
 
 	ret = common_nfc_set_geometry(this);
 	if (ret)
 		return ret;
 
-	block_count   = bch_geo->ecc_chunk_count - 1;
-	block_size    = bch_geo->ecc_chunk_size;
-	metadata_size = bch_geo->metadata_size;
-	ecc_strength  = bch_geo->ecc_strength >> 1;
-	page_size     = bch_geo->page_size;
-	gf_len        = bch_geo->gf_len;
-
 	ret = pm_runtime_get_sync(this->dev);
 	if (ret < 0)
 		return ret;
@@ -561,27 +547,9 @@ static int bch_set_geometry(struct gpmi_nand_data *this)
 	if (ret)
 		goto err_out;
 
-	/* Configure layout 0. */
-	writel(BF_BCH_FLASH0LAYOUT0_NBLOCKS(block_count)
-			| BF_BCH_FLASH0LAYOUT0_META_SIZE(metadata_size)
-			| BF_BCH_FLASH0LAYOUT0_ECC0(ecc_strength, this)
-			| BF_BCH_FLASH0LAYOUT0_GF(gf_len, this)
-			| BF_BCH_FLASH0LAYOUT0_DATA0_SIZE(block_size, this),
-			r->bch_regs + HW_BCH_FLASH0LAYOUT0);
-
-	writel(BF_BCH_FLASH0LAYOUT1_PAGE_SIZE(page_size)
-			| BF_BCH_FLASH0LAYOUT1_ECCN(ecc_strength, this)
-			| BF_BCH_FLASH0LAYOUT1_GF(gf_len, this)
-			| BF_BCH_FLASH0LAYOUT1_DATAN_SIZE(block_size, this),
-			r->bch_regs + HW_BCH_FLASH0LAYOUT1);
-
 	/* Set *all* chip selects to use layout 0. */
 	writel(0, r->bch_regs + HW_BCH_LAYOUTSELECT);
 
-	/* Enable interrupts. */
-	writel(BM_BCH_CTRL_COMPLETE_IRQ_EN,
-				r->bch_regs + HW_BCH_CTRL_SET);
-
 	ret = 0;
 err_out:
 	pm_runtime_mark_last_busy(this->dev);
@@ -795,32 +763,6 @@ static void gpmi_clear_bch(struct gpmi_nand_data *this)
 	writel(BM_BCH_CTRL_COMPLETE_IRQ, r->bch_regs + HW_BCH_CTRL_CLR);
 }
 
-/* Returns the Ready/Busy status of the given chip. */
-static int gpmi_is_ready(struct gpmi_nand_data *this, unsigned chip)
-{
-	struct resources *r = &this->resources;
-	uint32_t mask = 0;
-	uint32_t reg = 0;
-
-	if (GPMI_IS_MX23(this)) {
-		mask = MX23_BM_GPMI_DEBUG_READY0 << chip;
-		reg = readl(r->gpmi_regs + HW_GPMI_DEBUG);
-	} else if (GPMI_IS_MX28(this) || GPMI_IS_MX6(this)) {
-		/*
-		 * In the imx6, all the ready/busy pins are bound
-		 * together. So we only need to check chip 0.
-		 */
-		if (GPMI_IS_MX6(this))
-			chip = 0;
-
-		/* MX28 shares the same R/B register as MX6Q. */
-		mask = MX28_BF_GPMI_STAT_READY_BUSY(1 << chip);
-		reg = readl(r->gpmi_regs + HW_GPMI_STAT);
-	} else
-		dev_err(this->dev, "unknown arch.\n");
-	return reg & mask;
-}
-
 static struct dma_chan *get_dma_chan(struct gpmi_nand_data *this)
 {
 	/* We use the DMA channel 0 to access all the nand chips. */
@@ -836,29 +778,6 @@ static void dma_irq_callback(void *param)
 	complete(dma_c);
 }
 
-static int start_dma_without_bch_irq(struct gpmi_nand_data *this,
-				     struct dma_async_tx_descriptor *desc)
-{
-	struct completion *dma_c = &this->dma_done;
-	unsigned long timeout;
-
-	init_completion(dma_c);
-
-	desc->callback		= dma_irq_callback;
-	desc->callback_param	= this;
-	dmaengine_submit(desc);
-	dma_async_issue_pending(get_dma_chan(this));
-
-	/* Wait for the interrupt from the DMA block. */
-	timeout = wait_for_completion_timeout(dma_c, msecs_to_jiffies(1000));
-	if (!timeout) {
-		dev_err(this->dev, "DMA timeout, last DMA\n");
-		gpmi_dump_info(this);
-		return -ETIMEDOUT;
-	}
-	return 0;
-}
-
 static irqreturn_t bch_irq(int irq, void *cookie)
 {
 	struct gpmi_nand_data *this = cookie;
@@ -868,83 +787,25 @@ static irqreturn_t bch_irq(int irq, void *cookie)
 	return IRQ_HANDLED;
 }
 
-/*
- * This function is used in BCH reading or BCH writing pages.
- * It will wait for the BCH interrupt as long as ONE second.
- * Actually, we must wait for two interrupts :
- *	[1] firstly the DMA interrupt and
- *	[2] secondly the BCH interrupt.
- */
-static int start_dma_with_bch_irq(struct gpmi_nand_data *this,
-				  struct dma_async_tx_descriptor *desc)
+static int gpmi_raw_len_to_len(struct gpmi_nand_data *this, int raw_len)
 {
-	struct completion *bch_c = &this->bch_done;
-	unsigned long timeout;
-
-	/* Prepare to receive an interrupt from the BCH block. */
-	init_completion(bch_c);
-
-	/* start the DMA */
-	start_dma_without_bch_irq(this, desc);
-
-	/* Wait for the interrupt from the BCH block. */
-	timeout = wait_for_completion_timeout(bch_c, msecs_to_jiffies(1000));
-	if (!timeout) {
-		dev_err(this->dev, "BCH timeout\n");
-		gpmi_dump_info(this);
-		return -ETIMEDOUT;
-	}
-	return 0;
-}
-
-static int gpmi_send_command(struct gpmi_nand_data *this)
-{
-	struct dma_chan *channel = get_dma_chan(this);
-	struct dma_async_tx_descriptor *desc;
-	struct scatterlist *sgl;
-	int chip = this->current_chip;
-	int ret;
-	u32 pio[3];
-
-	/* [1] send out the PIO words */
-	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__WRITE)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_CLE)
-		| BM_GPMI_CTRL0_ADDRESS_INCREMENT
-		| BF_GPMI_CTRL0_XFER_COUNT(this->command_length);
-	pio[1] = pio[2] = 0;
-	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE, 0);
-	if (!desc)
-		return -EINVAL;
-
-	/* [2] send out the COMMAND + ADDRESS string stored in @buffer */
-	sgl = &this->cmd_sgl;
-
-	sg_init_one(sgl, this->cmd_buffer, this->command_length);
-	dma_map_sg(this->dev, sgl, 1, DMA_TO_DEVICE);
-	desc = dmaengine_prep_slave_sg(channel,
-				sgl, 1, DMA_MEM_TO_DEV,
-				MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	/* [3] submit the DMA */
-	ret = start_dma_without_bch_irq(this, desc);
-
-	dma_unmap_sg(this->dev, sgl, 1, DMA_TO_DEVICE);
-
-	return ret;
+	/*
+	 * raw_len is the length to read/write including bch data which
+	 * we are passed in exec_op. Calculate the data length from it.
+	 */
+	if (this->bch)
+		return ALIGN_DOWN(raw_len, this->bch_geometry.ecc_chunk_size);
+	else
+		return raw_len;
 }
 
 /* Can we use the upper's buffer directly for DMA? */
 static bool prepare_data_dma(struct gpmi_nand_data *this, const void *buf,
-			     int len, enum dma_data_direction dr)
+			     int raw_len, struct scatterlist *sgl,
+			     enum dma_data_direction dr)
 {
-	struct scatterlist *sgl = &this->data_sgl;
 	int ret;
+	int len = gpmi_raw_len_to_len(this, raw_len);
 
 	/* first try to map the upper buffer directly */
 	if (virt_addr_valid(buf) && !object_is_on_stack(buf)) {
@@ -960,7 +821,7 @@ map_fail:
 	/* We have to use our own DMA buffer. */
 	sg_init_one(sgl, this->data_buffer_dma, len);
 
-	if (dr == DMA_TO_DEVICE)
+	if (dr == DMA_TO_DEVICE && buf != this->data_buffer_dma)
 		memcpy(this->data_buffer_dma, buf, len);
 
 	dma_map_sg(this->dev, sgl, 1, dr);
@@ -968,208 +829,6 @@ map_fail:
 	return false;
 }
 
-static int gpmi_send_data(struct gpmi_nand_data *this, const void *buf, int len)
-{
-	struct dma_async_tx_descriptor *desc;
-	struct dma_chan *channel = get_dma_chan(this);
-	int chip = this->current_chip;
-	int ret;
-	uint32_t command_mode;
-	uint32_t address;
-	u32 pio[2];
-
-	/* [1] PIO */
-	command_mode = BV_GPMI_CTRL0_COMMAND_MODE__WRITE;
-	address      = BV_GPMI_CTRL0_ADDRESS__NAND_DATA;
-
-	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(command_mode)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(address)
-		| BF_GPMI_CTRL0_XFER_COUNT(len);
-	pio[1] = 0;
-	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE, 0);
-	if (!desc)
-		return -EINVAL;
-
-	/* [2] send DMA request */
-	prepare_data_dma(this, buf, len, DMA_TO_DEVICE);
-	desc = dmaengine_prep_slave_sg(channel, &this->data_sgl,
-					1, DMA_MEM_TO_DEV,
-					MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	/* [3] submit the DMA */
-	ret = start_dma_without_bch_irq(this, desc);
-
-	dma_unmap_sg(this->dev, &this->data_sgl, 1, DMA_TO_DEVICE);
-
-	return ret;
-}
-
-static int gpmi_read_data(struct gpmi_nand_data *this, void *buf, int len)
-{
-	struct dma_async_tx_descriptor *desc;
-	struct dma_chan *channel = get_dma_chan(this);
-	int chip = this->current_chip;
-	int ret;
-	u32 pio[2];
-	bool direct;
-
-	/* [1] : send PIO */
-	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__READ)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_DATA)
-		| BF_GPMI_CTRL0_XFER_COUNT(len);
-	pio[1] = 0;
-	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE, 0);
-	if (!desc)
-		return -EINVAL;
-
-	/* [2] : send DMA request */
-	direct = prepare_data_dma(this, buf, len, DMA_FROM_DEVICE);
-	desc = dmaengine_prep_slave_sg(channel, &this->data_sgl,
-					1, DMA_DEV_TO_MEM,
-					MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	/* [3] : submit the DMA */
-
-	ret = start_dma_without_bch_irq(this, desc);
-
-	dma_unmap_sg(this->dev, &this->data_sgl, 1, DMA_FROM_DEVICE);
-	if (!direct)
-		memcpy(buf, this->data_buffer_dma, len);
-
-	return ret;
-}
-
-static int gpmi_send_page(struct gpmi_nand_data *this, dma_addr_t payload,
-			  dma_addr_t auxiliary)
-{
-	struct bch_geometry *geo = &this->bch_geometry;
-	uint32_t command_mode;
-	uint32_t address;
-	uint32_t ecc_command;
-	uint32_t buffer_mask;
-	struct dma_async_tx_descriptor *desc;
-	struct dma_chan *channel = get_dma_chan(this);
-	int chip = this->current_chip;
-	u32 pio[6];
-
-	/* A DMA descriptor that does an ECC page read. */
-	command_mode = BV_GPMI_CTRL0_COMMAND_MODE__WRITE;
-	address      = BV_GPMI_CTRL0_ADDRESS__NAND_DATA;
-	ecc_command  = BV_GPMI_ECCCTRL_ECC_CMD__BCH_ENCODE;
-	buffer_mask  = BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_PAGE |
-				BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_AUXONLY;
-
-	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(command_mode)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(address)
-		| BF_GPMI_CTRL0_XFER_COUNT(0);
-	pio[1] = 0;
-	pio[2] = BM_GPMI_ECCCTRL_ENABLE_ECC
-		| BF_GPMI_ECCCTRL_ECC_CMD(ecc_command)
-		| BF_GPMI_ECCCTRL_BUFFER_MASK(buffer_mask);
-	pio[3] = geo->page_size;
-	pio[4] = payload;
-	pio[5] = auxiliary;
-
-	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE,
-				      MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	return start_dma_with_bch_irq(this, desc);
-}
-
-static int gpmi_read_page(struct gpmi_nand_data *this, dma_addr_t payload,
-			  dma_addr_t auxiliary)
-{
-	struct bch_geometry *geo = &this->bch_geometry;
-	uint32_t command_mode;
-	uint32_t address;
-	uint32_t ecc_command;
-	uint32_t buffer_mask;
-	struct dma_async_tx_descriptor *desc;
-	struct dma_chan *channel = get_dma_chan(this);
-	int chip = this->current_chip;
-	u32 pio[6];
-
-	/* [1] Wait for the chip to report ready. */
-	command_mode = BV_GPMI_CTRL0_COMMAND_MODE__WAIT_FOR_READY;
-	address      = BV_GPMI_CTRL0_ADDRESS__NAND_DATA;
-
-	pio[0] =  BF_GPMI_CTRL0_COMMAND_MODE(command_mode)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(address)
-		| BF_GPMI_CTRL0_XFER_COUNT(0);
-	pio[1] = 0;
-	desc = mxs_dmaengine_prep_pio(channel, pio, 2, DMA_TRANS_NONE, 0);
-	if (!desc)
-		return -EINVAL;
-
-	/* [2] Enable the BCH block and read. */
-	command_mode = BV_GPMI_CTRL0_COMMAND_MODE__READ;
-	address      = BV_GPMI_CTRL0_ADDRESS__NAND_DATA;
-	ecc_command  = BV_GPMI_ECCCTRL_ECC_CMD__BCH_DECODE;
-	buffer_mask  = BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_PAGE
-			| BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_AUXONLY;
-
-	pio[0] =  BF_GPMI_CTRL0_COMMAND_MODE(command_mode)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(address)
-		| BF_GPMI_CTRL0_XFER_COUNT(geo->page_size);
-
-	pio[1] = 0;
-	pio[2] =  BM_GPMI_ECCCTRL_ENABLE_ECC
-		| BF_GPMI_ECCCTRL_ECC_CMD(ecc_command)
-		| BF_GPMI_ECCCTRL_BUFFER_MASK(buffer_mask);
-	pio[3] = geo->page_size;
-	pio[4] = payload;
-	pio[5] = auxiliary;
-	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
-				      DMA_TRANS_NONE,
-				      MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	/* [3] Disable the BCH block */
-	command_mode = BV_GPMI_CTRL0_COMMAND_MODE__WAIT_FOR_READY;
-	address      = BV_GPMI_CTRL0_ADDRESS__NAND_DATA;
-
-	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(command_mode)
-		| BM_GPMI_CTRL0_WORD_LENGTH
-		| BF_GPMI_CTRL0_CS(chip, this)
-		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
-		| BF_GPMI_CTRL0_ADDRESS(address)
-		| BF_GPMI_CTRL0_XFER_COUNT(geo->page_size);
-	pio[1] = 0;
-	pio[2] = 0; /* clear GPMI_HW_GPMI_ECCCTRL, disable the BCH. */
-	desc = mxs_dmaengine_prep_pio(channel, pio, 3, DMA_TRANS_NONE,
-				      MXS_DMA_CTRL_WAIT4END);
-	if (!desc)
-		return -EINVAL;
-
-	/* [4] submit the DMA */
-	return start_dma_with_bch_irq(this, desc);
-}
-
 /**
  * gpmi_copy_bits - copy bits from one memory region to another
  * @dst: destination buffer
@@ -1568,67 +1227,20 @@ static void release_resources(struct gpmi_nand_data *this)
 	release_dma_channels(this);
 }
 
-static int send_page_prepare(struct gpmi_nand_data *this,
-			const void *source, unsigned length,
-			void *alt_virt, dma_addr_t alt_phys, unsigned alt_size,
-			const void **use_virt, dma_addr_t *use_phys)
-{
-	struct device *dev = this->dev;
-
-	if (virt_addr_valid(source)) {
-		dma_addr_t source_phys;
-
-		source_phys = dma_map_single(dev, (void *)source, length,
-						DMA_TO_DEVICE);
-		if (dma_mapping_error(dev, source_phys)) {
-			if (alt_size < length) {
-				dev_err(dev, "Alternate buffer is too small\n");
-				return -ENOMEM;
-			}
-			goto map_failed;
-		}
-		*use_virt = source;
-		*use_phys = source_phys;
-		return 0;
-	}
-map_failed:
-	/*
-	 * Copy the content of the source buffer into the alternate
-	 * buffer and set up the return values accordingly.
-	 */
-	memcpy(alt_virt, source, length);
-
-	*use_virt = alt_virt;
-	*use_phys = alt_phys;
-	return 0;
-}
-
-static void send_page_end(struct gpmi_nand_data *this,
-			const void *source, unsigned length,
-			void *alt_virt, dma_addr_t alt_phys, unsigned alt_size,
-			const void *used_virt, dma_addr_t used_phys)
-{
-	struct device *dev = this->dev;
-	if (used_virt == source)
-		dma_unmap_single(dev, used_phys, length, DMA_TO_DEVICE);
-}
-
 static void gpmi_free_dma_buffer(struct gpmi_nand_data *this)
 {
 	struct device *dev = this->dev;
+	struct bch_geometry *geo = &this->bch_geometry;
 
-	if (this->payload_virt && virt_addr_valid(this->payload_virt))
-		dma_free_coherent(dev, this->page_buffer_size,
-					this->payload_virt,
-					this->payload_phys);
-	kfree(this->cmd_buffer);
+	if (this->auxiliary_virt && virt_addr_valid(this->auxiliary_virt))
+		dma_free_coherent(dev, geo->auxiliary_size,
+					this->auxiliary_virt,
+					this->auxiliary_phys);
 	kfree(this->data_buffer_dma);
 	kfree(this->raw_buffer);
 
-	this->cmd_buffer	= NULL;
 	this->data_buffer_dma	= NULL;
 	this->raw_buffer	= NULL;
-	this->page_buffer_size	=  0;
 }
 
 /* Allocate the DMA buffers */
@@ -1638,11 +1250,6 @@ static int gpmi_alloc_dma_buffer(struct gpmi_nand_data *this)
 	struct device *dev = this->dev;
 	struct mtd_info *mtd = nand_to_mtd(&this->nand);
 
-	/* [1] Allocate a command buffer. PAGE_SIZE is enough. */
-	this->cmd_buffer = kzalloc(PAGE_SIZE, GFP_DMA | GFP_KERNEL);
-	if (this->cmd_buffer == NULL)
-		goto error_alloc;
-
 	/*
 	 * [2] Allocate a read/write data buffer.
 	 *     The gpmi_alloc_dma_buffer can be called twice.
@@ -1656,27 +1263,15 @@ static int gpmi_alloc_dma_buffer(struct gpmi_nand_data *this)
 	if (this->data_buffer_dma == NULL)
 		goto error_alloc;
 
-	/*
-	 * [3] Allocate the page buffer.
-	 *
-	 * Both the payload buffer and the auxiliary buffer must appear on
-	 * 32-bit boundaries. We presume the size of the payload buffer is a
-	 * power of two and is much larger than four, which guarantees the
-	 * auxiliary buffer will appear on a 32-bit boundary.
-	 */
-	this->page_buffer_size = geo->payload_size + geo->auxiliary_size;
-	this->payload_virt = dma_alloc_coherent(dev, this->page_buffer_size,
-					&this->payload_phys, GFP_DMA);
-	if (!this->payload_virt)
+	this->auxiliary_virt = dma_alloc_coherent(dev, geo->auxiliary_size,
+					&this->auxiliary_phys, GFP_DMA);
+	if (!this->auxiliary_virt)
 		goto error_alloc;
 
-	this->raw_buffer = kzalloc(mtd->writesize + mtd->oobsize, GFP_KERNEL);
+	this->raw_buffer = kzalloc((mtd->writesize ?: PAGE_SIZE) + mtd->oobsize, GFP_KERNEL);
 	if (!this->raw_buffer)
 		goto error_alloc;
 
-	/* Slice up the page buffer. */
-	this->auxiliary_virt = this->payload_virt + geo->payload_size;
-	this->auxiliary_phys = this->payload_phys + geo->payload_size;
 	return 0;
 
 error_alloc:
@@ -1684,105 +1279,6 @@ error_alloc:
 	return -ENOMEM;
 }
 
-static void gpmi_cmd_ctrl(struct nand_chip *chip, int data, unsigned int ctrl)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-	int ret;
-
-	/*
-	 * Every operation begins with a command byte and a series of zero or
-	 * more address bytes. These are distinguished by either the Address
-	 * Latch Enable (ALE) or Command Latch Enable (CLE) signals being
-	 * asserted. When MTD is ready to execute the command, it will deassert
-	 * both latch enables.
-	 *
-	 * Rather than run a separate DMA operation for every single byte, we
-	 * queue them up and run a single DMA operation for the entire series
-	 * of command and data bytes. NAND_CMD_NONE means the END of the queue.
-	 */
-	if ((ctrl & (NAND_ALE | NAND_CLE))) {
-		if (data != NAND_CMD_NONE)
-			this->cmd_buffer[this->command_length++] = data;
-		return;
-	}
-
-	if (!this->command_length)
-		return;
-
-	ret = gpmi_send_command(this);
-	if (ret)
-		dev_err(this->dev, "Chip: %u, Error %d\n",
-			this->current_chip, ret);
-
-	this->command_length = 0;
-}
-
-static int gpmi_dev_ready(struct nand_chip *chip)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-
-	return gpmi_is_ready(this, this->current_chip);
-}
-
-static void gpmi_select_chip(struct nand_chip *chip, int chipnr)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-	int ret;
-
-	/*
-	 * For power consumption matters, disable/enable the clock each time a
-	 * die is selected/unselected.
-	 */
-	if (this->current_chip < 0 && chipnr >= 0) {
-		ret = pm_runtime_get_sync(this->dev);
-		if (ret < 0)
-			dev_err(this->dev, "Failed to enable the clock\n");
-	} else if (this->current_chip >= 0 && chipnr < 0) {
-		pm_runtime_mark_last_busy(this->dev);
-		pm_runtime_put_autosuspend(this->dev);
-	}
-
-	/*
-	 * This driver currently supports only one NAND chip. Plus, dies share
-	 * the same configuration. So once timings have been applied on the
-	 * controller side, they will not change anymore. When the time will
-	 * come, the check on must_apply_timings will have to be dropped.
-	 */
-	if (chipnr >= 0 && this->hw.must_apply_timings) {
-		this->hw.must_apply_timings = false;
-		gpmi_nfc_apply_timings(this);
-	}
-
-	this->current_chip = chipnr;
-}
-
-static void gpmi_read_buf(struct nand_chip *chip, uint8_t *buf, int len)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-
-	dev_dbg(this->dev, "len is %d\n", len);
-
-	gpmi_read_data(this, buf, len);
-}
-
-static void gpmi_write_buf(struct nand_chip *chip, const uint8_t *buf, int len)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-
-	dev_dbg(this->dev, "len is %d\n", len);
-
-	gpmi_send_data(this, buf, len);
-}
-
-static uint8_t gpmi_read_byte(struct nand_chip *chip)
-{
-	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-	uint8_t *buf = this->data_buffer_dma;
-
-	gpmi_read_buf(chip, buf, 1);
-	return buf[0];
-}
-
 /*
  * Handles block mark swapping.
  * It can be called in swapping the block mark, or swapping it back,
@@ -1831,50 +1327,20 @@ static void block_mark_swapping(struct gpmi_nand_data *this,
 	p[1] = (p[1] & mask) | (from_oob >> (8 - bit));
 }
 
-static int gpmi_ecc_read_page_data(struct nand_chip *chip, uint8_t *buf)
+static int gpmi_count_bitflips(struct nand_chip *chip, void *buf, int first,
+			       int last, int meta)
 {
 	struct gpmi_nand_data *this = nand_get_controller_data(chip);
 	struct bch_geometry *nfc_geo = &this->bch_geometry;
 	struct mtd_info *mtd = nand_to_mtd(chip);
-	dma_addr_t    payload_phys;
-	unsigned int  i;
+	int i;
 	unsigned char *status;
-	unsigned int  max_bitflips = 0;
-	int           ret;
-	bool          direct = false;
-
-	payload_phys = this->payload_phys;
-
-	if (virt_addr_valid(buf)) {
-		dma_addr_t dest_phys;
-
-		dest_phys = dma_map_single(this->dev, buf, nfc_geo->payload_size,
-					   DMA_FROM_DEVICE);
-		if (!dma_mapping_error(this->dev, dest_phys)) {
-			payload_phys = dest_phys;
-			direct = true;
-		}
-	}
-
-	/* go! */
-	ret = gpmi_read_page(this, payload_phys, this->auxiliary_phys);
-
-	if (direct)
-		dma_unmap_single(this->dev, payload_phys, nfc_geo->payload_size,
-				 DMA_FROM_DEVICE);
-
-	if (ret) {
-		dev_err(this->dev, "Error in ECC-based read: %d\n", ret);
-		return ret;
-	}
+	unsigned int max_bitflips = 0;
 
 	/* Loop over status bytes, accumulating ECC status. */
-	status = this->auxiliary_virt + nfc_geo->auxiliary_status_offset;
+	status = this->auxiliary_virt + ALIGN(meta, 4);
 
-	if (!direct)
-		memcpy(buf, this->payload_virt, nfc_geo->payload_size);
-
-	for (i = 0; i < nfc_geo->ecc_chunk_count; i++, status++) {
+	for (i = first; i < last; i++, status++) {
 		if ((*status == STATUS_GOOD) || (*status == STATUS_ERASED))
 			continue;
 
@@ -1954,25 +1420,53 @@ static int gpmi_ecc_read_page_data(struct nand_chip *chip, uint8_t *buf)
 		max_bitflips = max_t(unsigned int, max_bitflips, *status);
 	}
 
-	/* handle the block mark swapping */
-	block_mark_swapping(this, buf, this->auxiliary_virt);
-
 	return max_bitflips;
 }
 
+static void gpmi_bch_layout_std(struct gpmi_nand_data *this)
+{
+	struct bch_geometry *geo = &this->bch_geometry;
+	unsigned int ecc_strength = geo->ecc_strength >> 1;
+	unsigned int gf_len = geo->gf_len;
+	unsigned int block_size = block_size = geo->ecc_chunk_size;
+
+	this->bch_flashlayout0 =
+		BF_BCH_FLASH0LAYOUT0_NBLOCKS(geo->ecc_chunk_count - 1) |
+		BF_BCH_FLASH0LAYOUT0_META_SIZE(geo->metadata_size) |
+		BF_BCH_FLASH0LAYOUT0_ECC0(ecc_strength, this) |
+		BF_BCH_FLASH0LAYOUT0_GF(gf_len, this) |
+		BF_BCH_FLASH0LAYOUT0_DATA0_SIZE(block_size, this);
+
+	this->bch_flashlayout1 =
+		BF_BCH_FLASH0LAYOUT1_PAGE_SIZE(geo->page_size) |
+		BF_BCH_FLASH0LAYOUT1_ECCN(ecc_strength, this) |
+		BF_BCH_FLASH0LAYOUT1_GF(gf_len, this) |
+		BF_BCH_FLASH0LAYOUT1_DATAN_SIZE(block_size, this);
+}
+
 static int gpmi_ecc_read_page(struct nand_chip *chip, uint8_t *buf,
 			      int oob_required, int page)
 {
 	struct gpmi_nand_data *this = nand_get_controller_data(chip);
 	struct mtd_info *mtd = nand_to_mtd(chip);
+	struct bch_geometry *geo = &this->bch_geometry;
+	unsigned int max_bitflips;
 	int ret;
 
-	nand_read_page_op(chip, page, 0, NULL, 0);
+	gpmi_bch_layout_std(this);
+	this->bch = true;
 
-	ret = gpmi_ecc_read_page_data(chip, buf);
-	if (ret < 0)
+	ret = nand_read_page_op(chip, page, 0, buf, geo->page_size);
+	if (ret)
 		return ret;
 
+	max_bitflips = gpmi_count_bitflips(chip, buf, 0,
+					   geo->ecc_chunk_count,
+					   geo->auxiliary_status_offset);
+
+	/* handle the block mark swapping */
+	block_mark_swapping(this, buf, this->auxiliary_virt);
+
 	if (oob_required) {
 		/*
 		 * It's time to deliver the OOB bytes. See gpmi_ecc_read_oob()
@@ -1988,7 +1482,7 @@ static int gpmi_ecc_read_page(struct nand_chip *chip, uint8_t *buf,
 		chip->oob_poi[0] = ((uint8_t *)this->auxiliary_virt)[0];
 	}
 
-	return ret;
+	return max_bitflips;
 }
 
 /* Fake a virtual small page for the subpage read */
@@ -1996,17 +1490,15 @@ static int gpmi_ecc_read_subpage(struct nand_chip *chip, uint32_t offs,
 				 uint32_t len, uint8_t *buf, int page)
 {
 	struct gpmi_nand_data *this = nand_get_controller_data(chip);
-	void __iomem *bch_regs = this->resources.bch_regs;
-	struct bch_geometry old_geo = this->bch_geometry;
 	struct bch_geometry *geo = &this->bch_geometry;
 	int size = chip->ecc.size; /* ECC chunk size */
 	int meta, n, page_size;
-	u32 r1_old, r2_old, r1_new, r2_new;
 	unsigned int max_bitflips;
+	unsigned int ecc_strength;
 	int first, last, marker_pos;
 	int ecc_parity_size;
 	int col = 0;
-	int old_swap_block_mark = this->swap_block_mark;
+	int ret;
 
 	/* The size of ECC parity */
 	ecc_parity_size = geo->gf_len * geo->ecc_strength / 8;
@@ -2039,43 +1531,33 @@ static int gpmi_ecc_read_subpage(struct nand_chip *chip, uint32_t offs,
 		buf = buf + first * size;
 	}
 
-	nand_read_page_op(chip, page, col, NULL, 0);
-
-	/* Save the old environment */
-	r1_old = r1_new = readl(bch_regs + HW_BCH_FLASH0LAYOUT0);
-	r2_old = r2_new = readl(bch_regs + HW_BCH_FLASH0LAYOUT1);
+	ecc_parity_size = geo->gf_len * geo->ecc_strength / 8;
 
-	/* change the BCH registers and bch_geometry{} */
 	n = last - first + 1;
 	page_size = meta + (size + ecc_parity_size) * n;
+	ecc_strength = geo->ecc_strength >> 1;
+
+	this->bch_flashlayout0 = BF_BCH_FLASH0LAYOUT0_NBLOCKS(n - 1) |
+		BF_BCH_FLASH0LAYOUT0_META_SIZE(meta) |
+		BF_BCH_FLASH0LAYOUT0_ECC0(ecc_strength, this) |
+		BF_BCH_FLASH0LAYOUT0_GF(geo->gf_len, this) |
+		BF_BCH_FLASH0LAYOUT0_DATA0_SIZE(geo->ecc_chunk_size, this);
 
-	r1_new &= ~(BM_BCH_FLASH0LAYOUT0_NBLOCKS |
-			BM_BCH_FLASH0LAYOUT0_META_SIZE);
-	r1_new |= BF_BCH_FLASH0LAYOUT0_NBLOCKS(n - 1)
-			| BF_BCH_FLASH0LAYOUT0_META_SIZE(meta);
-	writel(r1_new, bch_regs + HW_BCH_FLASH0LAYOUT0);
+	this->bch_flashlayout1 = BF_BCH_FLASH0LAYOUT1_PAGE_SIZE(page_size) |
+		BF_BCH_FLASH0LAYOUT1_ECCN(ecc_strength, this) |
+		BF_BCH_FLASH0LAYOUT1_GF(geo->gf_len, this) |
+		BF_BCH_FLASH0LAYOUT1_DATAN_SIZE(geo->ecc_chunk_size, this);
 
-	r2_new &= ~BM_BCH_FLASH0LAYOUT1_PAGE_SIZE;
-	r2_new |= BF_BCH_FLASH0LAYOUT1_PAGE_SIZE(page_size);
-	writel(r2_new, bch_regs + HW_BCH_FLASH0LAYOUT1);
+	this->bch = true;
 
-	geo->ecc_chunk_count = n;
-	geo->payload_size = n * size;
-	geo->page_size = page_size;
-	geo->auxiliary_status_offset = ALIGN(meta, 4);
+	ret = nand_read_page_op(chip, page, col, buf, page_size);
+	if (ret)
+		return ret;
 
 	dev_dbg(this->dev, "page:%d(%d:%d)%d, chunk:(%d:%d), BCH PG size:%d\n",
 		page, offs, len, col, first, n, page_size);
 
-	/* Read the subpage now */
-	this->swap_block_mark = false;
-	max_bitflips = gpmi_ecc_read_page_data(chip, buf);
-
-	/* Restore */
-	writel(r1_old, bch_regs + HW_BCH_FLASH0LAYOUT0);
-	writel(r2_old, bch_regs + HW_BCH_FLASH0LAYOUT1);
-	this->bch_geometry = old_geo;
-	this->swap_block_mark = old_swap_block_mark;
+	max_bitflips = gpmi_count_bitflips(chip, buf, first, last, meta);
 
 	return max_bitflips;
 }
@@ -2086,81 +1568,29 @@ static int gpmi_ecc_write_page(struct nand_chip *chip, const uint8_t *buf,
 	struct mtd_info *mtd = nand_to_mtd(chip);
 	struct gpmi_nand_data *this = nand_get_controller_data(chip);
 	struct bch_geometry *nfc_geo = &this->bch_geometry;
-	const void *payload_virt;
-	dma_addr_t payload_phys;
-	const void *auxiliary_virt;
-	dma_addr_t auxiliary_phys;
-	int        ret;
+	int ret;
 
 	dev_dbg(this->dev, "ecc write page.\n");
 
-	nand_prog_page_begin_op(chip, page, 0, NULL, 0);
+	gpmi_bch_layout_std(this);
+	this->bch = true;
+
+	memcpy(this->auxiliary_virt, chip->oob_poi, nfc_geo->auxiliary_size);
 
 	if (this->swap_block_mark) {
 		/*
-		 * If control arrives here, we're doing block mark swapping.
-		 * Since we can't modify the caller's buffers, we must copy them
-		 * into our own.
-		 */
-		memcpy(this->payload_virt, buf, mtd->writesize);
-		payload_virt = this->payload_virt;
-		payload_phys = this->payload_phys;
-
-		memcpy(this->auxiliary_virt, chip->oob_poi,
-				nfc_geo->auxiliary_size);
-		auxiliary_virt = this->auxiliary_virt;
-		auxiliary_phys = this->auxiliary_phys;
-
-		/* Handle block mark swapping. */
-		block_mark_swapping(this,
-				(void *)payload_virt, (void *)auxiliary_virt);
-	} else {
-		/*
-		 * If control arrives here, we're not doing block mark swapping,
-		 * so we can to try and use the caller's buffers.
+		 * When doing bad block marker swapping we must always copy the
+		 * input buffer as we can't modify the const buffer.
 		 */
-		ret = send_page_prepare(this,
-				buf, mtd->writesize,
-				this->payload_virt, this->payload_phys,
-				nfc_geo->payload_size,
-				&payload_virt, &payload_phys);
-		if (ret) {
-			dev_err(this->dev, "Inadequate payload DMA buffer\n");
-			return 0;
-		}
-
-		ret = send_page_prepare(this,
-				chip->oob_poi, mtd->oobsize,
-				this->auxiliary_virt, this->auxiliary_phys,
-				nfc_geo->auxiliary_size,
-				&auxiliary_virt, &auxiliary_phys);
-		if (ret) {
-			dev_err(this->dev, "Inadequate auxiliary DMA buffer\n");
-			goto exit_auxiliary;
-		}
+		memcpy(this->data_buffer_dma, buf, mtd->writesize);
+		buf = this->data_buffer_dma;
+		block_mark_swapping(this, this->data_buffer_dma,
+				    this->auxiliary_virt);
 	}
 
-	/* Ask the NFC. */
-	ret = gpmi_send_page(this, payload_phys, auxiliary_phys);
-	if (ret)
-		dev_err(this->dev, "Error in ECC-based write: %d\n", ret);
-
-	if (!this->swap_block_mark) {
-		send_page_end(this, chip->oob_poi, mtd->oobsize,
-				this->auxiliary_virt, this->auxiliary_phys,
-				nfc_geo->auxiliary_size,
-				auxiliary_virt, auxiliary_phys);
-exit_auxiliary:
-		send_page_end(this, buf, mtd->writesize,
-				this->payload_virt, this->payload_phys,
-				nfc_geo->payload_size,
-				payload_virt, payload_phys);
-	}
+	ret = nand_prog_page_op(chip, page, 0, buf, nfc_geo->page_size);
 
-	if (ret)
-		return ret;
-
-	return nand_prog_page_end_op(chip);
+	return ret;
 }
 
 /*
@@ -2229,7 +1659,6 @@ static int gpmi_ecc_read_oob(struct nand_chip *chip, int page)
 	struct gpmi_nand_data *this = nand_get_controller_data(chip);
 	int ret;
 
-	dev_dbg(this->dev, "page number is %d\n", page);
 	/* clear the OOB buffer */
 	memset(chip->oob_poi, ~0, mtd->oobsize);
 
@@ -2297,9 +1726,12 @@ static int gpmi_ecc_read_page_raw(struct nand_chip *chip, uint8_t *buf,
 	size_t oob_byte_off;
 	uint8_t *oob = chip->oob_poi;
 	int step;
+	int ret;
 
-	nand_read_page_op(chip, page, 0, tmp_buf,
-			  mtd->writesize + mtd->oobsize);
+	ret = nand_read_page_op(chip, page, 0, tmp_buf,
+				mtd->writesize + mtd->oobsize);
+	if (ret)
+		return ret;
 
 	/*
 	 * If required, swap the bad block marker and the data stored in the
@@ -2789,9 +2221,330 @@ static int gpmi_nand_attach_chip(struct nand_chip *chip)
 	return 0;
 }
 
+static struct gpmi_transfer *get_next_transfer(struct gpmi_nand_data *this)
+{
+	struct gpmi_transfer *transfer = &this->transfers[this->ntransfers];
+
+	this->ntransfers++;
+
+	if (this->ntransfers == GPMI_MAX_TRANSFERS)
+		return NULL;
+
+	return transfer;
+}
+
+static struct dma_async_tx_descriptor *gpmi_chain_command(
+	struct gpmi_nand_data *this, u8 cmd, const u8 *addr, int naddr)
+{
+	struct dma_chan *channel = get_dma_chan(this);
+	struct dma_async_tx_descriptor *desc;
+	struct gpmi_transfer *transfer;
+	int chip = this->nand.cur_cs;
+	u32 pio[3];
+
+	/* [1] send out the PIO words */
+	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__WRITE)
+		| BM_GPMI_CTRL0_WORD_LENGTH
+		| BF_GPMI_CTRL0_CS(chip, this)
+		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
+		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_CLE)
+		| BM_GPMI_CTRL0_ADDRESS_INCREMENT
+		| BF_GPMI_CTRL0_XFER_COUNT(naddr + 1);
+	pio[1] = 0;
+	pio[2] = 0;
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, 0);
+	if (!desc)
+		return NULL;
+
+	transfer = get_next_transfer(this);
+	if (!transfer)
+		return NULL;
+
+	transfer->cmdbuf[0] = cmd;
+	if (naddr)
+		memcpy(&transfer->cmdbuf[1], addr, naddr);
+
+	sg_init_one(&transfer->sgl, transfer->cmdbuf, naddr + 1);
+	dma_map_sg(this->dev, &transfer->sgl, 1, DMA_TO_DEVICE);
+
+	transfer->direction = DMA_TO_DEVICE;
+
+	desc = dmaengine_prep_slave_sg(channel, &transfer->sgl, 1, DMA_MEM_TO_DEV,
+				       MXS_DMA_CTRL_WAIT4END);
+	return desc;
+}
+
+static struct dma_async_tx_descriptor *gpmi_chain_wait_ready(
+	struct gpmi_nand_data *this)
+{
+	struct dma_chan *channel = get_dma_chan(this);
+	u32 pio[2];
+
+	pio[0] =  BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__WAIT_FOR_READY)
+		| BM_GPMI_CTRL0_WORD_LENGTH
+		| BF_GPMI_CTRL0_CS(this->nand.cur_cs, this)
+		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
+		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_DATA)
+		| BF_GPMI_CTRL0_XFER_COUNT(0);
+	pio[1] = 0;
+
+	return mxs_dmaengine_prep_pio(channel, pio, 2, DMA_TRANS_NONE,
+				MXS_DMA_CTRL_WAIT4END | MXS_DMA_CTRL_WAIT4RDY);
+}
+
+static struct dma_async_tx_descriptor *gpmi_chain_data_read(
+	struct gpmi_nand_data *this, void *buf, int raw_len, bool *direct)
+{
+	struct dma_async_tx_descriptor *desc;
+	struct dma_chan *channel = get_dma_chan(this);
+	struct gpmi_transfer *transfer;
+	u32 pio[6] = {};
+
+	transfer = get_next_transfer(this);
+	if (!transfer)
+		return NULL;
+
+	transfer->direction = DMA_FROM_DEVICE;
+
+	*direct = prepare_data_dma(this, buf, raw_len, &transfer->sgl,
+				   DMA_FROM_DEVICE);
+
+	pio[0] =  BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__READ)
+		| BM_GPMI_CTRL0_WORD_LENGTH
+		| BF_GPMI_CTRL0_CS(this->nand.cur_cs, this)
+		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
+		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_DATA)
+		| BF_GPMI_CTRL0_XFER_COUNT(raw_len);
+
+	if (this->bch) {
+		pio[2] =  BM_GPMI_ECCCTRL_ENABLE_ECC
+			| BF_GPMI_ECCCTRL_ECC_CMD(BV_GPMI_ECCCTRL_ECC_CMD__BCH_DECODE)
+			| BF_GPMI_ECCCTRL_BUFFER_MASK(BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_PAGE
+				| BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_AUXONLY);
+		pio[3] = raw_len;
+		pio[4] = transfer->sgl.dma_address;
+		pio[5] = this->auxiliary_phys;
+	}
+
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE, 0);
+	if (!desc)
+		return NULL;
+
+	if (!this->bch)
+		desc = dmaengine_prep_slave_sg(channel, &transfer->sgl, 1,
+					     DMA_DEV_TO_MEM,
+					     MXS_DMA_CTRL_WAIT4END);
+
+	return desc;
+}
+
+static struct dma_async_tx_descriptor *gpmi_chain_data_write(
+	struct gpmi_nand_data *this, const void *buf, int raw_len)
+{
+	struct dma_chan *channel = get_dma_chan(this);
+	struct dma_async_tx_descriptor *desc;
+	struct gpmi_transfer *transfer;
+	u32 pio[6] = {};
+
+	transfer = get_next_transfer(this);
+	if (!transfer)
+		return NULL;
+
+	transfer->direction = DMA_TO_DEVICE;
+
+	prepare_data_dma(this, buf, raw_len, &transfer->sgl, DMA_TO_DEVICE);
+
+	pio[0] = BF_GPMI_CTRL0_COMMAND_MODE(BV_GPMI_CTRL0_COMMAND_MODE__WRITE)
+		| BM_GPMI_CTRL0_WORD_LENGTH
+		| BF_GPMI_CTRL0_CS(this->nand.cur_cs, this)
+		| BF_GPMI_CTRL0_LOCK_CS(LOCK_CS_ENABLE, this)
+		| BF_GPMI_CTRL0_ADDRESS(BV_GPMI_CTRL0_ADDRESS__NAND_DATA)
+		| BF_GPMI_CTRL0_XFER_COUNT(raw_len);
+
+	if (this->bch) {
+		pio[2] = BM_GPMI_ECCCTRL_ENABLE_ECC
+			| BF_GPMI_ECCCTRL_ECC_CMD(BV_GPMI_ECCCTRL_ECC_CMD__BCH_ENCODE)
+			| BF_GPMI_ECCCTRL_BUFFER_MASK(BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_PAGE |
+					BV_GPMI_ECCCTRL_BUFFER_MASK__BCH_AUXONLY);
+		pio[3] = raw_len;
+		pio[4] = transfer->sgl.dma_address;
+		pio[5] = this->auxiliary_phys;
+	}
+
+	desc = mxs_dmaengine_prep_pio(channel, pio, ARRAY_SIZE(pio),
+				      DMA_TRANS_NONE,
+				      (this->bch ? MXS_DMA_CTRL_WAIT4END : 0));
+	if (!desc)
+		return NULL;
+
+	if (!this->bch)
+		desc = dmaengine_prep_slave_sg(channel, &transfer->sgl, 1,
+					       DMA_MEM_TO_DEV,
+					       MXS_DMA_CTRL_WAIT4END);
+
+	return desc;
+}
+
+static int gpmi_nfc_exec_op(struct nand_chip *chip,
+			     const struct nand_operation *op,
+			     bool check_only)
+{
+	const struct nand_op_instr *instr;
+	struct gpmi_nand_data *this = nand_get_controller_data(chip);
+	struct dma_async_tx_descriptor *desc = NULL;
+	int i, ret, buf_len = 0, nbufs = 0;
+	u8 cmd = 0;
+	void *buf_read = NULL;
+	const void *buf_write = NULL;
+	bool direct = false;
+	struct completion *completion;
+	unsigned long to;
+
+	this->ntransfers = 0;
+	for (i = 0; i < GPMI_MAX_TRANSFERS; i++)
+		this->transfers[i].direction = DMA_NONE;
+
+	ret = pm_runtime_get_sync(this->dev);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * This driver currently supports only one NAND chip. Plus, dies share
+	 * the same configuration. So once timings have been applied on the
+	 * controller side, they will not change anymore. When the time will
+	 * come, the check on must_apply_timings will have to be dropped.
+	 */
+	if (this->hw.must_apply_timings) {
+		this->hw.must_apply_timings = false;
+		gpmi_nfc_apply_timings(this);
+	}
+
+	dev_dbg(this->dev, "%s: %d instructions\n", __func__, op->ninstrs);
+
+	for (i = 0; i < op->ninstrs; i++) {
+		instr = &op->instrs[i];
+
+		nand_op_trace("  ", instr);
+
+		switch (instr->type) {
+		case NAND_OP_WAITRDY_INSTR:
+			desc = gpmi_chain_wait_ready(this);
+			break;
+		case NAND_OP_CMD_INSTR:
+			cmd = instr->ctx.cmd.opcode;
+
+			/*
+			 * When this command has an address cycle chain it
+			 * together with the address cycle
+			 */
+			if (i + 1 != op->ninstrs &&
+			    op->instrs[i + 1].type == NAND_OP_ADDR_INSTR)
+				continue;
+
+			desc = gpmi_chain_command(this, cmd, NULL, 0);
+
+			break;
+		case NAND_OP_ADDR_INSTR:
+			desc = gpmi_chain_command(this, cmd, instr->ctx.addr.addrs,
+						  instr->ctx.addr.naddrs);
+			break;
+		case NAND_OP_DATA_OUT_INSTR:
+			buf_write = instr->ctx.data.buf.out;
+			buf_len = instr->ctx.data.len;
+			nbufs++;
+
+			desc = gpmi_chain_data_write(this, buf_write, buf_len);
+
+			break;
+		case NAND_OP_DATA_IN_INSTR:
+			if (!instr->ctx.data.len)
+				break;
+			buf_read = instr->ctx.data.buf.in;
+			buf_len = instr->ctx.data.len;
+			nbufs++;
+
+			desc = gpmi_chain_data_read(this, buf_read, buf_len,
+						   &direct);
+			break;
+		}
+
+		if (!desc) {
+			ret = -ENXIO;
+			goto unmap;
+		}
+	}
+
+	dev_dbg(this->dev, "%s setup done\n", __func__);
+
+	if (nbufs > 1) {
+		dev_err(this->dev, "Multiple data instructions not supported\n");
+		ret = -EINVAL;
+		goto unmap;
+	}
+
+	if (this->bch) {
+		writel(this->bch_flashlayout0,
+		       this->resources.bch_regs + HW_BCH_FLASH0LAYOUT0);
+		writel(this->bch_flashlayout1,
+		       this->resources.bch_regs + HW_BCH_FLASH0LAYOUT1);
+	}
+
+	if (this->bch && buf_read) {
+		writel(BM_BCH_CTRL_COMPLETE_IRQ_EN,
+		       this->resources.bch_regs + HW_BCH_CTRL_SET);
+		completion = &this->bch_done;
+	} else {
+		desc->callback = dma_irq_callback;
+		desc->callback_param = this;
+		completion = &this->dma_done;
+	}
+
+	init_completion(completion);
+
+	dmaengine_submit(desc);
+	dma_async_issue_pending(get_dma_chan(this));
+
+	to = wait_for_completion_timeout(completion, msecs_to_jiffies(1000));
+	if (!to) {
+		dev_err(this->dev, "DMA timeout, last DMA\n");
+		gpmi_dump_info(this);
+		ret = -ETIMEDOUT;
+		goto unmap;
+	}
+
+	writel(BM_BCH_CTRL_COMPLETE_IRQ_EN,
+	       this->resources.bch_regs + HW_BCH_CTRL_CLR);
+	gpmi_clear_bch(this);
+
+	ret = 0;
+
+unmap:
+	for (i = 0; i < this->ntransfers; i++) {
+		struct gpmi_transfer *transfer = &this->transfers[i];
+
+		if (transfer->direction != DMA_NONE)
+			dma_unmap_sg(this->dev, &transfer->sgl, 1,
+				     transfer->direction);
+	}
+
+	if (!ret && buf_read && !direct)
+		memcpy(buf_read, this->data_buffer_dma,
+		       gpmi_raw_len_to_len(this, buf_len));
+
+	this->bch = false;
+
+	pm_runtime_mark_last_busy(this->dev);
+	pm_runtime_put_autosuspend(this->dev);
+
+	return ret;
+}
+
 static const struct nand_controller_ops gpmi_nand_controller_ops = {
 	.attach_chip = gpmi_nand_attach_chip,
 	.setup_data_interface = gpmi_setup_data_interface,
+	.exec_op = gpmi_nfc_exec_op,
 };
 
 static int gpmi_nand_init(struct gpmi_nand_data *this)
@@ -2800,9 +2553,6 @@ static int gpmi_nand_init(struct gpmi_nand_data *this)
 	struct mtd_info  *mtd = nand_to_mtd(chip);
 	int ret;
 
-	/* init current chip */
-	this->current_chip	= -1;
-
 	/* init the MTD data structures */
 	mtd->name		= "gpmi-nand";
 	mtd->dev.parent		= this->dev;
@@ -2810,14 +2560,8 @@ static int gpmi_nand_init(struct gpmi_nand_data *this)
 	/* init the nand_chip{}, we don't support a 16-bit NAND Flash bus. */
 	nand_set_controller_data(chip, this);
 	nand_set_flash_node(chip, this->pdev->dev.of_node);
-	chip->legacy.select_chip	= gpmi_select_chip;
-	chip->legacy.cmd_ctrl	= gpmi_cmd_ctrl;
-	chip->legacy.dev_ready	= gpmi_dev_ready;
-	chip->legacy.read_byte	= gpmi_read_byte;
-	chip->legacy.read_buf	= gpmi_read_buf;
-	chip->legacy.write_buf	= gpmi_write_buf;
-	chip->badblock_pattern	= &gpmi_bbt_descr;
 	chip->legacy.block_markbad = gpmi_block_markbad;
+	chip->badblock_pattern	= &gpmi_bbt_descr;
 	chip->options		|= NAND_NO_SUBPAGE_WRITE;
 
 	/* Set up swap_block_mark, must be set before the gpmi_set_geometry() */
@@ -2833,7 +2577,10 @@ static int gpmi_nand_init(struct gpmi_nand_data *this)
 	if (ret)
 		goto err_out;
 
-	chip->legacy.dummy_controller.ops = &gpmi_nand_controller_ops;
+	nand_controller_init(&this->base);
+	this->base.ops = &gpmi_nand_controller_ops;
+	chip->controller = &this->base;
+
 	ret = nand_scan(chip, GPMI_IS_MX6(this) ? 2 : 1);
 	if (ret)
 		goto err_out;
diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h
index 51a070da84ed..fdc5ed7de083 100644
--- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h
+++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h
@@ -103,6 +103,14 @@ struct gpmi_nfc_hardware_timing {
 	u32 ctrl1n;
 };
 
+#define GPMI_MAX_TRANSFERS	8
+
+struct gpmi_transfer {
+	u8 cmdbuf[8];
+	struct scatterlist sgl;
+	enum dma_data_direction direction;
+};
+
 struct gpmi_nand_data {
 	/* Devdata */
 	const struct gpmi_devdata *devdata;
@@ -126,23 +134,18 @@ struct gpmi_nand_data {
 	struct boot_rom_geometry rom_geometry;
 
 	/* MTD / NAND */
+	struct nand_controller	base;
 	struct nand_chip	nand;
 
-	/* General-use Variables */
-	int			current_chip;
-	unsigned int		command_length;
+	struct gpmi_transfer	transfers[GPMI_MAX_TRANSFERS];
+	int			ntransfers;
 
-	struct scatterlist	cmd_sgl;
-	char			*cmd_buffer;
+	bool			bch;
+	uint32_t		bch_flashlayout0;
+	uint32_t		bch_flashlayout1;
 
-	struct scatterlist	data_sgl;
 	char			*data_buffer_dma;
 
-	unsigned int		page_buffer_size;
-
-	void			*payload_virt;
-	dma_addr_t		payload_phys;
-
 	void			*auxiliary_virt;
 	dma_addr_t		auxiliary_phys;
 
diff --git a/include/linux/dma/mxs-dma.h b/include/linux/dma/mxs-dma.h
index 4a33f2c8a682..069d9f5a609e 100644
--- a/include/linux/dma/mxs-dma.h
+++ b/include/linux/dma/mxs-dma.h
@@ -5,6 +5,7 @@
 #include <linux/dmaengine.h>
 
 #define MXS_DMA_CTRL_WAIT4END	BIT(31)
+#define MXS_DMA_CTRL_WAIT4RDY	BIT(30)
 
 /*
  * The mxs dmaengine can do PIO transfers. We pass a pointer to the PIO words
-- 
cgit v1.2.3


From bded033062396e67ffbb3111084cf7ea202473d5 Mon Sep 17 00:00:00 2001
From: Jeff Kletsky <git-commits@allycomm.com>
Date: Wed, 22 May 2019 15:05:53 -0700
Subject: mtd: spinand: Define macros for page-read ops with three-byte
 addresses

The GigaDevice GD5F1GQ4UFxxG SPI NAND utilizes three-byte addresses
for its page-read ops.

http://www.gigadevice.com/datasheet/gd5f1gq4xfxxg/

Signed-off-by: Jeff Kletsky <git-commits@allycomm.com>
Reviewed-by: Frieder Schrempf <frieder.schrempf@kontron.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 include/linux/mtd/spinand.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include')

diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 507f7e289bd1..8aa39ac41e8e 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -68,30 +68,60 @@
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 1))
 
+#define SPINAND_PAGE_READ_FROM_CACHE_OP_3A(fast, addr, ndummy, buf, len) \
+	SPI_MEM_OP(SPI_MEM_OP_CMD(fast ? 0x0b : 0x03, 1),		\
+		   SPI_MEM_OP_ADDR(3, addr, 1),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 1))
+
 #define SPINAND_PAGE_READ_FROM_CACHE_X2_OP(addr, ndummy, buf, len)	\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x3b, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 2))
 
+#define SPINAND_PAGE_READ_FROM_CACHE_X2_OP_3A(addr, ndummy, buf, len)	\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0x3b, 1),				\
+		   SPI_MEM_OP_ADDR(3, addr, 1),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 2))
+
 #define SPINAND_PAGE_READ_FROM_CACHE_X4_OP(addr, ndummy, buf, len)	\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 4))
 
+#define SPINAND_PAGE_READ_FROM_CACHE_X4_OP_3A(addr, ndummy, buf, len)	\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1),				\
+		   SPI_MEM_OP_ADDR(3, addr, 1),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 4))
+
 #define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(addr, ndummy, buf, len)	\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 2),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 2),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 2))
 
+#define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP_3A(addr, ndummy, buf, len) \
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1),				\
+		   SPI_MEM_OP_ADDR(3, addr, 2),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 2),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 2))
+
 #define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(addr, ndummy, buf, len)	\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 4),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 4),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 4))
 
+#define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP_3A(addr, ndummy, buf, len) \
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1),				\
+		   SPI_MEM_OP_ADDR(3, addr, 4),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 4),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 4))
+
 #define SPINAND_PROG_EXEC_OP(addr)					\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x10, 1),				\
 		   SPI_MEM_OP_ADDR(3, addr, 1),				\
-- 
cgit v1.2.3


From 878844908e563a2f02b977bacd221c288e681c47 Mon Sep 17 00:00:00 2001
From: Jeff Kletsky <git-commits@allycomm.com>
Date: Wed, 22 May 2019 15:05:54 -0700
Subject: mtd: spinand: Add support for two-byte device IDs

The GigaDevice GD5F1GQ4UFxxG SPI NAND utilizes two-byte device IDs.

http://www.gigadevice.com/datasheet/gd5f1gq4xfxxg/

Signed-off-by: Jeff Kletsky <git-commits@allycomm.com>
Reviewed-by: Frieder Schrempf <frieder.schrempf@kontron.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/core.c | 2 +-
 include/linux/mtd/spinand.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 4c15bb58c623..556bfdb34455 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -845,7 +845,7 @@ spinand_select_op_variant(struct spinand_device *spinand,
  */
 int spinand_match_and_init(struct spinand_device *spinand,
 			   const struct spinand_info *table,
-			   unsigned int table_size, u8 devid)
+			   unsigned int table_size, u16 devid)
 {
 	struct nand_device *nand = spinand_to_nand(spinand);
 	unsigned int i;
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 8aa39ac41e8e..fbc0423bb4ae 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -290,7 +290,7 @@ struct spinand_ecc_info {
  */
 struct spinand_info {
 	const char *model;
-	u8 devid;
+	u16 devid;
 	u32 flags;
 	struct nand_memory_organization memorg;
 	struct nand_ecc_req eccreq;
@@ -452,7 +452,7 @@ static inline void spinand_set_of_node(struct spinand_device *spinand,
 
 int spinand_match_and_init(struct spinand_device *dev,
 			   const struct spinand_info *table,
-			   unsigned int table_size, u8 devid);
+			   unsigned int table_size, u16 devid);
 
 int spinand_upd_cfg(struct spinand_device *spinand, u8 mask, u8 val);
 int spinand_select_target(struct spinand_device *spinand, unsigned int target);
-- 
cgit v1.2.3


From 9f897bfdd89f5f08a12fa263a7f57fbf8ad9292f Mon Sep 17 00:00:00 2001
From: Kamal Dasu <kdasu.kdev@gmail.com>
Date: Thu, 16 May 2019 12:41:46 -0400
Subject: mtd: Add flag to indicate panic_write

Added a flag to indicate a panic_write so that low level drivers can
use it to take required action where applicable, to ensure oops data
gets written to assigned mtd device.

Signed-off-by: Kamal Dasu <kdasu.kdev@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/mtdcore.c   | 3 +++
 include/linux/mtd/mtd.h | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 453242d6cf56..408615f29e57 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1124,6 +1124,9 @@ int mtd_panic_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen,
 		return -EROFS;
 	if (!len)
 		return 0;
+	if (!mtd->oops_panic_write)
+		mtd->oops_panic_write = true;
+
 	return mtd->_panic_write(mtd, to, len, retlen, buf);
 }
 EXPORT_SYMBOL_GPL(mtd_panic_write);
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 936a3fdb48b5..4ca8c1c845fb 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -316,6 +316,12 @@ struct mtd_info {
 	int (*_get_device) (struct mtd_info *mtd);
 	void (*_put_device) (struct mtd_info *mtd);
 
+	/*
+	 * flag indicates a panic write, low level drivers can take appropriate
+	 * action if required to ensure writes go through
+	 */
+	bool oops_panic_write;
+
 	struct notifier_block reboot_notifier;  /* default mode before reboot */
 
 	/* ECC status information */
-- 
cgit v1.2.3


From 3552691616c940a7c4125c2678ba816653cd725e Mon Sep 17 00:00:00 2001
From: Jeff Kletsky <git-commits@allycomm.com>
Date: Tue, 18 Jun 2019 10:08:05 -0700
Subject: mtd: spinand: Add initial support for Paragon PN26G0xA

Add initial support for Paragon Technology
PN26G01Axxxxx and PN26G02Axxxxx SPI NAND

Datasheets available at
http://www.xtxtech.com/upfile/2016082517274590.pdf
http://www.xtxtech.com/upfile/2016082517282329.pdf

Signed-off-by: Jeff Kletsky <git-commits@allycomm.com>
Reviewed-by: Frieder Schrempf <frieder.schrempf@kontron.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/Makefile  |   2 +-
 drivers/mtd/nand/spi/core.c    |   1 +
 drivers/mtd/nand/spi/paragon.c | 147 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/spinand.h    |   1 +
 4 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mtd/nand/spi/paragon.c

(limited to 'include')

diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile
index 753125082640..9662b9c1d5a9 100644
--- a/drivers/mtd/nand/spi/Makefile
+++ b/drivers/mtd/nand/spi/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
-spinand-objs := core.o gigadevice.o macronix.o micron.o toshiba.o winbond.o
+spinand-objs := core.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o
 obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 556bfdb34455..f0f3528aab8f 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -757,6 +757,7 @@ static const struct spinand_manufacturer *spinand_manufacturers[] = {
 	&gigadevice_spinand_manufacturer,
 	&macronix_spinand_manufacturer,
 	&micron_spinand_manufacturer,
+	&paragon_spinand_manufacturer,
 	&toshiba_spinand_manufacturer,
 	&winbond_spinand_manufacturer,
 };
diff --git a/drivers/mtd/nand/spi/paragon.c b/drivers/mtd/nand/spi/paragon.c
new file mode 100644
index 000000000000..52307681cbd0
--- /dev/null
+++ b/drivers/mtd/nand/spi/paragon.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Jeff Kletsky
+ *
+ * Author: Jeff Kletsky <git-commits@allycomm.com>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mtd/spinand.h>
+
+
+#define SPINAND_MFR_PARAGON	0xa1
+
+
+#define PN26G0XA_STATUS_ECC_BITMASK		(3 << 4)
+
+#define PN26G0XA_STATUS_ECC_NONE_DETECTED	(0 << 4)
+#define PN26G0XA_STATUS_ECC_1_7_CORRECTED	(1 << 4)
+#define PN26G0XA_STATUS_ECC_ERRORED		(2 << 4)
+#define PN26G0XA_STATUS_ECC_8_CORRECTED		(3 << 4)
+
+
+static SPINAND_OP_VARIANTS(read_cache_variants,
+		SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(0, 2, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
+static SPINAND_OP_VARIANTS(write_cache_variants,
+		SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+		SPINAND_PROG_LOAD(true, 0, NULL, 0));
+
+static SPINAND_OP_VARIANTS(update_cache_variants,
+		SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
+		SPINAND_PROG_LOAD(false, 0, NULL, 0));
+
+
+static int pn26g0xa_ooblayout_ecc(struct mtd_info *mtd, int section,
+				   struct mtd_oob_region *region)
+{
+	if (section > 3)
+		return -ERANGE;
+
+	region->offset = 6 + (15 * section); /* 4 BBM + 2 user bytes */
+	region->length = 13;
+
+	return 0;
+}
+
+static int pn26g0xa_ooblayout_free(struct mtd_info *mtd, int section,
+				   struct mtd_oob_region *region)
+{
+	if (section > 4)
+		return -ERANGE;
+
+	if (section == 4) {
+		region->offset = 64;
+		region->length = 64;
+	} else {
+		region->offset = 4 + (15 * section);
+		region->length = 2;
+	}
+
+	return 0;
+}
+
+static int pn26g0xa_ecc_get_status(struct spinand_device *spinand,
+				   u8 status)
+{
+	switch (status & PN26G0XA_STATUS_ECC_BITMASK) {
+	case PN26G0XA_STATUS_ECC_NONE_DETECTED:
+		return 0;
+
+	case PN26G0XA_STATUS_ECC_1_7_CORRECTED:
+		return 7;	/* Return upper limit by convention */
+
+	case PN26G0XA_STATUS_ECC_8_CORRECTED:
+		return 8;
+
+	case PN26G0XA_STATUS_ECC_ERRORED:
+		return -EBADMSG;
+
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+static const struct mtd_ooblayout_ops pn26g0xa_ooblayout = {
+	.ecc = pn26g0xa_ooblayout_ecc,
+	.free = pn26g0xa_ooblayout_free,
+};
+
+
+static const struct spinand_info paragon_spinand_table[] = {
+	SPINAND_INFO("PN26G01A", 0xe1,
+		     NAND_MEMORG(1, 2048, 128, 64, 1024, 21, 1, 1, 1),
+		     NAND_ECCREQ(8, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     0,
+		     SPINAND_ECCINFO(&pn26g0xa_ooblayout,
+				     pn26g0xa_ecc_get_status)),
+	SPINAND_INFO("PN26G02A", 0xe2,
+		     NAND_MEMORG(1, 2048, 128, 64, 2048, 41, 1, 1, 1),
+		     NAND_ECCREQ(8, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     0,
+		     SPINAND_ECCINFO(&pn26g0xa_ooblayout,
+				     pn26g0xa_ecc_get_status)),
+};
+
+static int paragon_spinand_detect(struct spinand_device *spinand)
+{
+	u8 *id = spinand->id.data;
+	int ret;
+
+	/* Read ID returns [0][MID][DID] */
+
+	if (id[1] != SPINAND_MFR_PARAGON)
+		return 0;
+
+	ret = spinand_match_and_init(spinand, paragon_spinand_table,
+				     ARRAY_SIZE(paragon_spinand_table),
+				     id[2]);
+	if (ret)
+		return ret;
+
+	return 1;
+}
+
+static const struct spinand_manufacturer_ops paragon_spinand_manuf_ops = {
+	.detect = paragon_spinand_detect,
+};
+
+const struct spinand_manufacturer paragon_spinand_manufacturer = {
+	.id = SPINAND_MFR_PARAGON,
+	.name = "Paragon",
+	.ops = &paragon_spinand_manuf_ops,
+};
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index fbc0423bb4ae..4ea558bd3c46 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -227,6 +227,7 @@ struct spinand_manufacturer {
 extern const struct spinand_manufacturer gigadevice_spinand_manufacturer;
 extern const struct spinand_manufacturer macronix_spinand_manufacturer;
 extern const struct spinand_manufacturer micron_spinand_manufacturer;
+extern const struct spinand_manufacturer paragon_spinand_manufacturer;
 extern const struct spinand_manufacturer toshiba_spinand_manufacturer;
 extern const struct spinand_manufacturer winbond_spinand_manufacturer;
 
-- 
cgit v1.2.3


From a4496d52b3430cb3c4c16d03cdd5f4ee97ad1241 Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Tue, 7 May 2019 11:52:47 +0200
Subject: power: supply: add input power and voltage limit properties

For thermal management strategy you might be interested on limit the
input power for a power supply. We already have current limit but
basically what we probably want is to limit power. So, introduce the
input_power_limit property.

Although the common use case is limit the input power, in some
specific cases it is the voltage that is problematic (i.e some regulators
have different efficiencies at higher voltage resulting in more heat).
So introduce also the input_voltage_limit property.

This happens in one Chromebook and is used on the Pixel C's thermal
management strategy to effectively limit the input power to 5V 3A when
the screen is on. When the screen is on, the display, the CPU, and the GPU
all contribute more heat to the system than while the screen is off, and
we made a tradeoff to throttle the charger in order to give more of the
thermal budget to those other components.

So there's nothing fundamentally broken about the hardware that would
cause the Pixel C to malfunction if we were charging at 9V or 12V instead
of 5V when the screen is on, i.e. if userspace doesn't change this.

What would happen is that you wouldn't meet Google's skin temperature
targets on the system if the charger was allowed to run at 9V or 12V with
the screen on.

For folks hacking on Pixel Cs (which is now outside of Google's official
support window for Android) and customizing their own kernel and userspace
this would be acceptable, but we wanted to expose this feature in the
power supply properties because the feature does exist in the Emedded
Controller firmware of the Pixel C and all of Google's Chromebooks with
USB-C made since 2015 in case someone running an up to date kernel wanted
to limit the charging power for thermal or other reasons.

This patch exposes a new property, similar to input current limit, to
re-configure the maximum voltage from the external supply at runtime
based on system-level knowledge or user input.

Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Acked-by: Adam Thomson <Adam.Thomson.Opensource@diasemi.com>
Reviewed-by: Benson Leung <bleung@chromium.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 Documentation/ABI/testing/sysfs-class-power | 32 +++++++++++++++++++++++++++++
 Documentation/power/power_supply_class.txt  |  4 ++++
 drivers/power/supply/power_supply_sysfs.c   |  2 ++
 include/linux/power_supply.h                |  2 ++
 4 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power
index b77e30b9014e..27edc06e2495 100644
--- a/Documentation/ABI/testing/sysfs-class-power
+++ b/Documentation/ABI/testing/sysfs-class-power
@@ -376,10 +376,42 @@ Description:
 		supply. Normally this is configured based on the type of
 		connection made (e.g. A configured SDP should output a maximum
 		of 500mA so the input current limit is set to the same value).
+		Use preferably input_power_limit, and for problems that can be
+		solved using power limit use input_current_limit.
 
 		Access: Read, Write
 		Valid values: Represented in microamps
 
+What:		/sys/class/power_supply/<supply_name>/input_voltage_limit
+Date:		May 2019
+Contact:	linux-pm@vger.kernel.org
+Description:
+		This entry configures the incoming VBUS voltage limit currently
+		set in the supply. Normally this is configured based on
+		system-level knowledge or user input (e.g. This is part of the
+		Pixel C's thermal management strategy to effectively limit the
+		input power to 5V when the screen is on to meet Google's skin
+		temperature targets). Note that this feature should not be
+		used for safety critical things.
+		Use preferably input_power_limit, and for problems that can be
+		solved using power limit use input_voltage_limit.
+
+		Access: Read, Write
+		Valid values: Represented in microvolts
+
+What:		/sys/class/power_supply/<supply_name>/input_power_limit
+Date:		May 2019
+Contact:	linux-pm@vger.kernel.org
+Description:
+		This entry configures the incoming power limit currently set
+		in the supply. Normally this is configured based on
+		system-level knowledge or user input. Use preferably this
+		feature to limit the incoming power and use current/voltage
+		limit only for problems that can be solved using power limit.
+
+		Access: Read, Write
+		Valid values: Represented in microwatts
+
 What:		/sys/class/power_supply/<supply_name>/online,
 Date:		May 2007
 Contact:	linux-pm@vger.kernel.org
diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt
index 300d37896e51..1e3c705111db 100644
--- a/Documentation/power/power_supply_class.txt
+++ b/Documentation/power/power_supply_class.txt
@@ -137,6 +137,10 @@ power supply object.
 
 INPUT_CURRENT_LIMIT - input current limit programmed by charger. Indicates
 the current drawn from a charging source.
+INPUT_VOLTAGE_LIMIT - input voltage limit programmed by charger. Indicates
+the voltage limit from a charging source.
+INPUT_POWER_LIMIT - input power limit programmed by charger. Indicates
+the power limit from a charging source.
 
 CHARGE_CONTROL_LIMIT - current charge control limit setting
 CHARGE_CONTROL_LIMIT_MAX - maximum charge control limit setting
diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c
index a704a76d7529..829e12c800e5 100644
--- a/drivers/power/supply/power_supply_sysfs.c
+++ b/drivers/power/supply/power_supply_sysfs.c
@@ -277,6 +277,8 @@ static struct device_attribute power_supply_attrs[] = {
 	POWER_SUPPLY_ATTR(charge_control_start_threshold),
 	POWER_SUPPLY_ATTR(charge_control_end_threshold),
 	POWER_SUPPLY_ATTR(input_current_limit),
+	POWER_SUPPLY_ATTR(input_voltage_limit),
+	POWER_SUPPLY_ATTR(input_power_limit),
 	POWER_SUPPLY_ATTR(energy_full_design),
 	POWER_SUPPLY_ATTR(energy_empty_design),
 	POWER_SUPPLY_ATTR(energy_full),
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index d5b15e039f4f..cbb708b57b11 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -129,6 +129,8 @@ enum power_supply_property {
 	POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD, /* in percents! */
 	POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD, /* in percents! */
 	POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT,
+	POWER_SUPPLY_PROP_INPUT_VOLTAGE_LIMIT,
+	POWER_SUPPLY_PROP_INPUT_POWER_LIMIT,
 	POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN,
 	POWER_SUPPLY_PROP_ENERGY_EMPTY_DESIGN,
 	POWER_SUPPLY_PROP_ENERGY_FULL,
-- 
cgit v1.2.3


From 239b0e52d8aa64d2559c672fd8c29cf1fffc3ec7 Mon Sep 17 00:00:00 2001
From: Kamenee Arumugam <kamenee.arumugam@intel.com>
Date: Fri, 28 Jun 2019 14:04:17 -0400
Subject: IB/hfi1: Move rvt_cq_wc struct into uapi directory

The rvt_cq_wc struct elements are shared between rdmavt and the providers
but not in uapi directory.  As per the comment in
https://marc.info/?l=linux-rdma&m=152296522708522&w=2 The hfi1 driver and
the rdma core driver are not using shared structures in the uapi
directory.

In that case, move rvt_cq_wc struct into the rvt-abi.h header file and
create a rvt_k_cq_w for the kernel completion queue.

Signed-off-by: Kamenee Arumugam <kamenee.arumugam@intel.com>
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/qp.c   |   4 +-
 drivers/infiniband/sw/rdmavt/cq.c | 192 ++++++++++++++++++++++++--------------
 include/rdma/rdmavt_cq.h          |  22 +++--
 include/rdma/rdmavt_qp.h          |  32 +++++++
 include/uapi/rdma/rvt-abi.h       |  32 +++++++
 5 files changed, 205 insertions(+), 77 deletions(-)
 create mode 100644 include/uapi/rdma/rvt-abi.h

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 4e0e9fc0a777..41261e72c429 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -702,8 +702,8 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter)
 		   sde ? sde->this_idx : 0,
 		   send_context,
 		   send_context ? send_context->sw_index : 0,
-		   ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head,
-		   ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail,
+		   ib_cq_head(qp->ibqp.send_cq),
+		   ib_cq_tail(qp->ibqp.send_cq),
 		   qp->pid,
 		   qp->s_state,
 		   qp->s_ack_state,
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index b46714a92b7a..2602ad8b8cb0 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -63,19 +63,33 @@ static struct workqueue_struct *comp_vector_wq;
  */
 void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
 {
-	struct rvt_cq_wc *wc;
+	struct ib_uverbs_wc *uqueue = NULL;
+	struct ib_wc *kqueue = NULL;
+	struct rvt_cq_wc *u_wc = NULL;
+	struct rvt_k_cq_wc *k_wc = NULL;
 	unsigned long flags;
 	u32 head;
 	u32 next;
+	u32 tail;
 
 	spin_lock_irqsave(&cq->lock, flags);
 
+	if (cq->ip) {
+		u_wc = cq->queue;
+		uqueue = &u_wc->uqueue[0];
+		head = RDMA_READ_UAPI_ATOMIC(u_wc->head);
+		tail = RDMA_READ_UAPI_ATOMIC(u_wc->tail);
+	} else {
+		k_wc = cq->kqueue;
+		kqueue = &k_wc->kqueue[0];
+		head = k_wc->head;
+		tail = k_wc->tail;
+	}
+
 	/*
-	 * Note that the head pointer might be writable by user processes.
-	 * Take care to verify it is a sane value.
+	 * Note that the head pointer might be writable by
+	 * user processes.Take care to verify it is a sane value.
 	 */
-	wc = cq->queue;
-	head = wc->head;
 	if (head >= (unsigned)cq->ibcq.cqe) {
 		head = cq->ibcq.cqe;
 		next = 0;
@@ -83,7 +97,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
 		next = head + 1;
 	}
 
-	if (unlikely(next == wc->tail)) {
+	if (unlikely(next == tail)) {
 		spin_unlock_irqrestore(&cq->lock, flags);
 		if (cq->ibcq.event_handler) {
 			struct ib_event ev;
@@ -96,27 +110,27 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
 		return;
 	}
 	trace_rvt_cq_enter(cq, entry, head);
-	if (cq->ip) {
-		wc->uqueue[head].wr_id = entry->wr_id;
-		wc->uqueue[head].status = entry->status;
-		wc->uqueue[head].opcode = entry->opcode;
-		wc->uqueue[head].vendor_err = entry->vendor_err;
-		wc->uqueue[head].byte_len = entry->byte_len;
-		wc->uqueue[head].ex.imm_data = entry->ex.imm_data;
-		wc->uqueue[head].qp_num = entry->qp->qp_num;
-		wc->uqueue[head].src_qp = entry->src_qp;
-		wc->uqueue[head].wc_flags = entry->wc_flags;
-		wc->uqueue[head].pkey_index = entry->pkey_index;
-		wc->uqueue[head].slid = ib_lid_cpu16(entry->slid);
-		wc->uqueue[head].sl = entry->sl;
-		wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
-		wc->uqueue[head].port_num = entry->port_num;
+	if (uqueue) {
+		uqueue[head].wr_id = entry->wr_id;
+		uqueue[head].status = entry->status;
+		uqueue[head].opcode = entry->opcode;
+		uqueue[head].vendor_err = entry->vendor_err;
+		uqueue[head].byte_len = entry->byte_len;
+		uqueue[head].ex.imm_data = entry->ex.imm_data;
+		uqueue[head].qp_num = entry->qp->qp_num;
+		uqueue[head].src_qp = entry->src_qp;
+		uqueue[head].wc_flags = entry->wc_flags;
+		uqueue[head].pkey_index = entry->pkey_index;
+		uqueue[head].slid = ib_lid_cpu16(entry->slid);
+		uqueue[head].sl = entry->sl;
+		uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+		uqueue[head].port_num = entry->port_num;
 		/* Make sure entry is written before the head index. */
-		smp_wmb();
+		RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next);
 	} else {
-		wc->kqueue[head] = *entry;
+		kqueue[head] = *entry;
+		k_wc->head = next;
 	}
-	wc->head = next;
 
 	if (cq->notify == IB_CQ_NEXT_COMP ||
 	    (cq->notify == IB_CQ_SOLICITED &&
@@ -179,8 +193,9 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 {
 	struct ib_device *ibdev = ibcq->device;
 	struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
-	struct rvt_cq *cq = container_of(ibcq, struct rvt_cq, ibcq);
-	struct rvt_cq_wc *wc;
+	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
+	struct rvt_cq_wc *u_wc = NULL;
+	struct rvt_k_cq_wc *k_wc = NULL;
 	u32 sz;
 	unsigned int entries = attr->cqe;
 	int comp_vector = attr->comp_vector;
@@ -204,22 +219,28 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	 * We need to use vmalloc() in order to support mmap and large
 	 * numbers of entries.
 	 */
-	sz = sizeof(*wc);
-	if (udata && udata->outlen >= sizeof(__u64))
-		sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
-	else
-		sz += sizeof(struct ib_wc) * (entries + 1);
-	wc = udata ?
-		vmalloc_user(sz) :
-		vzalloc_node(sz, rdi->dparms.node);
-	if (!wc)
-		return -ENOMEM;
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		sz = sizeof(struct ib_uverbs_wc) * (entries + 1);
+		sz += sizeof(*u_wc);
+		u_wc = vmalloc_user(sz);
+		if (!u_wc)
+			return -ENOMEM;
+	} else {
+		sz = sizeof(struct ib_wc) * (entries + 1);
+		sz += sizeof(*k_wc);
+		k_wc = vzalloc_node(sz, rdi->dparms.node);
+		if (!k_wc)
+			return -ENOMEM;
+	}
+
 	/*
 	 * Return the address of the WC as the offset to mmap.
 	 * See rvt_mmap() for details.
 	 */
 	if (udata && udata->outlen >= sizeof(__u64)) {
-		cq->ip = rvt_create_mmap_info(rdi, sz, udata, wc);
+		int err;
+
+		cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc);
 		if (!cq->ip) {
 			err = -ENOMEM;
 			goto bail_wc;
@@ -264,7 +285,10 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	cq->notify = RVT_CQ_NONE;
 	spin_lock_init(&cq->lock);
 	INIT_WORK(&cq->comptask, send_complete);
-	cq->queue = wc;
+	if (u_wc)
+		cq->queue = u_wc;
+	else
+		cq->kqueue = k_wc;
 
 	trace_rvt_create_cq(cq, attr);
 	return 0;
@@ -272,7 +296,8 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 bail_ip:
 	kfree(cq->ip);
 bail_wc:
-	vfree(wc);
+	vfree(u_wc);
+	vfree(k_wc);
 	return err;
 }
 
@@ -322,9 +347,16 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
 	if (cq->notify != IB_CQ_NEXT_COMP)
 		cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
 
-	if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
-	    cq->queue->head != cq->queue->tail)
-		ret = 1;
+	if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+		if (cq->queue) {
+			if (RDMA_READ_UAPI_ATOMIC(cq->queue->head) !=
+				RDMA_READ_UAPI_ATOMIC(cq->queue->tail))
+				ret = 1;
+		} else {
+			if (cq->kqueue->head != cq->kqueue->tail)
+				ret = 1;
+		}
+	}
 
 	spin_unlock_irqrestore(&cq->lock, flags);
 
@@ -340,12 +372,14 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
 int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 {
 	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
-	struct rvt_cq_wc *old_wc;
-	struct rvt_cq_wc *wc;
 	u32 head, tail, n;
 	int ret;
 	u32 sz;
 	struct rvt_dev_info *rdi = cq->rdi;
+	struct rvt_cq_wc *u_wc = NULL;
+	struct rvt_cq_wc *old_u_wc = NULL;
+	struct rvt_k_cq_wc *k_wc = NULL;
+	struct rvt_k_cq_wc *old_k_wc = NULL;
 
 	if (cqe < 1 || cqe > rdi->dparms.props.max_cqe)
 		return -EINVAL;
@@ -353,17 +387,19 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 	/*
 	 * Need to use vmalloc() if we want to support large #s of entries.
 	 */
-	sz = sizeof(*wc);
-	if (udata && udata->outlen >= sizeof(__u64))
-		sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
-	else
-		sz += sizeof(struct ib_wc) * (cqe + 1);
-	wc = udata ?
-		vmalloc_user(sz) :
-		vzalloc_node(sz, rdi->dparms.node);
-	if (!wc)
-		return -ENOMEM;
-
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		sz = sizeof(struct ib_uverbs_wc) * (cqe + 1);
+		sz += sizeof(*u_wc);
+		u_wc = vmalloc_user(sz);
+		if (!u_wc)
+			return -ENOMEM;
+	} else {
+		sz = sizeof(struct ib_wc) * (cqe + 1);
+		sz += sizeof(*k_wc);
+		k_wc = vzalloc_node(sz, rdi->dparms.node);
+		if (!k_wc)
+			return -ENOMEM;
+	}
 	/* Check that we can write the offset to mmap. */
 	if (udata && udata->outlen >= sizeof(__u64)) {
 		__u64 offset = 0;
@@ -378,11 +414,18 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 	 * Make sure head and tail are sane since they
 	 * might be user writable.
 	 */
-	old_wc = cq->queue;
-	head = old_wc->head;
+	if (u_wc) {
+		old_u_wc = cq->queue;
+		head = RDMA_READ_UAPI_ATOMIC(old_u_wc->head);
+		tail = RDMA_READ_UAPI_ATOMIC(old_u_wc->tail);
+	} else {
+		old_k_wc = cq->kqueue;
+		head = old_k_wc->head;
+		tail = old_k_wc->tail;
+	}
+
 	if (head > (u32)cq->ibcq.cqe)
 		head = (u32)cq->ibcq.cqe;
-	tail = old_wc->tail;
 	if (tail > (u32)cq->ibcq.cqe)
 		tail = (u32)cq->ibcq.cqe;
 	if (head < tail)
@@ -394,27 +437,36 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 		goto bail_unlock;
 	}
 	for (n = 0; tail != head; n++) {
-		if (cq->ip)
-			wc->uqueue[n] = old_wc->uqueue[tail];
+		if (u_wc)
+			u_wc->uqueue[n] = old_u_wc->uqueue[tail];
 		else
-			wc->kqueue[n] = old_wc->kqueue[tail];
+			k_wc->kqueue[n] = old_k_wc->kqueue[tail];
 		if (tail == (u32)cq->ibcq.cqe)
 			tail = 0;
 		else
 			tail++;
 	}
 	cq->ibcq.cqe = cqe;
-	wc->head = n;
-	wc->tail = 0;
-	cq->queue = wc;
+	if (u_wc) {
+		RDMA_WRITE_UAPI_ATOMIC(u_wc->head, n);
+		RDMA_WRITE_UAPI_ATOMIC(u_wc->tail, 0);
+		cq->queue = u_wc;
+	} else {
+		k_wc->head = n;
+		k_wc->tail = 0;
+		cq->kqueue = k_wc;
+	}
 	spin_unlock_irq(&cq->lock);
 
-	vfree(old_wc);
+	if (u_wc)
+		vfree(old_u_wc);
+	else
+		vfree(old_k_wc);
 
 	if (cq->ip) {
 		struct rvt_mmap_info *ip = cq->ip;
 
-		rvt_update_mmap_info(rdi, ip, sz, wc);
+		rvt_update_mmap_info(rdi, ip, sz, u_wc);
 
 		/*
 		 * Return the offset to mmap.
@@ -438,7 +490,9 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 bail_unlock:
 	spin_unlock_irq(&cq->lock);
 bail_free:
-	vfree(wc);
+	vfree(u_wc);
+	vfree(k_wc);
+
 	return ret;
 }
 
@@ -456,7 +510,7 @@ bail_free:
 int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
 {
 	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
-	struct rvt_cq_wc *wc;
+	struct rvt_k_cq_wc *wc;
 	unsigned long flags;
 	int npolled;
 	u32 tail;
@@ -467,7 +521,7 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
 
 	spin_lock_irqsave(&cq->lock, flags);
 
-	wc = cq->queue;
+	wc = cq->kqueue;
 	tail = wc->tail;
 	if (tail > (u32)cq->ibcq.cqe)
 		tail = (u32)cq->ibcq.cqe;
diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h
index 75dc65c0bfb8..ab22860a63e2 100644
--- a/include/rdma/rdmavt_cq.h
+++ b/include/rdma/rdmavt_cq.h
@@ -60,19 +60,28 @@
  */
 #define RVT_CQ_NONE      (IB_CQ_NEXT_COMP + 1)
 
+/*
+ * Define read macro that apply smp_load_acquire memory barrier
+ * when reading indice of circular buffer that mmaped to user space.
+ */
+#define RDMA_READ_UAPI_ATOMIC(member) smp_load_acquire(&(member).val)
+
+/*
+ * Define write macro that uses smp_store_release memory barrier
+ * when writing indice of circular buffer that mmaped to user space.
+ */
+#define RDMA_WRITE_UAPI_ATOMIC(member, x) smp_store_release(&(member).val, x)
+#include <rdma/rvt-abi.h>
+
 /*
  * This structure is used to contain the head pointer, tail pointer,
  * and completion queue entries as a single memory allocation so
  * it can be mmap'ed into user space.
  */
-struct rvt_cq_wc {
+struct rvt_k_cq_wc {
 	u32 head;               /* index of next entry to fill */
 	u32 tail;               /* index of next ib_poll_cq() entry */
-	union {
-		/* these are actually size ibcq.cqe + 1 */
-		struct ib_uverbs_wc uqueue[0];
-		struct ib_wc kqueue[0];
-	};
+	struct ib_wc kqueue[];
 };
 
 /*
@@ -88,6 +97,7 @@ struct rvt_cq {
 	struct rvt_dev_info *rdi;
 	struct rvt_cq_wc *queue;
 	struct rvt_mmap_info *ip;
+	struct rvt_k_cq_wc *kqueue;
 };
 
 static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq)
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 84d0f36afc2f..7fcd687af278 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -820,6 +820,38 @@ struct rvt_qp_iter {
 	int n;
 };
 
+/**
+ * ib_cq_tail - Return tail index of cq buffer
+ * @send_cq - The cq for send
+ *
+ * This is called in qp_iter_print to get tail
+ * of cq buffer.
+ */
+static inline u32 ib_cq_tail(struct ib_cq *send_cq)
+{
+	struct rvt_cq *cq = ibcq_to_rvtcq(send_cq);
+
+	return ibcq_to_rvtcq(send_cq)->ip ?
+	       RDMA_READ_UAPI_ATOMIC(cq->queue->tail) :
+	       ibcq_to_rvtcq(send_cq)->kqueue->tail;
+}
+
+/**
+ * ib_cq_head - Return head index of cq buffer
+ * @send_cq - The cq for send
+ *
+ * This is called in qp_iter_print to get head
+ * of cq buffer.
+ */
+static inline u32 ib_cq_head(struct ib_cq *send_cq)
+{
+	struct rvt_cq *cq = ibcq_to_rvtcq(send_cq);
+
+	return ibcq_to_rvtcq(send_cq)->ip ?
+	       RDMA_READ_UAPI_ATOMIC(cq->queue->head) :
+	       ibcq_to_rvtcq(send_cq)->kqueue->head;
+}
+
 struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi,
 				     u64 v,
 				     void (*cb)(struct rvt_qp *qp, u64 v));
diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h
new file mode 100644
index 000000000000..8e5f7e0c15fe
--- /dev/null
+++ b/include/uapi/rdma/rvt-abi.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+#ifndef RVT_ABI_USER_H
+#define RVT_ABI_USER_H
+
+#include <linux/types.h>
+#include <rdma/ib_user_verbs.h>
+#ifndef RDMA_ATOMIC_UAPI
+#define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name
+#endif
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct rvt_cq_wc {
+	/* index of next entry to fill */
+	RDMA_ATOMIC_UAPI(__u32, head);
+	/* index of next ib_poll_cq() entry */
+	RDMA_ATOMIC_UAPI(__u32, tail);
+
+	/* these are actually size ibcq.cqe + 1 */
+	struct ib_uverbs_wc uqueue[];
+};
+
+#endif /* RVT_ABI_USER_H */
-- 
cgit v1.2.3


From dabac6e460ce8473f1e685432a8ab7818d81a1f1 Mon Sep 17 00:00:00 2001
From: Kamenee Arumugam <kamenee.arumugam@intel.com>
Date: Fri, 28 Jun 2019 14:04:24 -0400
Subject: IB/hfi1: Move receive work queue struct into uapi directory

The rvt_rwqe and rvt_rwq struct elements are shared between rdmavt and the
providers but are not in uapi directory.  As per the comment in
https://marc.info/?l=linux-rdma&m=152296522708522&w=2, The hfi1 driver and
the rdma core driver are not using shared structures in the uapi
directory.

Move rvt_rwqe and rvt_rwq struct into rvt-abi.h header in uapi directory.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Kamenee Arumugam <kamenee.arumugam@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/rdmavt/qp.c  | 152 +++++++++++++++++++++++++++----------
 drivers/infiniband/sw/rdmavt/qp.h  |   2 +
 drivers/infiniband/sw/rdmavt/rc.c  |  10 ++-
 drivers/infiniband/sw/rdmavt/srq.c |  59 +++++++-------
 include/rdma/rdmavt_qp.h           |  52 ++++++++-----
 include/uapi/rdma/rvt-abi.h        |  29 +++++++
 6 files changed, 212 insertions(+), 92 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 0d804a58f954..1384060f175d 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -802,6 +802,46 @@ static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
 	}
 }
 
+/**
+ * rvt_alloc_rq - allocate memory for user or kernel buffer
+ * @rq: receive queue data structure
+ * @size: number of request queue entries
+ * @node: The NUMA node
+ * @udata: True if user data is available or not false
+ *
+ * Return: If memory allocation failed, return -ENONEM
+ * This function is used by both shared receive
+ * queues and non-shared receive queues to allocate
+ * memory.
+ */
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
+		 struct ib_udata *udata)
+{
+	if (udata) {
+		rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size);
+		if (!rq->wq)
+			goto bail;
+		/* need kwq with no buffers */
+		rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node);
+		if (!rq->kwq)
+			goto bail;
+		rq->kwq->curr_wq = rq->wq->wq;
+	} else {
+		/* need kwq with buffers */
+		rq->kwq =
+			vzalloc_node(sizeof(struct rvt_krwq) + size, node);
+		if (!rq->kwq)
+			goto bail;
+		rq->kwq->curr_wq = rq->kwq->wq;
+	}
+
+	spin_lock_init(&rq->lock);
+	return 0;
+bail:
+	rvt_free_rq(rq);
+	return -ENOMEM;
+}
+
 /**
  * rvt_init_qp - initialize the QP state to the reset state
  * @qp: the QP to init or reinit
@@ -852,10 +892,6 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 	qp->s_tail_ack_queue = 0;
 	qp->s_acked_ack_queue = 0;
 	qp->s_num_rd_atomic = 0;
-	if (qp->r_rq.wq) {
-		qp->r_rq.wq->head = 0;
-		qp->r_rq.wq->tail = 0;
-	}
 	qp->r_sge.num_sge = 0;
 	atomic_set(&qp->s_reserved_used, 0);
 }
@@ -1046,17 +1082,12 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
 			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
 				sizeof(struct rvt_rwqe);
-			if (udata)
-				qp->r_rq.wq = vmalloc_user(
-						sizeof(struct rvt_rwq) +
-						qp->r_rq.size * sz);
-			else
-				qp->r_rq.wq = vzalloc_node(
-						sizeof(struct rvt_rwq) +
-						qp->r_rq.size * sz,
-						rdi->dparms.node);
-			if (!qp->r_rq.wq)
+			err = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz,
+					   rdi->dparms.node, udata);
+			if (err) {
+				ret = ERR_PTR(err);
 				goto bail_driver_priv;
+			}
 		}
 
 		/*
@@ -1202,8 +1233,7 @@ bail_qpn:
 	rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
 
 bail_rq_wq:
-	if (!qp->ip)
-		vfree(qp->r_rq.wq);
+	rvt_free_rq(&qp->r_rq);
 
 bail_driver_priv:
 	rdi->driver_f.qp_priv_free(rdi, qp);
@@ -1269,19 +1299,26 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
 	}
 	wc.status = IB_WC_WR_FLUSH_ERR;
 
-	if (qp->r_rq.wq) {
-		struct rvt_rwq *wq;
+	if (qp->r_rq.kwq) {
 		u32 head;
 		u32 tail;
+		struct rvt_rwq *wq = NULL;
+		struct rvt_krwq *kwq = NULL;
 
 		spin_lock(&qp->r_rq.lock);
-
+		/* qp->ip used to validate if there is a  user buffer mmaped */
+		if (qp->ip) {
+			wq = qp->r_rq.wq;
+			head = RDMA_READ_UAPI_ATOMIC(wq->head);
+			tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
+		} else {
+			kwq = qp->r_rq.kwq;
+			head = kwq->head;
+			tail = kwq->tail;
+		}
 		/* sanity check pointers before trusting them */
-		wq = qp->r_rq.wq;
-		head = wq->head;
 		if (head >= qp->r_rq.size)
 			head = 0;
-		tail = wq->tail;
 		if (tail >= qp->r_rq.size)
 			tail = 0;
 		while (tail != head) {
@@ -1290,8 +1327,10 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
 				tail = 0;
 			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
 		}
-		wq->tail = tail;
-
+		if (qp->ip)
+			RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
+		else
+			kwq->tail = tail;
 		spin_unlock(&qp->r_rq.lock);
 	} else if (qp->ibqp.event_handler) {
 		ret = 1;
@@ -1634,8 +1673,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 
 	if (qp->ip)
 		kref_put(&qp->ip->ref, rvt_release_mmap_info);
-	else
-		vfree(qp->r_rq.wq);
+	kvfree(qp->r_rq.kwq);
 	rdi->driver_f.qp_priv_free(rdi, qp);
 	kfree(qp->s_ack_queue);
 	rdma_destroy_ah_attr(&qp->remote_ah_attr);
@@ -1721,7 +1759,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 		  const struct ib_recv_wr **bad_wr)
 {
 	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
-	struct rvt_rwq *wq = qp->r_rq.wq;
+	struct rvt_krwq *wq = qp->r_rq.kwq;
 	unsigned long flags;
 	int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
 				!qp->ibqp.srq;
@@ -1746,7 +1784,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 		next = wq->head + 1;
 		if (next >= qp->r_rq.size)
 			next = 0;
-		if (next == wq->tail) {
+		if (next == READ_ONCE(wq->tail)) {
 			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
 			*bad_wr = wr;
 			return -ENOMEM;
@@ -1770,8 +1808,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 			 * Make sure queue entry is written
 			 * before the head index.
 			 */
-			smp_wmb();
-			wq->head = next;
+			smp_store_release(&wq->head, next);
 		}
 		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
 	}
@@ -2141,7 +2178,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 		      const struct ib_recv_wr **bad_wr)
 {
 	struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
-	struct rvt_rwq *wq;
+	struct rvt_krwq *wq;
 	unsigned long flags;
 
 	for (; wr; wr = wr->next) {
@@ -2155,11 +2192,11 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 		}
 
 		spin_lock_irqsave(&srq->rq.lock, flags);
-		wq = srq->rq.wq;
+		wq = srq->rq.kwq;
 		next = wq->head + 1;
 		if (next >= srq->rq.size)
 			next = 0;
-		if (next == wq->tail) {
+		if (next == READ_ONCE(wq->tail)) {
 			spin_unlock_irqrestore(&srq->rq.lock, flags);
 			*bad_wr = wr;
 			return -ENOMEM;
@@ -2171,8 +2208,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 		for (i = 0; i < wr->num_sge; i++)
 			wqe->sg_list[i] = wr->sg_list[i];
 		/* Make sure queue entry is written before the head index. */
-		smp_wmb();
-		wq->head = next;
+		smp_store_release(&wq->head, next);
 		spin_unlock_irqrestore(&srq->rq.lock, flags);
 	}
 	return 0;
@@ -2229,6 +2265,25 @@ bad_lkey:
 	return 0;
 }
 
+/**
+ * get_rvt_head - get head indices of the circular buffer
+ * @rq: data structure for request queue entry
+ * @ip: the QP
+ *
+ * Return - head index value
+ */
+static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip)
+{
+	u32 head;
+
+	if (ip)
+		head = RDMA_READ_UAPI_ATOMIC(rq->wq->head);
+	else
+		head = rq->kwq->head;
+
+	return head;
+}
+
 /**
  * rvt_get_rwqe - copy the next RWQE into the QP's RWQE
  * @qp: the QP
@@ -2243,21 +2298,26 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
 {
 	unsigned long flags;
 	struct rvt_rq *rq;
+	struct rvt_krwq *kwq;
 	struct rvt_rwq *wq;
 	struct rvt_srq *srq;
 	struct rvt_rwqe *wqe;
 	void (*handler)(struct ib_event *, void *);
 	u32 tail;
+	u32 head;
 	int ret;
+	void *ip = NULL;
 
 	if (qp->ibqp.srq) {
 		srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
 		handler = srq->ibsrq.event_handler;
 		rq = &srq->rq;
+		ip = srq->ip;
 	} else {
 		srq = NULL;
 		handler = NULL;
 		rq = &qp->r_rq;
+		ip = qp->ip;
 	}
 
 	spin_lock_irqsave(&rq->lock, flags);
@@ -2265,17 +2325,24 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
 		ret = 0;
 		goto unlock;
 	}
+	if (ip) {
+		wq = rq->wq;
+		tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
+	} else {
+		kwq = rq->kwq;
+		tail = kwq->tail;
+	}
 
-	wq = rq->wq;
-	tail = wq->tail;
 	/* Validate tail before using it since it is user writable. */
 	if (tail >= rq->size)
 		tail = 0;
-	if (unlikely(tail == wq->head)) {
+
+	head = get_rvt_head(rq, ip);
+	if (unlikely(tail == head)) {
 		ret = 0;
 		goto unlock;
 	}
-	/* Make sure entry is read after head index is read. */
+	/* Make sure entry is read after the count is read. */
 	smp_rmb();
 	wqe = rvt_get_rwqe_ptr(rq, tail);
 	/*
@@ -2285,7 +2352,10 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
 	 */
 	if (++tail >= rq->size)
 		tail = 0;
-	wq->tail = tail;
+	if (ip)
+		RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
+	else
+		kwq->tail = tail;
 	if (!wr_id_only && !init_sge(qp, wqe)) {
 		ret = -1;
 		goto unlock;
@@ -2301,7 +2371,7 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
 		 * Validate head pointer value and compute
 		 * the number of remaining WQEs.
 		 */
-		n = wq->head;
+		n = get_rvt_head(rq, ip);
 		if (n >= rq->size)
 			n = 0;
 		if (n < tail)
diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h
index 6db1619389b0..2cdba1283bf6 100644
--- a/drivers/infiniband/sw/rdmavt/qp.h
+++ b/drivers/infiniband/sw/rdmavt/qp.h
@@ -68,4 +68,6 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 		      const struct ib_recv_wr **bad_wr);
 int rvt_wss_init(struct rvt_dev_info *rdi);
 void rvt_wss_exit(struct rvt_dev_info *rdi);
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
+		 struct ib_udata *udata);
 #endif          /* DEF_RVTQP_H */
diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c
index 09f0cf538be6..44cc7ee1b321 100644
--- a/drivers/infiniband/sw/rdmavt/rc.c
+++ b/drivers/infiniband/sw/rdmavt/rc.c
@@ -104,15 +104,19 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp)
 	} else {
 		u32 min, max, x;
 		u32 credits;
-		struct rvt_rwq *wq = qp->r_rq.wq;
 		u32 head;
 		u32 tail;
 
 		/* sanity check pointers before trusting them */
-		head = wq->head;
+		if (qp->ip) {
+			head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head);
+			tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail);
+		} else {
+			head = READ_ONCE(qp->r_rq.kwq->head);
+			tail = READ_ONCE(qp->r_rq.kwq->tail);
+		}
 		if (head >= qp->r_rq.size)
 			head = 0;
-		tail = wq->tail;
 		if (tail >= qp->r_rq.size)
 			tail = 0;
 		/*
diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c
index 8d6b3e764255..d306f6547cba 100644
--- a/drivers/infiniband/sw/rdmavt/srq.c
+++ b/drivers/infiniband/sw/rdmavt/srq.c
@@ -52,7 +52,7 @@
 
 #include "srq.h"
 #include "vt.h"
-
+#include "qp.h"
 /**
  * rvt_driver_srq_init - init srq resources on a per driver basis
  * @rdi: rvt dev structure
@@ -97,11 +97,8 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
 	srq->rq.max_sge = srq_init_attr->attr.max_sge;
 	sz = sizeof(struct ib_sge) * srq->rq.max_sge +
 		sizeof(struct rvt_rwqe);
-	srq->rq.wq = udata ?
-		vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz) :
-		vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz,
-			     dev->dparms.node);
-	if (!srq->rq.wq) {
+	if (rvt_alloc_rq(&srq->rq, srq->rq.size * sz,
+			 dev->dparms.node, udata)) {
 		ret = -ENOMEM;
 		goto bail_srq;
 	}
@@ -152,7 +149,7 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
 bail_ip:
 	kfree(srq->ip);
 bail_wq:
-	vfree(srq->rq.wq);
+	rvt_free_rq(&srq->rq);
 bail_srq:
 	return ret;
 }
@@ -172,11 +169,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 {
 	struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
 	struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device);
-	struct rvt_rwq *wq;
+	struct rvt_rq tmp_rq = {};
 	int ret = 0;
 
 	if (attr_mask & IB_SRQ_MAX_WR) {
-		struct rvt_rwq *owq;
+		struct rvt_krwq *okwq = NULL;
+		struct rvt_rwq *owq = NULL;
 		struct rvt_rwqe *p;
 		u32 sz, size, n, head, tail;
 
@@ -185,17 +183,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		    ((attr_mask & IB_SRQ_LIMIT) ?
 		     attr->srq_limit : srq->limit) > attr->max_wr)
 			return -EINVAL;
-
 		sz = sizeof(struct rvt_rwqe) +
 			srq->rq.max_sge * sizeof(struct ib_sge);
 		size = attr->max_wr + 1;
-		wq = udata ?
-			vmalloc_user(sizeof(struct rvt_rwq) + size * sz) :
-			vzalloc_node(sizeof(struct rvt_rwq) + size * sz,
-				     dev->dparms.node);
-		if (!wq)
+		if (rvt_alloc_rq(&tmp_rq, size * sz, dev->dparms.node,
+				 udata))
 			return -ENOMEM;
-
 		/* Check that we can write the offset to mmap. */
 		if (udata && udata->inlen >= sizeof(__u64)) {
 			__u64 offset_addr;
@@ -218,9 +211,15 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		 * validate head and tail pointer values and compute
 		 * the number of remaining WQEs.
 		 */
-		owq = srq->rq.wq;
-		head = owq->head;
-		tail = owq->tail;
+		if (udata) {
+			owq = srq->rq.wq;
+			head = RDMA_READ_UAPI_ATOMIC(owq->head);
+			tail = RDMA_READ_UAPI_ATOMIC(owq->tail);
+		} else {
+			okwq = srq->rq.kwq;
+			head = okwq->head;
+			tail = okwq->tail;
+		}
 		if (head >= srq->rq.size || tail >= srq->rq.size) {
 			ret = -EINVAL;
 			goto bail_unlock;
@@ -235,7 +234,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 			goto bail_unlock;
 		}
 		n = 0;
-		p = wq->wq;
+		p = tmp_rq.kwq->curr_wq;
 		while (tail != head) {
 			struct rvt_rwqe *wqe;
 			int i;
@@ -250,22 +249,29 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 			if (++tail >= srq->rq.size)
 				tail = 0;
 		}
-		srq->rq.wq = wq;
+		srq->rq.kwq = tmp_rq.kwq;
+		if (udata) {
+			srq->rq.wq = tmp_rq.wq;
+			RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->head, n);
+			RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->tail, 0);
+		} else {
+			tmp_rq.kwq->head = n;
+			tmp_rq.kwq->tail = 0;
+		}
 		srq->rq.size = size;
-		wq->head = n;
-		wq->tail = 0;
 		if (attr_mask & IB_SRQ_LIMIT)
 			srq->limit = attr->srq_limit;
 		spin_unlock_irq(&srq->rq.lock);
 
 		vfree(owq);
+		kvfree(okwq);
 
 		if (srq->ip) {
 			struct rvt_mmap_info *ip = srq->ip;
 			struct rvt_dev_info *dev = ib_to_rvt(srq->ibsrq.device);
 			u32 s = sizeof(struct rvt_rwq) + size * sz;
 
-			rvt_update_mmap_info(dev, ip, s, wq);
+			rvt_update_mmap_info(dev, ip, s, tmp_rq.wq);
 
 			/*
 			 * Return the offset to mmap.
@@ -301,7 +307,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 bail_unlock:
 	spin_unlock_irq(&srq->rq.lock);
 bail_free:
-	vfree(wq);
+	rvt_free_rq(&tmp_rq);
 	return ret;
 }
 
@@ -336,6 +342,5 @@ void rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 	spin_unlock(&dev->n_srqs_lock);
 	if (srq->ip)
 		kref_put(&srq->ip->ref, rvt_release_mmap_info);
-	else
-		vfree(srq->rq.wq);
+	kvfree(srq->rq.kwq);
 }
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 7fcd687af278..ee55fd04f6da 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -52,6 +52,7 @@
 #include <rdma/ib_pack.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/rdmavt_cq.h>
+#include <rdma/rvt-abi.h>
 /*
  * Atomic bit definitions for r_aflags.
  */
@@ -177,33 +178,27 @@ struct rvt_swqe {
 	struct rvt_sge sg_list[0];
 };
 
-/*
- * Receive work request queue entry.
- * The size of the sg_list is determined when the QP (or SRQ) is created
- * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+/**
+ * struct rvt_krwq - kernel struct receive work request
+ * @head: index of next entry to fill
+ * @tail: index of next entry to pull
+ * @count: count is aproximate of total receive enteries posted
+ * @rvt_rwqe: struct of receive work request queue entry
+ *
+ * This structure is used to contain the head pointer,
+ * tail pointer and receive work queue entries for kernel
+ * mode user.
  */
-struct rvt_rwqe {
-	u64 wr_id;
-	u8 num_sge;
-	struct ib_sge sg_list[0];
-};
-
-/*
- * This structure is used to contain the head pointer, tail pointer,
- * and receive work queue entries as a single memory allocation so
- * it can be mmap'ed into user space.
- * Note that the wq array elements are variable size so you can't
- * just index into the array to get the N'th element;
- * use get_rwqe_ptr() instead.
- */
-struct rvt_rwq {
+struct rvt_krwq {
 	u32 head;               /* new work requests posted to the head */
 	u32 tail;               /* receives pull requests from here. */
-	struct rvt_rwqe wq[0];
+	struct rvt_rwqe *curr_wq;
+	struct rvt_rwqe wq[];
 };
 
 struct rvt_rq {
 	struct rvt_rwq *wq;
+	struct rvt_krwq *kwq;
 	u32 size;               /* size of RWQE array */
 	u8 max_sge;
 	/* protect changes in this struct */
@@ -472,7 +467,7 @@ static inline struct rvt_swqe *rvt_get_swqe_ptr(struct rvt_qp *qp,
 static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n)
 {
 	return (struct rvt_rwqe *)
-		((char *)rq->wq->wq +
+		((char *)rq->kwq->curr_wq +
 		 (sizeof(struct rvt_rwqe) +
 		  rq->max_sge * sizeof(struct ib_sge)) * n);
 }
@@ -852,6 +847,21 @@ static inline u32 ib_cq_head(struct ib_cq *send_cq)
 	       ibcq_to_rvtcq(send_cq)->kqueue->head;
 }
 
+/**
+ * rvt_free_rq - free memory allocated for rvt_rq struct
+ * @rvt_rq: request queue data structure
+ *
+ * This function should only be called if the rvt_mmap_info()
+ * has not succeeded.
+ */
+static inline void rvt_free_rq(struct rvt_rq *rq)
+{
+	kvfree(rq->kwq);
+	rq->kwq = NULL;
+	vfree(rq->wq);
+	rq->wq = NULL;
+}
+
 struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi,
 				     u64 v,
 				     void (*cb)(struct rvt_qp *qp, u64 v));
diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h
index 8e5f7e0c15fe..d2e35d24f1a9 100644
--- a/include/uapi/rdma/rvt-abi.h
+++ b/include/uapi/rdma/rvt-abi.h
@@ -10,6 +10,7 @@
 
 #include <linux/types.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
 #ifndef RDMA_ATOMIC_UAPI
 #define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name
 #endif
@@ -29,4 +30,32 @@ struct rvt_cq_wc {
 	struct ib_uverbs_wc uqueue[];
 };
 
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct rvt_rwqe {
+	__u64 wr_id;
+	__u8 num_sge;
+	__u8 padding[7];
+	struct ib_sge sg_list[];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() for user space and rvt_get_rwqe_ptr()
+ * for kernel space.
+ */
+struct rvt_rwq {
+	/* new work requests posted to the head */
+	RDMA_ATOMIC_UAPI(__u32, head);
+	/* receives pull requests from here. */
+	RDMA_ATOMIC_UAPI(__u32, tail);
+	struct rvt_rwqe wq[];
+};
 #endif /* RVT_ABI_USER_H */
-- 
cgit v1.2.3


From f592ae3c999fbe4faeeb90dfde8ff7da49ee4ae6 Mon Sep 17 00:00:00 2001
From: Kamenee Arumugam <kamenee.arumugam@intel.com>
Date: Fri, 28 Jun 2019 14:04:30 -0400
Subject: IB/rdmavt: Fracture single lock used for posting and processing RWQEs

Usage of single lock prevents fetching posted and processing receive work
queue entries from progressing simultaneously and impacts overall
performance.

Fracture the single lock used for posting and processing Receive Work
Queue Entries (RWQEs) to allow the circular buffer to be filled and
emptied at the same time. Two new spinlocks - one for the producers and
one for the consumers used for posting and processing RWQEs simultaneously
and the two indices are define on two different cache lines. The threshold
count is used to avoid reading other index in different cache line every
time.

Signed-off-by: Harish Chegondi <harish.chegondi@intel.com>
Signed-off-by: Kamenee Arumugam <kamenee.arumugam@intel.com>
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/rdmavt/qp.c  | 97 ++++++++++++++++++++++++--------------
 drivers/infiniband/sw/rdmavt/rc.c  | 43 +++++++++--------
 drivers/infiniband/sw/rdmavt/srq.c | 10 ++--
 include/rdma/rdmavt_qp.h           |  7 +++
 4 files changed, 97 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 1384060f175d..200b292be63e 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -58,6 +58,8 @@
 #include "vt.h"
 #include "trace.h"
 
+#define RVT_RWQ_COUNT_THRESHOLD 16
+
 static void rvt_rc_timeout(struct timer_list *t);
 
 /*
@@ -835,7 +837,8 @@ int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
 		rq->kwq->curr_wq = rq->kwq->wq;
 	}
 
-	spin_lock_init(&rq->lock);
+	spin_lock_init(&rq->kwq->p_lock);
+	spin_lock_init(&rq->kwq->c_lock);
 	return 0;
 bail:
 	rvt_free_rq(rq);
@@ -892,6 +895,8 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 	qp->s_tail_ack_queue = 0;
 	qp->s_acked_ack_queue = 0;
 	qp->s_num_rd_atomic = 0;
+	if (qp->r_rq.kwq)
+		qp->r_rq.kwq->count = qp->r_rq.size;
 	qp->r_sge.num_sge = 0;
 	atomic_set(&qp->s_reserved_used, 0);
 }
@@ -1097,7 +1102,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 		spin_lock_init(&qp->r_lock);
 		spin_lock_init(&qp->s_hlock);
 		spin_lock_init(&qp->s_lock);
-		spin_lock_init(&qp->r_rq.lock);
 		atomic_set(&qp->refcount, 0);
 		atomic_set(&qp->local_ops_pending, 0);
 		init_waitqueue_head(&qp->wait);
@@ -1305,7 +1309,7 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
 		struct rvt_rwq *wq = NULL;
 		struct rvt_krwq *kwq = NULL;
 
-		spin_lock(&qp->r_rq.lock);
+		spin_lock(&qp->r_rq.kwq->c_lock);
 		/* qp->ip used to validate if there is a  user buffer mmaped */
 		if (qp->ip) {
 			wq = qp->r_rq.wq;
@@ -1331,7 +1335,7 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
 			RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
 		else
 			kwq->tail = tail;
-		spin_unlock(&qp->r_rq.lock);
+		spin_unlock(&qp->r_rq.kwq->c_lock);
 	} else if (qp->ibqp.event_handler) {
 		ret = 1;
 	}
@@ -1780,12 +1784,12 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 			return -EINVAL;
 		}
 
-		spin_lock_irqsave(&qp->r_rq.lock, flags);
+		spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags);
 		next = wq->head + 1;
 		if (next >= qp->r_rq.size)
 			next = 0;
 		if (next == READ_ONCE(wq->tail)) {
-			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+			spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
 			*bad_wr = wr;
 			return -ENOMEM;
 		}
@@ -1810,7 +1814,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 			 */
 			smp_store_release(&wq->head, next);
 		}
-		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+		spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
 	}
 	return 0;
 }
@@ -2191,13 +2195,13 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 			return -EINVAL;
 		}
 
-		spin_lock_irqsave(&srq->rq.lock, flags);
+		spin_lock_irqsave(&srq->rq.kwq->p_lock, flags);
 		wq = srq->rq.kwq;
 		next = wq->head + 1;
 		if (next >= srq->rq.size)
 			next = 0;
 		if (next == READ_ONCE(wq->tail)) {
-			spin_unlock_irqrestore(&srq->rq.lock, flags);
+			spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
 			*bad_wr = wr;
 			return -ENOMEM;
 		}
@@ -2209,7 +2213,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 			wqe->sg_list[i] = wr->sg_list[i];
 		/* Make sure queue entry is written before the head index. */
 		smp_store_release(&wq->head, next);
-		spin_unlock_irqrestore(&srq->rq.lock, flags);
+		spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
 	}
 	return 0;
 }
@@ -2265,6 +2269,31 @@ bad_lkey:
 	return 0;
 }
 
+/**
+ * get_count - count numbers of request work queue entries
+ * in circular buffer
+ * @rq: data structure for request queue entry
+ * @tail: tail indices of the circular buffer
+ * @head: head indices of the circular buffer
+ *
+ * Return - total number of entries in the circular buffer
+ */
+static u32 get_count(struct rvt_rq *rq, u32 tail, u32 head)
+{
+	u32 count;
+
+	count = head;
+
+	if (count >= rq->size)
+		count = 0;
+	if (count < tail)
+		count += rq->size - tail;
+	else
+		count -= tail;
+
+	return count;
+}
+
 /**
  * get_rvt_head - get head indices of the circular buffer
  * @rq: data structure for request queue entry
@@ -2298,7 +2327,7 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
 {
 	unsigned long flags;
 	struct rvt_rq *rq;
-	struct rvt_krwq *kwq;
+	struct rvt_krwq *kwq = NULL;
 	struct rvt_rwq *wq;
 	struct rvt_srq *srq;
 	struct rvt_rwqe *wqe;
@@ -2320,16 +2349,16 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
 		ip = qp->ip;
 	}
 
-	spin_lock_irqsave(&rq->lock, flags);
+	spin_lock_irqsave(&rq->kwq->c_lock, flags);
 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
 		ret = 0;
 		goto unlock;
 	}
+	kwq = rq->kwq;
 	if (ip) {
 		wq = rq->wq;
 		tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
 	} else {
-		kwq = rq->kwq;
 		tail = kwq->tail;
 	}
 
@@ -2337,8 +2366,11 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
 	if (tail >= rq->size)
 		tail = 0;
 
-	head = get_rvt_head(rq, ip);
-	if (unlikely(tail == head)) {
+	if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) {
+		head = get_rvt_head(rq, ip);
+		kwq->count = get_count(rq, tail, head);
+	}
+	if (unlikely(kwq->count == 0)) {
 		ret = 0;
 		goto unlock;
 	}
@@ -2362,36 +2394,31 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
 	}
 	qp->r_wr_id = wqe->wr_id;
 
+	kwq->count--;
 	ret = 1;
 	set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
 	if (handler) {
-		u32 n;
-
 		/*
 		 * Validate head pointer value and compute
 		 * the number of remaining WQEs.
 		 */
-		n = get_rvt_head(rq, ip);
-		if (n >= rq->size)
-			n = 0;
-		if (n < tail)
-			n += rq->size - tail;
-		else
-			n -= tail;
-		if (n < srq->limit) {
-			struct ib_event ev;
-
-			srq->limit = 0;
-			spin_unlock_irqrestore(&rq->lock, flags);
-			ev.device = qp->ibqp.device;
-			ev.element.srq = qp->ibqp.srq;
-			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
-			handler(&ev, srq->ibsrq.srq_context);
-			goto bail;
+		if (kwq->count < srq->limit) {
+			kwq->count = get_count(rq, tail, get_rvt_head(rq, ip));
+			if (kwq->count < srq->limit) {
+				struct ib_event ev;
+
+				srq->limit = 0;
+				spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
+				ev.device = qp->ibqp.device;
+				ev.element.srq = qp->ibqp.srq;
+				ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+				handler(&ev, srq->ibsrq.srq_context);
+				goto bail;
+			}
 		}
 	}
 unlock:
-	spin_unlock_irqrestore(&rq->lock, flags);
+	spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
 bail:
 	return ret;
 }
diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c
index 44cc7ee1b321..890d7b760d2e 100644
--- a/drivers/infiniband/sw/rdmavt/rc.c
+++ b/drivers/infiniband/sw/rdmavt/rc.c
@@ -107,27 +107,30 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp)
 		u32 head;
 		u32 tail;
 
-		/* sanity check pointers before trusting them */
-		if (qp->ip) {
-			head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head);
-			tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail);
-		} else {
-			head = READ_ONCE(qp->r_rq.kwq->head);
-			tail = READ_ONCE(qp->r_rq.kwq->tail);
+		credits = READ_ONCE(qp->r_rq.kwq->count);
+		if (credits == 0) {
+			/* sanity check pointers before trusting them */
+			if (qp->ip) {
+				head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head);
+				tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail);
+			} else {
+				head = READ_ONCE(qp->r_rq.kwq->head);
+				tail = READ_ONCE(qp->r_rq.kwq->tail);
+			}
+			if (head >= qp->r_rq.size)
+				head = 0;
+			if (tail >= qp->r_rq.size)
+				tail = 0;
+			/*
+			 * Compute the number of credits available (RWQEs).
+			 * There is a small chance that the pair of reads are
+			 * not atomic, which is OK, since the fuzziness is
+			 * resolved as further ACKs go out.
+			 */
+			credits = head - tail;
+			if ((int)credits < 0)
+				credits += qp->r_rq.size;
 		}
-		if (head >= qp->r_rq.size)
-			head = 0;
-		if (tail >= qp->r_rq.size)
-			tail = 0;
-		/*
-		 * Compute the number of credits available (RWQEs).
-		 * There is a small chance that the pair of reads are
-		 * not atomic, which is OK, since the fuzziness is
-		 * resolved as further ACKs go out.
-		 */
-		credits = head - tail;
-		if ((int)credits < 0)
-			credits += qp->r_rq.size;
 		/*
 		 * Binary search the credit table to find the code to
 		 * use.
diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c
index d306f6547cba..24fef021d51d 100644
--- a/drivers/infiniband/sw/rdmavt/srq.c
+++ b/drivers/infiniband/sw/rdmavt/srq.c
@@ -206,7 +206,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 				goto bail_free;
 		}
 
-		spin_lock_irq(&srq->rq.lock);
+		spin_lock_irq(&srq->rq.kwq->c_lock);
 		/*
 		 * validate head and tail pointer values and compute
 		 * the number of remaining WQEs.
@@ -261,7 +261,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		srq->rq.size = size;
 		if (attr_mask & IB_SRQ_LIMIT)
 			srq->limit = attr->srq_limit;
-		spin_unlock_irq(&srq->rq.lock);
+		spin_unlock_irq(&srq->rq.kwq->c_lock);
 
 		vfree(owq);
 		kvfree(okwq);
@@ -295,17 +295,17 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 			spin_unlock_irq(&dev->pending_lock);
 		}
 	} else if (attr_mask & IB_SRQ_LIMIT) {
-		spin_lock_irq(&srq->rq.lock);
+		spin_lock_irq(&srq->rq.kwq->c_lock);
 		if (attr->srq_limit >= srq->rq.size)
 			ret = -EINVAL;
 		else
 			srq->limit = attr->srq_limit;
-		spin_unlock_irq(&srq->rq.lock);
+		spin_unlock_irq(&srq->rq.kwq->c_lock);
 	}
 	return ret;
 
 bail_unlock:
-	spin_unlock_irq(&srq->rq.lock);
+	spin_unlock_irq(&srq->rq.kwq->c_lock);
 bail_free:
 	rvt_free_rq(&tmp_rq);
 	return ret;
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index ee55fd04f6da..de5915b244be 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -180,7 +180,9 @@ struct rvt_swqe {
 
 /**
  * struct rvt_krwq - kernel struct receive work request
+ * @p_lock: lock to protect producer of the kernel buffer
  * @head: index of next entry to fill
+ * @c_lock:lock to protect consumer of the kernel buffer
  * @tail: index of next entry to pull
  * @count: count is aproximate of total receive enteries posted
  * @rvt_rwqe: struct of receive work request queue entry
@@ -190,8 +192,13 @@ struct rvt_swqe {
  * mode user.
  */
 struct rvt_krwq {
+	spinlock_t p_lock;	/* protect producer */
 	u32 head;               /* new work requests posted to the head */
+
+	/* protect consumer */
+	spinlock_t c_lock ____cacheline_aligned_in_smp;
 	u32 tail;               /* receives pull requests from here. */
+	u32 count;		/* approx count of receive entries posted */
 	struct rvt_rwqe *curr_wq;
 	struct rvt_rwqe wq[];
 };
-- 
cgit v1.2.3


From 5136bfea7e79b333af77594fac5bc70282a95313 Mon Sep 17 00:00:00 2001
From: Kamenee Arumugam <kamenee.arumugam@intel.com>
Date: Fri, 28 Jun 2019 14:21:52 -0400
Subject: IB/{hfi1, qib, rdmavt}: Put qp in error state when cq is full

When a completion queue is full, the associated queue pairs are not put
into the error state. According to the IBTA specification, this is a
violation.

Quote from IBTA spec:
C9-218: A Requester Class F error occurs when the CQ is inaccessible or
full and an attempt is made to complete a WQE.  The Affected QP shall be
moved to the error state and affiliated asynchronous errors generated as
described in 11.6.3.1 Affiliated Asynchronous Events on page 678. The
current WQE and any subsequent WQEs are left in an unknown state.

C11-37: The CI shall generate a CQ Error when a CQ overrun is
detected. This condition will result in an Affiliated Asynchronous Error
for any associated Work Queues when they attempt to use that
CQ. Completions can no longer be added to the CQ. It is not guaranteed
that completions present in the CQ at the time the error occurred can be
retrieved. Possible causes include a CQ overrun or a CQ protection error.

Put the qp in error state when cq is full. Implement a state called full
to continue to put other associated QPs in error state.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Kamenee Arumugam <kamenee.arumugam@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/rc.c    |  3 +--
 drivers/infiniband/hw/hfi1/uc.c    |  3 +--
 drivers/infiniband/hw/hfi1/ud.c    |  5 ++--
 drivers/infiniband/hw/qib/qib_rc.c |  3 +--
 drivers/infiniband/hw/qib/qib_uc.c |  3 +--
 drivers/infiniband/hw/qib/qib_ud.c |  6 ++---
 drivers/infiniband/sw/rdmavt/cq.c  | 15 +++++++++---
 drivers/infiniband/sw/rdmavt/qp.c  |  3 +--
 drivers/infiniband/sw/rdmavt/vt.h  |  9 ++++++++
 include/rdma/rdmavt_cq.h           |  3 ++-
 include/rdma/rdmavt_qp.h           | 47 ++++++++++++++++++++++++++++++++++----
 11 files changed, 75 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 235bdbc706ac..0477c14633ab 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -3008,8 +3008,7 @@ send_last:
 		wc.dlid_path_bits = 0;
 		wc.port_num = 0;
 		/* Signal completion event if the solicited bit is set. */
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-			     ib_bth_is_solicited(ohdr));
+		rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
 		break;
 
 	case OP(RDMA_WRITE_ONLY):
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
index 4ed4fcfabd6c..0c77f18120ed 100644
--- a/drivers/infiniband/hw/hfi1/uc.c
+++ b/drivers/infiniband/hw/hfi1/uc.c
@@ -476,8 +476,7 @@ last_imm:
 		wc.dlid_path_bits = 0;
 		wc.port_num = 0;
 		/* Signal completion event if the solicited bit is set. */
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-			     ib_bth_is_solicited(ohdr));
+		rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
 		break;
 
 	case OP(RDMA_WRITE_FIRST):
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
index 4cb0fce5c096..e16d499cfd1e 100644
--- a/drivers/infiniband/hw/hfi1/ud.c
+++ b/drivers/infiniband/hw/hfi1/ud.c
@@ -255,8 +255,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1);
 	wc.port_num = qp->port_num;
 	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-		     swqe->wr.send_flags & IB_SEND_SOLICITED);
+	rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED);
 	ibp->rvp.n_loop_pkts++;
 bail_unlock:
 	spin_unlock_irqrestore(&qp->r_lock, flags);
@@ -1061,7 +1060,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
 		dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
 	wc.port_num = qp->port_num;
 	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, solicited);
+	rvt_recv_cq(qp, &wc, solicited);
 	return;
 
 drop:
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
index 8d9a94d6f685..1d5e2d4ee257 100644
--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -1891,8 +1891,7 @@ send_last:
 		wc.dlid_path_bits = 0;
 		wc.port_num = 0;
 		/* Signal completion event if the solicited bit is set. */
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-			     ib_bth_is_solicited(ohdr));
+		rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
 		break;
 
 	case OP(RDMA_WRITE_FIRST):
diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c
index 30c70ad0f4bf..e17b91e2c22a 100644
--- a/drivers/infiniband/hw/qib/qib_uc.c
+++ b/drivers/infiniband/hw/qib/qib_uc.c
@@ -400,8 +400,7 @@ last_imm:
 		wc.dlid_path_bits = 0;
 		wc.port_num = 0;
 		/* Signal completion event if the solicited bit is set. */
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-			     ib_bth_is_solicited(ohdr));
+		rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
 		break;
 
 	case OP(RDMA_WRITE_FIRST):
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c
index 5cdedba2d164..32ad0b635fc6 100644
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -210,8 +210,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1);
 	wc.port_num = qp->port_num;
 	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-		     swqe->wr.send_flags & IB_SEND_SOLICITED);
+	rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED);
 	ibp->rvp.n_loop_pkts++;
 bail_unlock:
 	spin_unlock_irqrestore(&qp->r_lock, flags);
@@ -573,8 +572,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr,
 		dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
 	wc.port_num = qp->port_num;
 	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-		     ib_bth_is_solicited(ohdr));
+	rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
 	return;
 
 drop:
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index 2602ad8b8cb0..fac87b13329d 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -60,8 +60,11 @@ static struct workqueue_struct *comp_vector_wq;
  * @solicited: true if @entry is solicited
  *
  * This may be called with qp->s_lock held.
+ *
+ * Return: return true on success, else return
+ * false if cq is full.
  */
-void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
+bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
 {
 	struct ib_uverbs_wc *uqueue = NULL;
 	struct ib_wc *kqueue = NULL;
@@ -97,7 +100,12 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
 		next = head + 1;
 	}
 
-	if (unlikely(next == tail)) {
+	if (unlikely(next == tail || cq->cq_full)) {
+		struct rvt_dev_info *rdi = cq->rdi;
+
+		if (!cq->cq_full)
+			rvt_pr_err_ratelimited(rdi, "CQ is full!\n");
+		cq->cq_full = true;
 		spin_unlock_irqrestore(&cq->lock, flags);
 		if (cq->ibcq.event_handler) {
 			struct ib_event ev;
@@ -107,7 +115,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
 			ev.event = IB_EVENT_CQ_ERR;
 			cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
 		}
-		return;
+		return false;
 	}
 	trace_rvt_cq_enter(cq, entry, head);
 	if (uqueue) {
@@ -146,6 +154,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
 	}
 
 	spin_unlock_irqrestore(&cq->lock, flags);
+	return true;
 }
 EXPORT_SYMBOL(rvt_cq_enter);
 
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 200b292be63e..17e192a2c8b6 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -3103,8 +3103,7 @@ do_write:
 	wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
 	wc.port_num = 1;
 	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-		     wqe->wr.send_flags & IB_SEND_SOLICITED);
+	rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED);
 
 send_comp:
 	spin_unlock_irqrestore(&qp->r_lock, flags);
diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h
index 0675ea6c3872..d19ff817c2c7 100644
--- a/drivers/infiniband/sw/rdmavt/vt.h
+++ b/drivers/infiniband/sw/rdmavt/vt.h
@@ -78,6 +78,12 @@
 		     fmt, \
 		     ##__VA_ARGS__)
 
+#define rvt_pr_err_ratelimited(rdi, fmt, ...) \
+	__rvt_pr_err_ratelimited((rdi)->driver_f.get_pci_dev(rdi), \
+				 rvt_get_ibdev_name(rdi), \
+				 fmt, \
+				 ##__VA_ARGS__)
+
 #define __rvt_pr_info(pdev, name, fmt, ...) \
 	dev_info(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__)
 
@@ -87,6 +93,9 @@
 #define __rvt_pr_err(pdev, name, fmt, ...) \
 	dev_err(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__)
 
+#define __rvt_pr_err_ratelimited(pdev, name, fmt, ...) \
+	dev_err_ratelimited(&(pdev)->dev, "%s: " fmt, name, ##__VA_ARGS__)
+
 static inline int ibport_num_to_idx(struct ib_device *ibdev, u8 port_num)
 {
 	struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h
index ab22860a63e2..04c519ef6d71 100644
--- a/include/rdma/rdmavt_cq.h
+++ b/include/rdma/rdmavt_cq.h
@@ -93,6 +93,7 @@ struct rvt_cq {
 	spinlock_t lock; /* protect changes in this struct */
 	u8 notify;
 	u8 triggered;
+	u8 cq_full;
 	int comp_vector_cpu;
 	struct rvt_dev_info *rdi;
 	struct rvt_cq_wc *queue;
@@ -105,6 +106,6 @@ static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq)
 	return container_of(ibcq, struct rvt_cq, ibcq);
 }
 
-void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited);
+bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited);
 
 #endif          /* DEF_RDMAVT_INCCQH */
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index de5915b244be..e4be869c4f21 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -718,6 +718,48 @@ rvt_qp_swqe_incr(struct rvt_qp *qp, u32 val)
 	return val;
 }
 
+int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err);
+
+/**
+ * rvt_recv_cq - add a new entry to completion queue
+ *			by receive queue
+ * @qp: receive queue
+ * @wc: work completion entry to add
+ * @solicited: true if @entry is solicited
+ *
+ * This is wrapper function for rvt_enter_cq function call by
+ * receive queue. If rvt_cq_enter return false, it means cq is
+ * full and the qp is put into error state.
+ */
+static inline void rvt_recv_cq(struct rvt_qp *qp, struct ib_wc *wc,
+			       bool solicited)
+{
+	struct rvt_cq *cq = ibcq_to_rvtcq(qp->ibqp.recv_cq);
+
+	if (unlikely(!rvt_cq_enter(cq, wc, solicited)))
+		rvt_error_qp(qp, IB_WC_LOC_QP_OP_ERR);
+}
+
+/**
+ * rvt_send_cq - add a new entry to completion queue
+ *                        by send queue
+ * @qp: send queue
+ * @wc: work completion entry to add
+ * @solicited: true if @entry is solicited
+ *
+ * This is wrapper function for rvt_enter_cq function call by
+ * send queue. If rvt_cq_enter return false, it means cq is
+ * full and the qp is put into error state.
+ */
+static inline void rvt_send_cq(struct rvt_qp *qp, struct ib_wc *wc,
+			       bool solicited)
+{
+	struct rvt_cq *cq = ibcq_to_rvtcq(qp->ibqp.send_cq);
+
+	if (unlikely(!rvt_cq_enter(cq, wc, solicited)))
+		rvt_error_qp(qp, IB_WC_LOC_QP_OP_ERR);
+}
+
 /**
  * rvt_qp_complete_swqe - insert send completion
  * @qp - the qp
@@ -768,9 +810,7 @@ rvt_qp_complete_swqe(struct rvt_qp *qp,
 			.qp = &qp->ibqp,
 			.byte_len = byte_len,
 		};
-
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &w,
-			     status != IB_WC_SUCCESS);
+		rvt_send_cq(qp, &w, status != IB_WC_SUCCESS);
 	}
 	return last;
 }
@@ -780,7 +820,6 @@ extern const int  ib_rvt_state_ops[];
 struct rvt_dev_info;
 int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only);
 void rvt_comm_est(struct rvt_qp *qp);
-int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err);
 void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err);
 unsigned long rvt_rnr_tbl_to_usec(u32 index);
 enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t);
-- 
cgit v1.2.3


From d310c4bf8aeacc0256091feb6a0337b8fef763ac Mon Sep 17 00:00:00 2001
From: "Michael J. Ruhl" <michael.j.ruhl@intel.com>
Date: Fri, 28 Jun 2019 14:22:04 -0400
Subject: IB/{rdmavt, hfi1, qib}: Remove AH refcount for UD QPs

Historically rdmavt destroy_ah() has returned an -EBUSY when the AH has a
non-zero reference count.  IBTA 11.2.2 notes no such return value or error
case:

	Output Modifiers:
	- Verb results:
	- Operation completed successfully.
	- Invalid HCA handle.
	- Invalid address handle.

ULPs never test for this error and this will leak memory.

The reference count exists to allow for driver independent progress
mechanisms to process UD SWQEs in parallel with post sends.  The SWQE will
hold a reference count until the UD SWQE completes and then drops the
reference.

Fix by removing need to reference count the AH.  Add a UD specific
allocation to each SWQE entry to cache the necessary information for
independent progress.  Copy the information during the post send
processing.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/qp.c    |  4 +--
 drivers/infiniband/hw/hfi1/ud.c    | 30 ++++++++++----------
 drivers/infiniband/hw/qib/qib_qp.c |  4 +--
 drivers/infiniband/hw/qib/qib_ud.c | 21 +++++++-------
 drivers/infiniband/sw/rdmavt/ah.c  |  6 +---
 drivers/infiniband/sw/rdmavt/qp.c  | 58 ++++++++++++++++++++++++++++++++++++--
 include/rdma/rdma_vt.h             |  3 +-
 include/rdma/rdmavt_qp.h           | 22 +++++++++++++--
 8 files changed, 106 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 41261e72c429..a84b44af7b97 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -348,7 +348,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
 		break;
 	case IB_QPT_GSI:
 	case IB_QPT_UD:
-		ah = ibah_to_rvtah(wqe->ud_wr.ah);
+		ah = ibah_to_rvtah(wqe->ud_wr.wr.ah);
 		if (wqe->length > (1 << ah->log_pmtu))
 			return -EINVAL;
 		if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf)
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
index e16d499cfd1e..f8e796e45517 100644
--- a/drivers/infiniband/hw/hfi1/ud.c
+++ b/drivers/infiniband/hw/hfi1/ud.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -87,7 +87,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	rcu_read_lock();
 
 	qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
-			    swqe->ud_wr.remote_qpn);
+			    swqe->ud_wr.wr.remote_qpn);
 	if (!qp) {
 		ibp->rvp.n_pkt_drops++;
 		rcu_read_unlock();
@@ -105,7 +105,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 		goto drop;
 	}
 
-	ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
+	ah_attr = swqe->ud_wr.attr;
 	ppd = ppd_from_ibp(ibp);
 
 	if (qp->ibqp.qp_num > 1) {
@@ -135,8 +135,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	if (qp->ibqp.qp_num) {
 		u32 qkey;
 
-		qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
-			sqp->qkey : swqe->ud_wr.remote_qkey;
+		qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ?
+			sqp->qkey : swqe->ud_wr.wr.remote_qkey;
 		if (unlikely(qkey != qp->qkey))
 			goto drop; /* silently drop per IBTA spec */
 	}
@@ -240,7 +240,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
 		if (sqp->ibqp.qp_type == IB_QPT_GSI ||
 		    sqp->ibqp.qp_type == IB_QPT_SMI)
-			wc.pkey_index = swqe->ud_wr.pkey_index;
+			wc.pkey_index = swqe->ud_wr.wr.pkey_index;
 		else
 			wc.pkey_index = sqp->s_pkey_index;
 	} else {
@@ -282,20 +282,20 @@ static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe,
 		bth0 |= IB_BTH_SOLICITED;
 	bth0 |= extra_bytes << 20;
 	if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
-		*pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
+		*pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index);
 	else
 		*pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
 	if (!bypass)
 		bth0 |= *pkey;
 	ohdr->bth[0] = cpu_to_be32(bth0);
-	ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn);
+	ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.wr.remote_qpn);
 	ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn));
 	/*
 	 * Qkeys with the high order bit set mean use the
 	 * qkey from the QP context instead of the WR (see 10.2.5).
 	 */
-	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
-					 qp->qkey : wqe->ud_wr.remote_qkey);
+	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ?
+					 qp->qkey : wqe->ud_wr.wr.remote_qkey);
 	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
 }
 
@@ -315,7 +315,7 @@ void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+	ah_attr = wqe->ud_wr.attr;
 
 	extra_bytes = -wqe->length & 3;
 	nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC;
@@ -379,7 +379,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 	struct hfi1_pportdata *ppd;
 	struct hfi1_ibport *ibp;
 	u32 dlid, slid, nwords, extra_bytes;
-	u32 dest_qp = wqe->ud_wr.remote_qpn;
+	u32 dest_qp = wqe->ud_wr.wr.remote_qpn;
 	u32 src_qp = qp->ibqp.qp_num;
 	u16 len, pkey;
 	u8 l4, sc5;
@@ -387,7 +387,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+	ah_attr = wqe->ud_wr.attr;
 
 	/*
 	 * Build 16B Management Packet if either the destination
@@ -449,7 +449,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 
 	if (is_mgmt) {
 		l4 = OPA_16B_L4_FM;
-		pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
+		pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index);
 		hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt,
 				 dest_qp, src_qp);
 	} else {
@@ -514,7 +514,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 	/* Construct the header. */
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+	ah_attr = wqe->ud_wr.attr;
 	priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr);
 	if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) ||
 	    (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) {
diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c
index a81905df2d0f..0e1d0d692891 100644
--- a/drivers/infiniband/hw/qib/qib_qp.c
+++ b/drivers/infiniband/hw/qib/qib_qp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012 - 2017 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2012 - 2019 Intel Corporation.  All rights reserved.
  * Copyright (c) 2006 - 2012 QLogic Corporation.  * All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
  *
@@ -398,7 +398,7 @@ int qib_check_send_wqe(struct rvt_qp *qp,
 	case IB_QPT_SMI:
 	case IB_QPT_GSI:
 	case IB_QPT_UD:
-		ah = ibah_to_rvtah(wqe->ud_wr.ah);
+		ah = ibah_to_rvtah(wqe->ud_wr.wr.ah);
 		if (wqe->length > (1 << ah->log_pmtu))
 			return -EINVAL;
 		/* progress hint */
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c
index 32ad0b635fc6..d8c2c968909f 100644
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2012 - 2019 Intel Corporation.  All rights reserved.
  * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
  *
@@ -63,7 +64,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	enum ib_qp_type sqptype, dqptype;
 
 	rcu_read_lock();
-	qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.remote_qpn);
+	qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.wr.remote_qpn);
 	if (!qp) {
 		ibp->rvp.n_pkt_drops++;
 		goto drop;
@@ -80,7 +81,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 		goto drop;
 	}
 
-	ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
+	ah_attr = swqe->ud_wr.attr;
 	ppd = ppd_from_ibp(ibp);
 
 	if (qp->ibqp.qp_num > 1) {
@@ -110,8 +111,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	if (qp->ibqp.qp_num) {
 		u32 qkey;
 
-		qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
-			sqp->qkey : swqe->ud_wr.remote_qkey;
+		qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ?
+			sqp->qkey : swqe->ud_wr.wr.remote_qkey;
 		if (unlikely(qkey != qp->qkey))
 			goto drop;
 	}
@@ -203,7 +204,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	wc.qp = &qp->ibqp;
 	wc.src_qp = sqp->ibqp.qp_num;
 	wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ?
-		swqe->ud_wr.pkey_index : 0;
+		swqe->ud_wr.wr.pkey_index : 0;
 	wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) &
 				((1 << ppd->lmc) - 1));
 	wc.sl = rdma_ah_get_sl(ah_attr);
@@ -270,7 +271,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
 	/* Construct the header. */
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+	ah_attr = wqe->ud_wr.attr;
 	if (rdma_ah_get_dlid(ah_attr) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
 		if (rdma_ah_get_dlid(ah_attr) !=
 				be16_to_cpu(IB_LID_PERMISSIVE))
@@ -362,7 +363,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
 	bth0 |= extra_bytes << 20;
 	bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY :
 		qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ?
-			     wqe->ud_wr.pkey_index : qp->s_pkey_index);
+			     wqe->ud_wr.wr.pkey_index : qp->s_pkey_index);
 	ohdr->bth[0] = cpu_to_be32(bth0);
 	/*
 	 * Use the multicast QP if the destination LID is a multicast LID.
@@ -371,14 +372,14 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
 			be16_to_cpu(IB_MULTICAST_LID_BASE) &&
 		rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE) ?
 		cpu_to_be32(QIB_MULTICAST_QPN) :
-		cpu_to_be32(wqe->ud_wr.remote_qpn);
+		cpu_to_be32(wqe->ud_wr.wr.remote_qpn);
 	ohdr->bth[2] = cpu_to_be32(wqe->psn & QIB_PSN_MASK);
 	/*
 	 * Qkeys with the high order bit set mean use the
 	 * qkey from the QP context instead of the WR (see 10.2.5).
 	 */
-	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
-					 qp->qkey : wqe->ud_wr.remote_qkey);
+	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ?
+					 qp->qkey : wqe->ud_wr.wr.remote_qkey);
 	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
 
 done:
diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c
index 0e147b32cbe9..fe99da0ff060 100644
--- a/drivers/infiniband/sw/rdmavt/ah.c
+++ b/drivers/infiniband/sw/rdmavt/ah.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2016 Intel Corporation.
+ * Copyright(c) 2016 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -119,8 +119,6 @@ int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
 
 	rdma_copy_ah_attr(&ah->attr, ah_attr);
 
-	atomic_set(&ah->refcount, 0);
-
 	if (dev->driver_f.notify_new_ah)
 		dev->driver_f.notify_new_ah(ibah->device, ah_attr, ah);
 
@@ -141,8 +139,6 @@ void rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags)
 	struct rvt_ah *ah = ibah_to_rvtah(ibah);
 	unsigned long flags;
 
-	WARN_ON_ONCE(atomic_read(&ah->refcount));
-
 	spin_lock_irqsave(&dev->n_ahs_lock, flags);
 	dev->n_ahs_allocated--;
 	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index b9035d969057..de7d2edb9781 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -978,6 +978,51 @@ static u8 get_allowed_ops(enum ib_qp_type type)
 		IB_OPCODE_UC : IB_OPCODE_UD;
 }
 
+/**
+ * free_ud_wq_attr - Clean up AH attribute cache for UD QPs
+ * @qp: Valid QP with allowed_ops set
+ *
+ * The rvt_swqe data structure being used is a union, so this is
+ * only valid for UD QPs.
+ */
+static void free_ud_wq_attr(struct rvt_qp *qp)
+{
+	struct rvt_swqe *wqe;
+	int i;
+
+	for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
+		wqe = rvt_get_swqe_ptr(qp, i);
+		kfree(wqe->ud_wr.attr);
+		wqe->ud_wr.attr = NULL;
+	}
+}
+
+/**
+ * alloc_ud_wq_attr - AH attribute cache for UD QPs
+ * @qp: Valid QP with allowed_ops set
+ * @node: Numa node for allocation
+ *
+ * The rvt_swqe data structure being used is a union, so this is
+ * only valid for UD QPs.
+ */
+static int alloc_ud_wq_attr(struct rvt_qp *qp, int node)
+{
+	struct rvt_swqe *wqe;
+	int i;
+
+	for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
+		wqe = rvt_get_swqe_ptr(qp, i);
+		wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr),
+					       GFP_KERNEL, node);
+		if (!wqe->ud_wr.attr) {
+			free_ud_wq_attr(qp);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
 /**
  * rvt_create_qp - create a queue pair for a device
  * @ibpd: the protection domain who's device we create the queue pair for
@@ -1124,6 +1169,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 		qp->s_max_sge = init_attr->cap.max_send_sge;
 		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
 			qp->s_flags = RVT_S_SIGNAL_REQ_WR;
+		err = alloc_ud_wq_attr(qp, rdi->dparms.node);
+		if (err) {
+			ret = (ERR_PTR(err));
+			goto bail_driver_priv;
+		}
 
 		err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
 				init_attr->qp_type,
@@ -1227,6 +1277,7 @@ bail_qpn:
 
 bail_rq_wq:
 	rvt_free_rq(&qp->r_rq);
+	free_ud_wq_attr(qp);
 
 bail_driver_priv:
 	rdi->driver_f.qp_priv_free(rdi, qp);
@@ -1671,6 +1722,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 	kfree(qp->s_ack_queue);
 	rdma_destroy_ah_attr(&qp->remote_ah_attr);
 	rdma_destroy_ah_attr(&qp->alt_ah_attr);
+	free_ud_wq_attr(qp);
 	vfree(qp->s_wq);
 	kfree(qp);
 	return 0;
@@ -2037,10 +2089,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
 	 */
 	log_pmtu = qp->log_pmtu;
 	if (qp->allowed_ops == IB_OPCODE_UD) {
-		struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
+		struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.wr.ah);
 
 		log_pmtu = ah->log_pmtu;
-		atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
+		rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr);
 	}
 
 	if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
@@ -2085,7 +2137,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
 
 bail_inval_free_ref:
 	if (qp->allowed_ops == IB_OPCODE_UD)
-		atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
+		rdma_destroy_ah_attr(wqe->ud_wr.attr);
 bail_inval_free:
 	/* release mr holds */
 	while (j) {
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
index 997f42678806..525848e227dc 100644
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -2,7 +2,7 @@
 #define DEF_RDMA_VT_H
 
 /*
- * Copyright(c) 2016 - 2018 Intel Corporation.
+ * Copyright(c) 2016 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -202,7 +202,6 @@ struct rvt_pd {
 struct rvt_ah {
 	struct ib_ah ibah;
 	struct rdma_ah_attr attr;
-	atomic_t refcount;
 	u8 vl;
 	u8 log_pmtu;
 };
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index e4be869c4f21..9531de2fabe2 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -2,7 +2,7 @@
 #define DEF_RDMAVT_INCQP_H
 
 /*
- * Copyright(c) 2016 - 2018 Intel Corporation.
+ * Copyright(c) 2016 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -157,6 +157,22 @@
 #define RVT_SEND_RESERVE_USED           IB_SEND_RESERVED_START
 #define RVT_SEND_COMPLETION_ONLY	(IB_SEND_RESERVED_START << 1)
 
+/**
+ * rvt_ud_wr - IB UD work plus AH cache
+ * @wr: valid IB work request
+ * @attr: pointer to an allocated AH attribute
+ *
+ * Special case the UD WR so we can keep track of the AH attributes.
+ *
+ * NOTE: This data structure is stricly ordered wr then attr. I.e the attr
+ * MUST come after wr.  The ib_ud_wr is sized and copied in rvt_post_one_wr.
+ * The copy assumes that wr is first.
+ */
+struct rvt_ud_wr {
+	struct ib_ud_wr wr;
+	struct rdma_ah_attr *attr;
+};
+
 /*
  * Send work request queue entry.
  * The size of the sg_list is determined when the QP is created and stored
@@ -165,7 +181,7 @@
 struct rvt_swqe {
 	union {
 		struct ib_send_wr wr;   /* don't use wr.sg_list */
-		struct ib_ud_wr ud_wr;
+		struct rvt_ud_wr ud_wr;
 		struct ib_reg_wr reg_wr;
 		struct ib_rdma_wr rdma_wr;
 		struct ib_atomic_wr atomic_wr;
@@ -700,7 +716,7 @@ static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
 {
 	rvt_put_swqe(wqe);
 	if (qp->allowed_ops == IB_OPCODE_UD)
-		atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
+		rdma_destroy_ah_attr(wqe->ud_wr.attr);
 }
 
 /**
-- 
cgit v1.2.3


From 2b0ad2da8fd4c32f63d9142f2de43a4d34fdd679 Mon Sep 17 00:00:00 2001
From: "Michael J. Ruhl" <michael.j.ruhl@intel.com>
Date: Fri, 28 Jun 2019 14:22:11 -0400
Subject: IB/{rdmavt, hfi1, qib}: Add helpers to hide SWQE WR details

Add some helper functions to hide struct rvt_swqe details.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/qp.c    |  2 +-
 drivers/infiniband/hw/hfi1/ud.c    | 29 +++++++++++-----------
 drivers/infiniband/hw/qib/qib_qp.c |  2 +-
 drivers/infiniband/hw/qib/qib_ud.c | 21 ++++++++--------
 drivers/infiniband/sw/rdmavt/qp.c  |  2 +-
 include/rdma/rdmavt_qp.h           | 50 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 79 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index a84b44af7b97..f8e733aa3bb8 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -348,7 +348,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
 		break;
 	case IB_QPT_GSI:
 	case IB_QPT_UD:
-		ah = ibah_to_rvtah(wqe->ud_wr.wr.ah);
+		ah = rvt_get_swqe_ah(wqe);
 		if (wqe->length > (1 << ah->log_pmtu))
 			return -EINVAL;
 		if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf)
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
index f8e796e45517..e804af71b629 100644
--- a/drivers/infiniband/hw/hfi1/ud.c
+++ b/drivers/infiniband/hw/hfi1/ud.c
@@ -87,7 +87,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	rcu_read_lock();
 
 	qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
-			    swqe->ud_wr.wr.remote_qpn);
+			    rvt_get_swqe_remote_qpn(swqe));
 	if (!qp) {
 		ibp->rvp.n_pkt_drops++;
 		rcu_read_unlock();
@@ -105,7 +105,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 		goto drop;
 	}
 
-	ah_attr = swqe->ud_wr.attr;
+	ah_attr = rvt_get_swqe_ah_attr(swqe);
 	ppd = ppd_from_ibp(ibp);
 
 	if (qp->ibqp.qp_num > 1) {
@@ -135,8 +135,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	if (qp->ibqp.qp_num) {
 		u32 qkey;
 
-		qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ?
-			sqp->qkey : swqe->ud_wr.wr.remote_qkey;
+		qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ?
+			sqp->qkey : rvt_get_swqe_remote_qkey(swqe);
 		if (unlikely(qkey != qp->qkey))
 			goto drop; /* silently drop per IBTA spec */
 	}
@@ -240,7 +240,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
 		if (sqp->ibqp.qp_type == IB_QPT_GSI ||
 		    sqp->ibqp.qp_type == IB_QPT_SMI)
-			wc.pkey_index = swqe->ud_wr.wr.pkey_index;
+			wc.pkey_index = rvt_get_swqe_pkey_index(swqe);
 		else
 			wc.pkey_index = sqp->s_pkey_index;
 	} else {
@@ -282,20 +282,21 @@ static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe,
 		bth0 |= IB_BTH_SOLICITED;
 	bth0 |= extra_bytes << 20;
 	if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
-		*pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index);
+		*pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe));
 	else
 		*pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
 	if (!bypass)
 		bth0 |= *pkey;
 	ohdr->bth[0] = cpu_to_be32(bth0);
-	ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.wr.remote_qpn);
+	ohdr->bth[1] = cpu_to_be32(rvt_get_swqe_remote_qpn(wqe));
 	ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn));
 	/*
 	 * Qkeys with the high order bit set mean use the
 	 * qkey from the QP context instead of the WR (see 10.2.5).
 	 */
-	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ?
-					 qp->qkey : wqe->ud_wr.wr.remote_qkey);
+	ohdr->u.ud.deth[0] =
+		cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? qp->qkey :
+			    rvt_get_swqe_remote_qkey(wqe));
 	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
 }
 
@@ -315,7 +316,7 @@ void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = wqe->ud_wr.attr;
+	ah_attr = rvt_get_swqe_ah_attr(wqe);
 
 	extra_bytes = -wqe->length & 3;
 	nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC;
@@ -379,7 +380,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 	struct hfi1_pportdata *ppd;
 	struct hfi1_ibport *ibp;
 	u32 dlid, slid, nwords, extra_bytes;
-	u32 dest_qp = wqe->ud_wr.wr.remote_qpn;
+	u32 dest_qp = rvt_get_swqe_remote_qpn(wqe);
 	u32 src_qp = qp->ibqp.qp_num;
 	u16 len, pkey;
 	u8 l4, sc5;
@@ -387,7 +388,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = wqe->ud_wr.attr;
+	ah_attr = rvt_get_swqe_ah_attr(wqe);
 
 	/*
 	 * Build 16B Management Packet if either the destination
@@ -449,7 +450,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 
 	if (is_mgmt) {
 		l4 = OPA_16B_L4_FM;
-		pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index);
+		pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe));
 		hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt,
 				 dest_qp, src_qp);
 	} else {
@@ -514,7 +515,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 	/* Construct the header. */
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = wqe->ud_wr.attr;
+	ah_attr = rvt_get_swqe_ah_attr(wqe);
 	priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr);
 	if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) ||
 	    (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) {
diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c
index 0e1d0d692891..8d0563ef5be1 100644
--- a/drivers/infiniband/hw/qib/qib_qp.c
+++ b/drivers/infiniband/hw/qib/qib_qp.c
@@ -398,7 +398,7 @@ int qib_check_send_wqe(struct rvt_qp *qp,
 	case IB_QPT_SMI:
 	case IB_QPT_GSI:
 	case IB_QPT_UD:
-		ah = ibah_to_rvtah(wqe->ud_wr.wr.ah);
+		ah = rvt_get_swqe_ah(wqe);
 		if (wqe->length > (1 << ah->log_pmtu))
 			return -EINVAL;
 		/* progress hint */
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c
index d8c2c968909f..93ca21347959 100644
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -64,7 +64,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	enum ib_qp_type sqptype, dqptype;
 
 	rcu_read_lock();
-	qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.wr.remote_qpn);
+	qp = rvt_lookup_qpn(rdi, &ibp->rvp, rvt_get_swqe_remote_qpn(swqe));
 	if (!qp) {
 		ibp->rvp.n_pkt_drops++;
 		goto drop;
@@ -81,7 +81,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 		goto drop;
 	}
 
-	ah_attr = swqe->ud_wr.attr;
+	ah_attr = rvt_get_swqe_ah_attr(swqe);
 	ppd = ppd_from_ibp(ibp);
 
 	if (qp->ibqp.qp_num > 1) {
@@ -111,8 +111,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	if (qp->ibqp.qp_num) {
 		u32 qkey;
 
-		qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ?
-			sqp->qkey : swqe->ud_wr.wr.remote_qkey;
+		qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ?
+			sqp->qkey : rvt_get_swqe_remote_qkey(swqe);
 		if (unlikely(qkey != qp->qkey))
 			goto drop;
 	}
@@ -204,7 +204,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	wc.qp = &qp->ibqp;
 	wc.src_qp = sqp->ibqp.qp_num;
 	wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ?
-		swqe->ud_wr.wr.pkey_index : 0;
+		rvt_get_swqe_pkey_index(swqe) : 0;
 	wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) &
 				((1 << ppd->lmc) - 1));
 	wc.sl = rdma_ah_get_sl(ah_attr);
@@ -271,7 +271,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
 	/* Construct the header. */
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = wqe->ud_wr.attr;
+	ah_attr = rvt_get_swqe_ah_attr(wqe);
 	if (rdma_ah_get_dlid(ah_attr) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
 		if (rdma_ah_get_dlid(ah_attr) !=
 				be16_to_cpu(IB_LID_PERMISSIVE))
@@ -363,7 +363,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
 	bth0 |= extra_bytes << 20;
 	bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY :
 		qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ?
-			     wqe->ud_wr.wr.pkey_index : qp->s_pkey_index);
+			     rvt_get_swqe_pkey_index(wqe) : qp->s_pkey_index);
 	ohdr->bth[0] = cpu_to_be32(bth0);
 	/*
 	 * Use the multicast QP if the destination LID is a multicast LID.
@@ -372,14 +372,15 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
 			be16_to_cpu(IB_MULTICAST_LID_BASE) &&
 		rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE) ?
 		cpu_to_be32(QIB_MULTICAST_QPN) :
-		cpu_to_be32(wqe->ud_wr.wr.remote_qpn);
+		cpu_to_be32(rvt_get_swqe_remote_qpn(wqe));
 	ohdr->bth[2] = cpu_to_be32(wqe->psn & QIB_PSN_MASK);
 	/*
 	 * Qkeys with the high order bit set mean use the
 	 * qkey from the QP context instead of the WR (see 10.2.5).
 	 */
-	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ?
-					 qp->qkey : wqe->ud_wr.wr.remote_qkey);
+	ohdr->u.ud.deth[0] =
+		cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? qp->qkey :
+			    rvt_get_swqe_remote_qkey(wqe));
 	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
 
 done:
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index de7d2edb9781..11b4d3c1efd4 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -2089,7 +2089,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
 	 */
 	log_pmtu = qp->log_pmtu;
 	if (qp->allowed_ops == IB_OPCODE_UD) {
-		struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.wr.ah);
+		struct rvt_ah *ah = rvt_get_swqe_ah(wqe);
 
 		log_pmtu = ah->log_pmtu;
 		rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr);
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 9531de2fabe2..0eeea520a853 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -219,6 +219,56 @@ struct rvt_krwq {
 	struct rvt_rwqe wq[];
 };
 
+/*
+ * rvt_get_swqe_ah - Return the pointer to the struct rvt_ah
+ * @swqe: valid Send WQE
+ *
+ */
+static inline struct rvt_ah *rvt_get_swqe_ah(struct rvt_swqe *swqe)
+{
+	return ibah_to_rvtah(swqe->ud_wr.wr.ah);
+}
+
+/**
+ * rvt_get_swqe_ah_attr - Return the cached ah attribute information
+ * @swqe: valid Send WQE
+ *
+ */
+static inline struct rdma_ah_attr *rvt_get_swqe_ah_attr(struct rvt_swqe *swqe)
+{
+	return swqe->ud_wr.attr;
+}
+
+/**
+ * rvt_get_swqe_remote_qpn - Access the remote QPN value
+ * @swqe: valid Send WQE
+ *
+ */
+static inline u32 rvt_get_swqe_remote_qpn(struct rvt_swqe *swqe)
+{
+	return swqe->ud_wr.wr.remote_qpn;
+}
+
+/**
+ * rvt_get_swqe_remote_qkey - Acces the remote qkey value
+ * @swqe: valid Send WQE
+ *
+ */
+static inline u32 rvt_get_swqe_remote_qkey(struct rvt_swqe *swqe)
+{
+	return swqe->ud_wr.wr.remote_qkey;
+}
+
+/**
+ * rvt_get_swqe_pkey_index - Access the pkey index
+ * @swqe: valid Send WQE
+ *
+ */
+static inline u16 rvt_get_swqe_pkey_index(struct rvt_swqe *swqe)
+{
+	return swqe->ud_wr.wr.pkey_index;
+}
+
 struct rvt_rq {
 	struct rvt_rwq *wq;
 	struct rvt_krwq *kwq;
-- 
cgit v1.2.3


From b1a17513a2d60f9e933016bed04d0eeb8651a915 Mon Sep 17 00:00:00 2001
From: Clement Leger <cleger@kalray.eu>
Date: Mon, 17 Jun 2019 14:57:30 +0200
Subject: remoteproc: add vendor resources handling

In order to allow rproc backend to handle vendor resources such as in
OpenAMP, add a handle_rsc hook. This hook allow the rproc backends to
handle vendor resources as they like. The hook will be called only for
vendor resources and should return RSC_HANDLED on successful resource
handling, RSC_IGNORED if resource was ignored, or a negative value on
error.

Signed-off-by: Clement Leger <cleger@kalray.eu>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 Documentation/remoteproc.txt             | 14 +++++++++-----
 drivers/remoteproc/remoteproc_core.c     | 14 ++++++++++++++
 drivers/remoteproc/remoteproc_internal.h | 11 +++++++++++
 include/linux/remoteproc.h               | 32 ++++++++++++++++++++++++++------
 4 files changed, 60 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/Documentation/remoteproc.txt b/Documentation/remoteproc.txt
index 77fb03acdbb4..03c3d2e568b0 100644
--- a/Documentation/remoteproc.txt
+++ b/Documentation/remoteproc.txt
@@ -314,6 +314,8 @@ Here are the various resource types that are currently supported::
    * @RSC_VDEV:       declare support for a virtio device, and serve as its
    *		    virtio header.
    * @RSC_LAST:       just keep this one at the end
+   * @RSC_VENDOR_START:	start of the vendor specific resource types range
+   * @RSC_VENDOR_END:	end of the vendor specific resource types range
    *
    * Please note that these values are used as indices to the rproc_handle_rsc
    * lookup table, so please keep them sane. Moreover, @RSC_LAST is used to
@@ -321,11 +323,13 @@ Here are the various resource types that are currently supported::
    * please update it as needed.
    */
   enum fw_resource_type {
-	RSC_CARVEOUT	= 0,
-	RSC_DEVMEM	= 1,
-	RSC_TRACE	= 2,
-	RSC_VDEV	= 3,
-	RSC_LAST	= 4,
+	RSC_CARVEOUT		= 0,
+	RSC_DEVMEM		= 1,
+	RSC_TRACE		= 2,
+	RSC_VDEV		= 3,
+	RSC_LAST		= 4,
+	RSC_VENDOR_START	= 128,
+	RSC_VENDOR_END		= 512,
   };
 
 For more details regarding a specific resource type, please see its
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 48feebd6d0a2..263e9c9614a8 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -1066,6 +1066,20 @@ static int rproc_handle_resources(struct rproc *rproc,
 
 		dev_dbg(dev, "rsc: type %d\n", hdr->type);
 
+		if (hdr->type >= RSC_VENDOR_START &&
+		    hdr->type <= RSC_VENDOR_END) {
+			ret = rproc_handle_rsc(rproc, hdr->type, rsc,
+					       offset + sizeof(*hdr), avail);
+			if (ret == RSC_HANDLED)
+				continue;
+			else if (ret < 0)
+				break;
+
+			dev_warn(dev, "unsupported vendor resource %d\n",
+				 hdr->type);
+			continue;
+		}
+
 		if (hdr->type >= RSC_LAST) {
 			dev_warn(dev, "unsupported resource %d\n", hdr->type);
 			continue;
diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h
index 45ff76a06c72..4c77bdd517b9 100644
--- a/drivers/remoteproc/remoteproc_internal.h
+++ b/drivers/remoteproc/remoteproc_internal.h
@@ -106,6 +106,17 @@ static inline int rproc_parse_fw(struct rproc *rproc, const struct firmware *fw)
 	return 0;
 }
 
+static inline
+int rproc_handle_rsc(struct rproc *rproc, u32 rsc_type, void *rsc, int offset,
+		     int avail)
+{
+	if (rproc->ops->handle_rsc)
+		return rproc->ops->handle_rsc(rproc, rsc_type, rsc, offset,
+					      avail);
+
+	return RSC_IGNORED;
+}
+
 static inline
 struct resource_table *rproc_find_loaded_rsc_table(struct rproc *rproc,
 						   const struct firmware *fw)
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 04d04709f2bd..16ad66683ad0 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -100,7 +100,9 @@ struct fw_rsc_hdr {
  *		    the remote processor will be writing logs.
  * @RSC_VDEV:       declare support for a virtio device, and serve as its
  *		    virtio header.
- * @RSC_LAST:       just keep this one at the end
+ * @RSC_LAST:       just keep this one at the end of standard resources
+ * @RSC_VENDOR_START:	start of the vendor specific resource types range
+ * @RSC_VENDOR_END:	end of the vendor specific resource types range
  *
  * For more details regarding a specific resource type, please see its
  * dedicated structure below.
@@ -111,11 +113,13 @@ struct fw_rsc_hdr {
  * please update it as needed.
  */
 enum fw_resource_type {
-	RSC_CARVEOUT	= 0,
-	RSC_DEVMEM	= 1,
-	RSC_TRACE	= 2,
-	RSC_VDEV	= 3,
-	RSC_LAST	= 4,
+	RSC_CARVEOUT		= 0,
+	RSC_DEVMEM		= 1,
+	RSC_TRACE		= 2,
+	RSC_VDEV		= 3,
+	RSC_LAST		= 4,
+	RSC_VENDOR_START	= 128,
+	RSC_VENDOR_END		= 512,
 };
 
 #define FW_RSC_ADDR_ANY (-1)
@@ -339,6 +343,16 @@ struct rproc_mem_entry {
 
 struct firmware;
 
+/**
+ * enum rsc_handling_status - return status of rproc_ops handle_rsc hook
+ * @RSC_HANDLED:	resource was handled
+ * @RSC_IGNORED:	resource was ignored
+ */
+enum rsc_handling_status {
+	RSC_HANDLED	= 0,
+	RSC_IGNORED	= 1,
+};
+
 /**
  * struct rproc_ops - platform-specific device handlers
  * @start:	power on the device and boot it
@@ -346,6 +360,10 @@ struct firmware;
  * @kick:	kick a virtqueue (virtqueue id given as a parameter)
  * @da_to_va:	optional platform hook to perform address translations
  * @parse_fw:	parse firmware to extract information (e.g. resource table)
+ * @handle_rsc:	optional platform hook to handle vendor resources. Should return
+ * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled and a
+ * negative value on error
+ * @load_rsc_table:	load resource table from firmware image
  * @find_loaded_rsc_table: find the loaded resouce table
  * @load:		load firmware to memory, where the remote processor
  *			expects to find it
@@ -358,6 +376,8 @@ struct rproc_ops {
 	void (*kick)(struct rproc *rproc, int vqid);
 	void * (*da_to_va)(struct rproc *rproc, u64 da, int len);
 	int (*parse_fw)(struct rproc *rproc, const struct firmware *fw);
+	int (*handle_rsc)(struct rproc *rproc, u32 rsc_type, void *rsc,
+			  int offset, int avail);
 	struct resource_table *(*find_loaded_rsc_table)(
 				struct rproc *rproc, const struct firmware *fw);
 	int (*load)(struct rproc *rproc, const struct firmware *fw);
-- 
cgit v1.2.3


From 360aa640a59f269b784848c0b2d6d462952750d9 Mon Sep 17 00:00:00 2001
From: Fabien Dessenne <fabien.dessenne@st.com>
Date: Thu, 7 Mar 2019 16:58:23 +0100
Subject: hwspinlock: add the 'in_atomic' API

Add the 'in_atomic' mode which can be called from an atomic context.
This mode relies on the existing 'raw' mode (no lock, no preemption/irq
disabling) with the difference that the timeout is not based on jiffies
(jiffies won't increase when irq are disabled) but handled with
busy-waiting udelay() calls.

Signed-off-by: Fabien Dessenne <fabien.dessenne@st.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 Documentation/hwspinlock.txt         | 39 +++++++++++++++++++++++
 drivers/hwspinlock/hwspinlock_core.c | 43 +++++++++++++++++--------
 include/linux/hwspinlock.h           | 61 ++++++++++++++++++++++++++++++++++--
 3 files changed, 127 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/Documentation/hwspinlock.txt b/Documentation/hwspinlock.txt
index c3403f9ae27a..6f03713b7003 100644
--- a/Documentation/hwspinlock.txt
+++ b/Documentation/hwspinlock.txt
@@ -151,6 +151,22 @@ notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
 
 The function will never sleep.
 
+::
+
+  int hwspin_lock_timeout_in_atomic(struct hwspinlock *hwlock, unsigned int to);
+
+Lock a previously-assigned hwspinlock with a timeout limit (specified in
+msecs). If the hwspinlock is already taken, the function will busy loop
+waiting for it to be released, but give up when the timeout elapses.
+
+This function shall be called only from an atomic context and the timeout
+value shall not exceed a few msecs.
+
+Returns 0 when successful and an appropriate error code otherwise (most
+notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
+
+The function will never sleep.
+
 ::
 
   int hwspin_trylock(struct hwspinlock *hwlock);
@@ -216,6 +232,19 @@ Returns 0 on success and an appropriate error code otherwise (most
 notably -EBUSY if the hwspinlock was already taken).
 The function will never sleep.
 
+::
+
+  int hwspin_trylock_in_atomic(struct hwspinlock *hwlock);
+
+Attempt to lock a previously-assigned hwspinlock, but immediately fail if
+it is already taken.
+
+This function shall be called only from an atomic context.
+
+Returns 0 on success and an appropriate error code otherwise (most
+notably -EBUSY if the hwspinlock was already taken).
+The function will never sleep.
+
 ::
 
   void hwspin_unlock(struct hwspinlock *hwlock);
@@ -262,6 +291,16 @@ The caller should **never** unlock an hwspinlock which is already unlocked.
 Doing so is considered a bug (there is no protection against this).
 This function will never sleep.
 
+::
+
+  void hwspin_unlock_in_atomic(struct hwspinlock *hwlock);
+
+Unlock a previously-locked hwspinlock.
+
+The caller should **never** unlock an hwspinlock which is already unlocked.
+Doing so is considered a bug (there is no protection against this).
+This function will never sleep.
+
 ::
 
   int hwspin_lock_get_id(struct hwspinlock *hwlock);
diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c
index d806307f19c2..8862445aa858 100644
--- a/drivers/hwspinlock/hwspinlock_core.c
+++ b/drivers/hwspinlock/hwspinlock_core.c
@@ -9,6 +9,7 @@
 
 #define pr_fmt(fmt)    "%s: " fmt, __func__
 
+#include <linux/delay.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
@@ -23,6 +24,9 @@
 
 #include "hwspinlock_internal.h"
 
+/* retry delay used in atomic context */
+#define HWSPINLOCK_RETRY_DELAY_US	100
+
 /* radix tree tags */
 #define HWSPINLOCK_UNUSED	(0) /* tags an hwspinlock as unused */
 
@@ -68,11 +72,11 @@ static DEFINE_MUTEX(hwspinlock_tree_lock);
  * user need some time-consuming or sleepable operations under the hardware
  * lock, they need one sleepable lock (like mutex) to protect the operations.
  *
- * If the mode is not HWLOCK_RAW, upon a successful return from this function,
- * preemption (and possibly interrupts) is disabled, so the caller must not
- * sleep, and is advised to release the hwspinlock as soon as possible. This is
- * required in order to minimize remote cores polling on the hardware
- * interconnect.
+ * If the mode is neither HWLOCK_IN_ATOMIC nor HWLOCK_RAW, upon a successful
+ * return from this function, preemption (and possibly interrupts) is disabled,
+ * so the caller must not sleep, and is advised to release the hwspinlock as
+ * soon as possible. This is required in order to minimize remote cores polling
+ * on the hardware interconnect.
  *
  * The user decides whether local interrupts are disabled or not, and if yes,
  * whether he wants their previous state to be saved. It is up to the user
@@ -112,6 +116,7 @@ int __hwspin_trylock(struct hwspinlock *hwlock, int mode, unsigned long *flags)
 		ret = spin_trylock_irq(&hwlock->lock);
 		break;
 	case HWLOCK_RAW:
+	case HWLOCK_IN_ATOMIC:
 		ret = 1;
 		break;
 	default:
@@ -136,6 +141,7 @@ int __hwspin_trylock(struct hwspinlock *hwlock, int mode, unsigned long *flags)
 			spin_unlock_irq(&hwlock->lock);
 			break;
 		case HWLOCK_RAW:
+		case HWLOCK_IN_ATOMIC:
 			/* Nothing to do */
 			break;
 		default:
@@ -179,11 +185,14 @@ EXPORT_SYMBOL_GPL(__hwspin_trylock);
  * user need some time-consuming or sleepable operations under the hardware
  * lock, they need one sleepable lock (like mutex) to protect the operations.
  *
- * If the mode is not HWLOCK_RAW, upon a successful return from this function,
- * preemption is disabled (and possibly local interrupts, too), so the caller
- * must not sleep, and is advised to release the hwspinlock as soon as possible.
- * This is required in order to minimize remote cores polling on the
- * hardware interconnect.
+ * If the mode is HWLOCK_IN_ATOMIC (called from an atomic context) the timeout
+ * is handled with busy-waiting delays, hence shall not exceed few msecs.
+ *
+ * If the mode is neither HWLOCK_IN_ATOMIC nor HWLOCK_RAW, upon a successful
+ * return from this function, preemption (and possibly interrupts) is disabled,
+ * so the caller must not sleep, and is advised to release the hwspinlock as
+ * soon as possible. This is required in order to minimize remote cores polling
+ * on the hardware interconnect.
  *
  * The user decides whether local interrupts are disabled or not, and if yes,
  * whether he wants their previous state to be saved. It is up to the user
@@ -198,7 +207,7 @@ int __hwspin_lock_timeout(struct hwspinlock *hwlock, unsigned int to,
 					int mode, unsigned long *flags)
 {
 	int ret;
-	unsigned long expire;
+	unsigned long expire, atomic_delay = 0;
 
 	expire = msecs_to_jiffies(to) + jiffies;
 
@@ -212,8 +221,15 @@ int __hwspin_lock_timeout(struct hwspinlock *hwlock, unsigned int to,
 		 * The lock is already taken, let's check if the user wants
 		 * us to try again
 		 */
-		if (time_is_before_eq_jiffies(expire))
-			return -ETIMEDOUT;
+		if (mode == HWLOCK_IN_ATOMIC) {
+			udelay(HWSPINLOCK_RETRY_DELAY_US);
+			atomic_delay += HWSPINLOCK_RETRY_DELAY_US;
+			if (atomic_delay > to * 1000)
+				return -ETIMEDOUT;
+		} else {
+			if (time_is_before_eq_jiffies(expire))
+				return -ETIMEDOUT;
+		}
 
 		/*
 		 * Allow platform-specific relax handlers to prevent
@@ -276,6 +292,7 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags)
 		spin_unlock_irq(&hwlock->lock);
 		break;
 	case HWLOCK_RAW:
+	case HWLOCK_IN_ATOMIC:
 		/* Nothing to do */
 		break;
 	default:
diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h
index 0afe693be5f4..bfe7c1f1ac6d 100644
--- a/include/linux/hwspinlock.h
+++ b/include/linux/hwspinlock.h
@@ -14,9 +14,10 @@
 #include <linux/sched.h>
 
 /* hwspinlock mode argument */
-#define HWLOCK_IRQSTATE	0x01	/* Disable interrupts, save state */
-#define HWLOCK_IRQ	0x02	/* Disable interrupts, don't save state */
-#define HWLOCK_RAW	0x03
+#define HWLOCK_IRQSTATE		0x01 /* Disable interrupts, save state */
+#define HWLOCK_IRQ		0x02 /* Disable interrupts, don't save state */
+#define HWLOCK_RAW		0x03
+#define HWLOCK_IN_ATOMIC	0x04 /* Called while in atomic context */
 
 struct device;
 struct device_node;
@@ -222,6 +223,23 @@ static inline int hwspin_trylock_raw(struct hwspinlock *hwlock)
 	return __hwspin_trylock(hwlock, HWLOCK_RAW, NULL);
 }
 
+/**
+ * hwspin_trylock_in_atomic() - attempt to lock a specific hwspinlock
+ * @hwlock: an hwspinlock which we want to trylock
+ *
+ * This function attempts to lock an hwspinlock, and will immediately fail
+ * if the hwspinlock is already taken.
+ *
+ * This function shall be called only from an atomic context.
+ *
+ * Returns 0 if we successfully locked the hwspinlock, -EBUSY if
+ * the hwspinlock was already taken, and -EINVAL if @hwlock is invalid.
+ */
+static inline int hwspin_trylock_in_atomic(struct hwspinlock *hwlock)
+{
+	return __hwspin_trylock(hwlock, HWLOCK_IN_ATOMIC, NULL);
+}
+
 /**
  * hwspin_trylock() - attempt to lock a specific hwspinlock
  * @hwlock: an hwspinlock which we want to trylock
@@ -312,6 +330,28 @@ int hwspin_lock_timeout_raw(struct hwspinlock *hwlock, unsigned int to)
 	return __hwspin_lock_timeout(hwlock, to, HWLOCK_RAW, NULL);
 }
 
+/**
+ * hwspin_lock_timeout_in_atomic() - lock an hwspinlock with timeout limit
+ * @hwlock: the hwspinlock to be locked
+ * @to: timeout value in msecs
+ *
+ * This function locks the underlying @hwlock. If the @hwlock
+ * is already taken, the function will busy loop waiting for it to
+ * be released, but give up when @timeout msecs have elapsed.
+ *
+ * This function shall be called only from an atomic context and the timeout
+ * value shall not exceed a few msecs.
+ *
+ * Returns 0 when the @hwlock was successfully taken, and an appropriate
+ * error code otherwise (most notably an -ETIMEDOUT if the @hwlock is still
+ * busy after @timeout msecs). The function will never sleep.
+ */
+static inline
+int hwspin_lock_timeout_in_atomic(struct hwspinlock *hwlock, unsigned int to)
+{
+	return __hwspin_lock_timeout(hwlock, to, HWLOCK_IN_ATOMIC, NULL);
+}
+
 /**
  * hwspin_lock_timeout() - lock an hwspinlock with timeout limit
  * @hwlock: the hwspinlock to be locked
@@ -386,6 +426,21 @@ static inline void hwspin_unlock_raw(struct hwspinlock *hwlock)
 	__hwspin_unlock(hwlock, HWLOCK_RAW, NULL);
 }
 
+/**
+ * hwspin_unlock_in_atomic() - unlock hwspinlock
+ * @hwlock: a previously-acquired hwspinlock which we want to unlock
+ *
+ * This function will unlock a specific hwspinlock.
+ *
+ * @hwlock must be already locked (e.g. by hwspin_trylock()) before calling
+ * this function: it is a bug to call unlock on a @hwlock that is already
+ * unlocked.
+ */
+static inline void hwspin_unlock_in_atomic(struct hwspinlock *hwlock)
+{
+	__hwspin_unlock(hwlock, HWLOCK_IN_ATOMIC, NULL);
+}
+
 /**
  * hwspin_unlock() - unlock hwspinlock
  * @hwlock: a previously-acquired hwspinlock which we want to unlock
-- 
cgit v1.2.3


From 0828c1001399d5c5fcab547ef7b0a29c78d4bdf6 Mon Sep 17 00:00:00 2001
From: Jeffrey Hugo <jeffrey.l.hugo@gmail.com>
Date: Sun, 30 Jun 2019 00:14:52 -0700
Subject: Input: elan_i2c - export the device id whitelist

Elan_i2c and hid-quirks work in conjunction to decide which devices each
driver will handle.  Elan_i2c has a whitelist of devices that should be
consumed by hid-quirks so that there is one master list of devices to
handoff between the drivers.  Put the ids in a header file so that
hid-quirks can consume it instead of duplicating the list.

Signed-off-by: Jeffrey Hugo <jeffrey.l.hugo@gmail.com>
Acked-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/mouse/elan_i2c_core.c | 50 +-----------------------
 include/linux/input/elan-i2c-ids.h  | 76 +++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 49 deletions(-)
 create mode 100644 include/linux/input/elan-i2c-ids.h

(limited to 'include')

diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c
index 65cd325eabc3..e2c824abd19c 100644
--- a/drivers/input/mouse/elan_i2c_core.c
+++ b/drivers/input/mouse/elan_i2c_core.c
@@ -37,6 +37,7 @@
 #include <linux/completion.h>
 #include <linux/of.h>
 #include <linux/property.h>
+#include <linux/input/elan-i2c-ids.h>
 #include <linux/regulator/consumer.h>
 #include <asm/unaligned.h>
 
@@ -1375,55 +1376,6 @@ static const struct i2c_device_id elan_id[] = {
 MODULE_DEVICE_TABLE(i2c, elan_id);
 
 #ifdef CONFIG_ACPI
-static const struct acpi_device_id elan_acpi_id[] = {
-	{ "ELAN0000", 0 },
-	{ "ELAN0100", 0 },
-	{ "ELAN0600", 0 },
-	{ "ELAN0601", 0 },
-	{ "ELAN0602", 0 },
-	{ "ELAN0603", 0 },
-	{ "ELAN0604", 0 },
-	{ "ELAN0605", 0 },
-	{ "ELAN0606", 0 },
-	{ "ELAN0607", 0 },
-	{ "ELAN0608", 0 },
-	{ "ELAN0609", 0 },
-	{ "ELAN060B", 0 },
-	{ "ELAN060C", 0 },
-	{ "ELAN060F", 0 },
-	{ "ELAN0610", 0 },
-	{ "ELAN0611", 0 },
-	{ "ELAN0612", 0 },
-	{ "ELAN0615", 0 },
-	{ "ELAN0616", 0 },
-	{ "ELAN0617", 0 },
-	{ "ELAN0618", 0 },
-	{ "ELAN0619", 0 },
-	{ "ELAN061A", 0 },
-	{ "ELAN061B", 0 },
-	{ "ELAN061C", 0 },
-	{ "ELAN061D", 0 },
-	{ "ELAN061E", 0 },
-	{ "ELAN061F", 0 },
-	{ "ELAN0620", 0 },
-	{ "ELAN0621", 0 },
-	{ "ELAN0622", 0 },
-	{ "ELAN0623", 0 },
-	{ "ELAN0624", 0 },
-	{ "ELAN0625", 0 },
-	{ "ELAN0626", 0 },
-	{ "ELAN0627", 0 },
-	{ "ELAN0628", 0 },
-	{ "ELAN0629", 0 },
-	{ "ELAN062A", 0 },
-	{ "ELAN062B", 0 },
-	{ "ELAN062C", 0 },
-	{ "ELAN062D", 0 },
-	{ "ELAN0631", 0 },
-	{ "ELAN0632", 0 },
-	{ "ELAN1000", 0 },
-	{ }
-};
 MODULE_DEVICE_TABLE(acpi, elan_acpi_id);
 #endif
 
diff --git a/include/linux/input/elan-i2c-ids.h b/include/linux/input/elan-i2c-ids.h
new file mode 100644
index 000000000000..ceabb01a6a7d
--- /dev/null
+++ b/include/linux/input/elan-i2c-ids.h
@@ -0,0 +1,76 @@
+/*
+ * Elan I2C/SMBus Touchpad device whitelist
+ *
+ * Copyright (c) 2013 ELAN Microelectronics Corp.
+ *
+ * Author: æ維 (Duson Lin) <dusonlin@emc.com.tw>
+ * Author: KT Liao <kt.liao@emc.com.tw>
+ * Version: 1.6.3
+ *
+ * Based on cyapa driver:
+ * copyright (c) 2011-2012 Cypress Semiconductor, Inc.
+ * copyright (c) 2011-2012 Google, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Trademarks are the property of their respective owners.
+ */
+
+#ifndef __ELAN_I2C_IDS_H
+#define __ELAN_I2C_IDS_H
+
+#include <linux/mod_devicetable.h>
+
+static const struct acpi_device_id elan_acpi_id[] = {
+	{ "ELAN0000", 0 },
+	{ "ELAN0100", 0 },
+	{ "ELAN0600", 0 },
+	{ "ELAN0601", 0 },
+	{ "ELAN0602", 0 },
+	{ "ELAN0603", 0 },
+	{ "ELAN0604", 0 },
+	{ "ELAN0605", 0 },
+	{ "ELAN0606", 0 },
+	{ "ELAN0607", 0 },
+	{ "ELAN0608", 0 },
+	{ "ELAN0609", 0 },
+	{ "ELAN060B", 0 },
+	{ "ELAN060C", 0 },
+	{ "ELAN060F", 0 },
+	{ "ELAN0610", 0 },
+	{ "ELAN0611", 0 },
+	{ "ELAN0612", 0 },
+	{ "ELAN0615", 0 },
+	{ "ELAN0616", 0 },
+	{ "ELAN0617", 0 },
+	{ "ELAN0618", 0 },
+	{ "ELAN0619", 0 },
+	{ "ELAN061A", 0 },
+	{ "ELAN061B", 0 },
+	{ "ELAN061C", 0 },
+	{ "ELAN061D", 0 },
+	{ "ELAN061E", 0 },
+	{ "ELAN061F", 0 },
+	{ "ELAN0620", 0 },
+	{ "ELAN0621", 0 },
+	{ "ELAN0622", 0 },
+	{ "ELAN0623", 0 },
+	{ "ELAN0624", 0 },
+	{ "ELAN0625", 0 },
+	{ "ELAN0626", 0 },
+	{ "ELAN0627", 0 },
+	{ "ELAN0628", 0 },
+	{ "ELAN0629", 0 },
+	{ "ELAN062A", 0 },
+	{ "ELAN062B", 0 },
+	{ "ELAN062C", 0 },
+	{ "ELAN062D", 0 },
+	{ "ELAN0631", 0 },
+	{ "ELAN0632", 0 },
+	{ "ELAN1000", 0 },
+	{ }
+};
+
+#endif /* __ELAN_I2C_IDS_H */
-- 
cgit v1.2.3


From c0898fca3fce00f824e7f5d48e1edd0900378a03 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 28 Jun 2019 14:16:45 +0200
Subject: drm/selftests: reduce stack usage

Putting a large drm_connector object on the stack can lead to warnings
in some configuration, such as:

drivers/gpu/drm/selftests/test-drm_cmdline_parser.c:18:12: error: stack frame size of 1040 bytes in function 'drm_cmdline_test_res' [-Werror,-Wframe-larger-than=]
static int drm_cmdline_test_res(void *ignored)

Since the object is never modified, just declare it as 'static const'
and allow this to be passed down.

Fixes: b7ced38916a9 ("drm/selftests: Add command line parser selftests")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190628121712.1928142-1-arnd@arndb.de
---
 drivers/gpu/drm/drm_modes.c                        |   8 +-
 .../gpu/drm/selftests/test-drm_cmdline_parser.c    | 136 ++++++++-------------
 include/drm/drm_modes.h                            |   2 +-
 3 files changed, 53 insertions(+), 93 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
index 57e6408288c8..910561d4f071 100644
--- a/drivers/gpu/drm/drm_modes.c
+++ b/drivers/gpu/drm/drm_modes.c
@@ -1448,7 +1448,7 @@ static int drm_mode_parse_cmdline_refresh(const char *str, char **end_ptr,
 }
 
 static int drm_mode_parse_cmdline_extra(const char *str, int length,
-					struct drm_connector *connector,
+					const struct drm_connector *connector,
 					struct drm_cmdline_mode *mode)
 {
 	int i;
@@ -1493,7 +1493,7 @@ static int drm_mode_parse_cmdline_extra(const char *str, int length,
 
 static int drm_mode_parse_cmdline_res_mode(const char *str, unsigned int length,
 					   bool extras,
-					   struct drm_connector *connector,
+					   const struct drm_connector *connector,
 					   struct drm_cmdline_mode *mode)
 {
 	const char *str_start = str;
@@ -1555,7 +1555,7 @@ static int drm_mode_parse_cmdline_res_mode(const char *str, unsigned int length,
 }
 
 static int drm_mode_parse_cmdline_options(char *str, size_t len,
-					  struct drm_connector *connector,
+					  const struct drm_connector *connector,
 					  struct drm_cmdline_mode *mode)
 {
 	unsigned int rotation = 0;
@@ -1689,7 +1689,7 @@ static int drm_mode_parse_cmdline_options(char *str, size_t len,
  * True if a valid modeline has been parsed, false otherwise.
  */
 bool drm_mode_parse_command_line_for_connector(const char *mode_option,
-					       struct drm_connector *connector,
+					       const struct drm_connector *connector,
 					       struct drm_cmdline_mode *mode)
 {
 	const char *name;
diff --git a/drivers/gpu/drm/selftests/test-drm_cmdline_parser.c b/drivers/gpu/drm/selftests/test-drm_cmdline_parser.c
index bef4edde6f9f..14c96edb13df 100644
--- a/drivers/gpu/drm/selftests/test-drm_cmdline_parser.c
+++ b/drivers/gpu/drm/selftests/test-drm_cmdline_parser.c
@@ -15,13 +15,14 @@
 #include "drm_selftest.h"
 #include "test-drm_modeset_common.h"
 
+static const struct drm_connector no_connector = {};
+
 static int drm_cmdline_test_res(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -42,11 +43,10 @@ static int drm_cmdline_test_res(void *ignored)
 
 static int drm_cmdline_test_res_missing_x(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("x480",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -54,11 +54,10 @@ static int drm_cmdline_test_res_missing_x(void *ignored)
 
 static int drm_cmdline_test_res_missing_y(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("1024x",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -66,11 +65,10 @@ static int drm_cmdline_test_res_missing_y(void *ignored)
 
 static int drm_cmdline_test_res_bad_y(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("1024xtest",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -78,11 +76,10 @@ static int drm_cmdline_test_res_bad_y(void *ignored)
 
 static int drm_cmdline_test_res_missing_y_bpp(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("1024x-24",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -90,11 +87,10 @@ static int drm_cmdline_test_res_missing_y_bpp(void *ignored)
 
 static int drm_cmdline_test_res_vesa(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480M",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -115,11 +111,10 @@ static int drm_cmdline_test_res_vesa(void *ignored)
 
 static int drm_cmdline_test_res_vesa_rblank(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480MR",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -140,11 +135,10 @@ static int drm_cmdline_test_res_vesa_rblank(void *ignored)
 
 static int drm_cmdline_test_res_rblank(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480R",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -165,11 +159,10 @@ static int drm_cmdline_test_res_rblank(void *ignored)
 
 static int drm_cmdline_test_res_bpp(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480-24",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -191,11 +184,10 @@ static int drm_cmdline_test_res_bpp(void *ignored)
 
 static int drm_cmdline_test_res_bad_bpp(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("720x480-test",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -203,11 +195,10 @@ static int drm_cmdline_test_res_bad_bpp(void *ignored)
 
 static int drm_cmdline_test_res_refresh(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480@60",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -229,11 +220,10 @@ static int drm_cmdline_test_res_refresh(void *ignored)
 
 static int drm_cmdline_test_res_bad_refresh(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("720x480@refresh",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -241,11 +231,10 @@ static int drm_cmdline_test_res_bad_refresh(void *ignored)
 
 static int drm_cmdline_test_res_bpp_refresh(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480-24@60",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -268,11 +257,10 @@ static int drm_cmdline_test_res_bpp_refresh(void *ignored)
 
 static int drm_cmdline_test_res_bpp_refresh_interlaced(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480-24@60i",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -295,11 +283,10 @@ static int drm_cmdline_test_res_bpp_refresh_interlaced(void *ignored)
 
 static int drm_cmdline_test_res_bpp_refresh_margins(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480-24@60m",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -322,11 +309,10 @@ static int drm_cmdline_test_res_bpp_refresh_margins(void *ignored)
 
 static int drm_cmdline_test_res_bpp_refresh_force_off(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480-24@60d",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -349,11 +335,10 @@ static int drm_cmdline_test_res_bpp_refresh_force_off(void *ignored)
 
 static int drm_cmdline_test_res_bpp_refresh_force_on_off(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("720x480-24@60de",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -361,11 +346,10 @@ static int drm_cmdline_test_res_bpp_refresh_force_on_off(void *ignored)
 
 static int drm_cmdline_test_res_bpp_refresh_force_on(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480-24@60e",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -388,11 +372,10 @@ static int drm_cmdline_test_res_bpp_refresh_force_on(void *ignored)
 
 static int drm_cmdline_test_res_bpp_refresh_force_on_analog(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480-24@60D",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -415,10 +398,11 @@ static int drm_cmdline_test_res_bpp_refresh_force_on_analog(void *ignored)
 
 static int drm_cmdline_test_res_bpp_refresh_force_on_digital(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
+	static const struct drm_connector connector = {
+		.connector_type = DRM_MODE_CONNECTOR_DVII,
+	};
 
-	connector.connector_type = DRM_MODE_CONNECTOR_DVII;
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480-24@60D",
 							   &connector,
 							   &mode));
@@ -443,11 +427,10 @@ static int drm_cmdline_test_res_bpp_refresh_force_on_digital(void *ignored)
 
 static int drm_cmdline_test_res_bpp_refresh_interlaced_margins_force_on(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480-24@60ime",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -470,11 +453,10 @@ static int drm_cmdline_test_res_bpp_refresh_interlaced_margins_force_on(void *ig
 
 static int drm_cmdline_test_res_margins_force_on(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480me",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -495,11 +477,10 @@ static int drm_cmdline_test_res_margins_force_on(void *ignored)
 
 static int drm_cmdline_test_res_vesa_margins(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480Mm",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -520,11 +501,10 @@ static int drm_cmdline_test_res_vesa_margins(void *ignored)
 
 static int drm_cmdline_test_res_invalid_mode(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("720x480f",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -532,11 +512,10 @@ static int drm_cmdline_test_res_invalid_mode(void *ignored)
 
 static int drm_cmdline_test_res_bpp_wrong_place_mode(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("720x480e-24",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -544,11 +523,10 @@ static int drm_cmdline_test_res_bpp_wrong_place_mode(void *ignored)
 
 static int drm_cmdline_test_name(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("NTSC",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(strcmp(mode.name, "NTSC"));
 	FAIL_ON(mode.refresh_specified);
@@ -559,11 +537,10 @@ static int drm_cmdline_test_name(void *ignored)
 
 static int drm_cmdline_test_name_bpp(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("NTSC-24",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(strcmp(mode.name, "NTSC"));
 
@@ -577,11 +554,10 @@ static int drm_cmdline_test_name_bpp(void *ignored)
 
 static int drm_cmdline_test_name_bpp_refresh(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("NTSC-24@60",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -589,11 +565,10 @@ static int drm_cmdline_test_name_bpp_refresh(void *ignored)
 
 static int drm_cmdline_test_name_refresh(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("NTSC@60",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -601,11 +576,10 @@ static int drm_cmdline_test_name_refresh(void *ignored)
 
 static int drm_cmdline_test_name_refresh_wrong_mode(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("NTSC@60m",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -613,11 +587,10 @@ static int drm_cmdline_test_name_refresh_wrong_mode(void *ignored)
 
 static int drm_cmdline_test_name_refresh_invalid_mode(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("NTSC@60f",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -625,11 +598,10 @@ static int drm_cmdline_test_name_refresh_invalid_mode(void *ignored)
 
 static int drm_cmdline_test_name_option(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("NTSC,rotate=180",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(strcmp(mode.name, "NTSC"));
@@ -640,11 +612,10 @@ static int drm_cmdline_test_name_option(void *ignored)
 
 static int drm_cmdline_test_name_bpp_option(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("NTSC-24,rotate=180",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(strcmp(mode.name, "NTSC"));
@@ -657,11 +628,10 @@ static int drm_cmdline_test_name_bpp_option(void *ignored)
 
 static int drm_cmdline_test_rotate_0(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480,rotate=0",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -683,11 +653,10 @@ static int drm_cmdline_test_rotate_0(void *ignored)
 
 static int drm_cmdline_test_rotate_90(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480,rotate=90",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -709,11 +678,10 @@ static int drm_cmdline_test_rotate_90(void *ignored)
 
 static int drm_cmdline_test_rotate_180(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480,rotate=180",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -735,11 +703,10 @@ static int drm_cmdline_test_rotate_180(void *ignored)
 
 static int drm_cmdline_test_rotate_270(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480,rotate=270",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -761,11 +728,10 @@ static int drm_cmdline_test_rotate_270(void *ignored)
 
 static int drm_cmdline_test_rotate_invalid_val(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("720x480,rotate=42",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -773,11 +739,10 @@ static int drm_cmdline_test_rotate_invalid_val(void *ignored)
 
 static int drm_cmdline_test_rotate_truncated(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("720x480,rotate=",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
@@ -785,11 +750,10 @@ static int drm_cmdline_test_rotate_truncated(void *ignored)
 
 static int drm_cmdline_test_hmirror(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480,reflect_x",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -811,11 +775,10 @@ static int drm_cmdline_test_hmirror(void *ignored)
 
 static int drm_cmdline_test_vmirror(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480,reflect_y",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -837,11 +800,10 @@ static int drm_cmdline_test_vmirror(void *ignored)
 
 static int drm_cmdline_test_margin_options(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480,margin_right=14,margin_left=24,margin_bottom=36,margin_top=42",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -866,11 +828,10 @@ static int drm_cmdline_test_margin_options(void *ignored)
 
 static int drm_cmdline_test_multiple_options(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(!drm_mode_parse_command_line_for_connector("720x480,rotate=270,reflect_x",
-							   &connector,
+							   &no_connector,
 							   &mode));
 	FAIL_ON(!mode.specified);
 	FAIL_ON(mode.xres != 720);
@@ -892,11 +853,10 @@ static int drm_cmdline_test_multiple_options(void *ignored)
 
 static int drm_cmdline_test_invalid_option(void *ignored)
 {
-	struct drm_connector connector = { };
 	struct drm_cmdline_mode mode = { };
 
 	FAIL_ON(drm_mode_parse_command_line_for_connector("720x480,test=42",
-							  &connector,
+							  &no_connector,
 							  &mode));
 
 	return 0;
diff --git a/include/drm/drm_modes.h b/include/drm/drm_modes.h
index 083f16747369..e946e20c61d8 100644
--- a/include/drm/drm_modes.h
+++ b/include/drm/drm_modes.h
@@ -537,7 +537,7 @@ void drm_connector_list_update(struct drm_connector *connector);
 /* parsing cmdline modes */
 bool
 drm_mode_parse_command_line_for_connector(const char *mode_option,
-					  struct drm_connector *connector,
+					  const struct drm_connector *connector,
 					  struct drm_cmdline_mode *mode);
 struct drm_display_mode *
 drm_mode_create_from_cmdline_mode(struct drm_device *dev,
-- 
cgit v1.2.3


From 480b9b4d847fe18f4a559f5cac718d9b2cbcdcaf Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 29 Apr 2019 14:03:33 +0800
Subject: btrfs: extent-tree: Add trace events for space info numbers update

Add trace event for update_bytes_pinned() and update_bytes_may_use() to
detect underflow better.

The output would be something like (only showing data part):

  ## Buffered write start, 16K total ##
  2255.954 xfs_io/860 btrfs:update_bytes_may_use:(nil)U: type=DATA old=0 diff=4096
  2257.169 sudo/860 btrfs:update_bytes_may_use:(nil)U: type=DATA old=4096 diff=4096
  2257.346 sudo/860 btrfs:update_bytes_may_use:(nil)U: type=DATA old=8192 diff=4096
  2257.542 sudo/860 btrfs:update_bytes_may_use:(nil)U: type=DATA old=12288 diff=4096

  ## Delalloc start ##
  3727.853 kworker/u8:3-e/700 btrfs:update_bytes_may_use:(nil)U: type=DATA old=16384 diff=-16384

  ## Space cache update ##
  3733.132 sudo/862 btrfs:update_bytes_may_use:(nil)U: type=DATA old=0 diff=65536
  3733.169 sudo/862 btrfs:update_bytes_may_use:(nil)U: type=DATA old=65536 diff=-65536
  3739.868 sudo/862 btrfs:update_bytes_may_use:(nil)U: type=DATA old=0 diff=65536
  3739.891 sudo/862 btrfs:update_bytes_may_use:(nil)U: type=DATA old=65536 diff=-65536

These two trace events will allow bcc tool to probe btrfs_space_info
changes and detect underflow with more details (e.g. backtrace for each
update).

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c       | 36 ++++++++++++++++++++----------------
 include/trace/events/btrfs.h | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f37daddbeabe..f63dfb777e4a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -55,10 +55,12 @@ enum {
  * Declare a helper function to detect underflow of various space info members
  */
 #define DECLARE_SPACE_INFO_UPDATE(name)					\
-static inline void update_##name(struct btrfs_space_info *sinfo,	\
+static inline void update_##name(struct btrfs_fs_info *fs_info,		\
+				 struct btrfs_space_info *sinfo,	\
 				 s64 bytes)				\
 {									\
 	lockdep_assert_held(&sinfo->lock);				\
+	trace_update_##name(fs_info, sinfo, sinfo->name, bytes);	\
 	if (bytes < 0 && sinfo->name < -bytes) {			\
 		WARN_ON(1);						\
 		sinfo->name = 0;					\
@@ -4209,7 +4211,7 @@ commit_trans:
 					      data_sinfo->flags, bytes, 1);
 		return -ENOSPC;
 	}
-	update_bytes_may_use(data_sinfo, bytes);
+	update_bytes_may_use(fs_info, data_sinfo, bytes);
 	trace_btrfs_space_reservation(fs_info, "space_info",
 				      data_sinfo->flags, bytes, 1);
 	spin_unlock(&data_sinfo->lock);
@@ -4262,7 +4264,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 
 	data_sinfo = fs_info->data_sinfo;
 	spin_lock(&data_sinfo->lock);
-	update_bytes_may_use(data_sinfo, -len);
+	update_bytes_may_use(fs_info, data_sinfo, -len);
 	trace_btrfs_space_reservation(fs_info, "space_info",
 				      data_sinfo->flags, len, 0);
 	spin_unlock(&data_sinfo->lock);
@@ -5169,13 +5171,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 	 * If not things get more complicated.
 	 */
 	if (used + orig_bytes <= space_info->total_bytes) {
-		update_bytes_may_use(space_info, orig_bytes);
+		update_bytes_may_use(fs_info, space_info, orig_bytes);
 		trace_btrfs_space_reservation(fs_info, "space_info",
 					      space_info->flags, orig_bytes, 1);
 		ret = 0;
 	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
 				  system_chunk)) {
-		update_bytes_may_use(space_info, orig_bytes);
+		update_bytes_may_use(fs_info, space_info, orig_bytes);
 		trace_btrfs_space_reservation(fs_info, "space_info",
 					      space_info->flags, orig_bytes, 1);
 		ret = 0;
@@ -5503,7 +5505,7 @@ again:
 		flush = BTRFS_RESERVE_FLUSH_ALL;
 		goto again;
 	}
-	update_bytes_may_use(space_info, -num_bytes);
+	update_bytes_may_use(fs_info, space_info, -num_bytes);
 	trace_btrfs_space_reservation(fs_info, "space_info",
 				      space_info->flags, num_bytes, 0);
 	spin_unlock(&space_info->lock);
@@ -5531,7 +5533,8 @@ again:
 						      ticket->bytes, 1);
 			list_del_init(&ticket->list);
 			num_bytes -= ticket->bytes;
-			update_bytes_may_use(space_info, ticket->bytes);
+			update_bytes_may_use(fs_info, space_info,
+					     ticket->bytes);
 			ticket->bytes = 0;
 			space_info->tickets_id++;
 			wake_up(&ticket->wait);
@@ -5539,7 +5542,7 @@ again:
 			trace_btrfs_space_reservation(fs_info, "space_info",
 						      space_info->flags,
 						      num_bytes, 1);
-			update_bytes_may_use(space_info, num_bytes);
+			update_bytes_may_use(fs_info, space_info, num_bytes);
 			ticket->bytes -= num_bytes;
 			num_bytes = 0;
 		}
@@ -5832,14 +5835,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 			num_bytes = min(num_bytes,
 					block_rsv->size - block_rsv->reserved);
 			block_rsv->reserved += num_bytes;
-			update_bytes_may_use(sinfo, num_bytes);
+			update_bytes_may_use(fs_info, sinfo, num_bytes);
 			trace_btrfs_space_reservation(fs_info, "space_info",
 						      sinfo->flags, num_bytes,
 						      1);
 		}
 	} else if (block_rsv->reserved > block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
-		update_bytes_may_use(sinfo, -num_bytes);
+		update_bytes_may_use(fs_info, sinfo, -num_bytes);
 		trace_btrfs_space_reservation(fs_info, "space_info",
 				      sinfo->flags, num_bytes, 0);
 		block_rsv->reserved = block_rsv->size;
@@ -6302,7 +6305,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			old_val -= num_bytes;
 			btrfs_set_block_group_used(&cache->item, old_val);
 			cache->pinned += num_bytes;
-			update_bytes_pinned(cache->space_info, num_bytes);
+			update_bytes_pinned(info, cache->space_info, num_bytes);
 			cache->space_info->bytes_used -= num_bytes;
 			cache->space_info->disk_used -= num_bytes * factor;
 			spin_unlock(&cache->lock);
@@ -6377,7 +6380,7 @@ static int pin_down_extent(struct btrfs_block_group_cache *cache,
 	spin_lock(&cache->space_info->lock);
 	spin_lock(&cache->lock);
 	cache->pinned += num_bytes;
-	update_bytes_pinned(cache->space_info, num_bytes);
+	update_bytes_pinned(fs_info, cache->space_info, num_bytes);
 	if (reserved) {
 		cache->reserved -= num_bytes;
 		cache->space_info->bytes_reserved -= num_bytes;
@@ -6586,7 +6589,7 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
 	} else {
 		cache->reserved += num_bytes;
 		space_info->bytes_reserved += num_bytes;
-		update_bytes_may_use(space_info, -ram_bytes);
+		update_bytes_may_use(cache->fs_info, space_info, -ram_bytes);
 		if (delalloc)
 			cache->delalloc_bytes += num_bytes;
 	}
@@ -6742,7 +6745,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
 		spin_lock(&space_info->lock);
 		spin_lock(&cache->lock);
 		cache->pinned -= len;
-		update_bytes_pinned(space_info, -len);
+		update_bytes_pinned(fs_info, space_info, -len);
 
 		trace_btrfs_space_reservation(fs_info, "pinned",
 					      space_info->flags, len, 0);
@@ -6763,7 +6766,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
 				to_add = min(len, global_rsv->size -
 					     global_rsv->reserved);
 				global_rsv->reserved += to_add;
-				update_bytes_may_use(space_info, to_add);
+				update_bytes_may_use(fs_info, space_info,
+						     to_add);
 				if (global_rsv->reserved >= global_rsv->size)
 					global_rsv->full = 1;
 				trace_btrfs_space_reservation(fs_info,
@@ -11024,7 +11028,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		spin_lock(&space_info->lock);
 		spin_lock(&block_group->lock);
 
-		update_bytes_pinned(space_info, -block_group->pinned);
+		update_bytes_pinned(fs_info, space_info, -block_group->pinned);
 		space_info->bytes_readonly += block_group->pinned;
 		percpu_counter_add_batch(&space_info->total_bytes_pinned,
 				   -block_group->pinned,
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index f9eff010fc7e..2f6a669408bb 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -29,6 +29,7 @@ struct btrfs_qgroup_extent_record;
 struct btrfs_qgroup;
 struct extent_io_tree;
 struct prelim_ref;
+struct btrfs_space_info;
 
 TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS_NR);
 TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS);
@@ -2091,6 +2092,45 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock);
 DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock);
 DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic);
 
+DECLARE_EVENT_CLASS(btrfs__space_info_update,
+
+	TP_PROTO(struct btrfs_fs_info *fs_info,
+		 struct btrfs_space_info *sinfo, u64 old, s64 diff),
+
+	TP_ARGS(fs_info, sinfo, old, diff),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,	type		)
+		__field(	u64,	old		)
+		__field(	s64,	diff		)
+	),
+
+	TP_fast_assign_btrfs(fs_info,
+		__entry->type	= sinfo->flags;
+		__entry->old	= old;
+		__entry->diff	= diff;
+	),
+	TP_printk_btrfs("type=%s old=%llu diff=%lld",
+		__print_flags(__entry->type, "|", BTRFS_GROUP_FLAGS),
+		__entry->old, __entry->diff)
+);
+
+DEFINE_EVENT(btrfs__space_info_update, update_bytes_may_use,
+
+	TP_PROTO(struct btrfs_fs_info *fs_info,
+		 struct btrfs_space_info *sinfo, u64 old, s64 diff),
+
+	TP_ARGS(fs_info, sinfo, old, diff)
+);
+
+DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned,
+
+	TP_PROTO(struct btrfs_fs_info *fs_info,
+		 struct btrfs_space_info *sinfo, u64 old, s64 diff),
+
+	TP_ARGS(fs_info, sinfo, old, diff)
+);
+
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 498b98e939007f8bb65094dfa229e84b6bf30e62 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Fri, 21 Jun 2019 18:21:45 -0700
Subject: soc: qcom: mdt_loader: Support loading non-split images

In some software releases the firmware images are not split up with each
loadable segment in it's own file. Check the size of the loaded firmware
to see if it still contains each segment to be loaded, before falling
back to the split-out segments.

Acked-by: Andy Gross <agross@kernel.org>
Reviewed-by: Jeffrey Hugo <jeffrey.l.hugo@gmail.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/soc/qcom/mdt_loader.c       | 88 +++++++++++++++++++++++++++++++++++--
 include/linux/soc/qcom/mdt_loader.h |  2 +
 2 files changed, 87 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/qcom/mdt_loader.c b/drivers/soc/qcom/mdt_loader.c
index 1c488024c698..1c970d6dd532 100644
--- a/drivers/soc/qcom/mdt_loader.c
+++ b/drivers/soc/qcom/mdt_loader.c
@@ -74,6 +74,66 @@ ssize_t qcom_mdt_get_size(const struct firmware *fw)
 }
 EXPORT_SYMBOL_GPL(qcom_mdt_get_size);
 
+/**
+ * qcom_mdt_read_metadata() - read header and metadata from mdt or mbn
+ * @fw:		firmware of mdt header or mbn
+ * @data_len:	length of the read metadata blob
+ *
+ * The mechanism that performs the authentication of the loading firmware
+ * expects an ELF header directly followed by the segment of hashes, with no
+ * padding inbetween. This function allocates a chunk of memory for this pair
+ * and copy the two pieces into the buffer.
+ *
+ * In the case of split firmware the hash is found directly following the ELF
+ * header, rather than at p_offset described by the second program header.
+ *
+ * The caller is responsible to free (kfree()) the returned pointer.
+ *
+ * Return: pointer to data, or ERR_PTR()
+ */
+void *qcom_mdt_read_metadata(const struct firmware *fw, size_t *data_len)
+{
+	const struct elf32_phdr *phdrs;
+	const struct elf32_hdr *ehdr;
+	size_t hash_offset;
+	size_t hash_size;
+	size_t ehdr_size;
+	void *data;
+
+	ehdr = (struct elf32_hdr *)fw->data;
+	phdrs = (struct elf32_phdr *)(ehdr + 1);
+
+	if (ehdr->e_phnum < 2)
+		return ERR_PTR(-EINVAL);
+
+	if (phdrs[0].p_type == PT_LOAD || phdrs[1].p_type == PT_LOAD)
+		return ERR_PTR(-EINVAL);
+
+	if ((phdrs[1].p_flags & QCOM_MDT_TYPE_MASK) != QCOM_MDT_TYPE_HASH)
+		return ERR_PTR(-EINVAL);
+
+	ehdr_size = phdrs[0].p_filesz;
+	hash_size = phdrs[1].p_filesz;
+
+	data = kmalloc(ehdr_size + hash_size, GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	/* Is the header and hash already packed */
+	if (ehdr_size + hash_size == fw->size)
+		hash_offset = phdrs[0].p_filesz;
+	else
+		hash_offset = phdrs[1].p_offset;
+
+	memcpy(data, fw->data, ehdr_size);
+	memcpy(data + ehdr_size, fw->data + hash_offset, hash_size);
+
+	*data_len = ehdr_size + hash_size;
+
+	return data;
+}
+EXPORT_SYMBOL_GPL(qcom_mdt_read_metadata);
+
 static int __qcom_mdt_load(struct device *dev, const struct firmware *fw,
 			   const char *firmware, int pas_id, void *mem_region,
 			   phys_addr_t mem_phys, size_t mem_size,
@@ -86,12 +146,14 @@ static int __qcom_mdt_load(struct device *dev, const struct firmware *fw,
 	phys_addr_t mem_reloc;
 	phys_addr_t min_addr = PHYS_ADDR_MAX;
 	phys_addr_t max_addr = 0;
+	size_t metadata_len;
 	size_t fw_name_len;
 	ssize_t offset;
+	void *metadata;
 	char *fw_name;
 	bool relocate = false;
 	void *ptr;
-	int ret;
+	int ret = 0;
 	int i;
 
 	if (!fw || !mem_region || !mem_phys || !mem_size)
@@ -109,7 +171,15 @@ static int __qcom_mdt_load(struct device *dev, const struct firmware *fw,
 		return -ENOMEM;
 
 	if (pas_init) {
-		ret = qcom_scm_pas_init_image(pas_id, fw->data, fw->size);
+		metadata = qcom_mdt_read_metadata(fw, &metadata_len);
+		if (IS_ERR(metadata)) {
+			ret = PTR_ERR(metadata);
+			goto out;
+		}
+
+		ret = qcom_scm_pas_init_image(pas_id, metadata, metadata_len);
+
+		kfree(metadata);
 		if (ret) {
 			dev_err(dev, "invalid firmware metadata\n");
 			goto out;
@@ -170,7 +240,19 @@ static int __qcom_mdt_load(struct device *dev, const struct firmware *fw,
 
 		ptr = mem_region + offset;
 
-		if (phdr->p_filesz) {
+		if (phdr->p_filesz && phdr->p_offset < fw->size) {
+			/* Firmware is large enough to be non-split */
+			if (phdr->p_offset + phdr->p_filesz > fw->size) {
+				dev_err(dev,
+					"failed to load segment %d from truncated file %s\n",
+					i, firmware);
+				ret = -EINVAL;
+				break;
+			}
+
+			memcpy(ptr, fw->data + phdr->p_offset, phdr->p_filesz);
+		} else if (phdr->p_filesz) {
+			/* Firmware not large enough, load split-out segments */
 			sprintf(fw_name + fw_name_len - 3, "b%02d", i);
 			ret = request_firmware_into_buf(&seg_fw, fw_name, dev,
 							ptr, phdr->p_filesz);
diff --git a/include/linux/soc/qcom/mdt_loader.h b/include/linux/soc/qcom/mdt_loader.h
index 944b06aefb0f..e600baec6825 100644
--- a/include/linux/soc/qcom/mdt_loader.h
+++ b/include/linux/soc/qcom/mdt_loader.h
@@ -21,4 +21,6 @@ int qcom_mdt_load_no_init(struct device *dev, const struct firmware *fw,
 			  const char *fw_name, int pas_id, void *mem_region,
 			  phys_addr_t mem_phys, size_t mem_size,
 			  phys_addr_t *reloc_base);
+void *qcom_mdt_read_metadata(const struct firmware *fw, size_t *data_len);
+
 #endif
-- 
cgit v1.2.3


From c7369b3faea230cf6009449147ed755c45e74afd Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 31 May 2019 15:39:31 +0200
Subject: btrfs: add mask for all RAID1 types

Preparatory patch for additional RAID1 profiles with more copies. The
mask will contain 3-copy and 4-copy, most of the checks for plain RAID1
work the same for the other profiles.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c          | 8 ++++----
 fs/btrfs/scrub.c                | 2 +-
 fs/btrfs/volumes.c              | 8 ++++----
 include/uapi/linux/btrfs_tree.h | 2 ++
 4 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f24ef9020323..13c17f94f15d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7873,7 +7873,7 @@ search:
 		 */
 		if (!block_group_bits(block_group, flags)) {
 			u64 extra = BTRFS_BLOCK_GROUP_DUP |
-				BTRFS_BLOCK_GROUP_RAID1 |
+				BTRFS_BLOCK_GROUP_RAID1_MASK |
 				BTRFS_BLOCK_GROUP_RAID5 |
 				BTRFS_BLOCK_GROUP_RAID6 |
 				BTRFS_BLOCK_GROUP_RAID10;
@@ -9564,7 +9564,7 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
 
 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
-		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+		BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
 
 	if (num_devices == 1) {
 		stripped |= BTRFS_BLOCK_GROUP_DUP;
@@ -9575,7 +9575,7 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
 			return stripped;
 
 		/* turn mirroring into duplication */
-		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+		if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
 			     BTRFS_BLOCK_GROUP_RAID10))
 			return stripped | BTRFS_BLOCK_GROUP_DUP;
 	} else {
@@ -10445,7 +10445,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	list_for_each_entry_rcu(space_info, &info->space_info, list) {
 		if (!(get_alloc_profile(info, space_info->flags) &
 		      (BTRFS_BLOCK_GROUP_RAID10 |
-		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_RAID1_MASK |
 		       BTRFS_BLOCK_GROUP_RAID5 |
 		       BTRFS_BLOCK_GROUP_RAID6 |
 		       BTRFS_BLOCK_GROUP_DUP)))
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 9f0297d529d4..0c99cf9fb595 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3091,7 +3091,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 		offset = map->stripe_len * (num / map->sub_stripes);
 		increment = map->stripe_len * factor;
 		mirror_num = num % map->sub_stripes + 1;
-	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
 		increment = map->stripe_len;
 		mirror_num = num % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 776f5c7ca7c5..9e5167a0e406 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5400,7 +5400,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		return 1;
 
 	map = em->map_lookup;
-	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
+	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		ret = map->sub_stripes;
@@ -5474,7 +5474,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	struct btrfs_device *srcdev;
 
 	ASSERT((map->type &
-		 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
+		 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
 
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		num_stripes = map->sub_stripes;
@@ -5663,7 +5663,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
 					      &remaining_stripes);
 		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
 		last_stripe *= sub_stripes;
-	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
 				BTRFS_BLOCK_GROUP_DUP)) {
 		num_stripes = map->num_stripes;
 	} else {
@@ -6035,7 +6035,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 				&stripe_index);
 		if (!need_full_stripe(op))
 			mirror_num = 1;
-	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
 		if (need_full_stripe(op))
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 421239b98db2..34d5b34286fa 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -866,6 +866,8 @@ enum btrfs_raid_types {
 #define BTRFS_BLOCK_GROUP_RAID56_MASK	(BTRFS_BLOCK_GROUP_RAID5 |   \
 					 BTRFS_BLOCK_GROUP_RAID6)
 
+#define BTRFS_BLOCK_GROUP_RAID1_MASK	(BTRFS_BLOCK_GROUP_RAID1)
+
 /*
  * We need a bit for restriper to be able to tell when chunks of type
  * SINGLE are available.  This "extended" profile format is used in
-- 
cgit v1.2.3


From 6987fd42239ad43937166dace2ed8fb260b14d25 Mon Sep 17 00:00:00 2001
From: Otto Sabart <ottosabart@seberm.com>
Date: Mon, 20 May 2019 10:06:26 +0100
Subject: mfd: madera: Fix bad reference to pinctrl.txt file

The pinctrl.txt file was converted into reStructuredText and moved into
driver-api folder. This patch updates the broken reference.

Fixes: 5a9b73832e9e ("pinctrl.txt: move it to the driver-api book")
Signed-off-by: Otto Sabart <ottosabart@seberm.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/madera/pdata.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mfd/madera/pdata.h b/include/linux/mfd/madera/pdata.h
index 8dc852402dbb..dd00ab824e5b 100644
--- a/include/linux/mfd/madera/pdata.h
+++ b/include/linux/mfd/madera/pdata.h
@@ -34,7 +34,8 @@ struct madera_codec_pdata;
  * @micvdd:	    Substruct of pdata for the MICVDD regulator
  * @irq_flags:	    Mode for primary IRQ (defaults to active low)
  * @gpio_base:	    Base GPIO number
- * @gpio_configs:   Array of GPIO configurations (See Documentation/pinctrl.txt)
+ * @gpio_configs:   Array of GPIO configurations (See
+ *		    Documentation/driver-api/pinctl.rst)
  * @n_gpio_configs: Number of entries in gpio_configs
  * @gpsw:	    General purpose switch mode setting. Depends on the external
  *		    hardware connected to the switch. (See the SW1_MODE field
-- 
cgit v1.2.3


From 1ef921b6d1b68887be22f02dabc6ae73c112dce4 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Thu, 30 May 2019 15:39:52 +0100
Subject: mfd: madera: Add Madera core support for CS47L15

This patch adds all the core support and defines for the Cirrus
Logic CS47L15 smart audio CODEC.

Registers or fields are named MADERA_* if it is part of the
common hardware platform and does not conflict with any other
Madera codecs. It is named CS47L15_* if it is unique to CS47L15
and conflicts with definitions on other codecs.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig                  |    7 +
 drivers/mfd/Makefile                 |    3 +
 drivers/mfd/cs47l15-tables.c         | 1301 ++++++++++++++++++++++++++++++++++
 drivers/mfd/madera-core.c            |   44 ++
 drivers/mfd/madera-i2c.c             |    7 +
 drivers/mfd/madera-spi.c             |    7 +
 drivers/mfd/madera.h                 |    6 +
 include/linux/mfd/madera/core.h      |    2 +
 include/linux/mfd/madera/registers.h |    5 +
 9 files changed, 1382 insertions(+)
 create mode 100644 drivers/mfd/cs47l15-tables.c

(limited to 'include')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 760b9e29c8e5..86ae0a11f631 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -260,6 +260,13 @@ config MFD_MADERA_SPI
 	  Support for the Cirrus Logic Madera platform audio SoC
 	  core functionality controlled via SPI.
 
+config MFD_CS47L15
+	bool "Cirrus Logic CS47L15"
+	select PINCTRL_CS47L15
+	depends on MFD_MADERA
+	help
+	  Support for Cirrus Logic CS47L15 Smart Codec
+
 config MFD_CS47L35
 	bool "Cirrus Logic CS47L35"
 	select PINCTRL_CS47L35
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 643d65bcb6ea..cc044f38af84 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -75,6 +75,9 @@ obj-$(CONFIG_MFD_WM8994)	+= wm8994.o
 obj-$(CONFIG_MFD_WM97xx)	+= wm97xx-core.o
 
 madera-objs			:= madera-core.o
+ifeq ($(CONFIG_MFD_CS47L15),y)
+madera-objs			+= cs47l15-tables.o
+endif
 ifeq ($(CONFIG_MFD_CS47L35),y)
 madera-objs			+= cs47l35-tables.o
 endif
diff --git a/drivers/mfd/cs47l15-tables.c b/drivers/mfd/cs47l15-tables.c
new file mode 100644
index 000000000000..1b4f6f79eac2
--- /dev/null
+++ b/drivers/mfd/cs47l15-tables.c
@@ -0,0 +1,1301 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Regmap tables for CS47L15 codec
+ *
+ * Copyright (C) 2016-2019 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+
+#include <linux/mfd/madera/core.h>
+#include <linux/mfd/madera/registers.h>
+
+#include "madera.h"
+
+static const struct reg_sequence cs47l15_reva_16_patch[] = {
+	{ 0x8C, 0x5555 },
+	{ 0x8C, 0xAAAA },
+	{ 0x314, 0x0080 },
+	{ 0x4A8, 0x6023 },
+	{ 0x4A9, 0x6023 },
+	{ 0x4D4, 0x0008 },
+	{ 0x4CF, 0x0F00 },
+	{ 0x4D7, 0x1B2B },
+	{ 0x8C, 0xCCCC },
+	{ 0x8C, 0x3333 },
+};
+
+int cs47l15_patch(struct madera *madera)
+{
+	int ret;
+
+	ret = regmap_register_patch(madera->regmap,
+				    cs47l15_reva_16_patch,
+				    ARRAY_SIZE(cs47l15_reva_16_patch));
+	if (ret < 0) {
+		dev_err(madera->dev,
+			"Error in applying 16-bit patch: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cs47l15_patch);
+
+static const struct reg_default cs47l15_reg_default[] = {
+	{ 0x00000020, 0x0000 }, /* R32 (0x20) - Tone Generator 1 */
+	{ 0x00000021, 0x1000 }, /* R33 (0x21) - Tone Generator 2 */
+	{ 0x00000022, 0x0000 }, /* R34 (0x22) - Tone Generator 3 */
+	{ 0x00000023, 0x1000 }, /* R35 (0x23) - Tone Generator 4 */
+	{ 0x00000024, 0x0000 }, /* R36 (0x24) - Tone Generator 5 */
+	{ 0x00000030, 0x0000 }, /* R48 (0x30) - PWM Drive 1 */
+	{ 0x00000031, 0x0100 }, /* R49 (0x31) - PWM Drive 2 */
+	{ 0x00000032, 0x0100 }, /* R50 (0x32) - PWM Drive 3 */
+	{ 0x00000061, 0x01ff }, /* R97 (0x61) - Sample Rate Sequence Select 1 */
+	{ 0x00000062, 0x01ff }, /* R98 (0x62) - Sample Rate Sequence Select 2 */
+	{ 0x00000063, 0x01ff }, /* R99 (0x63) - Sample Rate Sequence Select 3 */
+	{ 0x00000064, 0x01ff }, /* R100 (0x64) - Sample Rate Sequence Select 4 */
+	{ 0x00000066, 0x01ff }, /* R102 (0x66) - Always On Triggers Sequence Select 1 */
+	{ 0x00000067, 0x01ff }, /* R103 (0x67) - Always On Triggers Sequence Select 2 */
+	{ 0x00000090, 0x0000 }, /* R144 (0x90) - Haptics Control 1 */
+	{ 0x00000091, 0x7fff }, /* R145 (0x91) - Haptics Control 2 */
+	{ 0x00000092, 0x0000 }, /* R146 (0x92) - Haptics Phase 1 Intensity */
+	{ 0x00000093, 0x0000 }, /* R147 (0x93) - Haptics Phase 1 Duration */
+	{ 0x00000094, 0x0000 }, /* R148 (0x94) - Haptics Phase 2 Intensity */
+	{ 0x00000095, 0x0000 }, /* R149 (0x95) - Haptics Phase 2 Duration */
+	{ 0x00000096, 0x0000 }, /* R150 (0x96) - Haptics Phase 3 Intensity */
+	{ 0x00000097, 0x0000 }, /* R151 (0x97) - Haptics Phase 3 Duration */
+	{ 0x000000a0, 0x0000 }, /* R160 (0xA0) - Comfort Noise Generator */
+	{ 0x00000100, 0x0002 }, /* R256 (0x100) - Clock 32K 1 */
+	{ 0x00000101, 0x0404 }, /* R257 (0x101) - System Clock 1 */
+	{ 0x00000102, 0x0011 }, /* R258 (0x102) - Sample Rate 1 */
+	{ 0x00000103, 0x0011 }, /* R259 (0x103) - Sample Rate 2 */
+	{ 0x00000104, 0x0011 }, /* R260 (0x104) - Sample Rate 3 */
+	{ 0x00000120, 0x0304 }, /* R288 (0x120) - DSP Clock 1 */
+	{ 0x00000122, 0x0000 }, /* R290 (0x122) - DSP Clock 2 */
+	{ 0x00000149, 0x0000 }, /* R329 (0x149) - Output System Clock */
+	{ 0x00000152, 0x0000 }, /* R338 (0x152) - Rate Estimator 1 */
+	{ 0x00000153, 0x0000 }, /* R339 (0x153) - Rate Estimator 2 */
+	{ 0x00000154, 0x0000 }, /* R340 (0x154) - Rate Estimator 3 */
+	{ 0x00000155, 0x0000 }, /* R341 (0x155) - Rate Estimator 4 */
+	{ 0x00000156, 0x0000 }, /* R342 (0x156) - Rate Estimator 5 */
+	{ 0x00000171, 0x0002 }, /* R369 (0x171) - FLL1 Control 1 */
+	{ 0x00000172, 0x0008 }, /* R370 (0x172) - FLL1 Control 2 */
+	{ 0x00000173, 0x0018 }, /* R371 (0x173) - FLL1 Control 3 */
+	{ 0x00000174, 0x007d }, /* R372 (0x174) - FLL1 Control 4 */
+	{ 0x00000175, 0x0000 }, /* R373 (0x175) - FLL1 Control 5 */
+	{ 0x00000176, 0x0000 }, /* R374 (0x176) - FLL1 Control 6 */
+	{ 0x00000177, 0x0281 }, /* R375 (0x177) - FLL1 Loop Filter Test 1 */
+	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 7 */
+	{ 0x0000017a, 0x2906 }, /* R378 (0x17A) - FLL1 EFS 2 */
+	{ 0x00000181, 0x0000 }, /* R385 (0x181) - FLL1 Synchroniser 1 */
+	{ 0x00000182, 0x0000 }, /* R386 (0x182) - FLL1 Synchroniser 2 */
+	{ 0x00000183, 0x0000 }, /* R387 (0x183) - FLL1 Synchroniser 3 */
+	{ 0x00000184, 0x0000 }, /* R388 (0x184) - FLL1 Synchroniser 4 */
+	{ 0x00000185, 0x0000 }, /* R389 (0x185) - FLL1 Synchroniser 5 */
+	{ 0x00000186, 0x0000 }, /* R390 (0x186) - FLL1 Synchroniser 6 */
+	{ 0x00000187, 0x0001 }, /* R391 (0x187) - FLL1 Synchroniser 7 */
+	{ 0x00000189, 0x0000 }, /* R393 (0x189) - FLL1 Spread Spectrum */
+	{ 0x0000018a, 0x0004 }, /* R394 (0x18A) - FLL1 GPIO Clock */
+	{ 0x000001d1, 0x0004 }, /* R465 (0x1D1) - FLL AO Control 1 */
+	{ 0x000001d2, 0x0004 }, /* R466 (0x1D2) - FLL AO Control 2 */
+	{ 0x000001d3, 0x0000 }, /* R467 (0x1D3) - FLL AO Control 3 */
+	{ 0x000001d4, 0x0000 }, /* R468 (0x1D4) - FLL AO Control 4 */
+	{ 0x000001d5, 0x0001 }, /* R469 (0x1D5) - FLL AO Control 5 */
+	{ 0x000001d6, 0x8004 }, /* R470 (0x1D6) - FLL AO Control 6 */
+	{ 0x000001d8, 0x0000 }, /* R472 (0x1D8) - FLL AO Control 7 */
+	{ 0x000001da, 0x0077 }, /* R474 (0x1DA) - FLL AO Control 8 */
+	{ 0x000001db, 0x0000 }, /* R475 (0x1DB) - FLL AO Control 9 */
+	{ 0x000001dc, 0x06da }, /* R476 (0x1DC) - FLL AO Control 10 */
+	{ 0x000001dd, 0x0011 }, /* R477 (0x1DD) - FLL AO Control 11 */
+	{ 0x00000218, 0x00e6 }, /* R536 (0x218) - Mic Bias Ctrl 1 */
+	{ 0x0000021c, 0x0222 }, /* R540 (0x21C) - Mic Bias Ctrl 5 */
+	{ 0x00000299, 0x0000 }, /* R665 (0x299) - Headphone Detect 0 */
+	{ 0x0000029b, 0x0000 }, /* R667 (0x29B) - Headphone Detect 1 */
+	{ 0x000002a2, 0x0010 }, /* R674 (0x2A2) - Mic Detect 1 Control 0 */
+	{ 0x000002a3, 0x1102 }, /* R675 (0x2A3) - Mic Detect 1 Control 1 */
+	{ 0x000002a4, 0x009f }, /* R676 (0x2A4) - Mic Detect 1 Control 2 */
+	{ 0x000002a6, 0x3d3d }, /* R678 (0x2A6) - Mic Detect 1 Level 1 */
+	{ 0x000002a7, 0x3d3d }, /* R679 (0x2A7) - Mic Detect 1 Level 2 */
+	{ 0x000002a8, 0x333d }, /* R680 (0x2A8) - Mic Detect 1 Level 3 */
+	{ 0x000002a9, 0x202d }, /* R681 (0x2A9) - Mic Detect 1 Level 4 */
+	{ 0x000002c6, 0x0010 }, /* R710 (0x2C6) - Micd Clamp Control */
+	{ 0x000002c8, 0x0000 }, /* R712 (0x2C8) - GP Switch 1 */
+	{ 0x000002d3, 0x0000 }, /* R723 (0x2D3) - Jack Detect Analogue */
+	{ 0x00000300, 0x0000 }, /* R768 (0x300) - Input Enables */
+	{ 0x00000308, 0x0000 }, /* R776 (0x308) - Input Rate */
+	{ 0x00000309, 0x0022 }, /* R777 (0x309) - Input Volume Ramp */
+	{ 0x0000030c, 0x0002 }, /* R780 (0x30C) - HPF Control */
+	{ 0x00000310, 0x0080 }, /* R784 (0x310) - IN1L Control */
+	{ 0x00000311, 0x0180 }, /* R785 (0x311) - ADC Digital Volume 1L */
+	{ 0x00000312, 0x0500 }, /* R786 (0x312) - DMIC1L Control */
+	{ 0x00000313, 0x0000 }, /* R787 (0x313) - IN1L Rate Control */
+	{ 0x00000314, 0x0080 }, /* R788 (0x314) - IN1R Control */
+	{ 0x00000315, 0x0180 }, /* R789 (0x315) - ADC Digital Volume 1R */
+	{ 0x00000316, 0x0000 }, /* R790 (0x316) - DMIC1R Control */
+	{ 0x00000317, 0x0000 }, /* R791 (0x317) - IN1R Rate Control */
+	{ 0x00000318, 0x0000 }, /* R792 (0x318) - IN2L Control */
+	{ 0x00000319, 0x0180 }, /* R793 (0x319) - ADC Digital Volume 2L */
+	{ 0x0000031a, 0x0500 }, /* R794 (0x31A) - DMIC2L Control */
+	{ 0x0000031b, 0x0000 }, /* R795 (0x31B) - IN2L Rate Control */
+	{ 0x0000031c, 0x0800 }, /* R796 (0x31C) - IN2R Control */
+	{ 0x0000031d, 0x0180 }, /* R797 (0x31D) - ADC Digital Volume 2R */
+	{ 0x0000031e, 0x0000 }, /* R798 (0x31E) - DMIC2R Control */
+	{ 0x0000031f, 0x0000 }, /* R799 (0x31F) - IN2R Rate Control */
+	{ 0x000003a8, 0x2000 }, /* R936 (0x3A8) - CS47L15 ADC Int Bias */
+	{ 0x000003c4, 0x0000 }, /* R964 (0x3C4) - CS47L15 PGA Bias Sel */
+	{ 0x00000400, 0x0000 }, /* R1024 (0x400) - Output Enables 1 */
+	{ 0x00000408, 0x0000 }, /* R1032 (0x408) - Output Rate 1 */
+	{ 0x00000409, 0x0022 }, /* R1033 (0x409) - Output Volume Ramp */
+	{ 0x00000410, 0x0080 }, /* R1040 (0x410) - Output Path Config 1L */
+	{ 0x00000411, 0x0180 }, /* R1041 (0x411) - DAC Digital Volume 1L */
+	{ 0x00000412, 0x0000 }, /* R1042 (0x412) - Output Path Config 1 */
+	{ 0x00000413, 0x0001 }, /* R1043 (0x413) - Noise Gate Select 1L */
+	{ 0x00000414, 0x0080 }, /* R1044 (0x414) - Output Path Config 1R */
+	{ 0x00000415, 0x0180 }, /* R1045 (0x415) - DAC Digital Volume 1R */
+	{ 0x00000417, 0x0002 }, /* R1047 (0x417) - Noise Gate Select 1R */
+	{ 0x0000041a, 0x0600 }, /* R1050 (0x41A) - Output Path Config 2 */
+	{ 0x00000428, 0x0000 }, /* R1064 (0x428) - Output Path Config 4L */
+	{ 0x00000429, 0x0180 }, /* R1065 (0x429) - DAC Digital Volume 4L */
+	{ 0x0000042b, 0x0040 }, /* R1067 (0x42B) - Noise Gate Select 4L */
+	{ 0x00000430, 0x0000 }, /* R1072 (0x430) - Output Path Config 5L */
+	{ 0x00000431, 0x0180 }, /* R1073 (0x431) - DAC Digital Volume 5L */
+	{ 0x00000433, 0x0100 }, /* R1075 (0x433) - Noise Gate Select 5L */
+	{ 0x00000434, 0x0000 }, /* R1076 (0x434) - Output Path Config 5R */
+	{ 0x00000435, 0x0180 }, /* R1077 (0x435) - DAC Digital Volume 5R */
+	{ 0x00000437, 0x0200 }, /* R1079 (0x437) - Noise Gate Select 5R */
+	{ 0x00000450, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 1 */
+	{ 0x00000451, 0x0000 }, /* R1105 (0x451) - DAC AEC Control 2 */
+	{ 0x00000458, 0x0000 }, /* R1112 (0x458) - Noise Gate Control */
+	{ 0x00000490, 0x0069 }, /* R1168 (0x490) - PDM SPK1 Ctrl 1 */
+	{ 0x00000491, 0x0000 }, /* R1169 (0x491) - PDM SPK1 Ctrl 2 */
+	{ 0x000004a0, 0x3080 }, /* R1184 (0x4A0) - HP1 Short Circuit Ctrl */
+	{ 0x000004a8, 0x6023 }, /* R1192 (0x4A8) - HP Test Ctrl 5 */
+	{ 0x000004a9, 0x6023 }, /* R1193 (0x4A9) - HP Test Ctrl 6 */
+	{ 0x00000500, 0x000c }, /* R1280 (0x500) - AIF1 BCLK Ctrl */
+	{ 0x00000501, 0x0000 }, /* R1281 (0x501) - AIF1 Tx Pin Ctrl */
+	{ 0x00000502, 0x0000 }, /* R1282 (0x502) - AIF1 Rx Pin Ctrl */
+	{ 0x00000503, 0x0000 }, /* R1283 (0x503) - AIF1 Rate Ctrl */
+	{ 0x00000504, 0x0000 }, /* R1284 (0x504) - AIF1 Format */
+	{ 0x00000506, 0x0040 }, /* R1286 (0x506) - AIF1 Rx BCLK Rate */
+	{ 0x00000507, 0x1818 }, /* R1287 (0x507) - AIF1 Frame Ctrl 1 */
+	{ 0x00000508, 0x1818 }, /* R1288 (0x508) - AIF1 Frame Ctrl 2 */
+	{ 0x00000509, 0x0000 }, /* R1289 (0x509) - AIF1 Frame Ctrl 3 */
+	{ 0x0000050a, 0x0001 }, /* R1290 (0x50A) - AIF1 Frame Ctrl 4 */
+	{ 0x0000050b, 0x0002 }, /* R1291 (0x50B) - AIF1 Frame Ctrl 5 */
+	{ 0x0000050c, 0x0003 }, /* R1292 (0x50C) - AIF1 Frame Ctrl 6 */
+	{ 0x0000050d, 0x0004 }, /* R1293 (0x50D) - AIF1 Frame Ctrl 7 */
+	{ 0x0000050e, 0x0005 }, /* R1294 (0x50E) - AIF1 Frame Ctrl 8 */
+	{ 0x00000511, 0x0000 }, /* R1297 (0x511) - AIF1 Frame Ctrl 11 */
+	{ 0x00000512, 0x0001 }, /* R1298 (0x512) - AIF1 Frame Ctrl 12 */
+	{ 0x00000513, 0x0002 }, /* R1299 (0x513) - AIF1 Frame Ctrl 13 */
+	{ 0x00000514, 0x0003 }, /* R1300 (0x514) - AIF1 Frame Ctrl 14 */
+	{ 0x00000515, 0x0004 }, /* R1301 (0x515) - AIF1 Frame Ctrl 15 */
+	{ 0x00000516, 0x0005 }, /* R1302 (0x516) - AIF1 Frame Ctrl 16 */
+	{ 0x00000519, 0x0000 }, /* R1305 (0x519) - AIF1 Tx Enables */
+	{ 0x0000051a, 0x0000 }, /* R1306 (0x51A) - AIF1 Rx Enables */
+	{ 0x00000540, 0x000c }, /* R1344 (0x540) - AIF2 BCLK Ctrl */
+	{ 0x00000541, 0x0000 }, /* R1345 (0x541) - AIF2 Tx Pin Ctrl */
+	{ 0x00000542, 0x0000 }, /* R1346 (0x542) - AIF2 Rx Pin Ctrl */
+	{ 0x00000543, 0x0000 }, /* R1347 (0x543) - AIF2 Rate Ctrl */
+	{ 0x00000544, 0x0000 }, /* R1348 (0x544) - AIF2 Format */
+	{ 0x00000546, 0x0040 }, /* R1350 (0x546) - AIF2 Rx BCLK Rate */
+	{ 0x00000547, 0x1818 }, /* R1351 (0x547) - AIF2 Frame Ctrl 1 */
+	{ 0x00000548, 0x1818 }, /* R1352 (0x548) - AIF2 Frame Ctrl 2 */
+	{ 0x00000549, 0x0000 }, /* R1353 (0x549) - AIF2 Frame Ctrl 3 */
+	{ 0x0000054a, 0x0001 }, /* R1354 (0x54A) - AIF2 Frame Ctrl 4 */
+	{ 0x0000054b, 0x0002 }, /* R1355 (0x54B) - AIF2 Frame Ctrl 5 */
+	{ 0x0000054c, 0x0003 }, /* R1356 (0x54C) - AIF2 Frame Ctrl 6 */
+	{ 0x00000551, 0x0000 }, /* R1361 (0x551) - AIF2 Frame Ctrl 11 */
+	{ 0x00000552, 0x0001 }, /* R1362 (0x552) - AIF2 Frame Ctrl 12 */
+	{ 0x00000553, 0x0002 }, /* R1363 (0x553) - AIF2 Frame Ctrl 13 */
+	{ 0x00000554, 0x0003 }, /* R1364 (0x554) - AIF2 Frame Ctrl 14 */
+	{ 0x00000559, 0x0000 }, /* R1369 (0x559) - AIF2 Tx Enables */
+	{ 0x0000055a, 0x0000 }, /* R1370 (0x55A) - AIF2 Rx Enables */
+	{ 0x00000580, 0x000c }, /* R1408 (0x580) - AIF3 BCLK Ctrl */
+	{ 0x00000581, 0x0000 }, /* R1409 (0x581) - AIF3 Tx Pin Ctrl */
+	{ 0x00000582, 0x0000 }, /* R1410 (0x582) - AIF3 Rx Pin Ctrl */
+	{ 0x00000583, 0x0000 }, /* R1411 (0x583) - AIF3 Rate Ctrl */
+	{ 0x00000584, 0x0000 }, /* R1412 (0x584) - AIF3 Format */
+	{ 0x00000586, 0x0040 }, /* R1414 (0x586) - AIF3 Rx BCLK Rate */
+	{ 0x00000587, 0x1818 }, /* R1415 (0x587) - AIF3 Frame Ctrl 1 */
+	{ 0x00000588, 0x1818 }, /* R1416 (0x588) - AIF3 Frame Ctrl 2 */
+	{ 0x00000589, 0x0000 }, /* R1417 (0x589) - AIF3 Frame Ctrl 3 */
+	{ 0x0000058a, 0x0001 }, /* R1418 (0x58A) - AIF3 Frame Ctrl 4 */
+	{ 0x00000591, 0x0000 }, /* R1425 (0x591) - AIF3 Frame Ctrl 11 */
+	{ 0x00000592, 0x0001 }, /* R1426 (0x592) - AIF3 Frame Ctrl 12 */
+	{ 0x00000599, 0x0000 }, /* R1433 (0x599) - AIF3 Tx Enables */
+	{ 0x0000059a, 0x0000 }, /* R1434 (0x59A) - AIF3 Rx Enables */
+	{ 0x000005c2, 0x0000 }, /* R1474 (0x5C2) - SPD1 Tx Control */
+	{ 0x00000640, 0x0000 }, /* R1600 (0x640) - PWM1MIX Input 1 Source */
+	{ 0x00000641, 0x0080 }, /* R1601 (0x641) - PWM1MIX Input 1 Volume */
+	{ 0x00000642, 0x0000 }, /* R1602 (0x642) - PWM1MIX Input 2 Source */
+	{ 0x00000643, 0x0080 }, /* R1603 (0x643) - PWM1MIX Input 2 Volume */
+	{ 0x00000644, 0x0000 }, /* R1604 (0x644) - PWM1MIX Input 3 Source */
+	{ 0x00000645, 0x0080 }, /* R1605 (0x645) - PWM1MIX Input 3 Volume */
+	{ 0x00000646, 0x0000 }, /* R1606 (0x646) - PWM1MIX Input 4 Source */
+	{ 0x00000647, 0x0080 }, /* R1607 (0x647) - PWM1MIX Input 4 Volume */
+	{ 0x00000648, 0x0000 }, /* R1608 (0x648) - PWM2MIX Input 1 Source */
+	{ 0x00000649, 0x0080 }, /* R1609 (0x649) - PWM2MIX Input 1 Volume */
+	{ 0x0000064a, 0x0000 }, /* R1610 (0x64A) - PWM2MIX Input 2 Source */
+	{ 0x0000064b, 0x0080 }, /* R1611 (0x64B) - PWM2MIX Input 2 Volume */
+	{ 0x0000064c, 0x0000 }, /* R1612 (0x64C) - PWM2MIX Input 3 Source */
+	{ 0x0000064d, 0x0080 }, /* R1613 (0x64D) - PWM2MIX Input 3 Volume */
+	{ 0x0000064e, 0x0000 }, /* R1614 (0x64E) - PWM2MIX Input 4 Source */
+	{ 0x0000064f, 0x0080 }, /* R1615 (0x64F) - PWM2MIX Input 4 Volume */
+	{ 0x00000680, 0x0000 }, /* R1664 (0x680) - OUT1LMIX Input 1 Source */
+	{ 0x00000681, 0x0080 }, /* R1665 (0x681) - OUT1LMIX Input 1 Volume */
+	{ 0x00000682, 0x0000 }, /* R1666 (0x682) - OUT1LMIX Input 2 Source */
+	{ 0x00000683, 0x0080 }, /* R1667 (0x683) - OUT1LMIX Input 2 Volume */
+	{ 0x00000684, 0x0000 }, /* R1668 (0x684) - OUT1LMIX Input 3 Source */
+	{ 0x00000685, 0x0080 }, /* R1669 (0x685) - OUT1LMIX Input 3 Volume */
+	{ 0x00000686, 0x0000 }, /* R1670 (0x686) - OUT1LMIX Input 4 Source */
+	{ 0x00000687, 0x0080 }, /* R1671 (0x687) - OUT1LMIX Input 4 Volume */
+	{ 0x00000688, 0x0000 }, /* R1672 (0x688) - OUT1RMIX Input 1 Source */
+	{ 0x00000689, 0x0080 }, /* R1673 (0x689) - OUT1RMIX Input 1 Volume */
+	{ 0x0000068a, 0x0000 }, /* R1674 (0x68A) - OUT1RMIX Input 2 Source */
+	{ 0x0000068b, 0x0080 }, /* R1675 (0x68B) - OUT1RMIX Input 2 Volume */
+	{ 0x0000068c, 0x0000 }, /* R1676 (0x68C) - OUT1RMIX Input 3 Source */
+	{ 0x0000068d, 0x0080 }, /* R1677 (0x68D) - OUT1RMIX Input 3 Volume */
+	{ 0x0000068e, 0x0000 }, /* R1678 (0x68E) - OUT1RMIX Input 4 Source */
+	{ 0x0000068f, 0x0080 }, /* R1679 (0x68F) - OUT1RMIX Input 4 Volume */
+	{ 0x000006b0, 0x0000 }, /* R1712 (0x6B0) - OUT4LMIX Input 1 Source */
+	{ 0x000006b1, 0x0080 }, /* R1713 (0x6B1) - OUT4LMIX Input 1 Volume */
+	{ 0x000006b2, 0x0000 }, /* R1714 (0x6B2) - OUT4LMIX Input 2 Source */
+	{ 0x000006b3, 0x0080 }, /* R1715 (0x6B3) - OUT4LMIX Input 2 Volume */
+	{ 0x000006b4, 0x0000 }, /* R1716 (0x6B4) - OUT4LMIX Input 3 Source */
+	{ 0x000006b5, 0x0080 }, /* R1717 (0x6B5) - OUT4LMIX Input 3 Volume */
+	{ 0x000006b6, 0x0000 }, /* R1718 (0x6B6) - OUT4LMIX Input 4 Source */
+	{ 0x000006b7, 0x0080 }, /* R1719 (0x6B7) - OUT4LMIX Input 4 Volume */
+	{ 0x000006c0, 0x0000 }, /* R1728 (0x6C0) - OUT5LMIX Input 1 Source */
+	{ 0x000006c1, 0x0080 }, /* R1729 (0x6C1) - OUT5LMIX Input 1 Volume */
+	{ 0x000006c2, 0x0000 }, /* R1730 (0x6C2) - OUT5LMIX Input 2 Source */
+	{ 0x000006c3, 0x0080 }, /* R1731 (0x6C3) - OUT5LMIX Input 2 Volume */
+	{ 0x000006c4, 0x0000 }, /* R1732 (0x6C4) - OUT5LMIX Input 3 Source */
+	{ 0x000006c5, 0x0080 }, /* R1733 (0x6C5) - OUT5LMIX Input 3 Volume */
+	{ 0x000006c6, 0x0000 }, /* R1734 (0x6C6) - OUT5LMIX Input 4 Source */
+	{ 0x000006c7, 0x0080 }, /* R1735 (0x6C7) - OUT5LMIX Input 4 Volume */
+	{ 0x000006c8, 0x0000 }, /* R1736 (0x6C8) - OUT5RMIX Input 1 Source */
+	{ 0x000006c9, 0x0080 }, /* R1737 (0x6C9) - OUT5RMIX Input 1 Volume */
+	{ 0x000006ca, 0x0000 }, /* R1738 (0x6CA) - OUT5RMIX Input 2 Source */
+	{ 0x000006cb, 0x0080 }, /* R1739 (0x6CB) - OUT5RMIX Input 2 Volume */
+	{ 0x000006cc, 0x0000 }, /* R1740 (0x6CC) - OUT5RMIX Input 3 Source */
+	{ 0x000006cd, 0x0080 }, /* R1741 (0x6CD) - OUT5RMIX Input 3 Volume */
+	{ 0x000006ce, 0x0000 }, /* R1742 (0x6CE) - OUT5RMIX Input 4 Source */
+	{ 0x000006cf, 0x0080 }, /* R1743 (0x6CF) - OUT5RMIX Input 4 Volume */
+	{ 0x00000700, 0x0000 }, /* R1792 (0x700) - AIF1TX1MIX Input 1 Source */
+	{ 0x00000701, 0x0080 }, /* R1793 (0x701) - AIF1TX1MIX Input 1 Volume */
+	{ 0x00000702, 0x0000 }, /* R1794 (0x702) - AIF1TX1MIX Input 2 Source */
+	{ 0x00000703, 0x0080 }, /* R1795 (0x703) - AIF1TX1MIX Input 2 Volume */
+	{ 0x00000704, 0x0000 }, /* R1796 (0x704) - AIF1TX1MIX Input 3 Source */
+	{ 0x00000705, 0x0080 }, /* R1797 (0x705) - AIF1TX1MIX Input 3 Volume */
+	{ 0x00000706, 0x0000 }, /* R1798 (0x706) - AIF1TX1MIX Input 4 Source */
+	{ 0x00000707, 0x0080 }, /* R1799 (0x707) - AIF1TX1MIX Input 4 Volume */
+	{ 0x00000708, 0x0000 }, /* R1800 (0x708) - AIF1TX2MIX Input 1 Source */
+	{ 0x00000709, 0x0080 }, /* R1801 (0x709) - AIF1TX2MIX Input 1 Volume */
+	{ 0x0000070a, 0x0000 }, /* R1802 (0x70A) - AIF1TX2MIX Input 2 Source */
+	{ 0x0000070b, 0x0080 }, /* R1803 (0x70B) - AIF1TX2MIX Input 2 Volume */
+	{ 0x0000070c, 0x0000 }, /* R1804 (0x70C) - AIF1TX2MIX Input 3 Source */
+	{ 0x0000070d, 0x0080 }, /* R1805 (0x70D) - AIF1TX2MIX Input 3 Volume */
+	{ 0x0000070e, 0x0000 }, /* R1806 (0x70E) - AIF1TX2MIX Input 4 Source */
+	{ 0x0000070f, 0x0080 }, /* R1807 (0x70F) - AIF1TX2MIX Input 4 Volume */
+	{ 0x00000710, 0x0000 }, /* R1808 (0x710) - AIF1TX3MIX Input 1 Source */
+	{ 0x00000711, 0x0080 }, /* R1809 (0x711) - AIF1TX3MIX Input 1 Volume */
+	{ 0x00000712, 0x0000 }, /* R1810 (0x712) - AIF1TX3MIX Input 2 Source */
+	{ 0x00000713, 0x0080 }, /* R1811 (0x713) - AIF1TX3MIX Input 2 Volume */
+	{ 0x00000714, 0x0000 }, /* R1812 (0x714) - AIF1TX3MIX Input 3 Source */
+	{ 0x00000715, 0x0080 }, /* R1813 (0x715) - AIF1TX3MIX Input 3 Volume */
+	{ 0x00000716, 0x0000 }, /* R1814 (0x716) - AIF1TX3MIX Input 4 Source */
+	{ 0x00000717, 0x0080 }, /* R1815 (0x717) - AIF1TX3MIX Input 4 Volume */
+	{ 0x00000718, 0x0000 }, /* R1816 (0x718) - AIF1TX4MIX Input 1 Source */
+	{ 0x00000719, 0x0080 }, /* R1817 (0x719) - AIF1TX4MIX Input 1 Volume */
+	{ 0x0000071a, 0x0000 }, /* R1818 (0x71A) - AIF1TX4MIX Input 2 Source */
+	{ 0x0000071b, 0x0080 }, /* R1819 (0x71B) - AIF1TX4MIX Input 2 Volume */
+	{ 0x0000071c, 0x0000 }, /* R1820 (0x71C) - AIF1TX4MIX Input 3 Source */
+	{ 0x0000071d, 0x0080 }, /* R1821 (0x71D) - AIF1TX4MIX Input 3 Volume */
+	{ 0x0000071e, 0x0000 }, /* R1822 (0x71E) - AIF1TX4MIX Input 4 Source */
+	{ 0x0000071f, 0x0080 }, /* R1823 (0x71F) - AIF1TX4MIX Input 4 Volume */
+	{ 0x00000720, 0x0000 }, /* R1824 (0x720) - AIF1TX5MIX Input 1 Source */
+	{ 0x00000721, 0x0080 }, /* R1825 (0x721) - AIF1TX5MIX Input 1 Volume */
+	{ 0x00000722, 0x0000 }, /* R1826 (0x722) - AIF1TX5MIX Input 2 Source */
+	{ 0x00000723, 0x0080 }, /* R1827 (0x723) - AIF1TX5MIX Input 2 Volume */
+	{ 0x00000724, 0x0000 }, /* R1828 (0x724) - AIF1TX5MIX Input 3 Source */
+	{ 0x00000725, 0x0080 }, /* R1829 (0x725) - AIF1TX5MIX Input 3 Volume */
+	{ 0x00000726, 0x0000 }, /* R1830 (0x726) - AIF1TX5MIX Input 4 Source */
+	{ 0x00000727, 0x0080 }, /* R1831 (0x727) - AIF1TX5MIX Input 4 Volume */
+	{ 0x00000728, 0x0000 }, /* R1832 (0x728) - AIF1TX6MIX Input 1 Source */
+	{ 0x00000729, 0x0080 }, /* R1833 (0x729) - AIF1TX6MIX Input 1 Volume */
+	{ 0x0000072a, 0x0000 }, /* R1834 (0x72A) - AIF1TX6MIX Input 2 Source */
+	{ 0x0000072b, 0x0080 }, /* R1835 (0x72B) - AIF1TX6MIX Input 2 Volume */
+	{ 0x0000072c, 0x0000 }, /* R1836 (0x72C) - AIF1TX6MIX Input 3 Source */
+	{ 0x0000072d, 0x0080 }, /* R1837 (0x72D) - AIF1TX6MIX Input 3 Volume */
+	{ 0x0000072e, 0x0000 }, /* R1838 (0x72E) - AIF1TX6MIX Input 4 Source */
+	{ 0x0000072f, 0x0080 }, /* R1839 (0x72F) - AIF1TX6MIX Input 4 Volume */
+	{ 0x00000740, 0x0000 }, /* R1856 (0x740) - AIF2TX1MIX Input 1 Source */
+	{ 0x00000741, 0x0080 }, /* R1857 (0x741) - AIF2TX1MIX Input 1 Volume */
+	{ 0x00000742, 0x0000 }, /* R1858 (0x742) - AIF2TX1MIX Input 2 Source */
+	{ 0x00000743, 0x0080 }, /* R1859 (0x743) - AIF2TX1MIX Input 2 Volume */
+	{ 0x00000744, 0x0000 }, /* R1860 (0x744) - AIF2TX1MIX Input 3 Source */
+	{ 0x00000745, 0x0080 }, /* R1861 (0x745) - AIF2TX1MIX Input 3 Volume */
+	{ 0x00000746, 0x0000 }, /* R1862 (0x746) - AIF2TX1MIX Input 4 Source */
+	{ 0x00000747, 0x0080 }, /* R1863 (0x747) - AIF2TX1MIX Input 4 Volume */
+	{ 0x00000748, 0x0000 }, /* R1864 (0x748) - AIF2TX2MIX Input 1 Source */
+	{ 0x00000749, 0x0080 }, /* R1865 (0x749) - AIF2TX2MIX Input 1 Volume */
+	{ 0x0000074a, 0x0000 }, /* R1866 (0x74A) - AIF2TX2MIX Input 2 Source */
+	{ 0x0000074b, 0x0080 }, /* R1867 (0x74B) - AIF2TX2MIX Input 2 Volume */
+	{ 0x0000074c, 0x0000 }, /* R1868 (0x74C) - AIF2TX2MIX Input 3 Source */
+	{ 0x0000074d, 0x0080 }, /* R1869 (0x74D) - AIF2TX2MIX Input 3 Volume */
+	{ 0x0000074e, 0x0000 }, /* R1870 (0x74E) - AIF2TX2MIX Input 4 Source */
+	{ 0x0000074f, 0x0080 }, /* R1871 (0x74F) - AIF2TX2MIX Input 4 Volume */
+	{ 0x00000750, 0x0000 }, /* R1872 (0x750) - AIF2TX3MIX Input 1 Source */
+	{ 0x00000751, 0x0080 }, /* R1873 (0x751) - AIF2TX3MIX Input 1 Volume */
+	{ 0x00000752, 0x0000 }, /* R1874 (0x752) - AIF2TX3MIX Input 2 Source */
+	{ 0x00000753, 0x0080 }, /* R1875 (0x753) - AIF2TX3MIX Input 2 Volume */
+	{ 0x00000754, 0x0000 }, /* R1876 (0x754) - AIF2TX3MIX Input 3 Source */
+	{ 0x00000755, 0x0080 }, /* R1877 (0x755) - AIF2TX3MIX Input 3 Volume */
+	{ 0x00000756, 0x0000 }, /* R1878 (0x756) - AIF2TX3MIX Input 4 Source */
+	{ 0x00000757, 0x0080 }, /* R1879 (0x757) - AIF2TX3MIX Input 4 Volume */
+	{ 0x00000758, 0x0000 }, /* R1880 (0x758) - AIF2TX4MIX Input 1 Source */
+	{ 0x00000759, 0x0080 }, /* R1881 (0x759) - AIF2TX4MIX Input 1 Volume */
+	{ 0x0000075a, 0x0000 }, /* R1882 (0x75A) - AIF2TX4MIX Input 2 Source */
+	{ 0x0000075b, 0x0080 }, /* R1883 (0x75B) - AIF2TX4MIX Input 2 Volume */
+	{ 0x0000075c, 0x0000 }, /* R1884 (0x75C) - AIF2TX4MIX Input 3 Source */
+	{ 0x0000075d, 0x0080 }, /* R1885 (0x75D) - AIF2TX4MIX Input 3 Volume */
+	{ 0x0000075e, 0x0000 }, /* R1886 (0x75E) - AIF2TX4MIX Input 4 Source */
+	{ 0x0000075f, 0x0080 }, /* R1887 (0x75F) - AIF2TX4MIX Input 4 Volume */
+	{ 0x00000780, 0x0000 }, /* R1920 (0x780) - AIF3TX1MIX Input 1 Source */
+	{ 0x00000781, 0x0080 }, /* R1921 (0x781) - AIF3TX1MIX Input 1 Volume */
+	{ 0x00000782, 0x0000 }, /* R1922 (0x782) - AIF3TX1MIX Input 2 Source */
+	{ 0x00000783, 0x0080 }, /* R1923 (0x783) - AIF3TX1MIX Input 2 Volume */
+	{ 0x00000784, 0x0000 }, /* R1924 (0x784) - AIF3TX1MIX Input 3 Source */
+	{ 0x00000785, 0x0080 }, /* R1925 (0x785) - AIF3TX1MIX Input 3 Volume */
+	{ 0x00000786, 0x0000 }, /* R1926 (0x786) - AIF3TX1MIX Input 4 Source */
+	{ 0x00000787, 0x0080 }, /* R1927 (0x787) - AIF3TX1MIX Input 4 Volume */
+	{ 0x00000788, 0x0000 }, /* R1928 (0x788) - AIF3TX2MIX Input 1 Source */
+	{ 0x00000789, 0x0080 }, /* R1929 (0x789) - AIF3TX2MIX Input 1 Volume */
+	{ 0x0000078a, 0x0000 }, /* R1930 (0x78A) - AIF3TX2MIX Input 2 Source */
+	{ 0x0000078b, 0x0080 }, /* R1931 (0x78B) - AIF3TX2MIX Input 2 Volume */
+	{ 0x0000078c, 0x0000 }, /* R1932 (0x78C) - AIF3TX2MIX Input 3 Source */
+	{ 0x0000078d, 0x0080 }, /* R1933 (0x78D) - AIF3TX2MIX Input 3 Volume */
+	{ 0x0000078e, 0x0000 }, /* R1934 (0x78E) - AIF3TX2MIX Input 4 Source */
+	{ 0x0000078f, 0x0080 }, /* R1935 (0x78F) - AIF3TX2MIX Input 4 Volume */
+	{ 0x00000800, 0x0000 }, /* R2048 (0x800) - SPDIF1TX1MIX Input 1 Source */
+	{ 0x00000801, 0x0080 }, /* R2049 (0x801) - SPDIF1TX1MIX Input 1 Volume */
+	{ 0x00000808, 0x0000 }, /* R2056 (0x808) - SPDIF1TX2MIX Input 1 Source */
+	{ 0x00000809, 0x0080 }, /* R2057 (0x809) - SPDIF1TX2MIX Input 1 Volume */
+	{ 0x00000880, 0x0000 }, /* R2176 (0x880) - EQ1MIX Input 1 Source */
+	{ 0x00000881, 0x0080 }, /* R2177 (0x881) - EQ1MIX Input 1 Volume */
+	{ 0x00000882, 0x0000 }, /* R2178 (0x882) - EQ1MIX Input 2 Source */
+	{ 0x00000883, 0x0080 }, /* R2179 (0x883) - EQ1MIX Input 2 Volume */
+	{ 0x00000884, 0x0000 }, /* R2180 (0x884) - EQ1MIX Input 3 Source */
+	{ 0x00000885, 0x0080 }, /* R2181 (0x885) - EQ1MIX Input 3 Volume */
+	{ 0x00000886, 0x0000 }, /* R2182 (0x886) - EQ1MIX Input 4 Source */
+	{ 0x00000887, 0x0080 }, /* R2183 (0x887) - EQ1MIX Input 4 Volume */
+	{ 0x00000888, 0x0000 }, /* R2184 (0x888) - EQ2MIX Input 1 Source */
+	{ 0x00000889, 0x0080 }, /* R2185 (0x889) - EQ2MIX Input 1 Volume */
+	{ 0x0000088a, 0x0000 }, /* R2186 (0x88A) - EQ2MIX Input 2 Source */
+	{ 0x0000088b, 0x0080 }, /* R2187 (0x88B) - EQ2MIX Input 2 Volume */
+	{ 0x0000088c, 0x0000 }, /* R2188 (0x88C) - EQ2MIX Input 3 Source */
+	{ 0x0000088d, 0x0080 }, /* R2189 (0x88D) - EQ2MIX Input 3 Volume */
+	{ 0x0000088e, 0x0000 }, /* R2190 (0x88E) - EQ2MIX Input 4 Source */
+	{ 0x0000088f, 0x0080 }, /* R2191 (0x88F) - EQ2MIX Input 4 Volume */
+	{ 0x00000890, 0x0000 }, /* R2192 (0x890) - EQ3MIX Input 1 Source */
+	{ 0x00000891, 0x0080 }, /* R2193 (0x891) - EQ3MIX Input 1 Volume */
+	{ 0x00000892, 0x0000 }, /* R2194 (0x892) - EQ3MIX Input 2 Source */
+	{ 0x00000893, 0x0080 }, /* R2195 (0x893) - EQ3MIX Input 2 Volume */
+	{ 0x00000894, 0x0000 }, /* R2196 (0x894) - EQ3MIX Input 3 Source */
+	{ 0x00000895, 0x0080 }, /* R2197 (0x895) - EQ3MIX Input 3 Volume */
+	{ 0x00000896, 0x0000 }, /* R2198 (0x896) - EQ3MIX Input 4 Source */
+	{ 0x00000897, 0x0080 }, /* R2199 (0x897) - EQ3MIX Input 4 Volume */
+	{ 0x00000898, 0x0000 }, /* R2200 (0x898) - EQ4MIX Input 1 Source */
+	{ 0x00000899, 0x0080 }, /* R2201 (0x899) - EQ4MIX Input 1 Volume */
+	{ 0x0000089a, 0x0000 }, /* R2202 (0x89A) - EQ4MIX Input 2 Source */
+	{ 0x0000089b, 0x0080 }, /* R2203 (0x89B) - EQ4MIX Input 2 Volume */
+	{ 0x0000089c, 0x0000 }, /* R2204 (0x89C) - EQ4MIX Input 3 Source */
+	{ 0x0000089d, 0x0080 }, /* R2205 (0x89D) - EQ4MIX Input 3 Volume */
+	{ 0x0000089e, 0x0000 }, /* R2206 (0x89E) - EQ4MIX Input 4 Source */
+	{ 0x0000089f, 0x0080 }, /* R2207 (0x89F) - EQ4MIX Input 4 Volume */
+	{ 0x000008c0, 0x0000 }, /* R2240 (0x8C0) - DRC1LMIX Input 1 Source */
+	{ 0x000008c1, 0x0080 }, /* R2241 (0x8C1) - DRC1LMIX Input 1 Volume */
+	{ 0x000008c2, 0x0000 }, /* R2242 (0x8C2) - DRC1LMIX Input 2 Source */
+	{ 0x000008c3, 0x0080 }, /* R2243 (0x8C3) - DRC1LMIX Input 2 Volume */
+	{ 0x000008c4, 0x0000 }, /* R2244 (0x8C4) - DRC1LMIX Input 3 Source */
+	{ 0x000008c5, 0x0080 }, /* R2245 (0x8C5) - DRC1LMIX Input 3 Volume */
+	{ 0x000008c6, 0x0000 }, /* R2246 (0x8C6) - DRC1LMIX Input 4 Source */
+	{ 0x000008c7, 0x0080 }, /* R2247 (0x8C7) - DRC1LMIX Input 4 Volume */
+	{ 0x000008c8, 0x0000 }, /* R2248 (0x8C8) - DRC1RMIX Input 1 Source */
+	{ 0x000008c9, 0x0080 }, /* R2249 (0x8C9) - DRC1RMIX Input 1 Volume */
+	{ 0x000008ca, 0x0000 }, /* R2250 (0x8CA) - DRC1RMIX Input 2 Source */
+	{ 0x000008cb, 0x0080 }, /* R2251 (0x8CB) - DRC1RMIX Input 2 Volume */
+	{ 0x000008cc, 0x0000 }, /* R2252 (0x8CC) - DRC1RMIX Input 3 Source */
+	{ 0x000008cd, 0x0080 }, /* R2253 (0x8CD) - DRC1RMIX Input 3 Volume */
+	{ 0x000008ce, 0x0000 }, /* R2254 (0x8CE) - DRC1RMIX Input 4 Source */
+	{ 0x000008cf, 0x0080 }, /* R2255 (0x8CF) - DRC1RMIX Input 4 Volume */
+	{ 0x000008d0, 0x0000 }, /* R2256 (0x8D0) - DRC2LMIX Input 1 Source */
+	{ 0x000008d1, 0x0080 }, /* R2257 (0x8D1) - DRC2LMIX Input 1 Volume */
+	{ 0x000008d2, 0x0000 }, /* R2258 (0x8D2) - DRC2LMIX Input 2 Source */
+	{ 0x000008d3, 0x0080 }, /* R2259 (0x8D3) - DRC2LMIX Input 2 Volume */
+	{ 0x000008d4, 0x0000 }, /* R2260 (0x8D4) - DRC2LMIX Input 3 Source */
+	{ 0x000008d5, 0x0080 }, /* R2261 (0x8D5) - DRC2LMIX Input 3 Volume */
+	{ 0x000008d6, 0x0000 }, /* R2262 (0x8D6) - DRC2LMIX Input 4 Source */
+	{ 0x000008d7, 0x0080 }, /* R2263 (0x8D7) - DRC2LMIX Input 4 Volume */
+	{ 0x000008d8, 0x0000 }, /* R2264 (0x8D8) - DRC2RMIX Input 1 Source */
+	{ 0x000008d9, 0x0080 }, /* R2265 (0x8D9) - DRC2RMIX Input 1 Volume */
+	{ 0x000008da, 0x0000 }, /* R2266 (0x8DA) - DRC2RMIX Input 2 Source */
+	{ 0x000008db, 0x0080 }, /* R2267 (0x8DB) - DRC2RMIX Input 2 Volume */
+	{ 0x000008dc, 0x0000 }, /* R2268 (0x8DC) - DRC2RMIX Input 3 Source */
+	{ 0x000008dd, 0x0080 }, /* R2269 (0x8DD) - DRC2RMIX Input 3 Volume */
+	{ 0x000008de, 0x0000 }, /* R2270 (0x8DE) - DRC2RMIX Input 4 Source */
+	{ 0x000008df, 0x0080 }, /* R2271 (0x8DF) - DRC2RMIX Input 4 Volume */
+	{ 0x00000900, 0x0000 }, /* R2304 (0x900) - HPLP1MIX Input 1 Source */
+	{ 0x00000901, 0x0080 }, /* R2305 (0x901) - HPLP1MIX Input 1 Volume */
+	{ 0x00000902, 0x0000 }, /* R2306 (0x902) - HPLP1MIX Input 2 Source */
+	{ 0x00000903, 0x0080 }, /* R2307 (0x903) - HPLP1MIX Input 2 Volume */
+	{ 0x00000904, 0x0000 }, /* R2308 (0x904) - HPLP1MIX Input 3 Source */
+	{ 0x00000905, 0x0080 }, /* R2309 (0x905) - HPLP1MIX Input 3 Volume */
+	{ 0x00000906, 0x0000 }, /* R2310 (0x906) - HPLP1MIX Input 4 Source */
+	{ 0x00000907, 0x0080 }, /* R2311 (0x907) - HPLP1MIX Input 4 Volume */
+	{ 0x00000908, 0x0000 }, /* R2312 (0x908) - HPLP2MIX Input 1 Source */
+	{ 0x00000909, 0x0080 }, /* R2313 (0x909) - HPLP2MIX Input 1 Volume */
+	{ 0x0000090a, 0x0000 }, /* R2314 (0x90A) - HPLP2MIX Input 2 Source */
+	{ 0x0000090b, 0x0080 }, /* R2315 (0x90B) - HPLP2MIX Input 2 Volume */
+	{ 0x0000090c, 0x0000 }, /* R2316 (0x90C) - HPLP2MIX Input 3 Source */
+	{ 0x0000090d, 0x0080 }, /* R2317 (0x90D) - HPLP2MIX Input 3 Volume */
+	{ 0x0000090e, 0x0000 }, /* R2318 (0x90E) - HPLP2MIX Input 4 Source */
+	{ 0x0000090f, 0x0080 }, /* R2319 (0x90F) - HPLP2MIX Input 4 Volume */
+	{ 0x00000910, 0x0000 }, /* R2320 (0x910) - HPLP3MIX Input 1 Source */
+	{ 0x00000911, 0x0080 }, /* R2321 (0x911) - HPLP3MIX Input 1 Volume */
+	{ 0x00000912, 0x0000 }, /* R2322 (0x912) - HPLP3MIX Input 2 Source */
+	{ 0x00000913, 0x0080 }, /* R2323 (0x913) - HPLP3MIX Input 2 Volume */
+	{ 0x00000914, 0x0000 }, /* R2324 (0x914) - HPLP3MIX Input 3 Source */
+	{ 0x00000915, 0x0080 }, /* R2325 (0x915) - HPLP3MIX Input 3 Volume */
+	{ 0x00000916, 0x0000 }, /* R2326 (0x916) - HPLP3MIX Input 4 Source */
+	{ 0x00000917, 0x0080 }, /* R2327 (0x917) - HPLP3MIX Input 4 Volume */
+	{ 0x00000918, 0x0000 }, /* R2328 (0x918) - HPLP4MIX Input 1 Source */
+	{ 0x00000919, 0x0080 }, /* R2329 (0x919) - HPLP4MIX Input 1 Volume */
+	{ 0x0000091a, 0x0000 }, /* R2330 (0x91A) - HPLP4MIX Input 2 Source */
+	{ 0x0000091b, 0x0080 }, /* R2331 (0x91B) - HPLP4MIX Input 2 Volume */
+	{ 0x0000091c, 0x0000 }, /* R2332 (0x91C) - HPLP4MIX Input 3 Source */
+	{ 0x0000091d, 0x0080 }, /* R2333 (0x91D) - HPLP4MIX Input 3 Volume */
+	{ 0x0000091e, 0x0000 }, /* R2334 (0x91E) - HPLP4MIX Input 4 Source */
+	{ 0x0000091f, 0x0080 }, /* R2335 (0x91F) - HPLP4MIX Input 4 Volume */
+	{ 0x00000940, 0x0000 }, /* R2368 (0x940) - DSP1LMIX Input 1 Source */
+	{ 0x00000941, 0x0080 }, /* R2369 (0x941) - DSP1LMIX Input 1 Volume */
+	{ 0x00000942, 0x0000 }, /* R2370 (0x942) - DSP1LMIX Input 2 Source */
+	{ 0x00000943, 0x0080 }, /* R2371 (0x943) - DSP1LMIX Input 2 Volume */
+	{ 0x00000944, 0x0000 }, /* R2372 (0x944) - DSP1LMIX Input 3 Source */
+	{ 0x00000945, 0x0080 }, /* R2373 (0x945) - DSP1LMIX Input 3 Volume */
+	{ 0x00000946, 0x0000 }, /* R2374 (0x946) - DSP1LMIX Input 4 Source */
+	{ 0x00000947, 0x0080 }, /* R2375 (0x947) - DSP1LMIX Input 4 Volume */
+	{ 0x00000948, 0x0000 }, /* R2376 (0x948) - DSP1RMIX Input 1 Source */
+	{ 0x00000949, 0x0080 }, /* R2377 (0x949) - DSP1RMIX Input 1 Volume */
+	{ 0x0000094a, 0x0000 }, /* R2378 (0x94A) - DSP1RMIX Input 2 Source */
+	{ 0x0000094b, 0x0080 }, /* R2379 (0x94B) - DSP1RMIX Input 2 Volume */
+	{ 0x0000094c, 0x0000 }, /* R2380 (0x94C) - DSP1RMIX Input 3 Source */
+	{ 0x0000094d, 0x0080 }, /* R2381 (0x94D) - DSP1RMIX Input 3 Volume */
+	{ 0x0000094e, 0x0000 }, /* R2382 (0x94E) - DSP1RMIX Input 4 Source */
+	{ 0x0000094f, 0x0080 }, /* R2383 (0x94F) - DSP1RMIX Input 4 Volume */
+	{ 0x00000950, 0x0000 }, /* R2384 (0x950) - DSP1AUX1MIX Input 1 Source */
+	{ 0x00000958, 0x0000 }, /* R2392 (0x958) - DSP1AUX2MIX Input 1 Source */
+	{ 0x00000960, 0x0000 }, /* R2400 (0x960) - DSP1AUX3MIX Input 1 Source */
+	{ 0x00000968, 0x0000 }, /* R2408 (0x968) - DSP1AUX4MIX Input 1 Source */
+	{ 0x00000970, 0x0000 }, /* R2416 (0x970) - DSP1AUX5MIX Input 1 Source */
+	{ 0x00000978, 0x0000 }, /* R2424 (0x978) - DSP1AUX6MIX Input 1 Source */
+	{ 0x00000b00, 0x0000 }, /* R2816 (0xB00) - ISRC1DEC1MIX Input 1 Source */
+	{ 0x00000b08, 0x0000 }, /* R2824 (0xB08) - ISRC1DEC2MIX Input 1 Source */
+	{ 0x00000b10, 0x0000 }, /* R2832 (0xB10) - ISRC1DEC3MIX Input 1 Source */
+	{ 0x00000b18, 0x0000 }, /* R2840 (0xB18) - ISRC1DEC4MIX Input 1 Source */
+	{ 0x00000b20, 0x0000 }, /* R2848 (0xB20) - ISRC1INT1MIX Input 1 Source */
+	{ 0x00000b28, 0x0000 }, /* R2856 (0xB28) - ISRC1INT2MIX Input 1 Source */
+	{ 0x00000b30, 0x0000 }, /* R2864 (0xB30) - ISRC1INT3MIX Input 1 Source */
+	{ 0x00000b38, 0x0000 }, /* R2872 (0xB38) - ISRC1INT4MIX Input 1 Source */
+	{ 0x00000b40, 0x0000 }, /* R2880 (0xB40) - ISRC2DEC1MIX Input 1 Source */
+	{ 0x00000b48, 0x0000 }, /* R2888 (0xB48) - ISRC2DEC2MIX Input 1 Source */
+	{ 0x00000b50, 0x0000 }, /* R2896 (0xB50) - ISRC2DEC3MIX Input 1 Source */
+	{ 0x00000b58, 0x0000 }, /* R2904 (0xB58) - ISRC2DEC4MIX Input 1 Source */
+	{ 0x00000b60, 0x0000 }, /* R2912 (0xB60) - ISRC2INT1MIX Input 1 Source */
+	{ 0x00000b68, 0x0000 }, /* R2920 (0xB68) - ISRC2INT2MIX Input 1 Source */
+	{ 0x00000b70, 0x0000 }, /* R2928 (0xB70) - ISRC2INT3MIX Input 1 Source */
+	{ 0x00000b78, 0x0000 }, /* R2936 (0xB78) - ISRC2INT4MIX Input 1 Source */
+	{ 0x00000e00, 0x0000 }, /* R3584 (0xE00) - FX Ctrl 1 */
+	{ 0x00000e10, 0x6318 }, /* R3600 (0xE10) - EQ1 1 */
+	{ 0x00000e11, 0x6300 }, /* R3601 (0xE11) - EQ1 2 */
+	{ 0x00000e12, 0x0fc8 }, /* R3602 (0xE12) - EQ1 3 */
+	{ 0x00000e13, 0x03fe }, /* R3603 (0xE13) - EQ1 4 */
+	{ 0x00000e14, 0x00e0 }, /* R3604 (0xE14) - EQ1 5 */
+	{ 0x00000e15, 0x1ec4 }, /* R3605 (0xE15) - EQ1 6 */
+	{ 0x00000e16, 0xf136 }, /* R3606 (0xE16) - EQ1 7 */
+	{ 0x00000e17, 0x0409 }, /* R3607 (0xE17) - EQ1 8 */
+	{ 0x00000e18, 0x04cc }, /* R3608 (0xE18) - EQ1 9 */
+	{ 0x00000e19, 0x1c9b }, /* R3609 (0xE19) - EQ1 10 */
+	{ 0x00000e1a, 0xf337 }, /* R3610 (0xE1A) - EQ1 11 */
+	{ 0x00000e1b, 0x040b }, /* R3611 (0xE1B) - EQ1 12 */
+	{ 0x00000e1c, 0x0cbb }, /* R3612 (0xE1C) - EQ1 13 */
+	{ 0x00000e1d, 0x16f8 }, /* R3613 (0xE1D) - EQ1 14 */
+	{ 0x00000e1e, 0xf7d9 }, /* R3614 (0xE1E) - EQ1 15 */
+	{ 0x00000e1f, 0x040a }, /* R3615 (0xE1F) - EQ1 16 */
+	{ 0x00000e20, 0x1f14 }, /* R3616 (0xE20) - EQ1 17 */
+	{ 0x00000e21, 0x058c }, /* R3617 (0xE21) - EQ1 18 */
+	{ 0x00000e22, 0x0563 }, /* R3618 (0xE22) - EQ1 19 */
+	{ 0x00000e23, 0x4000 }, /* R3619 (0xE23) - EQ1 20 */
+	{ 0x00000e24, 0x0b75 }, /* R3620 (0xE24) - EQ1 21 */
+	{ 0x00000e26, 0x6318 }, /* R3622 (0xE26) - EQ2 1 */
+	{ 0x00000e27, 0x6300 }, /* R3623 (0xE27) - EQ2 2 */
+	{ 0x00000e28, 0x0fc8 }, /* R3624 (0xE28) - EQ2 3 */
+	{ 0x00000e29, 0x03fe }, /* R3625 (0xE29) - EQ2 4 */
+	{ 0x00000e2a, 0x00e0 }, /* R3626 (0xE2A) - EQ2 5 */
+	{ 0x00000e2b, 0x1ec4 }, /* R3627 (0xE2B) - EQ2 6 */
+	{ 0x00000e2c, 0xf136 }, /* R3628 (0xE2C) - EQ2 7 */
+	{ 0x00000e2d, 0x0409 }, /* R3629 (0xE2D) - EQ2 8 */
+	{ 0x00000e2e, 0x04cc }, /* R3630 (0xE2E) - EQ2 9 */
+	{ 0x00000e2f, 0x1c9b }, /* R3631 (0xE2F) - EQ2 10 */
+	{ 0x00000e30, 0xf337 }, /* R3632 (0xE30) - EQ2 11 */
+	{ 0x00000e31, 0x040b }, /* R3633 (0xE31) - EQ2 12 */
+	{ 0x00000e32, 0x0cbb }, /* R3634 (0xE32) - EQ2 13 */
+	{ 0x00000e33, 0x16f8 }, /* R3635 (0xE33) - EQ2 14 */
+	{ 0x00000e34, 0xf7d9 }, /* R3636 (0xE34) - EQ2 15 */
+	{ 0x00000e35, 0x040a }, /* R3637 (0xE35) - EQ2 16 */
+	{ 0x00000e36, 0x1f14 }, /* R3638 (0xE36) - EQ2 17 */
+	{ 0x00000e37, 0x058c }, /* R3639 (0xE37) - EQ2 18 */
+	{ 0x00000e38, 0x0563 }, /* R3640 (0xE38) - EQ2 19 */
+	{ 0x00000e39, 0x4000 }, /* R3641 (0xE39) - EQ2 20 */
+	{ 0x00000e3a, 0x0b75 }, /* R3642 (0xE3A) - EQ2 21 */
+	{ 0x00000e3c, 0x6318 }, /* R3644 (0xE3C) - EQ3 1 */
+	{ 0x00000e3d, 0x6300 }, /* R3645 (0xE3D) - EQ3 2 */
+	{ 0x00000e3e, 0x0fc8 }, /* R3646 (0xE3E) - EQ3 3 */
+	{ 0x00000e3f, 0x03fe }, /* R3647 (0xE3F) - EQ3 4 */
+	{ 0x00000e40, 0x00e0 }, /* R3648 (0xE40) - EQ3 5 */
+	{ 0x00000e41, 0x1ec4 }, /* R3649 (0xE41) - EQ3 6 */
+	{ 0x00000e42, 0xf136 }, /* R3650 (0xE42) - EQ3 7 */
+	{ 0x00000e43, 0x0409 }, /* R3651 (0xE43) - EQ3 8 */
+	{ 0x00000e44, 0x04cc }, /* R3652 (0xE44) - EQ3 9 */
+	{ 0x00000e45, 0x1c9b }, /* R3653 (0xE45) - EQ3 10 */
+	{ 0x00000e46, 0xf337 }, /* R3654 (0xE46) - EQ3 11 */
+	{ 0x00000e47, 0x040b }, /* R3655 (0xE47) - EQ3 12 */
+	{ 0x00000e48, 0x0cbb }, /* R3656 (0xE48) - EQ3 13 */
+	{ 0x00000e49, 0x16f8 }, /* R3657 (0xE49) - EQ3 14 */
+	{ 0x00000e4a, 0xf7d9 }, /* R3658 (0xE4A) - EQ3 15 */
+	{ 0x00000e4b, 0x040a }, /* R3659 (0xE4B) - EQ3 16 */
+	{ 0x00000e4c, 0x1f14 }, /* R3660 (0xE4C) - EQ3 17 */
+	{ 0x00000e4d, 0x058c }, /* R3661 (0xE4D) - EQ3 18 */
+	{ 0x00000e4e, 0x0563 }, /* R3662 (0xE4E) - EQ3 19 */
+	{ 0x00000e4f, 0x4000 }, /* R3663 (0xE4F) - EQ3 20 */
+	{ 0x00000e50, 0x0b75 }, /* R3664 (0xE50) - EQ3 21 */
+	{ 0x00000e52, 0x6318 }, /* R3666 (0xE52) - EQ4 1 */
+	{ 0x00000e53, 0x6300 }, /* R3667 (0xE53) - EQ4 2 */
+	{ 0x00000e54, 0x0fc8 }, /* R3668 (0xE54) - EQ4 3 */
+	{ 0x00000e55, 0x03fe }, /* R3669 (0xE55) - EQ4 4 */
+	{ 0x00000e56, 0x00e0 }, /* R3670 (0xE56) - EQ4 5 */
+	{ 0x00000e57, 0x1ec4 }, /* R3671 (0xE57) - EQ4 6 */
+	{ 0x00000e58, 0xf136 }, /* R3672 (0xE58) - EQ4 7 */
+	{ 0x00000e59, 0x0409 }, /* R3673 (0xE59) - EQ4 8 */
+	{ 0x00000e5a, 0x04cc }, /* R3674 (0xE5A) - EQ4 9 */
+	{ 0x00000e5b, 0x1c9b }, /* R3675 (0xE5B) - EQ4 10 */
+	{ 0x00000e5c, 0xf337 }, /* R3676 (0xE5C) - EQ4 11 */
+	{ 0x00000e5d, 0x040b }, /* R3677 (0xE5D) - EQ4 12 */
+	{ 0x00000e5e, 0x0cbb }, /* R3678 (0xE5E) - EQ4 13 */
+	{ 0x00000e5f, 0x16f8 }, /* R3679 (0xE5F) - EQ4 14 */
+	{ 0x00000e60, 0xf7d9 }, /* R3680 (0xE60) - EQ4 15 */
+	{ 0x00000e61, 0x040a }, /* R3681 (0xE61) - EQ4 16 */
+	{ 0x00000e62, 0x1f14 }, /* R3682 (0xE62) - EQ4 17 */
+	{ 0x00000e63, 0x058c }, /* R3683 (0xE63) - EQ4 18 */
+	{ 0x00000e64, 0x0563 }, /* R3684 (0xE64) - EQ4 19 */
+	{ 0x00000e65, 0x4000 }, /* R3685 (0xE65) - EQ4 20 */
+	{ 0x00000e66, 0x0b75 }, /* R3686 (0xE66) - EQ4 21 */
+	{ 0x00000e80, 0x0018 }, /* R3712 (0xE80) - DRC1 Ctrl 1 */
+	{ 0x00000e81, 0x0933 }, /* R3713 (0xE81) - DRC1 Ctrl 2 */
+	{ 0x00000e82, 0x0018 }, /* R3714 (0xE82) - DRC1 Ctrl 3 */
+	{ 0x00000e83, 0x0000 }, /* R3715 (0xE83) - DRC1 Ctrl 4 */
+	{ 0x00000e84, 0x0000 }, /* R3716 (0xE84) - DRC1 Ctrl 5 */
+	{ 0x00000e88, 0x0018 }, /* R3720 (0xE88) - DRC2 Ctrl 1 */
+	{ 0x00000e89, 0x0933 }, /* R3721 (0xE89) - DRC2 Ctrl 2 */
+	{ 0x00000e8a, 0x0018 }, /* R3722 (0xE8A) - DRC2 Ctrl 3 */
+	{ 0x00000e8b, 0x0000 }, /* R3723 (0xE8B) - DRC2 Ctrl 4 */
+	{ 0x00000e8c, 0x0000 }, /* R3724 (0xE8C) - DRC2 Ctrl 5 */
+	{ 0x00000ec0, 0x0000 }, /* R3776 (0xEC0) - HPLPF1 1 */
+	{ 0x00000ec1, 0x0000 }, /* R3777 (0xEC1) - HPLPF1 2 */
+	{ 0x00000ec4, 0x0000 }, /* R3780 (0xEC4) - HPLPF2 1 */
+	{ 0x00000ec5, 0x0000 }, /* R3781 (0xEC5) - HPLPF2 2 */
+	{ 0x00000ec8, 0x0000 }, /* R3784 (0xEC8) - HPLPF3 1 */
+	{ 0x00000ec9, 0x0000 }, /* R3785 (0xEC9) - HPLPF3 2 */
+	{ 0x00000ecc, 0x0000 }, /* R3788 (0xECC) - HPLPF4 1 */
+	{ 0x00000ecd, 0x0000 }, /* R3789 (0xECD) - HPLPF4 2 */
+	{ 0x00000ef0, 0x0000 }, /* R3824 (0xEF0) - ISRC1 Ctrl 1 */
+	{ 0x00000ef1, 0x0001 }, /* R3825 (0xEF1) - ISRC1 Ctrl 2 */
+	{ 0x00000ef2, 0x0000 }, /* R3826 (0xEF2) - ISRC1 Ctrl 3 */
+	{ 0x00000ef3, 0x0000 }, /* R3827 (0xEF3) - ISRC2 Ctrl 1 */
+	{ 0x00000ef4, 0x0001 }, /* R3828 (0xEF4) - ISRC2 Ctrl 2 */
+	{ 0x00000ef5, 0x0000 }, /* R3829 (0xEF5) - ISRC2 Ctrl 3 */
+	{ 0x00001700, 0x2801 }, /* R5888 (0x1700) - GPIO1 Ctrl 1 */
+	{ 0x00001701, 0xe800 }, /* R5889 (0x1701) - GPIO1 Ctrl 2 */
+	{ 0x00001702, 0x2801 }, /* R5890 (0x1702) - GPIO2 Ctrl 1 */
+	{ 0x00001703, 0xe800 }, /* R5891 (0x1703) - GPIO2 Ctrl 2 */
+	{ 0x00001704, 0x2801 }, /* R5892 (0x1704) - GPIO3 Ctrl 1 */
+	{ 0x00001705, 0xe800 }, /* R5893 (0x1705) - GPIO3 Ctrl 2 */
+	{ 0x00001706, 0x2801 }, /* R5894 (0x1706) - GPIO4 Ctrl 1 */
+	{ 0x00001707, 0xe800 }, /* R5895 (0x1707) - GPIO4 Ctrl 2 */
+	{ 0x00001708, 0x2801 }, /* R5896 (0x1708) - GPIO5 Ctrl 1 */
+	{ 0x00001709, 0xe800 }, /* R5897 (0x1709) - GPIO5 Ctrl 2 */
+	{ 0x0000170a, 0x2801 }, /* R5898 (0x170A) - GPIO6 Ctrl 1 */
+	{ 0x0000170b, 0xe800 }, /* R5899 (0x170B) - GPIO6 Ctrl 2 */
+	{ 0x0000170c, 0x2801 }, /* R5900 (0x170C) - GPIO7 Ctrl 1 */
+	{ 0x0000170d, 0xe800 }, /* R5901 (0x170D) - GPIO7 Ctrl 2 */
+	{ 0x0000170e, 0x2801 }, /* R5902 (0x170E) - GPIO8 Ctrl 1 */
+	{ 0x0000170f, 0xe800 }, /* R5903 (0x170F) - GPIO8 Ctrl 2 */
+	{ 0x00001710, 0x2801 }, /* R5904 (0x1710) - GPIO9 Ctrl 1 */
+	{ 0x00001711, 0xe800 }, /* R5905 (0x1711) - GPIO9 Ctrl 2 */
+	{ 0x00001712, 0x2801 }, /* R5906 (0x1712) - GPIO10 Ctrl 1 */
+	{ 0x00001713, 0xe800 }, /* R5907 (0x1713) - GPIO10 Ctrl 2 */
+	{ 0x00001714, 0x2801 }, /* R5908 (0x1714) - GPIO11 Ctrl 1 */
+	{ 0x00001715, 0xe800 }, /* R5909 (0x1715) - GPIO11 Ctrl 2 */
+	{ 0x00001716, 0x2801 }, /* R5910 (0x1716) - GPIO12 Ctrl 1 */
+	{ 0x00001717, 0xe800 }, /* R5911 (0x1717) - GPIO12 Ctrl 2 */
+	{ 0x00001718, 0x2801 }, /* R5912 (0x1718) - GPIO13 Ctrl 1 */
+	{ 0x00001719, 0xe800 }, /* R5913 (0x1719) - GPIO13 Ctrl 2 */
+	{ 0x0000171a, 0x2801 }, /* R5914 (0x171A) - GPIO14 Ctrl 1 */
+	{ 0x0000171b, 0xe800 }, /* R5915 (0x171B) - GPIO14 Ctrl 2 */
+	{ 0x0000171c, 0x2801 }, /* R5916 (0x171C) - GPIO15 Ctrl 1 */
+	{ 0x0000171d, 0xe800 }, /* R5917 (0x171D) - GPIO15 Ctrl 2 */
+	{ 0x00001840, 0xffff }, /* R6208 (0x1840) - IRQ1 Mask 1 */
+	{ 0x00001841, 0xffff }, /* R6209 (0x1841) - IRQ1 Mask 2 */
+	{ 0x00001842, 0xffff }, /* R6210 (0x1842) - IRQ1 Mask 3 */
+	{ 0x00001843, 0xffff }, /* R6211 (0x1843) - IRQ1 Mask 4 */
+	{ 0x00001844, 0xffff }, /* R6212 (0x1844) - IRQ1 Mask 5 */
+	{ 0x00001845, 0xffff }, /* R6213 (0x1845) - IRQ1 Mask 6 */
+	{ 0x00001846, 0xffff }, /* R6214 (0x1846) - IRQ1 Mask 7 */
+	{ 0x00001847, 0xffff }, /* R6215 (0x1847) - IRQ1 Mask 8 */
+	{ 0x00001848, 0xffff }, /* R6216 (0x1848) - IRQ1 Mask 9 */
+	{ 0x00001849, 0xffff }, /* R6217 (0x1849) - IRQ1 Mask 10 */
+	{ 0x0000184a, 0xffff }, /* R6218 (0x184A) - IRQ1 Mask 11 */
+	{ 0x0000184b, 0xffff }, /* R6219 (0x184B) - IRQ1 Mask 12 */
+	{ 0x0000184c, 0xffff }, /* R6220 (0x184C) - IRQ1 Mask 13 */
+	{ 0x0000184d, 0xffff }, /* R6221 (0x184D) - IRQ1 Mask 14 */
+	{ 0x0000184e, 0xffff }, /* R6222 (0x184E) - IRQ1 Mask 15 */
+	{ 0x0000184f, 0xffff }, /* R6223 (0x184F) - IRQ1 Mask 16 */
+	{ 0x00001850, 0xffff }, /* R6224 (0x1850) - IRQ1 Mask 17 */
+	{ 0x00001851, 0xffff }, /* R6225 (0x1851) - IRQ1 Mask 18 */
+	{ 0x00001852, 0xffff }, /* R6226 (0x1852) - IRQ1 Mask 19 */
+	{ 0x00001853, 0xffff }, /* R6227 (0x1853) - IRQ1 Mask 20 */
+	{ 0x00001854, 0xffff }, /* R6228 (0x1854) - IRQ1 Mask 21 */
+	{ 0x00001855, 0xffff }, /* R6229 (0x1855) - IRQ1 Mask 22 */
+	{ 0x00001856, 0xffff }, /* R6230 (0x1856) - IRQ1 Mask 23 */
+	{ 0x00001857, 0xffff }, /* R6231 (0x1857) - IRQ1 Mask 24 */
+	{ 0x00001858, 0xffff }, /* R6232 (0x1858) - IRQ1 Mask 25 */
+	{ 0x00001859, 0xffff }, /* R6233 (0x1859) - IRQ1 Mask 26 */
+	{ 0x0000185a, 0xffff }, /* R6234 (0x185A) - IRQ1 Mask 27 */
+	{ 0x0000185b, 0xffff }, /* R6235 (0x185B) - IRQ1 Mask 28 */
+	{ 0x0000185c, 0xffff }, /* R6236 (0x185C) - IRQ1 Mask 29 */
+	{ 0x0000185d, 0xffff }, /* R6237 (0x185D) - IRQ1 Mask 30 */
+	{ 0x0000185e, 0xffff }, /* R6238 (0x185E) - IRQ1 Mask 31 */
+	{ 0x0000185f, 0xffff }, /* R6239 (0x185F) - IRQ1 Mask 32 */
+	{ 0x00001860, 0xffff }, /* R6240 (0x1860) - IRQ1 Mask 33 */
+	{ 0x00001a06, 0x0000 }, /* R6662 (0x1A06) - Interrupt Debounce 7 */
+	{ 0x00001a80, 0x4400 }, /* R6784 (0x1A80) - IRQ1 Ctrl */
+};
+
+static bool cs47l15_is_adsp_memory(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case 0x080000 ... 0x088ffe:
+	case 0x0a0000 ... 0x0a9ffe:
+	case 0x0c0000 ... 0x0c1ffe:
+	case 0x0e0000 ... 0x0e1ffe:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool cs47l15_16bit_readable_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_SOFTWARE_RESET:
+	case MADERA_HARDWARE_REVISION:
+	case MADERA_WRITE_SEQUENCER_CTRL_0 ... MADERA_WRITE_SEQUENCER_CTRL_2:
+	case MADERA_TONE_GENERATOR_1 ... MADERA_TONE_GENERATOR_5:
+	case MADERA_PWM_DRIVE_1 ... MADERA_PWM_DRIVE_3:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_1:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_2:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_3:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_4:
+	case MADERA_ALWAYS_ON_TRIGGERS_SEQUENCE_SELECT_1:
+	case MADERA_ALWAYS_ON_TRIGGERS_SEQUENCE_SELECT_2:
+	case MADERA_HAPTICS_CONTROL_1 ... MADERA_HAPTICS_CONTROL_2:
+	case MADERA_HAPTICS_PHASE_1_INTENSITY:
+	case MADERA_HAPTICS_PHASE_1_DURATION:
+	case MADERA_HAPTICS_PHASE_2_INTENSITY:
+	case MADERA_HAPTICS_PHASE_2_DURATION:
+	case MADERA_HAPTICS_PHASE_3_INTENSITY:
+	case MADERA_HAPTICS_PHASE_3_DURATION:
+	case MADERA_HAPTICS_STATUS:
+	case MADERA_COMFORT_NOISE_GENERATOR:
+	case MADERA_CLOCK_32K_1:
+	case MADERA_SYSTEM_CLOCK_1:
+	case MADERA_SAMPLE_RATE_1 ... MADERA_SAMPLE_RATE_3:
+	case MADERA_SAMPLE_RATE_1_STATUS:
+	case MADERA_SAMPLE_RATE_2_STATUS:
+	case MADERA_SAMPLE_RATE_3_STATUS:
+	case MADERA_DSP_CLOCK_1:
+	case MADERA_DSP_CLOCK_2:
+	case MADERA_OUTPUT_SYSTEM_CLOCK:
+	case MADERA_RATE_ESTIMATOR_1 ... MADERA_RATE_ESTIMATOR_5:
+	case MADERA_FLL1_CONTROL_1 ... MADERA_FLL1_CONTROL_6:
+	case MADERA_FLL1_CONTROL_7:
+	case MADERA_FLL1_EFS_2:
+	case MADERA_FLL1_LOOP_FILTER_TEST_1:
+	case MADERA_FLL1_SYNCHRONISER_1 ... MADERA_FLL1_SYNCHRONISER_7:
+	case MADERA_FLL1_SPREAD_SPECTRUM:
+	case MADERA_FLL1_GPIO_CLOCK:
+	case MADERA_FLLAO_CONTROL_1:
+	case MADERA_FLLAO_CONTROL_2:
+	case MADERA_FLLAO_CONTROL_3:
+	case MADERA_FLLAO_CONTROL_4:
+	case MADERA_FLLAO_CONTROL_5:
+	case MADERA_FLLAO_CONTROL_6:
+	case MADERA_FLLAO_CONTROL_7:
+	case MADERA_FLLAO_CONTROL_8:
+	case MADERA_FLLAO_CONTROL_9:
+	case MADERA_FLLAO_CONTROL_10:
+	case MADERA_FLLAO_CONTROL_11:
+	case MADERA_MIC_BIAS_CTRL_1:
+	case MADERA_MIC_BIAS_CTRL_5:
+	case MADERA_HP_CTRL_1L:
+	case MADERA_HP_CTRL_1R:
+	case MADERA_ACCESSORY_DETECT_MODE_1:
+	case MADERA_HEADPHONE_DETECT_0:
+	case MADERA_HEADPHONE_DETECT_1:
+	case MADERA_HEADPHONE_DETECT_2:
+	case MADERA_HEADPHONE_DETECT_3:
+	case MADERA_HEADPHONE_DETECT_5:
+	case MADERA_MICD_CLAMP_CONTROL:
+	case MADERA_MIC_DETECT_1_CONTROL_0:
+	case MADERA_MIC_DETECT_1_CONTROL_1:
+	case MADERA_MIC_DETECT_1_CONTROL_2:
+	case MADERA_MIC_DETECT_1_CONTROL_3:
+	case MADERA_MIC_DETECT_1_LEVEL_1 ... MADERA_MIC_DETECT_1_LEVEL_4:
+	case MADERA_MIC_DETECT_1_CONTROL_4:
+	case MADERA_GP_SWITCH_1:
+	case MADERA_JACK_DETECT_ANALOGUE:
+	case MADERA_INPUT_ENABLES:
+	case MADERA_INPUT_ENABLES_STATUS:
+	case MADERA_INPUT_RATE:
+	case MADERA_INPUT_VOLUME_RAMP:
+	case MADERA_HPF_CONTROL:
+	case MADERA_IN1L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_1L:
+	case MADERA_DMIC1L_CONTROL:
+	case MADERA_IN1L_RATE_CONTROL:
+	case MADERA_IN1R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_1R:
+	case MADERA_DMIC1R_CONTROL:
+	case MADERA_IN1R_RATE_CONTROL:
+	case MADERA_IN2L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_2L:
+	case MADERA_DMIC2L_CONTROL:
+	case MADERA_IN2L_RATE_CONTROL:
+	case MADERA_IN2R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_2R:
+	case MADERA_DMIC2R_CONTROL:
+	case MADERA_IN2R_RATE_CONTROL:
+	case CS47L15_ADC_INT_BIAS:
+	case CS47L15_PGA_BIAS_SEL:
+	case MADERA_OUTPUT_ENABLES_1:
+	case MADERA_OUTPUT_STATUS_1:
+	case MADERA_RAW_OUTPUT_STATUS_1:
+	case MADERA_OUTPUT_RATE_1:
+	case MADERA_OUTPUT_VOLUME_RAMP:
+	case MADERA_OUTPUT_PATH_CONFIG_1L:
+	case MADERA_DAC_DIGITAL_VOLUME_1L:
+	case MADERA_OUTPUT_PATH_CONFIG_1:
+	case MADERA_NOISE_GATE_SELECT_1L:
+	case MADERA_OUTPUT_PATH_CONFIG_1R:
+	case MADERA_DAC_DIGITAL_VOLUME_1R:
+	case MADERA_NOISE_GATE_SELECT_1R:
+	case MADERA_OUTPUT_PATH_CONFIG_2:
+	case MADERA_OUTPUT_PATH_CONFIG_4L:
+	case MADERA_DAC_DIGITAL_VOLUME_4L:
+	case MADERA_NOISE_GATE_SELECT_4L:
+	case MADERA_OUTPUT_PATH_CONFIG_5L:
+	case MADERA_DAC_DIGITAL_VOLUME_5L:
+	case MADERA_NOISE_GATE_SELECT_5L:
+	case MADERA_OUTPUT_PATH_CONFIG_5R:
+	case MADERA_DAC_DIGITAL_VOLUME_5R:
+	case MADERA_NOISE_GATE_SELECT_5R:
+	case MADERA_DAC_AEC_CONTROL_1:
+	case MADERA_DAC_AEC_CONTROL_2:
+	case MADERA_NOISE_GATE_CONTROL:
+	case MADERA_PDM_SPK1_CTRL_1 ... MADERA_PDM_SPK1_CTRL_2:
+	case MADERA_HP1_SHORT_CIRCUIT_CTRL:
+	case MADERA_HP_TEST_CTRL_5:
+	case MADERA_HP_TEST_CTRL_6:
+	case MADERA_AIF1_BCLK_CTRL:
+	case MADERA_AIF1_TX_PIN_CTRL:
+	case MADERA_AIF1_RX_PIN_CTRL:
+	case MADERA_AIF1_RATE_CTRL:
+	case MADERA_AIF1_FORMAT:
+	case MADERA_AIF1_RX_BCLK_RATE:
+	case MADERA_AIF1_FRAME_CTRL_1 ... MADERA_AIF1_FRAME_CTRL_8:
+	case MADERA_AIF1_FRAME_CTRL_11 ... MADERA_AIF1_FRAME_CTRL_16:
+	case MADERA_AIF1_TX_ENABLES:
+	case MADERA_AIF1_RX_ENABLES:
+	case MADERA_AIF2_BCLK_CTRL:
+	case MADERA_AIF2_TX_PIN_CTRL:
+	case MADERA_AIF2_RX_PIN_CTRL:
+	case MADERA_AIF2_RATE_CTRL:
+	case MADERA_AIF2_FORMAT:
+	case MADERA_AIF2_RX_BCLK_RATE:
+	case MADERA_AIF2_FRAME_CTRL_1 ... MADERA_AIF2_FRAME_CTRL_6:
+	case MADERA_AIF2_FRAME_CTRL_11 ... MADERA_AIF2_FRAME_CTRL_14:
+	case MADERA_AIF2_TX_ENABLES:
+	case MADERA_AIF2_RX_ENABLES:
+	case MADERA_AIF3_BCLK_CTRL:
+	case MADERA_AIF3_TX_PIN_CTRL:
+	case MADERA_AIF3_RX_PIN_CTRL:
+	case MADERA_AIF3_RATE_CTRL:
+	case MADERA_AIF3_FORMAT:
+	case MADERA_AIF3_RX_BCLK_RATE:
+	case MADERA_AIF3_FRAME_CTRL_1 ... MADERA_AIF3_FRAME_CTRL_4:
+	case MADERA_AIF3_FRAME_CTRL_11 ... MADERA_AIF3_FRAME_CTRL_12:
+	case MADERA_AIF3_TX_ENABLES:
+	case MADERA_AIF3_RX_ENABLES:
+	case MADERA_SPD1_TX_CONTROL:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_2:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_3:
+	case MADERA_PWM1MIX_INPUT_1_SOURCE:
+	case MADERA_PWM1MIX_INPUT_1_VOLUME:
+	case MADERA_PWM1MIX_INPUT_2_SOURCE:
+	case MADERA_PWM1MIX_INPUT_2_VOLUME:
+	case MADERA_PWM1MIX_INPUT_3_SOURCE:
+	case MADERA_PWM1MIX_INPUT_3_VOLUME:
+	case MADERA_PWM1MIX_INPUT_4_SOURCE:
+	case MADERA_PWM1MIX_INPUT_4_VOLUME:
+	case MADERA_PWM2MIX_INPUT_1_SOURCE:
+	case MADERA_PWM2MIX_INPUT_1_VOLUME:
+	case MADERA_PWM2MIX_INPUT_2_SOURCE:
+	case MADERA_PWM2MIX_INPUT_2_VOLUME:
+	case MADERA_PWM2MIX_INPUT_3_SOURCE:
+	case MADERA_PWM2MIX_INPUT_3_VOLUME:
+	case MADERA_PWM2MIX_INPUT_4_SOURCE:
+	case MADERA_PWM2MIX_INPUT_4_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_4_VOLUME:
+	case MADERA_OUT4LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT4LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT4LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT4LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT4LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT4LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT4LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT4LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_4_VOLUME:
+	case MADERA_SPDIF1TX1MIX_INPUT_1_SOURCE:
+	case MADERA_SPDIF1TX1MIX_INPUT_1_VOLUME:
+	case MADERA_SPDIF1TX2MIX_INPUT_1_SOURCE:
+	case MADERA_SPDIF1TX2MIX_INPUT_1_VOLUME:
+	case MADERA_EQ1MIX_INPUT_1_SOURCE:
+	case MADERA_EQ1MIX_INPUT_1_VOLUME:
+	case MADERA_EQ1MIX_INPUT_2_SOURCE:
+	case MADERA_EQ1MIX_INPUT_2_VOLUME:
+	case MADERA_EQ1MIX_INPUT_3_SOURCE:
+	case MADERA_EQ1MIX_INPUT_3_VOLUME:
+	case MADERA_EQ1MIX_INPUT_4_SOURCE:
+	case MADERA_EQ1MIX_INPUT_4_VOLUME:
+	case MADERA_EQ2MIX_INPUT_1_SOURCE:
+	case MADERA_EQ2MIX_INPUT_1_VOLUME:
+	case MADERA_EQ2MIX_INPUT_2_SOURCE:
+	case MADERA_EQ2MIX_INPUT_2_VOLUME:
+	case MADERA_EQ2MIX_INPUT_3_SOURCE:
+	case MADERA_EQ2MIX_INPUT_3_VOLUME:
+	case MADERA_EQ2MIX_INPUT_4_SOURCE:
+	case MADERA_EQ2MIX_INPUT_4_VOLUME:
+	case MADERA_EQ3MIX_INPUT_1_SOURCE:
+	case MADERA_EQ3MIX_INPUT_1_VOLUME:
+	case MADERA_EQ3MIX_INPUT_2_SOURCE:
+	case MADERA_EQ3MIX_INPUT_2_VOLUME:
+	case MADERA_EQ3MIX_INPUT_3_SOURCE:
+	case MADERA_EQ3MIX_INPUT_3_VOLUME:
+	case MADERA_EQ3MIX_INPUT_4_SOURCE:
+	case MADERA_EQ3MIX_INPUT_4_VOLUME:
+	case MADERA_EQ4MIX_INPUT_1_SOURCE:
+	case MADERA_EQ4MIX_INPUT_1_VOLUME:
+	case MADERA_EQ4MIX_INPUT_2_SOURCE:
+	case MADERA_EQ4MIX_INPUT_2_VOLUME:
+	case MADERA_EQ4MIX_INPUT_3_SOURCE:
+	case MADERA_EQ4MIX_INPUT_3_VOLUME:
+	case MADERA_EQ4MIX_INPUT_4_SOURCE:
+	case MADERA_EQ4MIX_INPUT_4_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_1_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_1_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_2_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_2_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_3_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_3_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_4_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_4_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_1_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_1_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_2_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_2_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_3_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_3_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_4_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_4_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_1_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_1_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_2_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_2_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_3_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_3_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_4_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_4_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_1_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_1_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_2_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_2_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_3_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_3_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_4_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_4_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_4_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_1_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_1_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_2_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_2_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_3_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_3_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_4_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_4_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_1_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_1_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_2_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_2_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_3_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_3_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_4_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_4_VOLUME:
+	case MADERA_DSP1AUX1MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX2MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX3MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX4MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX5MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX6MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC3MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC4MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT3MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT4MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC3MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC4MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT3MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT4MIX_INPUT_1_SOURCE:
+	case MADERA_FX_CTRL1 ... MADERA_FX_CTRL2:
+	case MADERA_EQ1_1 ... MADERA_EQ1_21:
+	case MADERA_EQ2_1 ... MADERA_EQ2_21:
+	case MADERA_EQ3_1 ... MADERA_EQ3_21:
+	case MADERA_EQ4_1 ... MADERA_EQ4_21:
+	case MADERA_DRC1_CTRL1 ... MADERA_DRC1_CTRL5:
+	case MADERA_DRC2_CTRL1 ... MADERA_DRC2_CTRL5:
+	case MADERA_HPLPF1_1 ... MADERA_HPLPF1_2:
+	case MADERA_HPLPF2_1 ... MADERA_HPLPF2_2:
+	case MADERA_HPLPF3_1 ... MADERA_HPLPF3_2:
+	case MADERA_HPLPF4_1 ... MADERA_HPLPF4_2:
+	case MADERA_ISRC_1_CTRL_1 ... MADERA_ISRC_1_CTRL_3:
+	case MADERA_ISRC_2_CTRL_1 ... MADERA_ISRC_2_CTRL_3:
+	case MADERA_GPIO1_CTRL_1 ... MADERA_GPIO15_CTRL_2:
+	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:
+	case MADERA_IRQ1_MASK_1 ... MADERA_IRQ1_MASK_33:
+	case MADERA_IRQ1_RAW_STATUS_1 ... MADERA_IRQ1_RAW_STATUS_33:
+	case MADERA_INTERRUPT_DEBOUNCE_7:
+	case MADERA_IRQ1_CTRL:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool cs47l15_16bit_volatile_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_SOFTWARE_RESET:
+	case MADERA_HARDWARE_REVISION:
+	case MADERA_WRITE_SEQUENCER_CTRL_0 ... MADERA_WRITE_SEQUENCER_CTRL_2:
+	case MADERA_HAPTICS_STATUS:
+	case MADERA_SAMPLE_RATE_1_STATUS:
+	case MADERA_SAMPLE_RATE_2_STATUS:
+	case MADERA_SAMPLE_RATE_3_STATUS:
+	case MADERA_HP_CTRL_1L:
+	case MADERA_HP_CTRL_1R:
+	case MADERA_MIC_DETECT_1_CONTROL_3:
+	case MADERA_MIC_DETECT_1_CONTROL_4:
+	case MADERA_HEADPHONE_DETECT_2:
+	case MADERA_HEADPHONE_DETECT_3:
+	case MADERA_HEADPHONE_DETECT_5:
+	case MADERA_INPUT_ENABLES_STATUS:
+	case MADERA_OUTPUT_STATUS_1:
+	case MADERA_RAW_OUTPUT_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_2:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_3:
+	case MADERA_FX_CTRL2:
+	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:
+	case MADERA_IRQ1_RAW_STATUS_1 ... MADERA_IRQ1_RAW_STATUS_33:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool cs47l15_32bit_readable_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_WSEQ_SEQUENCE_1 ... MADERA_WSEQ_SEQUENCE_225:
+	case MADERA_OTP_HPDET_CAL_1 ... MADERA_OTP_HPDET_CAL_2:
+	case MADERA_DSP1_CONFIG_1 ... MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR:
+		return true;
+	default:
+		return cs47l15_is_adsp_memory(dev, reg);
+	}
+}
+
+static bool cs47l15_32bit_volatile_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_WSEQ_SEQUENCE_1 ... MADERA_WSEQ_SEQUENCE_225:
+	case MADERA_OTP_HPDET_CAL_1 ... MADERA_OTP_HPDET_CAL_2:
+	case MADERA_DSP1_CONFIG_1 ... MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR:
+		return true;
+	default:
+		return cs47l15_is_adsp_memory(dev, reg);
+	}
+}
+
+const struct regmap_config cs47l15_16bit_spi_regmap = {
+	.name = "cs47l15_16bit",
+	.reg_bits = 32,
+	.pad_bits = 16,
+	.val_bits = 16,
+
+	.max_register = MADERA_INTERRUPT_RAW_STATUS_1,
+	.readable_reg = &cs47l15_16bit_readable_register,
+	.volatile_reg = &cs47l15_16bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+	.reg_defaults = cs47l15_reg_default,
+	.num_reg_defaults = ARRAY_SIZE(cs47l15_reg_default),
+};
+EXPORT_SYMBOL_GPL(cs47l15_16bit_spi_regmap);
+
+const struct regmap_config cs47l15_16bit_i2c_regmap = {
+	.name = "cs47l15_16bit",
+	.reg_bits = 32,
+	.val_bits = 16,
+
+	.max_register = MADERA_INTERRUPT_RAW_STATUS_1,
+	.readable_reg = &cs47l15_16bit_readable_register,
+	.volatile_reg = &cs47l15_16bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+	.reg_defaults = cs47l15_reg_default,
+	.num_reg_defaults = ARRAY_SIZE(cs47l15_reg_default),
+};
+EXPORT_SYMBOL_GPL(cs47l15_16bit_i2c_regmap);
+
+const struct regmap_config cs47l15_32bit_spi_regmap = {
+	.name = "cs47l15_32bit",
+	.reg_bits = 32,
+	.reg_stride = 2,
+	.pad_bits = 16,
+	.val_bits = 32,
+
+	.max_register = MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR,
+	.readable_reg = &cs47l15_32bit_readable_register,
+	.volatile_reg = &cs47l15_32bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+};
+EXPORT_SYMBOL_GPL(cs47l15_32bit_spi_regmap);
+
+const struct regmap_config cs47l15_32bit_i2c_regmap = {
+	.name = "cs47l15_32bit",
+	.reg_bits = 32,
+	.reg_stride = 2,
+	.val_bits = 32,
+
+	.max_register = MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR,
+	.readable_reg = &cs47l15_32bit_readable_register,
+	.volatile_reg = &cs47l15_32bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+};
+EXPORT_SYMBOL_GPL(cs47l15_32bit_i2c_regmap);
diff --git a/drivers/mfd/madera-core.c b/drivers/mfd/madera-core.c
index bc4e25b5b97d..a354567ebc86 100644
--- a/drivers/mfd/madera-core.c
+++ b/drivers/mfd/madera-core.c
@@ -31,6 +31,7 @@
 
 #include "madera.h"
 
+#define CS47L15_SILICON_ID	0x6370
 #define CS47L35_SILICON_ID	0x6360
 #define CS47L85_SILICON_ID	0x6338
 #define CS47L90_SILICON_ID	0x6364
@@ -46,6 +47,28 @@ static const struct mfd_cell madera_ldo1_devs[] = {
 	{ .name = "madera-ldo1" },
 };
 
+static const char * const cs47l15_supplies[] = {
+	"MICVDD",
+	"CPVDD1",
+	"SPKVDD",
+};
+
+static const struct mfd_cell cs47l15_devs[] = {
+	{ .name = "madera-pinctrl", },
+	{ .name = "madera-irq" },
+	{ .name = "madera-gpio" },
+	{
+		.name = "madera-extcon",
+		.parent_supplies = cs47l15_supplies,
+		.num_parent_supplies = 1, /* We only need MICVDD */
+	},
+	{
+		.name = "cs47l15-codec",
+		.parent_supplies = cs47l15_supplies,
+		.num_parent_supplies = ARRAY_SIZE(cs47l15_supplies),
+	},
+};
+
 static const char * const cs47l35_supplies[] = {
 	"MICVDD",
 	"DBVDD2",
@@ -129,6 +152,8 @@ static const struct mfd_cell cs47l90_devs[] = {
 const char *madera_name_from_type(enum madera_type type)
 {
 	switch (type) {
+	case CS47L15:
+		return "CS47L15";
 	case CS47L35:
 		return "CS47L35";
 	case CS47L85:
@@ -291,6 +316,7 @@ const struct dev_pm_ops madera_pm_ops = {
 EXPORT_SYMBOL_GPL(madera_pm_ops);
 
 const struct of_device_id madera_of_match[] = {
+	{ .compatible = "cirrus,cs47l15", .data = (void *)CS47L15 },
 	{ .compatible = "cirrus,cs47l35", .data = (void *)CS47L35 },
 	{ .compatible = "cirrus,cs47l85", .data = (void *)CS47L85 },
 	{ .compatible = "cirrus,cs47l90", .data = (void *)CS47L90 },
@@ -339,6 +365,10 @@ static void madera_set_micbias_info(struct madera *madera)
 	 * childbiases for each micbias. Unspecified values default to 0.
 	 */
 	switch (madera->type) {
+	case CS47L15:
+		madera->num_micbias = 1;
+		madera->num_childbias[0] = 3;
+		return;
 	case CS47L35:
 		madera->num_micbias = 2;
 		madera->num_childbias[0] = 2;
@@ -402,6 +432,7 @@ int madera_dev_init(struct madera *madera)
 	 * No devm_ because we need to control shutdown order of children.
 	 */
 	switch (madera->type) {
+	case CS47L15:
 	case CS47L35:
 	case CS47L90:
 	case CS47L91:
@@ -471,6 +502,19 @@ int madera_dev_init(struct madera *madera)
 	}
 
 	switch (hwid) {
+	case CS47L15_SILICON_ID:
+		if (IS_ENABLED(CONFIG_MFD_CS47L15)) {
+			switch (madera->type) {
+			case CS47L15:
+				patch_fn = &cs47l15_patch;
+				mfd_devs = cs47l15_devs;
+				n_devs = ARRAY_SIZE(cs47l15_devs);
+				break;
+			default:
+				break;
+			}
+		}
+		break;
 	case CS47L35_SILICON_ID:
 		if (IS_ENABLED(CONFIG_MFD_CS47L35)) {
 			switch (madera->type) {
diff --git a/drivers/mfd/madera-i2c.c b/drivers/mfd/madera-i2c.c
index 05ae94be01d8..bd868459cedb 100644
--- a/drivers/mfd/madera-i2c.c
+++ b/drivers/mfd/madera-i2c.c
@@ -39,6 +39,12 @@ static int madera_i2c_probe(struct i2c_client *i2c,
 		type = id->driver_data;
 
 	switch (type) {
+	case CS47L15:
+		if (IS_ENABLED(CONFIG_MFD_CS47L15)) {
+			regmap_16bit_config = &cs47l15_16bit_i2c_regmap;
+			regmap_32bit_config = &cs47l15_32bit_i2c_regmap;
+		}
+		break;
 	case CS47L35:
 		if (IS_ENABLED(CONFIG_MFD_CS47L35)) {
 			regmap_16bit_config = &cs47l35_16bit_i2c_regmap;
@@ -113,6 +119,7 @@ static int madera_i2c_remove(struct i2c_client *i2c)
 }
 
 static const struct i2c_device_id madera_i2c_id[] = {
+	{ "cs47l15", CS47L15 },
 	{ "cs47l35", CS47L35 },
 	{ "cs47l85", CS47L85 },
 	{ "cs47l90", CS47L90 },
diff --git a/drivers/mfd/madera-spi.c b/drivers/mfd/madera-spi.c
index 4c398b278bba..a36741b73c25 100644
--- a/drivers/mfd/madera-spi.c
+++ b/drivers/mfd/madera-spi.c
@@ -39,6 +39,12 @@ static int madera_spi_probe(struct spi_device *spi)
 		type = id->driver_data;
 
 	switch (type) {
+	case CS47L15:
+		if (IS_ENABLED(CONFIG_MFD_CS47L15)) {
+			regmap_16bit_config = &cs47l15_16bit_spi_regmap;
+			regmap_32bit_config = &cs47l15_32bit_spi_regmap;
+		}
+		break;
 	case CS47L35:
 		if (IS_ENABLED(CONFIG_MFD_CS47L35)) {
 			regmap_16bit_config = &cs47l35_16bit_spi_regmap;
@@ -112,6 +118,7 @@ static int madera_spi_remove(struct spi_device *spi)
 }
 
 static const struct spi_device_id madera_spi_ids[] = {
+	{ "cs47l15", CS47L15 },
 	{ "cs47l35", CS47L35 },
 	{ "cs47l85", CS47L85 },
 	{ "cs47l90", CS47L90 },
diff --git a/drivers/mfd/madera.h b/drivers/mfd/madera.h
index 891b84efb9a7..ccc16f2a1288 100644
--- a/drivers/mfd/madera.h
+++ b/drivers/mfd/madera.h
@@ -24,6 +24,12 @@ int madera_dev_exit(struct madera *madera);
 
 const char *madera_name_from_type(enum madera_type type);
 
+extern const struct regmap_config cs47l15_16bit_spi_regmap;
+extern const struct regmap_config cs47l15_32bit_spi_regmap;
+extern const struct regmap_config cs47l15_16bit_i2c_regmap;
+extern const struct regmap_config cs47l15_32bit_i2c_regmap;
+int cs47l15_patch(struct madera *madera);
+
 extern const struct regmap_config cs47l35_16bit_spi_regmap;
 extern const struct regmap_config cs47l35_32bit_spi_regmap;
 extern const struct regmap_config cs47l35_16bit_i2c_regmap;
diff --git a/include/linux/mfd/madera/core.h b/include/linux/mfd/madera/core.h
index 4d5d51a9c8a6..98dd3cb5e84d 100644
--- a/include/linux/mfd/madera/core.h
+++ b/include/linux/mfd/madera/core.h
@@ -27,11 +27,13 @@ enum madera_type {
 	CS47L90 = 3,
 	CS47L91 = 4,
 	WM1840 = 7,
+	CS47L15 = 8,
 };
 
 #define MADERA_MAX_CORE_SUPPLIES	2
 #define MADERA_MAX_GPIOS		40
 
+#define CS47L15_NUM_GPIOS		15
 #define CS47L35_NUM_GPIOS		16
 #define CS47L85_NUM_GPIOS		40
 #define CS47L90_NUM_GPIOS		38
diff --git a/include/linux/mfd/madera/registers.h b/include/linux/mfd/madera/registers.h
index 977e06101711..5b054d511c6a 100644
--- a/include/linux/mfd/madera/registers.h
+++ b/include/linux/mfd/madera/registers.h
@@ -244,6 +244,8 @@
 #define MADERA_IN6R_CONTROL				0x33C
 #define MADERA_ADC_DIGITAL_VOLUME_6R			0x33D
 #define MADERA_DMIC6R_CONTROL				0x33E
+#define CS47L15_ADC_INT_BIAS				0x3A8
+#define CS47L15_PGA_BIAS_SEL				0x3C4
 #define MADERA_OUTPUT_ENABLES_1				0x400
 #define MADERA_OUTPUT_STATUS_1				0x401
 #define MADERA_RAW_OUTPUT_STATUS_1			0x406
@@ -1202,6 +1204,8 @@
 #define MADERA_GPIO1_CTRL_2				0x1701
 #define MADERA_GPIO2_CTRL_1				0x1702
 #define MADERA_GPIO2_CTRL_2				0x1703
+#define MADERA_GPIO15_CTRL_1				0x171C
+#define MADERA_GPIO15_CTRL_2				0x171D
 #define MADERA_GPIO16_CTRL_1				0x171E
 #define MADERA_GPIO16_CTRL_2				0x171F
 #define MADERA_GPIO38_CTRL_1				0x174A
@@ -1232,6 +1236,7 @@
 #define MADERA_IRQ2_CTRL				0x1A82
 #define MADERA_INTERRUPT_RAW_STATUS_1			0x1AA0
 #define MADERA_WSEQ_SEQUENCE_1				0x3000
+#define MADERA_WSEQ_SEQUENCE_225			0x31C0
 #define MADERA_WSEQ_SEQUENCE_252			0x31F6
 #define CS47L35_OTP_HPDET_CAL_1				0x31F8
 #define CS47L35_OTP_HPDET_CAL_2				0x31FA
-- 
cgit v1.2.3


From 297939901f382f16ab78a8073cdfb2a6279bb654 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Thu, 30 May 2019 15:39:53 +0100
Subject: mfd: madera: Add Madera core support for CS47L92

This patch adds all the core support and defines for the Cirrus
Logic CS42L92, CS47L92 and CS47L93 smart audio CODECs.

Registers or fields are named MADERA_* if it is part of the
common hardware platform and does not conflict with any other
Madera codecs. It is named CS47L15_* if it is unique to CS47L15
and conflicts with definitions on other codecs.

Signed-off-by: Stuart Henderson <stuarth@opensource.cirrus.com>
Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig                  |    7 +
 drivers/mfd/Makefile                 |    3 +
 drivers/mfd/cs47l92-tables.c         | 1948 ++++++++++++++++++++++++++++++++++
 drivers/mfd/madera-core.c            |   58 +
 drivers/mfd/madera-i2c.c             |   11 +
 drivers/mfd/madera-spi.c             |   11 +
 drivers/mfd/madera.h                 |    7 +
 include/linux/mfd/madera/core.h      |    4 +
 include/linux/mfd/madera/registers.h |  195 ++++
 9 files changed, 2244 insertions(+)
 create mode 100644 drivers/mfd/cs47l92-tables.c

(limited to 'include')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 86ae0a11f631..760100c7d5f9 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -288,6 +288,13 @@ config MFD_CS47L90
 	help
 	  Support for Cirrus Logic CS47L90 and CS47L91 Smart Codecs
 
+config MFD_CS47L92
+	bool "Cirrus Logic CS47L92/93"
+	select PINCTRL_CS47L92
+	depends on MFD_MADERA
+	help
+	  Support for Cirrus Logic CS42L92, CS47L92 and CS47L93 Smart Codecs
+
 config MFD_ASIC3
 	bool "Compaq ASIC3"
 	depends on GPIOLIB && ARM
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index cc044f38af84..f026ada68f6a 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -87,6 +87,9 @@ endif
 ifeq ($(CONFIG_MFD_CS47L90),y)
 madera-objs			+= cs47l90-tables.o
 endif
+ifeq ($(CONFIG_MFD_CS47L92),y)
+madera-objs			+= cs47l92-tables.o
+endif
 obj-$(CONFIG_MFD_MADERA)	+= madera.o
 obj-$(CONFIG_MFD_MADERA_I2C)	+= madera-i2c.o
 obj-$(CONFIG_MFD_MADERA_SPI)	+= madera-spi.o
diff --git a/drivers/mfd/cs47l92-tables.c b/drivers/mfd/cs47l92-tables.c
new file mode 100644
index 000000000000..3dc1fefe68f5
--- /dev/null
+++ b/drivers/mfd/cs47l92-tables.c
@@ -0,0 +1,1948 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Regmap tables for CS47L92 codec
+ *
+ * Copyright (C) 2016-2019 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ *
+ * Author: Stuart Henderson <stuarth@opensource.cirrus.com>
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+
+#include <linux/mfd/madera/core.h>
+#include <linux/mfd/madera/registers.h>
+
+#include "madera.h"
+
+static const struct reg_sequence cs47l92_reva_16_patch[] = {
+	{ 0x3A2,  0x2C29 },
+	{ 0x3A3,  0x0E00 },
+	{ 0x281,  0x0000 },
+	{ 0x282,  0x0000 },
+	{ 0x4EA,  0x0100 },
+	{ 0x22B,  0x0000 },
+	{ 0x4A0,  0x0080 },
+	{ 0x4A1,  0x0000 },
+	{ 0x4A2,  0x0000 },
+	{ 0x180B, 0x033F },
+	{ 0x190B, 0x033F },
+	{ 0x442,  0x0304 },
+	{ 0x34C,  0x0003 },
+	{ 0x124,  0x0C49 },
+	{ 0x120,  0x0345 },
+	{ 0x120,  0x0305 },
+	{ 0x4FA,  0x5064 },
+	{ 0x1300, 0x050E },
+	{ 0x1302, 0x0101 },
+	{ 0x1380, 0x02E0 },
+	{ 0x1381, 0xF942 },
+	{ 0x1382, 0x04CE },
+	{ 0x1383, 0xFF06 },
+	{ 0x1390, 0x0304 },
+	{ 0x1391, 0xF8FF },
+	{ 0x1392, 0x04F3 },
+	{ 0x1393, 0xFF00 },
+	{ 0x13A0, 0x02E0 },
+	{ 0x13A1, 0xF942 },
+	{ 0x13A2, 0x04CE },
+	{ 0x13A3, 0xFF06 },
+	{ 0x13B0, 0x0304 },
+	{ 0x13B1, 0xF8FF },
+	{ 0x13B2, 0x04F3 },
+	{ 0x13B3, 0xFF00 },
+	{ 0x412,  0x0005 },
+	{ 0x41A,  0x0005 },
+	{ 0x422,  0x0005 },
+};
+
+static const struct reg_sequence cs47l92_reva_32_patch[] = {
+	{ 0x3030, 0x04A00C01 },
+	{ 0x3032, 0x0225F501 },
+	{ 0x3044, 0x04A00C00 },
+	{ 0x3046, 0x0225FF01 },
+	{ 0x3080, 0x04A00C01 },
+	{ 0x3082, 0x0226F501 },
+	{ 0x3094, 0x04A00C00 },
+	{ 0x3096, 0x0226FF01 },
+	{ 0x30D1, 0x04A10C01 },
+	{ 0x30D2, 0x0227F501 },
+	{ 0x30E4, 0x04A10C00 },
+	{ 0x30E6, 0x0227FF01 },
+	{ 0x3120, 0x04A10C01 },
+	{ 0x3122, 0x0228F501 },
+	{ 0x3134, 0x04A10C00 },
+	{ 0x3136, 0x0228FF01 },
+	{ 0x3170, 0x04A20C01 },
+	{ 0x3172, 0x022B0101 },
+	{ 0x3174, 0x0229F501 },
+	{ 0x3184, 0x04A20C00 },
+	{ 0x3186, 0x022B0100 },
+	{ 0x3188, 0x0229FF01 },
+	{ 0x31C0, 0x04A20C01 },
+	{ 0x31C2, 0x022B0001 },
+	{ 0x31C4, 0x022AF501 },
+	{ 0x31D4, 0x04A20C00 },
+	{ 0x31D6, 0x022B0000 },
+	{ 0x31D8, 0x022AFF01 },
+};
+
+int cs47l92_patch(struct madera *madera)
+{
+	int ret;
+
+	ret = regmap_register_patch(madera->regmap,
+				    cs47l92_reva_16_patch,
+				    ARRAY_SIZE(cs47l92_reva_16_patch));
+	if (ret < 0) {
+		dev_err(madera->dev,
+			"Error in applying 16-bit patch: %d\n", ret);
+		return ret;
+	}
+
+	ret = regmap_register_patch(madera->regmap_32bit,
+				    cs47l92_reva_32_patch,
+				    ARRAY_SIZE(cs47l92_reva_32_patch));
+	if (ret < 0) {
+		dev_err(madera->dev,
+			"Error in applying 32-bit patch: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cs47l92_patch);
+
+static const struct reg_default cs47l92_reg_default[] = {
+	{ 0x00000020, 0x0000 }, /* R32 (0x20) - Tone Generator 1 */
+	{ 0x00000021, 0x1000 }, /* R33 (0x21) - Tone Generator 2 */
+	{ 0x00000022, 0x0000 }, /* R34 (0x22) - Tone Generator 3 */
+	{ 0x00000023, 0x1000 }, /* R35 (0x23) - Tone Generator 4 */
+	{ 0x00000024, 0x0000 }, /* R36 (0x24) - Tone Generator 5 */
+	{ 0x00000030, 0x0000 }, /* R48 (0x30) - PWM Drive 1 */
+	{ 0x00000031, 0x0100 }, /* R49 (0x31) - PWM Drive 2 */
+	{ 0x00000032, 0x0100 }, /* R50 (0x32) - PWM Drive 3 */
+	{ 0x00000061, 0x01ff }, /* R97 (0x61) - Sample Rate Sequence Select 1 */
+	{ 0x00000062, 0x01ff }, /* R98 (0x62) - Sample Rate Sequence Select 2 */
+	{ 0x00000063, 0x01ff }, /* R99 (0x63) - Sample Rate Sequence Select 3 */
+	{ 0x00000064, 0x01ff }, /* R100 (0x64) - Sample Rate Sequence Select 4 */
+	{ 0x00000090, 0x0000 }, /* R144 (0x90) - Haptics Control 1 */
+	{ 0x00000091, 0x7fff }, /* R145 (0x91) - Haptics Control 2 */
+	{ 0x00000092, 0x0000 }, /* R146 (0x92) - Haptics Phase 1 Intensity */
+	{ 0x00000093, 0x0000 }, /* R147 (0x93) - Haptics Phase 1 Duration */
+	{ 0x00000094, 0x0000 }, /* R148 (0x94) - Haptics Phase 2 Intensity */
+	{ 0x00000095, 0x0000 }, /* R149 (0x95) - Haptics Phase 2 Duration */
+	{ 0x00000096, 0x0000 }, /* R150 (0x96) - Haptics Phase 3 Intensity */
+	{ 0x00000097, 0x0000 }, /* R151 (0x97) - Haptics Phase 3 Duration */
+	{ 0x000000a0, 0x0000 }, /* R160 (0xa0) - Comfort Noise Generator */
+	{ 0x00000100, 0x0002 }, /* R256 (0x100) - Clock 32k 1 */
+	{ 0x00000101, 0x0404 }, /* R257 (0x101) - System Clock 1 */
+	{ 0x00000102, 0x0011 }, /* R258 (0x102) - Sample Rate 1 */
+	{ 0x00000103, 0x0011 }, /* R259 (0x103) - Sample Rate 2 */
+	{ 0x00000104, 0x0011 }, /* R260 (0x104) - Sample Rate 3 */
+	{ 0x00000112, 0x0305 }, /* R274 (0x112) - Async Clock 1 */
+	{ 0x00000113, 0x0011 }, /* R275 (0x113) - Async Sample Rate 1 */
+	{ 0x00000114, 0x0011 }, /* R276 (0x114) - Async Sample Rate 2 */
+	{ 0x00000120, 0x0305 }, /* R288 (0x120) - DSP Clock 1 */
+	{ 0x00000122, 0x0000 }, /* R290 (0x122) - DSP Clock 2 */
+	{ 0x00000149, 0x0000 }, /* R329 (0x149) - Output System Clock */
+	{ 0x0000014a, 0x0000 }, /* R330 (0x14a) - Output Async Clock */
+	{ 0x00000152, 0x0000 }, /* R338 (0x152) - Rate Estimator 1 */
+	{ 0x00000153, 0x0000 }, /* R339 (0x153) - Rate Estimator 2 */
+	{ 0x00000154, 0x0000 }, /* R340 (0x154) - Rate Estimator 3 */
+	{ 0x00000155, 0x0000 }, /* R341 (0x155) - Rate Estimator 4 */
+	{ 0x00000156, 0x0000 }, /* R342 (0x156) - Rate Estimator 5 */
+	{ 0x00000171, 0x7004 }, /* R369 (0x171) - FLL1 Control 1 */
+	{ 0x00000172, 0x0004 }, /* R370 (0x172) - FLL1 Control 2 */
+	{ 0x00000173, 0x0000 }, /* R371 (0x173) - FLL1 Control 3 */
+	{ 0x00000174, 0x0000 }, /* R372 (0x174) - FLL1 Control 4 */
+	{ 0x00000175, 0x0001 }, /* R373 (0x175) - FLL1 Control 5 */
+	{ 0x00000176, 0x8000 }, /* R374 (0x176) - FLL1 Control 6 */
+	{ 0x00000177, 0x0680 }, /* R375 (0x177) - FLL1 Control 7 */
+	{ 0x00000178, 0x21f0 }, /* R376 (0x178) - FLL1 Control 8 */
+	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 9 */
+	{ 0x0000017a, 0x0000 }, /* R378 (0x17a) - FLL1 Control 10 */
+	{ 0x0000017b, 0x0011 }, /* R379 (0x17b) - FLL1 Control 11 */
+	{ 0x0000017d, 0x33e8 }, /* R381 (0x17d) - FLL1 Digital Test 1 */
+	{ 0x00000181, 0x7000 }, /* R385 (0x181) - FLL1 Synchroniser 1 */
+	{ 0x00000182, 0x0004 }, /* R386 (0x182) - FLL1 Synchroniser 2 */
+	{ 0x00000183, 0x0000 }, /* R387 (0x183) - FLL1 Synchroniser 3 */
+	{ 0x00000184, 0x0000 }, /* R388 (0x184) - FLL1 Synchroniser 4 */
+	{ 0x00000185, 0x0001 }, /* R389 (0x185) - FLL1 Synchroniser 5 */
+	{ 0x00000186, 0x0000 }, /* R390 (0x186) - FLL1 Synchroniser 6 */
+	{ 0x0000018e, 0x0c04 }, /* R398 (0x18e) - FLL1 GPIO Clock */
+	{ 0x00000191, 0x7000 }, /* R401 (0x191) - FLL2 Control 1 */
+	{ 0x00000192, 0x0004 }, /* R402 (0x192) - FLL2 Control 2 */
+	{ 0x00000193, 0x0000 }, /* R403 (0x193) - FLL2 Control 3 */
+	{ 0x00000194, 0x0000 }, /* R404 (0x194) - FLL2 Control 4 */
+	{ 0x00000195, 0x0001 }, /* R405 (0x195) - FLL2 Control 5 */
+	{ 0x00000196, 0x8000 }, /* R406 (0x196) - FLL2 Control 6 */
+	{ 0x00000197, 0x0680 }, /* R407 (0x197) - FLL2 Control 7 */
+	{ 0x00000198, 0x21f0 }, /* R408 (0x198) - FLL2 Control 8 */
+	{ 0x00000199, 0x0000 }, /* R409 (0x199) - FLL2 Control 9 */
+	{ 0x0000019a, 0x0000 }, /* R410 (0x19a) - FLL2 Control 10 */
+	{ 0x0000019b, 0x0011 }, /* R411 (0x19b) - FLL2 Control 11 */
+	{ 0x0000019d, 0x33e8 }, /* R413 (0x19d) - FLL2 Digital Test 1 */
+	{ 0x000001a1, 0x7000 }, /* R417 (0x1a1) - FLL2 Synchroniser 1 */
+	{ 0x000001a2, 0x0004 }, /* R418 (0x1a2) - FLL2 Synchroniser 2 */
+	{ 0x000001a3, 0x0000 }, /* R419 (0x1a3) - FLL2 Synchroniser 3 */
+	{ 0x000001a4, 0x0000 }, /* R420 (0x1a4) - FLL2 Synchroniser 4 */
+	{ 0x000001a5, 0x0001 }, /* R421 (0x1a5) - FLL2 Synchroniser 5 */
+	{ 0x000001a6, 0x0000 }, /* R422 (0x1a6) - FLL2 Synchroniser 6 */
+	{ 0x000001ae, 0x0c04 }, /* R430 (0x1ae) - FLL2 GPIO Clock */
+	{ 0x00000200, 0x0006 }, /* R512 (0x200) - Mic Charge Pump 1 */
+	{ 0x00000213, 0x03e4 }, /* R531 (0x213) - LDO2 Control 1 */
+	{ 0x00000218, 0x00e6 }, /* R536 (0x218) - Mic Bias Ctrl 1 */
+	{ 0x00000219, 0x00e6 }, /* R537 (0x219) - Mic Bias Ctrl 2 */
+	{ 0x0000021c, 0x2222 }, /* R540 (0x21c) - Mic Bias Ctrl 5 */
+	{ 0x0000021e, 0x0022 }, /* R542 (0x21e) - Mic Bias Ctrl 6 */
+	{ 0x00000293, 0x0080 }, /* R659 (0x293) - Accessory Detect Mode 1 */
+	{ 0x00000299, 0x0000 }, /* R665 (0x299) - Headphone Detect 0 */
+	{ 0x0000029b, 0x0000 }, /* R667 (0x29b) - Headphone Detect 1 */
+	{ 0x000002a2, 0x0010 }, /* R674 (0x2a2) - Mic Detect 1 Control 0 */
+	{ 0x000002a3, 0x1102 }, /* R675 (0x2a3) - Mic Detect 1 Control 1 */
+	{ 0x000002a4, 0x009f }, /* R676 (0x2a4) - Mic Detect 1 Control 2 */
+	{ 0x000002a6, 0x3d3d }, /* R678 (0x2a6) - Mic Detect 1 Level 1 */
+	{ 0x000002a7, 0x3d3d }, /* R679 (0x2a7) - Mic Detect 1 Level 2 */
+	{ 0x000002a8, 0x333d }, /* R680 (0x2a8) - Mic Detect 1 Level 3 */
+	{ 0x000002a9, 0x202d }, /* R681 (0x2a9) - Mic Detect 1 Level 4 */
+	{ 0x000002b2, 0x0010 }, /* R690 (0x2b2) - Mic Detect 2 Control 0 */
+	{ 0x000002b3, 0x1102 }, /* R691 (0x2b3) - Mic Detect 2 Control 1 */
+	{ 0x000002b4, 0x009f }, /* R692 (0x2b4) - Mic Detect 2 Control 2 */
+	{ 0x000002b6, 0x3d3d }, /* R694 (0x2b6) - Mic Detect 2 Level 1 */
+	{ 0x000002b7, 0x3d3d }, /* R695 (0x2b7) - Mic Detect 2 Level 2 */
+	{ 0x000002b8, 0x333d }, /* R696 (0x2b8) - Mic Detect 2 Level 3 */
+	{ 0x000002b9, 0x202d }, /* R697 (0x2b9) - Mic Detect 2 Level 4 */
+	{ 0x000002c6, 0x0210 }, /* R710 (0x2c6) - Micd Clamp control */
+	{ 0x000002c8, 0x0000 }, /* R712 (0x2c8) - GP Switch 1 */
+	{ 0x000002d3, 0x0000 }, /* R723 (0x2d3) - Jack Detect Analogue */
+	{ 0x00000300, 0x0000 }, /* R768 (0x300) - Input Enables */
+	{ 0x00000308, 0x0400 }, /* R776 (0x308) - Input Rate */
+	{ 0x00000309, 0x0022 }, /* R777 (0x309) - Input Volume Ramp */
+	{ 0x0000030c, 0x0002 }, /* R780 (0x30c) - HPF Control */
+	{ 0x00000310, 0x0080 }, /* R784 (0x310) - IN1L Control */
+	{ 0x00000311, 0x0180 }, /* R785 (0x311) - ADC Digital Volume 1L */
+	{ 0x00000312, 0x0500 }, /* R786 (0x312) - DMIC1L Control */
+	{ 0x00000313, 0x0000 }, /* R787 (0x313) - IN1L Rate Control */
+	{ 0x00000314, 0x0080 }, /* R788 (0x314) - IN1R Control */
+	{ 0x00000315, 0x0180 }, /* R789 (0x315) - ADC Digital Volume 1R */
+	{ 0x00000316, 0x0000 }, /* R790 (0x316) - DMIC1R Control */
+	{ 0x00000317, 0x0000 }, /* R791 (0x317) - IN1R Rate Control */
+	{ 0x00000318, 0x0080 }, /* R792 (0x318) - IN2L Control */
+	{ 0x00000319, 0x0180 }, /* R793 (0x319) - ADC Digital Volume 2L */
+	{ 0x0000031a, 0x0500 }, /* R794 (0x31a) - DMIC2L Control */
+	{ 0x0000031b, 0x0000 }, /* R795 (0x31b) - IN2L Rate Control */
+	{ 0x0000031c, 0x0080 }, /* R796 (0x31c) - IN2R Control */
+	{ 0x0000031d, 0x0180 }, /* R797 (0x31d) - ADC Digital Volume 2R */
+	{ 0x0000031e, 0x0000 }, /* R798 (0x31e) - DMIC2R Control */
+	{ 0x0000031f, 0x0000 }, /* R799 (0x31f) - IN2R Rate Control */
+	{ 0x00000320, 0x0000 }, /* R800 (0x320) - IN3L Control */
+	{ 0x00000321, 0x0180 }, /* R801 (0x321) - ADC Digital Volume 3L */
+	{ 0x00000322, 0x0500 }, /* R802 (0x322) - DMIC3L Control */
+	{ 0x00000323, 0x0000 }, /* R803 (0x323) - IN3L Rate Control */
+	{ 0x00000324, 0x0000 }, /* R804 (0x324) - IN3R Control */
+	{ 0x00000325, 0x0180 }, /* R805 (0x325) - ADC Digital Volume 3R */
+	{ 0x00000326, 0x0000 }, /* R806 (0x326) - DMIC3R Control */
+	{ 0x00000327, 0x0000 }, /* R807 (0x327) - IN3R Rate Control */
+	{ 0x00000328, 0x0000 }, /* R808 (0x328) - IN4L Control */
+	{ 0x00000329, 0x0180 }, /* R809 (0x329) - ADC Digital Volume 4L */
+	{ 0x0000032a, 0x0500 }, /* R810 (0x32a) - DMIC4L Control */
+	{ 0x0000032b, 0x0000 }, /* R811 (0x32b) - IN4L Rate Control */
+	{ 0x0000032c, 0x0000 }, /* R812 (0x32c) - IN4R Control */
+	{ 0x0000032d, 0x0180 }, /* R813 (0x32d) - ADC Digital Volume 4R */
+	{ 0x0000032e, 0x0000 }, /* R814 (0x32e) - DMIC4R Control */
+	{ 0x0000032f, 0x0000 }, /* R815 (0x32f) - IN4R Rate Control */
+	{ 0x00000400, 0x0000 }, /* R1024 (0x400) - Output Enables 1 */
+	{ 0x00000408, 0x0040 }, /* R1032 (0x408) - Output Rate 1 */
+	{ 0x00000409, 0x0022 }, /* R1033 (0x409) - Output Volume Ramp */
+	{ 0x00000410, 0x0080 }, /* R1040 (0x410) - Output Path Config 1L */
+	{ 0x00000411, 0x0180 }, /* R1041 (0x411) - DAC Digital Volume 1L */
+	{ 0x00000412, 0x0005 }, /* R1042 (0x412) - Output Path Config 1 */
+	{ 0x00000413, 0x0001 }, /* R1043 (0x413) - Noise Gate Select 1L */
+	{ 0x00000414, 0x0080 }, /* R1044 (0x414) - Output Path Config 1R */
+	{ 0x00000415, 0x0180 }, /* R1045 (0x415) - DAC Digital Volume 1R */
+	{ 0x00000417, 0x0002 }, /* R1047 (0x417) - Noise Gate Select 1R */
+	{ 0x00000418, 0x0080 }, /* R1048 (0x418) - Output Path Config 2L */
+	{ 0x00000419, 0x0180 }, /* R1049 (0x419) - DAC Digital Volume 2L */
+	{ 0x0000041a, 0x0005 }, /* R1050 (0x41a) - Output Path Config 2 */
+	{ 0x0000041b, 0x0004 }, /* R1051 (0x41b) - Noise Gate Select 2L */
+	{ 0x0000041c, 0x0080 }, /* R1052 (0x41c) - Output Path Config 2R */
+	{ 0x0000041d, 0x0180 }, /* R1053 (0x41d) - DAC Digital Volume 2R */
+	{ 0x0000041f, 0x0008 }, /* R1055 (0x41f) - Noise Gate Select 2R */
+	{ 0x00000420, 0x0080 }, /* R1056 (0x420) - Output Path Config 3L */
+	{ 0x00000421, 0x0180 }, /* R1057 (0x421) - DAC Digital Volume 3L */
+	{ 0x00000422, 0x0005 }, /* R1058 (0x422) - Output Path Config 3 */
+	{ 0x00000423, 0x0010 }, /* R1059 (0x423) - Noise Gate Select 3L */
+	{ 0x00000424, 0x0080 }, /* R1060 (0x424) - Output Path Config 3R */
+	{ 0x00000425, 0x0180 }, /* R1061 (0x425) - DAC Digital Volume 3R */
+	{ 0x00000427, 0x0020 }, /* R1063 (0x427) - Noise Gate Select 3R */
+	{ 0x00000430, 0x0000 }, /* R1072 (0x430) - Output Path Config 5L */
+	{ 0x00000431, 0x0180 }, /* R1073 (0x431) - DAC Digital Volume 5L */
+	{ 0x00000433, 0x0100 }, /* R1075 (0x433) - Noise Gate Select 5L */
+	{ 0x00000434, 0x0000 }, /* R1076 (0x434) - Output Path Config 5R */
+	{ 0x00000435, 0x0180 }, /* R1077 (0x435) - DAC Digital Volume 5R */
+	{ 0x00000437, 0x0200 }, /* R1079 (0x437) - Noise Gate Select 5R */
+	{ 0x00000450, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 1 */
+	{ 0x00000451, 0x0000 }, /* R1105 (0x451) - DAC AEC Control 2 */
+	{ 0x00000458, 0x0000 }, /* R1112 (0x458) - Noise Gate Control */
+	{ 0x00000490, 0x0069 }, /* R1168 (0x490) - PDM SPK1 Ctrl 1 */
+	{ 0x00000491, 0x0000 }, /* R1169 (0x491) - PDM SPK1 Ctrl 2 */
+	{ 0x000004a0, 0x0080 }, /* R1184 (0x4a0) - HP1 Short Circuit Ctrl */
+	{ 0x000004a1, 0x0000 }, /* R1185 (0x4a1) - HP2 Short Circuit Ctrl */
+	{ 0x000004a2, 0x0000 }, /* R1186 (0x4a2) - HP3 Short Circuit Ctrl */
+	{ 0x00000500, 0x000c }, /* R1280 (0x500) - AIF1 BCLK Ctrl */
+	{ 0x00000501, 0x0000 }, /* R1281 (0x501) - AIF1 Tx Pin Ctrl */
+	{ 0x00000502, 0x0000 }, /* R1282 (0x502) - AIF1 Rx Pin Ctrl */
+	{ 0x00000503, 0x0000 }, /* R1283 (0x503) - AIF1 Rate Ctrl */
+	{ 0x00000504, 0x0000 }, /* R1284 (0x504) - AIF1 Format */
+	{ 0x00000506, 0x0040 }, /* R1286 (0x506) - AIF1 Rx BCLK Rate */
+	{ 0x00000507, 0x1818 }, /* R1287 (0x507) - AIF1 Frame Ctrl 1 */
+	{ 0x00000508, 0x1818 }, /* R1288 (0x508) - AIF1 Frame Ctrl 2 */
+	{ 0x00000509, 0x0000 }, /* R1289 (0x509) - AIF1 Frame Ctrl 3 */
+	{ 0x0000050a, 0x0001 }, /* R1290 (0x50a) - AIF1 Frame Ctrl 4 */
+	{ 0x0000050b, 0x0002 }, /* R1291 (0x50b) - AIF1 Frame Ctrl 5 */
+	{ 0x0000050c, 0x0003 }, /* R1292 (0x50c) - AIF1 Frame Ctrl 6 */
+	{ 0x0000050d, 0x0004 }, /* R1293 (0x50d) - AIF1 Frame Ctrl 7 */
+	{ 0x0000050e, 0x0005 }, /* R1294 (0x50e) - AIF1 Frame Ctrl 8 */
+	{ 0x0000050f, 0x0006 }, /* R1295 (0x50f) - AIF1 Frame Ctrl 9 */
+	{ 0x00000510, 0x0007 }, /* R1296 (0x510) - AIF1 Frame Ctrl 10 */
+	{ 0x00000511, 0x0000 }, /* R1297 (0x511) - AIF1 Frame Ctrl 11 */
+	{ 0x00000512, 0x0001 }, /* R1298 (0x512) - AIF1 Frame Ctrl 12 */
+	{ 0x00000513, 0x0002 }, /* R1299 (0x513) - AIF1 Frame Ctrl 13 */
+	{ 0x00000514, 0x0003 }, /* R1300 (0x514) - AIF1 Frame Ctrl 14 */
+	{ 0x00000515, 0x0004 }, /* R1301 (0x515) - AIF1 Frame Ctrl 15 */
+	{ 0x00000516, 0x0005 }, /* R1302 (0x516) - AIF1 Frame Ctrl 16 */
+	{ 0x00000517, 0x0006 }, /* R1303 (0x517) - AIF1 Frame Ctrl 17 */
+	{ 0x00000518, 0x0007 }, /* R1304 (0x518) - AIF1 Frame Ctrl 18 */
+	{ 0x00000519, 0x0000 }, /* R1305 (0x519) - AIF1 Tx Enables */
+	{ 0x0000051a, 0x0000 }, /* R1306 (0x51a) - AIF1 Rx Enables */
+	{ 0x00000540, 0x000c }, /* R1344 (0x540) - AIF2 BCLK Ctrl */
+	{ 0x00000541, 0x0000 }, /* R1345 (0x541) - AIF2 Tx Pin Ctrl */
+	{ 0x00000542, 0x0000 }, /* R1346 (0x542) - AIF2 Rx Pin Ctrl */
+	{ 0x00000543, 0x0000 }, /* R1347 (0x543) - AIF2 Rate Ctrl */
+	{ 0x00000544, 0x0000 }, /* R1348 (0x544) - AIF2 Format */
+	{ 0x00000546, 0x0040 }, /* R1350 (0x546) - AIF2 Rx BCLK Rate */
+	{ 0x00000547, 0x1818 }, /* R1351 (0x547) - AIF2 Frame Ctrl 1 */
+	{ 0x00000548, 0x1818 }, /* R1352 (0x548) - AIF2 Frame Ctrl 2 */
+	{ 0x00000549, 0x0000 }, /* R1353 (0x549) - AIF2 Frame Ctrl 3 */
+	{ 0x0000054a, 0x0001 }, /* R1354 (0x54a) - AIF2 Frame Ctrl 4 */
+	{ 0x0000054b, 0x0002 }, /* R1355 (0x54b) - AIF2 Frame Ctrl 5 */
+	{ 0x0000054c, 0x0003 }, /* R1356 (0x54c) - AIF2 Frame Ctrl 6 */
+	{ 0x0000054d, 0x0004 }, /* R1357 (0x54d) - AIF2 Frame Ctrl 7 */
+	{ 0x0000054e, 0x0005 }, /* R1358 (0x54e) - AIF2 Frame Ctrl 8 */
+	{ 0x0000054f, 0x0006 }, /* R1359 (0x54f) - AIF2 Frame Ctrl 9 */
+	{ 0x00000550, 0x0007 }, /* R1360 (0x550) - AIF2 Frame Ctrl 10 */
+	{ 0x00000551, 0x0000 }, /* R1361 (0x551) - AIF2 Frame Ctrl 11 */
+	{ 0x00000552, 0x0001 }, /* R1362 (0x552) - AIF2 Frame Ctrl 12 */
+	{ 0x00000553, 0x0002 }, /* R1363 (0x553) - AIF2 Frame Ctrl 13 */
+	{ 0x00000554, 0x0003 }, /* R1364 (0x554) - AIF2 Frame Ctrl 14 */
+	{ 0x00000555, 0x0004 }, /* R1365 (0x555) - AIF2 Frame Ctrl 15 */
+	{ 0x00000556, 0x0005 }, /* R1366 (0x556) - AIF2 Frame Ctrl 16 */
+	{ 0x00000557, 0x0006 }, /* R1367 (0x557) - AIF2 Frame Ctrl 17 */
+	{ 0x00000558, 0x0007 }, /* R1368 (0x558) - AIF2 Frame Ctrl 18 */
+	{ 0x00000559, 0x0000 }, /* R1369 (0x559) - AIF2 Tx Enables */
+	{ 0x0000055a, 0x0000 }, /* R1370 (0x55a) - AIF2 Rx Enables */
+	{ 0x00000580, 0x000c }, /* R1408 (0x580) - AIF3 BCLK Ctrl */
+	{ 0x00000581, 0x0000 }, /* R1409 (0x581) - AIF3 Tx Pin Ctrl */
+	{ 0x00000582, 0x0000 }, /* R1410 (0x582) - AIF3 Rx Pin Ctrl */
+	{ 0x00000583, 0x0000 }, /* R1411 (0x583) - AIF3 Rate Ctrl */
+	{ 0x00000584, 0x0000 }, /* R1412 (0x584) - AIF3 Format */
+	{ 0x00000586, 0x0040 }, /* R1414 (0x586) - AIF3 Rx BCLK Rate */
+	{ 0x00000587, 0x1818 }, /* R1415 (0x587) - AIF3 Frame Ctrl 1 */
+	{ 0x00000588, 0x1818 }, /* R1416 (0x588) - AIF3 Frame Ctrl 2 */
+	{ 0x00000589, 0x0000 }, /* R1417 (0x589) - AIF3 Frame Ctrl 3 */
+	{ 0x0000058a, 0x0001 }, /* R1418 (0x58a) - AIF3 Frame Ctrl 4 */
+	{ 0x0000058b, 0x0002 }, /* R1419 (0x58b) - AIF3 Frame Ctrl 5 */
+	{ 0x0000058c, 0x0003 }, /* R1420 (0x58c) - AIF3 Frame Ctrl 6 */
+	{ 0x0000058d, 0x0004 }, /* R1421 (0x58d) - AIF3 Frame Ctrl 7 */
+	{ 0x0000058e, 0x0005 }, /* R1422 (0x58e) - AIF3 Frame Ctrl 8 */
+	{ 0x0000058f, 0x0006 }, /* R1423 (0x58f) - AIF3 Frame Ctrl 9 */
+	{ 0x00000590, 0x0007 }, /* R1424 (0x590) - AIF3 Frame Ctrl 10 */
+	{ 0x00000591, 0x0000 }, /* R1425 (0x591) - AIF3 Frame Ctrl 11 */
+	{ 0x00000592, 0x0001 }, /* R1426 (0x592) - AIF3 Frame Ctrl 12 */
+	{ 0x00000593, 0x0002 }, /* R1427 (0x593) - AIF3 Frame Ctrl 13 */
+	{ 0x00000594, 0x0003 }, /* R1428 (0x594) - AIF3 Frame Ctrl 14 */
+	{ 0x00000595, 0x0004 }, /* R1429 (0x595) - AIF3 Frame Ctrl 15 */
+	{ 0x00000596, 0x0005 }, /* R1430 (0x596) - AIF3 Frame Ctrl 16 */
+	{ 0x00000597, 0x0006 }, /* R1431 (0x597) - AIF3 Frame Ctrl 17 */
+	{ 0x00000598, 0x0007 }, /* R1432 (0x598) - AIF3 Frame Ctrl 18 */
+	{ 0x00000599, 0x0000 }, /* R1433 (0x599) - AIF3 Tx Enables */
+	{ 0x0000059a, 0x0000 }, /* R1434 (0x59a) - AIF3 Rx Enables */
+	{ 0x000005c2, 0x0000 }, /* R1474 (0x5c2) - SPD1 Tx Control */
+	{ 0x000005e3, 0x0000 }, /* R1507 (0x5e3) - SLIMBus Framer Ref Gear */
+	{ 0x000005e5, 0x0000 }, /* R1509 (0x5e5) - SLIMBus Rates 1 */
+	{ 0x000005e6, 0x0000 }, /* R1510 (0x5e6) - SLIMBus Rates 2 */
+	{ 0x000005e7, 0x0000 }, /* R1511 (0x5e7) - SLIMBus Rates 3 */
+	{ 0x000005e8, 0x0000 }, /* R1512 (0x5e8) - SLIMBus Rates 4 */
+	{ 0x000005e9, 0x0000 }, /* R1513 (0x5e9) - SLIMBus Rates 5 */
+	{ 0x000005ea, 0x0000 }, /* R1514 (0x5ea) - SLIMBus Rates 6 */
+	{ 0x000005eb, 0x0000 }, /* R1515 (0x5eb) - SLIMBus Rates 7 */
+	{ 0x000005ec, 0x0000 }, /* R1516 (0x5ec) - SLIMBus Rates 8 */
+	{ 0x000005f5, 0x0000 }, /* R1525 (0x5f5) - SLIMBus RX Channel Enable */
+	{ 0x000005f6, 0x0000 }, /* R1526 (0x5f6) - SLIMBus TX Channel Enable */
+	{ 0x00000640, 0x0000 }, /* R1600 (0x640) - PWM1MIX Input 1 Source */
+	{ 0x00000641, 0x0080 }, /* R1601 (0x641) - PWM1MIX Input 1 Volume */
+	{ 0x00000642, 0x0000 }, /* R1602 (0x642) - PWM1MIX Input 2 Source */
+	{ 0x00000643, 0x0080 }, /* R1603 (0x643) - PWM1MIX Input 2 Volume */
+	{ 0x00000644, 0x0000 }, /* R1604 (0x644) - PWM1MIX Input 3 Source */
+	{ 0x00000645, 0x0080 }, /* R1605 (0x645) - PWM1MIX Input 3 Volume */
+	{ 0x00000646, 0x0000 }, /* R1606 (0x646) - PWM1MIX Input 4 Source */
+	{ 0x00000647, 0x0080 }, /* R1607 (0x647) - PWM1MIX Input 4 Volume */
+	{ 0x00000648, 0x0000 }, /* R1608 (0x648) - PWM2MIX Input 1 Source */
+	{ 0x00000649, 0x0080 }, /* R1609 (0x649) - PWM2MIX Input 1 Volume */
+	{ 0x0000064a, 0x0000 }, /* R1610 (0x64a) - PWM2MIX Input 2 Source */
+	{ 0x0000064b, 0x0080 }, /* R1611 (0x64b) - PWM2MIX Input 2 Volume */
+	{ 0x0000064c, 0x0000 }, /* R1612 (0x64c) - PWM2MIX Input 3 Source */
+	{ 0x0000064d, 0x0080 }, /* R1613 (0x64d) - PWM2MIX Input 3 Volume */
+	{ 0x0000064e, 0x0000 }, /* R1614 (0x64e) - PWM2MIX Input 4 Source */
+	{ 0x0000064f, 0x0080 }, /* R1615 (0x64f) - PWM2MIX Input 4 Volume */
+	{ 0x00000680, 0x0000 }, /* R1664 (0x680) - OUT1LMIX Input 1 Source */
+	{ 0x00000681, 0x0080 }, /* R1665 (0x681) - OUT1LMIX Input 1 Volume */
+	{ 0x00000682, 0x0000 }, /* R1666 (0x682) - OUT1LMIX Input 2 Source */
+	{ 0x00000683, 0x0080 }, /* R1667 (0x683) - OUT1LMIX Input 2 Volume */
+	{ 0x00000684, 0x0000 }, /* R1668 (0x684) - OUT1LMIX Input 3 Source */
+	{ 0x00000685, 0x0080 }, /* R1669 (0x685) - OUT1LMIX Input 3 Volume */
+	{ 0x00000686, 0x0000 }, /* R1670 (0x686) - OUT1LMIX Input 4 Source */
+	{ 0x00000687, 0x0080 }, /* R1671 (0x687) - OUT1LMIX Input 4 Volume */
+	{ 0x00000688, 0x0000 }, /* R1672 (0x688) - OUT1RMIX Input 1 Source */
+	{ 0x00000689, 0x0080 }, /* R1673 (0x689) - OUT1RMIX Input 1 Volume */
+	{ 0x0000068a, 0x0000 }, /* R1674 (0x68a) - OUT1RMIX Input 2 Source */
+	{ 0x0000068b, 0x0080 }, /* R1675 (0x68b) - OUT1RMIX Input 2 Volume */
+	{ 0x0000068c, 0x0000 }, /* R1676 (0x68c) - OUT1RMIX Input 3 Source */
+	{ 0x0000068d, 0x0080 }, /* R1677 (0x68d) - OUT1RMIX Input 3 Volume */
+	{ 0x0000068e, 0x0000 }, /* R1678 (0x68e) - OUT1RMIX Input 4 Source */
+	{ 0x0000068f, 0x0080 }, /* R1679 (0x68f) - OUT1RMIX Input 4 Volume */
+	{ 0x00000690, 0x0000 }, /* R1680 (0x690) - OUT2LMIX Input 1 Source */
+	{ 0x00000691, 0x0080 }, /* R1681 (0x691) - OUT2LMIX Input 1 Volume */
+	{ 0x00000692, 0x0000 }, /* R1682 (0x692) - OUT2LMIX Input 2 Source */
+	{ 0x00000693, 0x0080 }, /* R1683 (0x693) - OUT2LMIX Input 2 Volume */
+	{ 0x00000694, 0x0000 }, /* R1684 (0x694) - OUT2LMIX Input 3 Source */
+	{ 0x00000695, 0x0080 }, /* R1685 (0x695) - OUT2LMIX Input 3 Volume */
+	{ 0x00000696, 0x0000 }, /* R1686 (0x696) - OUT2LMIX Input 4 Source */
+	{ 0x00000697, 0x0080 }, /* R1687 (0x697) - OUT2LMIX Input 4 Volume */
+	{ 0x00000698, 0x0000 }, /* R1688 (0x698) - OUT2RMIX Input 1 Source */
+	{ 0x00000699, 0x0080 }, /* R1689 (0x699) - OUT2RMIX Input 1 Volume */
+	{ 0x0000069a, 0x0000 }, /* R1690 (0x69a) - OUT2RMIX Input 2 Source */
+	{ 0x0000069b, 0x0080 }, /* R1691 (0x69b) - OUT2RMIX Input 2 Volume */
+	{ 0x0000069c, 0x0000 }, /* R1692 (0x69c) - OUT2RMIX Input 3 Source */
+	{ 0x0000069d, 0x0080 }, /* R1693 (0x69d) - OUT2RMIX Input 3 Volume */
+	{ 0x0000069e, 0x0000 }, /* R1694 (0x69e) - OUT2RMIX Input 4 Source */
+	{ 0x0000069f, 0x0080 }, /* R1695 (0x69f) - OUT2RMIX Input 4 Volume */
+	{ 0x000006a0, 0x0000 }, /* R1696 (0x6a0) - OUT3LMIX Input 1 Source */
+	{ 0x000006a1, 0x0080 }, /* R1697 (0x6a1) - OUT3LMIX Input 1 Volume */
+	{ 0x000006a2, 0x0000 }, /* R1698 (0x6a2) - OUT3LMIX Input 2 Source */
+	{ 0x000006a3, 0x0080 }, /* R1699 (0x6a3) - OUT3LMIX Input 2 Volume */
+	{ 0x000006a4, 0x0000 }, /* R1700 (0x6a4) - OUT3LMIX Input 3 Source */
+	{ 0x000006a5, 0x0080 }, /* R1701 (0x6a5) - OUT3LMIX Input 3 Volume */
+	{ 0x000006a6, 0x0000 }, /* R1702 (0x6a6) - OUT3LMIX Input 4 Source */
+	{ 0x000006a7, 0x0080 }, /* R1703 (0x6a7) - OUT3LMIX Input 4 Volume */
+	{ 0x000006a8, 0x0000 }, /* R1704 (0x6a8) - OUT3RMIX Input 1 Source */
+	{ 0x000006a9, 0x0080 }, /* R1705 (0x6a9) - OUT3RMIX Input 1 Volume */
+	{ 0x000006aa, 0x0000 }, /* R1706 (0x6aa) - OUT3RMIX Input 2 Source */
+	{ 0x000006ab, 0x0080 }, /* R1707 (0x6ab) - OUT3RMIX Input 2 Volume */
+	{ 0x000006ac, 0x0000 }, /* R1708 (0x6ac) - OUT3RMIX Input 3 Source */
+	{ 0x000006ad, 0x0080 }, /* R1709 (0x6ad) - OUT3RMIX Input 3 Volume */
+	{ 0x000006ae, 0x0000 }, /* R1710 (0x6ae) - OUT3RMIX Input 4 Source */
+	{ 0x000006af, 0x0080 }, /* R1711 (0x6af) - OUT3RMIX Input 4 Volume */
+	{ 0x000006c0, 0x0000 }, /* R1728 (0x6c0) - OUT5LMIX Input 1 Source */
+	{ 0x000006c1, 0x0080 }, /* R1729 (0x6c1) - OUT5LMIX Input 1 Volume */
+	{ 0x000006c2, 0x0000 }, /* R1730 (0x6c2) - OUT5LMIX Input 2 Source */
+	{ 0x000006c3, 0x0080 }, /* R1731 (0x6c3) - OUT5LMIX Input 2 Volume */
+	{ 0x000006c4, 0x0000 }, /* R1732 (0x6c4) - OUT5LMIX Input 3 Source */
+	{ 0x000006c5, 0x0080 }, /* R1733 (0x6c5) - OUT5LMIX Input 3 Volume */
+	{ 0x000006c6, 0x0000 }, /* R1734 (0x6c6) - OUT5LMIX Input 4 Source */
+	{ 0x000006c7, 0x0080 }, /* R1735 (0x6c7) - OUT5LMIX Input 4 Volume */
+	{ 0x000006c8, 0x0000 }, /* R1736 (0x6c8) - OUT5RMIX Input 1 Source */
+	{ 0x000006c9, 0x0080 }, /* R1737 (0x6c9) - OUT5RMIX Input 1 Volume */
+	{ 0x000006ca, 0x0000 }, /* R1738 (0x6ca) - OUT5RMIX Input 2 Source */
+	{ 0x000006cb, 0x0080 }, /* R1739 (0x6cb) - OUT5RMIX Input 2 Volume */
+	{ 0x000006cc, 0x0000 }, /* R1740 (0x6cc) - OUT5RMIX Input 3 Source */
+	{ 0x000006cd, 0x0080 }, /* R1741 (0x6cd) - OUT5RMIX Input 3 Volume */
+	{ 0x000006ce, 0x0000 }, /* R1742 (0x6ce) - OUT5RMIX Input 4 Source */
+	{ 0x000006cf, 0x0080 }, /* R1743 (0x6cf) - OUT5RMIX Input 4 Volume */
+	{ 0x00000700, 0x0000 }, /* R1792 (0x700) - AIF1TX1MIX Input 1 Source */
+	{ 0x00000701, 0x0080 }, /* R1793 (0x701) - AIF1TX1MIX Input 1 Volume */
+	{ 0x00000702, 0x0000 }, /* R1794 (0x702) - AIF1TX1MIX Input 2 Source */
+	{ 0x00000703, 0x0080 }, /* R1795 (0x703) - AIF1TX1MIX Input 2 Volume */
+	{ 0x00000704, 0x0000 }, /* R1796 (0x704) - AIF1TX1MIX Input 3 Source */
+	{ 0x00000705, 0x0080 }, /* R1797 (0x705) - AIF1TX1MIX Input 3 Volume */
+	{ 0x00000706, 0x0000 }, /* R1798 (0x706) - AIF1TX1MIX Input 4 Source */
+	{ 0x00000707, 0x0080 }, /* R1799 (0x707) - AIF1TX1MIX Input 4 Volume */
+	{ 0x00000708, 0x0000 }, /* R1800 (0x708) - AIF1TX2MIX Input 1 Source */
+	{ 0x00000709, 0x0080 }, /* R1801 (0x709) - AIF1TX2MIX Input 1 Volume */
+	{ 0x0000070a, 0x0000 }, /* R1802 (0x70a) - AIF1TX2MIX Input 2 Source */
+	{ 0x0000070b, 0x0080 }, /* R1803 (0x70b) - AIF1TX2MIX Input 2 Volume */
+	{ 0x0000070c, 0x0000 }, /* R1804 (0x70c) - AIF1TX2MIX Input 3 Source */
+	{ 0x0000070d, 0x0080 }, /* R1805 (0x70d) - AIF1TX2MIX Input 3 Volume */
+	{ 0x0000070e, 0x0000 }, /* R1806 (0x70e) - AIF1TX2MIX Input 4 Source */
+	{ 0x0000070f, 0x0080 }, /* R1807 (0x70f) - AIF1TX2MIX Input 4 Volume */
+	{ 0x00000710, 0x0000 }, /* R1808 (0x710) - AIF1TX3MIX Input 1 Source */
+	{ 0x00000711, 0x0080 }, /* R1809 (0x711) - AIF1TX3MIX Input 1 Volume */
+	{ 0x00000712, 0x0000 }, /* R1810 (0x712) - AIF1TX3MIX Input 2 Source */
+	{ 0x00000713, 0x0080 }, /* R1811 (0x713) - AIF1TX3MIX Input 2 Volume */
+	{ 0x00000714, 0x0000 }, /* R1812 (0x714) - AIF1TX3MIX Input 3 Source */
+	{ 0x00000715, 0x0080 }, /* R1813 (0x715) - AIF1TX3MIX Input 3 Volume */
+	{ 0x00000716, 0x0000 }, /* R1814 (0x716) - AIF1TX3MIX Input 4 Source */
+	{ 0x00000717, 0x0080 }, /* R1815 (0x717) - AIF1TX3MIX Input 4 Volume */
+	{ 0x00000718, 0x0000 }, /* R1816 (0x718) - AIF1TX4MIX Input 1 Source */
+	{ 0x00000719, 0x0080 }, /* R1817 (0x719) - AIF1TX4MIX Input 1 Volume */
+	{ 0x0000071a, 0x0000 }, /* R1818 (0x71a) - AIF1TX4MIX Input 2 Source */
+	{ 0x0000071b, 0x0080 }, /* R1819 (0x71b) - AIF1TX4MIX Input 2 Volume */
+	{ 0x0000071c, 0x0000 }, /* R1820 (0x71c) - AIF1TX4MIX Input 3 Source */
+	{ 0x0000071d, 0x0080 }, /* R1821 (0x71d) - AIF1TX4MIX Input 3 Volume */
+	{ 0x0000071e, 0x0000 }, /* R1822 (0x71e) - AIF1TX4MIX Input 4 Source */
+	{ 0x0000071f, 0x0080 }, /* R1823 (0x71f) - AIF1TX4MIX Input 4 Volume */
+	{ 0x00000720, 0x0000 }, /* R1824 (0x720) - AIF1TX5MIX Input 1 Source */
+	{ 0x00000721, 0x0080 }, /* R1825 (0x721) - AIF1TX5MIX Input 1 Volume */
+	{ 0x00000722, 0x0000 }, /* R1826 (0x722) - AIF1TX5MIX Input 2 Source */
+	{ 0x00000723, 0x0080 }, /* R1827 (0x723) - AIF1TX5MIX Input 2 Volume */
+	{ 0x00000724, 0x0000 }, /* R1828 (0x724) - AIF1TX5MIX Input 3 Source */
+	{ 0x00000725, 0x0080 }, /* R1829 (0x725) - AIF1TX5MIX Input 3 Volume */
+	{ 0x00000726, 0x0000 }, /* R1830 (0x726) - AIF1TX5MIX Input 4 Source */
+	{ 0x00000727, 0x0080 }, /* R1831 (0x727) - AIF1TX5MIX Input 4 Volume */
+	{ 0x00000728, 0x0000 }, /* R1832 (0x728) - AIF1TX6MIX Input 1 Source */
+	{ 0x00000729, 0x0080 }, /* R1833 (0x729) - AIF1TX6MIX Input 1 Volume */
+	{ 0x0000072a, 0x0000 }, /* R1834 (0x72a) - AIF1TX6MIX Input 2 Source */
+	{ 0x0000072b, 0x0080 }, /* R1835 (0x72b) - AIF1TX6MIX Input 2 Volume */
+	{ 0x0000072c, 0x0000 }, /* R1836 (0x72c) - AIF1TX6MIX Input 3 Source */
+	{ 0x0000072d, 0x0080 }, /* R1837 (0x72d) - AIF1TX6MIX Input 3 Volume */
+	{ 0x0000072e, 0x0000 }, /* R1838 (0x72e) - AIF1TX6MIX Input 4 Source */
+	{ 0x0000072f, 0x0080 }, /* R1839 (0x72f) - AIF1TX6MIX Input 4 Volume */
+	{ 0x00000730, 0x0000 }, /* R1840 (0x730) - AIF1TX7MIX Input 1 Source */
+	{ 0x00000731, 0x0080 }, /* R1841 (0x731) - AIF1TX7MIX Input 1 Volume */
+	{ 0x00000732, 0x0000 }, /* R1842 (0x732) - AIF1TX7MIX Input 2 Source */
+	{ 0x00000733, 0x0080 }, /* R1843 (0x733) - AIF1TX7MIX Input 2 Volume */
+	{ 0x00000734, 0x0000 }, /* R1844 (0x734) - AIF1TX7MIX Input 3 Source */
+	{ 0x00000735, 0x0080 }, /* R1845 (0x735) - AIF1TX7MIX Input 3 Volume */
+	{ 0x00000736, 0x0000 }, /* R1846 (0x736) - AIF1TX7MIX Input 4 Source */
+	{ 0x00000737, 0x0080 }, /* R1847 (0x737) - AIF1TX7MIX Input 4 Volume */
+	{ 0x00000738, 0x0000 }, /* R1848 (0x738) - AIF1TX8MIX Input 1 Source */
+	{ 0x00000739, 0x0080 }, /* R1849 (0x739) - AIF1TX8MIX Input 1 Volume */
+	{ 0x0000073a, 0x0000 }, /* R1850 (0x73a) - AIF1TX8MIX Input 2 Source */
+	{ 0x0000073b, 0x0080 }, /* R1851 (0x73b) - AIF1TX8MIX Input 2 Volume */
+	{ 0x0000073c, 0x0000 }, /* R1852 (0x73c) - AIF1TX8MIX Input 3 Source */
+	{ 0x0000073d, 0x0080 }, /* R1853 (0x73d) - AIF1TX8MIX Input 3 Volume */
+	{ 0x0000073e, 0x0000 }, /* R1854 (0x73e) - AIF1TX8MIX Input 4 Source */
+	{ 0x0000073f, 0x0080 }, /* R1855 (0x73f) - AIF1TX8MIX Input 4 Volume */
+	{ 0x00000740, 0x0000 }, /* R1856 (0x740) - AIF2TX1MIX Input 1 Source */
+	{ 0x00000741, 0x0080 }, /* R1857 (0x741) - AIF2TX1MIX Input 1 Volume */
+	{ 0x00000742, 0x0000 }, /* R1858 (0x742) - AIF2TX1MIX Input 2 Source */
+	{ 0x00000743, 0x0080 }, /* R1859 (0x743) - AIF2TX1MIX Input 2 Volume */
+	{ 0x00000744, 0x0000 }, /* R1860 (0x744) - AIF2TX1MIX Input 3 Source */
+	{ 0x00000745, 0x0080 }, /* R1861 (0x745) - AIF2TX1MIX Input 3 Volume */
+	{ 0x00000746, 0x0000 }, /* R1862 (0x746) - AIF2TX1MIX Input 4 Source */
+	{ 0x00000747, 0x0080 }, /* R1863 (0x747) - AIF2TX1MIX Input 4 Volume */
+	{ 0x00000748, 0x0000 }, /* R1864 (0x748) - AIF2TX2MIX Input 1 Source */
+	{ 0x00000749, 0x0080 }, /* R1865 (0x749) - AIF2TX2MIX Input 1 Volume */
+	{ 0x0000074a, 0x0000 }, /* R1866 (0x74a) - AIF2TX2MIX Input 2 Source */
+	{ 0x0000074b, 0x0080 }, /* R1867 (0x74b) - AIF2TX2MIX Input 2 Volume */
+	{ 0x0000074c, 0x0000 }, /* R1868 (0x74c) - AIF2TX2MIX Input 3 Source */
+	{ 0x0000074d, 0x0080 }, /* R1869 (0x74d) - AIF2TX2MIX Input 3 Volume */
+	{ 0x0000074e, 0x0000 }, /* R1870 (0x74e) - AIF2TX2MIX Input 4 Source */
+	{ 0x0000074f, 0x0080 }, /* R1871 (0x74f) - AIF2TX2MIX Input 4 Volume */
+	{ 0x00000750, 0x0000 }, /* R1872 (0x750) - AIF2TX3MIX Input 1 Source */
+	{ 0x00000751, 0x0080 }, /* R1873 (0x751) - AIF2TX3MIX Input 1 Volume */
+	{ 0x00000752, 0x0000 }, /* R1874 (0x752) - AIF2TX3MIX Input 2 Source */
+	{ 0x00000753, 0x0080 }, /* R1875 (0x753) - AIF2TX3MIX Input 2 Volume */
+	{ 0x00000754, 0x0000 }, /* R1876 (0x754) - AIF2TX3MIX Input 3 Source */
+	{ 0x00000755, 0x0080 }, /* R1877 (0x755) - AIF2TX3MIX Input 3 Volume */
+	{ 0x00000756, 0x0000 }, /* R1878 (0x756) - AIF2TX3MIX Input 4 Source */
+	{ 0x00000757, 0x0080 }, /* R1879 (0x757) - AIF2TX3MIX Input 4 Volume */
+	{ 0x00000758, 0x0000 }, /* R1880 (0x758) - AIF2TX4MIX Input 1 Source */
+	{ 0x00000759, 0x0080 }, /* R1881 (0x759) - AIF2TX4MIX Input 1 Volume */
+	{ 0x0000075a, 0x0000 }, /* R1882 (0x75a) - AIF2TX4MIX Input 2 Source */
+	{ 0x0000075b, 0x0080 }, /* R1883 (0x75b) - AIF2TX4MIX Input 2 Volume */
+	{ 0x0000075c, 0x0000 }, /* R1884 (0x75c) - AIF2TX4MIX Input 3 Source */
+	{ 0x0000075d, 0x0080 }, /* R1885 (0x75d) - AIF2TX4MIX Input 3 Volume */
+	{ 0x0000075e, 0x0000 }, /* R1886 (0x75e) - AIF2TX4MIX Input 4 Source */
+	{ 0x0000075f, 0x0080 }, /* R1887 (0x75f) - AIF2TX4MIX Input 4 Volume */
+	{ 0x00000760, 0x0000 }, /* R1888 (0x760) - AIF2TX5MIX Input 1 Source */
+	{ 0x00000761, 0x0080 }, /* R1889 (0x761) - AIF2TX5MIX Input 1 Volume */
+	{ 0x00000762, 0x0000 }, /* R1890 (0x762) - AIF2TX5MIX Input 2 Source */
+	{ 0x00000763, 0x0080 }, /* R1891 (0x763) - AIF2TX5MIX Input 2 Volume */
+	{ 0x00000764, 0x0000 }, /* R1892 (0x764) - AIF2TX5MIX Input 3 Source */
+	{ 0x00000765, 0x0080 }, /* R1893 (0x765) - AIF2TX5MIX Input 3 Volume */
+	{ 0x00000766, 0x0000 }, /* R1894 (0x766) - AIF2TX5MIX Input 4 Source */
+	{ 0x00000767, 0x0080 }, /* R1895 (0x767) - AIF2TX5MIX Input 4 Volume */
+	{ 0x00000768, 0x0000 }, /* R1896 (0x768) - AIF2TX6MIX Input 1 Source */
+	{ 0x00000769, 0x0080 }, /* R1897 (0x769) - AIF2TX6MIX Input 1 Volume */
+	{ 0x0000076a, 0x0000 }, /* R1898 (0x76a) - AIF2TX6MIX Input 2 Source */
+	{ 0x0000076b, 0x0080 }, /* R1899 (0x76b) - AIF2TX6MIX Input 2 Volume */
+	{ 0x0000076c, 0x0000 }, /* R1900 (0x76c) - AIF2TX6MIX Input 3 Source */
+	{ 0x0000076d, 0x0080 }, /* R1901 (0x76d) - AIF2TX6MIX Input 3 Volume */
+	{ 0x0000076e, 0x0000 }, /* R1902 (0x76e) - AIF2TX6MIX Input 4 Source */
+	{ 0x0000076f, 0x0080 }, /* R1903 (0x76f) - AIF2TX6MIX Input 4 Volume */
+	{ 0x00000770, 0x0000 }, /* R1904 (0x770) - AIF2TX7MIX Input 1 Source */
+	{ 0x00000771, 0x0080 }, /* R1905 (0x771) - AIF2TX7MIX Input 1 Volume */
+	{ 0x00000772, 0x0000 }, /* R1906 (0x772) - AIF2TX7MIX Input 2 Source */
+	{ 0x00000773, 0x0080 }, /* R1907 (0x773) - AIF2TX7MIX Input 2 Volume */
+	{ 0x00000774, 0x0000 }, /* R1908 (0x774) - AIF2TX7MIX Input 3 Source */
+	{ 0x00000775, 0x0080 }, /* R1909 (0x775) - AIF2TX7MIX Input 3 Volume */
+	{ 0x00000776, 0x0000 }, /* R1910 (0x776) - AIF2TX7MIX Input 4 Source */
+	{ 0x00000777, 0x0080 }, /* R1911 (0x777) - AIF2TX7MIX Input 4 Volume */
+	{ 0x00000778, 0x0000 }, /* R1912 (0x778) - AIF2TX8MIX Input 1 Source */
+	{ 0x00000779, 0x0080 }, /* R1913 (0x779) - AIF2TX8MIX Input 1 Volume */
+	{ 0x0000077a, 0x0000 }, /* R1914 (0x77a) - AIF2TX8MIX Input 2 Source */
+	{ 0x0000077b, 0x0080 }, /* R1915 (0x77b) - AIF2TX8MIX Input 2 Volume */
+	{ 0x0000077c, 0x0000 }, /* R1916 (0x77c) - AIF2TX8MIX Input 3 Source */
+	{ 0x0000077d, 0x0080 }, /* R1917 (0x77d) - AIF2TX8MIX Input 3 Volume */
+	{ 0x0000077e, 0x0000 }, /* R1918 (0x77e) - AIF2TX8MIX Input 4 Source */
+	{ 0x0000077f, 0x0080 }, /* R1919 (0x77f) - AIF2TX8MIX Input 4 Volume */
+	{ 0x00000780, 0x0000 }, /* R1920 (0x780) - AIF3TX1MIX Input 1 Source */
+	{ 0x00000781, 0x0080 }, /* R1921 (0x781) - AIF3TX1MIX Input 1 Volume */
+	{ 0x00000782, 0x0000 }, /* R1922 (0x782) - AIF3TX1MIX Input 2 Source */
+	{ 0x00000783, 0x0080 }, /* R1923 (0x783) - AIF3TX1MIX Input 2 Volume */
+	{ 0x00000784, 0x0000 }, /* R1924 (0x784) - AIF3TX1MIX Input 3 Source */
+	{ 0x00000785, 0x0080 }, /* R1925 (0x785) - AIF3TX1MIX Input 3 Volume */
+	{ 0x00000786, 0x0000 }, /* R1926 (0x786) - AIF3TX1MIX Input 4 Source */
+	{ 0x00000787, 0x0080 }, /* R1927 (0x787) - AIF3TX1MIX Input 4 Volume */
+	{ 0x00000788, 0x0000 }, /* R1928 (0x788) - AIF3TX2MIX Input 1 Source */
+	{ 0x00000789, 0x0080 }, /* R1929 (0x789) - AIF3TX2MIX Input 1 Volume */
+	{ 0x0000078a, 0x0000 }, /* R1930 (0x78a) - AIF3TX2MIX Input 2 Source */
+	{ 0x0000078b, 0x0080 }, /* R1931 (0x78b) - AIF3TX2MIX Input 2 Volume */
+	{ 0x0000078c, 0x0000 }, /* R1932 (0x78c) - AIF3TX2MIX Input 3 Source */
+	{ 0x0000078d, 0x0080 }, /* R1933 (0x78d) - AIF3TX2MIX Input 3 Volume */
+	{ 0x0000078e, 0x0000 }, /* R1934 (0x78e) - AIF3TX2MIX Input 4 Source */
+	{ 0x0000078f, 0x0080 }, /* R1935 (0x78f) - AIF3TX2MIX Input 4 Volume */
+	{ 0x00000790, 0x0000 }, /* R1936 (0x790) - AIF3TX3MIX Input 1 Source */
+	{ 0x00000791, 0x0080 }, /* R1937 (0x791) - AIF3TX3MIX Input 1 Volume */
+	{ 0x00000792, 0x0000 }, /* R1938 (0x792) - AIF3TX3MIX Input 2 Source */
+	{ 0x00000793, 0x0080 }, /* R1939 (0x793) - AIF3TX3MIX Input 2 Volume */
+	{ 0x00000794, 0x0000 }, /* R1940 (0x794) - AIF3TX3MIX Input 3 Source */
+	{ 0x00000795, 0x0080 }, /* R1941 (0x795) - AIF3TX3MIX Input 3 Volume */
+	{ 0x00000796, 0x0000 }, /* R1942 (0x796) - AIF3TX3MIX Input 4 Source */
+	{ 0x00000797, 0x0080 }, /* R1943 (0x797) - AIF3TX3MIX Input 4 Volume */
+	{ 0x00000798, 0x0000 }, /* R1944 (0x798) - AIF3TX4MIX Input 1 Source */
+	{ 0x00000799, 0x0080 }, /* R1945 (0x799) - AIF3TX4MIX Input 1 Volume */
+	{ 0x0000079a, 0x0000 }, /* R1946 (0x79a) - AIF3TX4MIX Input 2 Source */
+	{ 0x0000079b, 0x0080 }, /* R1947 (0x79b) - AIF3TX4MIX Input 2 Volume */
+	{ 0x0000079c, 0x0000 }, /* R1948 (0x79c) - AIF3TX4MIX Input 3 Source */
+	{ 0x0000079d, 0x0080 }, /* R1949 (0x79d) - AIF3TX4MIX Input 3 Volume */
+	{ 0x0000079e, 0x0000 }, /* R1950 (0x79e) - AIF3TX4MIX Input 4 Source */
+	{ 0x0000079f, 0x0080 }, /* R1951 (0x79f) - AIF3TX4MIX Input 4 Volume */
+	{ 0x000007a0, 0x0000 }, /* R1952 (0x7a0) - AIF3TX5MIX Input 1 Source */
+	{ 0x000007a1, 0x0080 }, /* R1953 (0x7a1) - AIF3TX5MIX Input 1 Volume */
+	{ 0x000007a2, 0x0000 }, /* R1954 (0x7a2) - AIF3TX5MIX Input 2 Source */
+	{ 0x000007a3, 0x0080 }, /* R1955 (0x7a3) - AIF3TX5MIX Input 2 Volume */
+	{ 0x000007a4, 0x0000 }, /* R1956 (0x7a4) - AIF3TX5MIX Input 3 Source */
+	{ 0x000007a5, 0x0080 }, /* R1957 (0x7a5) - AIF3TX5MIX Input 3 Volume */
+	{ 0x000007a6, 0x0000 }, /* R1958 (0x7a6) - AIF3TX5MIX Input 4 Source */
+	{ 0x000007a7, 0x0080 }, /* R1959 (0x7a7) - AIF3TX5MIX Input 4 Volume */
+	{ 0x000007a8, 0x0000 }, /* R1960 (0x7a8) - AIF3TX6MIX Input 1 Source */
+	{ 0x000007a9, 0x0080 }, /* R1961 (0x7a9) - AIF3TX6MIX Input 1 Volume */
+	{ 0x000007aa, 0x0000 }, /* R1962 (0x7aa) - AIF3TX6MIX Input 2 Source */
+	{ 0x000007ab, 0x0080 }, /* R1963 (0x7ab) - AIF3TX6MIX Input 2 Volume */
+	{ 0x000007ac, 0x0000 }, /* R1964 (0x7ac) - AIF3TX6MIX Input 3 Source */
+	{ 0x000007ad, 0x0080 }, /* R1965 (0x7ad) - AIF3TX6MIX Input 3 Volume */
+	{ 0x000007ae, 0x0000 }, /* R1966 (0x7ae) - AIF3TX6MIX Input 4 Source */
+	{ 0x000007af, 0x0080 }, /* R1967 (0x7af) - AIF3TX6MIX Input 4 Volume */
+	{ 0x000007b0, 0x0000 }, /* R1968 (0x7b0) - AIF3TX7MIX Input 1 Source */
+	{ 0x000007b1, 0x0080 }, /* R1969 (0x7b1) - AIF3TX7MIX Input 1 Volume */
+	{ 0x000007b2, 0x0000 }, /* R1970 (0x7b2) - AIF3TX7MIX Input 2 Source */
+	{ 0x000007b3, 0x0080 }, /* R1971 (0x7b3) - AIF3TX7MIX Input 2 Volume */
+	{ 0x000007b4, 0x0000 }, /* R1972 (0x7b4) - AIF3TX7MIX Input 3 Source */
+	{ 0x000007b5, 0x0080 }, /* R1973 (0x7b5) - AIF3TX7MIX Input 3 Volume */
+	{ 0x000007b6, 0x0000 }, /* R1974 (0x7b6) - AIF3TX7MIX Input 4 Source */
+	{ 0x000007b7, 0x0080 }, /* R1975 (0x7b7) - AIF3TX7MIX Input 4 Volume */
+	{ 0x000007b8, 0x0000 }, /* R1976 (0x7b8) - AIF3TX8MIX Input 1 Source */
+	{ 0x000007b9, 0x0080 }, /* R1977 (0x7b9) - AIF3TX8MIX Input 1 Volume */
+	{ 0x000007ba, 0x0000 }, /* R1978 (0x7ba) - AIF3TX8MIX Input 2 Source */
+	{ 0x000007bb, 0x0080 }, /* R1979 (0x7bb) - AIF3TX8MIX Input 2 Volume */
+	{ 0x000007bc, 0x0000 }, /* R1980 (0x7bc) - AIF3TX8MIX Input 3 Source */
+	{ 0x000007bd, 0x0080 }, /* R1981 (0x7bd) - AIF3TX8MIX Input 3 Volume */
+	{ 0x000007be, 0x0000 }, /* R1982 (0x7be) - AIF3TX8MIX Input 4 Source */
+	{ 0x000007bf, 0x0080 }, /* R1983 (0x7bf) - AIF3TX8MIX Input 4 Volume */
+	{ 0x000007c0, 0x0000 }, /* R1984 (0x7c0) - SLIMTX1MIX Input 1 Source */
+	{ 0x000007c1, 0x0080 }, /* R1985 (0x7c1) - SLIMTX1MIX Input 1 Volume */
+	{ 0x000007c2, 0x0000 }, /* R1986 (0x7c2) - SLIMTX1MIX Input 2 Source */
+	{ 0x000007c3, 0x0080 }, /* R1987 (0x7c3) - SLIMTX1MIX Input 2 Volume */
+	{ 0x000007c4, 0x0000 }, /* R1988 (0x7c4) - SLIMTX1MIX Input 3 Source */
+	{ 0x000007c5, 0x0080 }, /* R1989 (0x7c5) - SLIMTX1MIX Input 3 Volume */
+	{ 0x000007c6, 0x0000 }, /* R1990 (0x7c6) - SLIMTX1MIX Input 4 Source */
+	{ 0x000007c7, 0x0080 }, /* R1991 (0x7c7) - SLIMTX1MIX Input 4 Volume */
+	{ 0x000007c8, 0x0000 }, /* R1992 (0x7c8) - SLIMTX2MIX Input 1 Source */
+	{ 0x000007c9, 0x0080 }, /* R1993 (0x7c9) - SLIMTX2MIX Input 1 Volume */
+	{ 0x000007ca, 0x0000 }, /* R1994 (0x7ca) - SLIMTX2MIX Input 2 Source */
+	{ 0x000007cb, 0x0080 }, /* R1995 (0x7cb) - SLIMTX2MIX Input 2 Volume */
+	{ 0x000007cc, 0x0000 }, /* R1996 (0x7cc) - SLIMTX2MIX Input 3 Source */
+	{ 0x000007cd, 0x0080 }, /* R1997 (0x7cd) - SLIMTX2MIX Input 3 Volume */
+	{ 0x000007ce, 0x0000 }, /* R1998 (0x7ce) - SLIMTX2MIX Input 4 Source */
+	{ 0x000007cf, 0x0080 }, /* R1999 (0x7cf) - SLIMTX2MIX Input 4 Volume */
+	{ 0x000007d0, 0x0000 }, /* R2000 (0x7d0) - SLIMTX3MIX Input 1 Source */
+	{ 0x000007d1, 0x0080 }, /* R2001 (0x7d1) - SLIMTX3MIX Input 1 Volume */
+	{ 0x000007d2, 0x0000 }, /* R2002 (0x7d2) - SLIMTX3MIX Input 2 Source */
+	{ 0x000007d3, 0x0080 }, /* R2003 (0x7d3) - SLIMTX3MIX Input 2 Volume */
+	{ 0x000007d4, 0x0000 }, /* R2004 (0x7d4) - SLIMTX3MIX Input 3 Source */
+	{ 0x000007d5, 0x0080 }, /* R2005 (0x7d5) - SLIMTX3MIX Input 3 Volume */
+	{ 0x000007d6, 0x0000 }, /* R2006 (0x7d6) - SLIMTX3MIX Input 4 Source */
+	{ 0x000007d7, 0x0080 }, /* R2007 (0x7d7) - SLIMTX3MIX Input 4 Volume */
+	{ 0x000007d8, 0x0000 }, /* R2008 (0x7d8) - SLIMTX4MIX Input 1 Source */
+	{ 0x000007d9, 0x0080 }, /* R2009 (0x7d9) - SLIMTX4MIX Input 1 Volume */
+	{ 0x000007da, 0x0000 }, /* R2010 (0x7da) - SLIMTX4MIX Input 2 Source */
+	{ 0x000007db, 0x0080 }, /* R2011 (0x7db) - SLIMTX4MIX Input 2 Volume */
+	{ 0x000007dc, 0x0000 }, /* R2012 (0x7dc) - SLIMTX4MIX Input 3 Source */
+	{ 0x000007dd, 0x0080 }, /* R2013 (0x7dd) - SLIMTX4MIX Input 3 Volume */
+	{ 0x000007de, 0x0000 }, /* R2014 (0x7de) - SLIMTX4MIX Input 4 Source */
+	{ 0x000007df, 0x0080 }, /* R2015 (0x7df) - SLIMTX4MIX Input 4 Volume */
+	{ 0x000007e0, 0x0000 }, /* R2016 (0x7e0) - SLIMTX5MIX Input 1 Source */
+	{ 0x000007e1, 0x0080 }, /* R2017 (0x7e1) - SLIMTX5MIX Input 1 Volume */
+	{ 0x000007e2, 0x0000 }, /* R2018 (0x7e2) - SLIMTX5MIX Input 2 Source */
+	{ 0x000007e3, 0x0080 }, /* R2019 (0x7e3) - SLIMTX5MIX Input 2 Volume */
+	{ 0x000007e4, 0x0000 }, /* R2020 (0x7e4) - SLIMTX5MIX Input 3 Source */
+	{ 0x000007e5, 0x0080 }, /* R2021 (0x7e5) - SLIMTX5MIX Input 3 Volume */
+	{ 0x000007e6, 0x0000 }, /* R2022 (0x7e6) - SLIMTX5MIX Input 4 Source */
+	{ 0x000007e7, 0x0080 }, /* R2023 (0x7e7) - SLIMTX5MIX Input 4 Volume */
+	{ 0x000007e8, 0x0000 }, /* R2024 (0x7e8) - SLIMTX6MIX Input 1 Source */
+	{ 0x000007e9, 0x0080 }, /* R2025 (0x7e9) - SLIMTX6MIX Input 1 Volume */
+	{ 0x000007ea, 0x0000 }, /* R2026 (0x7ea) - SLIMTX6MIX Input 2 Source */
+	{ 0x000007eb, 0x0080 }, /* R2027 (0x7eb) - SLIMTX6MIX Input 2 Volume */
+	{ 0x000007ec, 0x0000 }, /* R2028 (0x7ec) - SLIMTX6MIX Input 3 Source */
+	{ 0x000007ed, 0x0080 }, /* R2029 (0x7ed) - SLIMTX6MIX Input 3 Volume */
+	{ 0x000007ee, 0x0000 }, /* R2030 (0x7ee) - SLIMTX6MIX Input 4 Source */
+	{ 0x000007ef, 0x0080 }, /* R2031 (0x7ef) - SLIMTX6MIX Input 4 Volume */
+	{ 0x000007f0, 0x0000 }, /* R2032 (0x7f0) - SLIMTX7MIX Input 1 Source */
+	{ 0x000007f1, 0x0080 }, /* R2033 (0x7f1) - SLIMTX7MIX Input 1 Volume */
+	{ 0x000007f2, 0x0000 }, /* R2034 (0x7f2) - SLIMTX7MIX Input 2 Source */
+	{ 0x000007f3, 0x0080 }, /* R2035 (0x7f3) - SLIMTX7MIX Input 2 Volume */
+	{ 0x000007f4, 0x0000 }, /* R2036 (0x7f4) - SLIMTX7MIX Input 3 Source */
+	{ 0x000007f5, 0x0080 }, /* R2037 (0x7f5) - SLIMTX7MIX Input 3 Volume */
+	{ 0x000007f6, 0x0000 }, /* R2038 (0x7f6) - SLIMTX7MIX Input 4 Source */
+	{ 0x000007f7, 0x0080 }, /* R2039 (0x7f7) - SLIMTX7MIX Input 4 Volume */
+	{ 0x000007f8, 0x0000 }, /* R2040 (0x7f8) - SLIMTX8MIX Input 1 Source */
+	{ 0x000007f9, 0x0080 }, /* R2041 (0x7f9) - SLIMTX8MIX Input 1 Volume */
+	{ 0x000007fa, 0x0000 }, /* R2042 (0x7fa) - SLIMTX8MIX Input 2 Source */
+	{ 0x000007fb, 0x0080 }, /* R2043 (0x7fb) - SLIMTX8MIX Input 2 Volume */
+	{ 0x000007fc, 0x0000 }, /* R2044 (0x7fc) - SLIMTX8MIX Input 3 Source */
+	{ 0x000007fd, 0x0080 }, /* R2045 (0x7fd) - SLIMTX8MIX Input 3 Volume */
+	{ 0x000007fe, 0x0000 }, /* R2046 (0x7fe) - SLIMTX8MIX Input 4 Source */
+	{ 0x000007ff, 0x0080 }, /* R2047 (0x7ff) - SLIMTX8MIX Input 4 Volume */
+	{ 0x00000800, 0x0000 }, /* R2048 (0x800) - SPDIF1TX1MIX Input 1 Source */
+	{ 0x00000801, 0x0080 }, /* R2049 (0x801) - SPDIF1TX1MIX Input 1 Volume */
+	{ 0x00000808, 0x0000 }, /* R2056 (0x808) - SPDIF1TX2MIX Input 1 Source */
+	{ 0x00000809, 0x0080 }, /* R2057 (0x809) - SPDIF1TX2MIX Input 1 Volume */
+	{ 0x00000880, 0x0000 }, /* R2176 (0x880) - EQ1MIX Input 1 Source */
+	{ 0x00000881, 0x0080 }, /* R2177 (0x881) - EQ1MIX Input 1 Volume */
+	{ 0x00000882, 0x0000 }, /* R2178 (0x882) - EQ1MIX Input 2 Source */
+	{ 0x00000883, 0x0080 }, /* R2179 (0x883) - EQ1MIX Input 2 Volume */
+	{ 0x00000884, 0x0000 }, /* R2180 (0x884) - EQ1MIX Input 3 Source */
+	{ 0x00000885, 0x0080 }, /* R2181 (0x885) - EQ1MIX Input 3 Volume */
+	{ 0x00000886, 0x0000 }, /* R2182 (0x886) - EQ1MIX Input 4 Source */
+	{ 0x00000887, 0x0080 }, /* R2183 (0x887) - EQ1MIX Input 4 Volume */
+	{ 0x00000888, 0x0000 }, /* R2184 (0x888) - EQ2MIX Input 1 Source */
+	{ 0x00000889, 0x0080 }, /* R2185 (0x889) - EQ2MIX Input 1 Volume */
+	{ 0x0000088a, 0x0000 }, /* R2186 (0x88a) - EQ2MIX Input 2 Source */
+	{ 0x0000088b, 0x0080 }, /* R2187 (0x88b) - EQ2MIX Input 2 Volume */
+	{ 0x0000088c, 0x0000 }, /* R2188 (0x88c) - EQ2MIX Input 3 Source */
+	{ 0x0000088d, 0x0080 }, /* R2189 (0x88d) - EQ2MIX Input 3 Volume */
+	{ 0x0000088e, 0x0000 }, /* R2190 (0x88e) - EQ2MIX Input 4 Source */
+	{ 0x0000088f, 0x0080 }, /* R2191 (0x88f) - EQ2MIX Input 4 Volume */
+	{ 0x00000890, 0x0000 }, /* R2192 (0x890) - EQ3MIX Input 1 Source */
+	{ 0x00000891, 0x0080 }, /* R2193 (0x891) - EQ3MIX Input 1 Volume */
+	{ 0x00000892, 0x0000 }, /* R2194 (0x892) - EQ3MIX Input 2 Source */
+	{ 0x00000893, 0x0080 }, /* R2195 (0x893) - EQ3MIX Input 2 Volume */
+	{ 0x00000894, 0x0000 }, /* R2196 (0x894) - EQ3MIX Input 3 Source */
+	{ 0x00000895, 0x0080 }, /* R2197 (0x895) - EQ3MIX Input 3 Volume */
+	{ 0x00000896, 0x0000 }, /* R2198 (0x896) - EQ3MIX Input 4 Source */
+	{ 0x00000897, 0x0080 }, /* R2199 (0x897) - EQ3MIX Input 4 Volume */
+	{ 0x00000898, 0x0000 }, /* R2200 (0x898) - EQ4MIX Input 1 Source */
+	{ 0x00000899, 0x0080 }, /* R2201 (0x899) - EQ4MIX Input 1 Volume */
+	{ 0x0000089a, 0x0000 }, /* R2202 (0x89a) - EQ4MIX Input 2 Source */
+	{ 0x0000089b, 0x0080 }, /* R2203 (0x89b) - EQ4MIX Input 2 Volume */
+	{ 0x0000089c, 0x0000 }, /* R2204 (0x89c) - EQ4MIX Input 3 Source */
+	{ 0x0000089d, 0x0080 }, /* R2205 (0x89d) - EQ4MIX Input 3 Volume */
+	{ 0x0000089e, 0x0000 }, /* R2206 (0x89e) - EQ4MIX Input 4 Source */
+	{ 0x0000089f, 0x0080 }, /* R2207 (0x89f) - EQ4MIX Input 4 Volume */
+	{ 0x000008c0, 0x0000 }, /* R2240 (0x8c0) - DRC1LMIX Input 1 Source */
+	{ 0x000008c1, 0x0080 }, /* R2241 (0x8c1) - DRC1LMIX Input 1 Volume */
+	{ 0x000008c2, 0x0000 }, /* R2242 (0x8c2) - DRC1LMIX Input 2 Source */
+	{ 0x000008c3, 0x0080 }, /* R2243 (0x8c3) - DRC1LMIX Input 2 Volume */
+	{ 0x000008c4, 0x0000 }, /* R2244 (0x8c4) - DRC1LMIX Input 3 Source */
+	{ 0x000008c5, 0x0080 }, /* R2245 (0x8c5) - DRC1LMIX Input 3 Volume */
+	{ 0x000008c6, 0x0000 }, /* R2246 (0x8c6) - DRC1LMIX Input 4 Source */
+	{ 0x000008c7, 0x0080 }, /* R2247 (0x8c7) - DRC1LMIX Input 4 Volume */
+	{ 0x000008c8, 0x0000 }, /* R2248 (0x8c8) - DRC1RMIX Input 1 Source */
+	{ 0x000008c9, 0x0080 }, /* R2249 (0x8c9) - DRC1RMIX Input 1 Volume */
+	{ 0x000008ca, 0x0000 }, /* R2250 (0x8ca) - DRC1RMIX Input 2 Source */
+	{ 0x000008cb, 0x0080 }, /* R2251 (0x8cb) - DRC1RMIX Input 2 Volume */
+	{ 0x000008cc, 0x0000 }, /* R2252 (0x8cc) - DRC1RMIX Input 3 Source */
+	{ 0x000008cd, 0x0080 }, /* R2253 (0x8cd) - DRC1RMIX Input 3 Volume */
+	{ 0x000008ce, 0x0000 }, /* R2254 (0x8ce) - DRC1RMIX Input 4 Source */
+	{ 0x000008cf, 0x0080 }, /* R2255 (0x8cf) - DRC1RMIX Input 4 Volume */
+	{ 0x000008d0, 0x0000 }, /* R2256 (0x8d0) - DRC2LMIX Input 1 Source */
+	{ 0x000008d1, 0x0080 }, /* R2257 (0x8d1) - DRC2LMIX Input 1 Volume */
+	{ 0x000008d2, 0x0000 }, /* R2258 (0x8d2) - DRC2LMIX Input 2 Source */
+	{ 0x000008d3, 0x0080 }, /* R2259 (0x8d3) - DRC2LMIX Input 2 Volume */
+	{ 0x000008d4, 0x0000 }, /* R2260 (0x8d4) - DRC2LMIX Input 3 Source */
+	{ 0x000008d5, 0x0080 }, /* R2261 (0x8d5) - DRC2LMIX Input 3 Volume */
+	{ 0x000008d6, 0x0000 }, /* R2262 (0x8d6) - DRC2LMIX Input 4 Source */
+	{ 0x000008d7, 0x0080 }, /* R2263 (0x8d7) - DRC2LMIX Input 4 Volume */
+	{ 0x000008d8, 0x0000 }, /* R2264 (0x8d8) - DRC2RMIX Input 1 Source */
+	{ 0x000008d9, 0x0080 }, /* R2265 (0x8d9) - DRC2RMIX Input 1 Volume */
+	{ 0x000008da, 0x0000 }, /* R2266 (0x8da) - DRC2RMIX Input 2 Source */
+	{ 0x000008db, 0x0080 }, /* R2267 (0x8db) - DRC2RMIX Input 2 Volume */
+	{ 0x000008dc, 0x0000 }, /* R2268 (0x8dc) - DRC2RMIX Input 3 Source */
+	{ 0x000008dd, 0x0080 }, /* R2269 (0x8dd) - DRC2RMIX Input 3 Volume */
+	{ 0x000008de, 0x0000 }, /* R2270 (0x8de) - DRC2RMIX Input 4 Source */
+	{ 0x000008df, 0x0080 }, /* R2271 (0x8df) - DRC2RMIX Input 4 Volume */
+	{ 0x00000900, 0x0000 }, /* R2304 (0x900) - HPLP1MIX Input 1 Source */
+	{ 0x00000901, 0x0080 }, /* R2305 (0x901) - HPLP1MIX Input 1 Volume */
+	{ 0x00000902, 0x0000 }, /* R2306 (0x902) - HPLP1MIX Input 2 Source */
+	{ 0x00000903, 0x0080 }, /* R2307 (0x903) - HPLP1MIX Input 2 Volume */
+	{ 0x00000904, 0x0000 }, /* R2308 (0x904) - HPLP1MIX Input 3 Source */
+	{ 0x00000905, 0x0080 }, /* R2309 (0x905) - HPLP1MIX Input 3 Volume */
+	{ 0x00000906, 0x0000 }, /* R2310 (0x906) - HPLP1MIX Input 4 Source */
+	{ 0x00000907, 0x0080 }, /* R2311 (0x907) - HPLP1MIX Input 4 Volume */
+	{ 0x00000908, 0x0000 }, /* R2312 (0x908) - HPLP2MIX Input 1 Source */
+	{ 0x00000909, 0x0080 }, /* R2313 (0x909) - HPLP2MIX Input 1 Volume */
+	{ 0x0000090a, 0x0000 }, /* R2314 (0x90a) - HPLP2MIX Input 2 Source */
+	{ 0x0000090b, 0x0080 }, /* R2315 (0x90b) - HPLP2MIX Input 2 Volume */
+	{ 0x0000090c, 0x0000 }, /* R2316 (0x90c) - HPLP2MIX Input 3 Source */
+	{ 0x0000090d, 0x0080 }, /* R2317 (0x90d) - HPLP2MIX Input 3 Volume */
+	{ 0x0000090e, 0x0000 }, /* R2318 (0x90e) - HPLP2MIX Input 4 Source */
+	{ 0x0000090f, 0x0080 }, /* R2319 (0x90f) - HPLP2MIX Input 4 Volume */
+	{ 0x00000910, 0x0000 }, /* R2320 (0x910) - HPLP3MIX Input 1 Source */
+	{ 0x00000911, 0x0080 }, /* R2321 (0x911) - HPLP3MIX Input 1 Volume */
+	{ 0x00000912, 0x0000 }, /* R2322 (0x912) - HPLP3MIX Input 2 Source */
+	{ 0x00000913, 0x0080 }, /* R2323 (0x913) - HPLP3MIX Input 2 Volume */
+	{ 0x00000914, 0x0000 }, /* R2324 (0x914) - HPLP3MIX Input 3 Source */
+	{ 0x00000915, 0x0080 }, /* R2325 (0x915) - HPLP3MIX Input 3 Volume */
+	{ 0x00000916, 0x0000 }, /* R2326 (0x916) - HPLP3MIX Input 4 Source */
+	{ 0x00000917, 0x0080 }, /* R2327 (0x917) - HPLP3MIX Input 4 Volume */
+	{ 0x00000918, 0x0000 }, /* R2328 (0x918) - HPLP4MIX Input 1 Source */
+	{ 0x00000919, 0x0080 }, /* R2329 (0x919) - HPLP4MIX Input 1 Volume */
+	{ 0x0000091a, 0x0000 }, /* R2330 (0x91a) - HPLP4MIX Input 2 Source */
+	{ 0x0000091b, 0x0080 }, /* R2331 (0x91b) - HPLP4MIX Input 2 Volume */
+	{ 0x0000091c, 0x0000 }, /* R2332 (0x91c) - HPLP4MIX Input 3 Source */
+	{ 0x0000091d, 0x0080 }, /* R2333 (0x91d) - HPLP4MIX Input 3 Volume */
+	{ 0x0000091e, 0x0000 }, /* R2334 (0x91e) - HPLP4MIX Input 4 Source */
+	{ 0x0000091f, 0x0080 }, /* R2335 (0x91f) - HPLP4MIX Input 4 Volume */
+	{ 0x00000940, 0x0000 }, /* R2368 (0x940) - DSP1LMIX Input 1 Source */
+	{ 0x00000941, 0x0080 }, /* R2369 (0x941) - DSP1LMIX Input 1 Volume */
+	{ 0x00000942, 0x0000 }, /* R2370 (0x942) - DSP1LMIX Input 2 Source */
+	{ 0x00000943, 0x0080 }, /* R2371 (0x943) - DSP1LMIX Input 2 Volume */
+	{ 0x00000944, 0x0000 }, /* R2372 (0x944) - DSP1LMIX Input 3 Source */
+	{ 0x00000945, 0x0080 }, /* R2373 (0x945) - DSP1LMIX Input 3 Volume */
+	{ 0x00000946, 0x0000 }, /* R2374 (0x946) - DSP1LMIX Input 4 Source */
+	{ 0x00000947, 0x0080 }, /* R2375 (0x947) - DSP1LMIX Input 4 Volume */
+	{ 0x00000948, 0x0000 }, /* R2376 (0x948) - DSP1RMIX Input 1 Source */
+	{ 0x00000949, 0x0080 }, /* R2377 (0x949) - DSP1RMIX Input 1 Volume */
+	{ 0x0000094a, 0x0000 }, /* R2378 (0x94a) - DSP1RMIX Input 2 Source */
+	{ 0x0000094b, 0x0080 }, /* R2379 (0x94b) - DSP1RMIX Input 2 Volume */
+	{ 0x0000094c, 0x0000 }, /* R2380 (0x94c) - DSP1RMIX Input 3 Source */
+	{ 0x0000094d, 0x0080 }, /* R2381 (0x94d) - DSP1RMIX Input 3 Volume */
+	{ 0x0000094e, 0x0000 }, /* R2382 (0x94e) - DSP1RMIX Input 4 Source */
+	{ 0x0000094f, 0x0080 }, /* R2383 (0x94f) - DSP1RMIX Input 4 Volume */
+	{ 0x00000950, 0x0000 }, /* R2384 (0x950) - DSP1AUX1MIX Input 1 Source */
+	{ 0x00000958, 0x0000 }, /* R2392 (0x958) - DSP1AUX2MIX Input 1 Source */
+	{ 0x00000960, 0x0000 }, /* R2400 (0x960) - DSP1AUX3MIX Input 1 Source */
+	{ 0x00000968, 0x0000 }, /* R2408 (0x968) - DSP1AUX4MIX Input 1 Source */
+	{ 0x00000970, 0x0000 }, /* R2416 (0x970) - DSP1AUX5MIX Input 1 Source */
+	{ 0x00000978, 0x0000 }, /* R2424 (0x978) - DSP1AUX6MIX Input 1 Source */
+	{ 0x00000a80, 0x0000 }, /* R2688 (0xa80) - ASRC1 1LMIX Input 1 Source */
+	{ 0x00000a88, 0x0000 }, /* R2696 (0xa88) - ASRC1 1RMIX Input 1 Source */
+	{ 0x00000a90, 0x0000 }, /* R2704 (0xa90) - ASRC1 2LMIX Input 1 Source */
+	{ 0x00000a98, 0x0000 }, /* R2712 (0xa98) - ASRC1 2RMIX Input 1 Source */
+	{ 0x00000b00, 0x0000 }, /* R2816 (0xb00) - ISRC1DEC1MIX Input 1 Source */
+	{ 0x00000b08, 0x0000 }, /* R2824 (0xb08) - ISRC1DEC2MIX Input 1 Source */
+	{ 0x00000b20, 0x0000 }, /* R2848 (0xb20) - ISRC1INT1MIX Input 1 Source */
+	{ 0x00000b28, 0x0000 }, /* R2856 (0xb28) - ISRC1INT2MIX Input 1 Source */
+	{ 0x00000b40, 0x0000 }, /* R2880 (0xb40) - ISRC2DEC1MIX Input 1 Source */
+	{ 0x00000b48, 0x0000 }, /* R2888 (0xb48) - ISRC2DEC2MIX Input 1 Source */
+	{ 0x00000b60, 0x0000 }, /* R2912 (0xb60) - ISRC2INT1MIX Input 1 Source */
+	{ 0x00000b68, 0x0000 }, /* R2920 (0xb68) - ISRC2INT2MIX Input 1 Source */
+	{ 0x00000dc0, 0x0000 }, /* R3520 (0xdc0) - DFC1MIX Input 1 Source */
+	{ 0x00000dc8, 0x0000 }, /* R3528 (0xdc8) - DFC2MIX Input 1 Source */
+	{ 0x00000dd0, 0x0000 }, /* R3536 (0xdd0) - DFC3MIX Input 1 Source */
+	{ 0x00000dd8, 0x0000 }, /* R3544 (0xdd8) - DFC4MIX Input 1 Source */
+	{ 0x00000de0, 0x0000 }, /* R3552 (0xde0) - DFC5MIX Input 1 Source */
+	{ 0x00000de8, 0x0000 }, /* R3560 (0xde8) - DFC6MIX Input 1 Source */
+	{ 0x00000df0, 0x0000 }, /* R3568 (0xdf0) - DFC7MIX Input 1 Source */
+	{ 0x00000df8, 0x0000 }, /* R3576 (0xdf8) - DFC8MIX Input 1 Source */
+	{ 0x00000e00, 0x0000 }, /* R3584 (0xe00) - FX Ctrl 1 */
+	{ 0x00000e10, 0x6318 }, /* R3600 (0xe10) - EQ1 1 */
+	{ 0x00000e11, 0x6300 }, /* R3601 (0xe11) - EQ1 2 */
+	{ 0x00000e12, 0x0fc8 }, /* R3602 (0xe12) - EQ1 3 */
+	{ 0x00000e13, 0x03fe }, /* R3603 (0xe13) - EQ1 4 */
+	{ 0x00000e14, 0x00e0 }, /* R3604 (0xe14) - EQ1 5 */
+	{ 0x00000e15, 0x1ec4 }, /* R3605 (0xe15) - EQ1 6 */
+	{ 0x00000e16, 0xf136 }, /* R3606 (0xe16) - EQ1 7 */
+	{ 0x00000e17, 0x0409 }, /* R3607 (0xe17) - EQ1 8 */
+	{ 0x00000e18, 0x04cc }, /* R3608 (0xe18) - EQ1 9 */
+	{ 0x00000e19, 0x1c9b }, /* R3609 (0xe19) - EQ1 10 */
+	{ 0x00000e1a, 0xf337 }, /* R3610 (0xe1a) - EQ1 11 */
+	{ 0x00000e1b, 0x040b }, /* R3611 (0xe1b) - EQ1 12 */
+	{ 0x00000e1c, 0x0cbb }, /* R3612 (0xe1c) - EQ1 13 */
+	{ 0x00000e1d, 0x16f8 }, /* R3613 (0xe1d) - EQ1 14 */
+	{ 0x00000e1e, 0xf7d9 }, /* R3614 (0xe1e) - EQ1 15 */
+	{ 0x00000e1f, 0x040a }, /* R3615 (0xe1f) - EQ1 16 */
+	{ 0x00000e20, 0x1f14 }, /* R3616 (0xe20) - EQ1 17 */
+	{ 0x00000e21, 0x058c }, /* R3617 (0xe21) - EQ1 18 */
+	{ 0x00000e22, 0x0563 }, /* R3618 (0xe22) - EQ1 19 */
+	{ 0x00000e23, 0x4000 }, /* R3619 (0xe23) - EQ1 20 */
+	{ 0x00000e24, 0x0b75 }, /* R3620 (0xe24) - EQ1 21 */
+	{ 0x00000e26, 0x6318 }, /* R3622 (0xe26) - EQ2 1 */
+	{ 0x00000e27, 0x6300 }, /* R3623 (0xe27) - EQ2 2 */
+	{ 0x00000e28, 0x0fc8 }, /* R3624 (0xe28) - EQ2 3 */
+	{ 0x00000e29, 0x03fe }, /* R3625 (0xe29) - EQ2 4 */
+	{ 0x00000e2a, 0x00e0 }, /* R3626 (0xe2a) - EQ2 5 */
+	{ 0x00000e2b, 0x1ec4 }, /* R3627 (0xe2b) - EQ2 6 */
+	{ 0x00000e2c, 0xf136 }, /* R3628 (0xe2c) - EQ2 7 */
+	{ 0x00000e2d, 0x0409 }, /* R3629 (0xe2d) - EQ2 8 */
+	{ 0x00000e2e, 0x04cc }, /* R3630 (0xe2e) - EQ2 9 */
+	{ 0x00000e2f, 0x1c9b }, /* R3631 (0xe2f) - EQ2 10 */
+	{ 0x00000e30, 0xf337 }, /* R3632 (0xe30) - EQ2 11 */
+	{ 0x00000e31, 0x040b }, /* R3633 (0xe31) - EQ2 12 */
+	{ 0x00000e32, 0x0cbb }, /* R3634 (0xe32) - EQ2 13 */
+	{ 0x00000e33, 0x16f8 }, /* R3635 (0xe33) - EQ2 14 */
+	{ 0x00000e34, 0xf7d9 }, /* R3636 (0xe34) - EQ2 15 */
+	{ 0x00000e35, 0x040a }, /* R3637 (0xe35) - EQ2 16 */
+	{ 0x00000e36, 0x1f14 }, /* R3638 (0xe36) - EQ2 17 */
+	{ 0x00000e37, 0x058c }, /* R3639 (0xe37) - EQ2 18 */
+	{ 0x00000e38, 0x0563 }, /* R3640 (0xe38) - EQ2 19 */
+	{ 0x00000e39, 0x4000 }, /* R3641 (0xe39) - EQ2 20 */
+	{ 0x00000e3a, 0x0b75 }, /* R3642 (0xe3a) - EQ2 21 */
+	{ 0x00000e3c, 0x6318 }, /* R3644 (0xe3c) - EQ3 1 */
+	{ 0x00000e3d, 0x6300 }, /* R3645 (0xe3d) - EQ3 2 */
+	{ 0x00000e3e, 0x0fc8 }, /* R3646 (0xe3e) - EQ3 3 */
+	{ 0x00000e3f, 0x03fe }, /* R3647 (0xe3f) - EQ3 4 */
+	{ 0x00000e40, 0x00e0 }, /* R3648 (0xe40) - EQ3 5 */
+	{ 0x00000e41, 0x1ec4 }, /* R3649 (0xe41) - EQ3 6 */
+	{ 0x00000e42, 0xf136 }, /* R3650 (0xe42) - EQ3 7 */
+	{ 0x00000e43, 0x0409 }, /* R3651 (0xe43) - EQ3 8 */
+	{ 0x00000e44, 0x04cc }, /* R3652 (0xe44) - EQ3 9 */
+	{ 0x00000e45, 0x1c9b }, /* R3653 (0xe45) - EQ3 10 */
+	{ 0x00000e46, 0xf337 }, /* R3654 (0xe46) - EQ3 11 */
+	{ 0x00000e47, 0x040b }, /* R3655 (0xe47) - EQ3 12 */
+	{ 0x00000e48, 0x0cbb }, /* R3656 (0xe48) - EQ3 13 */
+	{ 0x00000e49, 0x16f8 }, /* R3657 (0xe49) - EQ3 14 */
+	{ 0x00000e4a, 0xf7d9 }, /* R3658 (0xe4a) - EQ3 15 */
+	{ 0x00000e4b, 0x040a }, /* R3659 (0xe4b) - EQ3 16 */
+	{ 0x00000e4c, 0x1f14 }, /* R3660 (0xe4c) - EQ3 17 */
+	{ 0x00000e4d, 0x058c }, /* R3661 (0xe4d) - EQ3 18 */
+	{ 0x00000e4e, 0x0563 }, /* R3662 (0xe4e) - EQ3 19 */
+	{ 0x00000e4f, 0x4000 }, /* R3663 (0xe4f) - EQ3 20 */
+	{ 0x00000e50, 0x0b75 }, /* R3664 (0xe50) - EQ3 21 */
+	{ 0x00000e52, 0x6318 }, /* R3666 (0xe52) - EQ4 1 */
+	{ 0x00000e53, 0x6300 }, /* R3667 (0xe53) - EQ4 2 */
+	{ 0x00000e54, 0x0fc8 }, /* R3668 (0xe54) - EQ4 3 */
+	{ 0x00000e55, 0x03fe }, /* R3669 (0xe55) - EQ4 4 */
+	{ 0x00000e56, 0x00e0 }, /* R3670 (0xe56) - EQ4 5 */
+	{ 0x00000e57, 0x1ec4 }, /* R3671 (0xe57) - EQ4 6 */
+	{ 0x00000e58, 0xf136 }, /* R3672 (0xe58) - EQ4 7 */
+	{ 0x00000e59, 0x0409 }, /* R3673 (0xe59) - EQ4 8 */
+	{ 0x00000e5a, 0x04cc }, /* R3674 (0xe5a) - EQ4 9 */
+	{ 0x00000e5b, 0x1c9b }, /* R3675 (0xe5b) - EQ4 10 */
+	{ 0x00000e5c, 0xf337 }, /* R3676 (0xe5c) - EQ4 11 */
+	{ 0x00000e5d, 0x040b }, /* R3677 (0xe5d) - EQ4 12 */
+	{ 0x00000e5e, 0x0cbb }, /* R3678 (0xe5e) - EQ4 13 */
+	{ 0x00000e5f, 0x16f8 }, /* R3679 (0xe5f) - EQ4 14 */
+	{ 0x00000e60, 0xf7d9 }, /* R3680 (0xe60) - EQ4 15 */
+	{ 0x00000e61, 0x040a }, /* R3681 (0xe61) - EQ4 16 */
+	{ 0x00000e62, 0x1f14 }, /* R3682 (0xe62) - EQ4 17 */
+	{ 0x00000e63, 0x058c }, /* R3683 (0xe63) - EQ4 18 */
+	{ 0x00000e64, 0x0563 }, /* R3684 (0xe64) - EQ4 19 */
+	{ 0x00000e65, 0x4000 }, /* R3685 (0xe65) - EQ4 20 */
+	{ 0x00000e66, 0x0b75 }, /* R3686 (0xe66) - EQ4 21 */
+	{ 0x00000e80, 0x0018 }, /* R3712 (0xe80) - DRC1 Ctrl 1 */
+	{ 0x00000e81, 0x0933 }, /* R3713 (0xe81) - DRC1 Ctrl 2 */
+	{ 0x00000e82, 0x0018 }, /* R3714 (0xe82) - DRC1 Ctrl 3 */
+	{ 0x00000e83, 0x0000 }, /* R3715 (0xe83) - DRC1 Ctrl 4 */
+	{ 0x00000e84, 0x0000 }, /* R3716 (0xe84) - DRC1 Ctrl 5 */
+	{ 0x00000e88, 0x0018 }, /* R3720 (0xe88) - DRC2 Ctrl 1 */
+	{ 0x00000e89, 0x0933 }, /* R3721 (0xe89) - DRC2 Ctrl 2 */
+	{ 0x00000e8a, 0x0018 }, /* R3722 (0xe8a) - DRC2 Ctrl 3 */
+	{ 0x00000e8b, 0x0000 }, /* R3723 (0xe8b) - DRC2 Ctrl 4 */
+	{ 0x00000e8c, 0x0000 }, /* R3724 (0xe8c) - DRC2 Ctrl 5 */
+	{ 0x00000ec0, 0x0000 }, /* R3776 (0xec0) - HPLPF1 1 */
+	{ 0x00000ec1, 0x0000 }, /* R3777 (0xec1) - HPLPF1 2 */
+	{ 0x00000ec4, 0x0000 }, /* R3780 (0xec4) - HPLPF2 1 */
+	{ 0x00000ec5, 0x0000 }, /* R3781 (0xec5) - HPLPF2 2 */
+	{ 0x00000ec8, 0x0000 }, /* R3784 (0xec8) - HPLPF3 1 */
+	{ 0x00000ec9, 0x0000 }, /* R3785 (0xec9) - HPLPF3 2 */
+	{ 0x00000ecc, 0x0000 }, /* R3788 (0xecc) - HPLPF4 1 */
+	{ 0x00000ecd, 0x0000 }, /* R3789 (0xecd) - HPLPF4 2 */
+	{ 0x00000ee0, 0x0000 }, /* R3808 (0xee0) - ASRC1 Enable */
+	{ 0x00000ee2, 0x0000 }, /* R3810 (0xee2) - ASRC1 Rate 1 */
+	{ 0x00000ee3, 0x4000 }, /* R3811 (0xee3) - ASRC1 Rate 2 */
+	{ 0x00000ef0, 0x0000 }, /* R3824 (0xef0) - ISRC1 Ctrl 1 */
+	{ 0x00000ef1, 0x0001 }, /* R3825 (0xef1) - ISRC1 Ctrl 2 */
+	{ 0x00000ef2, 0x0000 }, /* R3826 (0xef2) - ISRC1 Ctrl 3 */
+	{ 0x00000ef3, 0x0000 }, /* R3827 (0xef3) - ISRC2 Ctrl 1 */
+	{ 0x00000ef4, 0x0001 }, /* R3828 (0xef4) - ISRC2 Ctrl 2 */
+	{ 0x00000ef5, 0x0000 }, /* R3829 (0xef5) - ISRC2 Ctrl 3 */
+	{ 0x000010c0, 0x0008 }, /* R4288 (0x10c0) - AUXPDM1 Ctrl 0 */
+	{ 0x000010c1, 0x4000 }, /* R4289 (0x10c1) - AUXPDM1 Ctrl 1 */
+	{ 0x00001480, 0x0000 }, /* R5248 (0x1480) - DFC1 Ctrl W0 */
+	{ 0x00001482, 0x1f00 }, /* R5250 (0x1482) - DFC1 Rx W0 */
+	{ 0x00001484, 0x1f00 }, /* R5252 (0x1484) - DFC1 Tx W0 */
+	{ 0x00001486, 0x0000 }, /* R5254 (0x1486) - DFC2 Ctrl W0 */
+	{ 0x00001488, 0x1f00 }, /* R5256 (0x1488) - DFC2 Rx W0 */
+	{ 0x0000148a, 0x1f00 }, /* R5258 (0x148a) - DFC2 Tx W0 */
+	{ 0x0000148c, 0x0000 }, /* R5260 (0x148c) - DFC3 Ctrl W0 */
+	{ 0x0000148e, 0x1f00 }, /* R5262 (0x148e) - DFC3 Rx W0 */
+	{ 0x00001490, 0x1f00 }, /* R5264 (0x1490) - DFC3 Tx W0 */
+	{ 0x00001492, 0x0000 }, /* R5266 (0x1492) - DFC4 Ctrl W0 */
+	{ 0x00001494, 0x1f00 }, /* R5268 (0x1494) - DFC4 Rx W0 */
+	{ 0x00001496, 0x1f00 }, /* R5270 (0x1496) - DFC4 Tx W0 */
+	{ 0x00001498, 0x0000 }, /* R5272 (0x1498) - DFC5 Ctrl W0 */
+	{ 0x0000149a, 0x1f00 }, /* R5274 (0x149a) - DFC5 Rx W0 */
+	{ 0x0000149c, 0x1f00 }, /* R5276 (0x149c) - DFC5 Tx W0 */
+	{ 0x0000149e, 0x0000 }, /* R5278 (0x149e) - DFC6 Ctrl W0 */
+	{ 0x000014a0, 0x1f00 }, /* R5280 (0x14a0) - DFC6 Rx W0 */
+	{ 0x000014a2, 0x1f00 }, /* R5282 (0x14a2) - DFC6 Tx W0 */
+	{ 0x000014a4, 0x0000 }, /* R5284 (0x14a4) - DFC7 Ctrl W0 */
+	{ 0x000014a6, 0x1f00 }, /* R5286 (0x14a6) - DFC7 Rx W0 */
+	{ 0x000014a8, 0x1f00 }, /* R5288 (0x14a8) - DFC7 Tx W0 */
+	{ 0x000014aa, 0x0000 }, /* R5290 (0x14aa) - DFC8 Ctrl W0 */
+	{ 0x000014ac, 0x1f00 }, /* R5292 (0x14ac) - DFC8 Rx W0 */
+	{ 0x000014ae, 0x1f00 }, /* R5294 (0x14ae) - DFC8 Tx W0 */
+	{ 0x00001700, 0x2001 }, /* R5888 (0x1700) - GPIO1 Ctrl 1 */
+	{ 0x00001701, 0xf000 }, /* R5889 (0x1701) - GPIO1 Ctrl 2 */
+	{ 0x00001702, 0x2001 }, /* R5890 (0x1702) - GPIO2 Ctrl 1 */
+	{ 0x00001703, 0xf000 }, /* R5891 (0x1703) - GPIO2 Ctrl 2 */
+	{ 0x00001704, 0x2001 }, /* R5892 (0x1704) - GPIO3 Ctrl 1 */
+	{ 0x00001705, 0xf000 }, /* R5893 (0x1705) - GPIO3 Ctrl 2 */
+	{ 0x00001706, 0x2001 }, /* R5894 (0x1706) - GPIO4 Ctrl 1 */
+	{ 0x00001707, 0xf000 }, /* R5895 (0x1707) - GPIO4 Ctrl 2 */
+	{ 0x00001708, 0x2001 }, /* R5896 (0x1708) - GPIO5 Ctrl 1 */
+	{ 0x00001709, 0xf000 }, /* R5897 (0x1709) - GPIO5 Ctrl 2 */
+	{ 0x0000170a, 0x2001 }, /* R5898 (0x170a) - GPIO6 Ctrl 1 */
+	{ 0x0000170b, 0xf000 }, /* R5899 (0x170b) - GPIO6 Ctrl 2 */
+	{ 0x0000170c, 0x2001 }, /* R5900 (0x170c) - GPIO7 Ctrl 1 */
+	{ 0x0000170d, 0xf000 }, /* R5901 (0x170d) - GPIO7 Ctrl 2 */
+	{ 0x0000170e, 0x2001 }, /* R5902 (0x170e) - GPIO8 Ctrl 1 */
+	{ 0x0000170f, 0xf000 }, /* R5903 (0x170f) - GPIO8 Ctrl 2 */
+	{ 0x00001710, 0x2001 }, /* R5904 (0x1710) - GPIO9 Ctrl 1 */
+	{ 0x00001711, 0xf000 }, /* R5905 (0x1711) - GPIO9 Ctrl 2 */
+	{ 0x00001712, 0x2001 }, /* R5906 (0x1712) - GPIO10 Ctrl 1 */
+	{ 0x00001713, 0xf000 }, /* R5907 (0x1713) - GPIO10 Ctrl 2 */
+	{ 0x00001714, 0x2001 }, /* R5908 (0x1714) - GPIO11 Ctrl 1 */
+	{ 0x00001715, 0xf000 }, /* R5909 (0x1715) - GPIO11 Ctrl 2 */
+	{ 0x00001716, 0x2001 }, /* R5910 (0x1716) - GPIO12 Ctrl 1 */
+	{ 0x00001717, 0xf000 }, /* R5911 (0x1717) - GPIO12 Ctrl 2 */
+	{ 0x00001718, 0x2001 }, /* R5912 (0x1718) - GPIO13 Ctrl 1 */
+	{ 0x00001719, 0xf000 }, /* R5913 (0x1719) - GPIO13 Ctrl 2 */
+	{ 0x0000171a, 0x2001 }, /* R5914 (0x171a) - GPIO14 Ctrl 1 */
+	{ 0x0000171b, 0xf000 }, /* R5915 (0x171b) - GPIO14 Ctrl 2 */
+	{ 0x0000171c, 0x2001 }, /* R5916 (0x171c) - GPIO15 Ctrl 1 */
+	{ 0x0000171d, 0xf000 }, /* R5917 (0x171d) - GPIO15 Ctrl 2 */
+	{ 0x0000171e, 0x2001 }, /* R5918 (0x171e) - GPIO16 Ctrl 1 */
+	{ 0x0000171f, 0xf000 }, /* R5919 (0x171f) - GPIO16 Ctrl 2 */
+	{ 0x00001840, 0x1200 }, /* R6208 (0x1840) - IRQ1 Mask 1 */
+	{ 0x00001841, 0x77e0 }, /* R6209 (0x1841) - IRQ1 Mask 2 */
+	{ 0x00001842, 0xffff }, /* R6210 (0x1842) - IRQ1 Mask 3 */
+	{ 0x00001843, 0xffff }, /* R6211 (0x1843) - IRQ1 Mask 4 */
+	{ 0x00001844, 0xffff }, /* R6212 (0x1844) - IRQ1 Mask 5 */
+	{ 0x00001845, 0x0301 }, /* R6213 (0x1845) - IRQ1 Mask 6 */
+	{ 0x00001846, 0x0f3f }, /* R6214 (0x1846) - IRQ1 Mask 7 */
+	{ 0x00001847, 0xffff }, /* R6215 (0x1847) - IRQ1 Mask 8 */
+	{ 0x00001848, 0x031f }, /* R6216 (0x1848) - IRQ1 Mask 9 */
+	{ 0x00001849, 0x031f }, /* R6217 (0x1849) - IRQ1 Mask 10 */
+	{ 0x0000184a, 0xffff }, /* R6218 (0x184a) - IRQ1 Mask 11 */
+	{ 0x0000184b, 0x033f }, /* R6219 (0x184b) - IRQ1 Mask 12 */
+	{ 0x0000184c, 0x003f }, /* R6220 (0x184c) - IRQ1 Mask 13 */
+	{ 0x0000184d, 0x003f }, /* R6221 (0x184d) - IRQ1 Mask 14 */
+	{ 0x0000184e, 0x1000 }, /* R6222 (0x184e) - IRQ1 Mask 15 */
+	{ 0x0000184f, 0xffff }, /* R6223 (0x184f) - IRQ1 Mask 16 */
+	{ 0x00001850, 0xffff }, /* R6224 (0x1850) - IRQ1 Mask 17 */
+	{ 0x00001851, 0xffff }, /* R6225 (0x1851) - IRQ1 Mask 18 */
+	{ 0x00001852, 0xffff }, /* R6226 (0x1852) - IRQ1 Mask 19 */
+	{ 0x00001853, 0xffff }, /* R6227 (0x1853) - IRQ1 Mask 20 */
+	{ 0x00001854, 0x0001 }, /* R6228 (0x1854) - IRQ1 Mask 21 */
+	{ 0x00001855, 0x0001 }, /* R6229 (0x1855) - IRQ1 Mask 22 */
+	{ 0x00001856, 0x0001 }, /* R6230 (0x1856) - IRQ1 Mask 23 */
+	{ 0x00001857, 0x0001 }, /* R6231 (0x1857) - IRQ1 Mask 24 */
+	{ 0x00001858, 0x0001 }, /* R6232 (0x1858) - IRQ1 Mask 25 */
+	{ 0x00001859, 0xffff }, /* R6233 (0x1859) - IRQ1 Mask 26 */
+	{ 0x0000185a, 0x0001 }, /* R6234 (0x185a) - IRQ1 Mask 27 */
+	{ 0x0000185b, 0x0001 }, /* R6235 (0x185b) - IRQ1 Mask 28 */
+	{ 0x0000185c, 0xffff }, /* R6236 (0x185c) - IRQ1 Mask 29 */
+	{ 0x0000185d, 0x0001 }, /* R6237 (0x185d) - IRQ1 Mask 30 */
+	{ 0x0000185e, 0xffff }, /* R6238 (0x185e) - IRQ1 Mask 31 */
+	{ 0x0000185f, 0xffff }, /* R6239 (0x185f) - IRQ1 Mask 32 */
+	{ 0x00001860, 0x0001 }, /* R6240 (0x1860) - IRQ1 Mask 33 */
+	{ 0x00001948, 0x031f }, /* R6472 (0x1948) - IRQ2 Mask 9 */
+	{ 0x00001a06, 0x0000 }, /* R6662 (0x1a06) - Interrupt Debounce 7 */
+	{ 0x00001a80, 0x4400 }, /* R6784 (0x1a80) - IRQ1 Ctrl */
+};
+
+static bool cs47l92_is_adsp_memory(unsigned int reg)
+{
+	switch (reg) {
+	case 0x080000 ... 0x082ffe:
+	case 0x0a0000 ... 0x0a1ffe:
+	case 0x0c0000 ... 0x0c1ffe:
+	case 0x0e0000 ... 0x0e1ffe:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool cs47l92_16bit_readable_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_SOFTWARE_RESET:
+	case MADERA_HARDWARE_REVISION:
+	case MADERA_WRITE_SEQUENCER_CTRL_0 ... MADERA_WRITE_SEQUENCER_CTRL_2:
+	case MADERA_TONE_GENERATOR_1 ... MADERA_TONE_GENERATOR_5:
+	case MADERA_PWM_DRIVE_1 ... MADERA_PWM_DRIVE_3:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_1:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_2:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_3:
+	case MADERA_SAMPLE_RATE_SEQUENCE_SELECT_4:
+	case MADERA_HAPTICS_CONTROL_1 ... MADERA_HAPTICS_CONTROL_2:
+	case MADERA_HAPTICS_PHASE_1_INTENSITY:
+	case MADERA_HAPTICS_PHASE_1_DURATION:
+	case MADERA_HAPTICS_PHASE_2_INTENSITY:
+	case MADERA_HAPTICS_PHASE_2_DURATION:
+	case MADERA_HAPTICS_PHASE_3_INTENSITY:
+	case MADERA_HAPTICS_PHASE_3_DURATION:
+	case MADERA_HAPTICS_STATUS:
+	case MADERA_COMFORT_NOISE_GENERATOR:
+	case MADERA_CLOCK_32K_1:
+	case MADERA_SYSTEM_CLOCK_1:
+	case MADERA_SAMPLE_RATE_1 ... MADERA_SAMPLE_RATE_3:
+	case MADERA_SAMPLE_RATE_1_STATUS:
+	case MADERA_SAMPLE_RATE_2_STATUS:
+	case MADERA_SAMPLE_RATE_3_STATUS:
+	case MADERA_ASYNC_CLOCK_1:
+	case MADERA_ASYNC_SAMPLE_RATE_1:
+	case MADERA_ASYNC_SAMPLE_RATE_1_STATUS:
+	case MADERA_ASYNC_SAMPLE_RATE_2:
+	case MADERA_ASYNC_SAMPLE_RATE_2_STATUS:
+	case MADERA_DSP_CLOCK_1:
+	case MADERA_DSP_CLOCK_2:
+	case MADERA_OUTPUT_SYSTEM_CLOCK:
+	case MADERA_OUTPUT_ASYNC_CLOCK:
+	case MADERA_RATE_ESTIMATOR_1 ...  MADERA_RATE_ESTIMATOR_5:
+	case MADERA_FLL1_CONTROL_1 ... MADERA_FLL1_CONTROL_6:
+	case CS47L92_FLL1_CONTROL_7 ...  CS47L92_FLL1_CONTROL_10:
+	case MADERA_FLL1_CONTROL_11:
+	case MADERA_FLL1_DIGITAL_TEST_1:
+	case MADERA_FLL1_SYNCHRONISER_1 ... MADERA_FLL1_SYNCHRONISER_6:
+	case CS47L92_FLL1_GPIO_CLOCK:
+	case MADERA_FLL2_CONTROL_1 ... MADERA_FLL2_CONTROL_6:
+	case CS47L92_FLL2_CONTROL_7 ... CS47L92_FLL2_CONTROL_10:
+	case MADERA_FLL2_CONTROL_11:
+	case MADERA_FLL2_DIGITAL_TEST_1:
+	case MADERA_FLL2_SYNCHRONISER_1 ... MADERA_FLL2_SYNCHRONISER_6:
+	case CS47L92_FLL2_GPIO_CLOCK:
+	case MADERA_MIC_CHARGE_PUMP_1:
+	case MADERA_LDO2_CONTROL_1:
+	case MADERA_MIC_BIAS_CTRL_1:
+	case MADERA_MIC_BIAS_CTRL_2:
+	case MADERA_MIC_BIAS_CTRL_5:
+	case MADERA_MIC_BIAS_CTRL_6:
+	case MADERA_HP_CTRL_1L:
+	case MADERA_HP_CTRL_1R:
+	case MADERA_HP_CTRL_2L:
+	case MADERA_HP_CTRL_2R:
+	case MADERA_HP_CTRL_3L:
+	case MADERA_HP_CTRL_3R:
+	case MADERA_ACCESSORY_DETECT_MODE_1:
+	case MADERA_HEADPHONE_DETECT_0:
+	case MADERA_HEADPHONE_DETECT_1:
+	case MADERA_HEADPHONE_DETECT_2:
+	case MADERA_HEADPHONE_DETECT_3:
+	case MADERA_HEADPHONE_DETECT_5:
+	case MADERA_MICD_CLAMP_CONTROL:
+	case MADERA_MIC_DETECT_1_CONTROL_0:
+	case MADERA_MIC_DETECT_1_CONTROL_1:
+	case MADERA_MIC_DETECT_1_CONTROL_2:
+	case MADERA_MIC_DETECT_1_CONTROL_3:
+	case MADERA_MIC_DETECT_1_CONTROL_4:
+	case MADERA_MIC_DETECT_1_LEVEL_1 ... MADERA_MIC_DETECT_1_LEVEL_4:
+	case MADERA_MIC_DETECT_2_CONTROL_0:
+	case MADERA_MIC_DETECT_2_CONTROL_1:
+	case MADERA_MIC_DETECT_2_CONTROL_2:
+	case MADERA_MIC_DETECT_2_CONTROL_3:
+	case MADERA_MIC_DETECT_2_CONTROL_4:
+	case MADERA_MIC_DETECT_2_LEVEL_1 ... MADERA_MIC_DETECT_2_LEVEL_4:
+	case MADERA_GP_SWITCH_1:
+	case MADERA_JACK_DETECT_ANALOGUE:
+	case MADERA_INPUT_ENABLES:
+	case MADERA_INPUT_ENABLES_STATUS:
+	case MADERA_INPUT_RATE:
+	case MADERA_INPUT_VOLUME_RAMP:
+	case MADERA_HPF_CONTROL:
+	case MADERA_IN1L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_1L:
+	case MADERA_DMIC1L_CONTROL:
+	case MADERA_IN1L_RATE_CONTROL:
+	case MADERA_IN1R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_1R:
+	case MADERA_DMIC1R_CONTROL:
+	case MADERA_IN1R_RATE_CONTROL:
+	case MADERA_IN2L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_2L:
+	case MADERA_DMIC2L_CONTROL:
+	case MADERA_IN2L_RATE_CONTROL:
+	case MADERA_IN2R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_2R:
+	case MADERA_DMIC2R_CONTROL:
+	case MADERA_IN2R_RATE_CONTROL:
+	case MADERA_IN3L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_3L:
+	case MADERA_DMIC3L_CONTROL:
+	case MADERA_IN3L_RATE_CONTROL:
+	case MADERA_IN3R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_3R:
+	case MADERA_DMIC3R_CONTROL:
+	case MADERA_IN3R_RATE_CONTROL:
+	case MADERA_IN4L_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_4L:
+	case MADERA_DMIC4L_CONTROL:
+	case MADERA_IN4L_RATE_CONTROL:
+	case MADERA_IN4R_CONTROL:
+	case MADERA_ADC_DIGITAL_VOLUME_4R:
+	case MADERA_DMIC4R_CONTROL:
+	case MADERA_IN4R_RATE_CONTROL:
+	case MADERA_OUTPUT_ENABLES_1:
+	case MADERA_OUTPUT_STATUS_1:
+	case MADERA_RAW_OUTPUT_STATUS_1:
+	case MADERA_OUTPUT_RATE_1:
+	case MADERA_OUTPUT_VOLUME_RAMP:
+	case MADERA_OUTPUT_PATH_CONFIG_1L:
+	case MADERA_DAC_DIGITAL_VOLUME_1L:
+	case MADERA_OUTPUT_PATH_CONFIG_1:
+	case MADERA_NOISE_GATE_SELECT_1L:
+	case MADERA_OUTPUT_PATH_CONFIG_1R:
+	case MADERA_DAC_DIGITAL_VOLUME_1R:
+	case MADERA_NOISE_GATE_SELECT_1R:
+	case MADERA_OUTPUT_PATH_CONFIG_2L:
+	case MADERA_DAC_DIGITAL_VOLUME_2L:
+	case MADERA_OUTPUT_PATH_CONFIG_2:
+	case MADERA_NOISE_GATE_SELECT_2L:
+	case MADERA_OUTPUT_PATH_CONFIG_2R:
+	case MADERA_DAC_DIGITAL_VOLUME_2R:
+	case MADERA_NOISE_GATE_SELECT_2R:
+	case MADERA_OUTPUT_PATH_CONFIG_3L:
+	case MADERA_DAC_DIGITAL_VOLUME_3L:
+	case MADERA_OUTPUT_PATH_CONFIG_3:
+	case MADERA_NOISE_GATE_SELECT_3L:
+	case MADERA_OUTPUT_PATH_CONFIG_3R:
+	case MADERA_DAC_DIGITAL_VOLUME_3R:
+	case MADERA_NOISE_GATE_SELECT_3R:
+	case MADERA_OUTPUT_PATH_CONFIG_5L:
+	case MADERA_DAC_DIGITAL_VOLUME_5L:
+	case MADERA_NOISE_GATE_SELECT_5L:
+	case MADERA_OUTPUT_PATH_CONFIG_5R:
+	case MADERA_DAC_DIGITAL_VOLUME_5R:
+	case MADERA_NOISE_GATE_SELECT_5R:
+	case MADERA_DAC_AEC_CONTROL_1 ...  MADERA_DAC_AEC_CONTROL_2:
+	case MADERA_NOISE_GATE_CONTROL:
+	case MADERA_PDM_SPK1_CTRL_1 ... MADERA_PDM_SPK1_CTRL_2:
+	case MADERA_HP1_SHORT_CIRCUIT_CTRL:
+	case MADERA_HP2_SHORT_CIRCUIT_CTRL:
+	case MADERA_HP3_SHORT_CIRCUIT_CTRL:
+	case MADERA_AIF1_BCLK_CTRL:
+	case MADERA_AIF1_TX_PIN_CTRL:
+	case MADERA_AIF1_RX_PIN_CTRL:
+	case MADERA_AIF1_RATE_CTRL:
+	case MADERA_AIF1_FORMAT:
+	case MADERA_AIF1_RX_BCLK_RATE:
+	case MADERA_AIF1_FRAME_CTRL_1 ... MADERA_AIF1_FRAME_CTRL_18:
+	case MADERA_AIF1_TX_ENABLES:
+	case MADERA_AIF1_RX_ENABLES:
+	case MADERA_AIF2_BCLK_CTRL:
+	case MADERA_AIF2_TX_PIN_CTRL:
+	case MADERA_AIF2_RX_PIN_CTRL:
+	case MADERA_AIF2_RATE_CTRL:
+	case MADERA_AIF2_FORMAT:
+	case MADERA_AIF2_RX_BCLK_RATE:
+	case MADERA_AIF2_FRAME_CTRL_1 ... MADERA_AIF2_FRAME_CTRL_18:
+	case MADERA_AIF2_TX_ENABLES:
+	case MADERA_AIF2_RX_ENABLES:
+	case MADERA_AIF3_BCLK_CTRL:
+	case MADERA_AIF3_TX_PIN_CTRL:
+	case MADERA_AIF3_RX_PIN_CTRL:
+	case MADERA_AIF3_RATE_CTRL:
+	case MADERA_AIF3_FORMAT:
+	case MADERA_AIF3_RX_BCLK_RATE:
+	case MADERA_AIF3_FRAME_CTRL_1 ... MADERA_AIF3_FRAME_CTRL_18:
+	case MADERA_AIF3_TX_ENABLES:
+	case MADERA_AIF3_RX_ENABLES:
+	case MADERA_SPD1_TX_CONTROL:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_2:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_3:
+	case MADERA_SLIMBUS_FRAMER_REF_GEAR:
+	case MADERA_SLIMBUS_RATES_1 ... MADERA_SLIMBUS_RATES_8:
+	case MADERA_SLIMBUS_RX_CHANNEL_ENABLE:
+	case MADERA_SLIMBUS_TX_CHANNEL_ENABLE:
+	case MADERA_SLIMBUS_RX_PORT_STATUS:
+	case MADERA_SLIMBUS_TX_PORT_STATUS:
+	case MADERA_PWM1MIX_INPUT_1_SOURCE:
+	case MADERA_PWM1MIX_INPUT_1_VOLUME:
+	case MADERA_PWM1MIX_INPUT_2_SOURCE:
+	case MADERA_PWM1MIX_INPUT_2_VOLUME:
+	case MADERA_PWM1MIX_INPUT_3_SOURCE:
+	case MADERA_PWM1MIX_INPUT_3_VOLUME:
+	case MADERA_PWM1MIX_INPUT_4_SOURCE:
+	case MADERA_PWM1MIX_INPUT_4_VOLUME:
+	case MADERA_PWM2MIX_INPUT_1_SOURCE:
+	case MADERA_PWM2MIX_INPUT_1_VOLUME:
+	case MADERA_PWM2MIX_INPUT_2_SOURCE:
+	case MADERA_PWM2MIX_INPUT_2_VOLUME:
+	case MADERA_PWM2MIX_INPUT_3_SOURCE:
+	case MADERA_PWM2MIX_INPUT_3_VOLUME:
+	case MADERA_PWM2MIX_INPUT_4_SOURCE:
+	case MADERA_PWM2MIX_INPUT_4_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT1LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT1LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT1RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT1RMIX_INPUT_4_VOLUME:
+	case MADERA_OUT2LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT2LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT2LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT2LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT2LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT2LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT2LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT2LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT2RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT2RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT2RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT2RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT2RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT2RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT2RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT2RMIX_INPUT_4_VOLUME:
+	case MADERA_OUT3LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT3LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT3LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT3LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT3LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT3LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT3LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT3LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT3RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT3RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT3RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT3RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT3RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT3RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT3RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT3RMIX_INPUT_4_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_1_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_1_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_2_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_2_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_3_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_3_VOLUME:
+	case MADERA_OUT5LMIX_INPUT_4_SOURCE:
+	case MADERA_OUT5LMIX_INPUT_4_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_1_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_1_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_2_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_2_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_3_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_3_VOLUME:
+	case MADERA_OUT5RMIX_INPUT_4_SOURCE:
+	case MADERA_OUT5RMIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX2MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX3MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX3MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX4MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX4MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX5MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX5MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX6MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX6MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX7MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX7MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX7MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX7MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX7MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX7MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX7MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX7MIX_INPUT_4_VOLUME:
+	case MADERA_AIF1TX8MIX_INPUT_1_SOURCE:
+	case MADERA_AIF1TX8MIX_INPUT_1_VOLUME:
+	case MADERA_AIF1TX8MIX_INPUT_2_SOURCE:
+	case MADERA_AIF1TX8MIX_INPUT_2_VOLUME:
+	case MADERA_AIF1TX8MIX_INPUT_3_SOURCE:
+	case MADERA_AIF1TX8MIX_INPUT_3_VOLUME:
+	case MADERA_AIF1TX8MIX_INPUT_4_SOURCE:
+	case MADERA_AIF1TX8MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX2MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX3MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX3MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX4MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX4MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX5MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX5MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX5MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX5MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX5MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX5MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX5MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX5MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX6MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX6MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX6MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX6MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX6MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX6MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX6MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX6MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX7MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX7MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX7MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX7MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX7MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX7MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX7MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX7MIX_INPUT_4_VOLUME:
+	case MADERA_AIF2TX8MIX_INPUT_1_SOURCE:
+	case MADERA_AIF2TX8MIX_INPUT_1_VOLUME:
+	case MADERA_AIF2TX8MIX_INPUT_2_SOURCE:
+	case MADERA_AIF2TX8MIX_INPUT_2_VOLUME:
+	case MADERA_AIF2TX8MIX_INPUT_3_SOURCE:
+	case MADERA_AIF2TX8MIX_INPUT_3_VOLUME:
+	case MADERA_AIF2TX8MIX_INPUT_4_SOURCE:
+	case MADERA_AIF2TX8MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX1MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX1MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX2MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX2MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX3MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX3MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX3MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX3MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX3MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX3MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX3MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX3MIX_INPUT_4_VOLUME:
+	case MADERA_AIF3TX4MIX_INPUT_1_SOURCE:
+	case MADERA_AIF3TX4MIX_INPUT_1_VOLUME:
+	case MADERA_AIF3TX4MIX_INPUT_2_SOURCE:
+	case MADERA_AIF3TX4MIX_INPUT_2_VOLUME:
+	case MADERA_AIF3TX4MIX_INPUT_3_SOURCE:
+	case MADERA_AIF3TX4MIX_INPUT_3_VOLUME:
+	case MADERA_AIF3TX4MIX_INPUT_4_SOURCE:
+	case MADERA_AIF3TX4MIX_INPUT_4_VOLUME:
+	case CS47L92_AIF3TX5MIX_INPUT_1_SOURCE:
+	case CS47L92_AIF3TX5MIX_INPUT_1_VOLUME:
+	case CS47L92_AIF3TX5MIX_INPUT_2_SOURCE:
+	case CS47L92_AIF3TX5MIX_INPUT_2_VOLUME:
+	case CS47L92_AIF3TX5MIX_INPUT_3_SOURCE:
+	case CS47L92_AIF3TX5MIX_INPUT_3_VOLUME:
+	case CS47L92_AIF3TX5MIX_INPUT_4_SOURCE:
+	case CS47L92_AIF3TX5MIX_INPUT_4_VOLUME:
+	case CS47L92_AIF3TX6MIX_INPUT_1_SOURCE:
+	case CS47L92_AIF3TX6MIX_INPUT_1_VOLUME:
+	case CS47L92_AIF3TX6MIX_INPUT_2_SOURCE:
+	case CS47L92_AIF3TX6MIX_INPUT_2_VOLUME:
+	case CS47L92_AIF3TX6MIX_INPUT_3_SOURCE:
+	case CS47L92_AIF3TX6MIX_INPUT_3_VOLUME:
+	case CS47L92_AIF3TX6MIX_INPUT_4_SOURCE:
+	case CS47L92_AIF3TX6MIX_INPUT_4_VOLUME:
+	case CS47L92_AIF3TX7MIX_INPUT_1_SOURCE:
+	case CS47L92_AIF3TX7MIX_INPUT_1_VOLUME:
+	case CS47L92_AIF3TX7MIX_INPUT_2_SOURCE:
+	case CS47L92_AIF3TX7MIX_INPUT_2_VOLUME:
+	case CS47L92_AIF3TX7MIX_INPUT_3_SOURCE:
+	case CS47L92_AIF3TX7MIX_INPUT_3_VOLUME:
+	case CS47L92_AIF3TX7MIX_INPUT_4_SOURCE:
+	case CS47L92_AIF3TX7MIX_INPUT_4_VOLUME:
+	case CS47L92_AIF3TX8MIX_INPUT_1_SOURCE:
+	case CS47L92_AIF3TX8MIX_INPUT_1_VOLUME:
+	case CS47L92_AIF3TX8MIX_INPUT_2_SOURCE:
+	case CS47L92_AIF3TX8MIX_INPUT_2_VOLUME:
+	case CS47L92_AIF3TX8MIX_INPUT_3_SOURCE:
+	case CS47L92_AIF3TX8MIX_INPUT_3_VOLUME:
+	case CS47L92_AIF3TX8MIX_INPUT_4_SOURCE:
+	case CS47L92_AIF3TX8MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX1MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX1MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX1MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX1MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX1MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX1MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX1MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX1MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX2MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX2MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX2MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX2MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX2MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX2MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX2MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX2MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX3MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX3MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX3MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX3MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX3MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX3MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX3MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX3MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX4MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX4MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX4MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX4MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX4MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX4MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX4MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX4MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX5MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX5MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX5MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX5MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX5MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX5MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX5MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX5MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX6MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX6MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX6MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX6MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX6MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX6MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX6MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX6MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX7MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX7MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX7MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX7MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX7MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX7MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX7MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX7MIX_INPUT_4_VOLUME:
+	case MADERA_SLIMTX8MIX_INPUT_1_SOURCE:
+	case MADERA_SLIMTX8MIX_INPUT_1_VOLUME:
+	case MADERA_SLIMTX8MIX_INPUT_2_SOURCE:
+	case MADERA_SLIMTX8MIX_INPUT_2_VOLUME:
+	case MADERA_SLIMTX8MIX_INPUT_3_SOURCE:
+	case MADERA_SLIMTX8MIX_INPUT_3_VOLUME:
+	case MADERA_SLIMTX8MIX_INPUT_4_SOURCE:
+	case MADERA_SLIMTX8MIX_INPUT_4_VOLUME:
+	case MADERA_SPDIF1TX1MIX_INPUT_1_SOURCE:
+	case MADERA_SPDIF1TX1MIX_INPUT_1_VOLUME:
+	case MADERA_SPDIF1TX2MIX_INPUT_1_SOURCE:
+	case MADERA_SPDIF1TX2MIX_INPUT_1_VOLUME:
+	case MADERA_EQ1MIX_INPUT_1_SOURCE:
+	case MADERA_EQ1MIX_INPUT_1_VOLUME:
+	case MADERA_EQ1MIX_INPUT_2_SOURCE:
+	case MADERA_EQ1MIX_INPUT_2_VOLUME:
+	case MADERA_EQ1MIX_INPUT_3_SOURCE:
+	case MADERA_EQ1MIX_INPUT_3_VOLUME:
+	case MADERA_EQ1MIX_INPUT_4_SOURCE:
+	case MADERA_EQ1MIX_INPUT_4_VOLUME:
+	case MADERA_EQ2MIX_INPUT_1_SOURCE:
+	case MADERA_EQ2MIX_INPUT_1_VOLUME:
+	case MADERA_EQ2MIX_INPUT_2_SOURCE:
+	case MADERA_EQ2MIX_INPUT_2_VOLUME:
+	case MADERA_EQ2MIX_INPUT_3_SOURCE:
+	case MADERA_EQ2MIX_INPUT_3_VOLUME:
+	case MADERA_EQ2MIX_INPUT_4_SOURCE:
+	case MADERA_EQ2MIX_INPUT_4_VOLUME:
+	case MADERA_EQ3MIX_INPUT_1_SOURCE:
+	case MADERA_EQ3MIX_INPUT_1_VOLUME:
+	case MADERA_EQ3MIX_INPUT_2_SOURCE:
+	case MADERA_EQ3MIX_INPUT_2_VOLUME:
+	case MADERA_EQ3MIX_INPUT_3_SOURCE:
+	case MADERA_EQ3MIX_INPUT_3_VOLUME:
+	case MADERA_EQ3MIX_INPUT_4_SOURCE:
+	case MADERA_EQ3MIX_INPUT_4_VOLUME:
+	case MADERA_EQ4MIX_INPUT_1_SOURCE:
+	case MADERA_EQ4MIX_INPUT_1_VOLUME:
+	case MADERA_EQ4MIX_INPUT_2_SOURCE:
+	case MADERA_EQ4MIX_INPUT_2_VOLUME:
+	case MADERA_EQ4MIX_INPUT_3_SOURCE:
+	case MADERA_EQ4MIX_INPUT_3_VOLUME:
+	case MADERA_EQ4MIX_INPUT_4_SOURCE:
+	case MADERA_EQ4MIX_INPUT_4_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_1_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_1_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_2_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_2_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_3_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_3_VOLUME:
+	case MADERA_DRC1LMIX_INPUT_4_SOURCE:
+	case MADERA_DRC1LMIX_INPUT_4_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_1_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_1_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_2_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_2_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_3_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_3_VOLUME:
+	case MADERA_DRC1RMIX_INPUT_4_SOURCE:
+	case MADERA_DRC1RMIX_INPUT_4_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_1_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_1_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_2_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_2_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_3_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_3_VOLUME:
+	case MADERA_DRC2LMIX_INPUT_4_SOURCE:
+	case MADERA_DRC2LMIX_INPUT_4_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_1_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_1_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_2_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_2_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_3_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_3_VOLUME:
+	case MADERA_DRC2RMIX_INPUT_4_SOURCE:
+	case MADERA_DRC2RMIX_INPUT_4_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP1MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP1MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP2MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP2MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP3MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP3MIX_INPUT_4_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_1_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_1_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_2_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_2_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_3_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_3_VOLUME:
+	case MADERA_HPLP4MIX_INPUT_4_SOURCE:
+	case MADERA_HPLP4MIX_INPUT_4_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_1_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_1_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_2_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_2_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_3_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_3_VOLUME:
+	case MADERA_DSP1LMIX_INPUT_4_SOURCE:
+	case MADERA_DSP1LMIX_INPUT_4_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_1_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_1_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_2_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_2_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_3_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_3_VOLUME:
+	case MADERA_DSP1RMIX_INPUT_4_SOURCE:
+	case MADERA_DSP1RMIX_INPUT_4_VOLUME:
+	case MADERA_DSP1AUX1MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX2MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX3MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX4MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX5MIX_INPUT_1_SOURCE:
+	case MADERA_DSP1AUX6MIX_INPUT_1_SOURCE:
+	case MADERA_ASRC1_1LMIX_INPUT_1_SOURCE:
+	case MADERA_ASRC1_1RMIX_INPUT_1_SOURCE:
+	case MADERA_ASRC1_2LMIX_INPUT_1_SOURCE:
+	case MADERA_ASRC1_2RMIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1DEC2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC1INT2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2DEC2MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT1MIX_INPUT_1_SOURCE:
+	case MADERA_ISRC2INT2MIX_INPUT_1_SOURCE:
+	case MADERA_DFC1MIX_INPUT_1_SOURCE:
+	case MADERA_DFC2MIX_INPUT_1_SOURCE:
+	case MADERA_DFC3MIX_INPUT_1_SOURCE:
+	case MADERA_DFC4MIX_INPUT_1_SOURCE:
+	case MADERA_DFC5MIX_INPUT_1_SOURCE:
+	case MADERA_DFC6MIX_INPUT_1_SOURCE:
+	case MADERA_DFC7MIX_INPUT_1_SOURCE:
+	case MADERA_DFC8MIX_INPUT_1_SOURCE:
+	case MADERA_FX_CTRL1 ... MADERA_FX_CTRL2:
+	case MADERA_EQ1_1 ... MADERA_EQ1_21:
+	case MADERA_EQ2_1 ... MADERA_EQ2_21:
+	case MADERA_EQ3_1 ... MADERA_EQ3_21:
+	case MADERA_EQ4_1 ... MADERA_EQ4_21:
+	case MADERA_DRC1_CTRL1 ... MADERA_DRC1_CTRL5:
+	case MADERA_DRC2_CTRL1 ... MADERA_DRC2_CTRL5:
+	case MADERA_HPLPF1_1 ... MADERA_HPLPF1_2:
+	case MADERA_HPLPF2_1 ... MADERA_HPLPF2_2:
+	case MADERA_HPLPF3_1 ... MADERA_HPLPF3_2:
+	case MADERA_HPLPF4_1 ... MADERA_HPLPF4_2:
+	case MADERA_ASRC1_ENABLE:
+	case MADERA_ASRC1_STATUS:
+	case MADERA_ASRC1_RATE1 ... MADERA_ASRC1_RATE2:
+	case MADERA_ISRC_1_CTRL_1 ... MADERA_ISRC_1_CTRL_3:
+	case MADERA_ISRC_2_CTRL_1 ... MADERA_ISRC_2_CTRL_3:
+	case MADERA_AUXPDM1_CTRL_0 ... MADERA_AUXPDM1_CTRL_1:
+	case MADERA_DFC1_CTRL:
+	case MADERA_DFC1_RX:
+	case MADERA_DFC1_TX:
+	case MADERA_DFC2_CTRL:
+	case MADERA_DFC2_RX:
+	case MADERA_DFC2_TX:
+	case MADERA_DFC3_CTRL:
+	case MADERA_DFC3_RX:
+	case MADERA_DFC3_TX:
+	case MADERA_DFC4_CTRL:
+	case MADERA_DFC4_RX:
+	case MADERA_DFC4_TX:
+	case MADERA_DFC5_CTRL:
+	case MADERA_DFC5_RX:
+	case MADERA_DFC5_TX:
+	case MADERA_DFC6_CTRL:
+	case MADERA_DFC6_RX:
+	case MADERA_DFC6_TX:
+	case MADERA_DFC7_CTRL:
+	case MADERA_DFC7_RX:
+	case MADERA_DFC7_TX:
+	case MADERA_DFC8_CTRL:
+	case MADERA_DFC8_RX:
+	case MADERA_DFC8_TX:
+	case MADERA_DFC_STATUS:
+	case MADERA_GPIO1_CTRL_1 ... MADERA_GPIO16_CTRL_2:
+	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:
+	case MADERA_IRQ1_MASK_1 ... MADERA_IRQ1_MASK_33:
+	case MADERA_IRQ1_RAW_STATUS_1 ... MADERA_IRQ1_RAW_STATUS_33:
+	case MADERA_INTERRUPT_DEBOUNCE_7:
+	case MADERA_IRQ1_CTRL:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool cs47l92_16bit_volatile_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_SOFTWARE_RESET:
+	case MADERA_HARDWARE_REVISION:
+	case MADERA_WRITE_SEQUENCER_CTRL_0 ... MADERA_WRITE_SEQUENCER_CTRL_2:
+	case MADERA_HAPTICS_STATUS:
+	case MADERA_SAMPLE_RATE_1_STATUS:
+	case MADERA_SAMPLE_RATE_2_STATUS:
+	case MADERA_SAMPLE_RATE_3_STATUS:
+	case MADERA_ASYNC_SAMPLE_RATE_1_STATUS:
+	case MADERA_ASYNC_SAMPLE_RATE_2_STATUS:
+	case MADERA_HP_CTRL_1L:
+	case MADERA_HP_CTRL_1R:
+	case MADERA_HP_CTRL_2L:
+	case MADERA_HP_CTRL_2R:
+	case MADERA_HP_CTRL_3L:
+	case MADERA_HP_CTRL_3R:
+	case MADERA_MIC_DETECT_1_CONTROL_3:
+	case MADERA_MIC_DETECT_1_CONTROL_4:
+	case MADERA_MIC_DETECT_2_CONTROL_3:
+	case MADERA_MIC_DETECT_2_CONTROL_4:
+	case MADERA_HEADPHONE_DETECT_2:
+	case MADERA_HEADPHONE_DETECT_3:
+	case MADERA_HEADPHONE_DETECT_5:
+	case MADERA_INPUT_ENABLES_STATUS:
+	case MADERA_OUTPUT_STATUS_1:
+	case MADERA_RAW_OUTPUT_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_1:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_2:
+	case MADERA_SPD1_TX_CHANNEL_STATUS_3:
+	case MADERA_SLIMBUS_RX_PORT_STATUS:
+	case MADERA_SLIMBUS_TX_PORT_STATUS:
+	case MADERA_FX_CTRL2:
+	case MADERA_ASRC1_STATUS:
+	case MADERA_DFC_STATUS:
+	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:
+	case MADERA_IRQ1_RAW_STATUS_1 ... MADERA_IRQ1_RAW_STATUS_33:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool cs47l92_32bit_readable_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_WSEQ_SEQUENCE_1 ... MADERA_WSEQ_SEQUENCE_508:
+	case MADERA_OTP_HPDET_CAL_1 ... MADERA_OTP_HPDET_CAL_2:
+	case MADERA_DSP1_CONFIG_1 ... MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR:
+		return true;
+	default:
+		return cs47l92_is_adsp_memory(reg);
+	}
+}
+
+static bool cs47l92_32bit_volatile_register(struct device *dev,
+					    unsigned int reg)
+{
+	switch (reg) {
+	case MADERA_WSEQ_SEQUENCE_1 ... MADERA_WSEQ_SEQUENCE_508:
+	case MADERA_OTP_HPDET_CAL_1 ... MADERA_OTP_HPDET_CAL_2:
+	case MADERA_DSP1_CONFIG_1 ... MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR:
+		return true;
+	default:
+		return cs47l92_is_adsp_memory(reg);
+	}
+}
+
+const struct regmap_config cs47l92_16bit_spi_regmap = {
+	.name = "cs47l92_16bit",
+	.reg_bits = 32,
+	.pad_bits = 16,
+	.val_bits = 16,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+
+	.max_register = MADERA_INTERRUPT_RAW_STATUS_1,
+	.readable_reg = &cs47l92_16bit_readable_register,
+	.volatile_reg = &cs47l92_16bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+	.reg_defaults = cs47l92_reg_default,
+	.num_reg_defaults = ARRAY_SIZE(cs47l92_reg_default),
+};
+EXPORT_SYMBOL_GPL(cs47l92_16bit_spi_regmap);
+
+const struct regmap_config cs47l92_16bit_i2c_regmap = {
+	.name = "cs47l92_16bit",
+	.reg_bits = 32,
+	.val_bits = 16,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+
+	.max_register = MADERA_INTERRUPT_RAW_STATUS_1,
+	.readable_reg = &cs47l92_16bit_readable_register,
+	.volatile_reg = &cs47l92_16bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+	.reg_defaults = cs47l92_reg_default,
+	.num_reg_defaults = ARRAY_SIZE(cs47l92_reg_default),
+};
+EXPORT_SYMBOL_GPL(cs47l92_16bit_i2c_regmap);
+
+const struct regmap_config cs47l92_32bit_spi_regmap = {
+	.name = "cs47l92_32bit",
+	.reg_bits = 32,
+	.reg_stride = 2,
+	.pad_bits = 16,
+	.val_bits = 32,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+
+	.max_register = MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR,
+	.readable_reg = &cs47l92_32bit_readable_register,
+	.volatile_reg = &cs47l92_32bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+};
+EXPORT_SYMBOL_GPL(cs47l92_32bit_spi_regmap);
+
+const struct regmap_config cs47l92_32bit_i2c_regmap = {
+	.name = "cs47l92_32bit",
+	.reg_bits = 32,
+	.reg_stride = 2,
+	.val_bits = 32,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+
+	.max_register = MADERA_DSP1_PMEM_ERR_ADDR___XMEM_ERR_ADDR,
+	.readable_reg = &cs47l92_32bit_readable_register,
+	.volatile_reg = &cs47l92_32bit_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+};
+EXPORT_SYMBOL_GPL(cs47l92_32bit_i2c_regmap);
diff --git a/drivers/mfd/madera-core.c b/drivers/mfd/madera-core.c
index a354567ebc86..b9e9c169c6cc 100644
--- a/drivers/mfd/madera-core.c
+++ b/drivers/mfd/madera-core.c
@@ -35,6 +35,7 @@
 #define CS47L35_SILICON_ID	0x6360
 #define CS47L85_SILICON_ID	0x6338
 #define CS47L90_SILICON_ID	0x6364
+#define CS47L92_SILICON_ID	0x6371
 
 #define MADERA_32KZ_MCLK2	1
 
@@ -148,6 +149,29 @@ static const struct mfd_cell cs47l90_devs[] = {
 	},
 };
 
+static const char * const cs47l92_supplies[] = {
+	"MICVDD",
+	"CPVDD1",
+	"CPVDD2",
+};
+
+static const struct mfd_cell cs47l92_devs[] = {
+	{ .name = "madera-pinctrl" },
+	{ .name = "madera-irq", },
+	{ .name = "madera-micsupp", },
+	{ .name = "madera-gpio" },
+	{
+		.name = "madera-extcon",
+		.parent_supplies = cs47l92_supplies,
+		.num_parent_supplies = 1, /* We only need MICVDD */
+	},
+	{
+		.name = "cs47l92-codec",
+		.parent_supplies = cs47l92_supplies,
+		.num_parent_supplies = ARRAY_SIZE(cs47l92_supplies),
+	},
+};
+
 /* Used by madera-i2c and madera-spi drivers */
 const char *madera_name_from_type(enum madera_type type)
 {
@@ -162,6 +186,12 @@ const char *madera_name_from_type(enum madera_type type)
 		return "CS47L90";
 	case CS47L91:
 		return "CS47L91";
+	case CS42L92:
+		return "CS42L92";
+	case CS47L92:
+		return "CS47L92";
+	case CS47L93:
+		return "CS47L93";
 	case WM1840:
 		return "WM1840";
 	default:
@@ -321,6 +351,9 @@ const struct of_device_id madera_of_match[] = {
 	{ .compatible = "cirrus,cs47l85", .data = (void *)CS47L85 },
 	{ .compatible = "cirrus,cs47l90", .data = (void *)CS47L90 },
 	{ .compatible = "cirrus,cs47l91", .data = (void *)CS47L91 },
+	{ .compatible = "cirrus,cs42l92", .data = (void *)CS42L92 },
+	{ .compatible = "cirrus,cs47l92", .data = (void *)CS47L92 },
+	{ .compatible = "cirrus,cs47l93", .data = (void *)CS47L93 },
 	{ .compatible = "cirrus,wm1840", .data = (void *)WM1840 },
 	{}
 };
@@ -385,6 +418,13 @@ static void madera_set_micbias_info(struct madera *madera)
 		madera->num_childbias[0] = 4;
 		madera->num_childbias[1] = 4;
 		return;
+	case CS42L92:
+	case CS47L92:
+	case CS47L93:
+		madera->num_micbias = 2;
+		madera->num_childbias[0] = 4;
+		madera->num_childbias[1] = 2;
+		return;
 	default:
 		return;
 	}
@@ -436,6 +476,9 @@ int madera_dev_init(struct madera *madera)
 	case CS47L35:
 	case CS47L90:
 	case CS47L91:
+	case CS42L92:
+	case CS47L92:
+	case CS47L93:
 		break;
 	case CS47L85:
 	case WM1840:
@@ -556,6 +599,21 @@ int madera_dev_init(struct madera *madera)
 			}
 		}
 		break;
+	case CS47L92_SILICON_ID:
+		if (IS_ENABLED(CONFIG_MFD_CS47L92)) {
+			switch (madera->type) {
+			case CS42L92:
+			case CS47L92:
+			case CS47L93:
+				patch_fn = cs47l92_patch;
+				mfd_devs = cs47l92_devs;
+				n_devs = ARRAY_SIZE(cs47l92_devs);
+				break;
+			default:
+				break;
+			}
+		}
+		break;
 	default:
 		dev_err(madera->dev, "Unknown device ID: %x\n", hwid);
 		ret = -EINVAL;
diff --git a/drivers/mfd/madera-i2c.c b/drivers/mfd/madera-i2c.c
index bd868459cedb..3f4ab5dcf5c3 100644
--- a/drivers/mfd/madera-i2c.c
+++ b/drivers/mfd/madera-i2c.c
@@ -65,6 +65,14 @@ static int madera_i2c_probe(struct i2c_client *i2c,
 			regmap_32bit_config = &cs47l90_32bit_i2c_regmap;
 		}
 		break;
+	case CS42L92:
+	case CS47L92:
+	case CS47L93:
+		if (IS_ENABLED(CONFIG_MFD_CS47L92)) {
+			regmap_16bit_config = &cs47l92_16bit_i2c_regmap;
+			regmap_32bit_config = &cs47l92_32bit_i2c_regmap;
+		}
+		break;
 	default:
 		dev_err(&i2c->dev,
 			"Unknown Madera I2C device type %ld\n", type);
@@ -124,6 +132,9 @@ static const struct i2c_device_id madera_i2c_id[] = {
 	{ "cs47l85", CS47L85 },
 	{ "cs47l90", CS47L90 },
 	{ "cs47l91", CS47L91 },
+	{ "cs42l92", CS42L92 },
+	{ "cs47l92", CS47L92 },
+	{ "cs47l93", CS47L93 },
 	{ "wm1840", WM1840 },
 	{ }
 };
diff --git a/drivers/mfd/madera-spi.c b/drivers/mfd/madera-spi.c
index a36741b73c25..d76c7e7376d7 100644
--- a/drivers/mfd/madera-spi.c
+++ b/drivers/mfd/madera-spi.c
@@ -65,6 +65,14 @@ static int madera_spi_probe(struct spi_device *spi)
 			regmap_32bit_config = &cs47l90_32bit_spi_regmap;
 		}
 		break;
+	case CS42L92:
+	case CS47L92:
+	case CS47L93:
+		if (IS_ENABLED(CONFIG_MFD_CS47L92)) {
+			regmap_16bit_config = &cs47l92_16bit_spi_regmap;
+			regmap_32bit_config = &cs47l92_32bit_spi_regmap;
+		}
+		break;
 	default:
 		dev_err(&spi->dev,
 			"Unknown Madera SPI device type %ld\n", type);
@@ -123,6 +131,9 @@ static const struct spi_device_id madera_spi_ids[] = {
 	{ "cs47l85", CS47L85 },
 	{ "cs47l90", CS47L90 },
 	{ "cs47l91", CS47L91 },
+	{ "cs42l92", CS42L92 },
+	{ "cs47l92", CS47L92 },
+	{ "cs47l93", CS47L93 },
 	{ "wm1840", WM1840 },
 	{ }
 };
diff --git a/drivers/mfd/madera.h b/drivers/mfd/madera.h
index ccc16f2a1288..69a40aba7e69 100644
--- a/drivers/mfd/madera.h
+++ b/drivers/mfd/madera.h
@@ -47,4 +47,11 @@ extern const struct regmap_config cs47l90_32bit_spi_regmap;
 extern const struct regmap_config cs47l90_16bit_i2c_regmap;
 extern const struct regmap_config cs47l90_32bit_i2c_regmap;
 int cs47l90_patch(struct madera *madera);
+
+extern const struct regmap_config cs47l92_16bit_spi_regmap;
+extern const struct regmap_config cs47l92_32bit_spi_regmap;
+extern const struct regmap_config cs47l92_16bit_i2c_regmap;
+extern const struct regmap_config cs47l92_32bit_i2c_regmap;
+int cs47l92_patch(struct madera *madera);
+
 #endif
diff --git a/include/linux/mfd/madera/core.h b/include/linux/mfd/madera/core.h
index 98dd3cb5e84d..7b87f9a02ecc 100644
--- a/include/linux/mfd/madera/core.h
+++ b/include/linux/mfd/madera/core.h
@@ -26,8 +26,11 @@ enum madera_type {
 	CS47L85 = 2,
 	CS47L90 = 3,
 	CS47L91 = 4,
+	CS47L92 = 5,
+	CS47L93 = 6,
 	WM1840 = 7,
 	CS47L15 = 8,
+	CS42L92 = 9,
 };
 
 #define MADERA_MAX_CORE_SUPPLIES	2
@@ -37,6 +40,7 @@ enum madera_type {
 #define CS47L35_NUM_GPIOS		16
 #define CS47L85_NUM_GPIOS		40
 #define CS47L90_NUM_GPIOS		38
+#define CS47L92_NUM_GPIOS		16
 
 #define MADERA_MAX_MICBIAS		4
 
diff --git a/include/linux/mfd/madera/registers.h b/include/linux/mfd/madera/registers.h
index 5b054d511c6a..6439c0282ac6 100644
--- a/include/linux/mfd/madera/registers.h
+++ b/include/linux/mfd/madera/registers.h
@@ -77,9 +77,15 @@
 #define MADERA_FLL1_CONTROL_5				0x175
 #define MADERA_FLL1_CONTROL_6				0x176
 #define MADERA_FLL1_LOOP_FILTER_TEST_1			0x177
+#define CS47L92_FLL1_CONTROL_7				0x177
 #define MADERA_FLL1_NCO_TEST_0				0x178
+#define CS47L92_FLL1_CONTROL_8				0x178
 #define MADERA_FLL1_CONTROL_7				0x179
+#define CS47L92_FLL1_CONTROL_9				0x179
 #define MADERA_FLL1_EFS_2				0x17A
+#define CS47L92_FLL1_CONTROL_10				0x17A
+#define MADERA_FLL1_CONTROL_11				0x17B
+#define MADERA_FLL1_DIGITAL_TEST_1			0x17D
 #define CS47L35_FLL1_SYNCHRONISER_1			0x17F
 #define CS47L35_FLL1_SYNCHRONISER_2			0x180
 #define CS47L35_FLL1_SYNCHRONISER_3			0x181
@@ -98,6 +104,7 @@
 #define MADERA_FLL1_SYNCHRONISER_7			0x187
 #define MADERA_FLL1_SPREAD_SPECTRUM			0x189
 #define MADERA_FLL1_GPIO_CLOCK				0x18A
+#define CS47L92_FLL1_GPIO_CLOCK				0x18E
 #define MADERA_FLL2_CONTROL_1				0x191
 #define MADERA_FLL2_CONTROL_2				0x192
 #define MADERA_FLL2_CONTROL_3				0x193
@@ -105,9 +112,15 @@
 #define MADERA_FLL2_CONTROL_5				0x195
 #define MADERA_FLL2_CONTROL_6				0x196
 #define MADERA_FLL2_LOOP_FILTER_TEST_1			0x197
+#define CS47L92_FLL2_CONTROL_7				0x197
 #define MADERA_FLL2_NCO_TEST_0				0x198
+#define CS47L92_FLL2_CONTROL_8				0x198
 #define MADERA_FLL2_CONTROL_7				0x199
+#define CS47L92_FLL2_CONTROL_9				0x199
 #define MADERA_FLL2_EFS_2				0x19A
+#define CS47L92_FLL2_CONTROL_10				0x19A
+#define MADERA_FLL2_CONTROL_11				0x19B
+#define MADERA_FLL2_DIGITAL_TEST_1			0x19D
 #define MADERA_FLL2_SYNCHRONISER_1			0x1A1
 #define MADERA_FLL2_SYNCHRONISER_2			0x1A2
 #define MADERA_FLL2_SYNCHRONISER_3			0x1A3
@@ -117,6 +130,7 @@
 #define MADERA_FLL2_SYNCHRONISER_7			0x1A7
 #define MADERA_FLL2_SPREAD_SPECTRUM			0x1A9
 #define MADERA_FLL2_GPIO_CLOCK				0x1AA
+#define CS47L92_FLL2_GPIO_CLOCK				0x1AE
 #define MADERA_FLL3_CONTROL_1				0x1B1
 #define MADERA_FLL3_CONTROL_2				0x1B2
 #define MADERA_FLL3_CONTROL_3				0x1B3
@@ -267,6 +281,7 @@
 #define MADERA_NOISE_GATE_SELECT_2R			0x41F
 #define MADERA_OUTPUT_PATH_CONFIG_3L			0x420
 #define MADERA_DAC_DIGITAL_VOLUME_3L			0x421
+#define MADERA_OUTPUT_PATH_CONFIG_3			0x422
 #define MADERA_NOISE_GATE_SELECT_3L			0x423
 #define MADERA_OUTPUT_PATH_CONFIG_3R			0x424
 #define MADERA_DAC_DIGITAL_VOLUME_3R			0x425
@@ -369,8 +384,20 @@
 #define MADERA_AIF3_FRAME_CTRL_2			0x588
 #define MADERA_AIF3_FRAME_CTRL_3			0x589
 #define MADERA_AIF3_FRAME_CTRL_4			0x58A
+#define MADERA_AIF3_FRAME_CTRL_5			0x58B
+#define MADERA_AIF3_FRAME_CTRL_6			0x58C
+#define MADERA_AIF3_FRAME_CTRL_7			0x58D
+#define MADERA_AIF3_FRAME_CTRL_8			0x58E
+#define MADERA_AIF3_FRAME_CTRL_9			0x58F
+#define MADERA_AIF3_FRAME_CTRL_10			0x590
 #define MADERA_AIF3_FRAME_CTRL_11			0x591
 #define MADERA_AIF3_FRAME_CTRL_12			0x592
+#define MADERA_AIF3_FRAME_CTRL_13			0x593
+#define MADERA_AIF3_FRAME_CTRL_14			0x594
+#define MADERA_AIF3_FRAME_CTRL_15			0x595
+#define MADERA_AIF3_FRAME_CTRL_16			0x596
+#define MADERA_AIF3_FRAME_CTRL_17			0x597
+#define MADERA_AIF3_FRAME_CTRL_18			0x598
 #define MADERA_AIF3_TX_ENABLES				0x599
 #define MADERA_AIF3_RX_ENABLES				0x59A
 #define MADERA_AIF3_FORCE_WRITE				0x59B
@@ -662,6 +689,54 @@
 #define MADERA_AIF3TX2MIX_INPUT_3_VOLUME		0x78D
 #define MADERA_AIF3TX2MIX_INPUT_4_SOURCE		0x78E
 #define MADERA_AIF3TX2MIX_INPUT_4_VOLUME		0x78F
+#define MADERA_AIF3TX3MIX_INPUT_1_SOURCE		0x790
+#define MADERA_AIF3TX3MIX_INPUT_1_VOLUME		0x791
+#define MADERA_AIF3TX3MIX_INPUT_2_SOURCE		0x792
+#define MADERA_AIF3TX3MIX_INPUT_2_VOLUME		0x793
+#define MADERA_AIF3TX3MIX_INPUT_3_SOURCE		0x794
+#define MADERA_AIF3TX3MIX_INPUT_3_VOLUME		0x795
+#define MADERA_AIF3TX3MIX_INPUT_4_SOURCE		0x796
+#define MADERA_AIF3TX3MIX_INPUT_4_VOLUME		0x797
+#define MADERA_AIF3TX4MIX_INPUT_1_SOURCE		0x798
+#define MADERA_AIF3TX4MIX_INPUT_1_VOLUME		0x799
+#define MADERA_AIF3TX4MIX_INPUT_2_SOURCE		0x79A
+#define MADERA_AIF3TX4MIX_INPUT_2_VOLUME		0x79B
+#define MADERA_AIF3TX4MIX_INPUT_3_SOURCE		0x79C
+#define MADERA_AIF3TX4MIX_INPUT_3_VOLUME		0x79D
+#define MADERA_AIF3TX4MIX_INPUT_4_SOURCE		0x79E
+#define MADERA_AIF3TX4MIX_INPUT_4_VOLUME		0x79F
+#define CS47L92_AIF3TX5MIX_INPUT_1_SOURCE		0x7A0
+#define CS47L92_AIF3TX5MIX_INPUT_1_VOLUME		0x7A1
+#define CS47L92_AIF3TX5MIX_INPUT_2_SOURCE		0x7A2
+#define CS47L92_AIF3TX5MIX_INPUT_2_VOLUME		0x7A3
+#define CS47L92_AIF3TX5MIX_INPUT_3_SOURCE		0x7A4
+#define CS47L92_AIF3TX5MIX_INPUT_3_VOLUME		0x7A5
+#define CS47L92_AIF3TX5MIX_INPUT_4_SOURCE		0x7A6
+#define CS47L92_AIF3TX5MIX_INPUT_4_VOLUME		0x7A7
+#define CS47L92_AIF3TX6MIX_INPUT_1_SOURCE		0x7A8
+#define CS47L92_AIF3TX6MIX_INPUT_1_VOLUME		0x7A9
+#define CS47L92_AIF3TX6MIX_INPUT_2_SOURCE		0x7AA
+#define CS47L92_AIF3TX6MIX_INPUT_2_VOLUME		0x7AB
+#define CS47L92_AIF3TX6MIX_INPUT_3_SOURCE		0x7AC
+#define CS47L92_AIF3TX6MIX_INPUT_3_VOLUME		0x7AD
+#define CS47L92_AIF3TX6MIX_INPUT_4_SOURCE		0x7AE
+#define CS47L92_AIF3TX6MIX_INPUT_4_VOLUME		0x7AF
+#define CS47L92_AIF3TX7MIX_INPUT_1_SOURCE		0x7B0
+#define CS47L92_AIF3TX7MIX_INPUT_1_VOLUME		0x7B1
+#define CS47L92_AIF3TX7MIX_INPUT_2_SOURCE		0x7B2
+#define CS47L92_AIF3TX7MIX_INPUT_2_VOLUME		0x7B3
+#define CS47L92_AIF3TX7MIX_INPUT_3_SOURCE		0x7B4
+#define CS47L92_AIF3TX7MIX_INPUT_3_VOLUME		0x7B5
+#define CS47L92_AIF3TX7MIX_INPUT_4_SOURCE		0x7B6
+#define CS47L92_AIF3TX7MIX_INPUT_4_VOLUME		0x7B7
+#define CS47L92_AIF3TX8MIX_INPUT_1_SOURCE		0x7B8
+#define CS47L92_AIF3TX8MIX_INPUT_1_VOLUME		0x7B9
+#define CS47L92_AIF3TX8MIX_INPUT_2_SOURCE		0x7BA
+#define CS47L92_AIF3TX8MIX_INPUT_2_VOLUME		0x7BB
+#define CS47L92_AIF3TX8MIX_INPUT_3_SOURCE		0x7BC
+#define CS47L92_AIF3TX8MIX_INPUT_3_VOLUME		0x7BD
+#define CS47L92_AIF3TX8MIX_INPUT_4_SOURCE		0x7BE
+#define CS47L92_AIF3TX8MIX_INPUT_4_VOLUME		0x7BF
 #define MADERA_AIF4TX1MIX_INPUT_1_SOURCE		0x7A0
 #define MADERA_AIF4TX1MIX_INPUT_1_VOLUME		0x7A1
 #define MADERA_AIF4TX1MIX_INPUT_2_SOURCE		0x7A2
@@ -1105,6 +1180,8 @@
 #define MADERA_FCR_ADC_REFORMATTER_CONTROL		0xF73
 #define MADERA_FCR_COEFF_START				0xF74
 #define MADERA_FCR_COEFF_END				0xFC5
+#define MADERA_AUXPDM1_CTRL_0				0x10C0
+#define MADERA_AUXPDM1_CTRL_1				0x10C1
 #define MADERA_DAC_COMP_1				0x1300
 #define MADERA_DAC_COMP_2				0x1302
 #define MADERA_FRF_COEFFICIENT_1L_1			0x1380
@@ -1446,6 +1523,12 @@
 #define MADERA_OPCLK_ASYNC_SEL_WIDTH			     3
 
 /* (0x0171)  FLL1_Control_1 */
+#define CS47L92_FLL1_REFCLK_SRC_MASK			0xF000
+#define CS47L92_FLL1_REFCLK_SRC_SHIFT			    12
+#define CS47L92_FLL1_REFCLK_SRC_WIDTH			     4
+#define MADERA_FLL1_HOLD_MASK				0x0004
+#define MADERA_FLL1_HOLD_SHIFT				     2
+#define MADERA_FLL1_HOLD_WIDTH				     1
 #define MADERA_FLL1_FREERUN				0x0002
 #define MADERA_FLL1_FREERUN_MASK			0x0002
 #define MADERA_FLL1_FREERUN_SHIFT			     1
@@ -1478,6 +1561,9 @@
 #define MADERA_FLL1_FRATIO_MASK				0x0F00
 #define MADERA_FLL1_FRATIO_SHIFT			     8
 #define MADERA_FLL1_FRATIO_WIDTH			     4
+#define MADERA_FLL1_FB_DIV_MASK				0x03FF
+#define MADERA_FLL1_FB_DIV_SHIFT			     0
+#define MADERA_FLL1_FB_DIV_WIDTH			    10
 
 /* (0x0176)  FLL1_Control_6 */
 #define MADERA_FLL1_REFCLK_DIV_MASK			0x00C0
@@ -1509,6 +1595,30 @@
 #define MADERA_FLL1_PHASE_ENA_SHIFT			    11
 #define MADERA_FLL1_PHASE_ENA_WIDTH			     1
 
+/* (0x017A)  FLL1_Control_10 */
+#define MADERA_FLL1_HP_MASK				0xC000
+#define MADERA_FLL1_HP_SHIFT				    14
+#define MADERA_FLL1_HP_WIDTH				     2
+#define MADERA_FLL1_PHASEDET_ENA_MASK			0x1000
+#define MADERA_FLL1_PHASEDET_ENA_SHIFT			    12
+#define MADERA_FLL1_PHASEDET_ENA_WIDTH			     1
+
+/* (0x017B)  FLL1_Control_11 */
+#define MADERA_FLL1_LOCKDET_THR_MASK			0x001E
+#define MADERA_FLL1_LOCKDET_THR_SHIFT			     1
+#define MADERA_FLL1_LOCKDET_THR_WIDTH			     4
+#define MADERA_FLL1_LOCKDET_MASK			0x0001
+#define MADERA_FLL1_LOCKDET_SHIFT			     0
+#define MADERA_FLL1_LOCKDET_WIDTH			     1
+
+/* (0x017D)  FLL1_Digital_Test_1 */
+#define MADERA_FLL1_SYNC_EFS_ENA_MASK			0x0100
+#define MADERA_FLL1_SYNC_EFS_ENA_SHIFT			     8
+#define MADERA_FLL1_SYNC_EFS_ENA_WIDTH			     1
+#define MADERA_FLL1_CLK_VCO_FAST_SRC_MASK		0x0003
+#define MADERA_FLL1_CLK_VCO_FAST_SRC_SHIFT		     0
+#define MADERA_FLL1_CLK_VCO_FAST_SRC_WIDTH		     2
+
 /* (0x0181)  FLL1_Synchroniser_1 */
 #define MADERA_FLL1_SYNC_ENA				0x0001
 #define MADERA_FLL1_SYNC_ENA_MASK			0x0001
@@ -1630,6 +1740,13 @@
 #define MADERA_LDO2_ENA_WIDTH				     1
 
 /* (0x0218)  Mic_Bias_Ctrl_1 */
+#define MADERA_MICB1_EXT_CAP				0x8000
+#define MADERA_MICB1_EXT_CAP_MASK			0x8000
+#define MADERA_MICB1_EXT_CAP_SHIFT			    15
+#define MADERA_MICB1_EXT_CAP_WIDTH			     1
+#define MADERA_MICB1_LVL_MASK				0x01E0
+#define MADERA_MICB1_LVL_SHIFT				     5
+#define MADERA_MICB1_LVL_WIDTH				     4
 #define MADERA_MICB1_ENA				0x0001
 #define MADERA_MICB1_ENA_MASK				0x0001
 #define MADERA_MICB1_ENA_SHIFT				     0
@@ -2313,6 +2430,17 @@
 #define MADERA_OUT1R_ENA_SHIFT				     0
 #define MADERA_OUT1R_ENA_WIDTH				     1
 
+/* (0x0408)  Output_Rate_1 */
+#define MADERA_CP_DAC_MODE_MASK				0x0040
+#define MADERA_CP_DAC_MODE_SHIFT			     6
+#define MADERA_CP_DAC_MODE_WIDTH			     1
+#define MADERA_OUT_EXT_CLK_DIV_MASK			0x0030
+#define MADERA_OUT_EXT_CLK_DIV_SHIFT			     4
+#define MADERA_OUT_EXT_CLK_DIV_WIDTH			     2
+#define MADERA_OUT_CLK_SRC_MASK				0x0007
+#define MADERA_OUT_CLK_SRC_SHIFT			     0
+#define MADERA_OUT_CLK_SRC_WIDTH			     3
+
 /* (0x0409)  Output_Volume_Ramp */
 #define MADERA_OUT_VD_RAMP_MASK				0x0070
 #define MADERA_OUT_VD_RAMP_SHIFT			     4
@@ -2834,6 +2962,30 @@
 #define MADERA_AIF2RX1_ENA_WIDTH			     1
 
 /* (0x0599)  AIF3_Tx_Enables */
+#define MADERA_AIF3TX8_ENA				0x0080
+#define MADERA_AIF3TX8_ENA_MASK				0x0080
+#define MADERA_AIF3TX8_ENA_SHIFT			     7
+#define MADERA_AIF3TX8_ENA_WIDTH			     1
+#define MADERA_AIF3TX7_ENA				0x0040
+#define MADERA_AIF3TX7_ENA_MASK				0x0040
+#define MADERA_AIF3TX7_ENA_SHIFT			     6
+#define MADERA_AIF3TX7_ENA_WIDTH			     1
+#define MADERA_AIF3TX6_ENA				0x0020
+#define MADERA_AIF3TX6_ENA_MASK				0x0020
+#define MADERA_AIF3TX6_ENA_SHIFT			     5
+#define MADERA_AIF3TX6_ENA_WIDTH			     1
+#define MADERA_AIF3TX5_ENA				0x0010
+#define MADERA_AIF3TX5_ENA_MASK				0x0010
+#define MADERA_AIF3TX5_ENA_SHIFT			     4
+#define MADERA_AIF3TX5_ENA_WIDTH			     1
+#define MADERA_AIF3TX4_ENA				0x0008
+#define MADERA_AIF3TX4_ENA_MASK				0x0008
+#define MADERA_AIF3TX4_ENA_SHIFT			     3
+#define MADERA_AIF3TX4_ENA_WIDTH			     1
+#define MADERA_AIF3TX3_ENA				0x0004
+#define MADERA_AIF3TX3_ENA_MASK				0x0004
+#define MADERA_AIF3TX3_ENA_SHIFT			     2
+#define MADERA_AIF3TX3_ENA_WIDTH			     1
 #define MADERA_AIF3TX2_ENA				0x0002
 #define MADERA_AIF3TX2_ENA_MASK				0x0002
 #define MADERA_AIF3TX2_ENA_SHIFT			     1
@@ -2844,6 +2996,30 @@
 #define MADERA_AIF3TX1_ENA_WIDTH			     1
 
 /* (0x059A)  AIF3_Rx_Enables */
+#define MADERA_AIF3RX8_ENA				0x0080
+#define MADERA_AIF3RX8_ENA_MASK				0x0080
+#define MADERA_AIF3RX8_ENA_SHIFT			     7
+#define MADERA_AIF3RX8_ENA_WIDTH			     1
+#define MADERA_AIF3RX7_ENA				0x0040
+#define MADERA_AIF3RX7_ENA_MASK				0x0040
+#define MADERA_AIF3RX7_ENA_SHIFT			     6
+#define MADERA_AIF3RX7_ENA_WIDTH			     1
+#define MADERA_AIF3RX6_ENA				0x0020
+#define MADERA_AIF3RX6_ENA_MASK				0x0020
+#define MADERA_AIF3RX6_ENA_SHIFT			     5
+#define MADERA_AIF3RX6_ENA_WIDTH			     1
+#define MADERA_AIF3RX5_ENA				0x0010
+#define MADERA_AIF3RX5_ENA_MASK				0x0010
+#define MADERA_AIF3RX5_ENA_SHIFT			     4
+#define MADERA_AIF3RX5_ENA_WIDTH			     1
+#define MADERA_AIF3RX4_ENA				0x0008
+#define MADERA_AIF3RX4_ENA_MASK				0x0008
+#define MADERA_AIF3RX4_ENA_SHIFT			     3
+#define MADERA_AIF3RX4_ENA_WIDTH			     1
+#define MADERA_AIF3RX3_ENA				0x0004
+#define MADERA_AIF3RX3_ENA_MASK				0x0004
+#define MADERA_AIF3RX3_ENA_SHIFT			     2
+#define MADERA_AIF3RX3_ENA_WIDTH			     1
 #define MADERA_AIF3RX2_ENA				0x0002
 #define MADERA_AIF3RX2_ENA_MASK				0x0002
 #define MADERA_AIF3RX2_ENA_SHIFT			     1
@@ -3458,6 +3634,25 @@
 #define MADERA_FCR_MIC_MODE_SEL_SHIFT			     2
 #define MADERA_FCR_MIC_MODE_SEL_WIDTH			     2
 
+/* (0x10C0)  AUXPDM1_CTRL_0 */
+#define MADERA_AUXPDM1_SRC_MASK				0x0F00
+#define MADERA_AUXPDM1_SRC_SHIFT			     8
+#define MADERA_AUXPDM1_SRC_WIDTH			     4
+#define MADERA_AUXPDM1_TXEDGE_MASK			0x0010
+#define MADERA_AUXPDM1_TXEDGE_SHIFT			     4
+#define MADERA_AUXPDM1_TXEDGE_WIDTH			     1
+#define MADERA_AUXPDM1_MSTR_MASK			0x0008
+#define MADERA_AUXPDM1_MSTR_SHIFT			     3
+#define MADERA_AUXPDM1_MSTR_WIDTH			     1
+#define MADERA_AUXPDM1_ENABLE_MASK			0x0001
+#define MADERA_AUXPDM1_ENABLE_SHIFT			     0
+#define MADERA_AUXPDM1_ENABLE_WIDTH			     1
+
+/* (0x10C1)  AUXPDM1_CTRL_1 */
+#define MADERA_AUXPDM1_CLK_FREQ_MASK			0xC000
+#define MADERA_AUXPDM1_CLK_FREQ_SHIFT			    14
+#define MADERA_AUXPDM1_CLK_FREQ_WIDTH			     2
+
 /* (0x1480)  DFC1_CTRL_W0 */
 #define MADERA_DFC1_RATE_MASK				0x007C
 #define MADERA_DFC1_RATE_SHIFT				     2
-- 
cgit v1.2.3


From 554e937ec8d75930bc711612208af27961e3dc61 Mon Sep 17 00:00:00 2001
From: Pi-Hsun Shih <pihsun@chromium.org>
Date: Mon, 3 Jun 2019 11:45:11 +0800
Subject: mfd: cros_ec: differentiate SCP from EC by feature bit

System Companion Processor (SCP) is Cortex M4 co-processor on some
MediaTek platform that can run EC-style firmware. Since a SCP and EC
would both exist on a system, and use the cros_ec_dev driver, we need to
differentiate between them for the userspace, or they would both be
registered at /dev/cros_ec, causing a conflict.

Signed-off-by: Pi-Hsun Shih <pihsun@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec_dev.c   | 10 ++++++++++
 include/linux/mfd/cros_ec.h |  1 +
 2 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index d992365472b8..a47223d2baf4 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -459,6 +459,16 @@ static int ec_device_probe(struct platform_device *pdev)
 		ec_platform->ec_name = CROS_EC_DEV_TP_NAME;
 	}
 
+	/* Check whether this is actually a SCP rather than an EC. */
+	if (cros_ec_check_features(ec, EC_FEATURE_SCP)) {
+		dev_info(dev, "CrOS SCP MCU detected.\n");
+		/*
+		 * Help userspace differentiating ECs from SCP,
+		 * regardless of the probing order.
+		 */
+		ec_platform->ec_name = CROS_EC_DEV_SCP_NAME;
+	}
+
 	/*
 	 * Add the class device
 	 * Link to the character device for creating the /dev entry
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index cfa78bb4990f..751cb3756d49 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -27,6 +27,7 @@
 #define CROS_EC_DEV_PD_NAME "cros_pd"
 #define CROS_EC_DEV_TP_NAME "cros_tp"
 #define CROS_EC_DEV_ISH_NAME "cros_ish"
+#define CROS_EC_DEV_SCP_NAME "cros_scp"
 
 /*
  * The EC is unresponsive for a time after a reboot command.  Add a
-- 
cgit v1.2.3


From 76304994645028accc0cfe287652344b696f4470 Mon Sep 17 00:00:00 2001
From: Stefan Mavrodiev <stefan@olimex.com>
Date: Fri, 7 Jun 2019 15:42:25 +0300
Subject: mfd: rk808: Check pm_power_off pointer

The function pointer pm_power_off may point to function from other
module (PSCI for example). If rk808 is removed, pm_power_off is
overwritten to NULL and the system cannot be powered off.

This patch checks if pm_power_off points to a module function.

Signed-off-by: Stefan Mavrodiev <stefan@olimex.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/rk808.c       | 17 +++++++++++------
 include/linux/mfd/rk808.h |  1 +
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c
index 6ee1c461a3bb..e234720fee02 100644
--- a/drivers/mfd/rk808.c
+++ b/drivers/mfd/rk808.c
@@ -562,7 +562,6 @@ static int rk808_probe(struct i2c_client *client,
 	struct rk808 *rk808;
 	const struct rk808_reg_data *pre_init_reg;
 	const struct mfd_cell *cells;
-	void (*pm_pwroff_fn)(void) = NULL;
 	int nr_pre_init_regs;
 	int nr_cells;
 	int pm_off = 0, msb, lsb;
@@ -609,7 +608,7 @@ static int rk808_probe(struct i2c_client *client,
 		nr_pre_init_regs = ARRAY_SIZE(rk805_pre_init_reg);
 		cells = rk805s;
 		nr_cells = ARRAY_SIZE(rk805s);
-		pm_pwroff_fn = rk805_device_shutdown;
+		rk808->pm_pwroff_fn = rk805_device_shutdown;
 		break;
 	case RK808_ID:
 		rk808->regmap_cfg = &rk808_regmap_config;
@@ -618,7 +617,7 @@ static int rk808_probe(struct i2c_client *client,
 		nr_pre_init_regs = ARRAY_SIZE(rk808_pre_init_reg);
 		cells = rk808s;
 		nr_cells = ARRAY_SIZE(rk808s);
-		pm_pwroff_fn = rk808_device_shutdown;
+		rk808->pm_pwroff_fn = rk808_device_shutdown;
 		break;
 	case RK818_ID:
 		rk808->regmap_cfg = &rk818_regmap_config;
@@ -627,7 +626,7 @@ static int rk808_probe(struct i2c_client *client,
 		nr_pre_init_regs = ARRAY_SIZE(rk818_pre_init_reg);
 		cells = rk818s;
 		nr_cells = ARRAY_SIZE(rk818s);
-		pm_pwroff_fn = rk818_device_shutdown;
+		rk808->pm_pwroff_fn = rk818_device_shutdown;
 		break;
 	case RK809_ID:
 	case RK817_ID:
@@ -692,7 +691,7 @@ static int rk808_probe(struct i2c_client *client,
 				"rockchip,system-power-controller");
 	if (pm_off && !pm_power_off) {
 		rk808_i2c_client = client;
-		pm_power_off = pm_pwroff_fn;
+		pm_power_off = rk808->pm_pwroff_fn;
 	}
 
 	return 0;
@@ -707,7 +706,13 @@ static int rk808_remove(struct i2c_client *client)
 	struct rk808 *rk808 = i2c_get_clientdata(client);
 
 	regmap_del_irq_chip(client->irq, rk808->irq_data);
-	pm_power_off = NULL;
+
+	/**
+	 * pm_power_off may points to a function from another module.
+	 * Check if the pointer is set by us and only then overwrite it.
+	 */
+	if (rk808->pm_pwroff_fn && pm_power_off == rk808->pm_pwroff_fn)
+		pm_power_off = NULL;
 
 	return 0;
 }
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index 2a9cd01691b2..286316375636 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -628,5 +628,6 @@ struct rk808 {
 	long				variant;
 	const struct regmap_config	*regmap_cfg;
 	const struct regmap_irq_chip	*regmap_irq_chip;
+	void				(*pm_pwroff_fn)(void);
 };
 #endif /* __LINUX_REGULATOR_RK808_H */
-- 
cgit v1.2.3


From ac195d94280a783f030a01ee84998a198b779d99 Mon Sep 17 00:00:00 2001
From: Stefan Mavrodiev <stefan@olimex.com>
Date: Fri, 7 Jun 2019 15:42:26 +0300
Subject: mfd: rk808: Prepare rk805 for poweroff

RK805 has SLEEP signal, which can put the device into SLEEP or OFF
mode. The default is SLEEP mode.

However, when the kernel performs power-off (actually the ATF) the
device will not go fully off and this will result in higher power
consumption and inability to wake the device with RTC alarm.

The solution is to enable pm_power_off_prepare function, which will
configure SLEEP pin for OFF function.

Signed-off-by: Stefan Mavrodiev <stefan@olimex.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/rk808.c       | 50 +++++++++++++++++++++++++++++++++--------------
 include/linux/mfd/rk808.h |  1 +
 2 files changed, 36 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c
index e234720fee02..09fc8d3da541 100644
--- a/drivers/mfd/rk808.c
+++ b/drivers/mfd/rk808.c
@@ -474,17 +474,29 @@ static void rk805_device_shutdown(void)
 	int ret;
 	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
 
-	if (!rk808) {
-		dev_warn(&rk808_i2c_client->dev,
-			 "have no rk805, so do nothing here\n");
+	if (!rk808)
 		return;
-	}
 
 	ret = regmap_update_bits(rk808->regmap,
 				 RK805_DEV_CTRL_REG,
 				 DEV_OFF, DEV_OFF);
 	if (ret)
-		dev_err(&rk808_i2c_client->dev, "power off error!\n");
+		dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
+}
+
+static void rk805_device_shutdown_prepare(void)
+{
+	int ret;
+	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+
+	if (!rk808)
+		return;
+
+	ret = regmap_update_bits(rk808->regmap,
+				 RK805_GPIO_IO_POL_REG,
+				 SLP_SD_MSK, SHUTDOWN_FUN);
+	if (ret)
+		dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
 }
 
 static void rk808_device_shutdown(void)
@@ -492,17 +504,14 @@ static void rk808_device_shutdown(void)
 	int ret;
 	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
 
-	if (!rk808) {
-		dev_warn(&rk808_i2c_client->dev,
-			 "have no rk808, so do nothing here\n");
+	if (!rk808)
 		return;
-	}
 
 	ret = regmap_update_bits(rk808->regmap,
 				 RK808_DEVCTRL_REG,
 				 DEV_OFF_RST, DEV_OFF_RST);
 	if (ret)
-		dev_err(&rk808_i2c_client->dev, "power off error!\n");
+		dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
 }
 
 static void rk818_device_shutdown(void)
@@ -510,17 +519,14 @@ static void rk818_device_shutdown(void)
 	int ret;
 	struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
 
-	if (!rk808) {
-		dev_warn(&rk808_i2c_client->dev,
-			 "have no rk818, so do nothing here\n");
+	if (!rk808)
 		return;
-	}
 
 	ret = regmap_update_bits(rk808->regmap,
 				 RK818_DEVCTRL_REG,
 				 DEV_OFF, DEV_OFF);
 	if (ret)
-		dev_err(&rk808_i2c_client->dev, "power off error!\n");
+		dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
 }
 
 static void rk8xx_syscore_shutdown(void)
@@ -609,6 +615,7 @@ static int rk808_probe(struct i2c_client *client,
 		cells = rk805s;
 		nr_cells = ARRAY_SIZE(rk805s);
 		rk808->pm_pwroff_fn = rk805_device_shutdown;
+		rk808->pm_pwroff_prep_fn = rk805_device_shutdown_prepare;
 		break;
 	case RK808_ID:
 		rk808->regmap_cfg = &rk808_regmap_config;
@@ -694,6 +701,12 @@ static int rk808_probe(struct i2c_client *client,
 		pm_power_off = rk808->pm_pwroff_fn;
 	}
 
+	if (pm_off && !pm_power_off_prepare) {
+		if (!rk808_i2c_client)
+			rk808_i2c_client = client;
+		pm_power_off_prepare = rk808->pm_pwroff_prep_fn;
+	}
+
 	return 0;
 
 err_irq:
@@ -714,6 +727,13 @@ static int rk808_remove(struct i2c_client *client)
 	if (rk808->pm_pwroff_fn && pm_power_off == rk808->pm_pwroff_fn)
 		pm_power_off = NULL;
 
+	/**
+	 * As above, check if the pointer is set by us before overwrite.
+	 */
+	if (rk808->pm_pwroff_prep_fn &&
+	    pm_power_off_prepare == rk808->pm_pwroff_prep_fn)
+		pm_power_off_prepare = NULL;
+
 	return 0;
 }
 
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index 286316375636..b264ac794c74 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -629,5 +629,6 @@ struct rk808 {
 	const struct regmap_config	*regmap_cfg;
 	const struct regmap_irq_chip	*regmap_irq_chip;
 	void				(*pm_pwroff_fn)(void);
+	void				(*pm_pwroff_prep_fn)(void);
 };
 #endif /* __LINUX_REGULATOR_RK808_H */
-- 
cgit v1.2.3


From b1c83bd84618e5a3ec6395845d11803047a3ef9a Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Fri, 10 May 2019 18:23:01 -0700
Subject: mfd: stmfx: Fix macro definition spelling

Clang warns:

In file included from drivers/mfd/stmfx.c:13:
include/linux/mfd/stmfx.h:7:9: warning: 'MFD_STMFX_H' is used as a
header guard here, followed by #define of a different macro
[-Wheader-guard]

Fixes: 06252ade9156 ("mfd: Add ST Multi-Function eXpander (STMFX) core driver")
Link: https://github.com/ClangBuiltLinux/linux/issues/475
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/stmfx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mfd/stmfx.h b/include/linux/mfd/stmfx.h
index d890595b89b6..3c67983678ec 100644
--- a/include/linux/mfd/stmfx.h
+++ b/include/linux/mfd/stmfx.h
@@ -5,7 +5,7 @@
  */
 
 #ifndef MFD_STMFX_H
-#define MFX_STMFX_H
+#define MFD_STMFX_H
 
 #include <linux/regmap.h>
 
-- 
cgit v1.2.3


From 0772a34bb8a12fcc245074e0f76e96cba2c9a434 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Wed, 26 Jun 2019 14:33:35 +0100
Subject: mfd: madera: Remove some unused registers and fix some defaults

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cs47l15-tables.c         |   2 -
 drivers/mfd/cs47l35-tables.c         |  54 +---------------
 drivers/mfd/cs47l85-tables.c         | 122 ++---------------------------------
 drivers/mfd/cs47l90-tables.c         |  76 ----------------------
 drivers/mfd/cs47l92-tables.c         |   1 -
 include/linux/mfd/madera/registers.h |  80 -----------------------
 6 files changed, 6 insertions(+), 329 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/cs47l15-tables.c b/drivers/mfd/cs47l15-tables.c
index 1b4f6f79eac2..73db8d03b531 100644
--- a/drivers/mfd/cs47l15-tables.c
+++ b/drivers/mfd/cs47l15-tables.c
@@ -88,7 +88,6 @@ static const struct reg_default cs47l15_reg_default[] = {
 	{ 0x00000174, 0x007d }, /* R372 (0x174) - FLL1 Control 4 */
 	{ 0x00000175, 0x0000 }, /* R373 (0x175) - FLL1 Control 5 */
 	{ 0x00000176, 0x0000 }, /* R374 (0x176) - FLL1 Control 6 */
-	{ 0x00000177, 0x0281 }, /* R375 (0x177) - FLL1 Loop Filter Test 1 */
 	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 7 */
 	{ 0x0000017a, 0x2906 }, /* R378 (0x17A) - FLL1 EFS 2 */
 	{ 0x00000181, 0x0000 }, /* R385 (0x181) - FLL1 Synchroniser 1 */
@@ -746,7 +745,6 @@ static bool cs47l15_16bit_readable_register(struct device *dev,
 	case MADERA_FLL1_CONTROL_1 ... MADERA_FLL1_CONTROL_6:
 	case MADERA_FLL1_CONTROL_7:
 	case MADERA_FLL1_EFS_2:
-	case MADERA_FLL1_LOOP_FILTER_TEST_1:
 	case MADERA_FLL1_SYNCHRONISER_1 ... MADERA_FLL1_SYNCHRONISER_7:
 	case MADERA_FLL1_SPREAD_SPECTRUM:
 	case MADERA_FLL1_GPIO_CLOCK:
diff --git a/drivers/mfd/cs47l35-tables.c b/drivers/mfd/cs47l35-tables.c
index 338b825127f1..fe838cbc2a7e 100644
--- a/drivers/mfd/cs47l35-tables.c
+++ b/drivers/mfd/cs47l35-tables.c
@@ -109,9 +109,8 @@ static const struct reg_default cs47l35_reg_default[] = {
 	{ 0x00000174, 0x007d }, /* R372 (0x174) - FLL1 Control 4 */
 	{ 0x00000175, 0x0000 }, /* R373 (0x175) - FLL1 Control 5 */
 	{ 0x00000176, 0x0000 }, /* R374 (0x176) - FLL1 Control 6 */
-	{ 0x00000177, 0x0281 }, /* R375 (0x177) - FLL1 Loop Filter Test 1 */
 	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 7 */
-	{ 0x0000017a, 0x0b06 }, /* R378 (0x17a) - FLL1 EFS2 */
+	{ 0x0000017a, 0x2906 }, /* R378 (0x17a) - FLL1 EFS2 */
 	{ 0x0000017f, 0x0000 }, /* R383 (0x17f) - FLL1 Synchroniser 1 */
 	{ 0x00000180, 0x0000 }, /* R384 (0x180) - FLL1 Synchroniser 2 */
 	{ 0x00000181, 0x0000 }, /* R385 (0x181) - FLL1 Synchroniser 3 */
@@ -174,9 +173,6 @@ static const struct reg_default cs47l35_reg_default[] = {
 	{ 0x00000434, 0x0000 }, /* R1076 (0x434) - Output Path Config 5R */
 	{ 0x00000435, 0x0180 }, /* R1077 (0x435) - DAC Digital Volume 5R */
 	{ 0x00000437, 0x0200 }, /* R1079 (0x437) - Noise Gate Select 5R */
-	{ 0x00000440, 0x0003 }, /* R1088 (0x440) - DRE Enable */
-	{ 0x00000448, 0x0a83 }, /* R1096 (0x448) - eDRE Enable */
-	{ 0x0000044a, 0x0000 }, /* R1098 (0x44a) - eDRE Manual */
 	{ 0x00000450, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 1 */
 	{ 0x00000451, 0x0000 }, /* R1105 (0x451) - DAC AEC Control 2 */
 	{ 0x00000458, 0x0000 }, /* R1112 (0x458) - Noise Gate Control */
@@ -720,28 +716,6 @@ static const struct reg_default cs47l35_reg_default[] = {
 	{ 0x00000ef3, 0x0000 }, /* R3827 (0xef3) - ISRC 2 CTRL 1 */
 	{ 0x00000ef4, 0x0001 }, /* R3828 (0xef4) - ISRC 2 CTRL 2 */
 	{ 0x00000ef5, 0x0000 }, /* R3829 (0xef5) - ISRC 2 CTRL 3 */
-	{ 0x00001300, 0x0000 }, /* R4864 (0x1300) - DAC Comp 1 */
-	{ 0x00001302, 0x0000 }, /* R4866 (0x1302) - DAC Comp 2 */
-	{ 0x00001380, 0x0000 }, /* R4992 (0x1380) - FRF Coefficient 1L 1 */
-	{ 0x00001381, 0x0000 }, /* R4993 (0x1381) - FRF Coefficient 1L 2 */
-	{ 0x00001382, 0x0000 }, /* R4994 (0x1382) - FRF Coefficient 1L 3 */
-	{ 0x00001383, 0x0000 }, /* R4995 (0x1383) - FRF Coefficient 1L 4 */
-	{ 0x00001390, 0x0000 }, /* R5008 (0x1390) - FRF Coefficient 1R 1 */
-	{ 0x00001391, 0x0000 }, /* R5009 (0x1391) - FRF Coefficient 1R 2 */
-	{ 0x00001392, 0x0000 }, /* R5010 (0x1392) - FRF Coefficient 1R 3 */
-	{ 0x00001393, 0x0000 }, /* R5011 (0x1393) - FRF Coefficient 1R 4 */
-	{ 0x000013a0, 0x0000 }, /* R5024 (0x13a0) - FRF Coefficient 4L 1 */
-	{ 0x000013a1, 0x0000 }, /* R5025 (0x13a1) - FRF Coefficient 4L 2 */
-	{ 0x000013a2, 0x0000 }, /* R5026 (0x13a2) - FRF Coefficient 4L 3 */
-	{ 0x000013a3, 0x0000 }, /* R5027 (0x13a3) - FRF Coefficient 4L 4 */
-	{ 0x000013b0, 0x0000 }, /* R5040 (0x13b0) - FRF Coefficient 5L 1 */
-	{ 0x000013b1, 0x0000 }, /* R5041 (0x13b1) - FRF Coefficient 5L 2 */
-	{ 0x000013b2, 0x0000 }, /* R5042 (0x13b2) - FRF Coefficient 5L 3 */
-	{ 0x000013b3, 0x0000 }, /* R5043 (0x13b3) - FRF Coefficient 5L 4 */
-	{ 0x000013c0, 0x0000 }, /* R5040 (0x13c0) - FRF Coefficient 5R 1 */
-	{ 0x000013c1, 0x0000 }, /* R5041 (0x13c1) - FRF Coefficient 5R 2 */
-	{ 0x000013c2, 0x0000 }, /* R5042 (0x13c2) - FRF Coefficient 5R 3 */
-	{ 0x000013c3, 0x0000 }, /* R5043 (0x13c3) - FRF Coefficient 5R 4 */
 	{ 0x00001700, 0x2001 }, /* R5888 (0x1700) - GPIO1 Control 1 */
 	{ 0x00001701, 0xf000 }, /* R5889 (0x1701) - GPIO1 Control 2 */
 	{ 0x00001702, 0x2001 }, /* R5890 (0x1702) - GPIO2 Control 1 */
@@ -892,7 +866,6 @@ static bool cs47l35_16bit_readable_register(struct device *dev,
 	case MADERA_FLL1_CONTROL_6:
 	case MADERA_FLL1_CONTROL_7:
 	case MADERA_FLL1_EFS_2:
-	case MADERA_FLL1_LOOP_FILTER_TEST_1:
 	case CS47L35_FLL1_SYNCHRONISER_1:
 	case CS47L35_FLL1_SYNCHRONISER_2:
 	case CS47L35_FLL1_SYNCHRONISER_3:
@@ -967,9 +940,6 @@ static bool cs47l35_16bit_readable_register(struct device *dev,
 	case MADERA_OUTPUT_PATH_CONFIG_5R:
 	case MADERA_DAC_DIGITAL_VOLUME_5R:
 	case MADERA_NOISE_GATE_SELECT_5R:
-	case MADERA_DRE_ENABLE:
-	case MADERA_EDRE_ENABLE:
-	case MADERA_EDRE_MANUAL:
 	case MADERA_DAC_AEC_CONTROL_1:
 	case MADERA_DAC_AEC_CONTROL_2:
 	case MADERA_NOISE_GATE_CONTROL:
@@ -1439,28 +1409,6 @@ static bool cs47l35_16bit_readable_register(struct device *dev,
 	case MADERA_ISRC_2_CTRL_1:
 	case MADERA_ISRC_2_CTRL_2:
 	case MADERA_ISRC_2_CTRL_3:
-	case MADERA_DAC_COMP_1:
-	case MADERA_DAC_COMP_2:
-	case MADERA_FRF_COEFFICIENT_1L_1:
-	case MADERA_FRF_COEFFICIENT_1L_2:
-	case MADERA_FRF_COEFFICIENT_1L_3:
-	case MADERA_FRF_COEFFICIENT_1L_4:
-	case MADERA_FRF_COEFFICIENT_1R_1:
-	case MADERA_FRF_COEFFICIENT_1R_2:
-	case MADERA_FRF_COEFFICIENT_1R_3:
-	case MADERA_FRF_COEFFICIENT_1R_4:
-	case CS47L35_FRF_COEFFICIENT_4L_1:
-	case CS47L35_FRF_COEFFICIENT_4L_2:
-	case CS47L35_FRF_COEFFICIENT_4L_3:
-	case CS47L35_FRF_COEFFICIENT_4L_4:
-	case CS47L35_FRF_COEFFICIENT_5L_1:
-	case CS47L35_FRF_COEFFICIENT_5L_2:
-	case CS47L35_FRF_COEFFICIENT_5L_3:
-	case CS47L35_FRF_COEFFICIENT_5L_4:
-	case CS47L35_FRF_COEFFICIENT_5R_1:
-	case CS47L35_FRF_COEFFICIENT_5R_2:
-	case CS47L35_FRF_COEFFICIENT_5R_3:
-	case CS47L35_FRF_COEFFICIENT_5R_4:
 	case MADERA_GPIO1_CTRL_1 ... MADERA_GPIO16_CTRL_2:
 	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:
 	case MADERA_IRQ1_MASK_1 ... MADERA_IRQ1_MASK_33:
diff --git a/drivers/mfd/cs47l85-tables.c b/drivers/mfd/cs47l85-tables.c
index 43803145d8e5..d0198b5e86ba 100644
--- a/drivers/mfd/cs47l85-tables.c
+++ b/drivers/mfd/cs47l85-tables.c
@@ -402,7 +402,6 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x00000174, 0x007d }, /* R372 (0x174) - FLL1 Control 4 */
 	{ 0x00000175, 0x0000 }, /* R373 (0x175) - FLL1 Control 5 */
 	{ 0x00000176, 0x0000 }, /* R374 (0x176) - FLL1 Control 6 */
-	{ 0x00000177, 0x0281 }, /* R375 (0x177) - FLL1 Loop Filter Test 1 */
 	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 7 */
 	{ 0x00000181, 0x0000 }, /* R385 (0x181) - FLL1 Synchroniser 1 */
 	{ 0x00000182, 0x0000 }, /* R386 (0x182) - FLL1 Synchroniser 2 */
@@ -419,7 +418,6 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x00000194, 0x007d }, /* R404 (0x194) - FLL2 Control 4 */
 	{ 0x00000195, 0x0000 }, /* R405 (0x195) - FLL2 Control 5 */
 	{ 0x00000196, 0x0000 }, /* R406 (0x196) - FLL2 Control 6 */
-	{ 0x00000197, 0x0281 }, /* R407 (0x197) - FLL2 Loop Filter Test 1 */
 	{ 0x00000199, 0x0000 }, /* R409 (0x199) - FLL2 Control 7 */
 	{ 0x000001a1, 0x0000 }, /* R417 (0x1a1) - FLL2 Synchroniser 1 */
 	{ 0x000001a2, 0x0000 }, /* R418 (0x1a2) - FLL2 Synchroniser 2 */
@@ -436,7 +434,6 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x000001b4, 0x007d }, /* R436 (0x1b4) - FLL3 Control 4 */
 	{ 0x000001b5, 0x0000 }, /* R437 (0x1b5) - FLL3 Control 5 */
 	{ 0x000001b6, 0x0000 }, /* R438 (0x1b6) - FLL3 Control 6 */
-	{ 0x000001b7, 0x0281 }, /* R439 (0x1b7) - FLL3 Loop Filter Test 1 */
 	{ 0x000001b9, 0x0000 }, /* R441 (0x1b9) - FLL3 Control 7 */
 	{ 0x000001c1, 0x0000 }, /* R449 (0x1c1) - FLL3 Synchroniser 1 */
 	{ 0x000001c2, 0x0000 }, /* R450 (0x1c2) - FLL3 Synchroniser 2 */
@@ -546,9 +543,6 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x0000043c, 0x0000 }, /* R1084 (0x43c) - Output Path Config 6R */
 	{ 0x0000043d, 0x0180 }, /* R1085 (0x43d) - DAC Digital Volume 6R */
 	{ 0x0000043f, 0x0800 }, /* R1087 (0x43f) - Noise Gate Select 6R */
-	{ 0x00000440, 0x003f }, /* R1088 (0x440) - DRE Enable */
-	{ 0x00000448, 0x003f }, /* R1096 (0x448) - EDRE Enable */
-	{ 0x0000044a, 0x0000 }, /* R1098 (0x44a) - EDRE Manual */
 	{ 0x00000450, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 1 */
 	{ 0x00000451, 0x0000 }, /* R1105 (0x451) - DAC AEC Control 2 */
 	{ 0x00000458, 0x0000 }, /* R1112 (0x458) - Noise Gate Control */
@@ -556,7 +550,7 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x00000491, 0x0000 }, /* R1169 (0x491) - PDM SPK1 CTRL 2 */
 	{ 0x00000492, 0x0069 }, /* R1170 (0x492) - PDM SPK2 CTRL 1 */
 	{ 0x00000493, 0x0000 }, /* R1171 (0x493) - PDM SPK2 CTRL 2 */
-	{ 0x000004a0, 0x3210 }, /* R1184 (0x4a0) - HP1 Short Circuit Ctrl */
+	{ 0x000004a0, 0x3280 }, /* R1184 (0x4a0) - HP1 Short Circuit Ctrl */
 	{ 0x000004a1, 0x3200 }, /* R1185 (0x4a1) - HP2 Short Circuit Ctrl */
 	{ 0x000004a2, 0x3200 }, /* R1186 (0x4a2) - HP3 Short Circuit Ctrl */
 	{ 0x000004a8, 0x7020 }, /* R1192 (0x4a8) - HP Test Ctrl 5 */
@@ -1365,11 +1359,11 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x00000e82, 0x0018 }, /* R3714 (0xe82) - DRC1 ctrl3 */
 	{ 0x00000e83, 0x0000 }, /* R3715 (0xe83) - DRC1 ctrl4 */
 	{ 0x00000e84, 0x0000 }, /* R3716 (0xe84) - DRC1 ctrl5 */
-	{ 0x00000e88, 0x0933 }, /* R3720 (0xe88) - DRC2 ctrl1 */
-	{ 0x00000e89, 0x0018 }, /* R3721 (0xe89) - DRC2 ctrl2 */
-	{ 0x00000e8a, 0x0000 }, /* R3722 (0xe8a) - DRC2 ctrl3 */
+	{ 0x00000e88, 0x0018 }, /* R3720 (0xe88) - DRC2 ctrl1 */
+	{ 0x00000e89, 0x0933 }, /* R3721 (0xe89) - DRC2 ctrl2 */
+	{ 0x00000e8a, 0x0018 }, /* R3722 (0xe8a) - DRC2 ctrl3 */
 	{ 0x00000e8b, 0x0000 }, /* R3723 (0xe8b) - DRC2 ctrl4 */
-	{ 0x00000e8c, 0x0040 }, /* R3724 (0xe8c) - DRC2 ctrl5 */
+	{ 0x00000e8c, 0x0000 }, /* R3724 (0xe8c) - DRC2 ctrl5 */
 	{ 0x00000ec0, 0x0000 }, /* R3776 (0xec0) - HPLPF1_1 */
 	{ 0x00000ec1, 0x0000 }, /* R3777 (0xec1) - HPLPF1_2 */
 	{ 0x00000ec4, 0x0000 }, /* R3780 (0xec4) - HPLPF2_1 */
@@ -1577,56 +1571,6 @@ static const struct reg_default cs47l85_reg_default[] = {
 	{ 0x00000fc3, 0x0000 }, /* R4035 (0xfc3) - ANC Coefficient */
 	{ 0x00000fc4, 0x0000 }, /* R4036 (0xfc4) - ANC Coefficient */
 	{ 0x00000fc5, 0x0000 }, /* R4037 (0xfc5) - ANC Coefficient */
-	{ 0x00001300, 0x0000 }, /* R4864 (0x1300) - DAC Comp 1 */
-	{ 0x00001302, 0x0000 }, /* R4866 (0x1302) - DAC Comp 2 */
-	{ 0x00001380, 0x0000 }, /* R4992 (0x1380) - FRF Coefficient 1L 1 */
-	{ 0x00001381, 0x0000 }, /* R4993 (0x1381) - FRF Coefficient 1L 2 */
-	{ 0x00001382, 0x0000 }, /* R4994 (0x1382) - FRF Coefficient 1L 3 */
-	{ 0x00001383, 0x0000 }, /* R4995 (0x1383) - FRF Coefficient 1L 4 */
-	{ 0x00001390, 0x0000 }, /* R5008 (0x1390) - FRF Coefficient 1R 1 */
-	{ 0x00001391, 0x0000 }, /* R5009 (0x1391) - FRF Coefficient 1R 2 */
-	{ 0x00001392, 0x0000 }, /* R5010 (0x1392) - FRF Coefficient 1R 3 */
-	{ 0x00001393, 0x0000 }, /* R5011 (0x1393) - FRF Coefficient 1R 4 */
-	{ 0x000013a0, 0x0000 }, /* R5024 (0x13a0) - FRF Coefficient 2L 1 */
-	{ 0x000013a1, 0x0000 }, /* R5025 (0x13a1) - FRF Coefficient 2L 2 */
-	{ 0x000013a2, 0x0000 }, /* R5026 (0x13a2) - FRF Coefficient 2L 3 */
-	{ 0x000013a3, 0x0000 }, /* R5027 (0x13a3) - FRF Coefficient 2L 4 */
-	{ 0x000013b0, 0x0000 }, /* R5040 (0x13b0) - FRF Coefficient 2R 1 */
-	{ 0x000013b1, 0x0000 }, /* R5041 (0x13b1) - FRF Coefficient 2R 2 */
-	{ 0x000013b2, 0x0000 }, /* R5042 (0x13b2) - FRF Coefficient 2R 3 */
-	{ 0x000013b3, 0x0000 }, /* R5043 (0x13b3) - FRF Coefficient 2R 4 */
-	{ 0x000013c0, 0x0000 }, /* R5040 (0x13c0) - FRF Coefficient 3L 1 */
-	{ 0x000013c1, 0x0000 }, /* R5041 (0x13c1) - FRF Coefficient 3L 2 */
-	{ 0x000013c2, 0x0000 }, /* R5042 (0x13c2) - FRF Coefficient 3L 3 */
-	{ 0x000013c3, 0x0000 }, /* R5043 (0x13c3) - FRF Coefficient 3L 4 */
-	{ 0x000013d0, 0x0000 }, /* R5072 (0x13d0) - FRF Coefficient 3R 1 */
-	{ 0x000013d1, 0x0000 }, /* R5073 (0x13d1) - FRF Coefficient 3R 2 */
-	{ 0x000013d2, 0x0000 }, /* R5074 (0x13d2) - FRF Coefficient 3R 3 */
-	{ 0x000013d3, 0x0000 }, /* R5075 (0x13d3) - FRF Coefficient 3R 4 */
-	{ 0x000013e0, 0x0000 }, /* R5088 (0x13e0) - FRF Coefficient 4L 1 */
-	{ 0x000013e1, 0x0000 }, /* R5089 (0x13e1) - FRF Coefficient 4L 2 */
-	{ 0x000013e2, 0x0000 }, /* R5090 (0x13e2) - FRF Coefficient 4L 3 */
-	{ 0x000013e3, 0x0000 }, /* R5091 (0x13e3) - FRF Coefficient 4L 4 */
-	{ 0x000013f0, 0x0000 }, /* R5104 (0x13f0) - FRF Coefficient 4R 1 */
-	{ 0x000013f1, 0x0000 }, /* R5105 (0x13f1) - FRF Coefficient 4R 2 */
-	{ 0x000013f2, 0x0000 }, /* R5106 (0x13f2) - FRF Coefficient 4R 3 */
-	{ 0x000013f3, 0x0000 }, /* R5107 (0x13f3) - FRF Coefficient 4R 4 */
-	{ 0x00001400, 0x0000 }, /* R5120 (0x1400) - FRF Coefficient 5L 1 */
-	{ 0x00001401, 0x0000 }, /* R5121 (0x1401) - FRF Coefficient 5L 2 */
-	{ 0x00001402, 0x0000 }, /* R5122 (0x1402) - FRF Coefficient 5L 3 */
-	{ 0x00001403, 0x0000 }, /* R5123 (0x1403) - FRF Coefficient 5L 4 */
-	{ 0x00001410, 0x0000 }, /* R5136 (0x1410) - FRF Coefficient 5R 1 */
-	{ 0x00001411, 0x0000 }, /* R5137 (0x1411) - FRF Coefficient 5R 2 */
-	{ 0x00001412, 0x0000 }, /* R5138 (0x1412) - FRF Coefficient 5R 3 */
-	{ 0x00001413, 0x0000 }, /* R5139 (0x1413) - FRF Coefficient 5R 4 */
-	{ 0x00001420, 0x0000 }, /* R5152 (0x1420) - FRF Coefficient 6L 1 */
-	{ 0x00001421, 0x0000 }, /* R5153 (0x1421) - FRF Coefficient 6L 2 */
-	{ 0x00001422, 0x0000 }, /* R5154 (0x1422) - FRF Coefficient 6L 3 */
-	{ 0x00001423, 0x0000 }, /* R5155 (0x1423) - FRF Coefficient 6L 4 */
-	{ 0x00001430, 0x0000 }, /* R5168 (0x1430) - FRF Coefficient 6R 1 */
-	{ 0x00001431, 0x0000 }, /* R5169 (0x1431) - FRF Coefficient 6R 2 */
-	{ 0x00001432, 0x0000 }, /* R5170 (0x1432) - FRF Coefficient 6R 3 */
-	{ 0x00001433, 0x0000 }, /* R5171 (0x1433) - FRF Coefficient 6R 4 */
 	{ 0x00001700, 0x2001 }, /* R5888 (0x1700) - GPIO1 Control 1 */
 	{ 0x00001701, 0xe000 }, /* R5889 (0x1701) - GPIO1 Control 2 */
 	{ 0x00001702, 0x2001 }, /* R5890 (0x1702) - GPIO2 Control 1 */
@@ -1845,7 +1789,6 @@ static bool cs47l85_16bit_readable_register(struct device *dev,
 	case MADERA_FLL1_CONTROL_5:
 	case MADERA_FLL1_CONTROL_6:
 	case MADERA_FLL1_CONTROL_7:
-	case MADERA_FLL1_LOOP_FILTER_TEST_1:
 	case MADERA_FLL1_SYNCHRONISER_1:
 	case MADERA_FLL1_SYNCHRONISER_2:
 	case MADERA_FLL1_SYNCHRONISER_3:
@@ -1862,7 +1805,6 @@ static bool cs47l85_16bit_readable_register(struct device *dev,
 	case MADERA_FLL2_CONTROL_5:
 	case MADERA_FLL2_CONTROL_6:
 	case MADERA_FLL2_CONTROL_7:
-	case MADERA_FLL2_LOOP_FILTER_TEST_1:
 	case MADERA_FLL2_SYNCHRONISER_1:
 	case MADERA_FLL2_SYNCHRONISER_2:
 	case MADERA_FLL2_SYNCHRONISER_3:
@@ -1879,7 +1821,6 @@ static bool cs47l85_16bit_readable_register(struct device *dev,
 	case MADERA_FLL3_CONTROL_5:
 	case MADERA_FLL3_CONTROL_6:
 	case MADERA_FLL3_CONTROL_7:
-	case MADERA_FLL3_LOOP_FILTER_TEST_1:
 	case MADERA_FLL3_SYNCHRONISER_1:
 	case MADERA_FLL3_SYNCHRONISER_2:
 	case MADERA_FLL3_SYNCHRONISER_3:
@@ -2004,9 +1945,6 @@ static bool cs47l85_16bit_readable_register(struct device *dev,
 	case MADERA_OUTPUT_PATH_CONFIG_6R:
 	case MADERA_DAC_DIGITAL_VOLUME_6R:
 	case MADERA_NOISE_GATE_SELECT_6R:
-	case MADERA_DRE_ENABLE:
-	case MADERA_EDRE_ENABLE:
-	case MADERA_EDRE_MANUAL:
 	case MADERA_DAC_AEC_CONTROL_1:
 	case MADERA_DAC_AEC_CONTROL_2:
 	case MADERA_NOISE_GATE_CONTROL:
@@ -2792,56 +2730,6 @@ static bool cs47l85_16bit_readable_register(struct device *dev,
 	case MADERA_FCR_FILTER_CONTROL:
 	case MADERA_FCR_ADC_REFORMATTER_CONTROL:
 	case MADERA_FCR_COEFF_START ... MADERA_FCR_COEFF_END:
-	case MADERA_DAC_COMP_1:
-	case MADERA_DAC_COMP_2:
-	case MADERA_FRF_COEFFICIENT_1L_1:
-	case MADERA_FRF_COEFFICIENT_1L_2:
-	case MADERA_FRF_COEFFICIENT_1L_3:
-	case MADERA_FRF_COEFFICIENT_1L_4:
-	case MADERA_FRF_COEFFICIENT_1R_1:
-	case MADERA_FRF_COEFFICIENT_1R_2:
-	case MADERA_FRF_COEFFICIENT_1R_3:
-	case MADERA_FRF_COEFFICIENT_1R_4:
-	case MADERA_FRF_COEFFICIENT_2L_1:
-	case MADERA_FRF_COEFFICIENT_2L_2:
-	case MADERA_FRF_COEFFICIENT_2L_3:
-	case MADERA_FRF_COEFFICIENT_2L_4:
-	case MADERA_FRF_COEFFICIENT_2R_1:
-	case MADERA_FRF_COEFFICIENT_2R_2:
-	case MADERA_FRF_COEFFICIENT_2R_3:
-	case MADERA_FRF_COEFFICIENT_2R_4:
-	case MADERA_FRF_COEFFICIENT_3L_1:
-	case MADERA_FRF_COEFFICIENT_3L_2:
-	case MADERA_FRF_COEFFICIENT_3L_3:
-	case MADERA_FRF_COEFFICIENT_3L_4:
-	case MADERA_FRF_COEFFICIENT_3R_1:
-	case MADERA_FRF_COEFFICIENT_3R_2:
-	case MADERA_FRF_COEFFICIENT_3R_3:
-	case MADERA_FRF_COEFFICIENT_3R_4:
-	case MADERA_FRF_COEFFICIENT_4L_1:
-	case MADERA_FRF_COEFFICIENT_4L_2:
-	case MADERA_FRF_COEFFICIENT_4L_3:
-	case MADERA_FRF_COEFFICIENT_4L_4:
-	case MADERA_FRF_COEFFICIENT_4R_1:
-	case MADERA_FRF_COEFFICIENT_4R_2:
-	case MADERA_FRF_COEFFICIENT_4R_3:
-	case MADERA_FRF_COEFFICIENT_4R_4:
-	case MADERA_FRF_COEFFICIENT_5L_1:
-	case MADERA_FRF_COEFFICIENT_5L_2:
-	case MADERA_FRF_COEFFICIENT_5L_3:
-	case MADERA_FRF_COEFFICIENT_5L_4:
-	case MADERA_FRF_COEFFICIENT_5R_1:
-	case MADERA_FRF_COEFFICIENT_5R_2:
-	case MADERA_FRF_COEFFICIENT_5R_3:
-	case MADERA_FRF_COEFFICIENT_5R_4:
-	case MADERA_FRF_COEFFICIENT_6L_1:
-	case MADERA_FRF_COEFFICIENT_6L_2:
-	case MADERA_FRF_COEFFICIENT_6L_3:
-	case MADERA_FRF_COEFFICIENT_6L_4:
-	case MADERA_FRF_COEFFICIENT_6R_1:
-	case MADERA_FRF_COEFFICIENT_6R_2:
-	case MADERA_FRF_COEFFICIENT_6R_3:
-	case MADERA_FRF_COEFFICIENT_6R_4:
 	case MADERA_GPIO1_CTRL_1 ... MADERA_GPIO40_CTRL_2:
 	case MADERA_IRQ1_STATUS_1 ... MADERA_IRQ1_STATUS_33:
 	case MADERA_IRQ1_MASK_1 ... MADERA_IRQ1_MASK_33:
diff --git a/drivers/mfd/cs47l90-tables.c b/drivers/mfd/cs47l90-tables.c
index c040d3d7232a..2c761fc241f3 100644
--- a/drivers/mfd/cs47l90-tables.c
+++ b/drivers/mfd/cs47l90-tables.c
@@ -119,7 +119,6 @@ static const struct reg_default cs47l90_reg_default[] = {
 	{ 0x00000174, 0x007d }, /* R372 (0x174) - FLL1 Control 4 */
 	{ 0x00000175, 0x0000 }, /* R373 (0x175) - FLL1 Control 5 */
 	{ 0x00000176, 0x0000 }, /* R374 (0x176) - FLL1 Control 6 */
-	{ 0x00000177, 0x0281 }, /* R375 (0x177) - FLL1 Loop Filter Test 1 */
 	{ 0x00000179, 0x0000 }, /* R377 (0x179) - FLL1 Control 7 */
 	{ 0x0000017a, 0x2906 }, /* R377 (0x17a) - FLL1 Efs 2 */
 	{ 0x00000181, 0x0000 }, /* R385 (0x181) - FLL1 Synchroniser 1 */
@@ -137,7 +136,6 @@ static const struct reg_default cs47l90_reg_default[] = {
 	{ 0x00000194, 0x007d }, /* R404 (0x194) - FLL2 Control 4 */
 	{ 0x00000195, 0x0000 }, /* R405 (0x195) - FLL2 Control 5 */
 	{ 0x00000196, 0x0000 }, /* R406 (0x196) - FLL2 Control 6 */
-	{ 0x00000197, 0x0281 }, /* R407 (0x197) - FLL2 Loop Filter Test 1 */
 	{ 0x00000199, 0x0000 }, /* R409 (0x199) - FLL2 Control 7 */
 	{ 0x0000019a, 0x2906 }, /* R410 (0x19a) - FLL2 Efs 2 */
 	{ 0x000001a1, 0x0000 }, /* R417 (0x1a1) - FLL2 Synchroniser 1 */
@@ -260,8 +258,6 @@ static const struct reg_default cs47l90_reg_default[] = {
 	{ 0x00000434, 0x0000 }, /* R1076 (0x434) - Output Path Config 5R */
 	{ 0x00000435, 0x0180 }, /* R1077 (0x435) - DAC Digital Volume 5R */
 	{ 0x00000437, 0x0200 }, /* R1079 (0x437) - Noise Gate Select 5R */
-	{ 0x00000440, 0x003f }, /* R1088 (0x440) - DRE Enable */
-	{ 0x00000448, 0x003f }, /* R1096 (0x448) - eDRE Enable */
 	{ 0x00000450, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 1 */
 	{ 0x00000451, 0x0000 }, /* R1104 (0x450) - DAC AEC Control 2 */
 	{ 0x00000458, 0x0000 }, /* R1112 (0x458) - Noise Gate Control */
@@ -1262,40 +1258,6 @@ static const struct reg_default cs47l90_reg_default[] = {
 	{ 0x00000fc3, 0x0000 }, /* R4035 (0xfc3) - ANC Coefficient */
 	{ 0x00000fc4, 0x0000 }, /* R4036 (0xfc4) - ANC Coefficient */
 	{ 0x00000fc5, 0x0000 }, /* R4037 (0xfc5) - ANC Coefficient */
-	{ 0x00001300, 0x050E }, /* R4864 (0x1300) - DAC Comp 1 */
-	{ 0x00001302, 0x0101 }, /* R4866 (0x1302) - DAC Comp 2 */
-	{ 0x00001380, 0x0425 }, /* R4992 (0x1380) - FRF Coefficient 1L 1 */
-	{ 0x00001381, 0xF6D8 }, /* R4993 (0x1381) - FRF Coefficient 1L 2 */
-	{ 0x00001382, 0x0632 }, /* R4994 (0x1382) - FRF Coefficient 1L 3 */
-	{ 0x00001383, 0xFEC8 }, /* R4995 (0x1383) - FRF Coefficient 1L 4 */
-	{ 0x00001390, 0x042F }, /* R5008 (0x1390) - FRF Coefficient 1R 1 */
-	{ 0x00001391, 0xF6CA }, /* R5009 (0x1391) - FRF Coefficient 1R 2 */
-	{ 0x00001392, 0x0637 }, /* R5010 (0x1392) - FRF Coefficient 1R 3 */
-	{ 0x00001393, 0xFEC8 }, /* R5011 (0x1393) - FRF Coefficient 1R 4 */
-	{ 0x000013a0, 0x0000 }, /* R5024 (0x13a0) - FRF Coefficient 2L 1 */
-	{ 0x000013a1, 0x0000 }, /* R5025 (0x13a1) - FRF Coefficient 2L 2 */
-	{ 0x000013a2, 0x0000 }, /* R5026 (0x13a2) - FRF Coefficient 2L 3 */
-	{ 0x000013a3, 0x0000 }, /* R5027 (0x13a3) - FRF Coefficient 2L 4 */
-	{ 0x000013b0, 0x0000 }, /* R5040 (0x13b0) - FRF Coefficient 2R 1 */
-	{ 0x000013b1, 0x0000 }, /* R5041 (0x13b1) - FRF Coefficient 2R 2 */
-	{ 0x000013b2, 0x0000 }, /* R5042 (0x13b2) - FRF Coefficient 2R 3 */
-	{ 0x000013b3, 0x0000 }, /* R5043 (0x13b3) - FRF Coefficient 2R 4 */
-	{ 0x000013c0, 0x0000 }, /* R5040 (0x13c0) - FRF Coefficient 3L 1 */
-	{ 0x000013c1, 0x0000 }, /* R5041 (0x13c1) - FRF Coefficient 3L 2 */
-	{ 0x000013c2, 0x0000 }, /* R5042 (0x13c2) - FRF Coefficient 3L 3 */
-	{ 0x000013c3, 0x0000 }, /* R5043 (0x13c3) - FRF Coefficient 3L 4 */
-	{ 0x000013d0, 0x0000 }, /* R5072 (0x13d0) - FRF Coefficient 3R 1 */
-	{ 0x000013d1, 0x0000 }, /* R5073 (0x13d1) - FRF Coefficient 3R 2 */
-	{ 0x000013d2, 0x0000 }, /* R5074 (0x13d2) - FRF Coefficient 3R 3 */
-	{ 0x000013d3, 0x0000 }, /* R5075 (0x13d3) - FRF Coefficient 3R 4 */
-	{ 0x00001400, 0x0000 }, /* R5120 (0x1400) - FRF Coefficient 5L 1 */
-	{ 0x00001401, 0x0000 }, /* R5121 (0x1401) - FRF Coefficient 5L 2 */
-	{ 0x00001402, 0x0000 }, /* R5122 (0x1402) - FRF Coefficient 5L 3 */
-	{ 0x00001403, 0x0000 }, /* R5123 (0x1403) - FRF Coefficient 5L 4 */
-	{ 0x00001410, 0x0000 }, /* R5136 (0x1410) - FRF Coefficient 5R 1 */
-	{ 0x00001411, 0x0000 }, /* R5137 (0x1411) - FRF Coefficient 5R 2 */
-	{ 0x00001412, 0x0000 }, /* R5138 (0x1412) - FRF Coefficient 5R 3 */
-	{ 0x00001413, 0x0000 }, /* R5139 (0x1413) - FRF Coefficient 5R 4 */
 	{ 0x00001480, 0x0000 }, /* R5248 (0x1480) - DFC1_CTRL */
 	{ 0x00001482, 0x1f00 }, /* R5250 (0x1482) - DFC1_RX */
 	{ 0x00001484, 0x1f00 }, /* R5252 (0x1486) - DFC1_TX */
@@ -1535,7 +1497,6 @@ static bool cs47l90_16bit_readable_register(struct device *dev,
 	case MADERA_FLL1_CONTROL_6:
 	case MADERA_FLL1_CONTROL_7:
 	case MADERA_FLL1_EFS_2:
-	case MADERA_FLL1_LOOP_FILTER_TEST_1:
 	case MADERA_FLL1_SYNCHRONISER_1:
 	case MADERA_FLL1_SYNCHRONISER_2:
 	case MADERA_FLL1_SYNCHRONISER_3:
@@ -1553,7 +1514,6 @@ static bool cs47l90_16bit_readable_register(struct device *dev,
 	case MADERA_FLL2_CONTROL_6:
 	case MADERA_FLL2_CONTROL_7:
 	case MADERA_FLL2_EFS_2:
-	case MADERA_FLL2_LOOP_FILTER_TEST_1:
 	case MADERA_FLL2_SYNCHRONISER_1:
 	case MADERA_FLL2_SYNCHRONISER_2:
 	case MADERA_FLL2_SYNCHRONISER_3:
@@ -1690,8 +1650,6 @@ static bool cs47l90_16bit_readable_register(struct device *dev,
 	case MADERA_OUTPUT_PATH_CONFIG_5R:
 	case MADERA_DAC_DIGITAL_VOLUME_5R:
 	case MADERA_NOISE_GATE_SELECT_5R:
-	case MADERA_DRE_ENABLE:
-	case MADERA_EDRE_ENABLE:
 	case MADERA_DAC_AEC_CONTROL_1:
 	case MADERA_DAC_AEC_CONTROL_2:
 	case MADERA_NOISE_GATE_CONTROL:
@@ -2449,40 +2407,6 @@ static bool cs47l90_16bit_readable_register(struct device *dev,
 	case MADERA_FCR_FILTER_CONTROL:
 	case MADERA_FCR_ADC_REFORMATTER_CONTROL:
 	case MADERA_FCR_COEFF_START ... MADERA_FCR_COEFF_END:
-	case MADERA_DAC_COMP_1:
-	case MADERA_DAC_COMP_2:
-	case MADERA_FRF_COEFFICIENT_1L_1:
-	case MADERA_FRF_COEFFICIENT_1L_2:
-	case MADERA_FRF_COEFFICIENT_1L_3:
-	case MADERA_FRF_COEFFICIENT_1L_4:
-	case MADERA_FRF_COEFFICIENT_1R_1:
-	case MADERA_FRF_COEFFICIENT_1R_2:
-	case MADERA_FRF_COEFFICIENT_1R_3:
-	case MADERA_FRF_COEFFICIENT_1R_4:
-	case MADERA_FRF_COEFFICIENT_2L_1:
-	case MADERA_FRF_COEFFICIENT_2L_2:
-	case MADERA_FRF_COEFFICIENT_2L_3:
-	case MADERA_FRF_COEFFICIENT_2L_4:
-	case MADERA_FRF_COEFFICIENT_2R_1:
-	case MADERA_FRF_COEFFICIENT_2R_2:
-	case MADERA_FRF_COEFFICIENT_2R_3:
-	case MADERA_FRF_COEFFICIENT_2R_4:
-	case MADERA_FRF_COEFFICIENT_3L_1:
-	case MADERA_FRF_COEFFICIENT_3L_2:
-	case MADERA_FRF_COEFFICIENT_3L_3:
-	case MADERA_FRF_COEFFICIENT_3L_4:
-	case MADERA_FRF_COEFFICIENT_3R_1:
-	case MADERA_FRF_COEFFICIENT_3R_2:
-	case MADERA_FRF_COEFFICIENT_3R_3:
-	case MADERA_FRF_COEFFICIENT_3R_4:
-	case MADERA_FRF_COEFFICIENT_5L_1:
-	case MADERA_FRF_COEFFICIENT_5L_2:
-	case MADERA_FRF_COEFFICIENT_5L_3:
-	case MADERA_FRF_COEFFICIENT_5L_4:
-	case MADERA_FRF_COEFFICIENT_5R_1:
-	case MADERA_FRF_COEFFICIENT_5R_2:
-	case MADERA_FRF_COEFFICIENT_5R_3:
-	case MADERA_FRF_COEFFICIENT_5R_4:
 	case MADERA_DFC1_CTRL:
 	case MADERA_DFC1_RX:
 	case MADERA_DFC1_TX:
diff --git a/drivers/mfd/cs47l92-tables.c b/drivers/mfd/cs47l92-tables.c
index 3dc1fefe68f5..c8a234381350 100644
--- a/drivers/mfd/cs47l92-tables.c
+++ b/drivers/mfd/cs47l92-tables.c
@@ -1063,7 +1063,6 @@ static const struct reg_default cs47l92_reg_default[] = {
 	{ 0x0000185e, 0xffff }, /* R6238 (0x185e) - IRQ1 Mask 31 */
 	{ 0x0000185f, 0xffff }, /* R6239 (0x185f) - IRQ1 Mask 32 */
 	{ 0x00001860, 0x0001 }, /* R6240 (0x1860) - IRQ1 Mask 33 */
-	{ 0x00001948, 0x031f }, /* R6472 (0x1948) - IRQ2 Mask 9 */
 	{ 0x00001a06, 0x0000 }, /* R6662 (0x1a06) - Interrupt Debounce 7 */
 	{ 0x00001a80, 0x4400 }, /* R6784 (0x1a80) - IRQ1 Ctrl */
 };
diff --git a/include/linux/mfd/madera/registers.h b/include/linux/mfd/madera/registers.h
index 6439c0282ac6..53c2377b54b2 100644
--- a/include/linux/mfd/madera/registers.h
+++ b/include/linux/mfd/madera/registers.h
@@ -76,9 +76,7 @@
 #define MADERA_FLL1_CONTROL_4				0x174
 #define MADERA_FLL1_CONTROL_5				0x175
 #define MADERA_FLL1_CONTROL_6				0x176
-#define MADERA_FLL1_LOOP_FILTER_TEST_1			0x177
 #define CS47L92_FLL1_CONTROL_7				0x177
-#define MADERA_FLL1_NCO_TEST_0				0x178
 #define CS47L92_FLL1_CONTROL_8				0x178
 #define MADERA_FLL1_CONTROL_7				0x179
 #define CS47L92_FLL1_CONTROL_9				0x179
@@ -111,9 +109,7 @@
 #define MADERA_FLL2_CONTROL_4				0x194
 #define MADERA_FLL2_CONTROL_5				0x195
 #define MADERA_FLL2_CONTROL_6				0x196
-#define MADERA_FLL2_LOOP_FILTER_TEST_1			0x197
 #define CS47L92_FLL2_CONTROL_7				0x197
-#define MADERA_FLL2_NCO_TEST_0				0x198
 #define CS47L92_FLL2_CONTROL_8				0x198
 #define MADERA_FLL2_CONTROL_7				0x199
 #define CS47L92_FLL2_CONTROL_9				0x199
@@ -137,8 +133,6 @@
 #define MADERA_FLL3_CONTROL_4				0x1B4
 #define MADERA_FLL3_CONTROL_5				0x1B5
 #define MADERA_FLL3_CONTROL_6				0x1B6
-#define MADERA_FLL3_LOOP_FILTER_TEST_1			0x1B7
-#define MADERA_FLL3_NCO_TEST_0				0x1B8
 #define MADERA_FLL3_CONTROL_7				0x1B9
 #define MADERA_FLL3_SYNCHRONISER_1			0x1C1
 #define MADERA_FLL3_SYNCHRONISER_2			0x1C2
@@ -304,9 +298,6 @@
 #define MADERA_OUTPUT_PATH_CONFIG_6R			0x43C
 #define MADERA_DAC_DIGITAL_VOLUME_6R			0x43D
 #define MADERA_NOISE_GATE_SELECT_6R			0x43F
-#define MADERA_DRE_ENABLE				0x440
-#define MADERA_EDRE_ENABLE				0x448
-#define MADERA_EDRE_MANUAL				0x44A
 #define MADERA_DAC_AEC_CONTROL_1			0x450
 #define MADERA_DAC_AEC_CONTROL_2			0x451
 #define MADERA_NOISE_GATE_CONTROL			0x458
@@ -1182,68 +1173,6 @@
 #define MADERA_FCR_COEFF_END				0xFC5
 #define MADERA_AUXPDM1_CTRL_0				0x10C0
 #define MADERA_AUXPDM1_CTRL_1				0x10C1
-#define MADERA_DAC_COMP_1				0x1300
-#define MADERA_DAC_COMP_2				0x1302
-#define MADERA_FRF_COEFFICIENT_1L_1			0x1380
-#define MADERA_FRF_COEFFICIENT_1L_2			0x1381
-#define MADERA_FRF_COEFFICIENT_1L_3			0x1382
-#define MADERA_FRF_COEFFICIENT_1L_4			0x1383
-#define MADERA_FRF_COEFFICIENT_1R_1			0x1390
-#define MADERA_FRF_COEFFICIENT_1R_2			0x1391
-#define MADERA_FRF_COEFFICIENT_1R_3			0x1392
-#define MADERA_FRF_COEFFICIENT_1R_4			0x1393
-#define MADERA_FRF_COEFFICIENT_2L_1			0x13A0
-#define MADERA_FRF_COEFFICIENT_2L_2			0x13A1
-#define MADERA_FRF_COEFFICIENT_2L_3			0x13A2
-#define MADERA_FRF_COEFFICIENT_2L_4			0x13A3
-#define MADERA_FRF_COEFFICIENT_2R_1			0x13B0
-#define MADERA_FRF_COEFFICIENT_2R_2			0x13B1
-#define MADERA_FRF_COEFFICIENT_2R_3			0x13B2
-#define MADERA_FRF_COEFFICIENT_2R_4			0x13B3
-#define MADERA_FRF_COEFFICIENT_3L_1			0x13C0
-#define MADERA_FRF_COEFFICIENT_3L_2			0x13C1
-#define MADERA_FRF_COEFFICIENT_3L_3			0x13C2
-#define MADERA_FRF_COEFFICIENT_3L_4			0x13C3
-#define MADERA_FRF_COEFFICIENT_3R_1			0x13D0
-#define MADERA_FRF_COEFFICIENT_3R_2			0x13D1
-#define MADERA_FRF_COEFFICIENT_3R_3			0x13D2
-#define MADERA_FRF_COEFFICIENT_3R_4			0x13D3
-#define MADERA_FRF_COEFFICIENT_4L_1			0x13E0
-#define MADERA_FRF_COEFFICIENT_4L_2			0x13E1
-#define MADERA_FRF_COEFFICIENT_4L_3			0x13E2
-#define MADERA_FRF_COEFFICIENT_4L_4			0x13E3
-#define MADERA_FRF_COEFFICIENT_4R_1			0x13F0
-#define MADERA_FRF_COEFFICIENT_4R_2			0x13F1
-#define MADERA_FRF_COEFFICIENT_4R_3			0x13F2
-#define MADERA_FRF_COEFFICIENT_4R_4			0x13F3
-#define CS47L35_FRF_COEFFICIENT_4L_1			0x13A0
-#define CS47L35_FRF_COEFFICIENT_4L_2			0x13A1
-#define CS47L35_FRF_COEFFICIENT_4L_3			0x13A2
-#define CS47L35_FRF_COEFFICIENT_4L_4			0x13A3
-#define CS47L35_FRF_COEFFICIENT_5L_1			0x13B0
-#define CS47L35_FRF_COEFFICIENT_5L_2			0x13B1
-#define CS47L35_FRF_COEFFICIENT_5L_3			0x13B2
-#define CS47L35_FRF_COEFFICIENT_5L_4			0x13B3
-#define CS47L35_FRF_COEFFICIENT_5R_1			0x13C0
-#define CS47L35_FRF_COEFFICIENT_5R_2			0x13C1
-#define CS47L35_FRF_COEFFICIENT_5R_3			0x13C2
-#define CS47L35_FRF_COEFFICIENT_5R_4			0x13C3
-#define MADERA_FRF_COEFFICIENT_5L_1			0x1400
-#define MADERA_FRF_COEFFICIENT_5L_2			0x1401
-#define MADERA_FRF_COEFFICIENT_5L_3			0x1402
-#define MADERA_FRF_COEFFICIENT_5L_4			0x1403
-#define MADERA_FRF_COEFFICIENT_5R_1			0x1410
-#define MADERA_FRF_COEFFICIENT_5R_2			0x1411
-#define MADERA_FRF_COEFFICIENT_5R_3			0x1412
-#define MADERA_FRF_COEFFICIENT_5R_4			0x1413
-#define MADERA_FRF_COEFFICIENT_6L_1			0x1420
-#define MADERA_FRF_COEFFICIENT_6L_2			0x1421
-#define MADERA_FRF_COEFFICIENT_6L_3			0x1422
-#define MADERA_FRF_COEFFICIENT_6L_4			0x1423
-#define MADERA_FRF_COEFFICIENT_6R_1			0x1430
-#define MADERA_FRF_COEFFICIENT_6R_2			0x1431
-#define MADERA_FRF_COEFFICIENT_6R_3			0x1432
-#define MADERA_FRF_COEFFICIENT_6R_4			0x1433
 #define MADERA_DFC1_CTRL				0x1480
 #define MADERA_DFC1_RX					0x1482
 #define MADERA_DFC1_TX					0x1484
@@ -1573,15 +1502,6 @@
 #define MADERA_FLL1_REFCLK_SRC_SHIFT			     0
 #define MADERA_FLL1_REFCLK_SRC_WIDTH			     4
 
-/* (0x0177)  FLL1_Loop_Filter_Test_1 */
-#define MADERA_FLL1_FRC_INTEG_UPD			0x8000
-#define MADERA_FLL1_FRC_INTEG_UPD_MASK			0x8000
-#define MADERA_FLL1_FRC_INTEG_UPD_SHIFT			    15
-#define MADERA_FLL1_FRC_INTEG_UPD_WIDTH			     1
-#define MADERA_FLL1_FRC_INTEG_VAL_MASK			0x0FFF
-#define MADERA_FLL1_FRC_INTEG_VAL_SHIFT			     0
-#define MADERA_FLL1_FRC_INTEG_VAL_WIDTH			    12
-
 /* (0x0179)  FLL1_Control_7 */
 #define MADERA_FLL1_GAIN_MASK				0x003c
 #define MADERA_FLL1_GAIN_SHIFT				     2
-- 
cgit v1.2.3


From d6871a73387d51dfdde6ad1479aea54d3eafcc89 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Wed, 26 Jun 2019 14:33:36 +0100
Subject: mfd: madera: Fixup SPDX headers

GPL-2.0-only is the preferred way of expressing v2 of the GPL, so switch
to that. Remove some redundant copyright notices and correct some
instances where the wrong comment type has been used in header files.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cs47l15-tables.c         | 2 +-
 drivers/mfd/cs47l35-tables.c         | 6 +-----
 drivers/mfd/cs47l85-tables.c         | 6 +-----
 drivers/mfd/cs47l90-tables.c         | 6 +-----
 drivers/mfd/cs47l92-tables.c         | 2 +-
 drivers/mfd/madera-core.c            | 6 +-----
 drivers/mfd/madera-i2c.c             | 6 +-----
 drivers/mfd/madera-spi.c             | 6 +-----
 include/linux/mfd/madera/core.h      | 6 +-----
 include/linux/mfd/madera/pdata.h     | 6 +-----
 include/linux/mfd/madera/registers.h | 6 +-----
 11 files changed, 11 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/cs47l15-tables.c b/drivers/mfd/cs47l15-tables.c
index 73db8d03b531..f81b45336690 100644
--- a/drivers/mfd/cs47l15-tables.c
+++ b/drivers/mfd/cs47l15-tables.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regmap tables for CS47L15 codec
  *
diff --git a/drivers/mfd/cs47l35-tables.c b/drivers/mfd/cs47l35-tables.c
index fe838cbc2a7e..a0bc6c5100d6 100644
--- a/drivers/mfd/cs47l35-tables.c
+++ b/drivers/mfd/cs47l35-tables.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regmap tables for CS47L35 codec
  *
  * Copyright (C) 2015-2017 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/drivers/mfd/cs47l85-tables.c b/drivers/mfd/cs47l85-tables.c
index d0198b5e86ba..270d8eda3f5f 100644
--- a/drivers/mfd/cs47l85-tables.c
+++ b/drivers/mfd/cs47l85-tables.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regmap tables for CS47L85 codec
  *
  * Copyright (C) 2015-2017 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/drivers/mfd/cs47l90-tables.c b/drivers/mfd/cs47l90-tables.c
index 2c761fc241f3..7345fc09c0bb 100644
--- a/drivers/mfd/cs47l90-tables.c
+++ b/drivers/mfd/cs47l90-tables.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regmap tables for CS47L90 codec
  *
  * Copyright (C) 2015-2017 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/drivers/mfd/cs47l92-tables.c b/drivers/mfd/cs47l92-tables.c
index c8a234381350..f296e355df4d 100644
--- a/drivers/mfd/cs47l92-tables.c
+++ b/drivers/mfd/cs47l92-tables.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Regmap tables for CS47L92 codec
  *
diff --git a/drivers/mfd/madera-core.c b/drivers/mfd/madera-core.c
index b9e9c169c6cc..29540cbf7593 100644
--- a/drivers/mfd/madera-core.c
+++ b/drivers/mfd/madera-core.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Core MFD support for Cirrus Logic Madera codecs
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/drivers/mfd/madera-i2c.c b/drivers/mfd/madera-i2c.c
index 3f4ab5dcf5c3..6b965eb034b6 100644
--- a/drivers/mfd/madera-i2c.c
+++ b/drivers/mfd/madera-i2c.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * I2C bus interface to Cirrus Logic Madera codecs
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/drivers/mfd/madera-spi.c b/drivers/mfd/madera-spi.c
index d76c7e7376d7..e860f5ff0933 100644
--- a/drivers/mfd/madera-spi.c
+++ b/drivers/mfd/madera-spi.c
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * SPI bus interface to Cirrus Logic Madera codecs
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #include <linux/device.h>
diff --git a/include/linux/mfd/madera/core.h b/include/linux/mfd/madera/core.h
index 7b87f9a02ecc..7ffa696cce7c 100644
--- a/include/linux/mfd/madera/core.h
+++ b/include/linux/mfd/madera/core.h
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * MFD internals for Cirrus Logic Madera codecs
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #ifndef MADERA_CORE_H
diff --git a/include/linux/mfd/madera/pdata.h b/include/linux/mfd/madera/pdata.h
index dd00ab824e5b..ec0711bcad50 100644
--- a/include/linux/mfd/madera/pdata.h
+++ b/include/linux/mfd/madera/pdata.h
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Platform data for Cirrus Logic Madera codecs
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #ifndef MADERA_PDATA_H
diff --git a/include/linux/mfd/madera/registers.h b/include/linux/mfd/madera/registers.h
index 53c2377b54b2..fe909d177762 100644
--- a/include/linux/mfd/madera/registers.h
+++ b/include/linux/mfd/madera/registers.h
@@ -1,12 +1,8 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Madera register definitions
  *
  * Copyright (C) 2015-2018 Cirrus Logic
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2.
  */
 
 #ifndef MADERA_REGISTERS_H
-- 
cgit v1.2.3


From 07ec38917e68f0114b9c8aeeb1c584b5e73e4dd6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:01 +0200
Subject: mm: remove the struct hmm_device infrastructure

This code is a trivial wrapper around device model helpers, which
should have been integrated into the driver device model usage from
the start.  Assuming it actually had users, which it never had since
the code was added more than 1 1/2 years ago.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/hmm.h | 20 --------------
 mm/hmm.c            | 80 -----------------------------------------------------
 2 files changed, 100 deletions(-)

(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 044a36d7c3f8..99765be3284d 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -751,26 +751,6 @@ static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page)
 {
 	return page->hmm_data;
 }
-
-
-/*
- * struct hmm_device - fake device to hang device memory onto
- *
- * @device: device struct
- * @minor: device minor number
- */
-struct hmm_device {
-	struct device		device;
-	unsigned int		minor;
-};
-
-/*
- * A device driver that wants to handle multiple devices memory through a
- * single fake device can use hmm_device to do so. This is purely a helper and
- * it is not strictly needed, in order to make use of any HMM functionality.
- */
-struct hmm_device *hmm_device_new(void *drvdata);
-void hmm_device_put(struct hmm_device *hmm_device);
 #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
 #else /* IS_ENABLED(CONFIG_HMM) */
 static inline void hmm_mm_destroy(struct mm_struct *mm) {}
diff --git a/mm/hmm.c b/mm/hmm.c
index f702a3895d05..00cc642b3d7e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1528,84 +1528,4 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
 	return devmem;
 }
 EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
-
-/*
- * A device driver that wants to handle multiple devices memory through a
- * single fake device can use hmm_device to do so. This is purely a helper
- * and it is not needed to make use of any HMM functionality.
- */
-#define HMM_DEVICE_MAX 256
-
-static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
-static DEFINE_SPINLOCK(hmm_device_lock);
-static struct class *hmm_device_class;
-static dev_t hmm_device_devt;
-
-static void hmm_device_release(struct device *device)
-{
-	struct hmm_device *hmm_device;
-
-	hmm_device = container_of(device, struct hmm_device, device);
-	spin_lock(&hmm_device_lock);
-	clear_bit(hmm_device->minor, hmm_device_mask);
-	spin_unlock(&hmm_device_lock);
-
-	kfree(hmm_device);
-}
-
-struct hmm_device *hmm_device_new(void *drvdata)
-{
-	struct hmm_device *hmm_device;
-
-	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
-	if (!hmm_device)
-		return ERR_PTR(-ENOMEM);
-
-	spin_lock(&hmm_device_lock);
-	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
-	if (hmm_device->minor >= HMM_DEVICE_MAX) {
-		spin_unlock(&hmm_device_lock);
-		kfree(hmm_device);
-		return ERR_PTR(-EBUSY);
-	}
-	set_bit(hmm_device->minor, hmm_device_mask);
-	spin_unlock(&hmm_device_lock);
-
-	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
-	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
-					hmm_device->minor);
-	hmm_device->device.release = hmm_device_release;
-	dev_set_drvdata(&hmm_device->device, drvdata);
-	hmm_device->device.class = hmm_device_class;
-	device_initialize(&hmm_device->device);
-
-	return hmm_device;
-}
-EXPORT_SYMBOL(hmm_device_new);
-
-void hmm_device_put(struct hmm_device *hmm_device)
-{
-	put_device(&hmm_device->device);
-}
-EXPORT_SYMBOL(hmm_device_put);
-
-static int __init hmm_init(void)
-{
-	int ret;
-
-	ret = alloc_chrdev_region(&hmm_device_devt, 0,
-				  HMM_DEVICE_MAX,
-				  "hmm_device");
-	if (ret)
-		return ret;
-
-	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
-	if (IS_ERR(hmm_device_class)) {
-		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
-		return PTR_ERR(hmm_device_class);
-	}
-	return 0;
-}
-
-device_initcall(hmm_init);
 #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
-- 
cgit v1.2.3


From 35f2c14d2a076b063a76c5bf275c46c0743ba3a0 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 26 Jun 2019 15:38:43 -0700
Subject: platform/x86: ISST: Add common API to register and handle ioctls

Encapsulate common functions which all Intel Speed Select Technology
interface drivers can use. This creates API to register misc device for
user kernel communication and handle all common IOCTLs. As part of the
registry it allows a callback which is to handle domain specific ioctl
processing.

There can be multiple drivers register for services, which can be built
as modules. So this driver handle contention during registry and as well
as during removal. Once user space opened the misc device, the registered
driver will be prevented from removal. Also once misc device is opened by
the user space new client driver can't register, till the misc device is
closed.

There are two types of client drivers, one to handle mail box interface
and the other is to allow direct read/write to some specific MMIO space.

This common driver implements IOCTL ISST_IF_GET_PLATFORM_INFO.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/platform/x86/Kconfig                       |   2 +
 drivers/platform/x86/Makefile                      |   1 +
 drivers/platform/x86/intel_speed_select_if/Kconfig |  17 ++
 .../platform/x86/intel_speed_select_if/Makefile    |   7 +
 .../x86/intel_speed_select_if/isst_if_common.c     | 182 +++++++++++++++++++++
 .../x86/intel_speed_select_if/isst_if_common.h     |  60 +++++++
 include/uapi/linux/isst_if.h                       |  41 +++++
 7 files changed, 310 insertions(+)
 create mode 100644 drivers/platform/x86/intel_speed_select_if/Kconfig
 create mode 100644 drivers/platform/x86/intel_speed_select_if/Makefile
 create mode 100644 drivers/platform/x86/intel_speed_select_if/isst_if_common.c
 create mode 100644 drivers/platform/x86/intel_speed_select_if/isst_if_common.h
 create mode 100644 include/uapi/linux/isst_if.h

(limited to 'include')

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 58494a12a9b0..ebd44d071f7b 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1336,6 +1336,8 @@ config PCENGINES_APU2
 	  To compile this driver as a module, choose M here: the module
 	  will be called pcengines-apuv2.
 
+source "drivers/platform/x86/intel_speed_select_if/Kconfig"
+
 endif # X86_PLATFORM_DEVICES
 
 config PMC_ATOM
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index f64445d69f99..3a62157e9062 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -99,3 +99,4 @@ obj-$(CONFIG_INTEL_MRFLD_PWRBTN)	+= intel_mrfld_pwrbtn.o
 obj-$(CONFIG_I2C_MULTI_INSTANTIATE)	+= i2c-multi-instantiate.o
 obj-$(CONFIG_INTEL_ATOMISP2_PM)	+= intel_atomisp2_pm.o
 obj-$(CONFIG_PCENGINES_APU2)	+= pcengines-apuv2.o
+obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += intel_speed_select_if/
diff --git a/drivers/platform/x86/intel_speed_select_if/Kconfig b/drivers/platform/x86/intel_speed_select_if/Kconfig
new file mode 100644
index 000000000000..ce3e3dc076d2
--- /dev/null
+++ b/drivers/platform/x86/intel_speed_select_if/Kconfig
@@ -0,0 +1,17 @@
+menu "Intel Speed Select Technology interface support"
+	depends on PCI
+	depends on X86_64 || COMPILE_TEST
+
+config INTEL_SPEED_SELECT_INTERFACE
+	tristate "Intel(R) Speed Select Technology interface drivers"
+	help
+	  This config enables the Intel(R) Speed Select Technology interface
+	  drivers. The Intel(R) speed select technology features are non
+	  architectural and only supported on specific Xeon(R) servers.
+	  These drivers provide interface to directly communicate with hardware
+	  via MMIO and Mail boxes to enumerate and control all the speed select
+	  features.
+
+	  Enable this config, if there is a need to enable and control the
+	  Intel(R) Speed Select Technology features from the user space.
+endmenu
diff --git a/drivers/platform/x86/intel_speed_select_if/Makefile b/drivers/platform/x86/intel_speed_select_if/Makefile
new file mode 100644
index 000000000000..c12687672fc9
--- /dev/null
+++ b/drivers/platform/x86/intel_speed_select_if/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile - Intel Speed Select Interface drivers
+# Copyright (c) 2019, Intel Corporation.
+#
+
+obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_common.o
diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
new file mode 100644
index 000000000000..ab2bb4862dc8
--- /dev/null
+++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Speed Select Interface: Common functions
+ * Copyright (c) 2019, Intel Corporation.
+ * All rights reserved.
+ *
+ * Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/isst_if.h>
+
+#include "isst_if_common.h"
+
+static struct isst_if_cmd_cb punit_callbacks[ISST_IF_DEV_MAX];
+
+static int isst_if_get_platform_info(void __user *argp)
+{
+	struct isst_if_platform_info info;
+
+	info.api_version = ISST_IF_API_VERSION,
+	info.driver_version = ISST_IF_DRIVER_VERSION,
+	info.max_cmds_per_ioctl = ISST_IF_CMD_LIMIT,
+	info.mbox_supported = punit_callbacks[ISST_IF_DEV_MBOX].registered;
+	info.mmio_supported = punit_callbacks[ISST_IF_DEV_MMIO].registered;
+
+	if (copy_to_user(argp, &info, sizeof(info)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static long isst_if_def_ioctl(struct file *file, unsigned int cmd,
+			      unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	long ret = -ENOTTY;
+
+	switch (cmd) {
+	case ISST_IF_GET_PLATFORM_INFO:
+		ret = isst_if_get_platform_info(argp);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+static DEFINE_MUTEX(punit_misc_dev_lock);
+static int misc_usage_count;
+static int misc_device_ret;
+static int misc_device_open;
+
+static int isst_if_open(struct inode *inode, struct file *file)
+{
+	int i, ret = 0;
+
+	/* Fail open, if a module is going away */
+	mutex_lock(&punit_misc_dev_lock);
+	for (i = 0; i < ISST_IF_DEV_MAX; ++i) {
+		struct isst_if_cmd_cb *cb = &punit_callbacks[i];
+
+		if (cb->registered && !try_module_get(cb->owner)) {
+			ret = -ENODEV;
+			break;
+		}
+	}
+	if (ret) {
+		int j;
+
+		for (j = 0; j < i; ++j) {
+			struct isst_if_cmd_cb *cb;
+
+			cb = &punit_callbacks[j];
+			if (cb->registered)
+				module_put(cb->owner);
+		}
+	} else {
+		misc_device_open++;
+	}
+	mutex_unlock(&punit_misc_dev_lock);
+
+	return ret;
+}
+
+static int isst_if_relase(struct inode *inode, struct file *f)
+{
+	int i;
+
+	mutex_lock(&punit_misc_dev_lock);
+	misc_device_open--;
+	for (i = 0; i < ISST_IF_DEV_MAX; ++i) {
+		struct isst_if_cmd_cb *cb = &punit_callbacks[i];
+
+		if (cb->registered)
+			module_put(cb->owner);
+	}
+	mutex_unlock(&punit_misc_dev_lock);
+
+	return 0;
+}
+
+static const struct file_operations isst_if_char_driver_ops = {
+	.open = isst_if_open,
+	.unlocked_ioctl = isst_if_def_ioctl,
+	.release = isst_if_relase,
+};
+
+static struct miscdevice isst_if_char_driver = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "isst_interface",
+	.fops		= &isst_if_char_driver_ops,
+};
+
+/**
+ * isst_if_cdev_register() - Register callback for IOCTL
+ * @device_type: The device type this callback handling.
+ * @cb:	Callback structure.
+ *
+ * This function registers a callback to device type. On very first call
+ * it will register a misc device, which is used for user kernel interface.
+ * Other calls simply increment ref count. Registry will fail, if the user
+ * already opened misc device for operation. Also if the misc device
+ * creation failed, then it will not try again and all callers will get
+ * failure code.
+ *
+ * Return: Return the return value from the misc creation device or -EINVAL
+ * for unsupported device type.
+ */
+int isst_if_cdev_register(int device_type, struct isst_if_cmd_cb *cb)
+{
+	if (misc_device_ret)
+		return misc_device_ret;
+
+	if (device_type >= ISST_IF_DEV_MAX)
+		return -EINVAL;
+
+	mutex_lock(&punit_misc_dev_lock);
+	if (misc_device_open) {
+		mutex_unlock(&punit_misc_dev_lock);
+		return -EAGAIN;
+	}
+	if (!misc_usage_count) {
+		misc_device_ret = misc_register(&isst_if_char_driver);
+		if (misc_device_ret)
+			goto unlock_exit;
+	}
+	memcpy(&punit_callbacks[device_type], cb, sizeof(*cb));
+	punit_callbacks[device_type].registered = 1;
+	misc_usage_count++;
+unlock_exit:
+	mutex_unlock(&punit_misc_dev_lock);
+
+	return misc_device_ret;
+}
+EXPORT_SYMBOL_GPL(isst_if_cdev_register);
+
+/**
+ * isst_if_cdev_unregister() - Unregister callback for IOCTL
+ * @device_type: The device type to unregister.
+ *
+ * This function unregisters the previously registered callback. If this
+ * is the last callback unregistering, then misc device is removed.
+ *
+ * Return: None.
+ */
+void isst_if_cdev_unregister(int device_type)
+{
+	mutex_lock(&punit_misc_dev_lock);
+	misc_usage_count--;
+	punit_callbacks[device_type].registered = 0;
+	if (!misc_usage_count && !misc_device_ret)
+		misc_deregister(&isst_if_char_driver);
+	mutex_unlock(&punit_misc_dev_lock);
+}
+EXPORT_SYMBOL_GPL(isst_if_cdev_unregister);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h
new file mode 100644
index 000000000000..11f339226fb4
--- /dev/null
+++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Intel Speed Select Interface: Drivers Internal defines
+ * Copyright (c) 2019, Intel Corporation.
+ * All rights reserved.
+ *
+ * Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+ */
+
+#ifndef __ISST_IF_COMMON_H
+#define __ISST_IF_COMMON_H
+
+/*
+ * Validate maximum commands in a single request.
+ * This is enough to handle command to every core in one ioctl, or all
+ * possible message id to one CPU. Limit is also helpful for resonse time
+ * per IOCTL request, as PUNIT may take different times to process each
+ * request and may hold for long for too many commands.
+ */
+#define ISST_IF_CMD_LIMIT	64
+
+#define ISST_IF_API_VERSION	0x01
+#define ISST_IF_DRIVER_VERSION	0x01
+
+#define ISST_IF_DEV_MBOX	0
+#define ISST_IF_DEV_MMIO	1
+#define ISST_IF_DEV_MAX		2
+
+/**
+ * struct isst_if_cmd_cb - Used to register a IOCTL handler
+ * @registered:	Used by the common code to store registry. Caller don't
+ *		to touch this field
+ * @cmd_size:	The command size of the individual command in IOCTL
+ * @offset:	Offset to the first valid member in command structure.
+ *		This will be the offset of the start of the command
+ *		after command count field
+ * @cmd_callback: Callback function to handle IOCTL. The callback has the
+ *		command pointer with data for command. There is a pointer
+ *		called write_only, which when set, will not copy the
+ *		response to user ioctl buffer. The "resume" argument
+ *		can be used to avoid storing the command for replay
+ *		during system resume
+ *
+ * This structure is used to register an handler for IOCTL. To avoid
+ * code duplication common code handles all the IOCTL command read/write
+ * including handling multiple command in single IOCTL. The caller just
+ * need to execute a command via the registered callback.
+ */
+struct isst_if_cmd_cb {
+	int registered;
+	int cmd_size;
+	int offset;
+	struct module *owner;
+	long (*cmd_callback)(u8 *ptr, int *write_only, int resume);
+};
+
+/* Internal interface functions */
+int isst_if_cdev_register(int type, struct isst_if_cmd_cb *cb);
+void isst_if_cdev_unregister(int type);
+#endif
diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
new file mode 100644
index 000000000000..fa94480b5f74
--- /dev/null
+++ b/include/uapi/linux/isst_if.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Intel Speed Select Interface: OS to hardware Interface
+ * Copyright (c) 2019, Intel Corporation.
+ * All rights reserved.
+ *
+ * Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+ */
+
+#ifndef __ISST_IF_H
+#define __ISST_IF_H
+
+#include <linux/types.h>
+
+/**
+ * struct isst_if_platform_info - Define platform information
+ * @api_version:	Version of the firmware document, which this driver
+ *			can communicate
+ * @driver_version:	Driver version, which will help user to send right
+ *			commands. Even if the firmware is capable, driver may
+ *			not be ready
+ * @max_cmds_per_ioctl:	Returns the maximum number of commands driver will
+ *			accept in a single ioctl
+ * @mbox_supported:	Support of mail box interface
+ * @mmio_supported:	Support of mmio interface for core-power feature
+ *
+ * Used to return output of IOCTL ISST_IF_GET_PLATFORM_INFO. This
+ * information can be used by the user space, to get the driver, firmware
+ * support and also number of commands to send in a single IOCTL request.
+ */
+struct isst_if_platform_info {
+	__u16 api_version;
+	__u16 driver_version;
+	__u16 max_cmds_per_ioctl;
+	__u8 mbox_supported;
+	__u8 mmio_supported;
+};
+
+#define ISST_IF_MAGIC			0xFE
+#define ISST_IF_GET_PLATFORM_INFO	_IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *)
+#endif
-- 
cgit v1.2.3


From fb5b36a413b9f30fba573fc2a596ab7142dfaf12 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 26 Jun 2019 15:38:45 -0700
Subject: platform/x86: ISST: Add IOCTL to Translate Linux logical CPU to PUNIT
 CPU number

Add processing for IOCTL command ISST_IF_GET_PHY_ID. This converts from the
Linux logical CPU to PUNIT CPU numbering scheme.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 .../x86/intel_speed_select_if/isst_if_common.c     | 74 ++++++++++++++++++++++
 include/uapi/linux/isst_if.h                       | 28 ++++++++
 2 files changed, 102 insertions(+)

(limited to 'include')

diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
index 0e16cbf685d0..72e74d72724b 100644
--- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
+++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
@@ -134,16 +134,90 @@ static void isst_if_cpu_info_exit(void)
 	kfree(isst_cpu_info);
 };
 
+static long isst_if_proc_phyid_req(u8 *cmd_ptr, int *write_only, int resume)
+{
+	struct isst_if_cpu_map *cpu_map;
+
+	cpu_map = (struct isst_if_cpu_map *)cmd_ptr;
+	if (cpu_map->logical_cpu >= nr_cpu_ids ||
+	    cpu_map->logical_cpu >= num_possible_cpus())
+		return -EINVAL;
+
+	*write_only = 0;
+	cpu_map->physical_cpu = isst_cpu_info[cpu_map->logical_cpu].punit_cpu_id;
+
+	return 0;
+}
+
+static long isst_if_exec_multi_cmd(void __user *argp, struct isst_if_cmd_cb *cb)
+{
+	unsigned char __user *ptr;
+	u32 cmd_count;
+	u8 *cmd_ptr;
+	long ret;
+	int i;
+
+	/* Each multi command has u32 command count as the first field */
+	if (copy_from_user(&cmd_count, argp, sizeof(cmd_count)))
+		return -EFAULT;
+
+	if (!cmd_count || cmd_count > ISST_IF_CMD_LIMIT)
+		return -EINVAL;
+
+	cmd_ptr = kmalloc(cb->cmd_size, GFP_KERNEL);
+	if (!cmd_ptr)
+		return -ENOMEM;
+
+	/* cb->offset points to start of the command after the command count */
+	ptr = argp + cb->offset;
+
+	for (i = 0; i < cmd_count; ++i) {
+		int wr_only;
+
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		if (copy_from_user(cmd_ptr, ptr, cb->cmd_size)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = cb->cmd_callback(cmd_ptr, &wr_only, 0);
+		if (ret)
+			break;
+
+		if (!wr_only && copy_to_user(ptr, cmd_ptr, cb->cmd_size)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ptr += cb->cmd_size;
+	}
+
+	kfree(cmd_ptr);
+
+	return i ? i : ret;
+}
+
 static long isst_if_def_ioctl(struct file *file, unsigned int cmd,
 			      unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
+	struct isst_if_cmd_cb cmd_cb;
 	long ret = -ENOTTY;
 
 	switch (cmd) {
 	case ISST_IF_GET_PLATFORM_INFO:
 		ret = isst_if_get_platform_info(argp);
 		break;
+	case ISST_IF_GET_PHY_ID:
+		cmd_cb.cmd_size = sizeof(struct isst_if_cpu_map);
+		cmd_cb.offset = offsetof(struct isst_if_cpu_maps, cpu_map);
+		cmd_cb.cmd_callback = isst_if_proc_phyid_req;
+		ret = isst_if_exec_multi_cmd(argp, &cmd_cb);
+		break;
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
index fa94480b5f74..15d1f286a830 100644
--- a/include/uapi/linux/isst_if.h
+++ b/include/uapi/linux/isst_if.h
@@ -36,6 +36,34 @@ struct isst_if_platform_info {
 	__u8 mmio_supported;
 };
 
+/**
+ * struct isst_if_cpu_map - CPU mapping between logical and physical CPU
+ * @logical_cpu:	Linux logical CPU number
+ * @physical_cpu:	PUNIT CPU number
+ *
+ * Used to convert from Linux logical CPU to PUNIT CPU numbering scheme.
+ * The PUNIT CPU number is different than APIC ID based CPU numbering.
+ */
+struct isst_if_cpu_map {
+	__u32 logical_cpu;
+	__u32 physical_cpu;
+};
+
+/**
+ * struct isst_if_cpu_maps - structure for CPU map IOCTL
+ * @cmd_count:	Number of CPU mapping command in cpu_map[]
+ * @cpu_map[]:	Holds one or more CPU map data structure
+ *
+ * This structure used with ioctl ISST_IF_GET_PHY_ID to send
+ * one or more CPU mapping commands. Here IOCTL return value indicates
+ * number of commands sent or error number if no commands have been sent.
+ */
+struct isst_if_cpu_maps {
+	__u32 cmd_count;
+	struct isst_if_cpu_map cpu_map[1];
+};
+
 #define ISST_IF_MAGIC			0xFE
 #define ISST_IF_GET_PLATFORM_INFO	_IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *)
+#define ISST_IF_GET_PHY_ID		_IOWR(ISST_IF_MAGIC, 1, struct isst_if_cpu_map *)
 #endif
-- 
cgit v1.2.3


From d3a23584294c1f379239a3b52bac13e03fecd147 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 26 Jun 2019 15:38:46 -0700
Subject: platform/x86: ISST: Add Intel Speed Select mmio interface

Added MMIO interface to read/write specific offsets in PUNIT PCI device
which export core priortization. This MMIO interface can be used using
ioctl interface on /dev/isst_interface using IOCTL ISST_IF_IO_CMD.

This MMIO interface is used by the intel-speed-select tool under
tools/x86/power to enumerate and set core priority. The MMIO offsets and
semantics of the message can be checked from the source code of the tool.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 .../platform/x86/intel_speed_select_if/Makefile    |   1 +
 .../x86/intel_speed_select_if/isst_if_common.c     |   6 +
 .../x86/intel_speed_select_if/isst_if_common.h     |   2 +
 .../x86/intel_speed_select_if/isst_if_mmio.c       | 131 +++++++++++++++++++++
 include/uapi/linux/isst_if.h                       |  33 ++++++
 5 files changed, 173 insertions(+)
 create mode 100644 drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c

(limited to 'include')

diff --git a/drivers/platform/x86/intel_speed_select_if/Makefile b/drivers/platform/x86/intel_speed_select_if/Makefile
index c12687672fc9..7e94919208d3 100644
--- a/drivers/platform/x86/intel_speed_select_if/Makefile
+++ b/drivers/platform/x86/intel_speed_select_if/Makefile
@@ -5,3 +5,4 @@
 #
 
 obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_common.o
+obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_mmio.o
diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
index 72e74d72724b..3f96a3925bc6 100644
--- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
+++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
@@ -206,6 +206,7 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd,
 {
 	void __user *argp = (void __user *)arg;
 	struct isst_if_cmd_cb cmd_cb;
+	struct isst_if_cmd_cb *cb;
 	long ret = -ENOTTY;
 
 	switch (cmd) {
@@ -218,6 +219,11 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd,
 		cmd_cb.cmd_callback = isst_if_proc_phyid_req;
 		ret = isst_if_exec_multi_cmd(argp, &cmd_cb);
 		break;
+	case ISST_IF_IO_CMD:
+		cb = &punit_callbacks[ISST_IF_DEV_MMIO];
+		if (cb->registered)
+			ret = isst_if_exec_multi_cmd(argp, cb);
+		break;
 	default:
 		break;
 	}
diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h
index dade77c58b22..cdc7d019748a 100644
--- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h
+++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h
@@ -10,6 +10,8 @@
 #ifndef __ISST_IF_COMMON_H
 #define __ISST_IF_COMMON_H
 
+#define INTEL_RAPL_PRIO_DEVID_0	0x3451
+
 /*
  * Validate maximum commands in a single request.
  * This is enough to handle command to every core in one ioctl, or all
diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c
new file mode 100644
index 000000000000..1c25a1235b9e
--- /dev/null
+++ b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Speed Select Interface: MMIO Interface
+ * Copyright (c) 2019, Intel Corporation.
+ * All rights reserved.
+ *
+ * Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/isst_if.h>
+
+#include "isst_if_common.h"
+
+struct isst_if_device {
+	void __iomem *punit_mmio;
+	struct mutex mutex;
+};
+
+static long isst_if_mmio_rd_wr(u8 *cmd_ptr, int *write_only, int resume)
+{
+	struct isst_if_device *punit_dev;
+	struct isst_if_io_reg *io_reg;
+	struct pci_dev *pdev;
+
+	io_reg = (struct isst_if_io_reg *)cmd_ptr;
+	if (io_reg->reg < 0x04 || io_reg->reg > 0xD0)
+		return -EINVAL;
+
+	if (io_reg->read_write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	pdev = isst_if_get_pci_dev(io_reg->logical_cpu, 0, 0, 1);
+	if (!pdev)
+		return -EINVAL;
+
+	punit_dev = pci_get_drvdata(pdev);
+	if (!punit_dev)
+		return -EINVAL;
+
+	/*
+	 * Ensure that operation is complete on a PCI device to avoid read
+	 * write race by using per PCI device mutex.
+	 */
+	mutex_lock(&punit_dev->mutex);
+	if (io_reg->read_write) {
+		writel(io_reg->value, punit_dev->punit_mmio+io_reg->reg);
+		*write_only = 1;
+	} else {
+		io_reg->value = readl(punit_dev->punit_mmio+io_reg->reg);
+		*write_only = 0;
+	}
+	mutex_unlock(&punit_dev->mutex);
+
+	return 0;
+}
+
+static const struct pci_device_id isst_if_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_RAPL_PRIO_DEVID_0)},
+	{ 0 },
+};
+MODULE_DEVICE_TABLE(pci, isst_if_ids);
+
+static int isst_if_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	struct isst_if_device *punit_dev;
+	struct isst_if_cmd_cb cb;
+	u32 mmio_base, pcu_base;
+	u64 base_addr;
+	int ret;
+
+	punit_dev = devm_kzalloc(&pdev->dev, sizeof(*punit_dev), GFP_KERNEL);
+	if (!punit_dev)
+		return -ENOMEM;
+
+	ret = pcim_enable_device(pdev);
+	if (ret)
+		return ret;
+
+	ret = pci_read_config_dword(pdev, 0xD0, &mmio_base);
+	if (ret)
+		return ret;
+
+	ret = pci_read_config_dword(pdev, 0xFC, &pcu_base);
+	if (ret)
+		return ret;
+
+	pcu_base &= GENMASK(10, 0);
+	base_addr = (u64)mmio_base << 23 | (u64) pcu_base << 12;
+	punit_dev->punit_mmio = devm_ioremap(&pdev->dev, base_addr, 256);
+	if (!punit_dev->punit_mmio)
+		return -ENOMEM;
+
+	mutex_init(&punit_dev->mutex);
+	pci_set_drvdata(pdev, punit_dev);
+
+	memset(&cb, 0, sizeof(cb));
+	cb.cmd_size = sizeof(struct isst_if_io_reg);
+	cb.offset = offsetof(struct isst_if_io_regs, io_reg);
+	cb.cmd_callback = isst_if_mmio_rd_wr;
+	cb.owner = THIS_MODULE;
+	ret = isst_if_cdev_register(ISST_IF_DEV_MMIO, &cb);
+	if (ret)
+		mutex_destroy(&punit_dev->mutex);
+
+	return ret;
+}
+
+static void isst_if_remove(struct pci_dev *pdev)
+{
+	struct isst_if_device *punit_dev;
+
+	punit_dev = pci_get_drvdata(pdev);
+	isst_if_cdev_unregister(ISST_IF_DEV_MBOX);
+	mutex_destroy(&punit_dev->mutex);
+}
+
+static struct pci_driver isst_if_pci_driver = {
+	.name			= "isst_if_pci",
+	.id_table		= isst_if_ids,
+	.probe			= isst_if_probe,
+	.remove			= isst_if_remove,
+};
+
+module_pci_driver(isst_if_pci_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel speed select interface mmio driver");
diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
index 15d1f286a830..fe2492ade078 100644
--- a/include/uapi/linux/isst_if.h
+++ b/include/uapi/linux/isst_if.h
@@ -63,7 +63,40 @@ struct isst_if_cpu_maps {
 	struct isst_if_cpu_map cpu_map[1];
 };
 
+/**
+ * struct isst_if_io_reg - Read write PUNIT IO register
+ * @read_write:		Value 0: Read, 1: Write
+ * @logical_cpu:	Logical CPU number to get target PCI device.
+ * @reg:		PUNIT register offset
+ * @value:		For write operation value to write and for
+ *			for read placeholder read value
+ *
+ * Structure to specify read/write data to PUNIT registers.
+ */
+struct isst_if_io_reg {
+	__u32 read_write; /* Read:0, Write:1 */
+	__u32 logical_cpu;
+	__u32 reg;
+	__u32 value;
+};
+
+/**
+ * struct isst_if_io_regs - structure for IO register commands
+ * @cmd_count:	Number of io reg commands in io_reg[]
+ * @io_reg[]:	Holds one or more io_reg command structure
+ *
+ * This structure used with ioctl ISST_IF_IO_CMD to send
+ * one or more read/write commands to PUNIT. Here IOCTL return value
+ * indicates number of requests sent or error number if no requests have
+ * been sent.
+ */
+struct isst_if_io_regs {
+	__u32 req_count;
+	struct isst_if_io_reg io_reg[1];
+};
+
 #define ISST_IF_MAGIC			0xFE
 #define ISST_IF_GET_PLATFORM_INFO	_IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *)
 #define ISST_IF_GET_PHY_ID		_IOWR(ISST_IF_MAGIC, 1, struct isst_if_cpu_map *)
+#define ISST_IF_IO_CMD		_IOW(ISST_IF_MAGIC, 2, struct isst_if_io_regs *)
 #endif
-- 
cgit v1.2.3


From 31a166fe9c269af17977e650846ee4ea50361c07 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 26 Jun 2019 15:38:47 -0700
Subject: platform/x86: ISST: Add Intel Speed Select mailbox interface via PCI

Add an IOCTL to send mailbox commands to PUNIT using PUNIT PCI device.
A limited set of mailbox commands can be sent to PUNIT.

This MMIO interface is used by the intel-speed-select tool under
tools/x86/power to enumerate and control Intel Speed Select features.
The MBOX commands ids and semantics of the message can be checked from
the source code of the tool.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 .../platform/x86/intel_speed_select_if/Makefile    |   1 +
 .../x86/intel_speed_select_if/isst_if_common.c     |  85 +++++++++
 .../x86/intel_speed_select_if/isst_if_common.h     |   3 +
 .../x86/intel_speed_select_if/isst_if_mbox_pci.c   | 199 +++++++++++++++++++++
 include/uapi/linux/isst_if.h                       |  38 ++++
 5 files changed, 326 insertions(+)
 create mode 100644 drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c

(limited to 'include')

diff --git a/drivers/platform/x86/intel_speed_select_if/Makefile b/drivers/platform/x86/intel_speed_select_if/Makefile
index 7e94919208d3..8dec8c858649 100644
--- a/drivers/platform/x86/intel_speed_select_if/Makefile
+++ b/drivers/platform/x86/intel_speed_select_if/Makefile
@@ -6,3 +6,4 @@
 
 obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_common.o
 obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_mmio.o
+obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_mbox_pci.o
diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
index 3f96a3925bc6..391fc3f12161 100644
--- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
+++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
@@ -25,6 +25,86 @@
 
 static struct isst_if_cmd_cb punit_callbacks[ISST_IF_DEV_MAX];
 
+struct isst_valid_cmd_ranges {
+	u16 cmd;
+	u16 sub_cmd_beg;
+	u16 sub_cmd_end;
+};
+
+struct isst_cmd_set_req_type {
+	u16 cmd;
+	u16 sub_cmd;
+	u16 param;
+};
+
+static const struct isst_valid_cmd_ranges isst_valid_cmds[] = {
+	{0xD0, 0x00, 0x03},
+	{0x7F, 0x00, 0x0B},
+	{0x7F, 0x10, 0x12},
+	{0x7F, 0x20, 0x23},
+};
+
+static const struct isst_cmd_set_req_type isst_cmd_set_reqs[] = {
+	{0xD0, 0x00, 0x08},
+	{0xD0, 0x01, 0x08},
+	{0xD0, 0x02, 0x08},
+	{0xD0, 0x03, 0x08},
+	{0x7F, 0x02, 0x00},
+	{0x7F, 0x08, 0x00},
+};
+
+/**
+ * isst_if_mbox_cmd_invalid() - Check invalid mailbox commands
+ * @cmd: Pointer to the command structure to verify.
+ *
+ * Invalid command to PUNIT to may result in instability of the platform.
+ * This function has a whitelist of commands, which are allowed.
+ *
+ * Return: Return true if the command is invalid, else false.
+ */
+bool isst_if_mbox_cmd_invalid(struct isst_if_mbox_cmd *cmd)
+{
+	int i;
+
+	if (cmd->logical_cpu >= nr_cpu_ids)
+		return true;
+
+	for (i = 0; i < ARRAY_SIZE(isst_valid_cmds); ++i) {
+		if (cmd->command == isst_valid_cmds[i].cmd &&
+		    (cmd->sub_command >= isst_valid_cmds[i].sub_cmd_beg &&
+		     cmd->sub_command <= isst_valid_cmds[i].sub_cmd_end)) {
+			return false;
+		}
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(isst_if_mbox_cmd_invalid);
+
+/**
+ * isst_if_mbox_cmd_set_req() - Check mailbox command is a set request
+ * @cmd: Pointer to the command structure to verify.
+ *
+ * Check if the given mail box level is set request and not a get request.
+ *
+ * Return: Return true if the command is set_req, else false.
+ */
+bool isst_if_mbox_cmd_set_req(struct isst_if_mbox_cmd *cmd)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(isst_cmd_set_reqs); ++i) {
+		if (cmd->command == isst_cmd_set_reqs[i].cmd &&
+		    cmd->sub_command == isst_cmd_set_reqs[i].sub_cmd &&
+		    cmd->parameter == isst_cmd_set_reqs[i].param) {
+			return true;
+		}
+	}
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(isst_if_mbox_cmd_set_req);
+
 static int isst_if_get_platform_info(void __user *argp)
 {
 	struct isst_if_platform_info info;
@@ -224,6 +304,11 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd,
 		if (cb->registered)
 			ret = isst_if_exec_multi_cmd(argp, cb);
 		break;
+	case ISST_IF_MBOX_COMMAND:
+		cb = &punit_callbacks[ISST_IF_DEV_MBOX];
+		if (cb->registered)
+			ret = isst_if_exec_multi_cmd(argp, cb);
+		break;
 	default:
 		break;
 	}
diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h
index cdc7d019748a..7c0f71221da7 100644
--- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h
+++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h
@@ -11,6 +11,7 @@
 #define __ISST_IF_COMMON_H
 
 #define INTEL_RAPL_PRIO_DEVID_0	0x3451
+#define INTEL_CFG_MBOX_DEVID_0	0x3459
 
 /*
  * Validate maximum commands in a single request.
@@ -60,4 +61,6 @@ struct isst_if_cmd_cb {
 int isst_if_cdev_register(int type, struct isst_if_cmd_cb *cb);
 void isst_if_cdev_unregister(int type);
 struct pci_dev *isst_if_get_pci_dev(int cpu, int bus, int dev, int fn);
+bool isst_if_mbox_cmd_set_req(struct isst_if_mbox_cmd *mbox_cmd);
+bool isst_if_mbox_cmd_invalid(struct isst_if_mbox_cmd *cmd);
 #endif
diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c b/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c
new file mode 100644
index 000000000000..1c4f2893cd80
--- /dev/null
+++ b/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Speed Select Interface: Mbox via PCI Interface
+ * Copyright (c) 2019, Intel Corporation.
+ * All rights reserved.
+ *
+ * Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/isst_if.h>
+
+#include "isst_if_common.h"
+
+#define PUNIT_MAILBOX_DATA		0xA0
+#define PUNIT_MAILBOX_INTERFACE		0xA4
+#define PUNIT_MAILBOX_BUSY_BIT		31
+
+/*
+ * Commands has variable amount of processing time. Most of the commands will
+ * be done in 0-3 tries, but some takes up to 50.
+ * The real processing time was observed as 25us for the most of the commands
+ * at 2GHz. It is possible to optimize this count taking samples on customer
+ * systems.
+ */
+#define OS_MAILBOX_RETRY_COUNT		50
+
+struct isst_if_device {
+	struct mutex mutex;
+};
+
+static int isst_if_mbox_cmd(struct pci_dev *pdev,
+			    struct isst_if_mbox_cmd *mbox_cmd)
+{
+	u32 retries, data;
+	int ret;
+
+	/* Poll for rb bit == 0 */
+	retries = OS_MAILBOX_RETRY_COUNT;
+	do {
+		ret = pci_read_config_dword(pdev, PUNIT_MAILBOX_INTERFACE,
+					    &data);
+		if (ret)
+			return ret;
+
+		if (data & BIT_ULL(PUNIT_MAILBOX_BUSY_BIT)) {
+			ret = -EBUSY;
+			continue;
+		}
+		ret = 0;
+		break;
+	} while (--retries);
+
+	if (ret)
+		return ret;
+
+	/* Write DATA register */
+	ret = pci_write_config_dword(pdev, PUNIT_MAILBOX_DATA,
+				     mbox_cmd->req_data);
+	if (ret)
+		return ret;
+
+	/* Write command register */
+	data = BIT_ULL(PUNIT_MAILBOX_BUSY_BIT) |
+		      (mbox_cmd->parameter & GENMASK_ULL(13, 0)) << 16 |
+		      (mbox_cmd->sub_command << 8) |
+		      mbox_cmd->command;
+
+	ret = pci_write_config_dword(pdev, PUNIT_MAILBOX_INTERFACE, data);
+	if (ret)
+		return ret;
+
+	/* Poll for rb bit == 0 */
+	retries = OS_MAILBOX_RETRY_COUNT;
+	do {
+		ret = pci_read_config_dword(pdev, PUNIT_MAILBOX_INTERFACE,
+					    &data);
+		if (ret)
+			return ret;
+
+		if (data & BIT_ULL(PUNIT_MAILBOX_BUSY_BIT)) {
+			ret = -EBUSY;
+			continue;
+		}
+
+		if (data & 0xff)
+			return -ENXIO;
+
+		ret = pci_read_config_dword(pdev, PUNIT_MAILBOX_DATA, &data);
+		if (ret)
+			return ret;
+
+		mbox_cmd->resp_data = data;
+		ret = 0;
+		break;
+	} while (--retries);
+
+	return ret;
+}
+
+static long isst_if_mbox_proc_cmd(u8 *cmd_ptr, int *write_only, int resume)
+{
+	struct isst_if_mbox_cmd *mbox_cmd;
+	struct isst_if_device *punit_dev;
+	struct pci_dev *pdev;
+	int ret;
+
+	mbox_cmd = (struct isst_if_mbox_cmd *)cmd_ptr;
+
+	if (isst_if_mbox_cmd_invalid(mbox_cmd))
+		return -EINVAL;
+
+	if (isst_if_mbox_cmd_set_req(mbox_cmd) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	pdev = isst_if_get_pci_dev(mbox_cmd->logical_cpu, 1, 30, 1);
+	if (!pdev)
+		return -EINVAL;
+
+	punit_dev = pci_get_drvdata(pdev);
+	if (!punit_dev)
+		return -EINVAL;
+
+	/*
+	 * Basically we are allowing one complete mailbox transaction on
+	 * a mapped PCI device at a time.
+	 */
+	mutex_lock(&punit_dev->mutex);
+	ret = isst_if_mbox_cmd(pdev, mbox_cmd);
+	mutex_unlock(&punit_dev->mutex);
+	if (ret)
+		return ret;
+
+	*write_only = 0;
+
+	return 0;
+}
+
+static const struct pci_device_id isst_if_mbox_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_CFG_MBOX_DEVID_0)},
+	{ 0 },
+};
+MODULE_DEVICE_TABLE(pci, isst_if_mbox_ids);
+
+static int isst_if_mbox_probe(struct pci_dev *pdev,
+			      const struct pci_device_id *ent)
+{
+	struct isst_if_device *punit_dev;
+	struct isst_if_cmd_cb cb;
+	int ret;
+
+	punit_dev = devm_kzalloc(&pdev->dev, sizeof(*punit_dev), GFP_KERNEL);
+	if (!punit_dev)
+		return -ENOMEM;
+
+	ret = pcim_enable_device(pdev);
+	if (ret)
+		return ret;
+
+	mutex_init(&punit_dev->mutex);
+	pci_set_drvdata(pdev, punit_dev);
+
+	memset(&cb, 0, sizeof(cb));
+	cb.cmd_size = sizeof(struct isst_if_mbox_cmd);
+	cb.offset = offsetof(struct isst_if_mbox_cmds, mbox_cmd);
+	cb.cmd_callback = isst_if_mbox_proc_cmd;
+	cb.owner = THIS_MODULE;
+	ret = isst_if_cdev_register(ISST_IF_DEV_MBOX, &cb);
+
+	if (ret)
+		mutex_destroy(&punit_dev->mutex);
+
+	return ret;
+}
+
+static void isst_if_mbox_remove(struct pci_dev *pdev)
+{
+	struct isst_if_device *punit_dev;
+
+	punit_dev = pci_get_drvdata(pdev);
+	isst_if_cdev_unregister(ISST_IF_DEV_MBOX);
+	mutex_destroy(&punit_dev->mutex);
+}
+
+static struct pci_driver isst_if_pci_driver = {
+	.name			= "isst_if_mbox_pci",
+	.id_table		= isst_if_mbox_ids,
+	.probe			= isst_if_mbox_probe,
+	.remove			= isst_if_mbox_remove,
+};
+
+module_pci_driver(isst_if_pci_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel speed select interface pci mailbox driver");
diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
index fe2492ade078..e4b1c2ec3279 100644
--- a/include/uapi/linux/isst_if.h
+++ b/include/uapi/linux/isst_if.h
@@ -95,8 +95,46 @@ struct isst_if_io_regs {
 	struct isst_if_io_reg io_reg[1];
 };
 
+/**
+ * struct isst_if_mbox_cmd - Structure to define mail box command
+ * @logical_cpu:	Logical CPU number to get target PCI device
+ * @parameter:		Mailbox parameter value
+ * @req_data:		Request data for the mailbox
+ * @resp_data:		Response data for mailbox command response
+ * @command:		Mailbox command value
+ * @sub_command:	Mailbox sub command value
+ * @reserved:		Unused, set to 0
+ *
+ * Structure to specify mailbox command to be sent to PUNIT.
+ */
+struct isst_if_mbox_cmd {
+	__u32 logical_cpu;
+	__u32 parameter;
+	__u32 req_data;
+	__u32 resp_data;
+	__u16 command;
+	__u16 sub_command;
+	__u32 reserved;
+};
+
+/**
+ * struct isst_if_mbox_cmds - structure for mailbox commands
+ * @cmd_count:	Number of mailbox commands in mbox_cmd[]
+ * @mbox_cmd[]:	Holds one or more mbox commands
+ *
+ * This structure used with ioctl ISST_IF_MBOX_COMMAND to send
+ * one or more mailbox commands to PUNIT. Here IOCTL return value
+ * indicates number of commands sent or error number if no commands have
+ * been sent.
+ */
+struct isst_if_mbox_cmds {
+	__u32 cmd_count;
+	struct isst_if_mbox_cmd mbox_cmd[1];
+};
+
 #define ISST_IF_MAGIC			0xFE
 #define ISST_IF_GET_PLATFORM_INFO	_IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *)
 #define ISST_IF_GET_PHY_ID		_IOWR(ISST_IF_MAGIC, 1, struct isst_if_cpu_map *)
 #define ISST_IF_IO_CMD		_IOW(ISST_IF_MAGIC, 2, struct isst_if_io_regs *)
+#define ISST_IF_MBOX_COMMAND	_IOWR(ISST_IF_MAGIC, 3, struct isst_if_mbox_cmds *)
 #endif
-- 
cgit v1.2.3


From e765f37b9b8b4fa65682e9a78a2ca2b11d3d9096 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 26 Jun 2019 15:38:49 -0700
Subject: platform/x86: ISST: Add Intel Speed Select PUNIT MSR interface

While using new non arhitectural features using PUNIT Mailbox and MMIO
read/write interface, still there is need to operate using MSRs to
control PUNIT. User space could have used user user-space MSR interface for
this, but when user space MSR access is disabled, then it can't. Here only
limited number of MSRs are allowed using this new interface.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 .../x86/intel_speed_select_if/isst_if_common.c     | 59 ++++++++++++++++++++++
 include/uapi/linux/isst_if.h                       | 32 ++++++++++++
 2 files changed, 91 insertions(+)

(limited to 'include')

diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
index 391fc3f12161..de2fb5292f1c 100644
--- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
+++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c
@@ -25,6 +25,11 @@
 
 static struct isst_if_cmd_cb punit_callbacks[ISST_IF_DEV_MAX];
 
+static int punit_msr_white_list[] = {
+	MSR_TURBO_RATIO_LIMIT,
+	MSR_CONFIG_TDP_CONTROL,
+};
+
 struct isst_valid_cmd_ranges {
 	u16 cmd;
 	u16 sub_cmd_beg;
@@ -229,6 +234,54 @@ static long isst_if_proc_phyid_req(u8 *cmd_ptr, int *write_only, int resume)
 	return 0;
 }
 
+static bool match_punit_msr_white_list(int msr)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(punit_msr_white_list); ++i) {
+		if (punit_msr_white_list[i] == msr)
+			return true;
+	}
+
+	return false;
+}
+
+static long isst_if_msr_cmd_req(u8 *cmd_ptr, int *write_only, int resume)
+{
+	struct isst_if_msr_cmd *msr_cmd;
+	int ret;
+
+	msr_cmd = (struct isst_if_msr_cmd *)cmd_ptr;
+
+	if (!match_punit_msr_white_list(msr_cmd->msr))
+		return -EINVAL;
+
+	if (msr_cmd->logical_cpu >= nr_cpu_ids)
+		return -EINVAL;
+
+	if (msr_cmd->read_write) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		ret = wrmsrl_safe_on_cpu(msr_cmd->logical_cpu,
+					 msr_cmd->msr,
+					 msr_cmd->data);
+		*write_only = 1;
+	} else {
+		u64 data;
+
+		ret = rdmsrl_safe_on_cpu(msr_cmd->logical_cpu,
+					 msr_cmd->msr, &data);
+		if (!ret) {
+			msr_cmd->data = data;
+			*write_only = 0;
+		}
+	}
+
+
+	return ret;
+}
+
 static long isst_if_exec_multi_cmd(void __user *argp, struct isst_if_cmd_cb *cb)
 {
 	unsigned char __user *ptr;
@@ -309,6 +362,12 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd,
 		if (cb->registered)
 			ret = isst_if_exec_multi_cmd(argp, cb);
 		break;
+	case ISST_IF_MSR_COMMAND:
+		cmd_cb.cmd_size = sizeof(struct isst_if_msr_cmd);
+		cmd_cb.offset = offsetof(struct isst_if_msr_cmds, msr_cmd);
+		cmd_cb.cmd_callback = isst_if_msr_cmd_req;
+		ret = isst_if_exec_multi_cmd(argp, &cmd_cb);
+		break;
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
index e4b1c2ec3279..d10b832c58c5 100644
--- a/include/uapi/linux/isst_if.h
+++ b/include/uapi/linux/isst_if.h
@@ -132,9 +132,41 @@ struct isst_if_mbox_cmds {
 	struct isst_if_mbox_cmd mbox_cmd[1];
 };
 
+/**
+ * struct isst_if_msr_cmd - Structure to define msr command
+ * @read_write:		Value 0: Read, 1: Write
+ * @logical_cpu:	Logical CPU number
+ * @msr:		MSR number
+ * @data:		For write operation, data to write, for read
+ *			place holder
+ *
+ * Structure to specify MSR command related to PUNIT.
+ */
+struct isst_if_msr_cmd {
+	__u32 read_write; /* Read:0, Write:1 */
+	__u32 logical_cpu;
+	__u64 msr;
+	__u64 data;
+};
+
+/**
+ * struct isst_if_msr_cmds - structure for msr commands
+ * @cmd_count:	Number of mailbox commands in msr_cmd[]
+ * @msr_cmd[]:	Holds one or more msr commands
+ *
+ * This structure used with ioctl ISST_IF_MSR_COMMAND to send
+ * one or more MSR commands. IOCTL return value indicates number of
+ * commands sent or error number if no commands have been sent.
+ */
+struct isst_if_msr_cmds {
+	__u32 cmd_count;
+	struct isst_if_msr_cmd msr_cmd[1];
+};
+
 #define ISST_IF_MAGIC			0xFE
 #define ISST_IF_GET_PLATFORM_INFO	_IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *)
 #define ISST_IF_GET_PHY_ID		_IOWR(ISST_IF_MAGIC, 1, struct isst_if_cpu_map *)
 #define ISST_IF_IO_CMD		_IOW(ISST_IF_MAGIC, 2, struct isst_if_io_regs *)
 #define ISST_IF_MBOX_COMMAND	_IOWR(ISST_IF_MAGIC, 3, struct isst_if_mbox_cmds *)
+#define ISST_IF_MSR_COMMAND	_IOWR(ISST_IF_MAGIC, 4, struct isst_if_msr_cmds *)
 #endif
-- 
cgit v1.2.3


From 25b2995a35b609119cf96f6b62eccd56c0234c7d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Jun 2019 22:50:49 +0200
Subject: mm: remove MEMORY_DEVICE_PUBLIC support

The code hasn't been used since it was added to the tree, and doesn't
appear to actually be usable.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 fs/proc/task_mmu.c       |  2 +-
 include/linux/hmm.h      |  7 ++-----
 include/linux/ioport.h   |  1 -
 include/linux/memremap.h |  8 -------
 include/linux/mm.h       | 18 ++--------------
 mm/Kconfig               | 11 ----------
 mm/gup.c                 |  7 -------
 mm/hmm.c                 | 54 ++----------------------------------------------
 mm/madvise.c             |  2 +-
 mm/memcontrol.c          | 13 ++++++------
 mm/memory-failure.c      |  6 +-----
 mm/memory.c              | 40 +++--------------------------------
 mm/migrate.c             | 28 ++++---------------------
 mm/swap.c                | 11 ----------
 14 files changed, 22 insertions(+), 186 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 01d4eb0e6bd1..74d8f00b3615 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1279,7 +1279,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		if (pm->show_pfn)
 			frame = pte_pfn(pte);
 		flags |= PM_PRESENT;
-		page = _vm_normal_page(vma, addr, pte, true);
+		page = vm_normal_page(vma, addr, pte);
 		if (pte_soft_dirty(pte))
 			flags |= PM_SOFT_DIRTY;
 	} else if (is_swap_pte(pte)) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 99765be3284d..44a5ac738bb5 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -584,7 +584,7 @@ static inline void hmm_mm_destroy(struct mm_struct *mm) {}
 static inline void hmm_mm_init(struct mm_struct *mm) {}
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 struct hmm_devmem;
 
 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
@@ -722,9 +722,6 @@ struct hmm_devmem {
 struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 				  struct device *device,
 				  unsigned long size);
-struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
-					   struct device *device,
-					   struct resource *res);
 
 /*
  * hmm_devmem_page_set_drvdata - set per-page driver data field
@@ -751,7 +748,7 @@ static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page)
 {
 	return page->hmm_data;
 }
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+#endif /* CONFIG_DEVICE_PRIVATE */
 #else /* IS_ENABLED(CONFIG_HMM) */
 static inline void hmm_mm_destroy(struct mm_struct *mm) {}
 static inline void hmm_mm_init(struct mm_struct *mm) {}
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..dd961882bc74 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -132,7 +132,6 @@ enum {
 	IORES_DESC_PERSISTENT_MEMORY		= 4,
 	IORES_DESC_PERSISTENT_MEMORY_LEGACY	= 5,
 	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
-	IORES_DESC_DEVICE_PUBLIC_MEMORY		= 7,
 };
 
 /* helpers to define resources */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 1732dea030b2..995c62c5a48b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -37,13 +37,6 @@ struct vmem_altmap {
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.rst.
  *
- * MEMORY_DEVICE_PUBLIC:
- * Device memory that is cache coherent from device and CPU point of view. This
- * is use on platform that have an advance system bus (like CAPI or CCIX). A
- * driver can hotplug the device memory using ZONE_DEVICE and with that memory
- * type. Any page of a process can be migrated to such memory. However no one
- * should be allow to pin such memory so that it can always be evicted.
- *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
  * coherent and supports page pinning. In support of coordinating page
@@ -58,7 +51,6 @@ struct vmem_altmap {
  */
 enum memory_type {
 	MEMORY_DEVICE_PRIVATE = 1,
-	MEMORY_DEVICE_PUBLIC,
 	MEMORY_DEVICE_FS_DAX,
 	MEMORY_DEVICE_PCI_P2PDMA,
 };
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dd0b5f4e1e45..7399f9f08de6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -944,7 +944,6 @@ static inline bool put_devmap_managed_page(struct page *page)
 		return false;
 	switch (page->pgmap->type) {
 	case MEMORY_DEVICE_PRIVATE:
-	case MEMORY_DEVICE_PUBLIC:
 	case MEMORY_DEVICE_FS_DAX:
 		__put_devmap_managed_page(page);
 		return true;
@@ -960,12 +959,6 @@ static inline bool is_device_private_page(const struct page *page)
 		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
 
-static inline bool is_device_public_page(const struct page *page)
-{
-	return is_zone_device_page(page) &&
-		page->pgmap->type == MEMORY_DEVICE_PUBLIC;
-}
-
 #ifdef CONFIG_PCI_P2PDMA
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
@@ -998,11 +991,6 @@ static inline bool is_device_private_page(const struct page *page)
 	return false;
 }
 
-static inline bool is_device_public_page(const struct page *page)
-{
-	return false;
-}
-
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
 	return false;
@@ -1431,10 +1419,8 @@ struct zap_details {
 	pgoff_t last_index;			/* Highest page->index to unmap */
 };
 
-struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-			     pte_t pte, bool with_public_device);
-#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
-
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t pte);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 				pmd_t pmd);
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 0d2ba7e1f43e..6f35b85b3052 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -718,17 +718,6 @@ config DEVICE_PRIVATE
 	  memory; i.e., memory that is only accessible from the device (or
 	  group of devices). You likely also want to select HMM_MIRROR.
 
-config DEVICE_PUBLIC
-	bool "Addressable device memory (like GPU memory)"
-	depends on ARCH_HAS_HMM
-	select HMM
-	select DEV_PAGEMAP_OPS
-
-	help
-	  Allows creation of struct pages to represent addressable device
-	  memory; i.e., memory that is accessible from both the device and
-	  the CPU
-
 config FRAME_VECTOR
 	bool
 
diff --git a/mm/gup.c b/mm/gup.c
index ddde097cf9e4..fe131d879c70 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -605,13 +605,6 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
 		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
 			goto unmap;
 		*page = pte_page(*pte);
-
-		/*
-		 * This should never happen (a device public page in the gate
-		 * area).
-		 */
-		if (is_device_public_page(*page))
-			goto unmap;
 	}
 	if (unlikely(!try_get_page(*page))) {
 		ret = -ENOMEM;
diff --git a/mm/hmm.c b/mm/hmm.c
index 00cc642b3d7e..376159a769fb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1331,7 +1331,7 @@ EXPORT_SYMBOL(hmm_range_dma_unmap);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
 				       unsigned long addr)
 {
@@ -1478,54 +1478,4 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	return devmem;
 }
 EXPORT_SYMBOL_GPL(hmm_devmem_add);
-
-struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
-					   struct device *device,
-					   struct resource *res)
-{
-	struct hmm_devmem *devmem;
-	void *result;
-	int ret;
-
-	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
-		return ERR_PTR(-EINVAL);
-
-	dev_pagemap_get_ops();
-
-	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
-	if (!devmem)
-		return ERR_PTR(-ENOMEM);
-
-	init_completion(&devmem->completion);
-	devmem->pfn_first = -1UL;
-	devmem->pfn_last = -1UL;
-	devmem->resource = res;
-	devmem->device = device;
-	devmem->ops = ops;
-
-	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
-			      0, GFP_KERNEL);
-	if (ret)
-		return ERR_PTR(ret);
-
-	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
-	devmem->pfn_last = devmem->pfn_first +
-			   (resource_size(devmem->resource) >> PAGE_SHIFT);
-	devmem->page_fault = hmm_devmem_fault;
-
-	devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
-	devmem->pagemap.res = *devmem->resource;
-	devmem->pagemap.page_free = hmm_devmem_free;
-	devmem->pagemap.altmap_valid = false;
-	devmem->pagemap.ref = &devmem->ref;
-	devmem->pagemap.data = devmem;
-	devmem->pagemap.kill = hmm_devmem_ref_kill;
-	devmem->pagemap.cleanup = hmm_devmem_ref_exit;
-
-	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
-	if (IS_ERR(result))
-		return result;
-	return devmem;
-}
-EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+#endif /* CONFIG_DEVICE_PRIVATE  */
diff --git a/mm/madvise.c b/mm/madvise.c
index 628022e674a7..968df3aa069f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -354,7 +354,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			continue;
 		}
 
-		page = _vm_normal_page(vma, addr, ptent, true);
+		page = vm_normal_page(vma, addr, ptent);
 		if (!page)
 			continue;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ba9138a4a1de..d2a6454fa0bd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4793,7 +4793,7 @@ enum mc_target_type {
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 						unsigned long addr, pte_t ptent)
 {
-	struct page *page = _vm_normal_page(vma, addr, ptent, true);
+	struct page *page = vm_normal_page(vma, addr, ptent);
 
 	if (!page || !page_mapped(page))
 		return NULL;
@@ -4994,8 +4994,8 @@ out:
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. if @target is not NULL, the entry is stored
  *     in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PUBLIC
- *     or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
+ *     (so ZONE_DEVICE page and thus not on the lru).
  *     For now we such page is charge like a regular page would be as for all
  *     intent and purposes it is just special memory taking the place of a
  *     regular page.
@@ -5029,8 +5029,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		 */
 		if (page->mem_cgroup == mc.from) {
 			ret = MC_TARGET_PAGE;
-			if (is_device_private_page(page) ||
-			    is_device_public_page(page))
+			if (is_device_private_page(page))
 				ret = MC_TARGET_DEVICE;
 			if (target)
 				target->page = page;
@@ -5101,8 +5100,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	if (ptl) {
 		/*
 		 * Note their can not be MC_TARGET_DEVICE for now as we do not
-		 * support transparent huge page with MEMORY_DEVICE_PUBLIC or
-		 * MEMORY_DEVICE_PRIVATE but this might change.
+		 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
+		 * this might change.
 		 */
 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
 			mc.precharge += HPAGE_PMD_NR;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d9cc6606f409..31e7c7b424a1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1177,16 +1177,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		goto unlock;
 	}
 
-	switch (pgmap->type) {
-	case MEMORY_DEVICE_PRIVATE:
-	case MEMORY_DEVICE_PUBLIC:
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
 		/*
 		 * TODO: Handle HMM pages which may need coordination
 		 * with device-side memory.
 		 */
 		goto unlock;
-	default:
-		break;
 	}
 
 	/*
diff --git a/mm/memory.c b/mm/memory.c
index ddf20bd0c317..2d14f4c7e152 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -571,8 +571,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
  * PFNMAP mappings in order to support COWable mappings.
  *
  */
-struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-			     pte_t pte, bool with_public_device)
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+			    pte_t pte)
 {
 	unsigned long pfn = pte_pfn(pte);
 
@@ -585,29 +585,6 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			return NULL;
 		if (is_zero_pfn(pfn))
 			return NULL;
-
-		/*
-		 * Device public pages are special pages (they are ZONE_DEVICE
-		 * pages but different from persistent memory). They behave
-		 * allmost like normal pages. The difference is that they are
-		 * not on the lru and thus should never be involve with any-
-		 * thing that involve lru manipulation (mlock, numa balancing,
-		 * ...).
-		 *
-		 * This is why we still want to return NULL for such page from
-		 * vm_normal_page() so that we do not have to special case all
-		 * call site of vm_normal_page().
-		 */
-		if (likely(pfn <= highest_memmap_pfn)) {
-			struct page *page = pfn_to_page(pfn);
-
-			if (is_device_public_page(page)) {
-				if (with_public_device)
-					return page;
-				return NULL;
-			}
-		}
-
 		if (pte_devmap(pte))
 			return NULL;
 
@@ -797,17 +774,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		rss[mm_counter(page)]++;
 	} else if (pte_devmap(pte)) {
 		page = pte_page(pte);
-
-		/*
-		 * Cache coherent device memory behave like regular page and
-		 * not like persistent memory page. For more informations see
-		 * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
-		 */
-		if (is_device_public_page(page)) {
-			get_page(page);
-			page_dup_rmap(page, false);
-			rss[mm_counter(page)]++;
-		}
 	}
 
 out_set_pte:
@@ -1063,7 +1029,7 @@ again:
 		if (pte_present(ptent)) {
 			struct page *page;
 
-			page = _vm_normal_page(vma, addr, ptent, true);
+			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(details) && page) {
 				/*
 				 * unmap_shared_mapping_pages() wants to
diff --git a/mm/migrate.c b/mm/migrate.c
index f2ecc2855a12..78d45e184457 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -246,8 +246,6 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 			if (is_device_private_page(new)) {
 				entry = make_device_private_entry(new, pte_write(pte));
 				pte = swp_entry_to_pte(entry);
-			} else if (is_device_public_page(new)) {
-				pte = pte_mkdevmap(pte);
 			}
 		}
 
@@ -381,7 +379,6 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
 	 * ZONE_DEVICE pages.
 	 */
 	expected_count += is_device_private_page(page);
-	expected_count += is_device_public_page(page);
 	if (mapping)
 		expected_count += hpage_nr_pages(page) + page_has_private(page);
 
@@ -994,10 +991,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 		if (!PageMappingFlags(page))
 			page->mapping = NULL;
 
-		if (unlikely(is_zone_device_page(newpage))) {
-			if (is_device_public_page(newpage))
-				flush_dcache_page(newpage);
-		} else
+		if (likely(!is_zone_device_page(newpage)))
 			flush_dcache_page(newpage);
 
 	}
@@ -2265,7 +2259,7 @@ again:
 				pfn = 0;
 				goto next;
 			}
-			page = _vm_normal_page(migrate->vma, addr, pte, true);
+			page = vm_normal_page(migrate->vma, addr, pte);
 			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
 			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 		}
@@ -2406,16 +2400,7 @@ static bool migrate_vma_check_page(struct page *page)
 		 * FIXME proper solution is to rework migration_entry_wait() so
 		 * it does not need to take a reference on page.
 		 */
-		if (is_device_private_page(page))
-			return true;
-
-		/*
-		 * Only allow device public page to be migrated and account for
-		 * the extra reference count imply by ZONE_DEVICE pages.
-		 */
-		if (!is_device_public_page(page))
-			return false;
-		extra++;
+		return is_device_private_page(page);
 	}
 
 	/* For file back page */
@@ -2665,11 +2650,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 
 			swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
 			entry = swp_entry_to_pte(swp_entry);
-		} else if (is_device_public_page(page)) {
-			entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
-			if (vma->vm_flags & VM_WRITE)
-				entry = pte_mkwrite(pte_mkdirty(entry));
-			entry = pte_mkdevmap(entry);
 		}
 	} else {
 		entry = mk_pte(page, vma->vm_page_prot);
@@ -2789,7 +2769,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
 					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
 					continue;
 				}
-			} else if (!is_device_public_page(newpage)) {
+			} else {
 				/*
 				 * Other types of ZONE_DEVICE page are not
 				 * supported.
diff --git a/mm/swap.c b/mm/swap.c
index 7ede3eddc12a..83107410d29f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -740,17 +740,6 @@ void release_pages(struct page **pages, int nr)
 		if (is_huge_zero_page(page))
 			continue;
 
-		/* Device public page can not be huge page */
-		if (is_device_public_page(page)) {
-			if (locked_pgdat) {
-				spin_unlock_irqrestore(&locked_pgdat->lru_lock,
-						       flags);
-				locked_pgdat = NULL;
-			}
-			put_devmap_managed_page(page);
-			continue;
-		}
-
 		page = compound_head(page);
 		if (!put_page_testzero(page))
 			continue;
-- 
cgit v1.2.3


From 0092908d16c604b8207c2141ec64b0fa4473bb03 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:06 +0200
Subject: mm: factor out a devm_request_free_mem_region helper

Keep the physical address allocation that hmm_add_device does with the
rest of the resource code, and allow future reuse of it without the hmm
wrapper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/ioport.h |  2 ++
 kernel/resource.c      | 39 +++++++++++++++++++++++++++++++++++++++
 mm/hmm.c               | 33 ++++-----------------------------
 3 files changed, 45 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index dd961882bc74..a02b290ca08a 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -285,6 +285,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
        return (r1->start <= r2->end && r1->end >= r2->start);
 }
 
+struct resource *devm_request_free_mem_region(struct device *dev,
+		struct resource *base, unsigned long size);
 
 #endif /* __ASSEMBLY__ */
 #endif	/* _LINUX_IOPORT_H */
diff --git a/kernel/resource.c b/kernel/resource.c
index 158f04ec1d4f..d22423e85cf8 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head)
 }
 EXPORT_SYMBOL(resource_list_free);
 
+#ifdef CONFIG_DEVICE_PRIVATE
+/**
+ * devm_request_free_mem_region - find free region for device private memory
+ *
+ * @dev: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * @base: resource tree to look in
+ *
+ * This function tries to find an empty range of physical address big enough to
+ * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
+ * memory, which in turn allocates struct pages.
+ */
+struct resource *devm_request_free_mem_region(struct device *dev,
+		struct resource *base, unsigned long size)
+{
+	resource_size_t end, addr;
+	struct resource *res;
+
+	size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
+	end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
+	addr = end - size + 1UL;
+
+	for (; addr > size && addr >= base->start; addr -= size) {
+		if (region_intersects(addr, size, 0, IORES_DESC_NONE) !=
+				REGION_DISJOINT)
+			continue;
+
+		res = devm_request_mem_region(dev, addr, size, dev_name(dev));
+		if (!res)
+			return ERR_PTR(-ENOMEM);
+		res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+		return res;
+	}
+
+	return ERR_PTR(-ERANGE);
+}
+EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
+#endif /* CONFIG_DEVICE_PRIVATE */
+
 static int __init strict_iomem(char *str)
 {
 	if (strstr(str, "relaxed"))
diff --git a/mm/hmm.c b/mm/hmm.c
index e7dd2ab8f9ab..48574f8485bb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -25,8 +25,6 @@
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
-#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-
 #if IS_ENABLED(CONFIG_HMM_MIRROR)
 static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
 
@@ -1408,7 +1406,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 				  unsigned long size)
 {
 	struct hmm_devmem *devmem;
-	resource_size_t addr;
 	void *result;
 	int ret;
 
@@ -1430,32 +1427,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	if (ret)
 		return ERR_PTR(ret);
 
-	size = ALIGN(size, PA_SECTION_SIZE);
-	addr = min((unsigned long)iomem_resource.end,
-		   (1UL << MAX_PHYSMEM_BITS) - 1);
-	addr = addr - size + 1UL;
-
-	/*
-	 * FIXME add a new helper to quickly walk resource tree and find free
-	 * range
-	 *
-	 * FIXME what about ioport_resource resource ?
-	 */
-	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
-		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
-		if (ret != REGION_DISJOINT)
-			continue;
-
-		devmem->resource = devm_request_mem_region(device, addr, size,
-							   dev_name(device));
-		if (!devmem->resource)
-			return ERR_PTR(-ENOMEM);
-		break;
-	}
-	if (!devmem->resource)
-		return ERR_PTR(-ERANGE);
-
-	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+	devmem->resource = devm_request_free_mem_region(device, &iomem_resource,
+			size);
+	if (IS_ERR(devmem->resource))
+		return ERR_CAST(devmem->resource);
 	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
 	devmem->pfn_last = devmem->pfn_first +
 			   (resource_size(devmem->resource) >> PAGE_SHIFT);
-- 
cgit v1.2.3


From 3ed2dcdf54d5bf1f9823b5faf1a702e7cee53982 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:07 +0200
Subject: memremap: validate the pagemap type passed to devm_memremap_pages

Most pgmap types are only supported when certain config options are
enabled.  Check for a type that is valid for the current configuration
before setting up the pagemap.  For this the usage of the 0 type for
device dax gets replaced with an explicit MEMORY_DEVICE_DEVDAX type.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/dax/device.c     |  1 +
 include/linux/memremap.h |  8 ++++++++
 kernel/memremap.c        | 22 ++++++++++++++++++++++
 3 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 8465d12fecba..79014baa782d 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -468,6 +468,7 @@ int dev_dax_probe(struct device *dev)
 	dev_dax->pgmap.ref = &dev_dax->ref;
 	dev_dax->pgmap.kill = dev_dax_percpu_kill;
 	dev_dax->pgmap.cleanup = dev_dax_percpu_exit;
+	dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX;
 	addr = devm_memremap_pages(dev, &dev_dax->pgmap);
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 995c62c5a48b..0c86f2c5ac9c 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -45,13 +45,21 @@ struct vmem_altmap {
  * wakeup is used to coordinate physical address space management (ex:
  * fs truncate/hole punch) vs pinned pages (ex: device dma).
  *
+ * MEMORY_DEVICE_DEVDAX:
+ * Host memory that has similar access semantics as System RAM i.e. DMA
+ * coherent and supports page pinning. In contrast to
+ * MEMORY_DEVICE_FS_DAX, this memory is access via a device-dax
+ * character device.
+ *
  * MEMORY_DEVICE_PCI_P2PDMA:
  * Device memory residing in a PCI BAR intended for use with Peer-to-Peer
  * transactions.
  */
 enum memory_type {
+	/* 0 is reserved to catch uninitialized type fields */
 	MEMORY_DEVICE_PRIVATE = 1,
 	MEMORY_DEVICE_FS_DAX,
+	MEMORY_DEVICE_DEVDAX,
 	MEMORY_DEVICE_PCI_P2PDMA,
 };
 
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6e1970719dc2..abda62d1e5a3 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -157,6 +157,28 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	pgprot_t pgprot = PAGE_KERNEL;
 	int error, nid, is_ram;
 
+	switch (pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+		if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
+			WARN(1, "Device private memory not supported\n");
+			return ERR_PTR(-EINVAL);
+		}
+		break;
+	case MEMORY_DEVICE_FS_DAX:
+		if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
+		    IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
+			WARN(1, "File system DAX not supported\n");
+			return ERR_PTR(-EINVAL);
+		}
+		break;
+	case MEMORY_DEVICE_DEVDAX:
+	case MEMORY_DEVICE_PCI_P2PDMA:
+		break;
+	default:
+		WARN(1, "Invalid pgmap type %d\n", pgmap->type);
+		break;
+	}
+
 	if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
 		WARN(1, "Missing reference count teardown definition\n");
 		return ERR_PTR(-EINVAL);
-- 
cgit v1.2.3


From 1e240e8d4a7d92232b6214e02a0a4197a53afd6c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:08 +0200
Subject: memremap: move dev_pagemap callbacks into a separate structure

The dev_pagemap is a growing too many callbacks.  Move them into a
separate ops structure so that they are not duplicated for multiple
instances, and an attacker can't easily overwrite them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/dax/device.c              | 11 +++++++----
 drivers/dax/pmem/core.c           |  2 +-
 drivers/nvdimm/pmem.c             | 19 +++++++++++--------
 drivers/pci/p2pdma.c              |  8 ++++++--
 include/linux/memremap.h          | 36 ++++++++++++++++++++----------------
 kernel/memremap.c                 | 18 +++++++++---------
 mm/hmm.c                          | 10 +++++++---
 tools/testing/nvdimm/test/iomap.c |  7 ++++---
 8 files changed, 65 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 79014baa782d..f390083a64d7 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -36,9 +36,8 @@ static void dev_dax_percpu_exit(struct percpu_ref *ref)
 	percpu_ref_exit(ref);
 }
 
-static void dev_dax_percpu_kill(struct percpu_ref *data)
+static void dev_dax_percpu_kill(struct percpu_ref *ref)
 {
-	struct percpu_ref *ref = data;
 	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
 
 	dev_dbg(&dev_dax->dev, "%s\n", __func__);
@@ -442,6 +441,11 @@ static void dev_dax_kill(void *dev_dax)
 	kill_dev_dax(dev_dax);
 }
 
+static const struct dev_pagemap_ops dev_dax_pagemap_ops = {
+	.kill		= dev_dax_percpu_kill,
+	.cleanup	= dev_dax_percpu_exit,
+};
+
 int dev_dax_probe(struct device *dev)
 {
 	struct dev_dax *dev_dax = to_dev_dax(dev);
@@ -466,9 +470,8 @@ int dev_dax_probe(struct device *dev)
 		return rc;
 
 	dev_dax->pgmap.ref = &dev_dax->ref;
-	dev_dax->pgmap.kill = dev_dax_percpu_kill;
-	dev_dax->pgmap.cleanup = dev_dax_percpu_exit;
 	dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX;
+	dev_dax->pgmap.ops = &dev_dax_pagemap_ops;
 	addr = devm_memremap_pages(dev, &dev_dax->pgmap);
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c
index f9f51786d556..6eb6dfdf19bf 100644
--- a/drivers/dax/pmem/core.c
+++ b/drivers/dax/pmem/core.c
@@ -16,7 +16,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
 	struct dev_dax *dev_dax;
 	struct nd_namespace_io *nsio;
 	struct dax_region *dax_region;
-	struct dev_pagemap pgmap = { 0 };
+	struct dev_pagemap pgmap = { };
 	struct nd_namespace_common *ndns;
 	struct nd_dax *nd_dax = to_nd_dax(dev);
 	struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 24d7fe7c74ed..c2449af2b388 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -303,7 +303,7 @@ static const struct attribute_group *pmem_attribute_groups[] = {
 	NULL,
 };
 
-static void __pmem_release_queue(struct percpu_ref *ref)
+static void pmem_pagemap_cleanup(struct percpu_ref *ref)
 {
 	struct request_queue *q;
 
@@ -313,10 +313,10 @@ static void __pmem_release_queue(struct percpu_ref *ref)
 
 static void pmem_release_queue(void *ref)
 {
-	__pmem_release_queue(ref);
+	pmem_pagemap_cleanup(ref);
 }
 
-static void pmem_freeze_queue(struct percpu_ref *ref)
+static void pmem_pagemap_kill(struct percpu_ref *ref)
 {
 	struct request_queue *q;
 
@@ -339,19 +339,24 @@ static void pmem_release_pgmap_ops(void *__pgmap)
 	dev_pagemap_put_ops();
 }
 
-static void fsdax_pagefree(struct page *page, void *data)
+static void pmem_pagemap_page_free(struct page *page, void *data)
 {
 	wake_up_var(&page->_refcount);
 }
 
+static const struct dev_pagemap_ops fsdax_pagemap_ops = {
+	.page_free		= pmem_pagemap_page_free,
+	.kill			= pmem_pagemap_kill,
+	.cleanup		= pmem_pagemap_cleanup,
+};
+
 static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
 {
 	dev_pagemap_get_ops();
 	if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
 		return -ENOMEM;
 	pgmap->type = MEMORY_DEVICE_FS_DAX;
-	pgmap->page_free = fsdax_pagefree;
-
+	pgmap->ops = &fsdax_pagemap_ops;
 	return 0;
 }
 
@@ -409,8 +414,6 @@ static int pmem_attach_disk(struct device *dev,
 
 	pmem->pfn_flags = PFN_DEV;
 	pmem->pgmap.ref = &q->q_usage_counter;
-	pmem->pgmap.kill = pmem_freeze_queue;
-	pmem->pgmap.cleanup = __pmem_release_queue;
 	if (is_nd_pfn(dev)) {
 		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
 			return -ENOMEM;
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index a4994aa3acc0..fb039259d463 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -153,6 +153,11 @@ out:
 	return error;
 }
 
+static const struct dev_pagemap_ops pci_p2pdma_pagemap_ops = {
+	.kill		= pci_p2pdma_percpu_kill,
+	.cleanup	= pci_p2pdma_percpu_cleanup,
+};
+
 /**
  * pci_p2pdma_add_resource - add memory for use as p2p memory
  * @pdev: the device to add the memory to
@@ -208,8 +213,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
 	pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
 		pci_resource_start(pdev, bar);
-	pgmap->kill = pci_p2pdma_percpu_kill;
-	pgmap->cleanup = pci_p2pdma_percpu_cleanup;
+	pgmap->ops = &pci_p2pdma_pagemap_ops;
 
 	addr = devm_memremap_pages(&pdev->dev, pgmap);
 	if (IS_ERR(addr)) {
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 0c86f2c5ac9c..919755f48c7e 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -63,41 +63,45 @@ enum memory_type {
 	MEMORY_DEVICE_PCI_P2PDMA,
 };
 
-/*
- * Additional notes about MEMORY_DEVICE_PRIVATE may be found in
- * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief
- * explanation in include/linux/memory_hotplug.h.
- *
- * The page_free() callback is called once the page refcount reaches 1
- * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
- * This allows the device driver to implement its own memory management.)
- */
-typedef void (*dev_page_free_t)(struct page *page, void *data);
+struct dev_pagemap_ops {
+	/*
+	 * Called once the page refcount reaches 1.  (ZONE_DEVICE pages never
+	 * reach 0 refcount unless there is a refcount bug. This allows the
+	 * device driver to implement its own memory management.)
+	 */
+	void (*page_free)(struct page *page, void *data);
+
+	/*
+	 * Transition the refcount in struct dev_pagemap to the dead state.
+	 */
+	void (*kill)(struct percpu_ref *ref);
+
+	/*
+	 * Wait for refcount in struct dev_pagemap to be idle and reap it.
+	 */
+	void (*cleanup)(struct percpu_ref *ref);
+};
 
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
- * @page_free: free page callback when page refcount reaches 1
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
  * @res: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
- * @kill: callback to transition @ref to the dead state
- * @cleanup: callback to wait for @ref to be idle and reap it
  * @dev: host device of the mapping for debug
  * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
+ * @ops: method table
  */
 struct dev_pagemap {
-	dev_page_free_t page_free;
 	struct vmem_altmap altmap;
 	bool altmap_valid;
 	struct resource res;
 	struct percpu_ref *ref;
-	void (*kill)(struct percpu_ref *ref);
-	void (*cleanup)(struct percpu_ref *ref);
 	struct device *dev;
 	void *data;
 	enum memory_type type;
 	u64 pci_p2pdma_bus_offset;
+	const struct dev_pagemap_ops *ops;
 };
 
 #ifdef CONFIG_ZONE_DEVICE
diff --git a/kernel/memremap.c b/kernel/memremap.c
index abda62d1e5a3..0824237ef979 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data)
 	unsigned long pfn;
 	int nid;
 
-	pgmap->kill(pgmap->ref);
+	pgmap->ops->kill(pgmap->ref);
 	for_each_device_pfn(pfn, pgmap)
 		put_page(pfn_to_page(pfn));
-	pgmap->cleanup(pgmap->ref);
+	pgmap->ops->cleanup(pgmap->ref);
 
 	/* pages are dead and unused, undo the arch mapping */
 	align_start = res->start & ~(SECTION_SIZE - 1);
@@ -128,8 +128,8 @@ static void devm_memremap_pages_release(void *data)
  * @pgmap: pointer to a struct dev_pagemap
  *
  * Notes:
- * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
- *    by the caller before passing it to this function
+ * 1/ At a minimum the res, ref and type and ops members of @pgmap must be
+ *    initialized by the caller before passing it to this function
  *
  * 2/ The altmap field may optionally be initialized, in which case altmap_valid
  *    must be set to true
@@ -179,7 +179,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 		break;
 	}
 
-	if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
+	if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill ||
+	    !pgmap->ops->cleanup) {
 		WARN(1, "Missing reference count teardown definition\n");
 		return ERR_PTR(-EINVAL);
 	}
@@ -293,9 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
  err_pfn_remap:
 	pgmap_array_delete(res);
  err_array:
-	pgmap->kill(pgmap->ref);
-	pgmap->cleanup(pgmap->ref);
-
+	pgmap->ops->kill(pgmap->ref);
+	pgmap->ops->cleanup(pgmap->ref);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
@@ -388,7 +388,7 @@ void __put_devmap_managed_page(struct page *page)
 
 		mem_cgroup_uncharge(page);
 
-		page->pgmap->page_free(page, page->pgmap->data);
+		page->pgmap->ops->page_free(page, page->pgmap->data);
 	} else if (!count)
 		__put_page(page);
 }
diff --git a/mm/hmm.c b/mm/hmm.c
index 48574f8485bb..583a02a16872 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1384,6 +1384,12 @@ static void hmm_devmem_free(struct page *page, void *data)
 	devmem->ops->free(devmem, page);
 }
 
+static const struct dev_pagemap_ops hmm_pagemap_ops = {
+	.page_free		= hmm_devmem_free,
+	.kill			= hmm_devmem_ref_kill,
+	.cleanup		= hmm_devmem_ref_exit,
+};
+
 /*
  * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
  *
@@ -1438,12 +1444,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 
 	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
 	devmem->pagemap.res = *devmem->resource;
-	devmem->pagemap.page_free = hmm_devmem_free;
+	devmem->pagemap.ops = &hmm_pagemap_ops;
 	devmem->pagemap.altmap_valid = false;
 	devmem->pagemap.ref = &devmem->ref;
 	devmem->pagemap.data = devmem;
-	devmem->pagemap.kill = hmm_devmem_ref_kill;
-	devmem->pagemap.cleanup = hmm_devmem_ref_exit;
 
 	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
 	if (IS_ERR(result))
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index 076df22e4bda..cf3f064a697d 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -100,9 +100,10 @@ static void nfit_test_kill(void *_pgmap)
 {
 	struct dev_pagemap *pgmap = _pgmap;
 
-	WARN_ON(!pgmap || !pgmap->ref || !pgmap->kill || !pgmap->cleanup);
-	pgmap->kill(pgmap->ref);
-	pgmap->cleanup(pgmap->ref);
+	WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill ||
+		!pgmap->ops->cleanup);
+	pgmap->ops->kill(pgmap->ref);
+	pgmap->ops->cleanup(pgmap->ref);
 }
 
 void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
-- 
cgit v1.2.3


From d8668bb0451c3c45b59dbcde2654e0539aad1d2a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:09 +0200
Subject: memremap: pass a struct dev_pagemap to ->kill and ->cleanup

Passing the actual typed structure leads to more understandable code
vs just passing the ref member.

Reported-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/dax/device.c              | 12 ++++++------
 drivers/nvdimm/pmem.c             | 18 +++++++++---------
 drivers/pci/p2pdma.c              |  9 +++++----
 include/linux/memremap.h          |  4 ++--
 kernel/memremap.c                 |  8 ++++----
 mm/hmm.c                          | 10 +++++-----
 tools/testing/nvdimm/test/iomap.c |  4 ++--
 7 files changed, 33 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index f390083a64d7..b5257038c188 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -27,21 +27,21 @@ static void dev_dax_percpu_release(struct percpu_ref *ref)
 	complete(&dev_dax->cmp);
 }
 
-static void dev_dax_percpu_exit(struct percpu_ref *ref)
+static void dev_dax_percpu_exit(struct dev_pagemap *pgmap)
 {
-	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
+	struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap);
 
 	dev_dbg(&dev_dax->dev, "%s\n", __func__);
 	wait_for_completion(&dev_dax->cmp);
-	percpu_ref_exit(ref);
+	percpu_ref_exit(pgmap->ref);
 }
 
-static void dev_dax_percpu_kill(struct percpu_ref *ref)
+static void dev_dax_percpu_kill(struct dev_pagemap *pgmap)
 {
-	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
+	struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap);
 
 	dev_dbg(&dev_dax->dev, "%s\n", __func__);
-	percpu_ref_kill(ref);
+	percpu_ref_kill(pgmap->ref);
 }
 
 static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index c2449af2b388..9dac48359353 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -303,24 +303,24 @@ static const struct attribute_group *pmem_attribute_groups[] = {
 	NULL,
 };
 
-static void pmem_pagemap_cleanup(struct percpu_ref *ref)
+static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
 {
-	struct request_queue *q;
+	struct request_queue *q =
+		container_of(pgmap->ref, struct request_queue, q_usage_counter);
 
-	q = container_of(ref, typeof(*q), q_usage_counter);
 	blk_cleanup_queue(q);
 }
 
-static void pmem_release_queue(void *ref)
+static void pmem_release_queue(void *pgmap)
 {
-	pmem_pagemap_cleanup(ref);
+	pmem_pagemap_cleanup(pgmap);
 }
 
-static void pmem_pagemap_kill(struct percpu_ref *ref)
+static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
 {
-	struct request_queue *q;
+	struct request_queue *q =
+		container_of(pgmap->ref, struct request_queue, q_usage_counter);
 
-	q = container_of(ref, typeof(*q), q_usage_counter);
 	blk_freeze_queue_start(q);
 }
 
@@ -435,7 +435,7 @@ static int pmem_attach_disk(struct device *dev,
 		memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
 	} else {
 		if (devm_add_action_or_reset(dev, pmem_release_queue,
-					&q->q_usage_counter))
+					&pmem->pgmap))
 			return -ENOMEM;
 		addr = devm_memremap(dev, pmem->phys_addr,
 				pmem->size, ARCH_MEMREMAP_PMEM);
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index fb039259d463..fa6249e4ed5f 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -91,14 +91,15 @@ static void pci_p2pdma_percpu_release(struct percpu_ref *ref)
 	complete(&p2p_pgmap->ref_done);
 }
 
-static void pci_p2pdma_percpu_kill(struct percpu_ref *ref)
+static void pci_p2pdma_percpu_kill(struct dev_pagemap *pgmap)
 {
-	percpu_ref_kill(ref);
+	percpu_ref_kill(pgmap->ref);
 }
 
-static void pci_p2pdma_percpu_cleanup(struct percpu_ref *ref)
+static void pci_p2pdma_percpu_cleanup(struct dev_pagemap *pgmap)
 {
-	struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
+	struct p2pdma_pagemap *p2p_pgmap =
+		container_of(pgmap, struct p2pdma_pagemap, pgmap);
 
 	wait_for_completion(&p2p_pgmap->ref_done);
 	percpu_ref_exit(&p2p_pgmap->ref);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 919755f48c7e..b8666a0d8665 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -74,12 +74,12 @@ struct dev_pagemap_ops {
 	/*
 	 * Transition the refcount in struct dev_pagemap to the dead state.
 	 */
-	void (*kill)(struct percpu_ref *ref);
+	void (*kill)(struct dev_pagemap *pgmap);
 
 	/*
 	 * Wait for refcount in struct dev_pagemap to be idle and reap it.
 	 */
-	void (*cleanup)(struct percpu_ref *ref);
+	void (*cleanup)(struct dev_pagemap *pgmap);
 };
 
 /**
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 0824237ef979..00c1ceb60c19 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data)
 	unsigned long pfn;
 	int nid;
 
-	pgmap->ops->kill(pgmap->ref);
+	pgmap->ops->kill(pgmap);
 	for_each_device_pfn(pfn, pgmap)
 		put_page(pfn_to_page(pfn));
-	pgmap->ops->cleanup(pgmap->ref);
+	pgmap->ops->cleanup(pgmap);
 
 	/* pages are dead and unused, undo the arch mapping */
 	align_start = res->start & ~(SECTION_SIZE - 1);
@@ -294,8 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
  err_pfn_remap:
 	pgmap_array_delete(res);
  err_array:
-	pgmap->ops->kill(pgmap->ref);
-	pgmap->ops->cleanup(pgmap->ref);
+	pgmap->ops->kill(pgmap);
+	pgmap->ops->cleanup(pgmap);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
diff --git a/mm/hmm.c b/mm/hmm.c
index 583a02a16872..987793fba923 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1352,18 +1352,18 @@ static void hmm_devmem_ref_release(struct percpu_ref *ref)
 	complete(&devmem->completion);
 }
 
-static void hmm_devmem_ref_exit(struct percpu_ref *ref)
+static void hmm_devmem_ref_exit(struct dev_pagemap *pgmap)
 {
 	struct hmm_devmem *devmem;
 
-	devmem = container_of(ref, struct hmm_devmem, ref);
+	devmem = container_of(pgmap, struct hmm_devmem, pagemap);
 	wait_for_completion(&devmem->completion);
-	percpu_ref_exit(ref);
+	percpu_ref_exit(pgmap->ref);
 }
 
-static void hmm_devmem_ref_kill(struct percpu_ref *ref)
+static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap)
 {
-	percpu_ref_kill(ref);
+	percpu_ref_kill(pgmap->ref);
 }
 
 static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index cf3f064a697d..82f901569e06 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -102,8 +102,8 @@ static void nfit_test_kill(void *_pgmap)
 
 	WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill ||
 		!pgmap->ops->cleanup);
-	pgmap->ops->kill(pgmap->ref);
-	pgmap->ops->cleanup(pgmap->ref);
+	pgmap->ops->kill(pgmap);
+	pgmap->ops->cleanup(pgmap);
 }
 
 void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
-- 
cgit v1.2.3


From f6a55e1a3fe6b3bb294a80a05437fcf86488d819 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:10 +0200
Subject: memremap: lift the devmap_enable manipulation into
 devm_memremap_pages

Just check if there is a ->page_free operation set and take care of the
static key enable, as well as the put using device managed resources.
Also check that a ->page_free is provided for the pgmaps types that
require it, and check for a valid type as well while we are at it.

Note that this also fixes the fact that hmm never called
dev_pagemap_put_ops and thus would leave the slow path enabled forever,
even after a device driver unload or disable.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/nvdimm/pmem.c | 23 ++++----------------
 include/linux/mm.h    | 10 ---------
 kernel/memremap.c     | 59 ++++++++++++++++++++++++++++++++-------------------
 mm/hmm.c              |  2 --
 4 files changed, 41 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 9dac48359353..48767171a4df 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -334,11 +334,6 @@ static void pmem_release_disk(void *__pmem)
 	put_disk(pmem->disk);
 }
 
-static void pmem_release_pgmap_ops(void *__pgmap)
-{
-	dev_pagemap_put_ops();
-}
-
 static void pmem_pagemap_page_free(struct page *page, void *data)
 {
 	wake_up_var(&page->_refcount);
@@ -350,16 +345,6 @@ static const struct dev_pagemap_ops fsdax_pagemap_ops = {
 	.cleanup		= pmem_pagemap_cleanup,
 };
 
-static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
-{
-	dev_pagemap_get_ops();
-	if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
-		return -ENOMEM;
-	pgmap->type = MEMORY_DEVICE_FS_DAX;
-	pgmap->ops = &fsdax_pagemap_ops;
-	return 0;
-}
-
 static int pmem_attach_disk(struct device *dev,
 		struct nd_namespace_common *ndns)
 {
@@ -415,8 +400,8 @@ static int pmem_attach_disk(struct device *dev,
 	pmem->pfn_flags = PFN_DEV;
 	pmem->pgmap.ref = &q->q_usage_counter;
 	if (is_nd_pfn(dev)) {
-		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
-			return -ENOMEM;
+		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pfn_sb = nd_pfn->pfn_sb;
 		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@@ -428,8 +413,8 @@ static int pmem_attach_disk(struct device *dev,
 	} else if (pmem_should_map_pages(dev)) {
 		memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
 		pmem->pgmap.altmap_valid = false;
-		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
-			return -ENOMEM;
+		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pmem->pfn_flags |= PFN_MAP;
 		memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7399f9f08de6..2425f4167ec2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -932,8 +932,6 @@ static inline bool is_zone_device_page(const struct page *page)
 #endif
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-void dev_pagemap_get_ops(void);
-void dev_pagemap_put_ops(void);
 void __put_devmap_managed_page(struct page *page);
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
 static inline bool put_devmap_managed_page(struct page *page)
@@ -973,14 +971,6 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
 #endif /* CONFIG_PCI_P2PDMA */
 
 #else /* CONFIG_DEV_PAGEMAP_OPS */
-static inline void dev_pagemap_get_ops(void)
-{
-}
-
-static inline void dev_pagemap_put_ops(void)
-{
-}
-
 static inline bool put_devmap_managed_page(struct page *page)
 {
 	return false;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 00c1ceb60c19..3219a4c91d07 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -17,6 +17,35 @@ static DEFINE_XARRAY(pgmap_array);
 #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
 #define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
 
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
+EXPORT_SYMBOL(devmap_managed_key);
+static atomic_t devmap_managed_enable;
+
+static void devmap_managed_enable_put(void *data)
+{
+	if (atomic_dec_and_test(&devmap_managed_enable))
+		static_branch_disable(&devmap_managed_key);
+}
+
+static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
+{
+	if (!pgmap->ops->page_free) {
+		WARN(1, "Missing page_free method\n");
+		return -EINVAL;
+	}
+
+	if (atomic_inc_return(&devmap_managed_enable) == 1)
+		static_branch_enable(&devmap_managed_key);
+	return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL);
+}
+#else
+static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
+
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
 		       unsigned long addr,
@@ -156,6 +185,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	};
 	pgprot_t pgprot = PAGE_KERNEL;
 	int error, nid, is_ram;
+	bool need_devmap_managed = true;
 
 	switch (pgmap->type) {
 	case MEMORY_DEVICE_PRIVATE:
@@ -173,6 +203,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 		break;
 	case MEMORY_DEVICE_DEVDAX:
 	case MEMORY_DEVICE_PCI_P2PDMA:
+		need_devmap_managed = false;
 		break;
 	default:
 		WARN(1, "Invalid pgmap type %d\n", pgmap->type);
@@ -185,6 +216,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 		return ERR_PTR(-EINVAL);
 	}
 
+	if (need_devmap_managed) {
+		error = devmap_managed_enable_get(dev, pgmap);
+		if (error)
+			return ERR_PTR(error);
+	}
+
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
 		- align_start;
@@ -351,28 +388,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 EXPORT_SYMBOL_GPL(get_dev_pagemap);
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
-EXPORT_SYMBOL(devmap_managed_key);
-static atomic_t devmap_enable;
-
-/*
- * Toggle the static key for ->page_free() callbacks when dev_pagemap
- * pages go idle.
- */
-void dev_pagemap_get_ops(void)
-{
-	if (atomic_inc_return(&devmap_enable) == 1)
-		static_branch_enable(&devmap_managed_key);
-}
-EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);
-
-void dev_pagemap_put_ops(void)
-{
-	if (atomic_dec_and_test(&devmap_enable))
-		static_branch_disable(&devmap_managed_key);
-}
-EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
-
 void __put_devmap_managed_page(struct page *page)
 {
 	int count = page_ref_dec_return(page);
diff --git a/mm/hmm.c b/mm/hmm.c
index 987793fba923..5b0bd5f6a74f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1415,8 +1415,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	void *result;
 	int ret;
 
-	dev_pagemap_get_ops();
-
 	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
 	if (!devmem)
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3


From 897e6365cda6ba6356e83a3aaa68dec82ef4c548 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:11 +0200
Subject: memremap: add a migrate_to_ram method to struct dev_pagemap_ops

This replaces the hacky ->fault callback, which is currently directly
called from common code through a hmm specific data structure as an
exercise in layering violations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/hmm.h      |  6 ------
 include/linux/memremap.h |  6 ++++++
 include/linux/swapops.h  | 15 ---------------
 kernel/memremap.c        | 35 ++++-------------------------------
 mm/hmm.c                 | 13 +++++--------
 mm/memory.c              |  9 ++-------
 6 files changed, 17 insertions(+), 67 deletions(-)

(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 44a5ac738bb5..ba19c19e24ed 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -692,11 +692,6 @@ struct hmm_devmem_ops {
  * chunk, as an optimization. It must, however, prioritize the faulting address
  * over all the others.
  */
-typedef vm_fault_t (*dev_page_fault_t)(struct vm_area_struct *vma,
-				unsigned long addr,
-				const struct page *page,
-				unsigned int flags,
-				pmd_t *pmdp);
 
 struct hmm_devmem {
 	struct completion		completion;
@@ -707,7 +702,6 @@ struct hmm_devmem {
 	struct dev_pagemap		pagemap;
 	const struct hmm_devmem_ops	*ops;
 	struct percpu_ref		ref;
-	dev_page_fault_t		page_fault;
 };
 
 /*
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index b8666a0d8665..ac985bd03a7f 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -80,6 +80,12 @@ struct dev_pagemap_ops {
 	 * Wait for refcount in struct dev_pagemap to be idle and reap it.
 	 */
 	void (*cleanup)(struct dev_pagemap *pgmap);
+
+	/*
+	 * Used for private (un-addressable) device memory only.  Must migrate
+	 * the page back to a CPU accessible page.
+	 */
+	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
 };
 
 /**
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 4d961668e5fc..15bdb6fe71e5 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -129,12 +129,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
 {
 	return pfn_to_page(swp_offset(entry));
 }
-
-vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
-		       unsigned long addr,
-		       swp_entry_t entry,
-		       unsigned int flags,
-		       pmd_t *pmdp);
 #else /* CONFIG_DEVICE_PRIVATE */
 static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
 {
@@ -164,15 +158,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
 {
 	return NULL;
 }
-
-static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
-				     unsigned long addr,
-				     swp_entry_t entry,
-				     unsigned int flags,
-				     pmd_t *pmdp)
-{
-	return VM_FAULT_SIGBUS;
-}
 #endif /* CONFIG_DEVICE_PRIVATE */
 
 #ifdef CONFIG_MIGRATION
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 3219a4c91d07..c06a5487dda7 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,7 +11,6 @@
 #include <linux/types.h>
 #include <linux/wait_bit.h>
 #include <linux/xarray.h>
-#include <linux/hmm.h>
 
 static DEFINE_XARRAY(pgmap_array);
 #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
@@ -46,36 +45,6 @@ static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgm
 }
 #endif /* CONFIG_DEV_PAGEMAP_OPS */
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
-		       unsigned long addr,
-		       swp_entry_t entry,
-		       unsigned int flags,
-		       pmd_t *pmdp)
-{
-	struct page *page = device_private_entry_to_page(entry);
-	struct hmm_devmem *devmem;
-
-	devmem = container_of(page->pgmap, typeof(*devmem), pagemap);
-
-	/*
-	 * The page_fault() callback must migrate page back to system memory
-	 * so that CPU can access it. This might fail for various reasons
-	 * (device issue, device was unsafely unplugged, ...). When such
-	 * error conditions happen, the callback must return VM_FAULT_SIGBUS.
-	 *
-	 * Note that because memory cgroup charges are accounted to the device
-	 * memory, this should never fail because of memory restrictions (but
-	 * allocation of regular system page might still fail because we are
-	 * out of memory).
-	 *
-	 * There is a more in-depth description of what that callback can and
-	 * cannot do, in include/linux/memremap.h
-	 */
-	return devmem->page_fault(vma, addr, page, flags, pmdp);
-}
-#endif /* CONFIG_DEVICE_PRIVATE */
-
 static void pgmap_array_delete(struct resource *res)
 {
 	xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
@@ -193,6 +162,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 			WARN(1, "Device private memory not supported\n");
 			return ERR_PTR(-EINVAL);
 		}
+		if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
+			WARN(1, "Missing migrate_to_ram method\n");
+			return ERR_PTR(-EINVAL);
+		}
 		break;
 	case MEMORY_DEVICE_FS_DAX:
 		if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
diff --git a/mm/hmm.c b/mm/hmm.c
index 5b0bd5f6a74f..96633ee066d8 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1366,15 +1366,12 @@ static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap)
 	percpu_ref_kill(pgmap->ref);
 }
 
-static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
-			    unsigned long addr,
-			    const struct page *page,
-			    unsigned int flags,
-			    pmd_t *pmdp)
+static vm_fault_t hmm_devmem_migrate_to_ram(struct vm_fault *vmf)
 {
-	struct hmm_devmem *devmem = page->pgmap->data;
+	struct hmm_devmem *devmem = vmf->page->pgmap->data;
 
-	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
+	return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page,
+			vmf->flags, vmf->pmd);
 }
 
 static void hmm_devmem_free(struct page *page, void *data)
@@ -1388,6 +1385,7 @@ static const struct dev_pagemap_ops hmm_pagemap_ops = {
 	.page_free		= hmm_devmem_free,
 	.kill			= hmm_devmem_ref_kill,
 	.cleanup		= hmm_devmem_ref_exit,
+	.migrate_to_ram		= hmm_devmem_migrate_to_ram,
 };
 
 /*
@@ -1438,7 +1436,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
 	devmem->pfn_last = devmem->pfn_first +
 			   (resource_size(devmem->resource) >> PAGE_SHIFT);
-	devmem->page_fault = hmm_devmem_fault;
 
 	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
 	devmem->pagemap.res = *devmem->resource;
diff --git a/mm/memory.c b/mm/memory.c
index 2d14f4c7e152..d437ccdb210c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2748,13 +2748,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			migration_entry_wait(vma->vm_mm, vmf->pmd,
 					     vmf->address);
 		} else if (is_device_private_entry(entry)) {
-			/*
-			 * For un-addressable device memory we call the pgmap
-			 * fault handler callback. The callback must migrate
-			 * the page back to some CPU accessible page.
-			 */
-			ret = device_private_entry_fault(vma, vmf->address, entry,
-						 vmf->flags, vmf->pmd);
+			vmf->page = device_private_entry_to_page(entry);
+			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
 		} else {
-- 
cgit v1.2.3


From 80a72d0af05ae97a8b106c172e431072ba587492 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:12 +0200
Subject: memremap: remove the data field in struct dev_pagemap

struct dev_pagemap is always embedded into a containing structure, so
there is no need to an additional private data field.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/nvdimm/pmem.c    | 2 +-
 include/linux/memremap.h | 3 +--
 kernel/memremap.c        | 2 +-
 mm/hmm.c                 | 9 +++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 48767171a4df..093408ce40ad 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -334,7 +334,7 @@ static void pmem_release_disk(void *__pmem)
 	put_disk(pmem->disk);
 }
 
-static void pmem_pagemap_page_free(struct page *page, void *data)
+static void pmem_pagemap_page_free(struct page *page)
 {
 	wake_up_var(&page->_refcount);
 }
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index ac985bd03a7f..336eca601dad 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -69,7 +69,7 @@ struct dev_pagemap_ops {
 	 * reach 0 refcount unless there is a refcount bug. This allows the
 	 * device driver to implement its own memory management.)
 	 */
-	void (*page_free)(struct page *page, void *data);
+	void (*page_free)(struct page *page);
 
 	/*
 	 * Transition the refcount in struct dev_pagemap to the dead state.
@@ -104,7 +104,6 @@ struct dev_pagemap {
 	struct resource res;
 	struct percpu_ref *ref;
 	struct device *dev;
-	void *data;
 	enum memory_type type;
 	u64 pci_p2pdma_bus_offset;
 	const struct dev_pagemap_ops *ops;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index c06a5487dda7..6c3dbb692037 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -376,7 +376,7 @@ void __put_devmap_managed_page(struct page *page)
 
 		mem_cgroup_uncharge(page);
 
-		page->pgmap->ops->page_free(page, page->pgmap->data);
+		page->pgmap->ops->page_free(page);
 	} else if (!count)
 		__put_page(page);
 }
diff --git a/mm/hmm.c b/mm/hmm.c
index 96633ee066d8..36e25cdbdac1 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1368,15 +1368,17 @@ static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap)
 
 static vm_fault_t hmm_devmem_migrate_to_ram(struct vm_fault *vmf)
 {
-	struct hmm_devmem *devmem = vmf->page->pgmap->data;
+	struct hmm_devmem *devmem =
+		container_of(vmf->page->pgmap, struct hmm_devmem, pagemap);
 
 	return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page,
 			vmf->flags, vmf->pmd);
 }
 
-static void hmm_devmem_free(struct page *page, void *data)
+static void hmm_devmem_free(struct page *page)
 {
-	struct hmm_devmem *devmem = data;
+	struct hmm_devmem *devmem =
+		container_of(page->pgmap, struct hmm_devmem, pagemap);
 
 	devmem->ops->free(devmem, page);
 }
@@ -1442,7 +1444,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	devmem->pagemap.ops = &hmm_pagemap_ops;
 	devmem->pagemap.altmap_valid = false;
 	devmem->pagemap.ref = &devmem->ref;
-	devmem->pagemap.data = devmem;
 
 	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
 	if (IS_ERR(result))
-- 
cgit v1.2.3


From 514caf23a70fd697fa2ece238b2cd8dcc73fb16f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:13 +0200
Subject: memremap: replace the altmap_valid field with a PGMAP_ALTMAP_VALID
 flag

Add a flags field to struct dev_pagemap to replace the altmap_valid
boolean to be a little more extensible.  Also add a pgmap_altmap() helper
to find the optional altmap and clean up the code using the altmap using
it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 arch/powerpc/mm/mem.c     | 10 +---------
 arch/x86/mm/init_64.c     |  8 ++------
 drivers/nvdimm/pfn_devs.c |  3 +--
 drivers/nvdimm/pmem.c     |  1 -
 include/linux/memremap.h  | 12 +++++++++++-
 kernel/memremap.c         | 26 ++++++++++----------------
 mm/hmm.c                  |  1 -
 mm/memory_hotplug.c       |  6 ++----
 mm/page_alloc.c           |  5 ++---
 9 files changed, 29 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 2540d3b2588c..a2923c5c1982 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -131,17 +131,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct page *page;
+	struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
 	int ret;
 
-	/*
-	 * If we have an altmap then we need to skip over any reserved PFNs
-	 * when querying the zone.
-	 */
-	page = pfn_to_page(start_pfn);
-	if (altmap)
-		page += vmem_altmap_offset(altmap);
-
 	__remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
 
 	/* Remove htab bolted mappings for this section of memory */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0f01c7b1d217..08bbf648827b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1213,13 +1213,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct page *page = pfn_to_page(start_pfn);
-	struct zone *zone;
+	struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
+	struct zone *zone = page_zone(page);
 
-	/* With altmap the first mapped page is offset from @start */
-	if (altmap)
-		page += vmem_altmap_offset(altmap);
-	zone = page_zone(page);
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 	kernel_physical_mapping_remove(start, start + size);
 }
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 0f81fc56bbfd..55fb6b7433ed 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -622,7 +622,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 		if (offset < reserve)
 			return -EINVAL;
 		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
-		pgmap->altmap_valid = false;
 	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
 		nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res)
 					- offset) / PAGE_SIZE);
@@ -634,7 +633,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 		memcpy(altmap, &__altmap, sizeof(*altmap));
 		altmap->free = PHYS_PFN(offset - reserve);
 		altmap->alloc = 0;
-		pgmap->altmap_valid = true;
+		pgmap->flags |= PGMAP_ALTMAP_VALID;
 	} else
 		return -ENXIO;
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 093408ce40ad..e7d8cc9f41e8 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -412,7 +412,6 @@ static int pmem_attach_disk(struct device *dev,
 		bb_res.start += pmem->data_offset;
 	} else if (pmem_should_map_pages(dev)) {
 		memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
-		pmem->pgmap.altmap_valid = false;
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
 		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 336eca601dad..e25685b878e9 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -88,6 +88,8 @@ struct dev_pagemap_ops {
 	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
 };
 
+#define PGMAP_ALTMAP_VALID	(1 << 0)
+
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
@@ -96,19 +98,27 @@ struct dev_pagemap_ops {
  * @dev: host device of the mapping for debug
  * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
+ * @flags: PGMAP_* flags to specify defailed behavior
  * @ops: method table
  */
 struct dev_pagemap {
 	struct vmem_altmap altmap;
-	bool altmap_valid;
 	struct resource res;
 	struct percpu_ref *ref;
 	struct device *dev;
 	enum memory_type type;
+	unsigned int flags;
 	u64 pci_p2pdma_bus_offset;
 	const struct dev_pagemap_ops *ops;
 };
 
+static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
+{
+	if (pgmap->flags & PGMAP_ALTMAP_VALID)
+		return &pgmap->altmap;
+	return NULL;
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
 void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6c3dbb692037..eee490e7d7e1 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -54,14 +54,8 @@ static void pgmap_array_delete(struct resource *res)
 
 static unsigned long pfn_first(struct dev_pagemap *pgmap)
 {
-	const struct resource *res = &pgmap->res;
-	struct vmem_altmap *altmap = &pgmap->altmap;
-	unsigned long pfn;
-
-	pfn = res->start >> PAGE_SHIFT;
-	if (pgmap->altmap_valid)
-		pfn += vmem_altmap_offset(altmap);
-	return pfn;
+	return (pgmap->res.start >> PAGE_SHIFT) +
+		vmem_altmap_offset(pgmap_altmap(pgmap));
 }
 
 static unsigned long pfn_end(struct dev_pagemap *pgmap)
@@ -109,7 +103,7 @@ static void devm_memremap_pages_release(void *data)
 				align_size >> PAGE_SHIFT, NULL);
 	} else {
 		arch_remove_memory(nid, align_start, align_size,
-				pgmap->altmap_valid ? &pgmap->altmap : NULL);
+				pgmap_altmap(pgmap));
 		kasan_remove_zero_shadow(__va(align_start), align_size);
 	}
 	mem_hotplug_done();
@@ -129,8 +123,8 @@ static void devm_memremap_pages_release(void *data)
  * 1/ At a minimum the res, ref and type and ops members of @pgmap must be
  *    initialized by the caller before passing it to this function
  *
- * 2/ The altmap field may optionally be initialized, in which case altmap_valid
- *    must be set to true
+ * 2/ The altmap field may optionally be initialized, in which case
+ *    PGMAP_ALTMAP_VALID must be set in pgmap->flags.
  *
  * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
  *    at devm_memremap_pages_release() time, or if this routine fails.
@@ -142,15 +136,13 @@ static void devm_memremap_pages_release(void *data)
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 {
 	resource_size_t align_start, align_size, align_end;
-	struct vmem_altmap *altmap = pgmap->altmap_valid ?
-			&pgmap->altmap : NULL;
 	struct resource *res = &pgmap->res;
 	struct dev_pagemap *conflict_pgmap;
 	struct mhp_restrictions restrictions = {
 		/*
 		 * We do not want any optional features only our own memmap
 		*/
-		.altmap = altmap,
+		.altmap = pgmap_altmap(pgmap),
 	};
 	pgprot_t pgprot = PAGE_KERNEL;
 	int error, nid, is_ram;
@@ -274,7 +266,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 
 		zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
 		move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
-				align_size >> PAGE_SHIFT, altmap);
+				align_size >> PAGE_SHIFT, pgmap_altmap(pgmap));
 	}
 
 	mem_hotplug_done();
@@ -319,7 +311,9 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages);
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
 {
 	/* number of pfns from base where pfn_to_page() is valid */
-	return altmap->reserve + altmap->free;
+	if (altmap)
+		return altmap->reserve + altmap->free;
+	return 0;
 }
 
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
diff --git a/mm/hmm.c b/mm/hmm.c
index 36e25cdbdac1..e4470462298f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1442,7 +1442,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
 	devmem->pagemap.res = *devmem->resource;
 	devmem->pagemap.ops = &hmm_pagemap_ops;
-	devmem->pagemap.altmap_valid = false;
 	devmem->pagemap.ref = &devmem->ref;
 
 	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e096c987d261..6166ba5a15f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -557,10 +557,8 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 	int sections_to_remove;
 
 	/* In the ZONE_DEVICE case device driver owns the memory region */
-	if (is_dev_zone(zone)) {
-		if (altmap)
-			map_offset = vmem_altmap_offset(altmap);
-	}
+	if (is_dev_zone(zone))
+		map_offset = vmem_altmap_offset(altmap);
 
 	clear_zone_contiguous(zone);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d66bc8abe0af..17a39d40a556 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5853,6 +5853,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 {
 	unsigned long pfn, end_pfn = start_pfn + size;
 	struct pglist_data *pgdat = zone->zone_pgdat;
+	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
 	unsigned long zone_idx = zone_idx(zone);
 	unsigned long start = jiffies;
 	int nid = pgdat->node_id;
@@ -5865,9 +5866,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 	 * of the pages reserved for the memmap, so we can just jump to
 	 * the end of that region and start processing the device pages.
 	 */
-	if (pgmap->altmap_valid) {
-		struct vmem_altmap *altmap = &pgmap->altmap;
-
+	if (altmap) {
 		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
 		size = end_pfn - start_pfn;
 	}
-- 
cgit v1.2.3


From 24917f6b1041a73993178920656e13364f847995 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:14 +0200
Subject: memremap: provide an optional internal refcount in struct dev_pagemap

Provide an internal refcounting logic if no ->ref field is provided
in the pagemap passed into devm_memremap_pages so that callers don't
have to reinvent it poorly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/memremap.h          |  4 +++
 kernel/memremap.c                 | 64 +++++++++++++++++++++++++++++++--------
 tools/testing/nvdimm/test/iomap.c | 58 +++++++++++++++++++++++++++--------
 3 files changed, 101 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index e25685b878e9..f8a5b2a19945 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -95,6 +95,8 @@ struct dev_pagemap_ops {
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
  * @res: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
+ * @internal_ref: internal reference if @ref is not provided by the caller
+ * @done: completion for @internal_ref
  * @dev: host device of the mapping for debug
  * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
@@ -105,6 +107,8 @@ struct dev_pagemap {
 	struct vmem_altmap altmap;
 	struct resource res;
 	struct percpu_ref *ref;
+	struct percpu_ref internal_ref;
+	struct completion done;
 	struct device *dev;
 	enum memory_type type;
 	unsigned int flags;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index eee490e7d7e1..bea6f887adad 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -29,7 +29,7 @@ static void devmap_managed_enable_put(void *data)
 
 static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
 {
-	if (!pgmap->ops->page_free) {
+	if (!pgmap->ops || !pgmap->ops->page_free) {
 		WARN(1, "Missing page_free method\n");
 		return -EINVAL;
 	}
@@ -75,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn)
 #define for_each_device_pfn(pfn, map) \
 	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
 
+static void dev_pagemap_kill(struct dev_pagemap *pgmap)
+{
+	if (pgmap->ops && pgmap->ops->kill)
+		pgmap->ops->kill(pgmap);
+	else
+		percpu_ref_kill(pgmap->ref);
+}
+
+static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
+{
+	if (pgmap->ops && pgmap->ops->cleanup) {
+		pgmap->ops->cleanup(pgmap);
+	} else {
+		wait_for_completion(&pgmap->done);
+		percpu_ref_exit(pgmap->ref);
+	}
+}
+
 static void devm_memremap_pages_release(void *data)
 {
 	struct dev_pagemap *pgmap = data;
@@ -84,10 +102,10 @@ static void devm_memremap_pages_release(void *data)
 	unsigned long pfn;
 	int nid;
 
-	pgmap->ops->kill(pgmap);
+	dev_pagemap_kill(pgmap);
 	for_each_device_pfn(pfn, pgmap)
 		put_page(pfn_to_page(pfn));
-	pgmap->ops->cleanup(pgmap);
+	dev_pagemap_cleanup(pgmap);
 
 	/* pages are dead and unused, undo the arch mapping */
 	align_start = res->start & ~(SECTION_SIZE - 1);
@@ -114,20 +132,29 @@ static void devm_memremap_pages_release(void *data)
 		      "%s: failed to free all reserved pages\n", __func__);
 }
 
+static void dev_pagemap_percpu_release(struct percpu_ref *ref)
+{
+	struct dev_pagemap *pgmap =
+		container_of(ref, struct dev_pagemap, internal_ref);
+
+	complete(&pgmap->done);
+}
+
 /**
  * devm_memremap_pages - remap and provide memmap backing for the given resource
  * @dev: hosting device for @res
  * @pgmap: pointer to a struct dev_pagemap
  *
  * Notes:
- * 1/ At a minimum the res, ref and type and ops members of @pgmap must be
- *    initialized by the caller before passing it to this function
+ * 1/ At a minimum the res and type members of @pgmap must be initialized
+ *    by the caller before passing it to this function
  *
  * 2/ The altmap field may optionally be initialized, in which case
  *    PGMAP_ALTMAP_VALID must be set in pgmap->flags.
  *
- * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
- *    at devm_memremap_pages_release() time, or if this routine fails.
+ * 3/ The ref field may optionally be provided, in which pgmap->ref must be
+ *    'live' on entry and will be killed and reaped at
+ *    devm_memremap_pages_release() time, or if this routine fails.
  *
  * 4/ res is expected to be a host memory range that could feasibly be
  *    treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -175,10 +202,21 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 		break;
 	}
 
-	if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill ||
-	    !pgmap->ops->cleanup) {
-		WARN(1, "Missing reference count teardown definition\n");
-		return ERR_PTR(-EINVAL);
+	if (!pgmap->ref) {
+		if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
+			return ERR_PTR(-EINVAL);
+
+		init_completion(&pgmap->done);
+		error = percpu_ref_init(&pgmap->internal_ref,
+				dev_pagemap_percpu_release, 0, GFP_KERNEL);
+		if (error)
+			return ERR_PTR(error);
+		pgmap->ref = &pgmap->internal_ref;
+	} else {
+		if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
+			WARN(1, "Missing reference count teardown definition\n");
+			return ERR_PTR(-EINVAL);
+		}
 	}
 
 	if (need_devmap_managed) {
@@ -296,8 +334,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
  err_pfn_remap:
 	pgmap_array_delete(res);
  err_array:
-	pgmap->ops->kill(pgmap);
-	pgmap->ops->cleanup(pgmap);
+	dev_pagemap_kill(pgmap);
+	dev_pagemap_cleanup(pgmap);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index 82f901569e06..cd040b5abffe 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -100,26 +100,60 @@ static void nfit_test_kill(void *_pgmap)
 {
 	struct dev_pagemap *pgmap = _pgmap;
 
-	WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill ||
-		!pgmap->ops->cleanup);
-	pgmap->ops->kill(pgmap);
-	pgmap->ops->cleanup(pgmap);
+	WARN_ON(!pgmap || !pgmap->ref);
+
+	if (pgmap->ops && pgmap->ops->kill)
+		pgmap->ops->kill(pgmap);
+	else
+		percpu_ref_kill(pgmap->ref);
+
+	if (pgmap->ops && pgmap->ops->cleanup) {
+		pgmap->ops->cleanup(pgmap);
+	} else {
+		wait_for_completion(&pgmap->done);
+		percpu_ref_exit(pgmap->ref);
+	}
+}
+
+static void dev_pagemap_percpu_release(struct percpu_ref *ref)
+{
+	struct dev_pagemap *pgmap =
+		container_of(ref, struct dev_pagemap, internal_ref);
+
+	complete(&pgmap->done);
 }
 
 void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 {
+	int error;
 	resource_size_t offset = pgmap->res.start;
 	struct nfit_test_resource *nfit_res = get_nfit_res(offset);
 
-	if (nfit_res) {
-		int rc;
-
-		rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
-		if (rc)
-			return ERR_PTR(rc);
-		return nfit_res->buf + offset - nfit_res->res.start;
+	if (!nfit_res)
+		return devm_memremap_pages(dev, pgmap);
+
+	pgmap->dev = dev;
+	if (!pgmap->ref) {
+		if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
+			return ERR_PTR(-EINVAL);
+
+		init_completion(&pgmap->done);
+		error = percpu_ref_init(&pgmap->internal_ref,
+				dev_pagemap_percpu_release, 0, GFP_KERNEL);
+		if (error)
+			return ERR_PTR(error);
+		pgmap->ref = &pgmap->internal_ref;
+	} else {
+		if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
+			WARN(1, "Missing reference count teardown definition\n");
+			return ERR_PTR(-EINVAL);
+		}
 	}
-	return devm_memremap_pages(dev, pgmap);
+
+	error = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
+	if (error)
+		return ERR_PTR(error);
+	return nfit_res->buf + offset - nfit_res->res.start;
 }
 EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
 
-- 
cgit v1.2.3


From 47e9d836a5e827acdaa5cb6175648fbef15b4e84 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:19 +0200
Subject: mm: remove hmm_vma_alloc_locked_page

The only user of it has just been removed, and there wasn't really any need
to wrap a basic memory allocator to start with.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/hmm.h |  3 ---
 mm/hmm.c            | 14 --------------
 2 files changed, 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ba19c19e24ed..1d55b7ea2da6 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -587,9 +587,6 @@ static inline void hmm_mm_init(struct mm_struct *mm) {}
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 struct hmm_devmem;
 
-struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
-				       unsigned long addr);
-
 /*
  * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events
  *
diff --git a/mm/hmm.c b/mm/hmm.c
index e4470462298f..fdbd48771292 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1330,20 +1330,6 @@ EXPORT_SYMBOL(hmm_range_dma_unmap);
 
 
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
-				       unsigned long addr)
-{
-	struct page *page;
-
-	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
-	if (!page)
-		return NULL;
-	lock_page(page);
-	return page;
-}
-EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
-
-
 static void hmm_devmem_ref_release(struct percpu_ref *ref)
 {
 	struct hmm_devmem *devmem;
-- 
cgit v1.2.3


From eee3ae41b153e55e25d6cf7bd5b5098ba0afe705 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:20 +0200
Subject: mm: remove hmm_devmem_add

There isn't really much value add in the hmm_devmem_add wrapper and
more, as using devm_memremap_pages directly now is just as simple.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 Documentation/vm/hmm.rst |  26 ----------
 include/linux/hmm.h      | 129 -----------------------------------------------
 mm/hmm.c                 | 110 ----------------------------------------
 3 files changed, 265 deletions(-)

(limited to 'include')

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 7cdf7282e022..50e1380950a9 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -329,32 +329,6 @@ directly using struct page for device memory which left most kernel code paths
 unaware of the difference. We only need to make sure that no one ever tries to
 map those pages from the CPU side.
 
-HMM provides a set of helpers to register and hotplug device memory as a new
-region needing a struct page. This is offered through a very simple API::
-
- struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
-                                   struct device *device,
-                                   unsigned long size);
- void hmm_devmem_remove(struct hmm_devmem *devmem);
-
-The hmm_devmem_ops is where most of the important things are::
-
- struct hmm_devmem_ops {
-     void (*free)(struct hmm_devmem *devmem, struct page *page);
-     int (*fault)(struct hmm_devmem *devmem,
-                  struct vm_area_struct *vma,
-                  unsigned long addr,
-                  struct page *page,
-                  unsigned flags,
-                  pmd_t *pmdp);
- };
-
-The first callback (free()) happens when the last reference on a device page is
-dropped. This means the device page is now free and no longer used by anyone.
-The second callback happens whenever the CPU tries to access a device page
-which it cannot do. This second callback must trigger a migration back to
-system memory.
-
 
 Migration to and from device memory
 ===================================
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 1d55b7ea2da6..86aa4ec3404c 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -585,135 +585,6 @@ static inline void hmm_mm_init(struct mm_struct *mm) {}
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-struct hmm_devmem;
-
-/*
- * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events
- *
- * @free: call when refcount on page reach 1 and thus is no longer use
- * @fault: call when there is a page fault to unaddressable memory
- *
- * Both callback happens from page_free() and page_fault() callback of struct
- * dev_pagemap respectively. See include/linux/memremap.h for more details on
- * those.
- *
- * The hmm_devmem_ops callback are just here to provide a coherent and
- * uniq API to device driver and device driver should not register their
- * own page_free() or page_fault() but rely on the hmm_devmem_ops call-
- * back.
- */
-struct hmm_devmem_ops {
-	/*
-	 * free() - free a device page
-	 * @devmem: device memory structure (see struct hmm_devmem)
-	 * @page: pointer to struct page being freed
-	 *
-	 * Call back occurs whenever a device page refcount reach 1 which
-	 * means that no one is holding any reference on the page anymore
-	 * (ZONE_DEVICE page have an elevated refcount of 1 as default so
-	 * that they are not release to the general page allocator).
-	 *
-	 * Note that callback has exclusive ownership of the page (as no
-	 * one is holding any reference).
-	 */
-	void (*free)(struct hmm_devmem *devmem, struct page *page);
-	/*
-	 * fault() - CPU page fault or get user page (GUP)
-	 * @devmem: device memory structure (see struct hmm_devmem)
-	 * @vma: virtual memory area containing the virtual address
-	 * @addr: virtual address that faulted or for which there is a GUP
-	 * @page: pointer to struct page backing virtual address (unreliable)
-	 * @flags: FAULT_FLAG_* (see include/linux/mm.h)
-	 * @pmdp: page middle directory
-	 * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
-	 *   on error
-	 *
-	 * The callback occurs whenever there is a CPU page fault or GUP on a
-	 * virtual address. This means that the device driver must migrate the
-	 * page back to regular memory (CPU accessible).
-	 *
-	 * The device driver is free to migrate more than one page from the
-	 * fault() callback as an optimization. However if device decide to
-	 * migrate more than one page it must always priotirize the faulting
-	 * address over the others.
-	 *
-	 * The struct page pointer is only given as an hint to allow quick
-	 * lookup of internal device driver data. A concurrent migration
-	 * might have already free that page and the virtual address might
-	 * not longer be back by it. So it should not be modified by the
-	 * callback.
-	 *
-	 * Note that mmap semaphore is held in read mode at least when this
-	 * callback occurs, hence the vma is valid upon callback entry.
-	 */
-	vm_fault_t (*fault)(struct hmm_devmem *devmem,
-		     struct vm_area_struct *vma,
-		     unsigned long addr,
-		     const struct page *page,
-		     unsigned int flags,
-		     pmd_t *pmdp);
-};
-
-/*
- * struct hmm_devmem - track device memory
- *
- * @completion: completion object for device memory
- * @pfn_first: first pfn for this resource (set by hmm_devmem_add())
- * @pfn_last: last pfn for this resource (set by hmm_devmem_add())
- * @resource: IO resource reserved for this chunk of memory
- * @pagemap: device page map for that chunk
- * @device: device to bind resource to
- * @ops: memory operations callback
- * @ref: per CPU refcount
- * @page_fault: callback when CPU fault on an unaddressable device page
- *
- * This an helper structure for device drivers that do not wish to implement
- * the gory details related to hotplugging new memoy and allocating struct
- * pages.
- *
- * Device drivers can directly use ZONE_DEVICE memory on their own if they
- * wish to do so.
- *
- * The page_fault() callback must migrate page back, from device memory to
- * system memory, so that the CPU can access it. This might fail for various
- * reasons (device issues,  device have been unplugged, ...). When such error
- * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and
- * set the CPU page table entry to "poisoned".
- *
- * Note that because memory cgroup charges are transferred to the device memory,
- * this should never fail due to memory restrictions. However, allocation
- * of a regular system page might still fail because we are out of memory. If
- * that happens, the page_fault() callback must return VM_FAULT_OOM.
- *
- * The page_fault() callback can also try to migrate back multiple pages in one
- * chunk, as an optimization. It must, however, prioritize the faulting address
- * over all the others.
- */
-
-struct hmm_devmem {
-	struct completion		completion;
-	unsigned long			pfn_first;
-	unsigned long			pfn_last;
-	struct resource			*resource;
-	struct device			*device;
-	struct dev_pagemap		pagemap;
-	const struct hmm_devmem_ops	*ops;
-	struct percpu_ref		ref;
-};
-
-/*
- * To add (hotplug) device memory, HMM assumes that there is no real resource
- * that reserves a range in the physical address space (this is intended to be
- * use by unaddressable device memory). It will reserve a physical range big
- * enough and allocate struct page for it.
- *
- * The device driver can wrap the hmm_devmem struct inside a private device
- * driver struct.
- */
-struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
-				  struct device *device,
-				  unsigned long size);
-
 /*
  * hmm_devmem_page_set_drvdata - set per-page driver data field
  *
diff --git a/mm/hmm.c b/mm/hmm.c
index fdbd48771292..90ca0cdab9db 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1327,113 +1327,3 @@ long hmm_range_dma_unmap(struct hmm_range *range,
 }
 EXPORT_SYMBOL(hmm_range_dma_unmap);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-
-
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-static void hmm_devmem_ref_release(struct percpu_ref *ref)
-{
-	struct hmm_devmem *devmem;
-
-	devmem = container_of(ref, struct hmm_devmem, ref);
-	complete(&devmem->completion);
-}
-
-static void hmm_devmem_ref_exit(struct dev_pagemap *pgmap)
-{
-	struct hmm_devmem *devmem;
-
-	devmem = container_of(pgmap, struct hmm_devmem, pagemap);
-	wait_for_completion(&devmem->completion);
-	percpu_ref_exit(pgmap->ref);
-}
-
-static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap)
-{
-	percpu_ref_kill(pgmap->ref);
-}
-
-static vm_fault_t hmm_devmem_migrate_to_ram(struct vm_fault *vmf)
-{
-	struct hmm_devmem *devmem =
-		container_of(vmf->page->pgmap, struct hmm_devmem, pagemap);
-
-	return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page,
-			vmf->flags, vmf->pmd);
-}
-
-static void hmm_devmem_free(struct page *page)
-{
-	struct hmm_devmem *devmem =
-		container_of(page->pgmap, struct hmm_devmem, pagemap);
-
-	devmem->ops->free(devmem, page);
-}
-
-static const struct dev_pagemap_ops hmm_pagemap_ops = {
-	.page_free		= hmm_devmem_free,
-	.kill			= hmm_devmem_ref_kill,
-	.cleanup		= hmm_devmem_ref_exit,
-	.migrate_to_ram		= hmm_devmem_migrate_to_ram,
-};
-
-/*
- * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
- *
- * @ops: memory event device driver callback (see struct hmm_devmem_ops)
- * @device: device struct to bind the resource too
- * @size: size in bytes of the device memory to add
- * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise
- *
- * This function first finds an empty range of physical address big enough to
- * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
- * in turn allocates struct pages. It does not do anything beyond that; all
- * events affecting the memory will go through the various callbacks provided
- * by hmm_devmem_ops struct.
- *
- * Device driver should call this function during device initialization and
- * is then responsible of memory management. HMM only provides helpers.
- */
-struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
-				  struct device *device,
-				  unsigned long size)
-{
-	struct hmm_devmem *devmem;
-	void *result;
-	int ret;
-
-	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
-	if (!devmem)
-		return ERR_PTR(-ENOMEM);
-
-	init_completion(&devmem->completion);
-	devmem->pfn_first = -1UL;
-	devmem->pfn_last = -1UL;
-	devmem->resource = NULL;
-	devmem->device = device;
-	devmem->ops = ops;
-
-	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
-			      0, GFP_KERNEL);
-	if (ret)
-		return ERR_PTR(ret);
-
-	devmem->resource = devm_request_free_mem_region(device, &iomem_resource,
-			size);
-	if (IS_ERR(devmem->resource))
-		return ERR_CAST(devmem->resource);
-	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
-	devmem->pfn_last = devmem->pfn_first +
-			   (resource_size(devmem->resource) >> PAGE_SHIFT);
-
-	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
-	devmem->pagemap.res = *devmem->resource;
-	devmem->pagemap.ops = &hmm_pagemap_ops;
-	devmem->pagemap.ref = &devmem->ref;
-
-	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
-	if (IS_ERR(result))
-		return result;
-	return devmem;
-}
-EXPORT_SYMBOL_GPL(hmm_devmem_add);
-#endif /* CONFIG_DEVICE_PRIVATE  */
-- 
cgit v1.2.3


From 8a164fef9c4ccf6ff7757170397222860e40d192 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:21 +0200
Subject: mm: simplify ZONE_DEVICE page private data

Remove the clumsy hmm_devmem_page_{get,set}_drvdata helpers, and
instead just access the page directly.  Also make the page data
a void pointer, and thus much easier to use.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/gpu/drm/nouveau/nouveau_dmem.c | 18 +++++++-----------
 include/linux/hmm.h                    | 32 --------------------------------
 include/linux/mm_types.h               |  2 +-
 mm/page_alloc.c                        |  8 ++++----
 4 files changed, 12 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 0fb7a44b8bc4..42c026010938 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -104,11 +104,8 @@ struct nouveau_migrate {
 
 static void nouveau_dmem_page_free(struct page *page)
 {
-	struct nouveau_dmem_chunk *chunk;
-	unsigned long idx;
-
-	chunk = (void *)hmm_devmem_page_get_drvdata(page);
-	idx = page_to_pfn(page) - chunk->pfn_first;
+	struct nouveau_dmem_chunk *chunk = page->zone_device_data;
+	unsigned long idx = page_to_pfn(page) - chunk->pfn_first;
 
 	/*
 	 * FIXME:
@@ -200,7 +197,7 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
 
 		dst_addr = fault->dma[fault->npages++];
 
-		chunk = (void *)hmm_devmem_page_get_drvdata(spage);
+		chunk = spage->zone_device_data;
 		src_addr = page_to_pfn(spage) - chunk->pfn_first;
 		src_addr = (src_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
 
@@ -633,9 +630,8 @@ nouveau_dmem_init(struct nouveau_drm *drm)
 		list_add_tail(&chunk->list, &drm->dmem->chunk_empty);
 
 		page = pfn_to_page(chunk->pfn_first);
-		for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page) {
-			hmm_devmem_page_set_drvdata(page, (long)chunk);
-		}
+		for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page)
+			page->zone_device_data = chunk;
 	}
 
 	NV_INFO(drm, "DMEM: registered %ldMB of device memory\n", size >> 20);
@@ -698,7 +694,7 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma,
 		if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
 			continue;
 
-		chunk = (void *)hmm_devmem_page_get_drvdata(dpage);
+		chunk = dpage->zone_device_data;
 		dst_addr = page_to_pfn(dpage) - chunk->pfn_first;
 		dst_addr = (dst_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
 
@@ -862,7 +858,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
 			continue;
 		}
 
-		chunk = (void *)hmm_devmem_page_get_drvdata(page);
+		chunk = page->zone_device_data;
 		addr = page_to_pfn(page) - chunk->pfn_first;
 		addr = (addr + chunk->bo->bo.mem.start) << PAGE_SHIFT;
 
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 86aa4ec3404c..3d00e9550e77 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -584,36 +584,4 @@ static inline void hmm_mm_destroy(struct mm_struct *mm) {}
 static inline void hmm_mm_init(struct mm_struct *mm) {}
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-/*
- * hmm_devmem_page_set_drvdata - set per-page driver data field
- *
- * @page: pointer to struct page
- * @data: driver data value to set
- *
- * Because page can not be on lru we have an unsigned long that driver can use
- * to store a per page field. This just a simple helper to do that.
- */
-static inline void hmm_devmem_page_set_drvdata(struct page *page,
-					       unsigned long data)
-{
-	page->hmm_data = data;
-}
-
-/*
- * hmm_devmem_page_get_drvdata - get per page driver data field
- *
- * @page: pointer to struct page
- * Return: driver data value
- */
-static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page)
-{
-	return page->hmm_data;
-}
-#endif /* CONFIG_DEVICE_PRIVATE */
-#else /* IS_ENABLED(CONFIG_HMM) */
-static inline void hmm_mm_destroy(struct mm_struct *mm) {}
-static inline void hmm_mm_init(struct mm_struct *mm) {}
-#endif /* IS_ENABLED(CONFIG_HMM) */
-
 #endif /* LINUX_HMM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8ec38b11b361..f33a1289c101 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -158,7 +158,7 @@ struct page {
 		struct {	/* ZONE_DEVICE pages */
 			/** @pgmap: Points to the hosting device page map. */
 			struct dev_pagemap *pgmap;
-			unsigned long hmm_data;
+			void *zone_device_data;
 			unsigned long _zd_pad_1;	/* uses mapping */
 		};
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17a39d40a556..c0e031c52db5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5886,12 +5886,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
 		__SetPageReserved(page);
 
 		/*
-		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
-		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
-		 * page is ever freed or placed on a driver-private list.
+		 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
+		 * and zone_device_data.  It is a bug if a ZONE_DEVICE page is
+		 * ever freed or placed on a driver-private list.
 		 */
 		page->pgmap = pgmap;
-		page->hmm_data = 0;
+		page->zone_device_data = NULL;
 
 		/*
 		 * Mark the block movable so that blocks are reserved for
-- 
cgit v1.2.3


From 43535b0aefab29ea6564e608de4c783ed2ab6c49 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Jun 2019 14:27:23 +0200
Subject: mm: remove the HMM config option

All the mm/hmm.c code is better keyed off HMM_MIRROR.  Also let nouveau
depend on it instead of the mix of a dummy dependency symbol plus the
actually selected one.  Drop various odd dependencies, as the code is
pretty portable.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/gpu/drm/nouveau/Kconfig |  3 +--
 include/linux/hmm.h             |  5 +----
 include/linux/mm_types.h        |  2 +-
 mm/Kconfig                      | 27 ++++-----------------------
 mm/Makefile                     |  2 +-
 mm/hmm.c                        |  2 --
 6 files changed, 8 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig
index 6303d203ab1d..66c839d8e9d1 100644
--- a/drivers/gpu/drm/nouveau/Kconfig
+++ b/drivers/gpu/drm/nouveau/Kconfig
@@ -84,11 +84,10 @@ config DRM_NOUVEAU_BACKLIGHT
 
 config DRM_NOUVEAU_SVM
 	bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support"
-	depends on ARCH_HAS_HMM
 	depends on DEVICE_PRIVATE
 	depends on DRM_NOUVEAU
+	depends on HMM_MIRROR
 	depends on STAGING
-	select HMM_MIRROR
 	default n
 	help
 	  Say Y here if you want to enable experimental support for
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 3d00e9550e77..b697496e85ba 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -62,7 +62,7 @@
 #include <linux/kconfig.h>
 #include <asm/pgtable.h>
 
-#if IS_ENABLED(CONFIG_HMM)
+#ifdef CONFIG_HMM_MIRROR
 
 #include <linux/device.h>
 #include <linux/migrate.h>
@@ -332,9 +332,6 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
 	return hmm_device_entry_from_pfn(range, pfn);
 }
 
-
-
-#if IS_ENABLED(CONFIG_HMM_MIRROR)
 /*
  * Mirroring: how to synchronize device page table with CPU page table.
  *
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f33a1289c101..8d37182f8dbe 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -501,7 +501,7 @@ struct mm_struct {
 #endif
 		struct work_struct async_put_work;
 
-#if IS_ENABLED(CONFIG_HMM)
+#ifdef CONFIG_HMM_MIRROR
 		/* HMM needs to track a few things per mm */
 		struct hmm *hmm;
 #endif
diff --git a/mm/Kconfig b/mm/Kconfig
index eecf037a54b3..1e426c26b1d6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -669,37 +669,18 @@ config ZONE_DEVICE
 
 	  If FS_DAX is enabled, then say Y.
 
-config ARCH_HAS_HMM_MIRROR
-	bool
-	default y
-	depends on (X86_64 || PPC64)
-	depends on MMU && 64BIT
-
-config ARCH_HAS_HMM
-	bool
-	depends on (X86_64 || PPC64)
-	depends on ZONE_DEVICE
-	depends on MMU && 64BIT
-	depends on MEMORY_HOTPLUG
-	depends on MEMORY_HOTREMOVE
-	depends on SPARSEMEM_VMEMMAP
-	default y
-
 config MIGRATE_VMA_HELPER
 	bool
 
 config DEV_PAGEMAP_OPS
 	bool
 
-config HMM
-	bool
-	select MMU_NOTIFIER
-	select MIGRATE_VMA_HELPER
-
 config HMM_MIRROR
 	bool "HMM mirror CPU page table into a device page table"
-	depends on ARCH_HAS_HMM
-	select HMM
+	depends on (X86_64 || PPC64)
+	depends on MMU && 64BIT
+	select MMU_NOTIFIER
+	select MIGRATE_VMA_HELPER
 	help
 	  Select HMM_MIRROR if you want to mirror range of the CPU page table of a
 	  process into a device page table. Here, mirror means "keep synchronized".
diff --git a/mm/Makefile b/mm/Makefile
index ac5e5ba78874..91c99040065c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -102,5 +102,5 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
 obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
 obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
 obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
-obj-$(CONFIG_HMM) += hmm.o
+obj-$(CONFIG_HMM_MIRROR) += hmm.o
 obj-$(CONFIG_MEMFD_CREATE) += memfd.o
diff --git a/mm/hmm.c b/mm/hmm.c
index 90ca0cdab9db..d62ce64d6bca 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -25,7 +25,6 @@
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
-#if IS_ENABLED(CONFIG_HMM_MIRROR)
 static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
 
 static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
@@ -1326,4 +1325,3 @@ long hmm_range_dma_unmap(struct hmm_range *range,
 	return cpages;
 }
 EXPORT_SYMBOL(hmm_range_dma_unmap);
-#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-- 
cgit v1.2.3


From 303ae1cdfdf7280ff4cfbbe65563b5ff15bb025b Mon Sep 17 00:00:00 2001
From: Bernard Metzler <bmt@zurich.ibm.com>
Date: Thu, 20 Jun 2019 18:21:27 +0200
Subject: rdma/siw: application interface

Broken up commit to add the Soft iWarp RDMA driver.

Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/siw/siw_verbs.c    | 1760 ++++++++++++++++++++++++++++++
 drivers/infiniband/sw/siw/siw_verbs.h    |   91 ++
 include/uapi/rdma/rdma_user_ioctl_cmds.h |    1 +
 include/uapi/rdma/siw-abi.h              |  185 ++++
 4 files changed, 2037 insertions(+)
 create mode 100644 drivers/infiniband/sw/siw/siw_verbs.c
 create mode 100644 drivers/infiniband/sw/siw/siw_verbs.h
 create mode 100644 include/uapi/rdma/siw-abi.h

(limited to 'include')

diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
new file mode 100644
index 000000000000..32dc79d0e898
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -0,0 +1,1760 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/xarray.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = SIW_QP_STATE_IDLE,
+	[IB_QPS_INIT] = SIW_QP_STATE_IDLE,
+	[IB_QPS_RTR] = SIW_QP_STATE_RTR,
+	[IB_QPS_RTS] = SIW_QP_STATE_RTS,
+	[IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
+	[IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
+	[IB_QPS_ERR] = SIW_QP_STATE_ERROR
+};
+
+static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
+	[IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
+	[IB_QPS_RTS] = "RTS",     [IB_QPS_SQD] = "SQD",   [IB_QPS_SQE] = "SQE",
+	[IB_QPS_ERR] = "ERR"
+};
+
+static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size)
+{
+	struct siw_uobj *uobj;
+	struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY);
+	u32 key;
+
+	uobj = kzalloc(sizeof(*uobj), GFP_KERNEL);
+	if (!uobj)
+		return SIW_INVAL_UOBJ_KEY;
+
+	if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey,
+			    GFP_KERNEL) < 0) {
+		kfree(uobj);
+		return SIW_INVAL_UOBJ_KEY;
+	}
+	uobj->size = PAGE_ALIGN(size);
+	uobj->addr = vaddr;
+
+	return key;
+}
+
+static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx,
+				     unsigned long off, u32 size)
+{
+	struct siw_uobj *uobj = xa_load(&uctx->xa, off);
+
+	if (uobj && uobj->size == size)
+		return uobj;
+
+	return NULL;
+}
+
+int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
+{
+	struct siw_ucontext *uctx = to_siw_ctx(ctx);
+	struct siw_uobj *uobj;
+	unsigned long off = vma->vm_pgoff;
+	int size = vma->vm_end - vma->vm_start;
+	int rv = -EINVAL;
+
+	/*
+	 * Must be page aligned
+	 */
+	if (vma->vm_start & (PAGE_SIZE - 1)) {
+		pr_warn("siw: mmap not page aligned\n");
+		goto out;
+	}
+	uobj = siw_get_uobj(uctx, off, size);
+	if (!uobj) {
+		siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n",
+			off, size);
+		goto out;
+	}
+	rv = remap_vmalloc_range(vma, uobj->addr, 0);
+	if (rv)
+		pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size);
+out:
+	return rv;
+}
+
+int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_ctx->device);
+	struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
+	struct siw_uresp_alloc_ctx uresp = {};
+	int rv;
+
+	if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC);
+	ctx->uobj_nextkey = 0;
+	ctx->sdev = sdev;
+
+	uresp.dev_id = sdev->vendor_part_id;
+
+	if (udata->outlen < sizeof(uresp)) {
+		rv = -EINVAL;
+		goto err_out;
+	}
+	rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+	if (rv)
+		goto err_out;
+
+	siw_dbg(base_ctx->device, "success. now %d context(s)\n",
+		atomic_read(&sdev->num_ctx));
+
+	return 0;
+
+err_out:
+	atomic_dec(&sdev->num_ctx);
+	siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
+		atomic_read(&sdev->num_ctx));
+
+	return rv;
+}
+
+void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
+{
+	struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
+	void *entry;
+	unsigned long index;
+
+	/*
+	 * Make sure all user mmap objects are gone. Since QP, CQ
+	 * and SRQ destroy routines destroy related objects, nothing
+	 * should be found here.
+	 */
+	xa_for_each(&uctx->xa, index, entry) {
+		kfree(xa_erase(&uctx->xa, index));
+		pr_warn("siw: dropping orphaned uobj at %lu\n", index);
+	}
+	xa_destroy(&uctx->xa);
+	atomic_dec(&uctx->sdev->num_ctx);
+}
+
+int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
+		     struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_dev);
+
+	if (udata->inlen || udata->outlen)
+		return -EINVAL;
+
+	memset(attr, 0, sizeof(*attr));
+
+	/* Revisit atomic caps if RFC 7306 gets supported */
+	attr->atomic_cap = 0;
+	attr->device_cap_flags =
+		IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
+	attr->max_cq = sdev->attrs.max_cq;
+	attr->max_cqe = sdev->attrs.max_cqe;
+	attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
+	attr->max_fmr = sdev->attrs.max_fmr;
+	attr->max_mr = sdev->attrs.max_mr;
+	attr->max_mw = sdev->attrs.max_mw;
+	attr->max_mr_size = ~0ull;
+	attr->max_pd = sdev->attrs.max_pd;
+	attr->max_qp = sdev->attrs.max_qp;
+	attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
+	attr->max_qp_rd_atom = sdev->attrs.max_ord;
+	attr->max_qp_wr = sdev->attrs.max_qp_wr;
+	attr->max_recv_sge = sdev->attrs.max_sge;
+	attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
+	attr->max_send_sge = sdev->attrs.max_sge;
+	attr->max_sge_rd = sdev->attrs.max_sge_rd;
+	attr->max_srq = sdev->attrs.max_srq;
+	attr->max_srq_sge = sdev->attrs.max_srq_sge;
+	attr->max_srq_wr = sdev->attrs.max_srq_wr;
+	attr->page_size_cap = PAGE_SIZE;
+	attr->vendor_id = SIW_VENDOR_ID;
+	attr->vendor_part_id = sdev->vendor_part_id;
+
+	memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);
+
+	return 0;
+}
+
+int siw_query_port(struct ib_device *base_dev, u8 port,
+		   struct ib_port_attr *attr)
+{
+	struct siw_device *sdev = to_siw_dev(base_dev);
+
+	memset(attr, 0, sizeof(*attr));
+
+	attr->active_mtu = attr->max_mtu;
+	attr->active_speed = 2;
+	attr->active_width = 2;
+	attr->gid_tbl_len = 1;
+	attr->max_msg_sz = -1;
+	attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+	attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 5 : 3;
+	attr->pkey_tbl_len = 1;
+	attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
+	attr->state = sdev->state;
+	/*
+	 * All zero
+	 *
+	 * attr->lid = 0;
+	 * attr->bad_pkey_cntr = 0;
+	 * attr->qkey_viol_cntr = 0;
+	 * attr->sm_lid = 0;
+	 * attr->lmc = 0;
+	 * attr->max_vl_num = 0;
+	 * attr->sm_sl = 0;
+	 * attr->subnet_timeout = 0;
+	 * attr->init_type_repy = 0;
+	 */
+	return 0;
+}
+
+int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
+			   struct ib_port_immutable *port_immutable)
+{
+	struct ib_port_attr attr;
+	int rv = siw_query_port(base_dev, port, &attr);
+
+	if (rv)
+		return rv;
+
+	port_immutable->pkey_tbl_len = attr.pkey_tbl_len;
+	port_immutable->gid_tbl_len = attr.gid_tbl_len;
+	port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+
+	return 0;
+}
+
+int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey)
+{
+	/* Report the default pkey */
+	*pkey = 0xffff;
+	return 0;
+}
+
+int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
+		  union ib_gid *gid)
+{
+	struct siw_device *sdev = to_siw_dev(base_dev);
+
+	/* subnet_prefix == interface_id == 0; */
+	memset(gid, 0, sizeof(*gid));
+	memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6);
+
+	return 0;
+}
+
+int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+
+	if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
+		atomic_dec(&sdev->num_pd);
+		return -ENOMEM;
+	}
+	siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd));
+
+	return 0;
+}
+
+void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+
+	siw_dbg_pd(pd, "free PD\n");
+	atomic_dec(&sdev->num_pd);
+}
+
+void siw_qp_get_ref(struct ib_qp *base_qp)
+{
+	siw_qp_get(to_siw_qp(base_qp));
+}
+
+void siw_qp_put_ref(struct ib_qp *base_qp)
+{
+	siw_qp_put(to_siw_qp(base_qp));
+}
+
+/*
+ * siw_create_qp()
+ *
+ * Create QP of requested size on given device.
+ *
+ * @pd:		Protection Domain
+ * @attrs:	Initial QP attributes.
+ * @udata:	used to provide QP ID, SQ and RQ size back to user.
+ */
+
+struct ib_qp *siw_create_qp(struct ib_pd *pd,
+			    struct ib_qp_init_attr *attrs,
+			    struct ib_udata *udata)
+{
+	struct siw_qp *qp = NULL;
+	struct siw_base_qp *siw_base_qp = NULL;
+	struct ib_device *base_dev = pd->device;
+	struct siw_device *sdev = to_siw_dev(base_dev);
+	struct siw_ucontext *uctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+	struct siw_cq *scq = NULL, *rcq = NULL;
+	unsigned long flags;
+	int num_sqe, num_rqe, rv = 0;
+
+	siw_dbg(base_dev, "create new QP\n");
+
+	if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
+		siw_dbg(base_dev, "too many QP's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (attrs->qp_type != IB_QPT_RC) {
+		siw_dbg(base_dev, "only RC QP's supported\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
+	    (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
+	    (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
+	    (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
+		siw_dbg(base_dev, "QP size error\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
+		siw_dbg(base_dev, "max inline send: %d > %d\n",
+			attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
+		rv = -EINVAL;
+		goto err_out;
+	}
+	/*
+	 * NOTE: we allow for zero element SQ and RQ WQE's SGL's
+	 * but not for a QP unable to hold any WQE (SQ + RQ)
+	 */
+	if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
+		siw_dbg(base_dev, "QP must have send or receive queue\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	scq = to_siw_cq(attrs->send_cq);
+	rcq = to_siw_cq(attrs->recv_cq);
+
+	if (!scq || (!rcq && !attrs->srq)) {
+		siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL);
+	if (!siw_base_qp) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+	if (!qp) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	siw_base_qp->qp = qp;
+	qp->ib_qp = &siw_base_qp->base_qp;
+
+	init_rwsem(&qp->state_lock);
+	spin_lock_init(&qp->sq_lock);
+	spin_lock_init(&qp->rq_lock);
+	spin_lock_init(&qp->orq_lock);
+
+	qp->kernel_verbs = !udata;
+	qp->xa_sq_index = SIW_INVAL_UOBJ_KEY;
+	qp->xa_rq_index = SIW_INVAL_UOBJ_KEY;
+
+	rv = siw_qp_add(sdev, qp);
+	if (rv)
+		goto err_out;
+
+	/* All queue indices are derived from modulo operations
+	 * on a free running 'get' (consumer) and 'put' (producer)
+	 * unsigned counter. Having queue sizes at power of two
+	 * avoids handling counter wrap around.
+	 */
+	num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
+	num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr);
+
+	if (qp->kernel_verbs)
+		qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
+	else
+		qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
+
+	if (qp->sendq == NULL) {
+		siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe);
+		rv = -ENOMEM;
+		goto err_out_xa;
+	}
+	if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
+		if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
+			qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
+		else {
+			rv = -EINVAL;
+			goto err_out_xa;
+		}
+	}
+	qp->pd = pd;
+	qp->scq = scq;
+	qp->rcq = rcq;
+
+	if (attrs->srq) {
+		/*
+		 * SRQ support.
+		 * Verbs 6.3.7: ignore RQ size, if SRQ present
+		 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
+		 */
+		qp->srq = to_siw_srq(attrs->srq);
+		qp->attrs.rq_size = 0;
+		siw_dbg(base_dev, "QP [%u]: [SRQ 0x%p] attached\n",
+			qp->qp_num, qp->srq);
+	} else if (num_rqe) {
+		if (qp->kernel_verbs)
+			qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
+		else
+			qp->recvq =
+				vmalloc_user(num_rqe * sizeof(struct siw_rqe));
+
+		if (qp->recvq == NULL) {
+			siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe);
+			rv = -ENOMEM;
+			goto err_out_xa;
+		}
+		qp->attrs.rq_size = num_rqe;
+	}
+	qp->attrs.sq_size = num_sqe;
+	qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
+	qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
+
+	/* Make those two tunables fixed for now. */
+	qp->tx_ctx.gso_seg_limit = 1;
+	qp->tx_ctx.zcopy_tx = zcopy_tx;
+
+	qp->attrs.state = SIW_QP_STATE_IDLE;
+
+	if (udata) {
+		struct siw_uresp_create_qp uresp = {};
+
+		uresp.num_sqe = num_sqe;
+		uresp.num_rqe = num_rqe;
+		uresp.qp_id = qp_id(qp);
+
+		if (qp->sendq) {
+			qp->xa_sq_index =
+				siw_create_uobj(uctx, qp->sendq,
+					num_sqe * sizeof(struct siw_sqe));
+		}
+		if (qp->recvq) {
+			qp->xa_rq_index =
+				 siw_create_uobj(uctx, qp->recvq,
+					num_rqe * sizeof(struct siw_rqe));
+		}
+		if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY ||
+		    qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) {
+			rv = -ENOMEM;
+			goto err_out_xa;
+		}
+		uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT;
+		uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT;
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out_xa;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out_xa;
+	}
+	qp->tx_cpu = siw_get_tx_cpu(sdev);
+	if (qp->tx_cpu < 0) {
+		rv = -EINVAL;
+		goto err_out_xa;
+	}
+	INIT_LIST_HEAD(&qp->devq);
+	spin_lock_irqsave(&sdev->lock, flags);
+	list_add_tail(&qp->devq, &sdev->qp_list);
+	spin_unlock_irqrestore(&sdev->lock, flags);
+
+	return qp->ib_qp;
+
+err_out_xa:
+	xa_erase(&sdev->qp_xa, qp_id(qp));
+err_out:
+	kfree(siw_base_qp);
+
+	if (qp) {
+		if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
+			kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
+		if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
+			kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
+
+		vfree(qp->sendq);
+		vfree(qp->recvq);
+		kfree(qp);
+	}
+	atomic_dec(&sdev->num_qp);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * Minimum siw_query_qp() verb interface.
+ *
+ * @qp_attr_mask is not used but all available information is provided
+ */
+int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+	struct siw_qp *qp;
+	struct siw_device *sdev;
+
+	if (base_qp && qp_attr && qp_init_attr) {
+		qp = to_siw_qp(base_qp);
+		sdev = to_siw_dev(base_qp->device);
+	} else {
+		return -EINVAL;
+	}
+	qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
+	qp_attr->cap.max_send_wr = qp->attrs.sq_size;
+	qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
+	qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
+	qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
+	qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+	qp_attr->max_rd_atomic = qp->attrs.irq_size;
+	qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
+
+	qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
+				   IB_ACCESS_REMOTE_WRITE |
+				   IB_ACCESS_REMOTE_READ;
+
+	qp_init_attr->qp_type = base_qp->qp_type;
+	qp_init_attr->send_cq = base_qp->send_cq;
+	qp_init_attr->recv_cq = base_qp->recv_cq;
+	qp_init_attr->srq = base_qp->srq;
+
+	qp_init_attr->cap = qp_attr->cap;
+
+	return 0;
+}
+
+int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
+			int attr_mask, struct ib_udata *udata)
+{
+	struct siw_qp_attrs new_attrs;
+	enum siw_qp_attr_mask siw_attr_mask = 0;
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	int rv = 0;
+
+	if (!attr_mask)
+		return 0;
+
+	memset(&new_attrs, 0, sizeof(new_attrs));
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS) {
+		siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
+
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
+			new_attrs.flags |= SIW_RDMA_READ_ENABLED;
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
+			new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+		if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
+			new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
+	}
+	if (attr_mask & IB_QP_STATE) {
+		siw_dbg_qp(qp, "desired IB QP state: %s\n",
+			   ib_qp_state_to_string[attr->qp_state]);
+
+		new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
+
+		if (new_attrs.state > SIW_QP_STATE_RTS)
+			qp->tx_ctx.tx_suspend = 1;
+
+		siw_attr_mask |= SIW_QP_ATTR_STATE;
+	}
+	if (!siw_attr_mask)
+		goto out;
+
+	down_write(&qp->state_lock);
+
+	rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
+
+	up_write(&qp->state_lock);
+out:
+	return rv;
+}
+
+int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp);
+	struct siw_ucontext *uctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+	struct siw_qp_attrs qp_attrs;
+
+	siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep);
+
+	/*
+	 * Mark QP as in process of destruction to prevent from
+	 * any async callbacks to RDMA core
+	 */
+	qp->attrs.flags |= SIW_QP_IN_DESTROY;
+	qp->rx_stream.rx_suspend = 1;
+
+	if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
+		kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
+	if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
+		kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
+
+	down_write(&qp->state_lock);
+
+	qp_attrs.state = SIW_QP_STATE_ERROR;
+	siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
+
+	if (qp->cep) {
+		siw_cep_put(qp->cep);
+		qp->cep = NULL;
+	}
+	up_write(&qp->state_lock);
+
+	kfree(qp->tx_ctx.mpa_crc_hd);
+	kfree(qp->rx_stream.mpa_crc_hd);
+
+	qp->scq = qp->rcq = NULL;
+
+	siw_qp_put(qp);
+	kfree(siw_base_qp);
+
+	return 0;
+}
+
+/*
+ * siw_copy_inline_sgl()
+ *
+ * Prepare sgl of inlined data for sending. For userland callers
+ * function checks if given buffer addresses and len's are within
+ * process context bounds.
+ * Data from all provided sge's are copied together into the wqe,
+ * referenced by a single sge.
+ */
+static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
+			       struct siw_sqe *sqe)
+{
+	struct ib_sge *core_sge = core_wr->sg_list;
+	void *kbuf = &sqe->sge[1];
+	int num_sge = core_wr->num_sge, bytes = 0;
+
+	sqe->sge[0].laddr = (u64)kbuf;
+	sqe->sge[0].lkey = 0;
+
+	while (num_sge--) {
+		if (!core_sge->length) {
+			core_sge++;
+			continue;
+		}
+		bytes += core_sge->length;
+		if (bytes > SIW_MAX_INLINE) {
+			bytes = -EINVAL;
+			break;
+		}
+		memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
+		       core_sge->length);
+
+		kbuf += core_sge->length;
+		core_sge++;
+	}
+	sqe->sge[0].length = bytes > 0 ? bytes : 0;
+	sqe->num_sge = bytes > 0 ? 1 : 0;
+
+	return bytes;
+}
+
+/*
+ * siw_post_send()
+ *
+ * Post a list of S-WR's to a SQ.
+ *
+ * @base_qp:	Base QP contained in siw QP
+ * @wr:		Null terminated list of user WR's
+ * @bad_wr:	Points to failing WR in case of synchronous failure.
+ */
+int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
+		  const struct ib_send_wr **bad_wr)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	struct siw_wqe *wqe = tx_wqe(qp);
+
+	unsigned long flags;
+	int rv = 0;
+
+	/*
+	 * Try to acquire QP state lock. Must be non-blocking
+	 * to accommodate kernel clients needs.
+	 */
+	if (!down_read_trylock(&qp->state_lock)) {
+		*bad_wr = wr;
+		siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state);
+		return -ENOTCONN;
+	}
+	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
+		return -ENOTCONN;
+	}
+	if (wr && !qp->kernel_verbs) {
+		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+	spin_lock_irqsave(&qp->sq_lock, flags);
+
+	while (wr) {
+		u32 idx = qp->sq_put % qp->attrs.sq_size;
+		struct siw_sqe *sqe = &qp->sendq[idx];
+
+		if (sqe->flags) {
+			siw_dbg_qp(qp, "sq full\n");
+			rv = -ENOMEM;
+			break;
+		}
+		if (wr->num_sge > qp->attrs.sq_max_sges) {
+			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		sqe->id = wr->wr_id;
+
+		if ((wr->send_flags & IB_SEND_SIGNALED) ||
+		    (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
+			sqe->flags |= SIW_WQE_SIGNALLED;
+
+		if (wr->send_flags & IB_SEND_FENCE)
+			sqe->flags |= SIW_WQE_READ_FENCE;
+
+		switch (wr->opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_INV:
+			if (wr->send_flags & IB_SEND_SOLICITED)
+				sqe->flags |= SIW_WQE_SOLICITED;
+
+			if (!(wr->send_flags & IB_SEND_INLINE)) {
+				siw_copy_sgl(wr->sg_list, sqe->sge,
+					     wr->num_sge);
+				sqe->num_sge = wr->num_sge;
+			} else {
+				rv = siw_copy_inline_sgl(wr, sqe);
+				if (rv <= 0) {
+					rv = -EINVAL;
+					break;
+				}
+				sqe->flags |= SIW_WQE_INLINE;
+				sqe->num_sge = 1;
+			}
+			if (wr->opcode == IB_WR_SEND)
+				sqe->opcode = SIW_OP_SEND;
+			else {
+				sqe->opcode = SIW_OP_SEND_REMOTE_INV;
+				sqe->rkey = wr->ex.invalidate_rkey;
+			}
+			break;
+
+		case IB_WR_RDMA_READ_WITH_INV:
+		case IB_WR_RDMA_READ:
+			/*
+			 * iWarp restricts RREAD sink to SGL containing
+			 * 1 SGE only. we could relax to SGL with multiple
+			 * elements referring the SAME ltag or even sending
+			 * a private per-rreq tag referring to a checked
+			 * local sgl with MULTIPLE ltag's.
+			 */
+			if (unlikely(wr->num_sge != 1)) {
+				rv = -EINVAL;
+				break;
+			}
+			siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
+			/*
+			 * NOTE: zero length RREAD is allowed!
+			 */
+			sqe->raddr = rdma_wr(wr)->remote_addr;
+			sqe->rkey = rdma_wr(wr)->rkey;
+			sqe->num_sge = 1;
+
+			if (wr->opcode == IB_WR_RDMA_READ)
+				sqe->opcode = SIW_OP_READ;
+			else
+				sqe->opcode = SIW_OP_READ_LOCAL_INV;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+			if (!(wr->send_flags & IB_SEND_INLINE)) {
+				siw_copy_sgl(wr->sg_list, &sqe->sge[0],
+					     wr->num_sge);
+				sqe->num_sge = wr->num_sge;
+			} else {
+				rv = siw_copy_inline_sgl(wr, sqe);
+				if (unlikely(rv < 0)) {
+					rv = -EINVAL;
+					break;
+				}
+				sqe->flags |= SIW_WQE_INLINE;
+				sqe->num_sge = 1;
+			}
+			sqe->raddr = rdma_wr(wr)->remote_addr;
+			sqe->rkey = rdma_wr(wr)->rkey;
+			sqe->opcode = SIW_OP_WRITE;
+			break;
+
+		case IB_WR_REG_MR:
+			sqe->base_mr = (uint64_t)reg_wr(wr)->mr;
+			sqe->rkey = reg_wr(wr)->key;
+			sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
+			sqe->opcode = SIW_OP_REG_MR;
+			break;
+
+		case IB_WR_LOCAL_INV:
+			sqe->rkey = wr->ex.invalidate_rkey;
+			sqe->opcode = SIW_OP_INVAL_STAG;
+			break;
+
+		default:
+			siw_dbg_qp(qp, "ib wr type %d unsupported\n",
+				   wr->opcode);
+			rv = -EINVAL;
+			break;
+		}
+		siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n",
+			   sqe->opcode, sqe->flags, (void *)sqe->id);
+
+		if (unlikely(rv < 0))
+			break;
+
+		/* make SQE only valid after completely written */
+		smp_wmb();
+		sqe->flags |= SIW_WQE_VALID;
+
+		qp->sq_put++;
+		wr = wr->next;
+	}
+
+	/*
+	 * Send directly if SQ processing is not in progress.
+	 * Eventual immediate errors (rv < 0) do not affect the involved
+	 * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ
+	 * processing, if new work is already pending. But rv must be passed
+	 * to caller.
+	 */
+	if (wqe->wr_status != SIW_WR_IDLE) {
+		spin_unlock_irqrestore(&qp->sq_lock, flags);
+		goto skip_direct_sending;
+	}
+	rv = siw_activate_tx(qp);
+	spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+	if (rv <= 0)
+		goto skip_direct_sending;
+
+	if (qp->kernel_verbs) {
+		rv = siw_sq_start(qp);
+	} else {
+		qp->tx_ctx.in_syscall = 1;
+
+		if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
+			siw_qp_cm_drop(qp, 0);
+
+		qp->tx_ctx.in_syscall = 0;
+	}
+skip_direct_sending:
+
+	up_read(&qp->state_lock);
+
+	if (rv >= 0)
+		return 0;
+	/*
+	 * Immediate error
+	 */
+	siw_dbg_qp(qp, "error %d\n", rv);
+
+	*bad_wr = wr;
+	return rv;
+}
+
+/*
+ * siw_post_receive()
+ *
+ * Post a list of R-WR's to a RQ.
+ *
+ * @base_qp:	Base QP contained in siw QP
+ * @wr:		Null terminated list of user WR's
+ * @bad_wr:	Points to failing WR in case of synchronous failure.
+ */
+int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
+		     const struct ib_recv_wr **bad_wr)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	unsigned long flags;
+	int rv = 0;
+
+	if (qp->srq) {
+		*bad_wr = wr;
+		return -EOPNOTSUPP; /* what else from errno.h? */
+	}
+	/*
+	 * Try to acquire QP state lock. Must be non-blocking
+	 * to accommodate kernel clients needs.
+	 */
+	if (!down_read_trylock(&qp->state_lock)) {
+		*bad_wr = wr;
+		return -ENOTCONN;
+	}
+	if (!qp->kernel_verbs) {
+		siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n");
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+	if (qp->attrs.state > SIW_QP_STATE_RTS) {
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+	/*
+	 * Serialize potentially multiple producers.
+	 * Not needed for single threaded consumer side.
+	 */
+	spin_lock_irqsave(&qp->rq_lock, flags);
+
+	while (wr) {
+		u32 idx = qp->rq_put % qp->attrs.rq_size;
+		struct siw_rqe *rqe = &qp->recvq[idx];
+
+		if (rqe->flags) {
+			siw_dbg_qp(qp, "RQ full\n");
+			rv = -ENOMEM;
+			break;
+		}
+		if (wr->num_sge > qp->attrs.rq_max_sges) {
+			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		rqe->id = wr->wr_id;
+		rqe->num_sge = wr->num_sge;
+		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
+
+		/* make sure RQE is completely written before valid */
+		smp_wmb();
+
+		rqe->flags = SIW_WQE_VALID;
+
+		qp->rq_put++;
+		wr = wr->next;
+	}
+	spin_unlock_irqrestore(&qp->rq_lock, flags);
+
+	up_read(&qp->state_lock);
+
+	if (rv < 0) {
+		siw_dbg_qp(qp, "error %d\n", rv);
+		*bad_wr = wr;
+	}
+	return rv > 0 ? 0 : rv;
+}
+
+void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
+{
+	struct siw_cq *cq = to_siw_cq(base_cq);
+	struct siw_device *sdev = to_siw_dev(base_cq->device);
+	struct siw_ucontext *ctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+
+	siw_dbg_cq(cq, "free CQ resources\n");
+
+	siw_cq_flush(cq);
+
+	if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
+		kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
+
+	atomic_dec(&sdev->num_cq);
+
+	vfree(cq->queue);
+}
+
+/*
+ * siw_create_cq()
+ *
+ * Populate CQ of requested size
+ *
+ * @base_cq: CQ as allocated by RDMA midlayer
+ * @attr: Initial CQ attributes
+ * @udata: relates to user context
+ */
+
+int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_cq->device);
+	struct siw_cq *cq = to_siw_cq(base_cq);
+	int rv, size = attr->cqe;
+
+	if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
+		siw_dbg(base_cq->device, "too many CQ's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (size < 1 || size > sdev->attrs.max_cqe) {
+		siw_dbg(base_cq->device, "CQ size error: %d\n", size);
+		rv = -EINVAL;
+		goto err_out;
+	}
+	size = roundup_pow_of_two(size);
+	cq->base_cq.cqe = size;
+	cq->num_cqe = size;
+	cq->xa_cq_index = SIW_INVAL_UOBJ_KEY;
+
+	if (!udata) {
+		cq->kernel_verbs = 1;
+		cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
+				    sizeof(struct siw_cq_ctrl));
+	} else {
+		cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
+					 sizeof(struct siw_cq_ctrl));
+	}
+	if (cq->queue == NULL) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	get_random_bytes(&cq->id, 4);
+	siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
+
+	spin_lock_init(&cq->lock);
+
+	cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify;
+
+	if (udata) {
+		struct siw_uresp_create_cq uresp = {};
+		struct siw_ucontext *ctx =
+			rdma_udata_to_drv_context(udata, struct siw_ucontext,
+						  base_ucontext);
+
+		cq->xa_cq_index =
+			siw_create_uobj(ctx, cq->queue,
+					size * sizeof(struct siw_cqe) +
+						sizeof(struct siw_cq_ctrl));
+		if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) {
+			rv = -ENOMEM;
+			goto err_out;
+		}
+		uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT;
+		uresp.cq_id = cq->id;
+		uresp.num_cqe = size;
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out;
+	}
+	return 0;
+
+err_out:
+	siw_dbg(base_cq->device, "CQ creation failed: %d", rv);
+
+	if (cq && cq->queue) {
+		struct siw_ucontext *ctx =
+			rdma_udata_to_drv_context(udata, struct siw_ucontext,
+						  base_ucontext);
+		if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
+			kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
+		vfree(cq->queue);
+	}
+	atomic_dec(&sdev->num_cq);
+
+	return rv;
+}
+
+/*
+ * siw_poll_cq()
+ *
+ * Reap CQ entries if available and copy work completion status into
+ * array of WC's provided by caller. Returns number of reaped CQE's.
+ *
+ * @base_cq:	Base CQ contained in siw CQ.
+ * @num_cqe:	Maximum number of CQE's to reap.
+ * @wc:		Array of work completions to be filled by siw.
+ */
+int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
+{
+	struct siw_cq *cq = to_siw_cq(base_cq);
+	int i;
+
+	for (i = 0; i < num_cqe; i++) {
+		if (!siw_reap_cqe(cq, wc))
+			break;
+		wc++;
+	}
+	return i;
+}
+
+/*
+ * siw_req_notify_cq()
+ *
+ * Request notification for new CQE's added to that CQ.
+ * Defined flags:
+ * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
+ *   event if a WQE with notification flag set enters the CQ
+ * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
+ *   event if a WQE enters the CQ.
+ * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
+ *   number of not reaped CQE's regardless of its notification
+ *   type and current or new CQ notification settings.
+ *
+ * @base_cq:	Base CQ contained in siw CQ.
+ * @flags:	Requested notification flags.
+ */
+int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
+{
+	struct siw_cq *cq = to_siw_cq(base_cq);
+
+	siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
+
+	if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+		/* CQ event for next solicited completion */
+		smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED);
+	else
+		/* CQ event for any signalled completion */
+		smp_store_mb(*cq->notify, SIW_NOTIFY_ALL);
+
+	if (flags & IB_CQ_REPORT_MISSED_EVENTS)
+		return cq->cq_put - cq->cq_get;
+
+	return 0;
+}
+
+/*
+ * siw_dereg_mr()
+ *
+ * Release Memory Region.
+ *
+ * @base_mr: Base MR contained in siw MR.
+ * @udata: points to user context, unused.
+ */
+int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
+{
+	struct siw_mr *mr = to_siw_mr(base_mr);
+	struct siw_device *sdev = to_siw_dev(base_mr->device);
+
+	siw_dbg_mem(mr->mem, "deregister MR\n");
+
+	atomic_dec(&sdev->num_mr);
+
+	siw_mr_drop_mem(mr);
+	kfree_rcu(mr, rcu);
+
+	return 0;
+}
+
+/*
+ * siw_reg_user_mr()
+ *
+ * Register Memory Region.
+ *
+ * @pd:		Protection Domain
+ * @start:	starting address of MR (virtual address)
+ * @len:	len of MR
+ * @rnic_va:	not used by siw
+ * @rights:	MR access rights
+ * @udata:	user buffer to communicate STag and Key.
+ */
+struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
+			      u64 rnic_va, int rights, struct ib_udata *udata)
+{
+	struct siw_mr *mr = NULL;
+	struct siw_umem *umem = NULL;
+	struct siw_ureq_reg_mr ureq;
+	struct siw_device *sdev = to_siw_dev(pd->device);
+
+	unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
+	int rv;
+
+	siw_dbg_pd(pd, "start: 0x%016llx, va: 0x%016llx, len: %llu\n",
+		   (unsigned long long)start, (unsigned long long)rnic_va,
+		   (unsigned long long)len);
+
+	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+		siw_dbg_pd(pd, "too many mr's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (!len) {
+		rv = -EINVAL;
+		goto err_out;
+	}
+	if (mem_limit != RLIM_INFINITY) {
+		unsigned long num_pages =
+			(PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
+		mem_limit >>= PAGE_SHIFT;
+
+		if (num_pages > mem_limit - current->mm->locked_vm) {
+			siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
+				   num_pages, mem_limit,
+				   current->mm->locked_vm);
+			rv = -ENOMEM;
+			goto err_out;
+		}
+	}
+	umem = siw_umem_get(start, len, ib_access_writable(rights));
+	if (IS_ERR(umem)) {
+		rv = PTR_ERR(umem);
+		siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
+		umem = NULL;
+		goto err_out;
+	}
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
+	if (rv)
+		goto err_out;
+
+	if (udata) {
+		struct siw_uresp_reg_mr uresp = {};
+		struct siw_mem *mem = mr->mem;
+
+		if (udata->inlen < sizeof(ureq)) {
+			rv = -EINVAL;
+			goto err_out;
+		}
+		rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
+		if (rv)
+			goto err_out;
+
+		mr->base_mr.lkey |= ureq.stag_key;
+		mr->base_mr.rkey |= ureq.stag_key;
+		mem->stag |= ureq.stag_key;
+		uresp.stag = mem->stag;
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out;
+	}
+	mr->mem->stag_valid = 1;
+
+	return &mr->base_mr;
+
+err_out:
+	atomic_dec(&sdev->num_mr);
+	if (mr) {
+		if (mr->mem)
+			siw_mr_drop_mem(mr);
+		kfree_rcu(mr, rcu);
+	} else {
+		if (umem)
+			siw_umem_release(umem, false);
+	}
+	return ERR_PTR(rv);
+}
+
+struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			   u32 max_sge, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mr *mr = NULL;
+	struct siw_pbl *pbl = NULL;
+	int rv;
+
+	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+		siw_dbg_pd(pd, "too many mr's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (mr_type != IB_MR_TYPE_MEM_REG) {
+		siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
+		rv = -EOPNOTSUPP;
+		goto err_out;
+	}
+	if (max_sge > SIW_MAX_SGE_PBL) {
+		siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	pbl = siw_pbl_alloc(max_sge);
+	if (IS_ERR(pbl)) {
+		rv = PTR_ERR(pbl);
+		siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
+		pbl = NULL;
+		goto err_out;
+	}
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
+	if (rv)
+		goto err_out;
+
+	mr->mem->is_pbl = 1;
+
+	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
+
+	return &mr->base_mr;
+
+err_out:
+	atomic_dec(&sdev->num_mr);
+
+	if (!mr) {
+		kfree(pbl);
+	} else {
+		if (mr->mem)
+			siw_mr_drop_mem(mr);
+		kfree_rcu(mr, rcu);
+	}
+	siw_dbg_pd(pd, "failed: %d\n", rv);
+
+	return ERR_PTR(rv);
+}
+
+/* Just used to count number of pages being mapped */
+static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
+{
+	return 0;
+}
+
+int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
+		  unsigned int *sg_off)
+{
+	struct scatterlist *slp;
+	struct siw_mr *mr = to_siw_mr(base_mr);
+	struct siw_mem *mem = mr->mem;
+	struct siw_pbl *pbl = mem->pbl;
+	struct siw_pble *pble;
+	u64 pbl_size;
+	int i, rv;
+
+	if (!pbl) {
+		siw_dbg_mem(mem, "no PBL allocated\n");
+		return -EINVAL;
+	}
+	pble = pbl->pbe;
+
+	if (pbl->max_buf < num_sle) {
+		siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
+			    mem->pbl->max_buf, num_sle);
+		return -ENOMEM;
+	}
+	for_each_sg(sl, slp, num_sle, i) {
+		if (sg_dma_len(slp) == 0) {
+			siw_dbg_mem(mem, "empty SGE\n");
+			return -EINVAL;
+		}
+		if (i == 0) {
+			pble->addr = sg_dma_address(slp);
+			pble->size = sg_dma_len(slp);
+			pble->pbl_off = 0;
+			pbl_size = pble->size;
+			pbl->num_buf = 1;
+		} else {
+			/* Merge PBL entries if adjacent */
+			if (pble->addr + pble->size == sg_dma_address(slp)) {
+				pble->size += sg_dma_len(slp);
+			} else {
+				pble++;
+				pbl->num_buf++;
+				pble->addr = sg_dma_address(slp);
+				pble->size = sg_dma_len(slp);
+				pble->pbl_off = pbl_size;
+			}
+			pbl_size += sg_dma_len(slp);
+		}
+		siw_dbg_mem(mem,
+			"sge[%d], size %llu, addr 0x%016llx, total %llu\n",
+			i, pble->size, pble->addr, pbl_size);
+	}
+	rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
+	if (rv > 0) {
+		mem->len = base_mr->length;
+		mem->va = base_mr->iova;
+		siw_dbg_mem(mem,
+			"%llu bytes, start 0x%016llx, %u SLE to %u entries\n",
+			mem->len, mem->va, num_sle, pbl->num_buf);
+	}
+	return rv;
+}
+
+/*
+ * siw_get_dma_mr()
+ *
+ * Create a (empty) DMA memory region, where no umem is attached.
+ */
+struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mr *mr = NULL;
+	int rv;
+
+	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+		siw_dbg_pd(pd, "too many mr's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
+	if (rv)
+		goto err_out;
+
+	mr->mem->stag_valid = 1;
+
+	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
+
+	return &mr->base_mr;
+
+err_out:
+	if (rv)
+		kfree(mr);
+
+	atomic_dec(&sdev->num_mr);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * siw_create_srq()
+ *
+ * Create Shared Receive Queue of attributes @init_attrs
+ * within protection domain given by @pd.
+ *
+ * @base_srq:	Base SRQ contained in siw SRQ.
+ * @init_attrs:	SRQ init attributes.
+ * @udata:	points to user context
+ */
+int siw_create_srq(struct ib_srq *base_srq,
+		   struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	struct ib_srq_attr *attrs = &init_attrs->attr;
+	struct siw_device *sdev = to_siw_dev(base_srq->device);
+	struct siw_ucontext *ctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+	int rv;
+
+	if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
+		siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
+	    attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
+		rv = -EINVAL;
+		goto err_out;
+	}
+	srq->max_sge = attrs->max_sge;
+	srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
+	srq->xa_srq_index = SIW_INVAL_UOBJ_KEY;
+	srq->limit = attrs->srq_limit;
+	if (srq->limit)
+		srq->armed = 1;
+
+	srq->kernel_verbs = !udata;
+
+	if (udata)
+		srq->recvq =
+			vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
+	else
+		srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));
+
+	if (srq->recvq == NULL) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (udata) {
+		struct siw_uresp_create_srq uresp = {};
+
+		srq->xa_srq_index = siw_create_uobj(
+			ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe));
+
+		if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) {
+			rv = -ENOMEM;
+			goto err_out;
+		}
+		uresp.srq_key = srq->xa_srq_index;
+		uresp.num_rqe = srq->num_rqe;
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out;
+	}
+	spin_lock_init(&srq->lock);
+
+	siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: success\n", srq);
+
+	return 0;
+
+err_out:
+	if (srq->recvq) {
+		if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
+			kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
+		vfree(srq->recvq);
+	}
+	atomic_dec(&sdev->num_srq);
+
+	return rv;
+}
+
+/*
+ * siw_modify_srq()
+ *
+ * Modify SRQ. The caller may resize SRQ and/or set/reset notification
+ * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
+ *
+ * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE
+ * parameter. siw_modify_srq() does not check the attrs->max_sge param.
+ */
+int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
+		   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	unsigned long flags;
+	int rv = 0;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	if (attr_mask & IB_SRQ_MAX_WR) {
+		/* resize request not yet supported */
+		rv = -EOPNOTSUPP;
+		goto out;
+	}
+	if (attr_mask & IB_SRQ_LIMIT) {
+		if (attrs->srq_limit) {
+			if (unlikely(attrs->srq_limit > srq->num_rqe)) {
+				rv = -EINVAL;
+				goto out;
+			}
+			srq->armed = 1;
+		} else {
+			srq->armed = 0;
+		}
+		srq->limit = attrs->srq_limit;
+	}
+out:
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return rv;
+}
+
+/*
+ * siw_query_srq()
+ *
+ * Query SRQ attributes.
+ */
+int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	attrs->max_wr = srq->num_rqe;
+	attrs->max_sge = srq->max_sge;
+	attrs->srq_limit = srq->limit;
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return 0;
+}
+
+/*
+ * siw_destroy_srq()
+ *
+ * Destroy SRQ.
+ * It is assumed that the SRQ is not referenced by any
+ * QP anymore - the code trusts the RDMA core environment to keep track
+ * of QP references.
+ */
+void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	struct siw_device *sdev = to_siw_dev(base_srq->device);
+	struct siw_ucontext *ctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+
+	if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
+		kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
+
+	vfree(srq->recvq);
+	atomic_dec(&sdev->num_srq);
+}
+
+/*
+ * siw_post_srq_recv()
+ *
+ * Post a list of receive queue elements to SRQ.
+ * NOTE: The function does not check or lock a certain SRQ state
+ *       during the post operation. The code simply trusts the
+ *       RDMA core environment.
+ *
+ * @base_srq:	Base SRQ contained in siw SRQ
+ * @wr:		List of R-WR's
+ * @bad_wr:	Updated to failing WR if posting fails.
+ */
+int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
+		      const struct ib_recv_wr **bad_wr)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	unsigned long flags;
+	int rv = 0;
+
+	if (unlikely(!srq->kernel_verbs)) {
+		siw_dbg_pd(base_srq->pd,
+			   "[SRQ 0x%p]: no kernel post_recv for mapped srq\n",
+			   srq);
+		rv = -EINVAL;
+		goto out;
+	}
+	/*
+	 * Serialize potentially multiple producers.
+	 * Also needed to serialize potentially multiple
+	 * consumers.
+	 */
+	spin_lock_irqsave(&srq->lock, flags);
+
+	while (wr) {
+		u32 idx = srq->rq_put % srq->num_rqe;
+		struct siw_rqe *rqe = &srq->recvq[idx];
+
+		if (rqe->flags) {
+			siw_dbg_pd(base_srq->pd, "SRQ full\n");
+			rv = -ENOMEM;
+			break;
+		}
+		if (unlikely(wr->num_sge > srq->max_sge)) {
+			siw_dbg_pd(base_srq->pd,
+				   "[SRQ 0x%p]: too many sge's: %d\n", srq,
+				   wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		rqe->id = wr->wr_id;
+		rqe->num_sge = wr->num_sge;
+		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
+
+		/* Make sure S-RQE is completely written before valid */
+		smp_wmb();
+
+		rqe->flags = SIW_WQE_VALID;
+
+		srq->rq_put++;
+		wr = wr->next;
+	}
+	spin_unlock_irqrestore(&srq->lock, flags);
+out:
+	if (unlikely(rv < 0)) {
+		siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: error %d\n", srq, rv);
+		*bad_wr = wr;
+	}
+	return rv;
+}
+
+void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
+{
+	struct ib_event event;
+	struct ib_qp *base_qp = qp->ib_qp;
+
+	/*
+	 * Do not report asynchronous errors on QP which gets
+	 * destroyed via verbs interface (siw_destroy_qp())
+	 */
+	if (qp->attrs.flags & SIW_QP_IN_DESTROY)
+		return;
+
+	event.event = etype;
+	event.device = base_qp->device;
+	event.element.qp = base_qp;
+
+	if (base_qp->event_handler) {
+		siw_dbg_qp(qp, "reporting event %d\n", etype);
+		base_qp->event_handler(&event, base_qp->qp_context);
+	}
+}
+
+void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
+{
+	struct ib_event event;
+	struct ib_cq *base_cq = &cq->base_cq;
+
+	event.event = etype;
+	event.device = base_cq->device;
+	event.element.cq = base_cq;
+
+	if (base_cq->event_handler) {
+		siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
+		base_cq->event_handler(&event, base_cq->cq_context);
+	}
+}
+
+void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
+{
+	struct ib_event event;
+	struct ib_srq *base_srq = &srq->base_srq;
+
+	event.event = etype;
+	event.device = base_srq->device;
+	event.element.srq = base_srq;
+
+	if (base_srq->event_handler) {
+		siw_dbg_pd(srq->base_srq.pd,
+			   "reporting SRQ event %d\n", etype);
+		base_srq->event_handler(&event, base_srq->srq_context);
+	}
+}
+
+void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype)
+{
+	struct ib_event event;
+
+	event.event = etype;
+	event.device = &sdev->base_dev;
+	event.element.port_num = port;
+
+	siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
+
+	ib_dispatch_event(&event);
+}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h
new file mode 100644
index 000000000000..1910869281cb
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_verbs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_VERBS_H
+#define _SIW_VERBS_H
+
+#include <linux/errno.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_cm.h"
+
+/*
+ * siw_copy_sgl()
+ *
+ * Copy SGL from RDMA core representation to local
+ * representation.
+ */
+static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge,
+				int num_sge)
+{
+	while (num_sge--) {
+		siw_sge->laddr = sge->addr;
+		siw_sge->length = sge->length;
+		siw_sge->lkey = sge->lkey;
+
+		siw_sge++;
+		sge++;
+	}
+}
+
+int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata);
+void siw_dealloc_ucontext(struct ib_ucontext *base_ctx);
+int siw_query_port(struct ib_device *base_dev, u8 port,
+		   struct ib_port_attr *attr);
+int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
+			   struct ib_port_immutable *port_immutable);
+int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
+		     struct ib_udata *udata);
+int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata);
+int siw_query_port(struct ib_device *base_dev, u8 port,
+		   struct ib_port_attr *attr);
+int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey);
+int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
+		  union ib_gid *gid);
+int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
+void siw_dealloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
+struct ib_qp *siw_create_qp(struct ib_pd *base_pd,
+			    struct ib_qp_init_attr *attr,
+			    struct ib_udata *udata);
+int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
+			int attr_mask, struct ib_udata *udata);
+int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata);
+int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
+		  const struct ib_send_wr **bad_wr);
+int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
+		     const struct ib_recv_wr **bad_wr);
+void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata);
+int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc);
+int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags);
+struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len,
+			      u64 rnic_va, int rights, struct ib_udata *udata);
+struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type,
+			   u32 max_sge, struct ib_udata *udata);
+struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights);
+int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
+		  unsigned int *sg_off);
+int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata);
+int siw_create_srq(struct ib_srq *base_srq, struct ib_srq_init_attr *attr,
+		   struct ib_udata *udata);
+int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr,
+		   enum ib_srq_attr_mask mask, struct ib_udata *udata);
+int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr);
+void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata);
+int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
+		      const struct ib_recv_wr **bad_wr);
+int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma);
+void siw_qp_event(struct siw_qp *qp, enum ib_event_type type);
+void siw_cq_event(struct siw_cq *cq, enum ib_event_type type);
+void siw_srq_event(struct siw_srq *srq, enum ib_event_type type);
+void siw_port_event(struct siw_device *dev, u8 port, enum ib_event_type type);
+
+#endif
diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h
index 26213f49f5c8..64c14cb0022f 100644
--- a/include/uapi/rdma/rdma_user_ioctl_cmds.h
+++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h
@@ -103,6 +103,7 @@ enum rdma_driver_id {
 	RDMA_DRIVER_HFI1,
 	RDMA_DRIVER_QIB,
 	RDMA_DRIVER_EFA,
+	RDMA_DRIVER_SIW,
 };
 
 #endif
diff --git a/include/uapi/rdma/siw-abi.h b/include/uapi/rdma/siw-abi.h
new file mode 100644
index 000000000000..3dd8071ace7b
--- /dev/null
+++ b/include/uapi/rdma/siw-abi.h
@@ -0,0 +1,185 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_USER_H
+#define _SIW_USER_H
+
+#include <linux/types.h>
+
+#define SIW_NODE_DESC_COMMON "Software iWARP stack"
+#define SIW_ABI_VERSION 1
+#define SIW_MAX_SGE 6
+#define SIW_UOBJ_MAX_KEY 0x08FFFF
+#define SIW_INVAL_UOBJ_KEY (SIW_UOBJ_MAX_KEY + 1)
+
+struct siw_uresp_create_cq {
+	__u32 cq_id;
+	__u32 num_cqe;
+	__aligned_u64 cq_key;
+};
+
+struct siw_uresp_create_qp {
+	__u32 qp_id;
+	__u32 num_sqe;
+	__u32 num_rqe;
+	__u32 pad;
+	__aligned_u64 sq_key;
+	__aligned_u64 rq_key;
+};
+
+struct siw_ureq_reg_mr {
+	__u8 stag_key;
+	__u8 reserved[3];
+	__u32 pad;
+};
+
+struct siw_uresp_reg_mr {
+	__u32 stag;
+	__u32 pad;
+};
+
+struct siw_uresp_create_srq {
+	__u32 num_rqe;
+	__u32 pad;
+	__aligned_u64 srq_key;
+};
+
+struct siw_uresp_alloc_ctx {
+	__u32 dev_id;
+	__u32 pad;
+};
+
+enum siw_opcode {
+	SIW_OP_WRITE,
+	SIW_OP_READ,
+	SIW_OP_READ_LOCAL_INV,
+	SIW_OP_SEND,
+	SIW_OP_SEND_WITH_IMM,
+	SIW_OP_SEND_REMOTE_INV,
+
+	/* Unsupported */
+	SIW_OP_FETCH_AND_ADD,
+	SIW_OP_COMP_AND_SWAP,
+
+	SIW_OP_RECEIVE,
+	/* provider internal SQE */
+	SIW_OP_READ_RESPONSE,
+	/*
+	 * below opcodes valid for
+	 * in-kernel clients only
+	 */
+	SIW_OP_INVAL_STAG,
+	SIW_OP_REG_MR,
+	SIW_NUM_OPCODES
+};
+
+/* Keep it same as ibv_sge to allow for memcpy */
+struct siw_sge {
+	__aligned_u64 laddr;
+	__u32 length;
+	__u32 lkey;
+};
+
+/*
+ * Inline data are kept within the work request itself occupying
+ * the space of sge[1] .. sge[n]. Therefore, inline data cannot be
+ * supported if SIW_MAX_SGE is below 2 elements.
+ */
+#define SIW_MAX_INLINE (sizeof(struct siw_sge) * (SIW_MAX_SGE - 1))
+
+#if SIW_MAX_SGE < 2
+#error "SIW_MAX_SGE must be at least 2"
+#endif
+
+enum siw_wqe_flags {
+	SIW_WQE_VALID = 1,
+	SIW_WQE_INLINE = (1 << 1),
+	SIW_WQE_SIGNALLED = (1 << 2),
+	SIW_WQE_SOLICITED = (1 << 3),
+	SIW_WQE_READ_FENCE = (1 << 4),
+	SIW_WQE_REM_INVAL = (1 << 5),
+	SIW_WQE_COMPLETED = (1 << 6)
+};
+
+/* Send Queue Element */
+struct siw_sqe {
+	__aligned_u64 id;
+	__u16 flags;
+	__u8 num_sge;
+	/* Contains enum siw_opcode values */
+	__u8 opcode;
+	__u32 rkey;
+	union {
+		__aligned_u64 raddr;
+		__aligned_u64 base_mr;
+	};
+	union {
+		struct siw_sge sge[SIW_MAX_SGE];
+		__aligned_u64 access;
+	};
+};
+
+/* Receive Queue Element */
+struct siw_rqe {
+	__aligned_u64 id;
+	__u16 flags;
+	__u8 num_sge;
+	/*
+	 * only used by kernel driver,
+	 * ignored if set by user
+	 */
+	__u8 opcode;
+	__u32 unused;
+	struct siw_sge sge[SIW_MAX_SGE];
+};
+
+enum siw_notify_flags {
+	SIW_NOTIFY_NOT = (0),
+	SIW_NOTIFY_SOLICITED = (1 << 0),
+	SIW_NOTIFY_NEXT_COMPLETION = (1 << 1),
+	SIW_NOTIFY_MISSED_EVENTS = (1 << 2),
+	SIW_NOTIFY_ALL = SIW_NOTIFY_SOLICITED | SIW_NOTIFY_NEXT_COMPLETION |
+			 SIW_NOTIFY_MISSED_EVENTS
+};
+
+enum siw_wc_status {
+	SIW_WC_SUCCESS,
+	SIW_WC_LOC_LEN_ERR,
+	SIW_WC_LOC_PROT_ERR,
+	SIW_WC_LOC_QP_OP_ERR,
+	SIW_WC_WR_FLUSH_ERR,
+	SIW_WC_BAD_RESP_ERR,
+	SIW_WC_LOC_ACCESS_ERR,
+	SIW_WC_REM_ACCESS_ERR,
+	SIW_WC_REM_INV_REQ_ERR,
+	SIW_WC_GENERAL_ERR,
+	SIW_NUM_WC_STATUS
+};
+
+struct siw_cqe {
+	__aligned_u64 id;
+	__u8 flags;
+	__u8 opcode;
+	__u16 status;
+	__u32 bytes;
+	union {
+		__aligned_u64 imm_data;
+		__u32 inval_stag;
+	};
+	/* QP number or QP pointer */
+	union {
+		struct ib_qp *base_qp;
+		__aligned_u64 qp_id;
+	};
+};
+
+/*
+ * Shared structure between user and kernel
+ * to control CQ arming.
+ */
+struct siw_cq_ctrl {
+	__aligned_u64 notify;
+};
+#endif
-- 
cgit v1.2.3


From 10694ac92c063681246a360a45019f05855ab755 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 19 Jun 2019 10:32:32 -0400
Subject: xprtrdma: Fix a BUG when tracing is enabled with NFSv4.1 on RDMA

A backchannel reply does not set task->tk_client.

Fixes: 0c77668ddb4e ("SUNRPC: Introduce trace points in ... ")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index df9851cb82b2..f0678e3ac2d4 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -559,7 +559,8 @@ TRACE_EVENT(xprtrdma_post_send,
 		const struct rpc_rqst *rqst = &req->rl_slot;
 
 		__entry->task_id = rqst->rq_task->tk_pid;
-		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->client_id = rqst->rq_task->tk_client ?
+				     rqst->rq_task->tk_client->cl_clid : -1;
 		__entry->req = req;
 		__entry->num_sge = req->rl_sendctx->sc_wr.num_sge;
 		__entry->signaled = req->rl_sendctx->sc_wr.send_flags &
-- 
cgit v1.2.3


From 2afc5e1b9c340ff20848c8dd8fb60342617bce52 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:29 +0300
Subject: IB/mlx5: Introduce MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD

Introduce MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD and its initial
implementation.

This object is from type class FD and will be used to read DEVX
async events.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/devx.c         | 95 +++++++++++++++++++++++++++++++
 include/uapi/rdma/mlx5_user_ioctl_cmds.h  | 10 ++++
 include/uapi/rdma/mlx5_user_ioctl_verbs.h |  4 ++
 3 files changed, 109 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 931f587dfb8f..ed01523f0f02 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -33,6 +33,17 @@ struct devx_async_data {
 	struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
 };
 
+struct devx_async_event_file {
+	struct ib_uobject uobj;
+	/* Head of events that are subscribed to this FD */
+	struct list_head subscribed_events_list;
+	spinlock_t lock;
+	wait_queue_head_t poll_wait;
+	struct list_head event_list;
+	struct mlx5_ib_dev *dev;
+	u8 omit_data:1;
+};
+
 #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
 struct devx_obj {
 	struct mlx5_core_dev	*mdev;
@@ -1365,6 +1376,37 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)(
 	return 0;
 }
 
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE);
+	struct devx_async_event_file *ev_file;
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+	struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
+	u32 flags;
+	int err;
+
+	err = uverbs_get_flags32(&flags, attrs,
+		MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS,
+		MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA);
+
+	if (err)
+		return err;
+
+	ev_file = container_of(uobj, struct devx_async_event_file,
+			       uobj);
+	spin_lock_init(&ev_file->lock);
+	INIT_LIST_HEAD(&ev_file->event_list);
+	init_waitqueue_head(&ev_file->poll_wait);
+	if (flags & MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA)
+		ev_file->omit_data = 1;
+	INIT_LIST_HEAD(&ev_file->subscribed_events_list);
+	ev_file->dev = dev;
+	return 0;
+}
+
 static void devx_query_callback(int status, struct mlx5_async_work *context)
 {
 	struct devx_async_data *async_data =
@@ -1719,6 +1761,32 @@ static const struct file_operations devx_async_cmd_event_fops = {
 	.llseek	 = no_llseek,
 };
 
+static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
+				     size_t count, loff_t *pos)
+{
+	return -EINVAL;
+}
+
+static __poll_t devx_async_event_poll(struct file *filp,
+				      struct poll_table_struct *wait)
+{
+	return 0;
+}
+
+static int devx_async_event_close(struct inode *inode, struct file *filp)
+{
+	uverbs_close_fd(filp);
+	return 0;
+}
+
+static const struct file_operations devx_async_event_fops = {
+	.owner	 = THIS_MODULE,
+	.read	 = devx_async_event_read,
+	.poll    = devx_async_event_poll,
+	.release = devx_async_event_close,
+	.llseek	 = no_llseek,
+};
+
 static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
 						   enum rdma_remove_reason why)
 {
@@ -1738,6 +1806,12 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
 	return 0;
 };
 
+static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
+					    enum rdma_remove_reason why)
+{
+	return 0;
+};
+
 DECLARE_UVERBS_NAMED_METHOD(
 	MLX5_IB_METHOD_DEVX_UMEM_REG,
 	UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE,
@@ -1903,6 +1977,24 @@ DECLARE_UVERBS_NAMED_OBJECT(
 			     O_RDONLY),
 	&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC));
 
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC,
+	UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE,
+			MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS,
+			enum mlx5_ib_uapi_devx_create_event_channel_flags,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+	MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+	UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_event_file),
+			     devx_hot_unplug_async_event_file,
+			     &devx_async_event_fops, "[devx_async_event]",
+			     O_RDONLY),
+	&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC));
+
 static bool devx_is_supported(struct ib_device *device)
 {
 	struct mlx5_ib_dev *dev = to_mdev(device);
@@ -1923,5 +2015,8 @@ const struct uapi_definition mlx5_ib_devx_defs[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
 		MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
 		UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+		UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
 	{},
 };
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index d404c951954c..6ad8f4f11ddd 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -127,16 +127,26 @@ enum mlx5_ib_devx_async_cmd_fd_alloc_attrs {
 	MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
 };
 
+enum mlx5_ib_devx_async_event_fd_alloc_attrs {
+	MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS,
+};
+
 enum mlx5_ib_devx_async_cmd_fd_methods {
 	MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT),
 };
 
+enum mlx5_ib_devx_async_event_fd_methods {
+	MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT),
+};
+
 enum mlx5_ib_objects {
 	MLX5_IB_OBJECT_DEVX = (1U << UVERBS_ID_NS_SHIFT),
 	MLX5_IB_OBJECT_DEVX_OBJ,
 	MLX5_IB_OBJECT_DEVX_UMEM,
 	MLX5_IB_OBJECT_FLOW_MATCHER,
 	MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+	MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
 };
 
 enum mlx5_ib_flow_matcher_create_attrs {
diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
index a8f34c237458..b44691315d39 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
@@ -63,5 +63,9 @@ enum mlx5_ib_uapi_dm_type {
 	MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM,
 };
 
+enum mlx5_ib_uapi_devx_create_event_channel_flags {
+	MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA = 1 << 0,
+};
+
 #endif
 
-- 
cgit v1.2.3


From 7597385371425febdaa8c6a1da3625d4ffff16f5 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:31 +0300
Subject: IB/mlx5: Enable subscription for device events over DEVX

Enable subscription for device events over DEVX.

Each subscription is added to the two level xarray data structure
according to its event number and the DEVX object information in case was
given with the given target fd.

Those events will be reported over the given fd once will occur.
Downstream patches will mange the dispatching to any subscription.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/devx.c        | 560 ++++++++++++++++++++++++++++++-
 include/uapi/rdma/mlx5_user_ioctl_cmds.h |   9 +
 2 files changed, 562 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index a9affc905bfa..9c21cafc44a6 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -14,6 +14,7 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
+#include <linux/xarray.h>
 
 #define UVERBS_MODULE_NAME mlx5_ib
 #include <rdma/uverbs_named_ioctl.h>
@@ -33,6 +34,40 @@ struct devx_async_data {
 	struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
 };
 
+/* first level XA value data structure */
+struct devx_event {
+	struct xarray object_ids; /* second XA level, Key = object id */
+	struct list_head unaffiliated_list;
+};
+
+/* second level XA value data structure */
+struct devx_obj_event {
+	struct rcu_head rcu;
+	struct list_head obj_sub_list;
+};
+
+struct devx_event_subscription {
+	struct list_head file_list; /* headed in ev_file->
+				     * subscribed_events_list
+				     */
+	struct list_head xa_list; /* headed in devx_event->unaffiliated_list or
+				   * devx_obj_event->obj_sub_list
+				   */
+	struct list_head obj_list; /* headed in devx_object */
+	struct list_head event_list; /* headed in ev_file->event_list or in
+				      * temp list via subscription
+				      */
+
+	u8 is_cleaned:1;
+	u32 xa_key_level1;
+	u32 xa_key_level2;
+	struct rcu_head	rcu;
+	u64 cookie;
+	struct devx_async_event_file *ev_file;
+	struct file *filp; /* Upon hot unplug we need a direct access to */
+	struct eventfd_ctx *eventfd;
+};
+
 struct devx_async_event_file {
 	struct ib_uobject uobj;
 	/* Head of events that are subscribed to this FD */
@@ -55,6 +90,7 @@ struct devx_obj {
 		struct mlx5_ib_devx_mr	devx_mr;
 		struct mlx5_core_dct	core_dct;
 	};
+	struct list_head event_sub; /* holds devx_event_subscription entries */
 };
 
 struct devx_umem {
@@ -160,6 +196,104 @@ bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id)
 	return false;
 }
 
+static bool is_legacy_unaffiliated_event_num(u16 event_num)
+{
+	switch (event_num) {
+	case MLX5_EVENT_TYPE_PORT_CHANGE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool is_legacy_obj_event_num(u16 event_num)
+{
+	switch (event_num) {
+	case MLX5_EVENT_TYPE_PATH_MIG:
+	case MLX5_EVENT_TYPE_COMM_EST:
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+	case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+	case MLX5_EVENT_TYPE_CQ_ERROR:
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+	case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_DCT_DRAINED:
+	case MLX5_EVENT_TYPE_COMP:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static u16 get_legacy_obj_type(u16 opcode)
+{
+	switch (opcode) {
+	case MLX5_CMD_OP_CREATE_RQ:
+		return MLX5_EVENT_QUEUE_TYPE_RQ;
+	case MLX5_CMD_OP_CREATE_QP:
+		return MLX5_EVENT_QUEUE_TYPE_QP;
+	case MLX5_CMD_OP_CREATE_SQ:
+		return MLX5_EVENT_QUEUE_TYPE_SQ;
+	case MLX5_CMD_OP_CREATE_DCT:
+		return MLX5_EVENT_QUEUE_TYPE_DCT;
+	default:
+		return 0;
+	}
+}
+
+static u16 get_dec_obj_type(struct devx_obj *obj, u16 event_num)
+{
+	u16 opcode;
+
+	opcode = (obj->obj_id >> 32) & 0xffff;
+
+	if (is_legacy_obj_event_num(event_num))
+		return get_legacy_obj_type(opcode);
+
+	switch (opcode) {
+	case MLX5_CMD_OP_CREATE_GENERAL_OBJECT:
+		return (obj->obj_id >> 48);
+	case MLX5_CMD_OP_CREATE_RQ:
+		return MLX5_OBJ_TYPE_RQ;
+	case MLX5_CMD_OP_CREATE_QP:
+		return MLX5_OBJ_TYPE_QP;
+	case MLX5_CMD_OP_CREATE_SQ:
+		return MLX5_OBJ_TYPE_SQ;
+	case MLX5_CMD_OP_CREATE_DCT:
+		return MLX5_OBJ_TYPE_DCT;
+	case MLX5_CMD_OP_CREATE_TIR:
+		return MLX5_OBJ_TYPE_TIR;
+	case MLX5_CMD_OP_CREATE_TIS:
+		return MLX5_OBJ_TYPE_TIS;
+	case MLX5_CMD_OP_CREATE_PSV:
+		return MLX5_OBJ_TYPE_PSV;
+	case MLX5_OBJ_TYPE_MKEY:
+		return MLX5_OBJ_TYPE_MKEY;
+	case MLX5_CMD_OP_CREATE_RMP:
+		return MLX5_OBJ_TYPE_RMP;
+	case MLX5_CMD_OP_CREATE_XRC_SRQ:
+		return MLX5_OBJ_TYPE_XRC_SRQ;
+	case MLX5_CMD_OP_CREATE_XRQ:
+		return MLX5_OBJ_TYPE_XRQ;
+	case MLX5_CMD_OP_CREATE_RQT:
+		return MLX5_OBJ_TYPE_RQT;
+	case MLX5_CMD_OP_ALLOC_FLOW_COUNTER:
+		return MLX5_OBJ_TYPE_FLOW_COUNTER;
+	case MLX5_CMD_OP_CREATE_CQ:
+		return MLX5_OBJ_TYPE_CQ;
+	default:
+		return 0;
+	}
+}
+
+static u32 get_dec_obj_id(u64 obj_id)
+{
+	return (obj_id & 0xffffffff);
+}
+
 /*
  * As the obj_id in the firmware is not globally unique the object type
  * must be considered upon checking for a valid object id.
@@ -1126,14 +1260,47 @@ static void devx_cleanup_mkey(struct devx_obj *obj)
 		 mlx5_base_mkey(obj->devx_mr.mmkey.key));
 }
 
+static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
+				      struct devx_event_subscription *sub)
+{
+	struct devx_event *event;
+	struct devx_obj_event *xa_val_level2;
+
+	if (sub->is_cleaned)
+		return;
+
+	sub->is_cleaned = 1;
+	list_del_rcu(&sub->xa_list);
+
+	if (list_empty(&sub->obj_list))
+		return;
+
+	list_del_rcu(&sub->obj_list);
+	/* check whether key level 1 for this obj_sub_list is empty */
+	event = xa_load(&dev->devx_event_table.event_xa,
+			sub->xa_key_level1);
+	WARN_ON(!event);
+
+	xa_val_level2 = xa_load(&event->object_ids, sub->xa_key_level2);
+	if (list_empty(&xa_val_level2->obj_sub_list)) {
+		xa_erase(&event->object_ids,
+			 sub->xa_key_level2);
+		kfree_rcu(xa_val_level2, rcu);
+	}
+}
+
 static int devx_obj_cleanup(struct ib_uobject *uobject,
 			    enum rdma_remove_reason why,
 			    struct uverbs_attr_bundle *attrs)
 {
 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
+	struct mlx5_devx_event_table *devx_event_table;
 	struct devx_obj *obj = uobject->object;
+	struct devx_event_subscription *sub_entry, *tmp;
+	struct mlx5_ib_dev *dev;
 	int ret;
 
+	dev = mlx5_udata_to_mdev(&attrs->driver_udata);
 	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
 		devx_cleanup_mkey(obj);
 
@@ -1145,10 +1312,14 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
 	if (ib_is_destroy_retryable(ret, why, uobject))
 		return ret;
 
-	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
-		struct mlx5_ib_dev *dev =
-			mlx5_udata_to_mdev(&attrs->driver_udata);
+	devx_event_table = &dev->devx_event_table;
+
+	mutex_lock(&devx_event_table->event_xa_lock);
+	list_for_each_entry_safe(sub_entry, tmp, &obj->event_sub, obj_list)
+		devx_cleanup_subscription(dev, sub_entry);
+	mutex_unlock(&devx_event_table->event_xa_lock);
 
+	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
 		call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu,
 			  devx_free_indirect_mkey);
 		return ret;
@@ -1220,6 +1391,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 
 	uobj->object = obj;
 	obj->mdev = dev->mdev;
+	INIT_LIST_HEAD(&obj->event_sub);
 	devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen,
 				   &obj_id);
 	WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32));
@@ -1404,6 +1576,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)(
 		ev_file->omit_data = 1;
 	INIT_LIST_HEAD(&ev_file->subscribed_events_list);
 	ev_file->dev = dev;
+	get_device(&dev->ib_dev.dev);
 	return 0;
 }
 
@@ -1516,6 +1689,331 @@ sub_bytes:
 	return err;
 }
 
+static void
+subscribe_event_xa_dealloc(struct mlx5_devx_event_table *devx_event_table,
+			   u32 key_level1,
+			   bool is_level2,
+			   u32 key_level2)
+{
+	struct devx_event *event;
+	struct devx_obj_event *xa_val_level2;
+
+	/* Level 1 is valid for future use, no need to free */
+	if (!is_level2)
+		return;
+
+	event = xa_load(&devx_event_table->event_xa, key_level1);
+	WARN_ON(!event);
+
+	xa_val_level2 = xa_load(&event->object_ids,
+				key_level2);
+	if (list_empty(&xa_val_level2->obj_sub_list)) {
+		xa_erase(&event->object_ids,
+			 key_level2);
+		kfree_rcu(xa_val_level2, rcu);
+	}
+}
+
+static int
+subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table,
+			 u32 key_level1,
+			 bool is_level2,
+			 u32 key_level2)
+{
+	struct devx_obj_event *obj_event;
+	struct devx_event *event;
+	int err;
+
+	event = xa_load(&devx_event_table->event_xa, key_level1);
+	if (!event) {
+		event = kzalloc(sizeof(*event), GFP_KERNEL);
+		if (!event)
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(&event->unaffiliated_list);
+		xa_init(&event->object_ids);
+
+		err = xa_insert(&devx_event_table->event_xa,
+				key_level1,
+				event,
+				GFP_KERNEL);
+		if (err) {
+			kfree(event);
+			return err;
+		}
+	}
+
+	if (!is_level2)
+		return 0;
+
+	obj_event = xa_load(&event->object_ids, key_level2);
+	if (!obj_event) {
+		obj_event = kzalloc(sizeof(*obj_event), GFP_KERNEL);
+		if (!obj_event)
+			/* Level1 is valid for future use, no need to free */
+			return -ENOMEM;
+
+		err = xa_insert(&event->object_ids,
+				key_level2,
+				obj_event,
+				GFP_KERNEL);
+		if (err)
+			return err;
+		INIT_LIST_HEAD(&obj_event->obj_sub_list);
+	}
+
+	return 0;
+}
+
+static bool is_valid_events_legacy(int num_events, u16 *event_type_num_list,
+				   struct devx_obj *obj)
+{
+	int i;
+
+	for (i = 0; i < num_events; i++) {
+		if (obj) {
+			if (!is_legacy_obj_event_num(event_type_num_list[i]))
+				return false;
+		} else if (!is_legacy_unaffiliated_event_num(
+				event_type_num_list[i])) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
+#define MAX_SUPP_EVENT_NUM 255
+static bool is_valid_events(struct mlx5_core_dev *dev,
+			    int num_events, u16 *event_type_num_list,
+			    struct devx_obj *obj)
+{
+	__be64 *aff_events;
+	__be64 *unaff_events;
+	int mask_entry;
+	int mask_bit;
+	int i;
+
+	if (MLX5_CAP_GEN(dev, event_cap)) {
+		aff_events = MLX5_CAP_DEV_EVENT(dev,
+						user_affiliated_events);
+		unaff_events = MLX5_CAP_DEV_EVENT(dev,
+						  user_unaffiliated_events);
+	} else {
+		return is_valid_events_legacy(num_events, event_type_num_list,
+					      obj);
+	}
+
+	for (i = 0; i < num_events; i++) {
+		if (event_type_num_list[i] > MAX_SUPP_EVENT_NUM)
+			return false;
+
+		mask_entry = event_type_num_list[i] / 64;
+		mask_bit = event_type_num_list[i] % 64;
+
+		if (obj) {
+			/* CQ completion */
+			if (event_type_num_list[i] == 0)
+				continue;
+
+			if (!(be64_to_cpu(aff_events[mask_entry]) &
+					(1ull << mask_bit)))
+				return false;
+
+			continue;
+		}
+
+		if (!(be64_to_cpu(unaff_events[mask_entry]) &
+				(1ull << mask_bit)))
+			return false;
+	}
+
+	return true;
+}
+
+#define MAX_NUM_EVENTS 16
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *devx_uobj = uverbs_attr_get_uobject(
+				attrs,
+				MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE);
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+	struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
+	struct ib_uobject *fd_uobj;
+	struct devx_obj *obj = NULL;
+	struct devx_async_event_file *ev_file;
+	struct mlx5_devx_event_table *devx_event_table = &dev->devx_event_table;
+	u16 *event_type_num_list;
+	struct devx_event_subscription *event_sub, *tmp_sub;
+	struct list_head sub_list;
+	int redirect_fd;
+	bool use_eventfd = false;
+	int num_events;
+	int num_alloc_xa_entries = 0;
+	u16 obj_type = 0;
+	u64 cookie = 0;
+	u32 obj_id = 0;
+	int err;
+	int i;
+
+	if (!c->devx_uid)
+		return -EINVAL;
+
+	if (!IS_ERR(devx_uobj)) {
+		obj = (struct devx_obj *)devx_uobj->object;
+		if (obj)
+			obj_id = get_dec_obj_id(obj->obj_id);
+	}
+
+	fd_uobj = uverbs_attr_get_uobject(attrs,
+				MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE);
+	if (IS_ERR(fd_uobj))
+		return PTR_ERR(fd_uobj);
+
+	ev_file = container_of(fd_uobj, struct devx_async_event_file,
+			       uobj);
+
+	if (uverbs_attr_is_valid(attrs,
+				 MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM)) {
+		err = uverbs_copy_from(&redirect_fd, attrs,
+			       MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM);
+		if (err)
+			return err;
+
+		use_eventfd = true;
+	}
+
+	if (uverbs_attr_is_valid(attrs,
+				 MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE)) {
+		if (use_eventfd)
+			return -EINVAL;
+
+		err = uverbs_copy_from(&cookie, attrs,
+				MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE);
+		if (err)
+			return err;
+	}
+
+	num_events = uverbs_attr_ptr_get_array_size(
+		attrs, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST,
+		sizeof(u16));
+
+	if (num_events < 0)
+		return num_events;
+
+	if (num_events > MAX_NUM_EVENTS)
+		return -EINVAL;
+
+	event_type_num_list = uverbs_attr_get_alloced_ptr(attrs,
+			MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST);
+
+	if (!is_valid_events(dev->mdev, num_events, event_type_num_list, obj))
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&sub_list);
+
+	/* Protect from concurrent subscriptions to same XA entries to allow
+	 * both to succeed
+	 */
+	mutex_lock(&devx_event_table->event_xa_lock);
+	for (i = 0; i < num_events; i++) {
+		u32 key_level1;
+
+		if (obj)
+			obj_type = get_dec_obj_type(obj,
+						    event_type_num_list[i]);
+		key_level1 = event_type_num_list[i] | obj_type << 16;
+
+		err = subscribe_event_xa_alloc(devx_event_table,
+					       key_level1,
+					       obj,
+					       obj_id);
+		if (err)
+			goto err;
+
+		num_alloc_xa_entries++;
+		event_sub = kzalloc(sizeof(*event_sub), GFP_KERNEL);
+		if (!event_sub)
+			goto err;
+
+		list_add_tail(&event_sub->event_list, &sub_list);
+		if (use_eventfd) {
+			event_sub->eventfd =
+				eventfd_ctx_fdget(redirect_fd);
+
+			if (IS_ERR(event_sub)) {
+				err = PTR_ERR(event_sub->eventfd);
+				event_sub->eventfd = NULL;
+				goto err;
+			}
+		}
+
+		event_sub->cookie = cookie;
+		event_sub->ev_file = ev_file;
+		event_sub->filp = fd_uobj->object;
+		/* May be needed upon cleanup the devx object/subscription */
+		event_sub->xa_key_level1 = key_level1;
+		event_sub->xa_key_level2 = obj_id;
+		INIT_LIST_HEAD(&event_sub->obj_list);
+	}
+
+	/* Once all the allocations and the XA data insertions were done we
+	 * can go ahead and add all the subscriptions to the relevant lists
+	 * without concern of a failure.
+	 */
+	list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) {
+		struct devx_event *event;
+		struct devx_obj_event *obj_event;
+
+		list_del_init(&event_sub->event_list);
+
+		spin_lock_irq(&ev_file->lock);
+		list_add_tail_rcu(&event_sub->file_list,
+				  &ev_file->subscribed_events_list);
+		spin_unlock_irq(&ev_file->lock);
+
+		event = xa_load(&devx_event_table->event_xa,
+				event_sub->xa_key_level1);
+		WARN_ON(!event);
+
+		if (!obj) {
+			list_add_tail_rcu(&event_sub->xa_list,
+					  &event->unaffiliated_list);
+			continue;
+		}
+
+		obj_event = xa_load(&event->object_ids, obj_id);
+		WARN_ON(!obj_event);
+		list_add_tail_rcu(&event_sub->xa_list,
+				  &obj_event->obj_sub_list);
+		list_add_tail_rcu(&event_sub->obj_list,
+				  &obj->event_sub);
+	}
+
+	mutex_unlock(&devx_event_table->event_xa_lock);
+	return 0;
+
+err:
+	list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) {
+		list_del(&event_sub->event_list);
+
+		subscribe_event_xa_dealloc(devx_event_table,
+					   event_sub->xa_key_level1,
+					   obj,
+					   obj_id);
+
+		if (event_sub->eventfd)
+			eventfd_ctx_put(event_sub->eventfd);
+
+		kfree(event_sub);
+	}
+
+	mutex_unlock(&devx_event_table->event_xa_lock);
+	return err;
+}
+
 static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 			 struct uverbs_attr_bundle *attrs,
 			 struct devx_umem *obj)
@@ -1682,14 +2180,21 @@ void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev)
 void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_devx_event_table *table = &dev->devx_event_table;
+	struct devx_event_subscription *sub, *tmp;
+	struct devx_event *event;
 	void *entry;
 	unsigned long id;
 
 	mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb);
-
-	xa_for_each(&table->event_xa, id, entry)
+	mutex_lock(&dev->devx_event_table.event_xa_lock);
+	xa_for_each(&table->event_xa, id, entry) {
+		event = entry;
+		list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list,
+					 xa_list)
+			devx_cleanup_subscription(dev, sub);
 		kfree(entry);
-
+	}
+	mutex_unlock(&dev->devx_event_table.event_xa_lock);
 	xa_destroy(&table->event_xa);
 }
 
@@ -1805,7 +2310,26 @@ static __poll_t devx_async_event_poll(struct file *filp,
 
 static int devx_async_event_close(struct inode *inode, struct file *filp)
 {
+	struct devx_async_event_file *ev_file = filp->private_data;
+	struct devx_event_subscription *event_sub, *event_sub_tmp;
+
+	mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock);
+	/* delete the subscriptions which are related to this FD */
+	list_for_each_entry_safe(event_sub, event_sub_tmp,
+				 &ev_file->subscribed_events_list, file_list) {
+		devx_cleanup_subscription(ev_file->dev, event_sub);
+		if (event_sub->eventfd)
+			eventfd_ctx_put(event_sub->eventfd);
+
+		list_del_rcu(&event_sub->file_list);
+		/* subscription may not be used by the read API any more */
+		kfree_rcu(event_sub, rcu);
+	}
+
+	mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock);
+
 	uverbs_close_fd(filp);
+	put_device(&ev_file->dev->ib_dev.dev);
 	return 0;
 }
 
@@ -1973,10 +2497,32 @@ DECLARE_UVERBS_NAMED_METHOD(
 		UVERBS_ATTR_TYPE(u64),
 		UA_MANDATORY));
 
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT,
+	UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE,
+		MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+		UVERBS_ACCESS_READ,
+		UA_MANDATORY),
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE,
+		MLX5_IB_OBJECT_DEVX_OBJ,
+		UVERBS_ACCESS_READ,
+		UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST,
+		UVERBS_ATTR_MIN_SIZE(sizeof(u16)),
+		UA_MANDATORY,
+		UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE,
+		UVERBS_ATTR_TYPE(u64),
+		UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM,
+		UVERBS_ATTR_TYPE(u32),
+		UA_OPTIONAL));
+
 DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX,
 			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER),
 			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR),
-			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN));
+			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN),
+			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT));
 
 DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ,
 			    UVERBS_TYPE_ALLOC_IDR(devx_obj_cleanup),
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index 6ad8f4f11ddd..d0da070cf0ab 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -51,6 +51,7 @@ enum mlx5_ib_devx_methods {
 	MLX5_IB_METHOD_DEVX_OTHER  = (1U << UVERBS_ID_NS_SHIFT),
 	MLX5_IB_METHOD_DEVX_QUERY_UAR,
 	MLX5_IB_METHOD_DEVX_QUERY_EQN,
+	MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT,
 };
 
 enum  mlx5_ib_devx_other_attrs {
@@ -93,6 +94,14 @@ enum mlx5_ib_devx_obj_query_async_attrs {
 	MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN,
 };
 
+enum mlx5_ib_devx_subscribe_event_attrs {
+	MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE,
+	MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST,
+	MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM,
+	MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE,
+};
+
 enum  mlx5_ib_devx_query_eqn_attrs {
 	MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC = (1U << UVERBS_ID_NS_SHIFT),
 	MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN,
-- 
cgit v1.2.3


From 5ec9d8ee87c627a2c981d871e41f6e2a942f53fd Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:32 +0300
Subject: IB/mlx5: Implement DEVX dispatching event

Implement DEVX dispatching event by looking up for the applicable
subscriptions for the reported event and using their target fd to
signal/set the event.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/devx.c         | 303 +++++++++++++++++++++++++++++-
 include/uapi/rdma/mlx5_user_ioctl_verbs.h |   5 +
 2 files changed, 305 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 9c21cafc44a6..867b9778c063 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -34,6 +34,11 @@ struct devx_async_data {
 	struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
 };
 
+struct devx_async_event_data {
+	struct list_head list; /* headed in ev_file->event_list */
+	struct mlx5_ib_uapi_devx_async_event_hdr hdr;
+};
+
 /* first level XA value data structure */
 struct devx_event {
 	struct xarray object_ids; /* second XA level, Key = object id */
@@ -77,6 +82,8 @@ struct devx_async_event_file {
 	struct list_head event_list;
 	struct mlx5_ib_dev *dev;
 	u8 omit_data:1;
+	u8 is_overflow_err:1;
+	u8 is_destroyed:1;
 };
 
 #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
@@ -289,6 +296,29 @@ static u16 get_dec_obj_type(struct devx_obj *obj, u16 event_num)
 	}
 }
 
+static u16 get_event_obj_type(unsigned long event_type, struct mlx5_eqe *eqe)
+{
+	switch (event_type) {
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+	case MLX5_EVENT_TYPE_PATH_MIG:
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+	case MLX5_EVENT_TYPE_COMM_EST:
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+	case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+	case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+		return eqe->data.qp_srq.type;
+	case MLX5_EVENT_TYPE_CQ_ERROR:
+		return 0;
+	case MLX5_EVENT_TYPE_DCT_DRAINED:
+		return MLX5_EVENT_QUEUE_TYPE_DCT;
+	default:
+		return MLX5_GET(affiliated_event_header, &eqe->data, obj_type);
+	}
+}
+
 static u32 get_dec_obj_id(u64 obj_id)
 {
 	return (obj_id & 0xffffffff);
@@ -2161,10 +2191,170 @@ static int devx_umem_cleanup(struct ib_uobject *uobject,
 	return 0;
 }
 
+static bool is_unaffiliated_event(struct mlx5_core_dev *dev,
+				  unsigned long event_type)
+{
+	__be64 *unaff_events;
+	int mask_entry;
+	int mask_bit;
+
+	if (!MLX5_CAP_GEN(dev, event_cap))
+		return is_legacy_unaffiliated_event_num(event_type);
+
+	unaff_events = MLX5_CAP_DEV_EVENT(dev,
+					  user_unaffiliated_events);
+	WARN_ON(event_type > MAX_SUPP_EVENT_NUM);
+
+	mask_entry = event_type / 64;
+	mask_bit = event_type % 64;
+
+	if (!(be64_to_cpu(unaff_events[mask_entry]) & (1ull << mask_bit)))
+		return false;
+
+	return true;
+}
+
+static u32 devx_get_obj_id_from_event(unsigned long event_type, void *data)
+{
+	struct mlx5_eqe *eqe = data;
+	u32 obj_id = 0;
+
+	switch (event_type) {
+	case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+	case MLX5_EVENT_TYPE_PATH_MIG:
+	case MLX5_EVENT_TYPE_COMM_EST:
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+		obj_id = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+		break;
+	case MLX5_EVENT_TYPE_DCT_DRAINED:
+		obj_id = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
+		break;
+	case MLX5_EVENT_TYPE_CQ_ERROR:
+		obj_id = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
+		break;
+	default:
+		obj_id = MLX5_GET(affiliated_event_header, &eqe->data, obj_id);
+		break;
+	}
+
+	return obj_id;
+}
+
+static int deliver_event(struct devx_event_subscription *event_sub,
+			 const void *data)
+{
+	struct devx_async_event_file *ev_file;
+	struct devx_async_event_data *event_data;
+	unsigned long flags;
+
+	ev_file = event_sub->ev_file;
+
+	if (ev_file->omit_data) {
+		spin_lock_irqsave(&ev_file->lock, flags);
+		if (!list_empty(&event_sub->event_list)) {
+			spin_unlock_irqrestore(&ev_file->lock, flags);
+			return 0;
+		}
+
+		list_add_tail(&event_sub->event_list, &ev_file->event_list);
+		spin_unlock_irqrestore(&ev_file->lock, flags);
+		wake_up_interruptible(&ev_file->poll_wait);
+		return 0;
+	}
+
+	event_data = kzalloc(sizeof(*event_data) + sizeof(struct mlx5_eqe),
+			     GFP_ATOMIC);
+	if (!event_data) {
+		spin_lock_irqsave(&ev_file->lock, flags);
+		ev_file->is_overflow_err = 1;
+		spin_unlock_irqrestore(&ev_file->lock, flags);
+		return -ENOMEM;
+	}
+
+	event_data->hdr.cookie = event_sub->cookie;
+	memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe));
+
+	spin_lock_irqsave(&ev_file->lock, flags);
+	list_add_tail(&event_data->list, &ev_file->event_list);
+	spin_unlock_irqrestore(&ev_file->lock, flags);
+	wake_up_interruptible(&ev_file->poll_wait);
+
+	return 0;
+}
+
+static void dispatch_event_fd(struct list_head *fd_list,
+			      const void *data)
+{
+	struct devx_event_subscription *item;
+
+	list_for_each_entry_rcu(item, fd_list, xa_list) {
+		if (!get_file_rcu(item->filp))
+			continue;
+
+		if (item->eventfd) {
+			eventfd_signal(item->eventfd, 1);
+			fput(item->filp);
+			continue;
+		}
+
+		deliver_event(item, data);
+		fput(item->filp);
+	}
+}
+
 static int devx_event_notifier(struct notifier_block *nb,
 			       unsigned long event_type, void *data)
 {
-	return NOTIFY_DONE;
+	struct mlx5_devx_event_table *table;
+	struct mlx5_ib_dev *dev;
+	struct devx_event *event;
+	struct devx_obj_event *obj_event;
+	u16 obj_type = 0;
+	bool is_unaffiliated;
+	u32 obj_id;
+
+	/* Explicit filtering to kernel events which may occur frequently */
+	if (event_type == MLX5_EVENT_TYPE_CMD ||
+	    event_type == MLX5_EVENT_TYPE_PAGE_REQUEST)
+		return NOTIFY_OK;
+
+	table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb);
+	dev = container_of(table, struct mlx5_ib_dev, devx_event_table);
+	is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type);
+
+	if (!is_unaffiliated)
+		obj_type = get_event_obj_type(event_type, data);
+
+	rcu_read_lock();
+	event = xa_load(&table->event_xa, event_type | (obj_type << 16));
+	if (!event) {
+		rcu_read_unlock();
+		return NOTIFY_DONE;
+	}
+
+	if (is_unaffiliated) {
+		dispatch_event_fd(&event->unaffiliated_list, data);
+		rcu_read_unlock();
+		return NOTIFY_OK;
+	}
+
+	obj_id = devx_get_obj_id_from_event(event_type, data);
+	obj_event = xa_load(&event->object_ids, obj_id);
+	if (!obj_event) {
+		rcu_read_unlock();
+		return NOTIFY_DONE;
+	}
+
+	dispatch_event_fd(&obj_event->obj_sub_list, data);
+
+	rcu_read_unlock();
+	return NOTIFY_OK;
 }
 
 void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev)
@@ -2299,19 +2489,108 @@ static const struct file_operations devx_async_cmd_event_fops = {
 static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
 				     size_t count, loff_t *pos)
 {
-	return -EINVAL;
+	struct devx_async_event_file *ev_file = filp->private_data;
+	struct devx_event_subscription *event_sub;
+	struct devx_async_event_data *uninitialized_var(event);
+	int ret = 0;
+	size_t eventsz;
+	bool omit_data;
+	void *event_data;
+
+	omit_data = ev_file->omit_data;
+
+	spin_lock_irq(&ev_file->lock);
+
+	if (ev_file->is_overflow_err) {
+		ev_file->is_overflow_err = 0;
+		spin_unlock_irq(&ev_file->lock);
+		return -EOVERFLOW;
+	}
+
+	if (ev_file->is_destroyed) {
+		spin_unlock_irq(&ev_file->lock);
+		return -EIO;
+	}
+
+	while (list_empty(&ev_file->event_list)) {
+		spin_unlock_irq(&ev_file->lock);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(ev_file->poll_wait,
+			    (!list_empty(&ev_file->event_list) ||
+			     ev_file->is_destroyed))) {
+			return -ERESTARTSYS;
+		}
+
+		spin_lock_irq(&ev_file->lock);
+		if (ev_file->is_destroyed) {
+			spin_unlock_irq(&ev_file->lock);
+			return -EIO;
+		}
+	}
+
+	if (omit_data) {
+		event_sub = list_first_entry(&ev_file->event_list,
+					struct devx_event_subscription,
+					event_list);
+		eventsz = sizeof(event_sub->cookie);
+		event_data = &event_sub->cookie;
+	} else {
+		event = list_first_entry(&ev_file->event_list,
+				      struct devx_async_event_data, list);
+		eventsz = sizeof(struct mlx5_eqe) +
+			sizeof(struct mlx5_ib_uapi_devx_async_event_hdr);
+		event_data = &event->hdr;
+	}
+
+	if (eventsz > count) {
+		spin_unlock_irq(&ev_file->lock);
+		return -EINVAL;
+	}
+
+	if (omit_data)
+		list_del_init(&event_sub->event_list);
+	else
+		list_del(&event->list);
+
+	spin_unlock_irq(&ev_file->lock);
+
+	if (copy_to_user(buf, event_data, eventsz))
+		/* This points to an application issue, not a kernel concern */
+		ret = -EFAULT;
+	else
+		ret = eventsz;
+
+	if (!omit_data)
+		kfree(event);
+	return ret;
 }
 
 static __poll_t devx_async_event_poll(struct file *filp,
 				      struct poll_table_struct *wait)
 {
-	return 0;
+	struct devx_async_event_file *ev_file = filp->private_data;
+	__poll_t pollflags = 0;
+
+	poll_wait(filp, &ev_file->poll_wait, wait);
+
+	spin_lock_irq(&ev_file->lock);
+	if (ev_file->is_destroyed)
+		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+	else if (!list_empty(&ev_file->event_list))
+		pollflags = EPOLLIN | EPOLLRDNORM;
+	spin_unlock_irq(&ev_file->lock);
+
+	return pollflags;
 }
 
 static int devx_async_event_close(struct inode *inode, struct file *filp)
 {
 	struct devx_async_event_file *ev_file = filp->private_data;
 	struct devx_event_subscription *event_sub, *event_sub_tmp;
+	struct devx_async_event_data *entry, *tmp;
 
 	mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock);
 	/* delete the subscriptions which are related to this FD */
@@ -2328,6 +2607,15 @@ static int devx_async_event_close(struct inode *inode, struct file *filp)
 
 	mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock);
 
+	/* free the pending events allocation */
+	if (!ev_file->omit_data) {
+		spin_lock_irq(&ev_file->lock);
+		list_for_each_entry_safe(entry, tmp,
+					 &ev_file->event_list, list)
+			kfree(entry); /* read can't come any more */
+		spin_unlock_irq(&ev_file->lock);
+	}
+
 	uverbs_close_fd(filp);
 	put_device(&ev_file->dev->ib_dev.dev);
 	return 0;
@@ -2363,6 +2651,15 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
 static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
 					    enum rdma_remove_reason why)
 {
+	struct devx_async_event_file *ev_file =
+		container_of(uobj, struct devx_async_event_file,
+			     uobj);
+
+	spin_lock_irq(&ev_file->lock);
+	ev_file->is_destroyed = 1;
+	spin_unlock_irq(&ev_file->lock);
+
+	wake_up_interruptible(&ev_file->poll_wait);
 	return 0;
 };
 
diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
index b44691315d39..7e9900b0e746 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
@@ -67,5 +67,10 @@ enum mlx5_ib_uapi_devx_create_event_channel_flags {
 	MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA = 1 << 0,
 };
 
+struct mlx5_ib_uapi_devx_async_event_hdr {
+	__aligned_u64	cookie;
+	__u8		out_data[];
+};
+
 #endif
 
-- 
cgit v1.2.3


From 0b07ee944701dabcddc294d903b5e8e21c2c5d95 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 4 Jul 2019 13:06:17 +0530
Subject: PM / QOS: Pass request type to dev_pm_qos_{add|remove}_notifier()

In order to use the same set of routines to register notifiers for
different request types, update the existing
dev_pm_qos_{add|remove}_notifier() routines with an additional
parameter: request-type.

For now, it only supports resume-latency request type but will be
extended to frequency limit (min/max) constraints later on.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/pm_qos_interface.txt | 10 ++++++----
 drivers/base/power/domain.c              |  8 +++++---
 drivers/base/power/qos.c                 | 14 ++++++++++++--
 include/linux/pm_qos.h                   | 12 ++++++++----
 4 files changed, 31 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
index 19c5f7b1a7ba..ec7d662d1707 100644
--- a/Documentation/power/pm_qos_interface.txt
+++ b/Documentation/power/pm_qos_interface.txt
@@ -164,12 +164,14 @@ directory.
 Notification mechanisms:
 The per-device PM QoS framework has a per-device notification tree.
 
-int dev_pm_qos_add_notifier(device, notifier):
-Adds a notification callback function for the device.
+int dev_pm_qos_add_notifier(device, notifier, type):
+Adds a notification callback function for the device for a particular request
+type.
+
 The callback is called when the aggregated value of the device constraints list
-is changed (for resume latency device PM QoS only).
+is changed.
 
-int dev_pm_qos_remove_notifier(device, notifier):
+int dev_pm_qos_remove_notifier(device, notifier, type):
 Removes the notification callback function for the device.
 
 
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 33c30c1e6a30..b063bc41b0a9 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -1536,7 +1536,8 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
 	if (ret)
 		genpd_free_dev_data(dev, gpd_data);
 	else
-		dev_pm_qos_add_notifier(dev, &gpd_data->nb);
+		dev_pm_qos_add_notifier(dev, &gpd_data->nb,
+					DEV_PM_QOS_RESUME_LATENCY);
 
 	return ret;
 }
@@ -1569,7 +1570,8 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
 
 	pdd = dev->power.subsys_data->domain_data;
 	gpd_data = to_gpd_data(pdd);
-	dev_pm_qos_remove_notifier(dev, &gpd_data->nb);
+	dev_pm_qos_remove_notifier(dev, &gpd_data->nb,
+				   DEV_PM_QOS_RESUME_LATENCY);
 
 	genpd_lock(genpd);
 
@@ -1597,7 +1599,7 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
 
  out:
 	genpd_unlock(genpd);
-	dev_pm_qos_add_notifier(dev, &gpd_data->nb);
+	dev_pm_qos_add_notifier(dev, &gpd_data->nb, DEV_PM_QOS_RESUME_LATENCY);
 
 	return ret;
 }
diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 6c91f8df1d59..cfd463212513 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -467,6 +467,7 @@ EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request);
  *
  * @dev: target device for the constraint
  * @notifier: notifier block managed by caller.
+ * @type: request type.
  *
  * Will register the notifier into a notification chain that gets called
  * upon changes to the target value for the device.
@@ -474,10 +475,14 @@ EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request);
  * If the device's constraints object doesn't exist when this routine is called,
  * it will be created (or error code will be returned if that fails).
  */
-int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier)
+int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier,
+			    enum dev_pm_qos_req_type type)
 {
 	int ret = 0;
 
+	if (WARN_ON(type != DEV_PM_QOS_RESUME_LATENCY))
+		return -EINVAL;
+
 	mutex_lock(&dev_pm_qos_mtx);
 
 	if (IS_ERR(dev->power.qos))
@@ -500,15 +505,20 @@ EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier);
  *
  * @dev: target device for the constraint
  * @notifier: notifier block to be removed.
+ * @type: request type.
  *
  * Will remove the notifier from the notification chain that gets called
  * upon changes to the target value.
  */
 int dev_pm_qos_remove_notifier(struct device *dev,
-			       struct notifier_block *notifier)
+			       struct notifier_block *notifier,
+			       enum dev_pm_qos_req_type type)
 {
 	int retval = 0;
 
+	if (WARN_ON(type != DEV_PM_QOS_RESUME_LATENCY))
+		return -EINVAL;
+
 	mutex_lock(&dev_pm_qos_mtx);
 
 	/* Silently return if the constraints object is not present. */
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 6ea1ae373d77..58e8749ceac5 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -146,9 +146,11 @@ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req,
 int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value);
 int dev_pm_qos_remove_request(struct dev_pm_qos_request *req);
 int dev_pm_qos_add_notifier(struct device *dev,
-			    struct notifier_block *notifier);
+			    struct notifier_block *notifier,
+			    enum dev_pm_qos_req_type type);
 int dev_pm_qos_remove_notifier(struct device *dev,
-			       struct notifier_block *notifier);
+			       struct notifier_block *notifier,
+			       enum dev_pm_qos_req_type type);
 void dev_pm_qos_constraints_init(struct device *dev);
 void dev_pm_qos_constraints_destroy(struct device *dev);
 int dev_pm_qos_add_ancestor_request(struct device *dev,
@@ -202,10 +204,12 @@ static inline int dev_pm_qos_update_request(struct dev_pm_qos_request *req,
 static inline int dev_pm_qos_remove_request(struct dev_pm_qos_request *req)
 			{ return 0; }
 static inline int dev_pm_qos_add_notifier(struct device *dev,
-					  struct notifier_block *notifier)
+					  struct notifier_block *notifier,
+					  enum dev_pm_qos_req_type type)
 			{ return 0; }
 static inline int dev_pm_qos_remove_notifier(struct device *dev,
-					     struct notifier_block *notifier)
+					     struct notifier_block *notifier,
+					     enum dev_pm_qos_req_type type)
 			{ return 0; }
 static inline void dev_pm_qos_constraints_init(struct device *dev)
 {
-- 
cgit v1.2.3


From 8262331eaaf751076fb2c781f492bafd8344591d Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 4 Jul 2019 13:06:18 +0530
Subject: PM / QOS: Rename __dev_pm_qos_read_value() and
 dev_pm_qos_raw_read_value()

dev_pm_qos_read_value() will soon need to support more constraint types
(min/max frequency) and will have another argument to it, i.e. type of
the constraint. While that is fine for the existing users of
dev_pm_qos_read_value(), but not that optimal for the callers of
__dev_pm_qos_read_value() and dev_pm_qos_raw_read_value() as all the
callers of these two routines are only looking for resume latency
constraint.

Lets make these two routines care only about the resume latency
constraint and rename them to __dev_pm_qos_resume_latency() and
dev_pm_qos_raw_resume_latency().

Suggested-by: Rafael J. Wysocki <rjw@rjwysocki.net>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain_governor.c |  2 +-
 drivers/base/power/qos.c             | 13 +++++++++----
 drivers/base/power/runtime.c         |  2 +-
 drivers/cpuidle/governor.c           |  2 +-
 include/linux/pm_qos.h               |  8 ++++----
 5 files changed, 16 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c
index 3838045c9277..20e56a5be01f 100644
--- a/drivers/base/power/domain_governor.c
+++ b/drivers/base/power/domain_governor.c
@@ -66,7 +66,7 @@ static bool default_suspend_ok(struct device *dev)
 	td->constraint_changed = false;
 	td->cached_suspend_ok = false;
 	td->effective_constraint_ns = 0;
-	constraint_ns = __dev_pm_qos_read_value(dev);
+	constraint_ns = __dev_pm_qos_resume_latency(dev);
 
 	spin_unlock_irqrestore(&dev->power.lock, flags);
 
diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index cfd463212513..7a0d197f0809 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -90,16 +90,16 @@ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask)
 EXPORT_SYMBOL_GPL(dev_pm_qos_flags);
 
 /**
- * __dev_pm_qos_read_value - Get PM QoS constraint for a given device.
+ * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device.
  * @dev: Device to get the PM QoS constraint value for.
  *
  * This routine must be called with dev->power.lock held.
  */
-s32 __dev_pm_qos_read_value(struct device *dev)
+s32 __dev_pm_qos_resume_latency(struct device *dev)
 {
 	lockdep_assert_held(&dev->power.lock);
 
-	return dev_pm_qos_raw_read_value(dev);
+	return dev_pm_qos_raw_resume_latency(dev);
 }
 
 /**
@@ -112,7 +112,12 @@ s32 dev_pm_qos_read_value(struct device *dev)
 	s32 ret;
 
 	spin_lock_irqsave(&dev->power.lock, flags);
-	ret = __dev_pm_qos_read_value(dev);
+
+	if (IS_ERR_OR_NULL(dev->power.qos))
+		ret = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
+	else
+		ret = pm_qos_read_value(&dev->power.qos->resume_latency);
+
 	spin_unlock_irqrestore(&dev->power.lock, flags);
 
 	return ret;
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 952a1e7057c7..b75335508d2c 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -275,7 +275,7 @@ static int rpm_check_suspend_allowed(struct device *dev)
 	    || (dev->power.request_pending
 			&& dev->power.request == RPM_REQ_RESUME))
 		retval = -EAGAIN;
-	else if (__dev_pm_qos_read_value(dev) == 0)
+	else if (__dev_pm_qos_resume_latency(dev) == 0)
 		retval = -EPERM;
 	else if (dev->power.runtime_status == RPM_SUSPENDED)
 		retval = 1;
diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index 9fddf828a76f..2e3e14192bee 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -110,7 +110,7 @@ int cpuidle_governor_latency_req(unsigned int cpu)
 {
 	int global_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
 	struct device *device = get_cpu_device(cpu);
-	int device_req = dev_pm_qos_raw_read_value(device);
+	int device_req = dev_pm_qos_raw_resume_latency(device);
 
 	return device_req < global_req ? device_req : global_req;
 }
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 58e8749ceac5..5e09d4980786 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -139,7 +139,7 @@ s32 pm_qos_read_value(struct pm_qos_constraints *c);
 #ifdef CONFIG_PM
 enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask);
 enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask);
-s32 __dev_pm_qos_read_value(struct device *dev);
+s32 __dev_pm_qos_resume_latency(struct device *dev);
 s32 dev_pm_qos_read_value(struct device *dev);
 int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req,
 			   enum dev_pm_qos_req_type type, s32 value);
@@ -176,7 +176,7 @@ static inline s32 dev_pm_qos_requested_flags(struct device *dev)
 	return dev->power.qos->flags_req->data.flr.flags;
 }
 
-static inline s32 dev_pm_qos_raw_read_value(struct device *dev)
+static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev)
 {
 	return IS_ERR_OR_NULL(dev->power.qos) ?
 		PM_QOS_RESUME_LATENCY_NO_CONSTRAINT :
@@ -189,7 +189,7 @@ static inline enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev,
 static inline enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev,
 							s32 mask)
 			{ return PM_QOS_FLAGS_UNDEFINED; }
-static inline s32 __dev_pm_qos_read_value(struct device *dev)
+static inline s32 __dev_pm_qos_resume_latency(struct device *dev)
 			{ return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; }
 static inline s32 dev_pm_qos_read_value(struct device *dev)
 			{ return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; }
@@ -245,7 +245,7 @@ static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev)
 	return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
 }
 static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return 0; }
-static inline s32 dev_pm_qos_raw_read_value(struct device *dev)
+static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev)
 {
 	return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
 }
-- 
cgit v1.2.3


From 2a79ea5ec53973c8711b54d33ace5c77659dc8f8 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 4 Jul 2019 13:06:19 +0530
Subject: PM / QOS: Pass request type to dev_pm_qos_read_value()

In order to allow dev_pm_qos_read_value() to read values for different
QoS requests, pass request type as a parameter to these routines.

For now, it only supports resume-latency request type but will be
extended to frequency limit (min/max) constraints later on.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/pm_qos_interface.txt |  2 +-
 drivers/base/power/domain_governor.c     |  2 +-
 drivers/base/power/qos.c                 | 17 ++++++++++++-----
 include/linux/pm_qos.h                   | 16 +++++++++++++---
 4 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
index ec7d662d1707..cfcb1df39799 100644
--- a/Documentation/power/pm_qos_interface.txt
+++ b/Documentation/power/pm_qos_interface.txt
@@ -123,7 +123,7 @@ Will remove the element.  After removal it will update the aggregate target and
 call the notification trees if the target was changed as a result of removing
 the request.
 
-s32 dev_pm_qos_read_value(device):
+s32 dev_pm_qos_read_value(device, type):
 Returns the aggregated value for a given device's constraints list.
 
 enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c
index 20e56a5be01f..daa8c7689f7e 100644
--- a/drivers/base/power/domain_governor.c
+++ b/drivers/base/power/domain_governor.c
@@ -33,7 +33,7 @@ static int dev_update_qos_constraint(struct device *dev, void *data)
 		 * take its current PM QoS constraint (that's the only thing
 		 * known at this point anyway).
 		 */
-		constraint_ns = dev_pm_qos_read_value(dev);
+		constraint_ns = dev_pm_qos_read_value(dev, DEV_PM_QOS_RESUME_LATENCY);
 		constraint_ns *= NSEC_PER_USEC;
 	}
 
diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 7a0d197f0809..2461fed0efa0 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -105,18 +105,25 @@ s32 __dev_pm_qos_resume_latency(struct device *dev)
 /**
  * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked).
  * @dev: Device to get the PM QoS constraint value for.
+ * @type: QoS request type.
  */
-s32 dev_pm_qos_read_value(struct device *dev)
+s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type)
 {
+	struct dev_pm_qos *qos = dev->power.qos;
 	unsigned long flags;
 	s32 ret;
 
 	spin_lock_irqsave(&dev->power.lock, flags);
 
-	if (IS_ERR_OR_NULL(dev->power.qos))
-		ret = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
-	else
-		ret = pm_qos_read_value(&dev->power.qos->resume_latency);
+	switch (type) {
+	case DEV_PM_QOS_RESUME_LATENCY:
+		ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT
+			: pm_qos_read_value(&qos->resume_latency);
+		break;
+	default:
+		WARN_ON(1);
+		ret = 0;
+	}
 
 	spin_unlock_irqrestore(&dev->power.lock, flags);
 
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 5e09d4980786..9a21b7ba72ae 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -140,7 +140,7 @@ s32 pm_qos_read_value(struct pm_qos_constraints *c);
 enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask);
 enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask);
 s32 __dev_pm_qos_resume_latency(struct device *dev);
-s32 dev_pm_qos_read_value(struct device *dev);
+s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type);
 int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req,
 			   enum dev_pm_qos_req_type type, s32 value);
 int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value);
@@ -191,8 +191,18 @@ static inline enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev,
 			{ return PM_QOS_FLAGS_UNDEFINED; }
 static inline s32 __dev_pm_qos_resume_latency(struct device *dev)
 			{ return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; }
-static inline s32 dev_pm_qos_read_value(struct device *dev)
-			{ return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; }
+static inline s32 dev_pm_qos_read_value(struct device *dev,
+					enum dev_pm_qos_req_type type)
+{
+	switch (type) {
+	case DEV_PM_QOS_RESUME_LATENCY:
+		return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
+	default:
+		WARN_ON(1);
+		return 0;
+	}
+}
+
 static inline int dev_pm_qos_add_request(struct device *dev,
 					 struct dev_pm_qos_request *req,
 					 enum dev_pm_qos_req_type type,
-- 
cgit v1.2.3


From 208637b37824c8956fe28d277835a403ee35fa84 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 4 Jul 2019 13:06:20 +0530
Subject: PM / QoS: Add support for MIN/MAX frequency constraints

This patch introduces the min-frequency and max-frequency device
constraints, which will be used by the cpufreq core to begin with.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/qos.c | 111 +++++++++++++++++++++++++++++++++++++++++------
 include/linux/pm_qos.h   |  12 +++++
 2 files changed, 109 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 2461fed0efa0..6c90fd7e2ff8 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -120,6 +120,14 @@ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type)
 		ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT
 			: pm_qos_read_value(&qos->resume_latency);
 		break;
+	case DEV_PM_QOS_MIN_FREQUENCY:
+		ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE
+			: pm_qos_read_value(&qos->min_frequency);
+		break;
+	case DEV_PM_QOS_MAX_FREQUENCY:
+		ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE
+			: pm_qos_read_value(&qos->max_frequency);
+		break;
 	default:
 		WARN_ON(1);
 		ret = 0;
@@ -161,6 +169,14 @@ static int apply_constraint(struct dev_pm_qos_request *req,
 			req->dev->power.set_latency_tolerance(req->dev, value);
 		}
 		break;
+	case DEV_PM_QOS_MIN_FREQUENCY:
+		ret = pm_qos_update_target(&qos->min_frequency,
+					   &req->data.pnode, action, value);
+		break;
+	case DEV_PM_QOS_MAX_FREQUENCY:
+		ret = pm_qos_update_target(&qos->max_frequency,
+					   &req->data.pnode, action, value);
+		break;
 	case DEV_PM_QOS_FLAGS:
 		ret = pm_qos_update_flags(&qos->flags, &req->data.flr,
 					  action, value);
@@ -189,12 +205,11 @@ static int dev_pm_qos_constraints_allocate(struct device *dev)
 	if (!qos)
 		return -ENOMEM;
 
-	n = kzalloc(sizeof(*n), GFP_KERNEL);
+	n = kzalloc(3 * sizeof(*n), GFP_KERNEL);
 	if (!n) {
 		kfree(qos);
 		return -ENOMEM;
 	}
-	BLOCKING_INIT_NOTIFIER_HEAD(n);
 
 	c = &qos->resume_latency;
 	plist_head_init(&c->list);
@@ -203,6 +218,7 @@ static int dev_pm_qos_constraints_allocate(struct device *dev)
 	c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
 	c->type = PM_QOS_MIN;
 	c->notifiers = n;
+	BLOCKING_INIT_NOTIFIER_HEAD(n);
 
 	c = &qos->latency_tolerance;
 	plist_head_init(&c->list);
@@ -211,6 +227,24 @@ static int dev_pm_qos_constraints_allocate(struct device *dev)
 	c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT;
 	c->type = PM_QOS_MIN;
 
+	c = &qos->min_frequency;
+	plist_head_init(&c->list);
+	c->target_value = PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE;
+	c->default_value = PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE;
+	c->no_constraint_value = PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE;
+	c->type = PM_QOS_MAX;
+	c->notifiers = ++n;
+	BLOCKING_INIT_NOTIFIER_HEAD(n);
+
+	c = &qos->max_frequency;
+	plist_head_init(&c->list);
+	c->target_value = PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE;
+	c->default_value = PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE;
+	c->no_constraint_value = PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE;
+	c->type = PM_QOS_MIN;
+	c->notifiers = ++n;
+	BLOCKING_INIT_NOTIFIER_HEAD(n);
+
 	INIT_LIST_HEAD(&qos->flags.list);
 
 	spin_lock_irq(&dev->power.lock);
@@ -264,11 +298,25 @@ void dev_pm_qos_constraints_destroy(struct device *dev)
 		apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
 		memset(req, 0, sizeof(*req));
 	}
+
 	c = &qos->latency_tolerance;
 	plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) {
 		apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
 		memset(req, 0, sizeof(*req));
 	}
+
+	c = &qos->min_frequency;
+	plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) {
+		apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE);
+		memset(req, 0, sizeof(*req));
+	}
+
+	c = &qos->max_frequency;
+	plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) {
+		apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
+		memset(req, 0, sizeof(*req));
+	}
+
 	f = &qos->flags;
 	list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) {
 		apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
@@ -380,6 +428,8 @@ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req,
 	switch(req->type) {
 	case DEV_PM_QOS_RESUME_LATENCY:
 	case DEV_PM_QOS_LATENCY_TOLERANCE:
+	case DEV_PM_QOS_MIN_FREQUENCY:
+	case DEV_PM_QOS_MAX_FREQUENCY:
 		curr_value = req->data.pnode.prio;
 		break;
 	case DEV_PM_QOS_FLAGS:
@@ -492,9 +542,6 @@ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier,
 {
 	int ret = 0;
 
-	if (WARN_ON(type != DEV_PM_QOS_RESUME_LATENCY))
-		return -EINVAL;
-
 	mutex_lock(&dev_pm_qos_mtx);
 
 	if (IS_ERR(dev->power.qos))
@@ -502,10 +549,28 @@ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier,
 	else if (!dev->power.qos)
 		ret = dev_pm_qos_constraints_allocate(dev);
 
-	if (!ret)
+	if (ret)
+		goto unlock;
+
+	switch (type) {
+	case DEV_PM_QOS_RESUME_LATENCY:
 		ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers,
 						       notifier);
+		break;
+	case DEV_PM_QOS_MIN_FREQUENCY:
+		ret = blocking_notifier_chain_register(dev->power.qos->min_frequency.notifiers,
+						       notifier);
+		break;
+	case DEV_PM_QOS_MAX_FREQUENCY:
+		ret = blocking_notifier_chain_register(dev->power.qos->max_frequency.notifiers,
+						       notifier);
+		break;
+	default:
+		WARN_ON(1);
+		ret = -EINVAL;
+	}
 
+unlock:
 	mutex_unlock(&dev_pm_qos_mtx);
 	return ret;
 }
@@ -526,20 +591,35 @@ int dev_pm_qos_remove_notifier(struct device *dev,
 			       struct notifier_block *notifier,
 			       enum dev_pm_qos_req_type type)
 {
-	int retval = 0;
-
-	if (WARN_ON(type != DEV_PM_QOS_RESUME_LATENCY))
-		return -EINVAL;
+	int ret = 0;
 
 	mutex_lock(&dev_pm_qos_mtx);
 
 	/* Silently return if the constraints object is not present. */
-	if (!IS_ERR_OR_NULL(dev->power.qos))
-		retval = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers,
-							    notifier);
+	if (IS_ERR_OR_NULL(dev->power.qos))
+		goto unlock;
+
+	switch (type) {
+	case DEV_PM_QOS_RESUME_LATENCY:
+		ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers,
+							 notifier);
+		break;
+	case DEV_PM_QOS_MIN_FREQUENCY:
+		ret = blocking_notifier_chain_unregister(dev->power.qos->min_frequency.notifiers,
+							 notifier);
+		break;
+	case DEV_PM_QOS_MAX_FREQUENCY:
+		ret = blocking_notifier_chain_unregister(dev->power.qos->max_frequency.notifiers,
+							 notifier);
+		break;
+	default:
+		WARN_ON(1);
+		ret = -EINVAL;
+	}
 
+unlock:
 	mutex_unlock(&dev_pm_qos_mtx);
-	return retval;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier);
 
@@ -599,6 +679,9 @@ static void __dev_pm_qos_drop_user_request(struct device *dev,
 		req = dev->power.qos->flags_req;
 		dev->power.qos->flags_req = NULL;
 		break;
+	default:
+		WARN_ON(1);
+		return;
 	}
 	__dev_pm_qos_remove_request(req);
 	kfree(req);
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 9a21b7ba72ae..2aebbc5b9950 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -40,6 +40,8 @@ enum pm_qos_flags_status {
 #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT	PM_QOS_LATENCY_ANY
 #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS	PM_QOS_LATENCY_ANY_NS
 #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE	0
+#define PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE	0
+#define PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE	(-1)
 #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT	(-1)
 
 #define PM_QOS_FLAG_NO_POWER_OFF	(1 << 0)
@@ -58,6 +60,8 @@ struct pm_qos_flags_request {
 enum dev_pm_qos_req_type {
 	DEV_PM_QOS_RESUME_LATENCY = 1,
 	DEV_PM_QOS_LATENCY_TOLERANCE,
+	DEV_PM_QOS_MIN_FREQUENCY,
+	DEV_PM_QOS_MAX_FREQUENCY,
 	DEV_PM_QOS_FLAGS,
 };
 
@@ -99,10 +103,14 @@ struct pm_qos_flags {
 struct dev_pm_qos {
 	struct pm_qos_constraints resume_latency;
 	struct pm_qos_constraints latency_tolerance;
+	struct pm_qos_constraints min_frequency;
+	struct pm_qos_constraints max_frequency;
 	struct pm_qos_flags flags;
 	struct dev_pm_qos_request *resume_latency_req;
 	struct dev_pm_qos_request *latency_tolerance_req;
 	struct dev_pm_qos_request *flags_req;
+	struct dev_pm_qos_request *min_frequency_req;
+	struct dev_pm_qos_request *max_frequency_req;
 };
 
 /* Action requested to pm_qos_update_target */
@@ -197,6 +205,10 @@ static inline s32 dev_pm_qos_read_value(struct device *dev,
 	switch (type) {
 	case DEV_PM_QOS_RESUME_LATENCY:
 		return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
+	case DEV_PM_QOS_MIN_FREQUENCY:
+		return PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE;
+	case DEV_PM_QOS_MAX_FREQUENCY:
+		return PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE;
 	default:
 		WARN_ON(1);
 		return 0;
-- 
cgit v1.2.3


From 2ac295d4f0c095310addbcb03d91d2a4c9f7d435 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 1 Jun 2019 20:48:55 -0400
Subject: convenience helper get_tree_nodev()

counterpart of mount_nodev().  Switch hugetlb and pseudo to it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hugetlbfs/inode.c       | 2 +-
 fs/libfs.c                 | 2 +-
 fs/super.c                 | 8 ++++++++
 include/linux/fs_context.h | 3 +++
 4 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1dcc57189382..a478df035651 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1299,7 +1299,7 @@ static int hugetlbfs_get_tree(struct fs_context *fc)
 	int err = hugetlbfs_validate(fc);
 	if (err)
 		return err;
-	return vfs_get_super(fc, vfs_get_independent_super, hugetlbfs_fill_super);
+	return get_tree_nodev(fc, hugetlbfs_fill_super);
 }
 
 static void hugetlbfs_fs_context_free(struct fs_context *fc)
diff --git a/fs/libfs.c b/fs/libfs.c
index 7e6811ba4edd..c9463dc6a5d4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -270,7 +270,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
 
 static int pseudo_fs_get_tree(struct fs_context *fc)
 {
-	return vfs_get_super(fc, vfs_get_independent_super, pseudo_fs_fill_super);
+	return get_tree_nodev(fc, pseudo_fs_fill_super);
 }
 
 static void pseudo_fs_free(struct fs_context *fc)
diff --git a/fs/super.c b/fs/super.c
index ca2302501d32..3318225b0878 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1198,6 +1198,14 @@ int vfs_get_super(struct fs_context *fc,
 }
 EXPORT_SYMBOL(vfs_get_super);
 
+int get_tree_nodev(struct fs_context *fc,
+		  int (*fill_super)(struct super_block *sb,
+				    struct fs_context *fc))
+{
+	return vfs_get_super(fc, vfs_get_independent_super, fill_super);
+}
+EXPORT_SYMBOL(get_tree_nodev);
+
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
 {
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index c995b852ba40..38b1ec918a4e 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -151,6 +151,9 @@ extern int vfs_get_super(struct fs_context *fc,
 			 enum vfs_get_super_keying keying,
 			 int (*fill_super)(struct super_block *sb,
 					   struct fs_context *fc));
+extern int get_tree_nodev(struct fs_context *fc,
+			 int (*fill_super)(struct super_block *sb,
+					   struct fs_context *fc));
 
 extern const struct file_operations fscontext_fops;
 
-- 
cgit v1.2.3


From c23a0bbab30cc1714b6b1d6a1c153a5ccab3f0d8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 22 May 2019 21:23:39 -0400
Subject: convenience helper: get_tree_single()

counterpart of mount_single(); switch fusectl to it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fuse/control.c          | 2 +-
 fs/super.c                 | 8 ++++++++
 include/linux/fs_context.h | 3 +++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 14ce1e47f980..c23f6f243ad4 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -346,7 +346,7 @@ static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx)
 
 static int fuse_ctl_get_tree(struct fs_context *fc)
 {
-	return vfs_get_super(fc, vfs_get_single_super, fuse_ctl_fill_super);
+	return get_tree_single(fc, fuse_ctl_fill_super);
 }
 
 static const struct fs_context_operations fuse_ctl_context_ops = {
diff --git a/fs/super.c b/fs/super.c
index 3318225b0878..113c58f19425 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1206,6 +1206,14 @@ int get_tree_nodev(struct fs_context *fc,
 }
 EXPORT_SYMBOL(get_tree_nodev);
 
+int get_tree_single(struct fs_context *fc,
+		  int (*fill_super)(struct super_block *sb,
+				    struct fs_context *fc))
+{
+	return vfs_get_super(fc, vfs_get_single_super, fill_super);
+}
+EXPORT_SYMBOL(get_tree_single);
+
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
 {
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 38b1ec918a4e..1775969e000d 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -154,6 +154,9 @@ extern int vfs_get_super(struct fs_context *fc,
 extern int get_tree_nodev(struct fs_context *fc,
 			 int (*fill_super)(struct super_block *sb,
 					   struct fs_context *fc));
+extern int get_tree_single(struct fs_context *fc,
+			 int (*fill_super)(struct super_block *sb,
+					   struct fs_context *fc));
 
 extern const struct file_operations fscontext_fops;
 
-- 
cgit v1.2.3


From 14a253ce4210cd2ef133b392062477e9d656db4a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 30 May 2019 15:59:57 -0400
Subject: init_rootfs(): don't bother with init_ramfs_fs()

the only thing done by the latter is making ramfs visible
to mount(2); we don't need it there - rootfs is separate
and, in fact, made visible to mount(2) in the same init_rootfs().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ramfs/inode.c      | 6 +-----
 include/linux/ramfs.h | 1 -
 init/do_mounts.c      | 2 --
 3 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 11201b2d06b9..733c6b4193dc 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -266,12 +266,8 @@ static struct file_system_type ramfs_fs_type = {
 	.fs_flags	= FS_USERNS_MOUNT,
 };
 
-int __init init_ramfs_fs(void)
+static int __init init_ramfs_fs(void)
 {
-	static unsigned long once;
-
-	if (test_and_set_bit(0, &once))
-		return 0;
 	return register_filesystem(&ramfs_fs_type);
 }
 fs_initcall(init_ramfs_fs);
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index 5ef7d54caac2..ee582bdb7fda 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -19,7 +19,6 @@ extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize);
 
 extern const struct file_operations ramfs_file_operations;
 extern const struct vm_operations_struct generic_file_vm_ops;
-extern int __init init_ramfs_fs(void);
 
 int ramfs_fill_super(struct super_block *sb, void *data, int silent);
 
diff --git a/init/do_mounts.c b/init/do_mounts.c
index f8c230c77035..c170d8b309b1 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -658,8 +658,6 @@ int __init init_rootfs(void)
 		(!root_fs_names || strstr(root_fs_names, "tmpfs"))) {
 		err = shmem_init();
 		is_tmpfs = true;
-	} else {
-		err = init_ramfs_fs();
 	}
 
 	if (err)
-- 
cgit v1.2.3


From fd3e007f6c6a0f677e4ee8aca4b9bab8ad6cab9a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 30 May 2019 17:48:35 -0400
Subject: don't bother with registering rootfs

init_mount_tree() can get to rootfs_fs_type directly and that simplifies
a lot of things.  We don't need to register it, we don't need to look
it up *and* we don't need to bother with preventing subsequent userland
mounts.  That's the way we should've done that from the very beginning.

There is a user-visible change, namely the disappearance of "rootfs"
from /proc/filesystems.  Note that it's been unmountable all along
and it didn't show up in /proc/mounts; however, it *is* a user-visible
change and theoretically some script might've been using its presence
in /proc/filesystems to tell 2.4.11+ from earlier kernels.

*IF* any complaints about behaviour change do show up, we could fake
it in /proc/filesystems.  I very much doubt we'll have to, though.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c       |  7 +------
 include/linux/init.h |  3 +++
 init/do_mounts.c     | 15 ++-------------
 3 files changed, 6 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index 1141641dff96..2db2f4c36c50 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3686,13 +3686,8 @@ static void __init init_mount_tree(void)
 	struct mount *m;
 	struct mnt_namespace *ns;
 	struct path root;
-	struct file_system_type *type;
 
-	type = get_fs_type("rootfs");
-	if (!type)
-		panic("Can't find rootfs type");
-	mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
-	put_filesystem(type);
+	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");
 
diff --git a/include/linux/init.h b/include/linux/init.h
index 5255069f5a9f..cbe93521397e 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -137,6 +137,8 @@ extern initcall_entry_t __con_initcall_start[], __con_initcall_end[];
 /* Used for contructor calls. */
 typedef void (*ctor_fn_t)(void);
 
+struct file_system_type;
+
 /* Defined in init/main.c */
 extern int do_one_initcall(initcall_t fn);
 extern char __initdata boot_command_line[];
@@ -147,6 +149,7 @@ extern unsigned int reset_devices;
 void setup_arch(char **);
 void prepare_namespace(void);
 int __init init_rootfs(void);
+extern struct file_system_type rootfs_fs_type;
 
 #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
 extern bool rodata_enabled;
diff --git a/init/do_mounts.c b/init/do_mounts.c
index c170d8b309b1..e7f0b0f18cce 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -629,19 +629,15 @@ static bool is_tmpfs;
 static struct dentry *rootfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	static unsigned long once;
 	void *fill = ramfs_fill_super;
 
-	if (test_and_set_bit(0, &once))
-		return ERR_PTR(-ENODEV);
-
 	if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs)
 		fill = shmem_fill_super;
 
 	return mount_nodev(fs_type, flags, data, fill);
 }
 
-static struct file_system_type rootfs_fs_type = {
+struct file_system_type rootfs_fs_type = {
 	.name		= "rootfs",
 	.mount		= rootfs_mount,
 	.kill_sb	= kill_litter_super,
@@ -649,19 +645,12 @@ static struct file_system_type rootfs_fs_type = {
 
 int __init init_rootfs(void)
 {
-	int err = register_filesystem(&rootfs_fs_type);
-
-	if (err)
-		return err;
+	int err = 0;
 
 	if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] &&
 		(!root_fs_names || strstr(root_fs_names, "tmpfs"))) {
 		err = shmem_init();
 		is_tmpfs = true;
 	}
-
-	if (err)
-		unregister_filesystem(&rootfs_fs_type);
-
 	return err;
 }
-- 
cgit v1.2.3


From 33488845f211afcdb7e5c00a3152890e06cdc78e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 31 May 2019 20:09:15 -0400
Subject: constify ksys_mount() string arguments

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/base/devtmpfs.c  | 3 +--
 fs/namespace.c           | 4 ++--
 include/linux/syscalls.h | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 0dbc43068eeb..ba5c80903efe 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -357,8 +357,7 @@ int devtmpfs_mount(const char *mntdir)
 	if (!thread)
 		return 0;
 
-	err = ksys_mount("devtmpfs", (char *)mntdir, "devtmpfs", MS_SILENT,
-			 NULL);
+	err = ksys_mount("devtmpfs", mntdir, "devtmpfs", MS_SILENT, NULL);
 	if (err)
 		printk(KERN_INFO "devtmpfs: error mounting %i\n", err);
 	else
diff --git a/fs/namespace.c b/fs/namespace.c
index 2db2f4c36c50..e272c2403014 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3295,8 +3295,8 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name)
 }
 EXPORT_SYMBOL(mount_subtree);
 
-int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
-	       unsigned long flags, void __user *data)
+int ksys_mount(const char __user *dev_name, const char __user *dir_name,
+	       const char __user *type, unsigned long flags, void __user *data)
 {
 	int ret;
 	char *kernel_type;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..2a0ac10a6f95 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1228,8 +1228,8 @@ asmlinkage long sys_ni_syscall(void);
  * the ksys_xyzyyz() functions prototyped below.
  */
 
-int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
-	       unsigned long flags, void __user *data);
+int ksys_mount(const char __user *dev_name, const char __user *dir_name,
+	       const char __user *type, unsigned long flags, void __user *data);
 int ksys_umount(char __user *name, int flags);
 int ksys_dup(unsigned int fildes);
 int ksys_chroot(const char __user *filename);
-- 
cgit v1.2.3


From 037f11b4752f717201143a1dc5d6acf3cb71ddfa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 1 Jun 2019 18:09:44 -0400
Subject: mnt_init(): call shmem_init() unconditionally

No point having two call sites (earlier in init_rootfs() from
mnt_init() in case we are going to use shmem-style rootfs,
later from do_basic_setup() unconditionally), along with the
logics in shmem_init() itself to make the second call a no-op...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c       | 2 ++
 include/linux/init.h | 2 +-
 init/do_mounts.c     | 9 ++-------
 init/main.c          | 1 -
 mm/shmem.c           | 4 ----
 5 files changed, 5 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index e272c2403014..e6990f3d526d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -29,6 +29,7 @@
 #include <linux/sched/task.h>
 #include <uapi/linux/mount.h>
 #include <linux/fs_context.h>
+#include <linux/shmem_fs.h>
 
 #include "pnode.h"
 #include "internal.h"
@@ -3740,6 +3741,7 @@ void __init mnt_init(void)
 	fs_kobj = kobject_create_and_add("fs", NULL);
 	if (!fs_kobj)
 		printk(KERN_WARNING "%s: kobj create error\n", __func__);
+	shmem_init();
 	init_rootfs();
 	init_mount_tree();
 }
diff --git a/include/linux/init.h b/include/linux/init.h
index cbe93521397e..212fc9e2f691 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -148,7 +148,7 @@ extern unsigned int reset_devices;
 /* used by init/main.c */
 void setup_arch(char **);
 void prepare_namespace(void);
-int __init init_rootfs(void);
+void __init init_rootfs(void);
 extern struct file_system_type rootfs_fs_type;
 
 #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
diff --git a/init/do_mounts.c b/init/do_mounts.c
index e7f0b0f18cce..864c032e995d 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -643,14 +643,9 @@ struct file_system_type rootfs_fs_type = {
 	.kill_sb	= kill_litter_super,
 };
 
-int __init init_rootfs(void)
+void __init init_rootfs(void)
 {
-	int err = 0;
-
 	if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] &&
-		(!root_fs_names || strstr(root_fs_names, "tmpfs"))) {
-		err = shmem_init();
+		(!root_fs_names || strstr(root_fs_names, "tmpfs")))
 		is_tmpfs = true;
-	}
-	return err;
 }
diff --git a/init/main.c b/init/main.c
index 5a2c69b4d7b3..4dbc7243557e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1000,7 +1000,6 @@ static void __init do_initcalls(void)
 static void __init do_basic_setup(void)
 {
 	cpuset_init_smp();
-	shmem_init();
 	driver_init();
 	init_irq_proc();
 	do_ctors();
diff --git a/mm/shmem.c b/mm/shmem.c
index 1bb3b8dc8bb2..1f67ec9e2062 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3775,10 +3775,6 @@ int __init shmem_init(void)
 {
 	int error;
 
-	/* If rootfs called this, don't re-init */
-	if (shmem_inode_cachep)
-		return 0;
-
 	shmem_init_inodecache();
 
 	error = register_filesystem(&shmem_fs_type);
-- 
cgit v1.2.3


From 7ade1ff96c7aa7e10445688a433d7ae39a13c6c9 Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:31 +0300
Subject: RDMA/restrack: Introduce statistic counter

Introduce statistic counter as a new resource. It allows a user to monitor
specific objects (e.g., QPs) by binding to a counter.

In some cases a user counter resource is created with task other then
"current", because its creation is done as part of rdmatool call.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/restrack.c | 22 +++++++++++++++++-----
 include/rdma/rdma_counter.h        | 18 ++++++++++++++++++
 include/rdma/restrack.h            |  4 ++++
 3 files changed, 39 insertions(+), 5 deletions(-)
 create mode 100644 include/rdma/rdma_counter.h

(limited to 'include')

diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 3b5ff2f7b5f8..95573f292aae 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -6,6 +6,7 @@
 #include <rdma/rdma_cm.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/restrack.h>
+#include <rdma/rdma_counter.h>
 #include <linux/mutex.h>
 #include <linux/sched/task.h>
 #include <linux/pid_namespace.h>
@@ -45,6 +46,7 @@ static const char *type2str(enum rdma_restrack_type type)
 		[RDMA_RESTRACK_CM_ID] = "CM_ID",
 		[RDMA_RESTRACK_MR] = "MR",
 		[RDMA_RESTRACK_CTX] = "CTX",
+		[RDMA_RESTRACK_COUNTER] = "COUNTER",
 	};
 
 	return names[type];
@@ -169,6 +171,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
 		return container_of(res, struct ib_mr, res)->device;
 	case RDMA_RESTRACK_CTX:
 		return container_of(res, struct ib_ucontext, res)->device;
+	case RDMA_RESTRACK_COUNTER:
+		return container_of(res, struct rdma_counter, res)->device;
 	default:
 		WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
 		return NULL;
@@ -203,15 +207,22 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res)
 
 	kref_init(&res->kref);
 	init_completion(&res->comp);
-	if (res->type != RDMA_RESTRACK_QP)
-		ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b,
-				&rt->next_id, GFP_KERNEL);
-	else {
+	if (res->type == RDMA_RESTRACK_QP) {
 		/* Special case to ensure that LQPN points to right QP */
 		struct ib_qp *qp = container_of(res, struct ib_qp, res);
 
 		ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL);
 		res->id = ret ? 0 : qp->qp_num;
+	} else if (res->type == RDMA_RESTRACK_COUNTER) {
+		/* Special case to ensure that cntn points to right counter */
+		struct rdma_counter *counter;
+
+		counter = container_of(res, struct rdma_counter, res);
+		ret = xa_insert(&rt->xa, counter->id, res, GFP_KERNEL);
+		res->id = ret ? 0 : counter->id;
+	} else {
+		ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b,
+				      &rt->next_id, GFP_KERNEL);
 	}
 
 	if (!ret)
@@ -237,7 +248,8 @@ EXPORT_SYMBOL(rdma_restrack_kadd);
  */
 void rdma_restrack_uadd(struct rdma_restrack_entry *res)
 {
-	if (res->type != RDMA_RESTRACK_CM_ID)
+	if ((res->type != RDMA_RESTRACK_CM_ID) &&
+	    (res->type != RDMA_RESTRACK_COUNTER))
 		res->task = NULL;
 
 	if (!res->task)
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
new file mode 100644
index 000000000000..283ac1a0cdb7
--- /dev/null
+++ b/include/rdma/rdma_counter.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_COUNTER_H_
+#define _RDMA_COUNTER_H_
+
+#include <rdma/ib_verbs.h>
+#include <rdma/restrack.h>
+
+struct rdma_counter {
+	struct rdma_restrack_entry	res;
+	struct ib_device		*device;
+	uint32_t			id;
+	u8				port;
+};
+#endif /* _RDMA_COUNTER_H_ */
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index ecf3c7702a4f..4041a4d96524 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -42,6 +42,10 @@ enum rdma_restrack_type {
 	 * @RDMA_RESTRACK_CTX: Verbs contexts (CTX)
 	 */
 	RDMA_RESTRACK_CTX,
+	/**
+	 * @RDMA_RESTRACK_COUNTER: Statistic Counter
+	 */
+	RDMA_RESTRACK_COUNTER,
 	/**
 	 * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations
 	 */
-- 
cgit v1.2.3


From 413d3347503bc39e17577eaf16451fd492a68558 Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:34 +0300
Subject: RDMA/counter: Add set/clear per-port auto mode support

Add an API to support set/clear per-port auto mode.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/Makefile   |  2 +-
 drivers/infiniband/core/counters.c | 74 ++++++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/device.c   |  5 +++
 include/rdma/ib_verbs.h            |  2 ++
 include/rdma/rdma_counter.h        | 24 +++++++++++++
 include/uapi/rdma/rdma_netlink.h   | 26 ++++++++++++++
 6 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/core/counters.c

(limited to 'include')

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 42f1b2a4f746..09881bd5f12d 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,7 +11,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
-				nldev.o restrack.o
+				nldev.o restrack.o counters.o
 
 ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
new file mode 100644
index 000000000000..6167914fba06
--- /dev/null
+++ b/drivers/infiniband/core/counters.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_counter.h>
+
+#include "core_priv.h"
+#include "restrack.h"
+
+#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE)
+
+static int __counter_set_mode(struct rdma_counter_mode *curr,
+			      enum rdma_nl_counter_mode new_mode,
+			      enum rdma_nl_counter_mask new_mask)
+{
+	if ((new_mode == RDMA_COUNTER_MODE_AUTO) &&
+	    ((new_mask & (~ALL_AUTO_MODE_MASKS)) ||
+	     (curr->mode != RDMA_COUNTER_MODE_NONE)))
+		return -EINVAL;
+
+	curr->mode = new_mode;
+	curr->mask = new_mask;
+	return 0;
+}
+
+/**
+ * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode
+ *
+ * When @on is true, the @mask must be set
+ */
+int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
+			       bool on, enum rdma_nl_counter_mask mask)
+{
+	struct rdma_port_counter *port_counter;
+	int ret;
+
+	port_counter = &dev->port_data[port].port_counter;
+	mutex_lock(&port_counter->lock);
+	if (on) {
+		ret = __counter_set_mode(&port_counter->mode,
+					 RDMA_COUNTER_MODE_AUTO, mask);
+	} else {
+		if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) {
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = __counter_set_mode(&port_counter->mode,
+					 RDMA_COUNTER_MODE_NONE, 0);
+	}
+
+out:
+	mutex_unlock(&port_counter->lock);
+	return ret;
+}
+
+void rdma_counter_init(struct ib_device *dev)
+{
+	struct rdma_port_counter *port_counter;
+	u32 port;
+
+	if (!dev->ops.alloc_hw_stats || !dev->port_data)
+		return;
+
+	rdma_for_each_port(dev, port) {
+		port_counter = &dev->port_data[port].port_counter;
+		port_counter->mode.mode = RDMA_COUNTER_MODE_NONE;
+		mutex_init(&port_counter->lock);
+	}
+}
+
+void rdma_counter_release(struct ib_device *dev)
+{
+}
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 8a6ccb936dfe..6579865e4866 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -46,6 +46,7 @@
 #include <rdma/rdma_netlink.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
 
 #include "core_priv.h"
 #include "restrack.h"
@@ -492,10 +493,12 @@ static void ib_device_release(struct device *device)
 	if (dev->port_data) {
 		ib_cache_release_one(dev);
 		ib_security_release_port_pkey_list(dev);
+		rdma_counter_release(dev);
 		kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
 				       pdata[0]),
 			  rcu_head);
 	}
+
 	xa_destroy(&dev->compat_devs);
 	xa_destroy(&dev->client_data);
 	kfree_rcu(dev, rcu_head);
@@ -1316,6 +1319,8 @@ int ib_register_device(struct ib_device *device, const char *name)
 
 	ib_device_register_rdmacg(device);
 
+	rdma_counter_init(device);
+
 	/*
 	 * Ensure that ADD uevent is not fired because it
 	 * is too early amd device is not initialized yet.
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 26e9c2594913..3d19c056fbc0 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -62,6 +62,7 @@
 #include <linux/irqflags.h>
 #include <linux/preempt.h>
 #include <uapi/rdma/ib_user_verbs.h>
+#include <rdma/rdma_counter.h>
 #include <rdma/restrack.h>
 #include <rdma/signature.h>
 #include <uapi/rdma/rdma_user_ioctl.h>
@@ -2119,6 +2120,7 @@ struct ib_port_data {
 	spinlock_t netdev_lock;
 	struct net_device __rcu *netdev;
 	struct hlist_node ndev_hash_link;
+	struct rdma_port_counter port_counter;
 };
 
 /* rdma netdev type - specifies protocol type */
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index 283ac1a0cdb7..8dd2619c015d 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -6,8 +6,26 @@
 #ifndef _RDMA_COUNTER_H_
 #define _RDMA_COUNTER_H_
 
+#include <linux/mutex.h>
+
 #include <rdma/ib_verbs.h>
 #include <rdma/restrack.h>
+#include <rdma/rdma_netlink.h>
+
+struct auto_mode_param {
+	int qp_type;
+};
+
+struct rdma_counter_mode {
+	enum rdma_nl_counter_mode mode;
+	enum rdma_nl_counter_mask mask;
+	struct auto_mode_param param;
+};
+
+struct rdma_port_counter {
+	struct rdma_counter_mode mode;
+	struct mutex lock;
+};
 
 struct rdma_counter {
 	struct rdma_restrack_entry	res;
@@ -15,4 +33,10 @@ struct rdma_counter {
 	uint32_t			id;
 	u8				port;
 };
+
+void rdma_counter_init(struct ib_device *dev);
+void rdma_counter_release(struct ib_device *dev);
+int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
+			       bool on, enum rdma_nl_counter_mask mask);
+
 #endif /* _RDMA_COUNTER_H_ */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 650cee8c4bf1..e3cd912e9cef 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -507,4 +507,30 @@ enum rdma_nldev_attr {
 	 */
 	RDMA_NLDEV_ATTR_MAX
 };
+
+/*
+ * Supported counter bind modes. All modes are mutual-exclusive.
+ */
+enum rdma_nl_counter_mode {
+	RDMA_COUNTER_MODE_NONE,
+
+	/*
+	 * A qp is bound with a counter automatically during initialization
+	 * based on the auto mode (e.g., qp type, ...)
+	 */
+	RDMA_COUNTER_MODE_AUTO,
+
+	/*
+	 * Always the end
+	 */
+	RDMA_COUNTER_MODE_MAX,
+};
+
+/*
+ * Supported criteria in counter auto mode.
+ * Currently only "qp type" is supported
+ */
+enum rdma_nl_counter_mask {
+	RDMA_COUNTER_MASK_QP_TYPE = 1,
+};
 #endif /* _UAPI_RDMA_NETLINK_H */
-- 
cgit v1.2.3


From 99fa331dc8629be55ac7a0cca0dc56492070ddac Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:35 +0300
Subject: RDMA/counter: Add "auto" configuration mode support

In auto mode all QPs belong to one category are bind automatically to a
single counter set. Currently only "qp type" is supported.

In this mode the qp counter is set in RST2INIT modification, and when a qp
is destroyed the counter is unbound.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/counters.c | 221 +++++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/device.c   |   3 +
 drivers/infiniband/core/verbs.c    |   9 ++
 include/rdma/ib_verbs.h            |  18 +++
 include/rdma/rdma_counter.h        |   8 ++
 5 files changed, 259 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index 6167914fba06..615ee731a1de 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -54,6 +54,227 @@ out:
 	return ret;
 }
 
+static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
+					       enum rdma_nl_counter_mode mode)
+{
+	struct rdma_counter *counter;
+
+	if (!dev->ops.counter_dealloc)
+		return NULL;
+
+	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	if (!counter)
+		return NULL;
+
+	counter->device    = dev;
+	counter->port      = port;
+	counter->res.type  = RDMA_RESTRACK_COUNTER;
+	counter->mode.mode = mode;
+	kref_init(&counter->kref);
+	mutex_init(&counter->lock);
+
+	return counter;
+}
+
+static void rdma_counter_free(struct rdma_counter *counter)
+{
+	rdma_restrack_del(&counter->res);
+	kfree(counter);
+}
+
+static void auto_mode_init_counter(struct rdma_counter *counter,
+				   const struct ib_qp *qp,
+				   enum rdma_nl_counter_mask new_mask)
+{
+	struct auto_mode_param *param = &counter->mode.param;
+
+	counter->mode.mode = RDMA_COUNTER_MODE_AUTO;
+	counter->mode.mask = new_mask;
+
+	if (new_mask & RDMA_COUNTER_MASK_QP_TYPE)
+		param->qp_type = qp->qp_type;
+}
+
+static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter,
+			    enum rdma_nl_counter_mask auto_mask)
+{
+	struct auto_mode_param *param = &counter->mode.param;
+	bool match = true;
+
+	if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res))
+		return false;
+
+	/* Ensure that counter belong to right PID */
+	if (!rdma_is_kernel_res(&counter->res) &&
+	    !rdma_is_kernel_res(&qp->res) &&
+	    (task_pid_vnr(counter->res.task) != current->pid))
+		return false;
+
+	if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE)
+		match &= (param->qp_type == qp->qp_type);
+
+	return match;
+}
+
+static int __rdma_counter_bind_qp(struct rdma_counter *counter,
+				  struct ib_qp *qp)
+{
+	int ret;
+
+	if (qp->counter)
+		return -EINVAL;
+
+	if (!qp->device->ops.counter_bind_qp)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&counter->lock);
+	ret = qp->device->ops.counter_bind_qp(counter, qp);
+	mutex_unlock(&counter->lock);
+
+	return ret;
+}
+
+static int __rdma_counter_unbind_qp(struct ib_qp *qp)
+{
+	struct rdma_counter *counter = qp->counter;
+	int ret;
+
+	if (!qp->device->ops.counter_unbind_qp)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&counter->lock);
+	ret = qp->device->ops.counter_unbind_qp(qp);
+	mutex_unlock(&counter->lock);
+
+	return ret;
+}
+
+/**
+ * rdma_get_counter_auto_mode - Find the counter that @qp should be bound
+ *     with in auto mode
+ *
+ * Return: The counter (with ref-count increased) if found
+ */
+static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp,
+						       u8 port)
+{
+	struct rdma_port_counter *port_counter;
+	struct rdma_counter *counter = NULL;
+	struct ib_device *dev = qp->device;
+	struct rdma_restrack_entry *res;
+	struct rdma_restrack_root *rt;
+	unsigned long id = 0;
+
+	port_counter = &dev->port_data[port].port_counter;
+	rt = &dev->res[RDMA_RESTRACK_COUNTER];
+	xa_lock(&rt->xa);
+	xa_for_each(&rt->xa, id, res) {
+		if (!rdma_is_visible_in_pid_ns(res))
+			continue;
+
+		counter = container_of(res, struct rdma_counter, res);
+		if ((counter->device != qp->device) || (counter->port != port))
+			goto next;
+
+		if (auto_mode_match(qp, counter, port_counter->mode.mask))
+			break;
+next:
+		counter = NULL;
+	}
+
+	if (counter && !kref_get_unless_zero(&counter->kref))
+		counter = NULL;
+
+	xa_unlock(&rt->xa);
+	return counter;
+}
+
+static void rdma_counter_res_add(struct rdma_counter *counter,
+				 struct ib_qp *qp)
+{
+	if (rdma_is_kernel_res(&qp->res)) {
+		rdma_restrack_set_task(&counter->res, qp->res.kern_name);
+		rdma_restrack_kadd(&counter->res);
+	} else {
+		rdma_restrack_attach_task(&counter->res, qp->res.task);
+		rdma_restrack_uadd(&counter->res);
+	}
+}
+
+static void counter_release(struct kref *kref)
+{
+	struct rdma_counter *counter;
+
+	counter = container_of(kref, struct rdma_counter, kref);
+	counter->device->ops.counter_dealloc(counter);
+	rdma_counter_free(counter);
+}
+
+/**
+ * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on
+ *   the auto-mode rule
+ */
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port)
+{
+	struct rdma_port_counter *port_counter;
+	struct ib_device *dev = qp->device;
+	struct rdma_counter *counter;
+	int ret;
+
+	if (!rdma_is_port_valid(dev, port))
+		return -EINVAL;
+
+	port_counter = &dev->port_data[port].port_counter;
+	if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO)
+		return 0;
+
+	counter = rdma_get_counter_auto_mode(qp, port);
+	if (counter) {
+		ret = __rdma_counter_bind_qp(counter, qp);
+		if (ret) {
+			kref_put(&counter->kref, counter_release);
+			return ret;
+		}
+	} else {
+		counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_AUTO);
+		if (!counter)
+			return -ENOMEM;
+
+		auto_mode_init_counter(counter, qp, port_counter->mode.mask);
+
+		ret = __rdma_counter_bind_qp(counter, qp);
+		if (ret) {
+			rdma_counter_free(counter);
+			return ret;
+		}
+
+		rdma_counter_res_add(counter, qp);
+	}
+
+	return 0;
+}
+
+/**
+ * rdma_counter_unbind_qp - Unbind a qp from a counter
+ * @force:
+ *   true - Decrease the counter ref-count anyway (e.g., qp destroy)
+ */
+int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
+{
+	struct rdma_counter *counter = qp->counter;
+	int ret;
+
+	if (!counter)
+		return -EINVAL;
+
+	ret = __rdma_counter_unbind_qp(qp);
+	if (ret && !force)
+		return ret;
+
+	kref_put(&counter->kref, counter_release);
+	return 0;
+}
+
 void rdma_counter_init(struct ib_device *dev)
 {
 	struct rdma_port_counter *port_counter;
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 6579865e4866..f3181b74c863 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2471,6 +2471,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, alloc_xrcd);
 	SET_DEVICE_OP(dev_ops, attach_mcast);
 	SET_DEVICE_OP(dev_ops, check_mr_status);
+	SET_DEVICE_OP(dev_ops, counter_bind_qp);
+	SET_DEVICE_OP(dev_ops, counter_dealloc);
+	SET_DEVICE_OP(dev_ops, counter_unbind_qp);
 	SET_DEVICE_OP(dev_ops, create_ah);
 	SET_DEVICE_OP(dev_ops, create_counters);
 	SET_DEVICE_OP(dev_ops, create_cq);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 4a04e94a72db..92349bf37589 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1690,6 +1690,14 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
 		}
 	}
 
+	/*
+	 * Bind this qp to a counter automatically based on the rdma counter
+	 * rules. This only set in RST2INIT with port specified
+	 */
+	if (!qp->counter && (attr_mask & IB_QP_PORT) &&
+	    ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT))
+		rdma_counter_bind_qp_auto(qp, attr->port_num);
+
 	ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
 	if (ret)
 		goto out;
@@ -1885,6 +1893,7 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
 	if (!qp->uobject)
 		rdma_rw_cleanup_mrs(qp);
 
+	rdma_counter_unbind_qp(qp, true);
 	rdma_restrack_del(&qp->res);
 	ret = qp->device->ops.destroy_qp(qp, udata);
 	if (!ret) {
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 3d19c056fbc0..0205472eb73a 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1698,6 +1698,9 @@ struct ib_qp {
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
 	struct rdma_restrack_entry     res;
+
+	/* The counter the qp is bind to */
+	struct rdma_counter    *counter;
 };
 
 struct ib_dm {
@@ -2485,6 +2488,21 @@ struct ib_device_ops {
 			 u8 pdata_len);
 	int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog);
 	int (*iw_destroy_listen)(struct iw_cm_id *cm_id);
+	/**
+	 * counter_bind_qp - Bind a QP to a counter.
+	 * @counter - The counter to be bound. If counter->id is zero then
+	 *   the driver needs to allocate a new counter and set counter->id
+	 */
+	int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp);
+	/**
+	 * counter_unbind_qp - Unbind the qp from the dynamically-allocated
+	 *   counter and bind it onto the default one
+	 */
+	int (*counter_unbind_qp)(struct ib_qp *qp);
+	/**
+	 * counter_dealloc -De-allocate the hw counter
+	 */
+	int (*counter_dealloc)(struct rdma_counter *counter);
 
 	DECLARE_RDMA_OBJ_SIZE(ib_ah);
 	DECLARE_RDMA_OBJ_SIZE(ib_cq);
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index 8dd2619c015d..9f93a2403c9c 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -7,11 +7,14 @@
 #define _RDMA_COUNTER_H_
 
 #include <linux/mutex.h>
+#include <linux/pid_namespace.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/restrack.h>
 #include <rdma/rdma_netlink.h>
 
+struct ib_qp;
+
 struct auto_mode_param {
 	int qp_type;
 };
@@ -31,6 +34,9 @@ struct rdma_counter {
 	struct rdma_restrack_entry	res;
 	struct ib_device		*device;
 	uint32_t			id;
+	struct kref			kref;
+	struct rdma_counter_mode	mode;
+	struct mutex			lock;
 	u8				port;
 };
 
@@ -38,5 +44,7 @@ void rdma_counter_init(struct ib_device *dev);
 void rdma_counter_release(struct ib_device *dev);
 int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
 			       bool on, enum rdma_nl_counter_mask mask);
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port);
+int rdma_counter_unbind_qp(struct ib_qp *qp, bool force);
 
 #endif /* _RDMA_COUNTER_H_ */
-- 
cgit v1.2.3


From d14133dd41614aaaac1fa0505c7dab01f4211d2c Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:36 +0300
Subject: IB/mlx5: Support set qp counter

Support bind a qp with counter. If counter is null then bind the qp to the
default counter. Different QP state has different operation:

- RESET: Set the counter field so that it will take effective during
  RST2INIT change;
- RTS: Issue an RTS2RTS change to update the QP counter;
- Other: Set the counter field and mark the counter_pending flag, when QP
  is moved to RTS state and this flag is set, then issue an RTS2RTS
  modification to update the counter.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  6 +++
 drivers/infiniband/hw/mlx5/qp.c      | 76 +++++++++++++++++++++++++++++++++++-
 include/linux/mlx5/qp.h              |  1 +
 3 files changed, 81 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 7373e9da0919..c482f19958b3 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -439,6 +439,10 @@ struct mlx5_ib_qp {
 	u32			flags_en;
 	/* storage for qp sub type when core qp type is IB_QPT_DRIVER */
 	enum ib_qp_type		qp_sub_type;
+	/* A flag to indicate if there's a new counter is configured
+	 * but not take effective
+	 */
+	u32                     counter_pending;
 };
 
 struct mlx5_ib_cq_buf {
@@ -1468,4 +1472,6 @@ void mlx5_ib_put_xlt_emergency_page(void);
 int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
 			struct mlx5_bfreg_info *bfregi, u32 bfregn,
 			bool dyn_bfreg);
+
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
 #endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 8b7a60ada92c..2a97619ed603 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -34,6 +34,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/rdma_counter.h>
 #include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
 #include "ib_rep.h"
@@ -3380,6 +3381,35 @@ static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
 	return tx_port_affinity;
 }
 
+static int __mlx5_ib_qp_set_counter(struct ib_qp *qp,
+				    struct rdma_counter *counter)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	struct mlx5_qp_context context = {};
+	struct mlx5_ib_port *mibport = NULL;
+	struct mlx5_ib_qp_base *base;
+	u32 set_id;
+
+	if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id))
+		return 0;
+
+	if (counter) {
+		set_id = counter->id;
+	} else {
+		mibport = &dev->port[mqp->port - 1];
+		set_id = mibport->cnts.set_id;
+	}
+
+	base = &mqp->trans_qp.base;
+	context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff);
+	context.qp_counter_set_usr_page |= cpu_to_be32(set_id << 24);
+	return mlx5_core_qp_modify(dev->mdev,
+				   MLX5_CMD_OP_RTS2RTS_QP,
+				   MLX5_QP_OPTPAR_COUNTER_SET_ID,
+				   &context, &base->mqp);
+}
+
 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 			       const struct ib_qp_attr *attr, int attr_mask,
 			       enum ib_qp_state cur_state,
@@ -3433,6 +3463,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	struct mlx5_ib_port *mibport = NULL;
 	enum mlx5_qp_state mlx5_cur, mlx5_new;
 	enum mlx5_qp_optpar optpar;
+	u32 set_id = 0;
 	int mlx5_st;
 	int err;
 	u16 op;
@@ -3595,8 +3626,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 			port_num = 0;
 
 		mibport = &dev->port[port_num];
+		if (ibqp->counter)
+			set_id = ibqp->counter->id;
+		else
+			set_id = mibport->cnts.set_id;
 		context->qp_counter_set_usr_page |=
-			cpu_to_be32((u32)(mibport->cnts.set_id) << 24);
+			cpu_to_be32(set_id << 24);
 	}
 
 	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
@@ -3624,7 +3659,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 
 		raw_qp_param.operation = op;
 		if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
-			raw_qp_param.rq_q_ctr_id = mibport->cnts.set_id;
+			raw_qp_param.rq_q_ctr_id = set_id;
 			raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID;
 		}
 
@@ -3701,6 +3736,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 		qp->db.db[MLX5_SND_DBR] = 0;
 	}
 
+	if ((new_state == IB_QPS_RTS) && qp->counter_pending) {
+		err = __mlx5_ib_qp_set_counter(ibqp, ibqp->counter);
+		if (!err)
+			qp->counter_pending = 0;
+	}
+
 out:
 	kfree(context);
 	return err;
@@ -6435,3 +6476,34 @@ void mlx5_ib_drain_rq(struct ib_qp *qp)
 
 	handle_drain_completion(cq, &rdrain, dev);
 }
+
+/**
+ * Bind a qp to a counter. If @counter is NULL then bind the qp to
+ * the default counter
+ */
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter)
+{
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	int err = 0;
+
+	mutex_lock(&mqp->mutex);
+	if (mqp->state == IB_QPS_RESET) {
+		qp->counter = counter;
+		goto out;
+	}
+
+	if (mqp->state == IB_QPS_RTS) {
+		err = __mlx5_ib_qp_set_counter(qp, counter);
+		if (!err)
+			qp->counter = counter;
+
+		goto out;
+	}
+
+	mqp->counter_pending = 1;
+	qp->counter = counter;
+
+out:
+	mutex_unlock(&mqp->mutex);
+	return err;
+}
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 937041101504..ae63b1ae9004 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -71,6 +71,7 @@ enum mlx5_qp_optpar {
 	MLX5_QP_OPTPAR_CQN_RCV			= 1 << 19,
 	MLX5_QP_OPTPAR_DC_HS			= 1 << 20,
 	MLX5_QP_OPTPAR_DC_KEY			= 1 << 21,
+	MLX5_QP_OPTPAR_COUNTER_SET_ID		= 1 << 25,
 };
 
 enum mlx5_qp_state {
-- 
cgit v1.2.3


From b47ae6f803b727952dfb37afd83e51c465147b85 Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:39 +0300
Subject: RDMA/nldev: Allow counter auto mode configration through RDMA netlink

Provide an option to enable/disable per-port counter auto mode through
RDMA netlink. Limit it to users with ADMIN capability only.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c  | 78 ++++++++++++++++++++++++++++++++++++++++
 include/uapi/rdma/rdma_netlink.h |  8 +++++
 2 files changed, 86 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index d9ebfb50962b..9a4cf285f447 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -126,6 +126,9 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_RES_USECNT]		= { .type = NLA_U64 },
 	[RDMA_NLDEV_ATTR_SM_LID]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_SUBNET_PREFIX]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]	= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_STAT_MODE]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_STAT_RES]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID]	= { .type = NLA_U64 },
 	[RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID]	= { .type = NLA_U32 },
 	[RDMA_NLDEV_NET_NS_FD]			= { .type = NLA_U32 },
@@ -1482,6 +1485,78 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return err;
 }
 
+static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	u32 index, port, mode, mask = 0;
+	struct ib_device *device;
+	struct sk_buff *msg;
+	int ret;
+
+	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	/* Currently only counter for QP is supported */
+	if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_MODE])
+		return -EINVAL;
+
+	if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
+	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	if (!rdma_is_port_valid(device, port)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NLDEV_CMD_STAT_SET),
+			0, 0);
+
+	mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
+	if (mode != RDMA_COUNTER_MODE_AUTO) {
+		ret = -EMSGSIZE;
+		goto err_msg;
+	}
+
+	if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
+		mask = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+
+	ret = rdma_counter_set_auto_mode(device, port,
+					 mask ? true : false, mask);
+	if (ret)
+		goto err_msg;
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) {
+		ret = -EMSGSIZE;
+		goto err_msg;
+	}
+
+	nlmsg_end(msg, nlh);
+	ib_device_put(device);
+	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_msg:
+	nlmsg_free(msg);
+err:
+	ib_device_put(device);
+	return ret;
+}
+
 static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 	[RDMA_NLDEV_CMD_GET] = {
 		.doit = nldev_get_doit,
@@ -1535,6 +1610,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 	},
 	[RDMA_NLDEV_CMD_SYS_SET] = {
 		.doit = nldev_set_sys_set_doit,
+	},
+	[RDMA_NLDEV_CMD_STAT_SET] = {
+		.doit = nldev_stat_set_doit,
 		.flags = RDMA_NL_ADMIN_PERM,
 	},
 };
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index e3cd912e9cef..0cb47d23fd86 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -281,6 +281,8 @@ enum rdma_nldev_command {
 
 	RDMA_NLDEV_CMD_GET_CHARDEV,
 
+	RDMA_NLDEV_CMD_STAT_SET,
+
 	RDMA_NLDEV_NUM_OPS
 };
 
@@ -488,6 +490,12 @@ enum rdma_nldev_attr {
 	 * File descriptor handle of the net namespace object
 	 */
 	RDMA_NLDEV_NET_NS_FD,			/* u32 */
+	/*
+	 * Counter-specific attributes.
+	 */
+	RDMA_NLDEV_ATTR_STAT_MODE,		/* u32 */
+	RDMA_NLDEV_ATTR_STAT_RES,		/* u32 */
+	RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK,	/* u32 */
 
 	/*
 	 * Information about a chardev.
-- 
cgit v1.2.3


From c4ffee7c9bdba7b189df3251e375c4c7e93a91ac Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:40 +0300
Subject: RDMA/netlink: Implement counter dumpit calback

This patch adds the ability to return all available counters together with
their properties and hwstats.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/counters.c |  26 ++++-
 drivers/infiniband/core/device.c   |   2 +
 drivers/infiniband/core/nldev.c    | 213 +++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h            |  10 ++
 include/rdma/rdma_counter.h        |   3 +
 include/uapi/rdma/rdma_netlink.h   |  22 ++--
 6 files changed, 268 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index 615ee731a1de..3741b9e5126a 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -59,7 +59,7 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
 {
 	struct rdma_counter *counter;
 
-	if (!dev->ops.counter_dealloc)
+	if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats)
 		return NULL;
 
 	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
@@ -69,16 +69,25 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
 	counter->device    = dev;
 	counter->port      = port;
 	counter->res.type  = RDMA_RESTRACK_COUNTER;
+	counter->stats     = dev->ops.counter_alloc_stats(counter);
+	if (!counter->stats)
+		goto err_stats;
+
 	counter->mode.mode = mode;
 	kref_init(&counter->kref);
 	mutex_init(&counter->lock);
 
 	return counter;
+
+err_stats:
+	kfree(counter);
+	return NULL;
 }
 
 static void rdma_counter_free(struct rdma_counter *counter)
 {
 	rdma_restrack_del(&counter->res);
+	kfree(counter->stats);
 	kfree(counter);
 }
 
@@ -275,6 +284,21 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
 	return 0;
 }
 
+int rdma_counter_query_stats(struct rdma_counter *counter)
+{
+	struct ib_device *dev = counter->device;
+	int ret;
+
+	if (!dev->ops.counter_update_stats)
+		return -EINVAL;
+
+	mutex_lock(&counter->lock);
+	ret = dev->ops.counter_update_stats(counter);
+	mutex_unlock(&counter->lock);
+
+	return ret;
+}
+
 void rdma_counter_init(struct ib_device *dev)
 {
 	struct rdma_port_counter *port_counter;
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index f3181b74c863..bdf61499e6d5 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2471,9 +2471,11 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, alloc_xrcd);
 	SET_DEVICE_OP(dev_ops, attach_mcast);
 	SET_DEVICE_OP(dev_ops, check_mr_status);
+	SET_DEVICE_OP(dev_ops, counter_alloc_stats);
 	SET_DEVICE_OP(dev_ops, counter_bind_qp);
 	SET_DEVICE_OP(dev_ops, counter_dealloc);
 	SET_DEVICE_OP(dev_ops, counter_unbind_qp);
+	SET_DEVICE_OP(dev_ops, counter_update_stats);
 	SET_DEVICE_OP(dev_ops, create_ah);
 	SET_DEVICE_OP(dev_ops, create_counters);
 	SET_DEVICE_OP(dev_ops, create_cq);
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 9a4cf285f447..cebc15b23b15 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -129,6 +129,13 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]	= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_STAT_MODE]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_STAT_RES]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_STAT_COUNTER]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY]	= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]       = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]       = { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY]  = { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING },
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 },
 	[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID]	= { .type = NLA_U64 },
 	[RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID]	= { .type = NLA_U32 },
 	[RDMA_NLDEV_NET_NS_FD]			= { .type = NLA_U32 },
@@ -636,6 +643,152 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 err:	return -EMSGSIZE;
 }
 
+static int fill_stat_counter_mode(struct sk_buff *msg,
+				  struct rdma_counter *counter)
+{
+	struct rdma_counter_mode *m = &counter->mode;
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode))
+		return -EMSGSIZE;
+
+	if (m->mode == RDMA_COUNTER_MODE_AUTO)
+		if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) &&
+		    nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type))
+			return -EMSGSIZE;
+
+	return 0;
+}
+
+static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn)
+{
+	struct nlattr *entry_attr;
+
+	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
+	if (!entry_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn))
+		goto err;
+
+	nla_nest_end(msg, entry_attr);
+	return 0;
+
+err:
+	nla_nest_cancel(msg, entry_attr);
+	return -EMSGSIZE;
+}
+
+static int fill_stat_counter_qps(struct sk_buff *msg,
+				 struct rdma_counter *counter)
+{
+	struct rdma_restrack_entry *res;
+	struct rdma_restrack_root *rt;
+	struct nlattr *table_attr;
+	struct ib_qp *qp = NULL;
+	unsigned long id = 0;
+	int ret = 0;
+
+	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP);
+
+	rt = &counter->device->res[RDMA_RESTRACK_QP];
+	xa_lock(&rt->xa);
+	xa_for_each(&rt->xa, id, res) {
+		if (!rdma_is_visible_in_pid_ns(res))
+			continue;
+
+		qp = container_of(res, struct ib_qp, res);
+		if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+			continue;
+
+		if (!qp->counter || (qp->counter->id != counter->id))
+			continue;
+
+		ret = fill_stat_counter_qp_entry(msg, qp->qp_num);
+		if (ret)
+			goto err;
+	}
+
+	xa_unlock(&rt->xa);
+	nla_nest_end(msg, table_attr);
+	return 0;
+
+err:
+	xa_unlock(&rt->xa);
+	nla_nest_cancel(msg, table_attr);
+	return ret;
+}
+
+static int fill_stat_hwcounter_entry(struct sk_buff *msg,
+				     const char *name, u64 value)
+{
+	struct nlattr *entry_attr;
+
+	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY);
+	if (!entry_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,
+			   name))
+		goto err;
+	if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,
+			      value, RDMA_NLDEV_ATTR_PAD))
+		goto err;
+
+	nla_nest_end(msg, entry_attr);
+	return 0;
+
+err:
+	nla_nest_cancel(msg, entry_attr);
+	return -EMSGSIZE;
+}
+
+static int fill_stat_counter_hwcounters(struct sk_buff *msg,
+					struct rdma_counter *counter)
+{
+	struct rdma_hw_stats *st = counter->stats;
+	struct nlattr *table_attr;
+	int i;
+
+	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+	if (!table_attr)
+		return -EMSGSIZE;
+
+	for (i = 0; i < st->num_counters; i++)
+		if (fill_stat_hwcounter_entry(msg, st->names[i], st->value[i]))
+			goto err;
+
+	nla_nest_end(msg, table_attr);
+	return 0;
+
+err:
+	nla_nest_cancel(msg, table_attr);
+	return -EMSGSIZE;
+}
+
+static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin,
+				  struct rdma_restrack_entry *res,
+				  uint32_t port)
+{
+	struct rdma_counter *counter =
+		container_of(res, struct rdma_counter, res);
+
+	if (port && port != counter->port)
+		return 0;
+
+	/* Dump it even query failed */
+	rdma_counter_query_stats(counter);
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) ||
+	    fill_res_name_pid(msg, &counter->res) ||
+	    fill_stat_counter_mode(msg, counter) ||
+	    fill_stat_counter_qps(msg, counter) ||
+	    fill_stat_counter_hwcounters(msg, counter))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 			  struct netlink_ext_ack *extack)
 {
@@ -1003,6 +1156,13 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
 		.entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY,
 		.id = RDMA_NLDEV_ATTR_RES_PDN,
 	},
+	[RDMA_RESTRACK_COUNTER] = {
+		.fill_res_func = fill_res_counter_entry,
+		.nldev_cmd = RDMA_NLDEV_CMD_STAT_GET,
+		.nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER,
+		.entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,
+		.id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID,
+	},
 };
 
 static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -1239,6 +1399,7 @@ RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID);
 RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ);
 RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
 RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
+RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER);
 
 static LIST_HEAD(link_ops);
 static DECLARE_RWSEM(link_ops_rwsem);
@@ -1557,6 +1718,54 @@ err:
 	return ret;
 }
 
+static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	int ret;
+
+	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
+		return -EINVAL;
+
+	switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+	case RDMA_NLDEV_ATTR_RES_QP:
+		ret = nldev_res_get_counter_doit(skb, nlh, extack);
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int nldev_stat_get_dumpit(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	int ret;
+
+	ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, NULL);
+	if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
+		return -EINVAL;
+
+	switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+	case RDMA_NLDEV_ATTR_RES_QP:
+		ret = nldev_res_get_counter_dumpit(skb, cb);
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
 static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 	[RDMA_NLDEV_CMD_GET] = {
 		.doit = nldev_get_doit,
@@ -1615,6 +1824,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 		.doit = nldev_stat_set_doit,
 		.flags = RDMA_NL_ADMIN_PERM,
 	},
+	[RDMA_NLDEV_CMD_STAT_GET] = {
+		.doit = nldev_stat_get_doit,
+		.dump = nldev_stat_get_dumpit,
+	},
 };
 
 void __init nldev_init(void)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0205472eb73a..0c5151a12ae4 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2503,6 +2503,16 @@ struct ib_device_ops {
 	 * counter_dealloc -De-allocate the hw counter
 	 */
 	int (*counter_dealloc)(struct rdma_counter *counter);
+	/**
+	 * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in
+	 * the driver initialized data.
+	 */
+	struct rdma_hw_stats *(*counter_alloc_stats)(
+		struct rdma_counter *counter);
+	/**
+	 * counter_update_stats - Query the stats value of this counter
+	 */
+	int (*counter_update_stats)(struct rdma_counter *counter);
 
 	DECLARE_RDMA_OBJ_SIZE(ib_ah);
 	DECLARE_RDMA_OBJ_SIZE(ib_cq);
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index 9f93a2403c9c..f2a5c8efc404 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -37,6 +37,7 @@ struct rdma_counter {
 	struct kref			kref;
 	struct rdma_counter_mode	mode;
 	struct mutex			lock;
+	struct rdma_hw_stats		*stats;
 	u8				port;
 };
 
@@ -47,4 +48,6 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
 int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port);
 int rdma_counter_unbind_qp(struct ib_qp *qp, bool force);
 
+int rdma_counter_query_stats(struct rdma_counter *counter);
+
 #endif /* _RDMA_COUNTER_H_ */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 0cb47d23fd86..18dd88c0add8 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -283,6 +283,8 @@ enum rdma_nldev_command {
 
 	RDMA_NLDEV_CMD_STAT_SET,
 
+	RDMA_NLDEV_CMD_STAT_GET, /* can dump */
+
 	RDMA_NLDEV_NUM_OPS
 };
 
@@ -490,13 +492,6 @@ enum rdma_nldev_attr {
 	 * File descriptor handle of the net namespace object
 	 */
 	RDMA_NLDEV_NET_NS_FD,			/* u32 */
-	/*
-	 * Counter-specific attributes.
-	 */
-	RDMA_NLDEV_ATTR_STAT_MODE,		/* u32 */
-	RDMA_NLDEV_ATTR_STAT_RES,		/* u32 */
-	RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK,	/* u32 */
-
 	/*
 	 * Information about a chardev.
 	 * CHARDEV_TYPE is the name of the chardev ABI (ie uverbs, umad, etc)
@@ -509,6 +504,19 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_CHARDEV_ABI,		/* u64 */
 	RDMA_NLDEV_ATTR_CHARDEV,		/* u64 */
 	RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID,       /* u64 */
+	/*
+	 * Counter-specific attributes.
+	 */
+	RDMA_NLDEV_ATTR_STAT_MODE,		/* u32 */
+	RDMA_NLDEV_ATTR_STAT_RES,		/* u32 */
+	RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK,	/* u32 */
+	RDMA_NLDEV_ATTR_STAT_COUNTER,		/* nested table */
+	RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,	/* nested table */
+	RDMA_NLDEV_ATTR_STAT_COUNTER_ID,	/* u32 */
+	RDMA_NLDEV_ATTR_STAT_HWCOUNTERS,	/* nested table */
+	RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY,	/* nested table */
+	RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,	/* string */
+	RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,	/* u64 */
 
 	/*
 	 * Always the end
-- 
cgit v1.2.3


From f34a55e497e81347ffbdc6e828f123520d33ce5d Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:42 +0300
Subject: RDMA/core: Get sum value of all counters when perform a sysfs stat
 read

Since a QP can only be bound to one counter, then if it is bound to a
separate counter, for backward compatibility purpose, the statistic value
must be:
* stat of default counter
+ stat of all running allocated counters
+ stat of all deallocated counters (history stats)

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/counters.c | 89 ++++++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/sysfs.c    | 10 +++--
 include/rdma/rdma_counter.h        |  2 +
 3 files changed, 98 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index 3741b9e5126a..8810a8a8d1f5 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -158,6 +158,20 @@ static int __rdma_counter_unbind_qp(struct ib_qp *qp)
 	return ret;
 }
 
+static void counter_history_stat_update(const struct rdma_counter *counter)
+{
+	struct ib_device *dev = counter->device;
+	struct rdma_port_counter *port_counter;
+	int i;
+
+	port_counter = &dev->port_data[counter->port].port_counter;
+	if (!port_counter->hstats)
+		return;
+
+	for (i = 0; i < counter->stats->num_counters; i++)
+		port_counter->hstats->value[i] += counter->stats->value[i];
+}
+
 /**
  * rdma_get_counter_auto_mode - Find the counter that @qp should be bound
  *     with in auto mode
@@ -215,6 +229,7 @@ static void counter_release(struct kref *kref)
 	struct rdma_counter *counter;
 
 	counter = container_of(kref, struct rdma_counter, kref);
+	counter_history_stat_update(counter);
 	counter->device->ops.counter_dealloc(counter);
 	rdma_counter_free(counter);
 }
@@ -299,6 +314,55 @@ int rdma_counter_query_stats(struct rdma_counter *counter)
 	return ret;
 }
 
+static u64 get_running_counters_hwstat_sum(struct ib_device *dev,
+					   u8 port, u32 index)
+{
+	struct rdma_restrack_entry *res;
+	struct rdma_restrack_root *rt;
+	struct rdma_counter *counter;
+	unsigned long id = 0;
+	u64 sum = 0;
+
+	rt = &dev->res[RDMA_RESTRACK_COUNTER];
+	xa_lock(&rt->xa);
+	xa_for_each(&rt->xa, id, res) {
+		if (!rdma_restrack_get(res))
+			continue;
+
+		xa_unlock(&rt->xa);
+
+		counter = container_of(res, struct rdma_counter, res);
+		if ((counter->device != dev) || (counter->port != port) ||
+		    rdma_counter_query_stats(counter))
+			goto next;
+
+		sum += counter->stats->value[index];
+
+next:
+		xa_lock(&rt->xa);
+		rdma_restrack_put(res);
+	}
+
+	xa_unlock(&rt->xa);
+	return sum;
+}
+
+/**
+ * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a
+ *   specific port, including the running ones and history data
+ */
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index)
+{
+	struct rdma_port_counter *port_counter;
+	u64 sum;
+
+	port_counter = &dev->port_data[port].port_counter;
+	sum = get_running_counters_hwstat_sum(dev, port, index);
+	sum += port_counter->hstats->value[index];
+
+	return sum;
+}
+
 void rdma_counter_init(struct ib_device *dev)
 {
 	struct rdma_port_counter *port_counter;
@@ -311,9 +375,34 @@ void rdma_counter_init(struct ib_device *dev)
 		port_counter = &dev->port_data[port].port_counter;
 		port_counter->mode.mode = RDMA_COUNTER_MODE_NONE;
 		mutex_init(&port_counter->lock);
+
+		port_counter->hstats = dev->ops.alloc_hw_stats(dev, port);
+		if (!port_counter->hstats)
+			goto fail;
 	}
+
+	return;
+
+fail:
+	rdma_for_each_port(dev, port) {
+		port_counter = &dev->port_data[port].port_counter;
+		kfree(port_counter->hstats);
+		port_counter->hstats = NULL;
+	}
+
+	return;
 }
 
 void rdma_counter_release(struct ib_device *dev)
 {
+	struct rdma_port_counter *port_counter;
+	u32 port;
+
+	if (!dev->ops.alloc_hw_stats)
+		return;
+
+	rdma_for_each_port(dev, port) {
+		port_counter = &dev->port_data[port].port_counter;
+		kfree(port_counter->hstats);
+	}
 }
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index c78d0c9646ae..c59b80e0a740 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -43,6 +43,7 @@
 #include <rdma/ib_mad.h>
 #include <rdma/ib_pma.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
 
 struct ib_port;
 
@@ -800,9 +801,12 @@ static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
 	return 0;
 }
 
-static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf)
+static ssize_t print_hw_stat(struct ib_device *dev, int port_num,
+			     struct rdma_hw_stats *stats, int index, char *buf)
 {
-	return sprintf(buf, "%llu\n", stats->value[index]);
+	u64 v = rdma_counter_get_hwstat_value(dev, port_num, index);
+
+	return sprintf(buf, "%llu\n", stats->value[index] + v);
 }
 
 static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
@@ -828,7 +832,7 @@ static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
 	ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index);
 	if (ret)
 		goto unlock;
-	ret = print_hw_stat(stats, hsa->index, buf);
+	ret = print_hw_stat(dev, hsa->port_num, stats, hsa->index, buf);
 unlock:
 	mutex_unlock(&stats->lock);
 
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index f2a5c8efc404..bf2c3578768f 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -27,6 +27,7 @@ struct rdma_counter_mode {
 
 struct rdma_port_counter {
 	struct rdma_counter_mode mode;
+	struct rdma_hw_stats *hstats;
 	struct mutex lock;
 };
 
@@ -49,5 +50,6 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port);
 int rdma_counter_unbind_qp(struct ib_qp *qp, bool force);
 
 int rdma_counter_query_stats(struct rdma_counter *counter);
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index);
 
 #endif /* _RDMA_COUNTER_H_ */
-- 
cgit v1.2.3


From 1bd8e0a9d0fd1be03d2833a0c15ac676bdf275d8 Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:43 +0300
Subject: RDMA/counter: Allow manual mode configuration support

In manual mode a QP is bound to a counter manually. If counter is not
specified then a new one will be allocated.

Manual mode is enabled when user binds a QP, and disabled when the last
manually bound QP is unbound.

When auto-mode is turned off and there are counters left, manual mode is
enabled so that the user is able to access these counters.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/counters.c | 219 ++++++++++++++++++++++++++++++++++++-
 include/rdma/rdma_counter.h        |   7 ++
 include/uapi/rdma/rdma_netlink.h   |   6 +
 3 files changed, 229 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index 8810a8a8d1f5..0ebe36e9fa7b 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -27,7 +27,9 @@ static int __counter_set_mode(struct rdma_counter_mode *curr,
 /**
  * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode
  *
- * When @on is true, the @mask must be set
+ * When @on is true, the @mask must be set; When @on is false, it goes
+ * into manual mode if there's any counter, so that the user is able to
+ * manually access them.
  */
 int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
 			       bool on, enum rdma_nl_counter_mask mask)
@@ -45,8 +47,13 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
 			ret = -EINVAL;
 			goto out;
 		}
-		ret = __counter_set_mode(&port_counter->mode,
-					 RDMA_COUNTER_MODE_NONE, 0);
+
+		if (port_counter->num_counters)
+			ret = __counter_set_mode(&port_counter->mode,
+						 RDMA_COUNTER_MODE_MANUAL, 0);
+		else
+			ret = __counter_set_mode(&port_counter->mode,
+						 RDMA_COUNTER_MODE_NONE, 0);
 	}
 
 out:
@@ -57,7 +64,9 @@ out:
 static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
 					       enum rdma_nl_counter_mode mode)
 {
+	struct rdma_port_counter *port_counter;
 	struct rdma_counter *counter;
+	int ret;
 
 	if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats)
 		return NULL;
@@ -73,12 +82,27 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
 	if (!counter->stats)
 		goto err_stats;
 
+	port_counter = &dev->port_data[port].port_counter;
+	mutex_lock(&port_counter->lock);
+	if (mode == RDMA_COUNTER_MODE_MANUAL) {
+		ret = __counter_set_mode(&port_counter->mode,
+					 RDMA_COUNTER_MODE_MANUAL, 0);
+		if (ret)
+			goto err_mode;
+	}
+
+	port_counter->num_counters++;
+	mutex_unlock(&port_counter->lock);
+
 	counter->mode.mode = mode;
 	kref_init(&counter->kref);
 	mutex_init(&counter->lock);
 
 	return counter;
 
+err_mode:
+	mutex_unlock(&port_counter->lock);
+	kfree(counter->stats);
 err_stats:
 	kfree(counter);
 	return NULL;
@@ -86,6 +110,18 @@ err_stats:
 
 static void rdma_counter_free(struct rdma_counter *counter)
 {
+	struct rdma_port_counter *port_counter;
+
+	port_counter = &counter->device->port_data[counter->port].port_counter;
+	mutex_lock(&port_counter->lock);
+	port_counter->num_counters--;
+	if (!port_counter->num_counters &&
+	    (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL))
+		__counter_set_mode(&port_counter->mode, RDMA_COUNTER_MODE_NONE,
+				   0);
+
+	mutex_unlock(&port_counter->lock);
+
 	rdma_restrack_del(&counter->res);
 	kfree(counter->stats);
 	kfree(counter);
@@ -363,6 +399,183 @@ u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index)
 	return sum;
 }
 
+static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num)
+{
+	struct rdma_restrack_entry *res = NULL;
+	struct ib_qp *qp = NULL;
+
+	res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num);
+	if (IS_ERR(res))
+		return NULL;
+
+	if (!rdma_is_visible_in_pid_ns(res))
+		goto err;
+
+	qp = container_of(res, struct ib_qp, res);
+	if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+		goto err;
+
+	return qp;
+
+err:
+	rdma_restrack_put(&qp->res);
+	return NULL;
+}
+
+static int rdma_counter_bind_qp_manual(struct rdma_counter *counter,
+				       struct ib_qp *qp)
+{
+	if ((counter->device != qp->device) || (counter->port != qp->port))
+		return -EINVAL;
+
+	return __rdma_counter_bind_qp(counter, qp);
+}
+
+static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev,
+						   u32 counter_id)
+{
+	struct rdma_restrack_entry *res;
+	struct rdma_counter *counter;
+
+	res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id);
+	if (IS_ERR(res))
+		return NULL;
+
+	if (!rdma_is_visible_in_pid_ns(res)) {
+		rdma_restrack_put(res);
+		return NULL;
+	}
+
+	counter = container_of(res, struct rdma_counter, res);
+	kref_get(&counter->kref);
+	rdma_restrack_put(res);
+
+	return counter;
+}
+
+/**
+ * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id
+ */
+int rdma_counter_bind_qpn(struct ib_device *dev, u8 port,
+			  u32 qp_num, u32 counter_id)
+{
+	struct rdma_counter *counter;
+	struct ib_qp *qp;
+	int ret;
+
+	qp = rdma_counter_get_qp(dev, qp_num);
+	if (!qp)
+		return -ENOENT;
+
+	counter = rdma_get_counter_by_id(dev, counter_id);
+	if (!counter) {
+		ret = -ENOENT;
+		goto err;
+	}
+
+	if (counter->res.task != qp->res.task) {
+		ret = -EINVAL;
+		goto err_task;
+	}
+
+	ret = rdma_counter_bind_qp_manual(counter, qp);
+	if (ret)
+		goto err_task;
+
+	rdma_restrack_put(&qp->res);
+	return 0;
+
+err_task:
+	kref_put(&counter->kref, counter_release);
+err:
+	rdma_restrack_put(&qp->res);
+	return ret;
+}
+
+/**
+ * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it
+ *   The id of new counter is returned in @counter_id
+ */
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
+				u32 qp_num, u32 *counter_id)
+{
+	struct rdma_counter *counter;
+	struct ib_qp *qp;
+	int ret;
+
+	if (!rdma_is_port_valid(dev, port))
+		return -EINVAL;
+
+	qp = rdma_counter_get_qp(dev, qp_num);
+	if (!qp)
+		return -ENOENT;
+
+	if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_MANUAL);
+	if (!counter) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ret = rdma_counter_bind_qp_manual(counter, qp);
+	if (ret)
+		goto err_bind;
+
+	if (counter_id)
+		*counter_id = counter->id;
+
+	rdma_counter_res_add(counter, qp);
+
+	rdma_restrack_put(&qp->res);
+	return ret;
+
+err_bind:
+	rdma_counter_free(counter);
+err:
+	rdma_restrack_put(&qp->res);
+	return ret;
+}
+
+/**
+ * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter
+ */
+int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
+			    u32 qp_num, u32 counter_id)
+{
+	struct rdma_port_counter *port_counter;
+	struct ib_qp *qp;
+	int ret;
+
+	if (!rdma_is_port_valid(dev, port))
+		return -EINVAL;
+
+	qp = rdma_counter_get_qp(dev, qp_num);
+	if (!qp)
+		return -ENOENT;
+
+	if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	port_counter = &dev->port_data[port].port_counter;
+	if (!qp->counter || qp->counter->id != counter_id ||
+	    port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = rdma_counter_unbind_qp(qp, false);
+
+out:
+	rdma_restrack_put(&qp->res);
+	return ret;
+}
+
 void rdma_counter_init(struct ib_device *dev)
 {
 	struct rdma_port_counter *port_counter;
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index bf2c3578768f..6603e10eb352 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -28,6 +28,7 @@ struct rdma_counter_mode {
 struct rdma_port_counter {
 	struct rdma_counter_mode mode;
 	struct rdma_hw_stats *hstats;
+	unsigned int num_counters;
 	struct mutex lock;
 };
 
@@ -51,5 +52,11 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force);
 
 int rdma_counter_query_stats(struct rdma_counter *counter);
 u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index);
+int rdma_counter_bind_qpn(struct ib_device *dev, u8 port,
+			  u32 qp_num, u32 counter_id);
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
+				u32 qp_num, u32 *counter_id);
+int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
+			    u32 qp_num, u32 counter_id);
 
 #endif /* _RDMA_COUNTER_H_ */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 18dd88c0add8..ec86fab3d040 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -536,6 +536,12 @@ enum rdma_nl_counter_mode {
 	 */
 	RDMA_COUNTER_MODE_AUTO,
 
+	/*
+	 * Which qp are bound with which counter is explicitly specified
+	 * by the user
+	 */
+	RDMA_COUNTER_MODE_MANUAL,
+
 	/*
 	 * Always the end
 	 */
-- 
cgit v1.2.3


From b389327df90530d47931d0f5616b5cd6abb96c96 Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:44 +0300
Subject: RDMA/nldev: Allow counter manual mode configration through RDMA
 netlink

Provide an option to allow users to manually bind a qp with a counter
through RDMA netlink. Limit it to users with ADMIN capability only.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c  | 111 ++++++++++++++++++++++++++++++++++-----
 include/rdma/rdma_counter.h      |   3 ++
 include/uapi/rdma/rdma_netlink.h |   2 +
 3 files changed, 103 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index cebc15b23b15..3d750eca53d5 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -1649,8 +1649,8 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 			       struct netlink_ext_ack *extack)
 {
+	u32 index, port, mode, mask = 0, qpn, cntn = 0;
 	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
-	u32 index, port, mode, mask = 0;
 	struct ib_device *device;
 	struct sk_buff *msg;
 	int ret;
@@ -1688,30 +1688,111 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 			0, 0);
 
 	mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
-	if (mode != RDMA_COUNTER_MODE_AUTO) {
-		ret = -EMSGSIZE;
-		goto err_msg;
+	if (mode == RDMA_COUNTER_MODE_AUTO) {
+		if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
+			mask = nla_get_u32(
+				tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+
+		ret = rdma_counter_set_auto_mode(device, port,
+						 mask ? true : false, mask);
+		if (ret)
+			goto err_msg;
+	} else {
+		qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+		if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) {
+			cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+			ret = rdma_counter_bind_qpn(device, port, qpn, cntn);
+		} else {
+			ret = rdma_counter_bind_qpn_alloc(device, port,
+							  qpn, &cntn);
+		}
+		if (ret)
+			goto err_msg;
+
+		if (fill_nldev_handle(msg, device) ||
+		    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+		    nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+		    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
+			ret = -EMSGSIZE;
+			goto err_fill;
+		}
 	}
 
-	if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
-		mask = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+	nlmsg_end(msg, nlh);
+	ib_device_put(device);
+	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_fill:
+	rdma_counter_unbind_qpn(device, port, qpn, cntn);
+err_msg:
+	nlmsg_free(msg);
+err:
+	ib_device_put(device);
+	return ret;
+}
+
+static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	struct sk_buff *msg;
+	u32 index, port, qpn, cntn;
+	int ret;
 
-	ret = rdma_counter_set_auto_mode(device, port,
-					 mask ? true : false, mask);
+	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] ||
+	    !tb[RDMA_NLDEV_ATTR_RES_LQPN])
+		return -EINVAL;
+
+	if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
+	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	if (!rdma_is_port_valid(device, port)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NLDEV_CMD_STAT_SET),
+			0, 0);
+
+	cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+	qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+	ret = rdma_counter_unbind_qpn(device, port, qpn, cntn);
 	if (ret)
-		goto err_msg;
+		goto err_unbind;
 
-	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode) ||
-	    nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) {
+	if (fill_nldev_handle(msg, device) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
 		ret = -EMSGSIZE;
-		goto err_msg;
+		goto err_fill;
 	}
 
 	nlmsg_end(msg, nlh);
 	ib_device_put(device);
 	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
 
-err_msg:
+err_fill:
+	rdma_counter_bind_qpn(device, port, qpn, cntn);
+err_unbind:
 	nlmsg_free(msg);
 err:
 	ib_device_put(device);
@@ -1828,6 +1909,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 		.doit = nldev_stat_get_doit,
 		.dump = nldev_stat_get_dumpit,
 	},
+	[RDMA_NLDEV_CMD_STAT_DEL] = {
+		.doit = nldev_stat_del_doit,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 };
 
 void __init nldev_init(void)
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index 6603e10eb352..68827700ba95 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -58,5 +58,8 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
 				u32 qp_num, u32 *counter_id);
 int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
 			    u32 qp_num, u32 counter_id);
+int rdma_counter_get_mode(struct ib_device *dev, u8 port,
+			  enum rdma_nl_counter_mode *mode,
+			  enum rdma_nl_counter_mask *mask);
 
 #endif /* _RDMA_COUNTER_H_ */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index ec86fab3d040..ce6fd66e7aa3 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -285,6 +285,8 @@ enum rdma_nldev_command {
 
 	RDMA_NLDEV_CMD_STAT_GET, /* can dump */
 
+	RDMA_NLDEV_CMD_STAT_DEL,
+
 	RDMA_NLDEV_NUM_OPS
 };
 
-- 
cgit v1.2.3


From 6e7be47a53459ba3d288c3240ccd948fc699c377 Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:46 +0300
Subject: RDMA/nldev: Allow get default counter statistics through RDMA netlink

This patch adds the ability to return the hwstats of per-port default
counters (which can also be queried through sysfs nodes).

Signed-off-by: Mark Zhang <markz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c | 98 ++++++++++++++++++++++++++++++++++++++++-
 drivers/infiniband/core/sysfs.c |  6 +++
 include/rdma/ib_verbs.h         |  1 +
 3 files changed, 104 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 4993f47b0731..a4431ed566b6 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -1799,6 +1799,99 @@ err:
 	return ret;
 }
 
+static int stat_get_doit_default_counter(struct sk_buff *skb,
+					 struct nlmsghdr *nlh,
+					 struct netlink_ext_ack *extack,
+					 struct nlattr *tb[])
+{
+	struct rdma_hw_stats *stats;
+	struct nlattr *table_attr;
+	struct ib_device *device;
+	int ret, num_cnts, i;
+	struct sk_buff *msg;
+	u32 index, port;
+	u64 v;
+
+	if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
+	if (!device->ops.alloc_hw_stats || !device->ops.get_hw_stats) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	if (!rdma_is_port_valid(device, port)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NLDEV_CMD_STAT_GET),
+			0, 0);
+
+	if (fill_nldev_handle(msg, device) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) {
+		ret = -EMSGSIZE;
+		goto err_msg;
+	}
+
+	stats = device->port_data ? device->port_data[port].hw_stats : NULL;
+	if (stats == NULL) {
+		ret = -EINVAL;
+		goto err_msg;
+	}
+	mutex_lock(&stats->lock);
+
+	num_cnts = device->ops.get_hw_stats(device, stats, port, 0);
+	if (num_cnts < 0) {
+		ret = -EINVAL;
+		goto err_stats;
+	}
+
+	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+	if (!table_attr) {
+		ret = -EMSGSIZE;
+		goto err_stats;
+	}
+	for (i = 0; i < num_cnts; i++) {
+		v = stats->value[i] +
+			rdma_counter_get_hwstat_value(device, port, i);
+		if (fill_stat_hwcounter_entry(msg, stats->names[i], v)) {
+			ret = -EMSGSIZE;
+			goto err_table;
+		}
+	}
+	nla_nest_end(msg, table_attr);
+
+	mutex_unlock(&stats->lock);
+	nlmsg_end(msg, nlh);
+	ib_device_put(device);
+	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_table:
+	nla_nest_cancel(msg, table_attr);
+err_stats:
+	mutex_unlock(&stats->lock);
+err_msg:
+	nlmsg_free(msg);
+err:
+	ib_device_put(device);
+	return ret;
+}
+
 static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
 			    struct netlink_ext_ack *extack, struct nlattr *tb[])
 
@@ -1871,9 +1964,12 @@ static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
 			  nldev_policy, extack);
-	if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
+	if (ret)
 		return -EINVAL;
 
+	if (!tb[RDMA_NLDEV_ATTR_STAT_RES])
+		return stat_get_doit_default_counter(skb, nlh, extack, tb);
+
 	switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
 	case RDMA_NLDEV_ATTR_RES_QP:
 		ret = stat_get_doit_qp(skb, nlh, extack, tb);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index c59b80e0a740..b477295a96c2 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -1003,6 +1003,8 @@ static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
 			goto err;
 		port->hw_stats_ag = hsag;
 		port->hw_stats = stats;
+		if (device->port_data)
+			device->port_data[port_num].hw_stats = stats;
 	} else {
 		struct kobject *kobj = &device->dev.kobj;
 		ret = sysfs_create_group(kobj, hsag);
@@ -1293,6 +1295,8 @@ const struct attribute_group ib_dev_attr_group = {
 
 void ib_free_port_attrs(struct ib_core_device *coredev)
 {
+	struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
+	bool is_full_dev = &device->coredev == coredev;
 	struct kobject *p, *t;
 
 	list_for_each_entry_safe(p, t, &coredev->port_list, entry) {
@@ -1302,6 +1306,8 @@ void ib_free_port_attrs(struct ib_core_device *coredev)
 		if (port->hw_stats_ag)
 			free_hsag(&port->kobj, port->hw_stats_ag);
 		kfree(port->hw_stats);
+		if (device->port_data && is_full_dev)
+			device->port_data[port->port_num].hw_stats = NULL;
 
 		if (port->pma_table)
 			sysfs_remove_group(p, port->pma_table);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0c5151a12ae4..50806bef9f20 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2124,6 +2124,7 @@ struct ib_port_data {
 	struct net_device __rcu *netdev;
 	struct hlist_node ndev_hash_link;
 	struct rdma_port_counter port_counter;
+	struct rdma_hw_stats *hw_stats;
 };
 
 /* rdma netdev type - specifies protocol type */
-- 
cgit v1.2.3


From c5d4355d10d414a96ca870b731756b89d068d57a Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Fri, 5 Jul 2019 19:33:22 +0530
Subject: libnvdimm: nd_region flush callback support

This patch adds functionality to perform flush from guest
to host over VIRTIO. We are registering a callback based
on 'nd_region' type. virtio_pmem driver requires this special
flush function. For rest of the region types we are registering
existing flush function. Report error returned by host fsync
failure to userspace.

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c     |  4 ++--
 drivers/nvdimm/claim.c       |  6 ++++--
 drivers/nvdimm/nd.h          |  1 +
 drivers/nvdimm/pmem.c        | 13 ++++++++-----
 drivers/nvdimm/region_devs.c | 26 ++++++++++++++++++++++++--
 include/linux/libnvdimm.h    |  9 ++++++++-
 6 files changed, 47 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 23022cf20d26..c02fa27dd3f3 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2426,7 +2426,7 @@ static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
 		offset = to_interleave_offset(offset, mmio);
 
 	writeq(cmd, mmio->addr.base + offset);
-	nvdimm_flush(nfit_blk->nd_region);
+	nvdimm_flush(nfit_blk->nd_region, NULL);
 
 	if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH)
 		readq(mmio->addr.base + offset);
@@ -2475,7 +2475,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
 	}
 
 	if (rw)
-		nvdimm_flush(nfit_blk->nd_region);
+		nvdimm_flush(nfit_blk->nd_region, NULL);
 
 	rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
 	return rc;
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 26c1c7618891..2985ca949912 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -255,7 +255,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
 	unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
 	sector_t sector = offset >> 9;
-	int rc = 0;
+	int rc = 0, ret = 0;
 
 	if (unlikely(!size))
 		return 0;
@@ -293,7 +293,9 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	}
 
 	memcpy_flushcache(nsio->addr + offset, buf, size);
-	nvdimm_flush(to_nd_region(ndns->dev.parent));
+	ret = nvdimm_flush(to_nd_region(ndns->dev.parent), NULL);
+	if (ret)
+		rc = ret;
 
 	return rc;
 }
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index d24304c0e6d7..1b9955651379 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -155,6 +155,7 @@ struct nd_region {
 	struct badblocks bb;
 	struct nd_interleave_set *nd_set;
 	struct nd_percpu_lane __percpu *lane;
+	int (*flush)(struct nd_region *nd_region, struct bio *bio);
 	struct nd_mapping mapping[0];
 };
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 24d7fe7c74ed..223da63d1bd7 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -184,6 +184,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 
 static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 {
+	int ret = 0;
 	blk_status_t rc = 0;
 	bool do_acct;
 	unsigned long start;
@@ -193,7 +194,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 	struct nd_region *nd_region = to_region(pmem);
 
 	if (bio->bi_opf & REQ_PREFLUSH)
-		nvdimm_flush(nd_region);
+		ret = nvdimm_flush(nd_region, bio);
 
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
@@ -208,7 +209,10 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 		nd_iostat_end(bio, start);
 
 	if (bio->bi_opf & REQ_FUA)
-		nvdimm_flush(nd_region);
+		ret = nvdimm_flush(nd_region, bio);
+
+	if (ret)
+		bio->bi_status = errno_to_blk_status(ret);
 
 	bio_endio(bio);
 	return BLK_QC_T_NONE;
@@ -477,7 +481,6 @@ static int pmem_attach_disk(struct device *dev,
 	}
 	dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
 	pmem->dax_dev = dax_dev;
-
 	gendev = disk_to_dev(disk);
 	gendev->groups = pmem_attribute_groups;
 
@@ -535,14 +538,14 @@ static int nd_pmem_remove(struct device *dev)
 		sysfs_put(pmem->bb_state);
 		pmem->bb_state = NULL;
 	}
-	nvdimm_flush(to_nd_region(dev->parent));
+	nvdimm_flush(to_nd_region(dev->parent), NULL);
 
 	return 0;
 }
 
 static void nd_pmem_shutdown(struct device *dev)
 {
-	nvdimm_flush(to_nd_region(dev->parent));
+	nvdimm_flush(to_nd_region(dev->parent), NULL);
 }
 
 static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 4fed9ce9c2fe..eca2e62af134 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -287,7 +287,9 @@ static ssize_t deep_flush_store(struct device *dev, struct device_attribute *att
 		return rc;
 	if (!flush)
 		return -EINVAL;
-	nvdimm_flush(nd_region);
+	rc = nvdimm_flush(nd_region, NULL);
+	if (rc)
+		return rc;
 
 	return len;
 }
@@ -1077,6 +1079,11 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	dev->of_node = ndr_desc->of_node;
 	nd_region->ndr_size = resource_size(ndr_desc->res);
 	nd_region->ndr_start = ndr_desc->res->start;
+	if (ndr_desc->flush)
+		nd_region->flush = ndr_desc->flush;
+	else
+		nd_region->flush = NULL;
+
 	nd_device_register(dev);
 
 	return nd_region;
@@ -1117,11 +1124,24 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 }
 EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
 
+int nvdimm_flush(struct nd_region *nd_region, struct bio *bio)
+{
+	int rc = 0;
+
+	if (!nd_region->flush)
+		rc = generic_nvdimm_flush(nd_region);
+	else {
+		if (nd_region->flush(nd_region, bio))
+			rc = -EIO;
+	}
+
+	return rc;
+}
 /**
  * nvdimm_flush - flush any posted write queues between the cpu and pmem media
  * @nd_region: blk or interleaved pmem region
  */
-void nvdimm_flush(struct nd_region *nd_region)
+int generic_nvdimm_flush(struct nd_region *nd_region)
 {
 	struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
 	int i, idx;
@@ -1145,6 +1165,8 @@ void nvdimm_flush(struct nd_region *nd_region)
 		if (ndrd_get_flush_wpq(ndrd, i, 0))
 			writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
 	wmb();
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nvdimm_flush);
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 03d5c3aece9d..e13100f424c8 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 #include <linux/uuid.h>
 #include <linux/spinlock.h>
+#include <linux/bio.h>
 
 struct badrange_entry {
 	u64 start;
@@ -57,6 +58,9 @@ enum {
 	 */
 	ND_REGION_PERSIST_MEMCTRL = 2,
 
+	/* Platform provides asynchronous flush mechanism */
+	ND_REGION_ASYNC = 3,
+
 	/* mark newly adjusted resources as requiring a label update */
 	DPA_RESOURCE_ADJUSTED = 1 << 0,
 };
@@ -113,6 +117,7 @@ struct nd_mapping_desc {
 	int position;
 };
 
+struct nd_region;
 struct nd_region_desc {
 	struct resource *res;
 	struct nd_mapping_desc *mapping;
@@ -125,6 +130,7 @@ struct nd_region_desc {
 	int target_node;
 	unsigned long flags;
 	struct device_node *of_node;
+	int (*flush)(struct nd_region *nd_region, struct bio *bio);
 };
 
 struct device;
@@ -252,7 +258,8 @@ unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr);
 unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
 void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 u64 nd_fletcher64(void *addr, size_t len, bool le);
-void nvdimm_flush(struct nd_region *nd_region);
+int nvdimm_flush(struct nd_region *nd_region, struct bio *bio);
+int generic_nvdimm_flush(struct nd_region *nd_region);
 int nvdimm_has_flush(struct nd_region *nd_region);
 int nvdimm_has_cache(struct nd_region *nd_region);
 int nvdimm_in_overwrite(struct nvdimm *nvdimm);
-- 
cgit v1.2.3


From 6e84200c0a2994b991259d19450eee561029bf70 Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Fri, 5 Jul 2019 19:33:23 +0530
Subject: virtio-pmem: Add virtio pmem driver

This patch adds virtio-pmem driver for KVM guest.

Guest reads the persistent memory range information from
Qemu over VIRTIO and registers it on nvdimm_bus. It also
creates a nd_region object with the persistent memory
range information so that existing 'nvdimm/pmem' driver
can reserve this into system memory map. This way
'virtio-pmem' driver uses existing functionality of pmem
driver to register persistent memory compatible for DAX
capable filesystems.

This also provides function to perform guest flush over
VIRTIO from 'pmem' driver when userspace performs flush
on DAX memory range.

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jakub Staron <jstaron@google.com>
Tested-by: Jakub Staron <jstaron@google.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/Makefile          |   1 +
 drivers/nvdimm/nd_virtio.c       | 125 +++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/virtio_pmem.c     | 122 ++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/virtio_pmem.h     |  55 +++++++++++++++++
 drivers/virtio/Kconfig           |  11 ++++
 include/uapi/linux/virtio_ids.h  |   1 +
 include/uapi/linux/virtio_pmem.h |  34 +++++++++++
 7 files changed, 349 insertions(+)
 create mode 100644 drivers/nvdimm/nd_virtio.c
 create mode 100644 drivers/nvdimm/virtio_pmem.c
 create mode 100644 drivers/nvdimm/virtio_pmem.h
 create mode 100644 include/uapi/linux/virtio_pmem.h

(limited to 'include')

diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 6f2a088afad6..cefe233e0b52 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o
 obj-$(CONFIG_ND_BLK) += nd_blk.o
 obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 obj-$(CONFIG_OF_PMEM) += of_pmem.o
+obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o
 
 nd_pmem-y := pmem.o
 
diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
new file mode 100644
index 000000000000..8645275c08c2
--- /dev/null
+++ b/drivers/nvdimm/nd_virtio.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio_pmem.c: Virtio pmem Driver
+ *
+ * Discovers persistent memory range information
+ * from host and provides a virtio based flushing
+ * interface.
+ */
+#include "virtio_pmem.h"
+#include "nd.h"
+
+ /* The interrupt handler */
+void virtio_pmem_host_ack(struct virtqueue *vq)
+{
+	struct virtio_pmem *vpmem = vq->vdev->priv;
+	struct virtio_pmem_request *req_data, *req_buf;
+	unsigned long flags;
+	unsigned int len;
+
+	spin_lock_irqsave(&vpmem->pmem_lock, flags);
+	while ((req_data = virtqueue_get_buf(vq, &len)) != NULL) {
+		req_data->done = true;
+		wake_up(&req_data->host_acked);
+
+		if (!list_empty(&vpmem->req_list)) {
+			req_buf = list_first_entry(&vpmem->req_list,
+					struct virtio_pmem_request, list);
+			req_buf->wq_buf_avail = true;
+			wake_up(&req_buf->wq_buf);
+			list_del(&req_buf->list);
+		}
+	}
+	spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+}
+EXPORT_SYMBOL_GPL(virtio_pmem_host_ack);
+
+ /* The request submission function */
+static int virtio_pmem_flush(struct nd_region *nd_region)
+{
+	struct virtio_device *vdev = nd_region->provider_data;
+	struct virtio_pmem *vpmem  = vdev->priv;
+	struct virtio_pmem_request *req_data;
+	struct scatterlist *sgs[2], sg, ret;
+	unsigned long flags;
+	int err, err1;
+
+	might_sleep();
+	req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+	if (!req_data)
+		return -ENOMEM;
+
+	req_data->done = false;
+	init_waitqueue_head(&req_data->host_acked);
+	init_waitqueue_head(&req_data->wq_buf);
+	INIT_LIST_HEAD(&req_data->list);
+	req_data->req.type = cpu_to_virtio32(vdev, VIRTIO_PMEM_REQ_TYPE_FLUSH);
+	sg_init_one(&sg, &req_data->req, sizeof(req_data->req));
+	sgs[0] = &sg;
+	sg_init_one(&ret, &req_data->resp.ret, sizeof(req_data->resp));
+	sgs[1] = &ret;
+
+	spin_lock_irqsave(&vpmem->pmem_lock, flags);
+	 /*
+	  * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual
+	  * queue does not have free descriptor. We add the request
+	  * to req_list and wait for host_ack to wake us up when free
+	  * slots are available.
+	  */
+	while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req_data,
+					GFP_ATOMIC)) == -ENOSPC) {
+
+		dev_info(&vdev->dev, "failed to send command to virtio pmem device, no free slots in the virtqueue\n");
+		req_data->wq_buf_avail = false;
+		list_add_tail(&req_data->list, &vpmem->req_list);
+		spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+
+		/* A host response results in "host_ack" getting called */
+		wait_event(req_data->wq_buf, req_data->wq_buf_avail);
+		spin_lock_irqsave(&vpmem->pmem_lock, flags);
+	}
+	err1 = virtqueue_kick(vpmem->req_vq);
+	spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+	/*
+	 * virtqueue_add_sgs failed with error different than -ENOSPC, we can't
+	 * do anything about that.
+	 */
+	if (err || !err1) {
+		dev_info(&vdev->dev, "failed to send command to virtio pmem device\n");
+		err = -EIO;
+	} else {
+		/* A host repsonse results in "host_ack" getting called */
+		wait_event(req_data->host_acked, req_data->done);
+		err = virtio32_to_cpu(vdev, req_data->resp.ret);
+	}
+
+	kfree(req_data);
+	return err;
+};
+
+/* The asynchronous flush callback function */
+int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
+{
+	/*
+	 * Create child bio for asynchronous flush and chain with
+	 * parent bio. Otherwise directly call nd_region flush.
+	 */
+	if (bio && bio->bi_iter.bi_sector != -1) {
+		struct bio *child = bio_alloc(GFP_ATOMIC, 0);
+
+		if (!child)
+			return -ENOMEM;
+		bio_copy_dev(child, bio);
+		child->bi_opf = REQ_PREFLUSH;
+		child->bi_iter.bi_sector = -1;
+		bio_chain(child, bio);
+		submit_bio(child);
+		return 0;
+	}
+	if (virtio_pmem_flush(nd_region))
+		return -EIO;
+
+	return 0;
+};
+EXPORT_SYMBOL_GPL(async_pmem_flush);
+MODULE_LICENSE("GPL");
diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c
new file mode 100644
index 000000000000..5e3d07b47e0c
--- /dev/null
+++ b/drivers/nvdimm/virtio_pmem.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio_pmem.c: Virtio pmem Driver
+ *
+ * Discovers persistent memory range information
+ * from host and registers the virtual pmem device
+ * with libnvdimm core.
+ */
+#include "virtio_pmem.h"
+#include "nd.h"
+
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+ /* Initialize virt queue */
+static int init_vq(struct virtio_pmem *vpmem)
+{
+	/* single vq */
+	vpmem->req_vq = virtio_find_single_vq(vpmem->vdev,
+					virtio_pmem_host_ack, "flush_queue");
+	if (IS_ERR(vpmem->req_vq))
+		return PTR_ERR(vpmem->req_vq);
+
+	spin_lock_init(&vpmem->pmem_lock);
+	INIT_LIST_HEAD(&vpmem->req_list);
+
+	return 0;
+};
+
+static int virtio_pmem_probe(struct virtio_device *vdev)
+{
+	struct nd_region_desc ndr_desc = {};
+	int nid = dev_to_node(&vdev->dev);
+	struct nd_region *nd_region;
+	struct virtio_pmem *vpmem;
+	struct resource res;
+	int err = 0;
+
+	if (!vdev->config->get) {
+		dev_err(&vdev->dev, "%s failure: config access disabled\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), GFP_KERNEL);
+	if (!vpmem) {
+		err = -ENOMEM;
+		goto out_err;
+	}
+
+	vpmem->vdev = vdev;
+	vdev->priv = vpmem;
+	err = init_vq(vpmem);
+	if (err) {
+		dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
+		goto out_err;
+	}
+
+	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
+			start, &vpmem->start);
+	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
+			size, &vpmem->size);
+
+	res.start = vpmem->start;
+	res.end   = vpmem->start + vpmem->size - 1;
+	vpmem->nd_desc.provider_name = "virtio-pmem";
+	vpmem->nd_desc.module = THIS_MODULE;
+
+	vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev,
+						&vpmem->nd_desc);
+	if (!vpmem->nvdimm_bus) {
+		dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n");
+		err = -ENXIO;
+		goto out_vq;
+	}
+
+	dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus);
+
+	ndr_desc.res = &res;
+	ndr_desc.numa_node = nid;
+	ndr_desc.flush = async_pmem_flush;
+	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
+	set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
+	nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
+	if (!nd_region) {
+		dev_err(&vdev->dev, "failed to create nvdimm region\n");
+		err = -ENXIO;
+		goto out_nd;
+	}
+	nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
+	return 0;
+out_nd:
+	nvdimm_bus_unregister(vpmem->nvdimm_bus);
+out_vq:
+	vdev->config->del_vqs(vdev);
+out_err:
+	return err;
+}
+
+static void virtio_pmem_remove(struct virtio_device *vdev)
+{
+	struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
+
+	nvdimm_bus_unregister(nvdimm_bus);
+	vdev->config->del_vqs(vdev);
+	vdev->config->reset(vdev);
+}
+
+static struct virtio_driver virtio_pmem_driver = {
+	.driver.name		= KBUILD_MODNAME,
+	.driver.owner		= THIS_MODULE,
+	.id_table		= id_table,
+	.probe			= virtio_pmem_probe,
+	.remove			= virtio_pmem_remove,
+};
+
+module_virtio_driver(virtio_pmem_driver);
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio pmem driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
new file mode 100644
index 000000000000..0dddefe594c4
--- /dev/null
+++ b/drivers/nvdimm/virtio_pmem.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * virtio_pmem.h: virtio pmem Driver
+ *
+ * Discovers persistent memory range information
+ * from host and provides a virtio based flushing
+ * interface.
+ **/
+
+#ifndef _LINUX_VIRTIO_PMEM_H
+#define _LINUX_VIRTIO_PMEM_H
+
+#include <linux/module.h>
+#include <uapi/linux/virtio_pmem.h>
+#include <linux/libnvdimm.h>
+#include <linux/spinlock.h>
+
+struct virtio_pmem_request {
+	struct virtio_pmem_req req;
+	struct virtio_pmem_resp resp;
+
+	/* Wait queue to process deferred work after ack from host */
+	wait_queue_head_t host_acked;
+	bool done;
+
+	/* Wait queue to process deferred work after virt queue buffer avail */
+	wait_queue_head_t wq_buf;
+	bool wq_buf_avail;
+	struct list_head list;
+};
+
+struct virtio_pmem {
+	struct virtio_device *vdev;
+
+	/* Virtio pmem request queue */
+	struct virtqueue *req_vq;
+
+	/* nvdimm bus registers virtio pmem device */
+	struct nvdimm_bus *nvdimm_bus;
+	struct nvdimm_bus_descriptor nd_desc;
+
+	/* List to store deferred work if virtqueue is full */
+	struct list_head req_list;
+
+	/* Synchronize virtqueue data */
+	spinlock_t pmem_lock;
+
+	/* Memory region information */
+	__u64 start;
+	__u64 size;
+};
+
+void virtio_pmem_host_ack(struct virtqueue *vq);
+int async_pmem_flush(struct nd_region *nd_region, struct bio *bio);
+#endif
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 023fc3bc01c6..078615cf2afc 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -43,6 +43,17 @@ config VIRTIO_PCI_LEGACY
 
 	  If unsure, say Y.
 
+config VIRTIO_PMEM
+	tristate "Support for virtio pmem driver"
+	depends on VIRTIO
+	depends on LIBNVDIMM
+	help
+	  This driver provides access to virtio-pmem devices, storage devices
+	  that are mapped into the physical address space - similar to NVDIMMs
+	   - with a virtio-based flushing interface.
+
+	  If unsure, say Y.
+
 config VIRTIO_BALLOON
 	tristate "Virtio balloon driver"
 	depends on VIRTIO
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index 6d5c3b2d4f4d..32b2f94d1f58 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -43,5 +43,6 @@
 #define VIRTIO_ID_INPUT        18 /* virtio input */
 #define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
 #define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
+#define VIRTIO_ID_PMEM         27 /* virtio pmem */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h
new file mode 100644
index 000000000000..efcd72f2d20d
--- /dev/null
+++ b/include/uapi/linux/virtio_pmem.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Definitions for virtio-pmem devices.
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * Author(s): Pankaj Gupta <pagupta@redhat.com>
+ */
+
+#ifndef _UAPI_LINUX_VIRTIO_PMEM_H
+#define _UAPI_LINUX_VIRTIO_PMEM_H
+
+#include <linux/types.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+
+struct virtio_pmem_config {
+	__u64 start;
+	__u64 size;
+};
+
+#define VIRTIO_PMEM_REQ_TYPE_FLUSH      0
+
+struct virtio_pmem_resp {
+	/* Host return status corresponding to flush request */
+	__u32 ret;
+};
+
+struct virtio_pmem_req {
+	/* command type */
+	__u32 type;
+};
+
+#endif
-- 
cgit v1.2.3


From fefc1d97fa4b5e016bbe15447dc3edcd9e1bcb9f Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Fri, 5 Jul 2019 19:33:24 +0530
Subject: libnvdimm: add dax_dev sync flag

This patch adds 'DAXDEV_SYNC' flag which is set
for nd_region doing synchronous flush. This later
is used to disable MAP_SYNC functionality for
ext4 & xfs filesystem for devices don't support
synchronous flush.

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/bus.c            |  2 +-
 drivers/dax/super.c          | 19 ++++++++++++++++++-
 drivers/md/dm.c              |  3 ++-
 drivers/nvdimm/pmem.c        |  5 ++++-
 drivers/nvdimm/region_devs.c |  7 +++++++
 drivers/s390/block/dcssblk.c |  2 +-
 include/linux/dax.h          | 24 ++++++++++++++++++++++--
 include/linux/libnvdimm.h    |  1 +
 8 files changed, 56 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 2109cfe80219..5f184e751c82 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -388,7 +388,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
 	 * No 'host' or dax_operations since there is no access to this
 	 * device outside of mmap of the resulting character device.
 	 */
-	dax_dev = alloc_dax(dev_dax, NULL, NULL);
+	dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
 	if (!dax_dev)
 		goto err;
 
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 4e5ae7e8b557..8ab12068eea3 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -195,6 +195,8 @@ enum dax_device_flags {
 	DAXDEV_ALIVE,
 	/* gate whether dax_flush() calls the low level flush routine */
 	DAXDEV_WRITE_CACHE,
+	/* flag to check if device supports synchronous flush */
+	DAXDEV_SYNC,
 };
 
 /**
@@ -372,6 +374,18 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev)
 }
 EXPORT_SYMBOL_GPL(dax_write_cache_enabled);
 
+bool __dax_synchronous(struct dax_device *dax_dev)
+{
+	return test_bit(DAXDEV_SYNC, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(__dax_synchronous);
+
+void __set_dax_synchronous(struct dax_device *dax_dev)
+{
+	set_bit(DAXDEV_SYNC, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(__set_dax_synchronous);
+
 bool dax_alive(struct dax_device *dax_dev)
 {
 	lockdep_assert_held(&dax_srcu);
@@ -526,7 +540,7 @@ static void dax_add_host(struct dax_device *dax_dev, const char *host)
 }
 
 struct dax_device *alloc_dax(void *private, const char *__host,
-		const struct dax_operations *ops)
+		const struct dax_operations *ops, unsigned long flags)
 {
 	struct dax_device *dax_dev;
 	const char *host;
@@ -549,6 +563,9 @@ struct dax_device *alloc_dax(void *private, const char *__host,
 	dax_add_host(dax_dev, host);
 	dax_dev->ops = ops;
 	dax_dev->private = private;
+	if (flags & DAXDEV_F_SYNC)
+		set_dax_synchronous(dax_dev);
+
 	return dax_dev;
 
  err_dev:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5475081dcbd6..b1caa7188209 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1991,7 +1991,8 @@ static struct mapped_device *alloc_dev(int minor)
 	sprintf(md->disk->disk_name, "dm-%d", minor);
 
 	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
-		md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+		md->dax_dev = alloc_dax(md, md->disk->disk_name,
+					&dm_dax_ops, 0);
 		if (!md->dax_dev)
 			goto bad;
 	}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 223da63d1bd7..8be868e2a18b 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -376,6 +376,7 @@ static int pmem_attach_disk(struct device *dev,
 	struct gendisk *disk;
 	void *addr;
 	int rc;
+	unsigned long flags = 0UL;
 
 	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
 	if (!pmem)
@@ -474,7 +475,9 @@ static int pmem_attach_disk(struct device *dev,
 	nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res);
 	disk->bb = &pmem->bb;
 
-	dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
+	if (is_nvdimm_sync(nd_region))
+		flags = DAXDEV_F_SYNC;
+	dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
 	if (!dax_dev) {
 		put_disk(disk);
 		return -ENOMEM;
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index eca2e62af134..56f2227f192a 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1211,6 +1211,13 @@ int nvdimm_has_cache(struct nd_region *nd_region)
 }
 EXPORT_SYMBOL_GPL(nvdimm_has_cache);
 
+bool is_nvdimm_sync(struct nd_region *nd_region)
+{
+	return is_nd_pmem(&nd_region->dev) &&
+		!test_bit(ND_REGION_ASYNC, &nd_region->flags);
+}
+EXPORT_SYMBOL_GPL(is_nvdimm_sync);
+
 struct conflict_context {
 	struct nd_region *nd_region;
 	resource_size_t start, size;
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index d04d4378ca50..63502ca537eb 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -679,7 +679,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 		goto put_dev;
 
 	dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
-			&dcssblk_dax_ops);
+			&dcssblk_dax_ops, DAXDEV_F_SYNC);
 	if (!dev_info->dax_dev) {
 		rc = -ENOMEM;
 		goto put_dev;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index becaea5f4488..8b535bc4526f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -7,6 +7,9 @@
 #include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
+/* Flag for synchronous flush */
+#define DAXDEV_F_SYNC (1UL << 0)
+
 typedef unsigned long dax_entry_t;
 
 struct iomap_ops;
@@ -38,18 +41,28 @@ extern struct attribute_group dax_attribute_group;
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *dax_get_by_host(const char *host);
 struct dax_device *alloc_dax(void *private, const char *host,
-		const struct dax_operations *ops);
+		const struct dax_operations *ops, unsigned long flags);
 void put_dax(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
 void dax_write_cache(struct dax_device *dax_dev, bool wc);
 bool dax_write_cache_enabled(struct dax_device *dax_dev);
+bool __dax_synchronous(struct dax_device *dax_dev);
+static inline bool dax_synchronous(struct dax_device *dax_dev)
+{
+	return  __dax_synchronous(dax_dev);
+}
+void __set_dax_synchronous(struct dax_device *dax_dev);
+static inline void set_dax_synchronous(struct dax_device *dax_dev)
+{
+	__set_dax_synchronous(dax_dev);
+}
 #else
 static inline struct dax_device *dax_get_by_host(const char *host)
 {
 	return NULL;
 }
 static inline struct dax_device *alloc_dax(void *private, const char *host,
-		const struct dax_operations *ops)
+		const struct dax_operations *ops, unsigned long flags)
 {
 	/*
 	 * Callers should check IS_ENABLED(CONFIG_DAX) to know if this
@@ -70,6 +83,13 @@ static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
 {
 	return false;
 }
+static inline bool dax_synchronous(struct dax_device *dax_dev)
+{
+	return true;
+}
+static inline void set_dax_synchronous(struct dax_device *dax_dev)
+{
+}
 #endif
 
 struct writeback_control;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index e13100f424c8..7a64b3ddb408 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -263,6 +263,7 @@ int generic_nvdimm_flush(struct nd_region *nd_region);
 int nvdimm_has_flush(struct nd_region *nd_region);
 int nvdimm_has_cache(struct nd_region *nd_region);
 int nvdimm_in_overwrite(struct nvdimm *nvdimm);
+bool is_nvdimm_sync(struct nd_region *nd_region);
 
 static inline int nvdimm_ctl(struct nvdimm *nvdimm, unsigned int cmd, void *buf,
 		unsigned int buf_len, int *cmd_rc)
-- 
cgit v1.2.3


From 32de1484648a837db5dea0a7007fe7136804e392 Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Fri, 5 Jul 2019 19:33:26 +0530
Subject: dax: check synchronous mapping is supported

This patch introduces 'daxdev_mapping_supported' helper
which checks if 'MAP_SYNC' is supported with filesystem
mapping. It also checks if corresponding dax_device is
synchronous. Virtio pmem device is asynchronous and
does not not support VM_SYNC.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dax.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8b535bc4526f..9bd8528bd305 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -56,6 +56,18 @@ static inline void set_dax_synchronous(struct dax_device *dax_dev)
 {
 	__set_dax_synchronous(dax_dev);
 }
+/*
+ * Check if given mapping is supported by the file / underlying device.
+ */
+static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
+					     struct dax_device *dax_dev)
+{
+	if (!(vma->vm_flags & VM_SYNC))
+		return true;
+	if (!IS_DAX(file_inode(vma->vm_file)))
+		return false;
+	return dax_synchronous(dax_dev);
+}
 #else
 static inline struct dax_device *dax_get_by_host(const char *host)
 {
@@ -90,6 +102,11 @@ static inline bool dax_synchronous(struct dax_device *dax_dev)
 static inline void set_dax_synchronous(struct dax_device *dax_dev)
 {
 }
+static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
+				struct dax_device *dax_dev)
+{
+	return !(vma->vm_flags & VM_SYNC);
+}
 #endif
 
 struct writeback_control;
-- 
cgit v1.2.3


From 7e0a0e38fcfea47e74b0ff6da6266f00bcd2af43 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 1 May 2019 10:49:27 -0400
Subject: SUNRPC: Replace the queue timer with a delayed work function

The queue timer function, which walks the RPC queue in order to locate
candidates for waking up is one of the current constraints against
removing the bh-safe queue spin locks. Replace it with a delayed
work queue, so that we can do the actual rpc task wake ups from an
ordinary process context.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/sched.h |  3 ++-
 net/sunrpc/sched.c           | 30 ++++++++++++++++++++----------
 2 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index d0e451868f02..7d8db5dcac04 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -183,8 +183,9 @@ struct rpc_task_setup {
 #define RPC_NR_PRIORITY		(1 + RPC_PRIORITY_PRIVILEGED - RPC_PRIORITY_LOW)
 
 struct rpc_timer {
-	struct timer_list timer;
 	struct list_head list;
+	unsigned long expires;
+	struct delayed_work dwork;
 };
 
 /*
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index a2c114812717..e0a0cf381eba 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -46,7 +46,7 @@ static mempool_t	*rpc_buffer_mempool __read_mostly;
 
 static void			rpc_async_schedule(struct work_struct *);
 static void			 rpc_release_task(struct rpc_task *task);
-static void __rpc_queue_timer_fn(struct timer_list *t);
+static void __rpc_queue_timer_fn(struct work_struct *);
 
 /*
  * RPC tasks sit here while waiting for conditions to improve.
@@ -87,13 +87,19 @@ __rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
 	task->tk_timeout = 0;
 	list_del(&task->u.tk_wait.timer_list);
 	if (list_empty(&queue->timer_list.list))
-		del_timer(&queue->timer_list.timer);
+		cancel_delayed_work(&queue->timer_list.dwork);
 }
 
 static void
 rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires)
 {
-	timer_reduce(&queue->timer_list.timer, expires);
+	unsigned long now = jiffies;
+	queue->timer_list.expires = expires;
+	if (time_before_eq(expires, now))
+		expires = 0;
+	else
+		expires -= now;
+	mod_delayed_work(rpciod_workqueue, &queue->timer_list.dwork, expires);
 }
 
 /*
@@ -107,7 +113,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task,
 		task->tk_pid, jiffies_to_msecs(timeout - jiffies));
 
 	task->tk_timeout = timeout;
-	rpc_set_queue_timer(queue, timeout);
+	if (list_empty(&queue->timer_list.list) || time_before(timeout, queue->timer_list.expires))
+		rpc_set_queue_timer(queue, timeout);
 	list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list);
 }
 
@@ -250,7 +257,8 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c
 	queue->maxpriority = nr_queues - 1;
 	rpc_reset_waitqueue_priority(queue);
 	queue->qlen = 0;
-	timer_setup(&queue->timer_list.timer, __rpc_queue_timer_fn, 0);
+	queue->timer_list.expires = 0;
+	INIT_DEFERRABLE_WORK(&queue->timer_list.dwork, __rpc_queue_timer_fn);
 	INIT_LIST_HEAD(&queue->timer_list.list);
 	rpc_assign_waitqueue_name(queue, qname);
 }
@@ -269,7 +277,7 @@ EXPORT_SYMBOL_GPL(rpc_init_wait_queue);
 
 void rpc_destroy_wait_queue(struct rpc_wait_queue *queue)
 {
-	del_timer_sync(&queue->timer_list.timer);
+	cancel_delayed_work_sync(&queue->timer_list.dwork);
 }
 EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
 
@@ -759,13 +767,15 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up_status);
 
-static void __rpc_queue_timer_fn(struct timer_list *t)
+static void __rpc_queue_timer_fn(struct work_struct *work)
 {
-	struct rpc_wait_queue *queue = from_timer(queue, t, timer_list.timer);
+	struct rpc_wait_queue *queue = container_of(work,
+			struct rpc_wait_queue,
+			timer_list.dwork.work);
 	struct rpc_task *task, *n;
 	unsigned long expires, now, timeo;
 
-	spin_lock(&queue->lock);
+	spin_lock_bh(&queue->lock);
 	expires = now = jiffies;
 	list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
 		timeo = task->tk_timeout;
@@ -780,7 +790,7 @@ static void __rpc_queue_timer_fn(struct timer_list *t)
 	}
 	if (!list_empty(&queue->timer_list.list))
 		rpc_set_queue_timer(queue, expires);
-	spin_unlock(&queue->lock);
+	spin_unlock_bh(&queue->lock);
 }
 
 static void __rpc_atrun(struct rpc_task *task)
-- 
cgit v1.2.3


From 4f8943f8088348ec01456b075d44ad19dce3d698 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 1 May 2019 16:28:29 -0400
Subject: SUNRPC: Replace direct task wakeups from softirq context

Replace the direct task wakeups from inside a softirq context with
wakeups from a process context.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtsock.h |  5 +++
 net/sunrpc/xprtsock.c           | 78 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 77 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index b81d0b3e0799..7638dbe7bc50 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -56,6 +56,7 @@ struct sock_xprt {
 	 */
 	unsigned long		sock_state;
 	struct delayed_work	connect_worker;
+	struct work_struct	error_worker;
 	struct work_struct	recv_worker;
 	struct mutex		recv_mutex;
 	struct sockaddr_storage	srcaddr;
@@ -84,6 +85,10 @@ struct sock_xprt {
 #define XPRT_SOCK_CONNECTING	1U
 #define XPRT_SOCK_DATA_READY	(2)
 #define XPRT_SOCK_UPD_TIMEOUT	(3)
+#define XPRT_SOCK_WAKE_ERROR	(4)
+#define XPRT_SOCK_WAKE_WRITE	(5)
+#define XPRT_SOCK_WAKE_PENDING	(6)
+#define XPRT_SOCK_WAKE_DISCONNECT	(7)
 
 #endif /* __KERNEL__ */
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 36652352a38c..92af57019b96 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1211,6 +1211,15 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 
 	clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+	clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state);
+	clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state);
+	clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state);
+}
+
+static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr)
+{
+	set_bit(nr, &transport->sock_state);
+	queue_work(xprtiod_workqueue, &transport->error_worker);
 }
 
 static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
@@ -1231,6 +1240,7 @@ static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
  */
 static void xs_error_report(struct sock *sk)
 {
+	struct sock_xprt *transport;
 	struct rpc_xprt *xprt;
 	int err;
 
@@ -1238,13 +1248,14 @@ static void xs_error_report(struct sock *sk)
 	if (!(xprt = xprt_from_sock(sk)))
 		goto out;
 
+	transport = container_of(xprt, struct sock_xprt, xprt);
 	err = -sk->sk_err;
 	if (err == 0)
 		goto out;
 	dprintk("RPC:       xs_error_report client %p, error=%d...\n",
 			xprt, -err);
 	trace_rpc_socket_error(xprt, sk->sk_socket, err);
-	xprt_wake_pending_tasks(xprt, err);
+	xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR);
  out:
 	read_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1507,7 +1518,7 @@ static void xs_tcp_state_change(struct sock *sk)
 			xprt->stat.connect_count++;
 			xprt->stat.connect_time += (long)jiffies -
 						   xprt->stat.connect_start;
-			xprt_wake_pending_tasks(xprt, -EAGAIN);
+			xs_run_error_worker(transport, XPRT_SOCK_WAKE_PENDING);
 		}
 		spin_unlock(&xprt->transport_lock);
 		break;
@@ -1525,7 +1536,7 @@ static void xs_tcp_state_change(struct sock *sk)
 		/* The server initiated a shutdown of the socket */
 		xprt->connect_cookie++;
 		clear_bit(XPRT_CONNECTED, &xprt->state);
-		xs_tcp_force_close(xprt);
+		xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
 		/* fall through */
 	case TCP_CLOSING:
 		/*
@@ -1547,7 +1558,7 @@ static void xs_tcp_state_change(struct sock *sk)
 			xprt_clear_connecting(xprt);
 		clear_bit(XPRT_CLOSING, &xprt->state);
 		/* Trigger the socket release */
-		xs_tcp_force_close(xprt);
+		xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
 	}
  out:
 	read_unlock_bh(&sk->sk_callback_lock);
@@ -1556,6 +1567,7 @@ static void xs_tcp_state_change(struct sock *sk)
 static void xs_write_space(struct sock *sk)
 {
 	struct socket_wq *wq;
+	struct sock_xprt *transport;
 	struct rpc_xprt *xprt;
 
 	if (!sk->sk_socket)
@@ -1564,13 +1576,14 @@ static void xs_write_space(struct sock *sk)
 
 	if (unlikely(!(xprt = xprt_from_sock(sk))))
 		return;
+	transport = container_of(xprt, struct sock_xprt, xprt);
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
 	if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
 		goto out;
 
-	if (xprt_write_space(xprt))
-		sk->sk_write_pending--;
+	xs_run_error_worker(transport, XPRT_SOCK_WAKE_WRITE);
+	sk->sk_write_pending--;
 out:
 	rcu_read_unlock();
 }
@@ -2461,6 +2474,56 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 			delay);
 }
 
+static void xs_wake_disconnect(struct sock_xprt *transport)
+{
+	if (test_and_clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state))
+		xs_tcp_force_close(&transport->xprt);
+}
+
+static void xs_wake_write(struct sock_xprt *transport)
+{
+	if (test_and_clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state))
+		xprt_write_space(&transport->xprt);
+}
+
+static void xs_wake_error(struct sock_xprt *transport)
+{
+	int sockerr;
+	int sockerr_len = sizeof(sockerr);
+
+	if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state))
+		return;
+	mutex_lock(&transport->recv_mutex);
+	if (transport->sock == NULL)
+		goto out;
+	if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state))
+		goto out;
+	if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR,
+				(char *)&sockerr, &sockerr_len) != 0)
+		goto out;
+	if (sockerr < 0)
+		xprt_wake_pending_tasks(&transport->xprt, sockerr);
+out:
+	mutex_unlock(&transport->recv_mutex);
+}
+
+static void xs_wake_pending(struct sock_xprt *transport)
+{
+	if (test_and_clear_bit(XPRT_SOCK_WAKE_PENDING, &transport->sock_state))
+		xprt_wake_pending_tasks(&transport->xprt, -EAGAIN);
+}
+
+static void xs_error_handle(struct work_struct *work)
+{
+	struct sock_xprt *transport = container_of(work,
+			struct sock_xprt, error_worker);
+
+	xs_wake_disconnect(transport);
+	xs_wake_write(transport);
+	xs_wake_error(transport);
+	xs_wake_pending(transport);
+}
+
 /**
  * xs_local_print_stats - display AF_LOCAL socket-specifc stats
  * @xprt: rpc_xprt struct containing statistics
@@ -2873,6 +2936,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
 	xprt->timeout = &xs_local_default_timeout;
 
 	INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
+	INIT_WORK(&transport->error_worker, xs_error_handle);
 	INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket);
 
 	switch (sun->sun_family) {
@@ -2943,6 +3007,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 	xprt->timeout = &xs_udp_default_timeout;
 
 	INIT_WORK(&transport->recv_worker, xs_udp_data_receive_workfn);
+	INIT_WORK(&transport->error_worker, xs_error_handle);
 	INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket);
 
 	switch (addr->sa_family) {
@@ -3024,6 +3089,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 		(xprt->timeout->to_retries + 1);
 
 	INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
+	INIT_WORK(&transport->error_worker, xs_error_handle);
 	INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
 
 	switch (addr->sa_family) {
-- 
cgit v1.2.3


From 21f0ffaff510b0530bfdf77da7133c0b99dee2fe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 28 Apr 2017 10:52:42 -0400
Subject: SUNRPC: Add basic load balancing to the transport switch

For now, just count the queue length. It is less accurate than counting
number of bytes queued, but easier to implement.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h          |  1 +
 include/linux/sunrpc/xprtmultipath.h |  2 ++
 net/sunrpc/clnt.c                    | 40 +++++++++++++++++++++++++++++++++---
 net/sunrpc/xprtmultipath.c           | 20 +++++++++++++++++-
 4 files changed, 59 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a6d9fce7f20e..15322c1d9c8c 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -238,6 +238,7 @@ struct rpc_xprt {
 	/*
 	 * Send stuff
 	 */
+	atomic_long_t		queuelen;
 	spinlock_t		transport_lock;	/* lock transport info */
 	spinlock_t		reserve_lock;	/* lock slot table */
 	spinlock_t		queue_lock;	/* send/receive queue lock */
diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index af1257c030d2..c6cce3fbf29d 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -15,6 +15,8 @@ struct rpc_xprt_switch {
 	struct kref		xps_kref;
 
 	unsigned int		xps_nxprts;
+	unsigned int		xps_nactive;
+	atomic_long_t		xps_queuelen;
 	struct list_head	xps_xprt_list;
 
 	struct net *		xps_net;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b03bfa055c08..976eab68bb5d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -968,13 +968,47 @@ out:
 }
 EXPORT_SYMBOL_GPL(rpc_bind_new_program);
 
+static struct rpc_xprt *
+rpc_task_get_xprt(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt_switch *xps;
+	struct rpc_xprt *xprt= xprt_iter_get_next(&clnt->cl_xpi);
+
+	if (!xprt)
+		return NULL;
+	rcu_read_lock();
+	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+	atomic_long_inc(&xps->xps_queuelen);
+	rcu_read_unlock();
+	atomic_long_inc(&xprt->queuelen);
+
+	return xprt;
+}
+
+static void
+rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+	struct rpc_xprt_switch *xps;
+
+	atomic_long_dec(&xprt->queuelen);
+	rcu_read_lock();
+	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+	atomic_long_dec(&xps->xps_queuelen);
+	rcu_read_unlock();
+
+	xprt_put(xprt);
+}
+
 void rpc_task_release_transport(struct rpc_task *task)
 {
 	struct rpc_xprt *xprt = task->tk_xprt;
 
 	if (xprt) {
 		task->tk_xprt = NULL;
-		xprt_put(xprt);
+		if (task->tk_client)
+			rpc_task_release_xprt(task->tk_client, xprt);
+		else
+			xprt_put(xprt);
 	}
 }
 EXPORT_SYMBOL_GPL(rpc_task_release_transport);
@@ -983,6 +1017,7 @@ void rpc_task_release_client(struct rpc_task *task)
 {
 	struct rpc_clnt *clnt = task->tk_client;
 
+	rpc_task_release_transport(task);
 	if (clnt != NULL) {
 		/* Remove from client task list */
 		spin_lock(&clnt->cl_lock);
@@ -992,14 +1027,13 @@ void rpc_task_release_client(struct rpc_task *task)
 
 		rpc_release_client(clnt);
 	}
-	rpc_task_release_transport(task);
 }
 
 static
 void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
 {
 	if (!task->tk_xprt)
-		task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
+		task->tk_xprt = rpc_task_get_xprt(clnt);
 }
 
 static
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 8394124126f8..394e427533be 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -36,6 +36,7 @@ static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps,
 	if (xps->xps_nxprts == 0)
 		xps->xps_net = xprt->xprt_net;
 	xps->xps_nxprts++;
+	xps->xps_nactive++;
 }
 
 /**
@@ -62,6 +63,7 @@ static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
 {
 	if (unlikely(xprt == NULL))
 		return;
+	xps->xps_nactive--;
 	xps->xps_nxprts--;
 	if (xps->xps_nxprts == 0)
 		xps->xps_net = NULL;
@@ -317,8 +319,24 @@ struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct list_head *head,
 static
 struct rpc_xprt *xprt_iter_next_entry_roundrobin(struct rpc_xprt_iter *xpi)
 {
-	return xprt_iter_next_entry_multiple(xpi,
+	struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
+	struct rpc_xprt *xprt;
+	unsigned long xprt_queuelen;
+	unsigned long xps_queuelen;
+	unsigned long xps_avglen;
+
+	do {
+		xprt = xprt_iter_next_entry_multiple(xpi,
 			xprt_switch_find_next_entry_roundrobin);
+		if (xprt == NULL)
+			break;
+		xprt_queuelen = atomic_long_read(&xprt->queuelen);
+		if (xprt_queuelen <= 2)
+			break;
+		xps_queuelen = atomic_long_read(&xps->xps_queuelen);
+		xps_avglen = DIV_ROUND_UP(xps_queuelen, xps->xps_nactive);
+	} while (xprt_queuelen > xps_avglen);
+	return xprt;
 }
 
 static
-- 
cgit v1.2.3


From 1c341b777501613aad83f9c233a3fe5701cff083 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 22 May 2019 08:38:57 -0400
Subject: NFS: Add deferred cache invalidation for close-to-open consistency
 violations

If the client detects that close-to-open cache consistency has been
violated, and that the file or directory has been changed on the
server, then do a cache invalidation when we're done working with
the file.
The reason we don't do an immediate cache invalidation is that we
want to avoid performance problems due to false positives. Also,
note that we cannot guarantee cache consistency in this situation
even if we do invalidate the cache.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           |  4 ++++
 fs/nfs/inode.c         | 15 +++++++++++----
 include/linux/nfs_fs.h |  2 ++
 3 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 57b6a45576ad..bd1f9555447b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -80,6 +80,10 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
 		ctx->dup_cookie = 0;
 		ctx->cred = get_cred(cred);
 		spin_lock(&dir->i_lock);
+		if (list_empty(&nfsi->open_files) &&
+		    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+			nfsi->cache_validity |= NFS_INO_INVALID_DATA |
+				NFS_INO_REVAL_FORCED;
 		list_add(&ctx->list, &nfsi->open_files);
 		spin_unlock(&dir->i_lock);
 		return ctx;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 53777813ca95..ea52c71534b5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -208,7 +208,7 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
 	}
 
 	if (inode->i_mapping->nrpages == 0)
-		flags &= ~NFS_INO_INVALID_DATA;
+		flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER);
 	nfsi->cache_validity |= flags;
 	if (flags & NFS_INO_INVALID_DATA)
 		nfs_fscache_invalidate(inode);
@@ -652,7 +652,8 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 	i_size_write(inode, offset);
 	/* Optimisation */
 	if (offset == 0)
-		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
+		NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA |
+				NFS_INO_DATA_INVAL_DEFER);
 	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
 
 	spin_unlock(&inode->i_lock);
@@ -1032,6 +1033,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&inode->i_lock);
+	if (list_empty(&nfsi->open_files) &&
+	    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+		nfsi->cache_validity |= NFS_INO_INVALID_DATA |
+			NFS_INO_REVAL_FORCED;
 	list_add_tail_rcu(&ctx->list, &nfsi->open_files);
 	spin_unlock(&inode->i_lock);
 }
@@ -1313,7 +1318,8 @@ int nfs_revalidate_mapping(struct inode *inode,
 
 	set_bit(NFS_INO_INVALIDATING, bitlock);
 	smp_wmb();
-	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+	nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA|
+			NFS_INO_DATA_INVAL_DEFER);
 	spin_unlock(&inode->i_lock);
 	trace_nfs_invalidate_mapping_enter(inode);
 	ret = nfs_invalidate_mapping(inode, mapping);
@@ -1871,7 +1877,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				dprintk("NFS: change_attr change on server for file %s/%ld\n",
 						inode->i_sb->s_id,
 						inode->i_ino);
-			}
+			} else if (!have_delegation)
+				nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
 			inode_set_iversion_raw(inode, fattr->change_attr);
 			attr_changed = true;
 		}
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d363d5765cdf..0a11712a80e3 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -223,6 +223,8 @@ struct nfs4_copy_state {
 #define NFS_INO_INVALID_MTIME	BIT(10)		/* cached mtime is invalid */
 #define NFS_INO_INVALID_SIZE	BIT(11)		/* cached size is invalid */
 #define NFS_INO_INVALID_OTHER	BIT(12)		/* other attrs are invalid */
+#define NFS_INO_DATA_INVAL_DEFER	\
+				BIT(13)		/* Deferred cache invalidation */
 
 #define NFS_INO_INVALID_ATTR	(NFS_INO_INVALID_CHANGE \
 		| NFS_INO_INVALID_CTIME \
-- 
cgit v1.2.3


From 612b41f808a98a124b23d72229693c3181733291 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 27 Apr 2017 08:50:51 -0400
Subject: SUNRPC: Allow creation of RPC clients with multiple connections

Add an argument to struct rpc_create_args that allows the specification
of how many transport connections you want to set up to the server.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/clnt.h |  1 +
 net/sunrpc/clnt.c           | 17 ++++++++++++++++-
 net/sunrpc/xprtmultipath.c  |  3 +--
 3 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 6e8073140a5d..4619098affa3 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -124,6 +124,7 @@ struct rpc_create_args {
 	u32			prognumber;	/* overrides program->number */
 	u32			version;
 	rpc_authflavor_t	authflavor;
+	u32			nconnect;
 	unsigned long		flags;
 	char			*client_name;
 	struct svc_xprt		*bc_xprt;	/* NFSv4.1 backchannel */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 976eab68bb5d..b6aca8cb5ae6 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -528,6 +528,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 		.bc_xprt = args->bc_xprt,
 	};
 	char servername[48];
+	struct rpc_clnt *clnt;
+	int i;
 
 	if (args->bc_xprt) {
 		WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
@@ -590,7 +592,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 	if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
 		xprt->resvport = 0;
 
-	return rpc_create_xprt(args, xprt);
+	clnt = rpc_create_xprt(args, xprt);
+	if (IS_ERR(clnt) || args->nconnect <= 1)
+		return clnt;
+
+	for (i = 0; i < args->nconnect - 1; i++) {
+		if (rpc_clnt_add_xprt(clnt, &xprtargs, NULL, NULL) < 0)
+			break;
+	}
+	return clnt;
 }
 EXPORT_SYMBOL_GPL(rpc_create);
 
@@ -2730,6 +2740,10 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
 		return -ENOMEM;
 	data->xps = xprt_switch_get(xps);
 	data->xprt = xprt_get(xprt);
+	if (rpc_xprt_switch_has_addr(data->xps, (struct sockaddr *)&xprt->addr)) {
+		rpc_cb_add_xprt_release(data);
+		goto success;
+	}
 
 	task = rpc_call_null_helper(clnt, xprt, NULL,
 			RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS,
@@ -2737,6 +2751,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 	rpc_put_task(task);
+success:
 	return 1;
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 394e427533be..9d66ce53355d 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -52,8 +52,7 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
 	if (xprt == NULL)
 		return;
 	spin_lock(&xps->xps_lock);
-	if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
-	    !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+	if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
 		xprt_switch_add_xprt_locked(xps, xprt);
 	spin_unlock(&xps->xps_lock);
 }
-- 
cgit v1.2.3


From 6619079d05404cb32be29af329b87ac3b0ad4f96 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 27 Apr 2017 11:13:40 -0400
Subject: NFSv4: Allow multiple connections to NFSv4.x (x>0) servers

If the user specifies the -onconn=<number> mount option, and the transport
protocol is TCP, then set up <number> connections to the server. The
connections will all go to the same IP address.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/client.c           |  2 ++
 fs/nfs/internal.h         |  1 +
 fs/nfs/nfs4client.c       | 11 +++++++++--
 include/linux/nfs_fs_sb.h |  1 +
 4 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d7e4f0848e28..fa6953e56a71 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -175,6 +175,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 	clp->cl_rpcclient = ERR_PTR(-EINVAL);
 
 	clp->cl_proto = cl_init->proto;
+	clp->cl_nconnect = cl_init->nconnect;
 	clp->cl_net = get_net(cl_init->net);
 
 	clp->cl_principal = "*";
@@ -493,6 +494,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
 	struct rpc_create_args args = {
 		.net		= clp->cl_net,
 		.protocol	= clp->cl_proto,
+		.nconnect	= clp->cl_nconnect,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrsize	= clp->cl_addrlen,
 		.timeout	= cl_init->timeparms,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bba09dace5d6..4a49dc1495c5 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -82,6 +82,7 @@ struct nfs_client_initdata {
 	struct nfs_subversion *nfs_mod;
 	int proto;
 	u32 minorversion;
+	unsigned int nconnect;
 	struct net *net;
 	const struct rpc_timeout *timeparms;
 	const struct cred *cred;
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 81b9b6d7927a..5c244c440658 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -859,7 +859,8 @@ static int nfs4_set_client(struct nfs_server *server,
 		const size_t addrlen,
 		const char *ip_addr,
 		int proto, const struct rpc_timeout *timeparms,
-		u32 minorversion, struct net *net)
+		u32 minorversion, unsigned int nconnect,
+		struct net *net)
 {
 	struct nfs_client_initdata cl_init = {
 		.hostname = hostname,
@@ -875,6 +876,8 @@ static int nfs4_set_client(struct nfs_server *server,
 	};
 	struct nfs_client *clp;
 
+	if (minorversion > 0 && proto == XPRT_TRANSPORT_TCP)
+		cl_init.nconnect = nconnect;
 	if (server->flags & NFS_MOUNT_NORESVPORT)
 		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 	if (server->options & NFS_OPTION_MIGRATION)
@@ -1074,6 +1077,7 @@ static int nfs4_init_server(struct nfs_server *server,
 			data->nfs_server.protocol,
 			&timeparms,
 			data->minorversion,
+			data->nfs_server.nconnect,
 			data->net);
 	if (error < 0)
 		return error;
@@ -1163,6 +1167,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 				XPRT_TRANSPORT_RDMA,
 				parent_server->client->cl_timeout,
 				parent_client->cl_mvops->minor_version,
+				parent_client->cl_nconnect,
 				parent_client->cl_net);
 	if (!error)
 		goto init_server;
@@ -1176,6 +1181,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 				XPRT_TRANSPORT_TCP,
 				parent_server->client->cl_timeout,
 				parent_client->cl_mvops->minor_version,
+				parent_client->cl_nconnect,
 				parent_client->cl_net);
 	if (error < 0)
 		goto error;
@@ -1271,7 +1277,8 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
 	set_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
 	error = nfs4_set_client(server, hostname, sap, salen, buf,
 				clp->cl_proto, clnt->cl_timeout,
-				clp->cl_minorversion, net);
+				clp->cl_minorversion,
+				clp->cl_nconnect, net);
 	clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
 	if (error != 0) {
 		nfs_server_insert_lists(server);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 1e78032a174b..a87fe854f008 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -58,6 +58,7 @@ struct nfs_client {
 	struct nfs_subversion *	cl_nfs_mod;	/* pointer to nfs version module */
 
 	u32			cl_minorversion;/* NFSv4 minorversion */
+	unsigned int		cl_nconnect;	/* Number of connections */
 	const char *		cl_principal;  /* used for machine cred */
 
 #if IS_ENABLED(CONFIG_NFS_V4)
-- 
cgit v1.2.3


From 5a0c257f8e0f4c4b3c33dff545317c21a921303e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 30 May 2019 10:41:28 +1000
Subject: NFS: send state management on a single connection.

With NFSv4.1, different network connections need to be explicitly
bound to a session.  During session startup, this is not possible
so only a single connection must be used for session startup.

So add a task flag to disable the default round-robin choice of
connections (when nconnect > 1) and force the use of a single
connection.
Then use that flag on all requests for session management - for
consistence, include NFSv4.0 management (SETCLIENTID) and session
destruction

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c            | 22 +++++++++++++---------
 include/linux/sunrpc/sched.h |  1 +
 net/sunrpc/clnt.c            | 24 +++++++++++++++++++++++-
 3 files changed, 37 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 26626ea1f197..d115d9973efc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5992,7 +5992,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_setclientid_ops,
 		.callback_data = &setclientid,
-		.flags = RPC_TASK_TIMEOUT,
+		.flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN,
 	};
 	int status;
 
@@ -6058,7 +6058,8 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
 	dprintk("NFS call  setclientid_confirm auth=%s, (client ID %llx)\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
 		clp->cl_clientid);
-	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_setclientid_confirm(clp, status);
 	dprintk("NFS reply setclientid_confirm: %d\n", status);
 	return status;
@@ -7639,7 +7640,7 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
 		NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
 
 	status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args,
-				&res.seq_res, 0);
+				&res.seq_res, RPC_TASK_NO_ROUND_ROBIN);
 	dprintk("NFS reply  secinfo: %d\n", status);
 
 	put_cred(cred);
@@ -7977,7 +7978,7 @@ nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred,
 		.rpc_client = clp->cl_rpcclient,
 		.callback_ops = &nfs4_exchange_id_call_ops,
 		.rpc_message = &msg,
-		.flags = RPC_TASK_TIMEOUT,
+		.flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN,
 	};
 	struct nfs41_exchange_id_data *calldata;
 	int status;
@@ -8202,7 +8203,8 @@ static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
 	};
 	int status;
 
-	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_destroy_clientid(clp, status);
 	if (status)
 		dprintk("NFS: Got error %d from the server %s on "
@@ -8481,7 +8483,8 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 	nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
 	args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
 
-	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(session->clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_create_session(clp, status);
 
 	switch (status) {
@@ -8557,7 +8560,8 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
 	if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state))
 		return 0;
 
-	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(session->clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_destroy_session(session->clp, status);
 
 	if (status)
@@ -8811,7 +8815,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
 		.rpc_client = clp->cl_rpcclient,
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_reclaim_complete_call_ops,
-		.flags = RPC_TASK_ASYNC,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN,
 	};
 	int status = -ENOMEM;
 
@@ -9330,7 +9334,7 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
 
 	dprintk("--> %s\n", __func__);
 	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
-				&res.seq_res, 0);
+				&res.seq_res, RPC_TASK_NO_ROUND_ROBIN);
 	dprintk("<-- %s status=%d\n", __func__, status);
 
 	put_cred(cred);
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index d0e451868f02..11424bdf09e6 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -126,6 +126,7 @@ struct rpc_task_setup {
 #define RPC_CALL_MAJORSEEN	0x0020		/* major timeout seen */
 #define RPC_TASK_ROOTCREDS	0x0040		/* force root creds */
 #define RPC_TASK_DYNAMIC	0x0080		/* task was kmalloc'ed */
+#define	RPC_TASK_NO_ROUND_ROBIN	0x0100		/* send requests on "main" xprt */
 #define RPC_TASK_SOFT		0x0200		/* Use soft timeouts */
 #define RPC_TASK_SOFTCONN	0x0400		/* Fail if can't connect */
 #define RPC_TASK_SENT		0x0800		/* message was sent */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b6aca8cb5ae6..d599fab8adcb 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -995,6 +995,24 @@ rpc_task_get_xprt(struct rpc_clnt *clnt)
 	return xprt;
 }
 
+static struct rpc_xprt *
+rpc_task_get_first_xprt(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt_switch *xps;
+	struct rpc_xprt *xprt;
+
+	rcu_read_lock();
+	xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+	if (xprt) {
+		atomic_long_inc(&xprt->queuelen);
+		xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+		atomic_long_inc(&xps->xps_queuelen);
+	}
+	rcu_read_unlock();
+
+	return xprt;
+}
+
 static void
 rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
 {
@@ -1042,7 +1060,11 @@ void rpc_task_release_client(struct rpc_task *task)
 static
 void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
 {
-	if (!task->tk_xprt)
+	if (task->tk_xprt)
+		return;
+	if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
+		task->tk_xprt = rpc_task_get_first_xprt(clnt);
+	else
 		task->tk_xprt = rpc_task_get_xprt(clnt);
 }
 
-- 
cgit v1.2.3


From a332518fda4731c07394164b3edcbb6efaf4c4d7 Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Thu, 23 May 2019 16:13:50 -0400
Subject: SUNRPC: Count ops completing with tk_status < 0

We often see various error conditions with NFS4.x that show up with
a very high operation count all completing with tk_status < 0 in a
short period of time.  Add a count to rpc_iostats to record on a
per-op basis the ops that complete in this manner, which will
enable lower overhead diagnostics.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/metrics.h | 7 ++++++-
 net/sunrpc/stats.c             | 8 ++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h
index 1b3751327575..0ee3f7052846 100644
--- a/include/linux/sunrpc/metrics.h
+++ b/include/linux/sunrpc/metrics.h
@@ -30,7 +30,7 @@
 #include <linux/ktime.h>
 #include <linux/spinlock.h>
 
-#define RPC_IOSTATS_VERS	"1.0"
+#define RPC_IOSTATS_VERS	"1.1"
 
 struct rpc_iostats {
 	spinlock_t		om_lock;
@@ -66,6 +66,11 @@ struct rpc_iostats {
 	ktime_t			om_queue,	/* queued for xmit */
 				om_rtt,		/* RPC RTT */
 				om_execute;	/* RPC execution */
+	/*
+	 * The count of operations that complete with tk_status < 0.
+	 * These statuses usually indicate error conditions.
+	 */
+	unsigned long           om_error_status;
 } ____cacheline_aligned;
 
 struct rpc_task;
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 2f7bde82450b..48ea776364f8 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -177,6 +177,8 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
 
 	execute = ktime_sub(now, task->tk_start);
 	op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute);
+	if (task->tk_status < 0)
+		op_metrics->om_error_status++;
 
 	spin_unlock(&op_metrics->om_lock);
 
@@ -219,13 +221,14 @@ static void _add_rpc_iostats(struct rpc_iostats *a, struct rpc_iostats *b)
 	a->om_queue = ktime_add(a->om_queue, b->om_queue);
 	a->om_rtt = ktime_add(a->om_rtt, b->om_rtt);
 	a->om_execute = ktime_add(a->om_execute, b->om_execute);
+	a->om_error_status += b->om_error_status;
 }
 
 static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats,
 			       int op, const struct rpc_procinfo *procs)
 {
 	_print_name(seq, op, procs);
-	seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu\n",
+	seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu %lu\n",
 		   stats->om_ops,
 		   stats->om_ntrans,
 		   stats->om_timeouts,
@@ -233,7 +236,8 @@ static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats,
 		   stats->om_bytes_recv,
 		   ktime_to_ms(stats->om_queue),
 		   ktime_to_ms(stats->om_rtt),
-		   ktime_to_ms(stats->om_execute));
+		   ktime_to_ms(stats->om_execute),
+		   stats->om_error_status);
 }
 
 void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt)
-- 
cgit v1.2.3


From 1dd7382b1bb608e7ccae3672621eaceca355ae8b Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Mon, 1 Jul 2019 21:14:01 +0300
Subject: net/mlx5: Introduce VHCA tunnel device capability

When using the device emulation feature (introduced in Bluefield-1 SOC),
a privileged function (the device emulation manager) will be able to
create a channel to execute commands on behalf of the emulated function.

This channel will be a general object of type VHCA_TUNNEL that will have
a unique ID for each emulated function. This ID will be passed in each
cmd that will be issued by the emulation SW in a well known offset in
the command header.

This channel is needed since the emulated function doesn't have a normal
command interface to the HCA HW, but some basic configuration for that
function is needed (e.g. initialize and enable the HCA). For that matter,
a specific command-set was defined and only those commands will be issued
by the HCA.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 06881b79167e..ba60bd17a92a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1390,7 +1390,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   reserved_at_6c8[0x28];
 	u8	   sf_base_id[0x10];
 
-	u8	   reserved_at_700[0x100];
+	u8	   reserved_at_700[0x80];
+	u8	   vhca_tunnel_commands[0x40];
+	u8	   reserved_at_7c0[0x40];
 };
 
 enum mlx5_flow_destination_type {
@@ -9694,7 +9696,7 @@ struct mlx5_ifc_general_obj_in_cmd_hdr_bits {
 	u8         opcode[0x10];
 	u8         uid[0x10];
 
-	u8         reserved_at_20[0x10];
+	u8         vhca_tunnel_id[0x10];
 	u8         obj_type[0x10];
 
 	u8         obj_id[0x20];
-- 
cgit v1.2.3


From df616d7a442b90798d63fbf4447154bbbb9040b1 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 27 Jun 2019 16:07:45 +0900
Subject: mtd: abi: do not use C++ style comments in uapi header

Linux kernel tolerates C++ style comments these days. Actually, the
SPDX License tags for .c files start with //.

On the other hand, uapi headers are written in more strict C, where
the C++ comment style is forbidden.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 include/uapi/mtd/mtd-abi.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/mtd/mtd-abi.h b/include/uapi/mtd/mtd-abi.h
index aff5b5e59845..47ffe3208c27 100644
--- a/include/uapi/mtd/mtd-abi.h
+++ b/include/uapi/mtd/mtd-abi.h
@@ -113,11 +113,11 @@ struct mtd_write_req {
 #define MTD_CAP_NVRAM		(MTD_WRITEABLE | MTD_BIT_WRITEABLE | MTD_NO_ERASE)
 
 /* Obsolete ECC byte placement modes (used with obsolete MEMGETOOBSEL) */
-#define MTD_NANDECC_OFF		0	// Switch off ECC (Not recommended)
-#define MTD_NANDECC_PLACE	1	// Use the given placement in the structure (YAFFS1 legacy mode)
-#define MTD_NANDECC_AUTOPLACE	2	// Use the default placement scheme
-#define MTD_NANDECC_PLACEONLY	3	// Use the given placement in the structure (Do not store ecc result on read)
-#define MTD_NANDECC_AUTOPL_USR 	4	// Use the given autoplacement scheme rather than using the default
+#define MTD_NANDECC_OFF		0	/* Switch off ECC (Not recommended) */
+#define MTD_NANDECC_PLACE	1	/* Use the given placement in the structure (YAFFS1 legacy mode) */
+#define MTD_NANDECC_AUTOPLACE	2	/* Use the default placement scheme */
+#define MTD_NANDECC_PLACEONLY	3	/* Use the given placement in the structure (Do not store ecc result on read) */
+#define MTD_NANDECC_AUTOPL_USR 	4	/* Use the given autoplacement scheme rather than using the default */
 
 /* OTP mode selection */
 #define MTD_OTP_OFF		0
-- 
cgit v1.2.3


From 97a385e558292ba0851906783642239865670a5f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 1 May 2019 16:40:32 -0400
Subject: libceph: remove ceph_get_direct_page_vector()

This function is entirely unused.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h |  4 ----
 net/ceph/pagevec.c           | 33 ---------------------------------
 2 files changed, 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 337d5049ff93..a3cddf5f0e60 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -299,10 +299,6 @@ int ceph_wait_for_latest_osdmap(struct ceph_client *client,
 
 /* pagevec.c */
 extern void ceph_release_page_vector(struct page **pages, int num_pages);
-
-extern struct page **ceph_get_direct_page_vector(const void __user *data,
-						 int num_pages,
-						 bool write_page);
 extern void ceph_put_page_vector(struct page **pages, int num_pages,
 				 bool dirty);
 extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 74cafc0142ea..64305e7056a1 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -10,39 +10,6 @@
 
 #include <linux/ceph/libceph.h>
 
-/*
- * build a vector of user pages
- */
-struct page **ceph_get_direct_page_vector(const void __user *data,
-					  int num_pages, bool write_page)
-{
-	struct page **pages;
-	int got = 0;
-	int rc = 0;
-
-	pages = kmalloc_array(num_pages, sizeof(*pages), GFP_NOFS);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	while (got < num_pages) {
-		rc = get_user_pages_fast(
-		    (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
-		    num_pages - got, write_page ? FOLL_WRITE : 0, pages + got);
-		if (rc < 0)
-			break;
-		BUG_ON(rc == 0);
-		got += rc;
-	}
-	if (rc < 0)
-		goto fail;
-	return pages;
-
-fail:
-	ceph_put_page_vector(pages, got, false);
-	return ERR_PTR(rc);
-}
-EXPORT_SYMBOL(ceph_get_direct_page_vector);
-
 void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
 {
 	int i;
-- 
cgit v1.2.3


From 6c37f0e64173571914a443f74d36e5a22dabfc05 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 3 Jun 2019 14:45:16 -0400
Subject: libceph: add ceph_decode_entity_addr

Add a function for decoding an entity_addr_t. Once
CEPH_FEATURE_MSG_ADDR2 is enabled, the server daemons will start
encoding entity_addr_t differently.

Add a new helper function that can handle either format.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/decode.h |  2 +
 net/ceph/Makefile           |  2 +-
 net/ceph/decode.c           | 90 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 net/ceph/decode.c

(limited to 'include')

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index a6c2a48d42e0..1c0a665bfc03 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -230,6 +230,8 @@ static inline void ceph_decode_addr(struct ceph_entity_addr *a)
 	WARN_ON(a->in_addr.ss_family == 512);
 }
 
+extern int ceph_decode_entity_addr(void **p, void *end,
+				   struct ceph_entity_addr *addr);
 /*
  * encoders
  */
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index db09defe27d0..59d0ba2072de 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_CEPH_LIB) += libceph.o
 
 libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
-	mon_client.o \
+	mon_client.o decode.o \
 	cls_lock_client.o \
 	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
 	striper.o \
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
new file mode 100644
index 000000000000..b82981199549
--- /dev/null
+++ b/net/ceph/decode.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ceph/decode.h>
+
+static int
+ceph_decode_entity_addr_versioned(void **p, void *end,
+				  struct ceph_entity_addr *addr)
+{
+	int ret;
+	u8 struct_v;
+	u32 struct_len, addr_len;
+	void *struct_end;
+
+	ret = ceph_start_decoding(p, end, 1, "entity_addr_t", &struct_v,
+				  &struct_len);
+	if (ret)
+		goto bad;
+
+	ret = -EINVAL;
+	struct_end = *p + struct_len;
+
+	ceph_decode_copy_safe(p, end, &addr->type, sizeof(addr->type), bad);
+
+	/*
+	 * TYPE_NONE == 0
+	 * TYPE_LEGACY == 1
+	 *
+	 * Clients that don't support ADDR2 always send TYPE_NONE.
+	 * For now, since all we support is msgr1, just set this to 0
+	 * when we get a TYPE_LEGACY type.
+	 */
+	if (addr->type == cpu_to_le32(1))
+		addr->type = 0;
+
+	ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+
+	ceph_decode_32_safe(p, end, addr_len, bad);
+	if (addr_len > sizeof(addr->in_addr))
+		goto bad;
+
+	memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+	if (addr_len) {
+		ceph_decode_copy_safe(p, end, &addr->in_addr, addr_len, bad);
+
+		addr->in_addr.ss_family =
+			le16_to_cpu((__force __le16)addr->in_addr.ss_family);
+	}
+
+	/* Advance past anything the client doesn't yet understand */
+	*p = struct_end;
+	ret = 0;
+bad:
+	return ret;
+}
+
+static int
+ceph_decode_entity_addr_legacy(void **p, void *end,
+			       struct ceph_entity_addr *addr)
+{
+	int ret = -EINVAL;
+
+	/* Skip rest of type field */
+	ceph_decode_skip_n(p, end, 3, bad);
+	addr->type = 0;
+	ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+	memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+	ceph_decode_copy_safe(p, end, &addr->in_addr,
+			      sizeof(addr->in_addr), bad);
+	addr->in_addr.ss_family =
+			be16_to_cpu((__force __be16)addr->in_addr.ss_family);
+	ret = 0;
+bad:
+	return ret;
+}
+
+int
+ceph_decode_entity_addr(void **p, void *end, struct ceph_entity_addr *addr)
+{
+	u8 marker;
+
+	ceph_decode_8_safe(p, end, marker, bad);
+	if (marker == 1)
+		return ceph_decode_entity_addr_versioned(p, end, addr);
+	else if (marker == 0)
+		return ceph_decode_entity_addr_legacy(p, end, addr);
+bad:
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addr);
+
-- 
cgit v1.2.3


From 0bfb0f288992adbf8d1f0d5f22f0fd398b146316 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 31 May 2019 15:32:28 -0400
Subject: libceph: ADDR2 support for monmap

Switch the MonMap decoder to use the new decoding routine for
entity_addr_t's.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/mon_client.h |  1 -
 net/ceph/mon_client.c           | 21 +++++++++++++--------
 2 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index 3a4688af7455..b4d134d3312a 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -104,7 +104,6 @@ struct ceph_mon_client {
 #endif
 };
 
-extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
 extern int ceph_monmap_contains(struct ceph_monmap *m,
 				struct ceph_entity_addr *addr);
 
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 895679d3529b..0520bf9825aa 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -39,7 +39,7 @@ static int __validate_auth(struct ceph_mon_client *monc);
 /*
  * Decode a monmap blob (e.g., during mount).
  */
-struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+static struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 {
 	struct ceph_monmap *m = NULL;
 	int i, err = -EINVAL;
@@ -50,7 +50,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	ceph_decode_32_safe(&p, end, len, bad);
 	ceph_decode_need(&p, end, len, bad);
 
-	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+	dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p));
 	p += sizeof(u16);  /* skip version */
 
 	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
@@ -58,7 +58,6 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	epoch = ceph_decode_32(&p);
 
 	num_mon = ceph_decode_32(&p);
-	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
 
 	if (num_mon > CEPH_MAX_MON)
 		goto bad;
@@ -68,17 +67,22 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	m->fsid = fsid;
 	m->epoch = epoch;
 	m->num_mon = num_mon;
-	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
-	for (i = 0; i < num_mon; i++)
-		ceph_decode_addr(&m->mon_inst[i].addr);
-
+	for (i = 0; i < num_mon; ++i) {
+		struct ceph_entity_inst *inst = &m->mon_inst[i];
+
+		/* copy name portion */
+		ceph_decode_copy_safe(&p, end, &inst->name,
+					sizeof(inst->name), bad);
+		err = ceph_decode_entity_addr(&p, end, &inst->addr);
+		if (err)
+			goto bad;
+	}
 	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
 	     m->num_mon);
 	for (i = 0; i < m->num_mon; i++)
 		dout("monmap_decode  mon%d is %s\n", i,
 		     ceph_pr_addr(&m->mon_inst[i].addr));
 	return m;
-
 bad:
 	dout("monmap_decode failed with %d\n", err);
 	kfree(m);
@@ -469,6 +473,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 	if (IS_ERR(monmap)) {
 		pr_err("problem decoding monmap, %d\n",
 		       (int)PTR_ERR(monmap));
+		ceph_msg_dump(msg);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From d3c3c0a841d5dafc5395be363996d619255a732f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 17 Jun 2019 06:57:25 -0400
Subject: libceph: use TYPE_LEGACY for entity addrs instead of TYPE_NONE

Going forward, we'll have different address types so let's use
the addr2 TYPE_LEGACY for internal tracking rather than TYPE_NONE.

Also, make ceph_pr_addr print the address type value as well.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/decode.h |  7 +++++++
 net/ceph/decode.c           | 18 ++++++------------
 net/ceph/messenger.c        |  7 +++++--
 3 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 1c0a665bfc03..ce488d95be89 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -218,16 +218,23 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv,
 /*
  * sockaddr_storage <-> ceph_sockaddr
  */
+#define CEPH_ENTITY_ADDR_TYPE_NONE	0
+#define CEPH_ENTITY_ADDR_TYPE_LEGACY	__cpu_to_le32(1)
+
 static inline void ceph_encode_addr(struct ceph_entity_addr *a)
 {
 	__be16 ss_family = htons(a->in_addr.ss_family);
 	a->in_addr.ss_family = *(__u16 *)&ss_family;
+
+	/* Banner addresses require TYPE_NONE */
+	a->type = CEPH_ENTITY_ADDR_TYPE_NONE;
 }
 static inline void ceph_decode_addr(struct ceph_entity_addr *a)
 {
 	__be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
 	a->in_addr.ss_family = ntohs(ss_family);
 	WARN_ON(a->in_addr.ss_family == 512);
+	a->type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
 }
 
 extern int ceph_decode_entity_addr(void **p, void *end,
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
index b82981199549..eea529595a7a 100644
--- a/net/ceph/decode.c
+++ b/net/ceph/decode.c
@@ -21,17 +21,6 @@ ceph_decode_entity_addr_versioned(void **p, void *end,
 
 	ceph_decode_copy_safe(p, end, &addr->type, sizeof(addr->type), bad);
 
-	/*
-	 * TYPE_NONE == 0
-	 * TYPE_LEGACY == 1
-	 *
-	 * Clients that don't support ADDR2 always send TYPE_NONE.
-	 * For now, since all we support is msgr1, just set this to 0
-	 * when we get a TYPE_LEGACY type.
-	 */
-	if (addr->type == cpu_to_le32(1))
-		addr->type = 0;
-
 	ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
 
 	ceph_decode_32_safe(p, end, addr_len, bad);
@@ -61,7 +50,12 @@ ceph_decode_entity_addr_legacy(void **p, void *end,
 
 	/* Skip rest of type field */
 	ceph_decode_skip_n(p, end, 3, bad);
-	addr->type = 0;
+
+	/*
+	 * Clients that don't support ADDR2 always send TYPE_NONE, change it
+	 * to TYPE_LEGACY for forward compatibility.
+	 */
+	addr->type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
 	ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
 	memset(&addr->in_addr, 0, sizeof(addr->in_addr));
 	ceph_decode_copy_safe(p, end, &addr->in_addr,
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 8d0c51dd4666..0a3ef33cf7ac 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -199,12 +199,14 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
 
 	switch (ss.ss_family) {
 	case AF_INET:
-		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr,
+		snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu",
+			 le32_to_cpu(addr->type), &in4->sin_addr,
 			 ntohs(in4->sin_port));
 		break;
 
 	case AF_INET6:
-		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr,
+		snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu",
+			 le32_to_cpu(addr->type), &in6->sin6_addr,
 			 ntohs(in6->sin6_port));
 		break;
 
@@ -1982,6 +1984,7 @@ int ceph_parse_ips(const char *c, const char *end,
 		}
 
 		addr_set_port(&addr[i], port);
+		addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
 
 		dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
 
-- 
cgit v1.2.3


From 2c66de560fa2dda0a600e908897116914db8f500 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 17 Jun 2019 09:24:31 -0400
Subject: libceph: rename ceph_encode_addr to ceph_encode_banner_addr

...ditto for the decode function. We only use these functions to fix
up banner addresses now, so let's name them more appropriately.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/decode.h | 4 ++--
 net/ceph/messenger.c        | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index ce488d95be89..450384fe487c 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -221,7 +221,7 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv,
 #define CEPH_ENTITY_ADDR_TYPE_NONE	0
 #define CEPH_ENTITY_ADDR_TYPE_LEGACY	__cpu_to_le32(1)
 
-static inline void ceph_encode_addr(struct ceph_entity_addr *a)
+static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a)
 {
 	__be16 ss_family = htons(a->in_addr.ss_family);
 	a->in_addr.ss_family = *(__u16 *)&ss_family;
@@ -229,7 +229,7 @@ static inline void ceph_encode_addr(struct ceph_entity_addr *a)
 	/* Banner addresses require TYPE_NONE */
 	a->type = CEPH_ENTITY_ADDR_TYPE_NONE;
 }
-static inline void ceph_decode_addr(struct ceph_entity_addr *a)
+static inline void ceph_decode_banner_addr(struct ceph_entity_addr *a)
 {
 	__be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
 	a->in_addr.ss_family = ntohs(ss_family);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 0a3ef33cf7ac..0473d9a7b1f4 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -222,7 +222,7 @@ EXPORT_SYMBOL(ceph_pr_addr);
 static void encode_my_addr(struct ceph_messenger *msgr)
 {
 	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
-	ceph_encode_addr(&msgr->my_enc_addr);
+	ceph_encode_banner_addr(&msgr->my_enc_addr);
 }
 
 /*
@@ -1734,14 +1734,14 @@ static int read_partial_banner(struct ceph_connection *con)
 	ret = read_partial(con, end, size, &con->actual_peer_addr);
 	if (ret <= 0)
 		goto out;
-	ceph_decode_addr(&con->actual_peer_addr);
+	ceph_decode_banner_addr(&con->actual_peer_addr);
 
 	size = sizeof (con->peer_addr_for_me);
 	end += size;
 	ret = read_partial(con, end, size, &con->peer_addr_for_me);
 	if (ret <= 0)
 		goto out;
-	ceph_decode_addr(&con->peer_addr_for_me);
+	ceph_decode_banner_addr(&con->peer_addr_for_me);
 
 out:
 	return ret;
-- 
cgit v1.2.3


From 6adaaafdd81d5c01875fe233ab73deb81b34caa1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 31 May 2019 12:24:22 -0400
Subject: libceph: turn on CEPH_FEATURE_MSG_ADDR2

Now that the client can handle either address formatting, advertise to
the peer that we can support it.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_features.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 65a38c4a02a1..39e6f4c57580 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -211,6 +211,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
 	 CEPH_FEATURE_MON_STATEFUL_SUB |	\
 	 CEPH_FEATURE_CRUSH_TUNABLES5 |		\
 	 CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING |	\
+	 CEPH_FEATURE_MSG_ADDR2 |		\
 	 CEPH_FEATURE_CEPHX_V2)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT	0
-- 
cgit v1.2.3


From 441d367644e2f60b37f36bfc656deee551acba5b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 5 Jun 2019 17:24:22 -0400
Subject: iversion: add a routine to update a raw value with a larger one

Under ceph, clients can be independently updating iversion themselves,
while working under comprehensive sets of caps on an inode. In that
situation we always want to prefer the largest value of a change
attribute. Add a new function that will update a raw value with a larger
one, but otherwise leave it alone.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/iversion.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/linux/iversion.h b/include/linux/iversion.h
index be50ef7cedab..2917ef990d43 100644
--- a/include/linux/iversion.h
+++ b/include/linux/iversion.h
@@ -112,6 +112,30 @@ inode_peek_iversion_raw(const struct inode *inode)
 	return atomic64_read(&inode->i_version);
 }
 
+/**
+ * inode_set_max_iversion_raw - update i_version new value is larger
+ * @inode: inode to set
+ * @val: new i_version to set
+ *
+ * Some self-managed filesystems (e.g Ceph) will only update the i_version
+ * value if the new value is larger than the one we already have.
+ */
+static inline void
+inode_set_max_iversion_raw(struct inode *inode, u64 val)
+{
+	u64 cur, old;
+
+	cur = inode_peek_iversion_raw(inode);
+	for (;;) {
+		if (cur > val)
+			break;
+		old = atomic64_cmpxchg(&inode->i_version, cur, val);
+		if (likely(old == cur))
+			break;
+		cur = old;
+	}
+}
+
 /**
  * inode_set_iversion - set i_version to a particular value
  * @inode: inode to set
-- 
cgit v1.2.3


From 49ada6e8dc9f64ad1e8dd6f7b453c9e584e9f897 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Thu, 20 Jun 2019 12:09:08 +0800
Subject: ceph: more precise CEPH_CLIENT_CAPS_PENDING_CAPSNAP

Client uses this flag to tell mds if there is more cap snap need to
flush. It's mainly for the case that client needs to re-send cap/snap
flushes after mds failover, but CEPH_CAP_ANY_FILE_WR on corresponding
inodes are all released before mds failover.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c               | 41 ++++++++++++++++++++++++++++++-----------
 include/linux/ceph/ceph_fs.h |  2 +-
 2 files changed, 31 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f9055cdec3c7..d98dcd976c80 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1295,7 +1295,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
  * caller should hold snap_rwsem (read), s_mutex.
  */
 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
-		      int op, bool sync, int used, int want, int retain,
+		      int op, int flags, int used, int want, int retain,
 		      int flushing, u64 flush_tid, u64 oldest_flush_tid)
 	__releases(cap->ci->i_ceph_lock)
 {
@@ -1393,12 +1393,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	arg.mode = inode->i_mode;
 
 	arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
-	if (list_empty(&ci->i_cap_snaps))
-		arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP;
-	else
-		arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
-	if (sync)
-		arg.flags |= CEPH_CLIENT_CAPS_SYNC;
+	if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
+	    !list_empty(&ci->i_cap_snaps)) {
+		struct ceph_cap_snap *capsnap;
+		list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
+			if (capsnap->cap_flush.tid)
+				break;
+			if (capsnap->need_flush) {
+				flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
+				break;
+			}
+		}
+	}
+	arg.flags = flags;
 
 	spin_unlock(&ci->i_ceph_lock);
 
@@ -2085,7 +2092,7 @@ ack:
 		sent++;
 
 		/* __send_cap drops i_ceph_lock */
-		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
+		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
 				cap_used, want, retain, flushing,
 				flush_tid, oldest_flush_tid);
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
@@ -2155,7 +2162,8 @@ retry_locked:
 						&flush_tid, &oldest_flush_tid);
 
 		/* __send_cap drops i_ceph_lock */
-		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true,
+		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+				     CEPH_CLIENT_CAPS_SYNC,
 				     __ceph_caps_used(ci),
 				     __ceph_caps_wanted(ci),
 				     (cap->issued | cap->implemented),
@@ -2328,9 +2336,17 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
 	struct ceph_cap_flush *cf;
 	int ret;
 	u64 first_tid = 0;
+	u64 last_snap_flush = 0;
 
 	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
 
+	list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
+		if (!cf->caps) {
+			last_snap_flush = cf->tid;
+			break;
+		}
+	}
+
 	list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
 		if (cf->tid < first_tid)
 			continue;
@@ -2348,10 +2364,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
 			dout("kick_flushing_caps %p cap %p tid %llu %s\n",
 			     inode, cap, cf->tid, ceph_cap_string(cf->caps));
 			ci->i_ceph_flags |= CEPH_I_NODELAY;
+
 			ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-					  false, __ceph_caps_used(ci),
+					 (cf->tid < last_snap_flush ?
+					  CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
+					  __ceph_caps_used(ci),
 					  __ceph_caps_wanted(ci),
-					  cap->issued | cap->implemented,
+					  (cap->issued | cap->implemented),
 					  cf->caps, cf->tid, oldest_flush_tid);
 			if (ret) {
 				pr_err("kick_flushing_caps: error sending "
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 3ac0feaf2b5e..cb21c5cf12c3 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -682,7 +682,7 @@ extern const char *ceph_cap_op_name(int op);
 /* flags field in client cap messages (version >= 10) */
 #define CEPH_CLIENT_CAPS_SYNC			(1<<0)
 #define CEPH_CLIENT_CAPS_NO_CAPSNAP		(1<<1)
-#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP	(1<<2);
+#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP	(1<<2)
 
 /*
  * caps message, used for capability callbacks, acks, requests, etc.
-- 
cgit v1.2.3


From 94e85771881027e62afdddadd31e3eec73025990 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 8 Jul 2019 12:50:09 +0200
Subject: libceph: rename r_unsafe_item to r_private_item

This list item remained from when we had safe and unsafe replies
(commit vs ack).  It has since become a private list item for use by
clients.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/file.c                  | 6 +++---
 include/linux/ceph/osd_client.h | 2 +-
 net/ceph/osd_client.c           | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a06090c8281e..d5bee928603a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1026,7 +1026,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			req->r_callback = ceph_aio_complete_req;
 			req->r_inode = inode;
 			req->r_priv = aio_req;
-			list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
+			list_add_tail(&req->r_private_item, &aio_req->osd_reqs);
 
 			pos += len;
 			continue;
@@ -1086,8 +1086,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 		while (!list_empty(&osd_reqs)) {
 			req = list_first_entry(&osd_reqs,
 					       struct ceph_osd_request,
-					       r_unsafe_item);
-			list_del_init(&req->r_unsafe_item);
+					       r_private_item);
+			list_del_init(&req->r_private_item);
 			if (ret >= 0)
 				ret = ceph_osdc_start_request(req->r_osdc,
 							      req, false);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 2294f963dab7..024f6fed0ac5 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -198,9 +198,9 @@ struct ceph_osd_request {
 	bool              r_mempool;
 	struct completion r_completion;       /* private to osd_client.c */
 	ceph_osdc_callback_t r_callback;
-	struct list_head  r_unsafe_item;
 
 	struct inode *r_inode;         	      /* for use by callbacks */
+	struct list_head r_private_item;      /* ditto */
 	void *r_priv;			      /* ditto */
 
 	/* set by submitter */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 54170a35ecec..6495982c5c07 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -478,7 +478,7 @@ static void request_release_checks(struct ceph_osd_request *req)
 {
 	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
 	WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
-	WARN_ON(!list_empty(&req->r_unsafe_item));
+	WARN_ON(!list_empty(&req->r_private_item));
 	WARN_ON(req->r_osd);
 }
 
@@ -538,7 +538,7 @@ static void request_init(struct ceph_osd_request *req)
 	init_completion(&req->r_completion);
 	RB_CLEAR_NODE(&req->r_node);
 	RB_CLEAR_NODE(&req->r_mc_node);
-	INIT_LIST_HEAD(&req->r_unsafe_item);
+	INIT_LIST_HEAD(&req->r_private_item);
 
 	target_init(&req->r_t);
 }
-- 
cgit v1.2.3


From ef83171b49c66d851a1a0dc6da5b4a4d8ee6ce9a Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 8 Apr 2019 14:16:05 +0200
Subject: libceph: bump CEPH_MSG_MAX_DATA_LEN (again)

This time for rbd object map.  Object maps are limited in size to
256000000 objects, two bits per object.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
---
 include/linux/ceph/libceph.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index a3cddf5f0e60..82156da3c650 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -84,11 +84,13 @@ struct ceph_options {
 #define CEPH_MSG_MAX_MIDDLE_LEN	(16*1024*1024)
 
 /*
- * Handle the largest possible rbd object in one message.
+ * The largest possible rbd data object is 32M.
+ * The largest possible rbd object map object is 64M.
+ *
  * There is no limit on the size of cephfs objects, but it has to obey
  * rsize and wsize mount options anyway.
  */
-#define CEPH_MSG_MAX_DATA_LEN	(32*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN	(64*1024*1024)
 
 #define CEPH_AUTH_NAME_DEFAULT   "guest"
 
-- 
cgit v1.2.3


From 68ada915eea10f36760ffe414810390a104df093 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 14 Jun 2019 18:16:51 +0200
Subject: libceph: change ceph_osdc_call() to take page vector for response

This will be used for loading object map.  rbd_obj_read_sync() isn't
suitable because object map must be accessed through class methods.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 drivers/block/rbd.c             |  8 ++++----
 include/linux/ceph/osd_client.h |  2 +-
 net/ceph/cls_lock_client.c      |  2 +-
 net/ceph/osd_client.c           | 10 +++++-----
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6d1df82eb883..f0814c148b1c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4076,7 +4076,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
 
 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
-			     reply_page, &inbound_size);
+			     &reply_page, &inbound_size);
 	if (!ret) {
 		memcpy(inbound, page_address(reply_page), inbound_size);
 		ret = inbound_size;
@@ -5102,7 +5102,7 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
 
 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
-			     req_page, sizeof(u64), reply_page, &reply_len);
+			     req_page, sizeof(u64), &reply_page, &reply_len);
 	if (ret)
 		return ret == -EOPNOTSUPP ? 1 : ret;
 
@@ -5114,7 +5114,7 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
 
 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
-			     req_page, sizeof(u64), reply_page, &reply_len);
+			     req_page, sizeof(u64), &reply_page, &reply_len);
 	if (ret)
 		return ret;
 
@@ -5145,7 +5145,7 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
 
 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
-			     req_page, sizeof(u64), reply_page, &reply_len);
+			     req_page, sizeof(u64), &reply_page, &reply_len);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 024f6fed0ac5..c567cfa4f107 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -497,7 +497,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 		   const char *class, const char *method,
 		   unsigned int flags,
 		   struct page *req_page, size_t req_len,
-		   struct page *resp_page, size_t *resp_len);
+		   struct page **resp_pages, size_t *resp_len);
 
 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 			       struct ceph_vino vino,
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index b1d12bf4b83e..fb59094caf13 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -363,7 +363,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
 	dout("%s lock_name %s\n", __func__, lock_name);
 	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info",
 			     CEPH_OSD_FLAG_READ, get_info_op_page,
-			     get_info_op_buf_size, reply_page, &reply_len);
+			     get_info_op_buf_size, &reply_page, &reply_len);
 
 	dout("%s: status %d\n", __func__, ret);
 	if (ret >= 0) {
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 6495982c5c07..a90fbfce7e93 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -5050,12 +5050,12 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 		   const char *class, const char *method,
 		   unsigned int flags,
 		   struct page *req_page, size_t req_len,
-		   struct page *resp_page, size_t *resp_len)
+		   struct page **resp_pages, size_t *resp_len)
 {
 	struct ceph_osd_request *req;
 	int ret;
 
-	if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
+	if (req_len > PAGE_SIZE)
 		return -E2BIG;
 
 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
@@ -5073,8 +5073,8 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 	if (req_page)
 		osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
 						  0, false, false);
-	if (resp_page)
-		osd_req_op_cls_response_data_pages(req, 0, &resp_page,
+	if (resp_pages)
+		osd_req_op_cls_response_data_pages(req, 0, resp_pages,
 						   *resp_len, 0, false, false);
 
 	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
@@ -5085,7 +5085,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 	ret = ceph_osdc_wait_request(osdc, req);
 	if (ret >= 0) {
 		ret = req->r_ops[0].rval;
-		if (resp_page)
+		if (resp_pages)
 			*resp_len = req->r_ops[0].outdata_len;
 	}
 
-- 
cgit v1.2.3


From 4cf3e6dff7ea517544e1da7810a0b3ebba380d2c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 14 Jun 2019 18:00:19 +0200
Subject: libceph: export osd_req_op_data() macro

We already have one exported wrapper around it for extent.osd_data and
rbd_object_map_update_finish() needs another one for cls.request_data.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/ceph/osd_client.h | 8 ++++++++
 net/ceph/osd_client.c           | 8 --------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index c567cfa4f107..ad7fe5d10dcd 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -389,6 +389,14 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
 void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err);
 
+#define osd_req_op_data(oreq, whch, typ, fld)				\
+({									\
+	struct ceph_osd_request *__oreq = (oreq);			\
+	unsigned int __whch = (whch);					\
+	BUG_ON(__whch >= __oreq->r_num_ops);				\
+	&__oreq->r_ops[__whch].typ.fld;					\
+})
+
 extern void osd_req_op_init(struct ceph_osd_request *osd_req,
 			    unsigned int which, u16 opcode, u32 flags);
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index a90fbfce7e93..0b2df09b2554 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -171,14 +171,6 @@ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
 	osd_data->num_bvecs = num_bvecs;
 }
 
-#define osd_req_op_data(oreq, whch, typ, fld)				\
-({									\
-	struct ceph_osd_request *__oreq = (oreq);			\
-	unsigned int __whch = (whch);					\
-	BUG_ON(__whch >= __oreq->r_num_ops);				\
-	&__oreq->r_ops[__whch].typ.fld;					\
-})
-
 static struct ceph_osd_data *
 osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
 {
-- 
cgit v1.2.3


From 22e8bd51bb0469d1a524130a057f894ff632376a Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 5 Jun 2019 19:25:11 +0200
Subject: rbd: support for object-map and fast-diff

Speed up reads, discards and zeroouts through RBD_OBJ_FLAG_MAY_EXIST
and RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT based on object map.

Invalid object maps are not trusted, but still updated.  Note that we
never iterate, resize or invalidate object maps.  If object-map feature
is enabled but object map fails to load, we just fail the requester
(either "rbd map" or I/O, by way of post-acquire action).

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c                  | 720 ++++++++++++++++++++++++++++++++++-
 drivers/block/rbd_types.h            |  10 +
 include/linux/ceph/cls_lock_client.h |   3 +
 include/linux/ceph/striper.h         |   2 +
 net/ceph/cls_lock_client.c           |  45 +++
 net/ceph/striper.c                   |  17 +
 6 files changed, 794 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 3d861d3013f8..0df91665c4eb 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -115,6 +115,8 @@ static int atomic_dec_return_safe(atomic_t *v)
 #define RBD_FEATURE_LAYERING		(1ULL<<0)
 #define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
 #define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
+#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
+#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
 #define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
 #define RBD_FEATURE_DATA_POOL		(1ULL<<7)
 #define RBD_FEATURE_OPERATIONS		(1ULL<<8)
@@ -122,6 +124,8 @@ static int atomic_dec_return_safe(atomic_t *v)
 #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
 				 RBD_FEATURE_STRIPINGV2 |	\
 				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
+				 RBD_FEATURE_OBJECT_MAP |	\
+				 RBD_FEATURE_FAST_DIFF |	\
 				 RBD_FEATURE_DEEP_FLATTEN |	\
 				 RBD_FEATURE_DATA_POOL |	\
 				 RBD_FEATURE_OPERATIONS)
@@ -227,6 +231,8 @@ enum obj_operation_type {
 #define RBD_OBJ_FLAG_DELETION			(1U << 0)
 #define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
 #define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
+#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
+#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
 
 enum rbd_obj_read_state {
 	RBD_OBJ_READ_START = 1,
@@ -261,14 +267,18 @@ enum rbd_obj_read_state {
  */
 enum rbd_obj_write_state {
 	RBD_OBJ_WRITE_START = 1,
+	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
 	RBD_OBJ_WRITE_OBJECT,
 	__RBD_OBJ_WRITE_COPYUP,
 	RBD_OBJ_WRITE_COPYUP,
+	RBD_OBJ_WRITE_POST_OBJECT_MAP,
 };
 
 enum rbd_obj_copyup_state {
 	RBD_OBJ_COPYUP_START = 1,
 	RBD_OBJ_COPYUP_READ_PARENT,
+	__RBD_OBJ_COPYUP_OBJECT_MAPS,
+	RBD_OBJ_COPYUP_OBJECT_MAPS,
 	__RBD_OBJ_COPYUP_WRITE_OBJECT,
 	RBD_OBJ_COPYUP_WRITE_OBJECT,
 };
@@ -419,6 +429,11 @@ struct rbd_device {
 	int			acquire_err;
 	struct completion	releasing_wait;
 
+	spinlock_t		object_map_lock;
+	u8			*object_map;
+	u64			object_map_size;	/* in objects */
+	u64			object_map_flags;
+
 	struct workqueue_struct	*task_wq;
 
 	struct rbd_spec		*parent_spec;
@@ -620,6 +635,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 				u8 *order, u64 *snap_size);
 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 		u64 *snap_features);
+static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
 
 static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
 static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
@@ -1768,6 +1784,466 @@ static void rbd_img_request_destroy(struct kref *kref)
 	kmem_cache_free(rbd_img_request_cache, img_request);
 }
 
+#define BITS_PER_OBJ	2
+#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
+#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
+
+static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
+				   u64 *index, u8 *shift)
+{
+	u32 off;
+
+	rbd_assert(objno < rbd_dev->object_map_size);
+	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
+	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
+}
+
+static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+	u64 index;
+	u8 shift;
+
+	lockdep_assert_held(&rbd_dev->object_map_lock);
+	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
+	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
+}
+
+static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
+{
+	u64 index;
+	u8 shift;
+	u8 *p;
+
+	lockdep_assert_held(&rbd_dev->object_map_lock);
+	rbd_assert(!(val & ~OBJ_MASK));
+
+	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
+	p = &rbd_dev->object_map[index];
+	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
+}
+
+static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+	u8 state;
+
+	spin_lock(&rbd_dev->object_map_lock);
+	state = __rbd_object_map_get(rbd_dev, objno);
+	spin_unlock(&rbd_dev->object_map_lock);
+	return state;
+}
+
+static bool use_object_map(struct rbd_device *rbd_dev)
+{
+	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
+		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
+}
+
+static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
+{
+	u8 state;
+
+	/* fall back to default logic if object map is disabled or invalid */
+	if (!use_object_map(rbd_dev))
+		return true;
+
+	state = rbd_object_map_get(rbd_dev, objno);
+	return state != OBJECT_NONEXISTENT;
+}
+
+static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
+				struct ceph_object_id *oid)
+{
+	if (snap_id == CEPH_NOSNAP)
+		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
+				rbd_dev->spec->image_id);
+	else
+		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
+				rbd_dev->spec->image_id, snap_id);
+}
+
+static int rbd_object_map_lock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	u8 lock_type;
+	char *lock_tag;
+	struct ceph_locker *lockers;
+	u32 num_lockers;
+	bool broke_lock = false;
+	int ret;
+
+	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+again:
+	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
+	if (ret != -EBUSY || broke_lock) {
+		if (ret == -EEXIST)
+			ret = 0; /* already locked by myself */
+		if (ret)
+			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
+		return ret;
+	}
+
+	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
+				 RBD_LOCK_NAME, &lock_type, &lock_tag,
+				 &lockers, &num_lockers);
+	if (ret) {
+		if (ret == -ENOENT)
+			goto again;
+
+		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
+		return ret;
+	}
+
+	kfree(lock_tag);
+	if (num_lockers == 0)
+		goto again;
+
+	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
+		 ENTITY_NAME(lockers[0].id.name));
+
+	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
+				  RBD_LOCK_NAME, lockers[0].id.cookie,
+				  &lockers[0].id.name);
+	ceph_free_lockers(lockers, num_lockers);
+	if (ret) {
+		if (ret == -ENOENT)
+			goto again;
+
+		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
+		return ret;
+	}
+
+	broke_lock = true;
+	goto again;
+}
+
+static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	int ret;
+
+	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+			      "");
+	if (ret && ret != -ENOENT)
+		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
+}
+
+static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
+{
+	u8 struct_v;
+	u32 struct_len;
+	u32 header_len;
+	void *header_end;
+	int ret;
+
+	ceph_decode_32_safe(p, end, header_len, e_inval);
+	header_end = *p + header_len;
+
+	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
+				  &struct_len);
+	if (ret)
+		return ret;
+
+	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
+
+	*p = header_end;
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+static int __rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	struct page **pages;
+	void *p, *end;
+	size_t reply_len;
+	u64 num_objects;
+	u64 object_map_bytes;
+	u64 object_map_size;
+	int num_pages;
+	int ret;
+
+	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
+
+	num_objects = ceph_get_num_objects(&rbd_dev->layout,
+					   rbd_dev->mapping.size);
+	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
+					    BITS_PER_BYTE);
+	num_pages = calc_pages_for(0, object_map_bytes) + 1;
+	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	reply_len = num_pages * PAGE_SIZE;
+	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
+	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
+			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
+			     NULL, 0, pages, &reply_len);
+	if (ret)
+		goto out;
+
+	p = page_address(pages[0]);
+	end = p + min(reply_len, (size_t)PAGE_SIZE);
+	ret = decode_object_map_header(&p, end, &object_map_size);
+	if (ret)
+		goto out;
+
+	if (object_map_size != num_objects) {
+		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
+			 object_map_size, num_objects);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (offset_in_page(p) + object_map_bytes > reply_len) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
+	if (!rbd_dev->object_map) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rbd_dev->object_map_size = object_map_size;
+	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
+				   offset_in_page(p), object_map_bytes);
+
+out:
+	ceph_release_page_vector(pages, num_pages);
+	return ret;
+}
+
+static void rbd_object_map_free(struct rbd_device *rbd_dev)
+{
+	kvfree(rbd_dev->object_map);
+	rbd_dev->object_map = NULL;
+	rbd_dev->object_map_size = 0;
+}
+
+static int rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	ret = __rbd_object_map_load(rbd_dev);
+	if (ret)
+		return ret;
+
+	ret = rbd_dev_v2_get_flags(rbd_dev);
+	if (ret) {
+		rbd_object_map_free(rbd_dev);
+		return ret;
+	}
+
+	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
+		rbd_warn(rbd_dev, "object map is invalid");
+
+	return 0;
+}
+
+static int rbd_object_map_open(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	ret = rbd_object_map_lock(rbd_dev);
+	if (ret)
+		return ret;
+
+	ret = rbd_object_map_load(rbd_dev);
+	if (ret) {
+		rbd_object_map_unlock(rbd_dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void rbd_object_map_close(struct rbd_device *rbd_dev)
+{
+	rbd_object_map_free(rbd_dev);
+	rbd_object_map_unlock(rbd_dev);
+}
+
+/*
+ * This function needs snap_id (or more precisely just something to
+ * distinguish between HEAD and snapshot object maps), new_state and
+ * current_state that were passed to rbd_object_map_update().
+ *
+ * To avoid allocating and stashing a context we piggyback on the OSD
+ * request.  A HEAD update has two ops (assert_locked).  For new_state
+ * and current_state we decode our own object_map_update op, encoded in
+ * rbd_cls_object_map_update().
+ */
+static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
+					struct ceph_osd_request *osd_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_osd_data *osd_data;
+	u64 objno;
+	u8 state, new_state, current_state;
+	bool has_current_state;
+	void *p;
+
+	if (osd_req->r_result)
+		return osd_req->r_result;
+
+	/*
+	 * Nothing to do for a snapshot object map.
+	 */
+	if (osd_req->r_num_ops == 1)
+		return 0;
+
+	/*
+	 * Update in-memory HEAD object map.
+	 */
+	rbd_assert(osd_req->r_num_ops == 2);
+	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
+	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
+
+	p = page_address(osd_data->pages[0]);
+	objno = ceph_decode_64(&p);
+	rbd_assert(objno == obj_req->ex.oe_objno);
+	rbd_assert(ceph_decode_64(&p) == objno + 1);
+	new_state = ceph_decode_8(&p);
+	has_current_state = ceph_decode_8(&p);
+	if (has_current_state)
+		current_state = ceph_decode_8(&p);
+
+	spin_lock(&rbd_dev->object_map_lock);
+	state = __rbd_object_map_get(rbd_dev, objno);
+	if (!has_current_state || current_state == state ||
+	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
+		__rbd_object_map_set(rbd_dev, objno, new_state);
+	spin_unlock(&rbd_dev->object_map_lock);
+
+	return 0;
+}
+
+static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	int result;
+
+	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
+	     osd_req->r_result, obj_req);
+
+	result = rbd_object_map_update_finish(obj_req, osd_req);
+	rbd_obj_handle_request(obj_req, result);
+}
+
+static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
+{
+	u8 state = rbd_object_map_get(rbd_dev, objno);
+
+	if (state == new_state ||
+	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
+	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
+		return false;
+
+	return true;
+}
+
+static int rbd_cls_object_map_update(struct ceph_osd_request *req,
+				     int which, u64 objno, u8 new_state,
+				     const u8 *current_state)
+{
+	struct page **pages;
+	void *p, *start;
+	int ret;
+
+	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
+	if (ret)
+		return ret;
+
+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	p = start = page_address(pages[0]);
+	ceph_encode_64(&p, objno);
+	ceph_encode_64(&p, objno + 1);
+	ceph_encode_8(&p, new_state);
+	if (current_state) {
+		ceph_encode_8(&p, 1);
+		ceph_encode_8(&p, *current_state);
+	} else {
+		ceph_encode_8(&p, 0);
+	}
+
+	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
+					  false, true);
+	return 0;
+}
+
+/*
+ * Return:
+ *   0 - object map update sent
+ *   1 - object map update isn't needed
+ *  <0 - error
+ */
+static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
+				 u8 new_state, const u8 *current_state)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_osd_request *req;
+	int num_ops = 1;
+	int which = 0;
+	int ret;
+
+	if (snap_id == CEPH_NOSNAP) {
+		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
+			return 1;
+
+		num_ops++; /* assert_locked */
+	}
+
+	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
+
+	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
+	req->r_callback = rbd_object_map_callback;
+	req->r_priv = obj_req;
+
+	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
+	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
+	req->r_flags = CEPH_OSD_FLAG_WRITE;
+	ktime_get_real_ts64(&req->r_mtime);
+
+	if (snap_id == CEPH_NOSNAP) {
+		/*
+		 * Protect against possible race conditions during lock
+		 * ownership transitions.
+		 */
+		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
+					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
+		if (ret)
+			return ret;
+	}
+
+	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
+					new_state, current_state);
+	if (ret)
+		return ret;
+
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	if (ret)
+		return ret;
+
+	ceph_osdc_start_request(osdc, req, false);
+	return 0;
+}
+
 static void prune_extents(struct ceph_file_extent *img_extents,
 			  u32 *num_img_extents, u64 overlap)
 {
@@ -1975,6 +2451,7 @@ static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
 	if (ret)
 		return ret;
 
+	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
 		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
 
@@ -2022,6 +2499,7 @@ static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
 	if (rbd_obj_copyup_enabled(obj_req))
 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
 	if (!obj_req->num_img_extents) {
+		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
 		if (rbd_obj_is_entire(obj_req))
 			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
 	}
@@ -2407,6 +2885,20 @@ static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
 	queue_work(rbd_wq, &img_req->work);
 }
 
+static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+
+	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
+		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
+		return true;
+	}
+
+	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
+	     obj_req->ex.oe_objno);
+	return false;
+}
+
 static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
 {
 	struct ceph_osd_request *osd_req;
@@ -2482,10 +2974,17 @@ static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
 	int ret;
 
+again:
 	switch (obj_req->read_state) {
 	case RBD_OBJ_READ_START:
 		rbd_assert(!*result);
 
+		if (!rbd_obj_may_exist(obj_req)) {
+			*result = -ENOENT;
+			obj_req->read_state = RBD_OBJ_READ_OBJECT;
+			goto again;
+		}
+
 		ret = rbd_obj_read_object(obj_req);
 		if (ret) {
 			*result = ret;
@@ -2536,6 +3035,44 @@ static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
 	}
 }
 
+static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+
+	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
+		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
+
+	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
+	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
+		dout("%s %p noop for nonexistent\n", __func__, obj_req);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Return:
+ *   0 - object map update sent
+ *   1 - object map update isn't needed
+ *  <0 - error
+ */
+static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	u8 new_state;
+
+	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+		return 1;
+
+	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
+		new_state = OBJECT_PENDING;
+	else
+		new_state = OBJECT_EXISTS;
+
+	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
+}
+
 static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
 {
 	struct ceph_osd_request *osd_req;
@@ -2706,6 +3243,41 @@ static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
 	return rbd_obj_read_from_parent(obj_req);
 }
 
+static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
+	u8 new_state;
+	u32 i;
+	int ret;
+
+	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
+
+	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+		return;
+
+	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
+		return;
+
+	for (i = 0; i < snapc->num_snaps; i++) {
+		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
+		    i + 1 < snapc->num_snaps)
+			new_state = OBJECT_EXISTS_CLEAN;
+		else
+			new_state = OBJECT_EXISTS;
+
+		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
+					    new_state, NULL);
+		if (ret < 0) {
+			obj_req->pending.result = ret;
+			return;
+		}
+
+		rbd_assert(!ret);
+		obj_req->pending.num_pending++;
+	}
+}
+
 static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
 {
 	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
@@ -2749,6 +3321,7 @@ static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
 
 static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
 {
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
 	int ret;
 
 again:
@@ -2776,6 +3349,25 @@ again:
 			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
 		}
 
+		rbd_obj_copyup_object_maps(obj_req);
+		if (!obj_req->pending.num_pending) {
+			*result = obj_req->pending.result;
+			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
+			goto again;
+		}
+		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
+		return false;
+	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
+		if (!pending_result_dec(&obj_req->pending, result))
+			return false;
+		/* fall through */
+	case RBD_OBJ_COPYUP_OBJECT_MAPS:
+		if (*result) {
+			rbd_warn(rbd_dev, "snap object map update failed: %d",
+				 *result);
+			return true;
+		}
+
 		rbd_obj_copyup_write_object(obj_req);
 		if (!obj_req->pending.num_pending) {
 			*result = obj_req->pending.result;
@@ -2795,6 +3387,27 @@ again:
 	}
 }
 
+/*
+ * Return:
+ *   0 - object map update sent
+ *   1 - object map update isn't needed
+ *  <0 - error
+ */
+static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	u8 current_state = OBJECT_PENDING;
+
+	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+		return 1;
+
+	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
+		return 1;
+
+	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
+				     &current_state);
+}
+
 static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
 {
 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
@@ -2805,6 +3418,24 @@ again:
 	case RBD_OBJ_WRITE_START:
 		rbd_assert(!*result);
 
+		if (rbd_obj_write_is_noop(obj_req))
+			return true;
+
+		ret = rbd_obj_write_pre_object_map(obj_req);
+		if (ret < 0) {
+			*result = ret;
+			return true;
+		}
+		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
+		if (ret > 0)
+			goto again;
+		return false;
+	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
+		if (*result) {
+			rbd_warn(rbd_dev, "pre object map update failed: %d",
+				 *result);
+			return true;
+		}
 		ret = rbd_obj_write_object(obj_req);
 		if (ret) {
 			*result = ret;
@@ -2837,8 +3468,23 @@ again:
 			return false;
 		/* fall through */
 	case RBD_OBJ_WRITE_COPYUP:
-		if (*result)
+		if (*result) {
 			rbd_warn(rbd_dev, "copyup failed: %d", *result);
+			return true;
+		}
+		ret = rbd_obj_write_post_object_map(obj_req);
+		if (ret < 0) {
+			*result = ret;
+			return true;
+		}
+		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
+		if (ret > 0)
+			goto again;
+		return false;
+	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
+		if (*result)
+			rbd_warn(rbd_dev, "post object map update failed: %d",
+				 *result);
 		return true;
 	default:
 		BUG();
@@ -2892,7 +3538,8 @@ static bool need_exclusive_lock(struct rbd_img_request *img_req)
 		return false;
 
 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
-	if (rbd_dev->opts->lock_on_read)
+	if (rbd_dev->opts->lock_on_read ||
+	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
 		return true;
 
 	return rbd_img_is_write(img_req);
@@ -3431,7 +4078,7 @@ static int rbd_try_lock(struct rbd_device *rbd_dev)
 		if (ret)
 			goto out; /* request lock or error */
 
-		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
+		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
 			 ENTITY_NAME(lockers[0].id.name));
 
 		ret = ceph_monc_blacklist_add(&client->monc,
@@ -3458,6 +4105,19 @@ out:
 	return ret;
 }
 
+static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
+		ret = rbd_object_map_open(rbd_dev);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 /*
  * Return:
  *   0 - lock acquired
@@ -3501,6 +4161,17 @@ static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
 	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
 	rbd_assert(list_empty(&rbd_dev->running_list));
 
+	ret = rbd_post_acquire_action(rbd_dev);
+	if (ret) {
+		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
+		/*
+		 * Can't stay in RBD_LOCK_STATE_LOCKED because
+		 * rbd_lock_add_request() would let the request through,
+		 * assuming that e.g. object map is locked and loaded.
+		 */
+		rbd_unlock(rbd_dev);
+	}
+
 out:
 	wake_lock_waiters(rbd_dev, ret);
 	up_write(&rbd_dev->lock_rwsem);
@@ -3574,10 +4245,17 @@ static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
 	return true;
 }
 
+static void rbd_pre_release_action(struct rbd_device *rbd_dev)
+{
+	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
+		rbd_object_map_close(rbd_dev);
+}
+
 static void __rbd_release_lock(struct rbd_device *rbd_dev)
 {
 	rbd_assert(list_empty(&rbd_dev->running_list));
 
+	rbd_pre_release_action(rbd_dev);
 	rbd_unlock(rbd_dev);
 }
 
@@ -4864,6 +5542,8 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
 	init_completion(&rbd_dev->acquire_wait);
 	init_completion(&rbd_dev->releasing_wait);
 
+	spin_lock_init(&rbd_dev->object_map_lock);
+
 	rbd_dev->dev.bus = &rbd_bus_type;
 	rbd_dev->dev.type = &rbd_device_type;
 	rbd_dev->dev.parent = &rbd_root_dev;
@@ -5045,6 +5725,32 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
 						&rbd_dev->header.features);
 }
 
+/*
+ * These are generic image flags, but since they are used only for
+ * object map, store them in rbd_dev->object_map_flags.
+ *
+ * For the same reason, this function is called only on object map
+ * (re)load and not on header refresh.
+ */
+static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
+{
+	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
+	__le64 flags;
+	int ret;
+
+	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+				  &rbd_dev->header_oloc, "get_flags",
+				  &snapid, sizeof(snapid),
+				  &flags, sizeof(flags));
+	if (ret < 0)
+		return ret;
+	if (ret < sizeof(flags))
+		return -EBADMSG;
+
+	rbd_dev->object_map_flags = le64_to_cpu(flags);
+	return 0;
+}
+
 struct parent_image_info {
 	u64		pool_id;
 	const char	*pool_ns;
@@ -6018,6 +6724,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
 	struct rbd_image_header	*header;
 
 	rbd_dev_parent_put(rbd_dev);
+	rbd_object_map_free(rbd_dev);
 	rbd_dev_mapping_clear(rbd_dev);
 
 	/* Free dynamic fields from the header, then zero it out */
@@ -6267,6 +6974,13 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 	if (ret)
 		goto err_out_probe;
 
+	if (rbd_dev->spec->snap_id != CEPH_NOSNAP &&
+	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
+		ret = rbd_object_map_load(rbd_dev);
+		if (ret)
+			goto err_out_probe;
+	}
+
 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
 		ret = rbd_dev_v2_parent_info(rbd_dev);
 		if (ret)
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index 62ff50d3e7a6..ac98ab6ccd3b 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -18,6 +18,7 @@
 /* For format version 2, rbd image 'foo' consists of objects
  *   rbd_id.foo		- id of image
  *   rbd_header.<id>	- image metadata
+ *   rbd_object_map.<id> - optional image object map
  *   rbd_data.<id>.0000000000000000
  *   rbd_data.<id>.0000000000000001
  *   ...		- data
@@ -25,6 +26,7 @@
  */
 
 #define RBD_HEADER_PREFIX      "rbd_header."
+#define RBD_OBJECT_MAP_PREFIX  "rbd_object_map."
 #define RBD_ID_PREFIX          "rbd_id."
 #define RBD_V2_DATA_FORMAT     "%s.%016llx"
 
@@ -39,6 +41,14 @@ enum rbd_notify_op {
 	RBD_NOTIFY_OP_HEADER_UPDATE      = 3,
 };
 
+#define OBJECT_NONEXISTENT	0
+#define OBJECT_EXISTS		1
+#define OBJECT_PENDING		2
+#define OBJECT_EXISTS_CLEAN	3
+
+#define RBD_FLAG_OBJECT_MAP_INVALID	(1ULL << 0)
+#define RBD_FLAG_FAST_DIFF_INVALID	(1ULL << 1)
+
 /*
  * For format version 1, rbd image 'foo' consists of objects
  *   foo.rbd		- image metadata
diff --git a/include/linux/ceph/cls_lock_client.h b/include/linux/ceph/cls_lock_client.h
index bea6c77d2093..17bc7584d1fe 100644
--- a/include/linux/ceph/cls_lock_client.h
+++ b/include/linux/ceph/cls_lock_client.h
@@ -52,4 +52,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
 		       char *lock_name, u8 *type, char **tag,
 		       struct ceph_locker **lockers, u32 *num_lockers);
 
+int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
+			   char *lock_name, u8 type, char *cookie, char *tag);
+
 #endif
diff --git a/include/linux/ceph/striper.h b/include/linux/ceph/striper.h
index cbd0d24b7148..3486636c0e6e 100644
--- a/include/linux/ceph/striper.h
+++ b/include/linux/ceph/striper.h
@@ -66,4 +66,6 @@ int ceph_extent_to_file(struct ceph_file_layout *l,
 			struct ceph_file_extent **file_extents,
 			u32 *num_file_extents);
 
+u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size);
+
 #endif
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index fb59094caf13..17447c19d937 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -6,6 +6,7 @@
 
 #include <linux/ceph/cls_lock_client.h>
 #include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
 
 /**
  * ceph_cls_lock - grab rados lock for object
@@ -378,3 +379,47 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
 	return ret;
 }
 EXPORT_SYMBOL(ceph_cls_lock_info);
+
+int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
+			   char *lock_name, u8 type, char *cookie, char *tag)
+{
+	int assert_op_buf_size;
+	int name_len = strlen(lock_name);
+	int cookie_len = strlen(cookie);
+	int tag_len = strlen(tag);
+	struct page **pages;
+	void *p, *end;
+	int ret;
+
+	assert_op_buf_size = name_len + sizeof(__le32) +
+			     cookie_len + sizeof(__le32) +
+			     tag_len + sizeof(__le32) +
+			     sizeof(u8) + CEPH_ENCODING_START_BLK_LEN;
+	if (assert_op_buf_size > PAGE_SIZE)
+		return -E2BIG;
+
+	ret = osd_req_op_cls_init(req, which, "lock", "assert_locked");
+	if (ret)
+		return ret;
+
+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	p = page_address(pages[0]);
+	end = p + assert_op_buf_size;
+
+	/* encode cls_lock_assert_op struct */
+	ceph_start_encoding(&p, 1, 1,
+			    assert_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+	ceph_encode_string(&p, end, lock_name, name_len);
+	ceph_encode_8(&p, type);
+	ceph_encode_string(&p, end, cookie, cookie_len);
+	ceph_encode_string(&p, end, tag, tag_len);
+	WARN_ON(p != end);
+
+	osd_req_op_cls_request_data_pages(req, which, pages, assert_op_buf_size,
+					  0, false, true);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_cls_assert_locked);
diff --git a/net/ceph/striper.c b/net/ceph/striper.c
index c36462dc86b7..3b3fa75d1189 100644
--- a/net/ceph/striper.c
+++ b/net/ceph/striper.c
@@ -259,3 +259,20 @@ int ceph_extent_to_file(struct ceph_file_layout *l,
 	return 0;
 }
 EXPORT_SYMBOL(ceph_extent_to_file);
+
+u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size)
+{
+	u64 period = (u64)l->stripe_count * l->object_size;
+	u64 num_periods = DIV64_U64_ROUND_UP(size, period);
+	u64 remainder_bytes;
+	u64 remainder_objs = 0;
+
+	div64_u64_rem(size, period, &remainder_bytes);
+	if (remainder_bytes > 0 &&
+	    remainder_bytes < (u64)l->stripe_count * l->stripe_unit)
+		remainder_objs = l->stripe_count -
+			    DIV_ROUND_UP_ULL(remainder_bytes, l->stripe_unit);
+
+	return num_periods * l->stripe_count - remainder_objs;
+}
+EXPORT_SYMBOL(ceph_get_num_objects);
-- 
cgit v1.2.3


From f10ff380fd7dfba4a36d40f8dd00fe17da8a1a10 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Mon, 8 Jul 2019 12:17:48 -0300
Subject: RDMA/rvt: Do not use a kernel header in the ABI

rvt was using ib_sge as part of it's ABI, which is not allowed. Introduce
a new struct with the same layout and use it instead.

Fixes: dabac6e460ce ("IB/hfi1: Move receive work queue struct into uapi directory")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/rdmavt/qp.c | 32 +++++++++++++++++++++++++++-----
 include/uapi/rdma/rvt-abi.h       |  9 +++++++--
 2 files changed, 34 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 11b4d3c1efd4..0b0a241c57ff 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1847,8 +1847,11 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 			wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
 			wqe->wr_id = wr->wr_id;
 			wqe->num_sge = wr->num_sge;
-			for (i = 0; i < wr->num_sge; i++)
-				wqe->sg_list[i] = wr->sg_list[i];
+			for (i = 0; i < wr->num_sge; i++) {
+				wqe->sg_list[i].addr = wr->sg_list[i].addr;
+				wqe->sg_list[i].length = wr->sg_list[i].length;
+				wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
+			}
 			/*
 			 * Make sure queue entry is written
 			 * before the head index.
@@ -2250,8 +2253,11 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 		wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
 		wqe->wr_id = wr->wr_id;
 		wqe->num_sge = wr->num_sge;
-		for (i = 0; i < wr->num_sge; i++)
-			wqe->sg_list[i] = wr->sg_list[i];
+		for (i = 0; i < wr->num_sge; i++) {
+			wqe->sg_list[i].addr = wr->sg_list[i].addr;
+			wqe->sg_list[i].length = wr->sg_list[i].length;
+			wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
+		}
 		/* Make sure queue entry is written before the head index. */
 		smp_store_release(&wq->head, next);
 		spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
@@ -2259,6 +2265,22 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 	return 0;
 }
 
+/*
+ * rvt used the internal kernel struct as part of its ABI, for now make sure
+ * the kernel struct does not change layout. FIXME: rvt should never cast the
+ * user struct to a kernel struct.
+ */
+static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge)
+{
+	BUILD_BUG_ON(offsetof(struct ib_sge, addr) !=
+		     offsetof(struct rvt_wqe_sge, addr));
+	BUILD_BUG_ON(offsetof(struct ib_sge, length) !=
+		     offsetof(struct rvt_wqe_sge, length));
+	BUILD_BUG_ON(offsetof(struct ib_sge, lkey) !=
+		     offsetof(struct rvt_wqe_sge, lkey));
+	return (struct ib_sge *)sge;
+}
+
 /*
  * Validate a RWQE and fill in the SGE state.
  * Return 1 if OK.
@@ -2282,7 +2304,7 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
 			continue;
 		/* Check LKEY */
 		ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
-				  NULL, &wqe->sg_list[i],
+				  NULL, rvt_cast_sge(&wqe->sg_list[i]),
 				  IB_ACCESS_LOCAL_WRITE);
 		if (unlikely(ret <= 0))
 			goto bad_lkey;
diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h
index d2e35d24f1a9..7328293c715c 100644
--- a/include/uapi/rdma/rvt-abi.h
+++ b/include/uapi/rdma/rvt-abi.h
@@ -10,11 +10,16 @@
 
 #include <linux/types.h>
 #include <rdma/ib_user_verbs.h>
-#include <rdma/ib_verbs.h>
 #ifndef RDMA_ATOMIC_UAPI
 #define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name
 #endif
 
+struct rvt_wqe_sge {
+	__aligned_u64 addr;
+	__u32 length;
+	__u32 lkey;
+};
+
 /*
  * This structure is used to contain the head pointer, tail pointer,
  * and completion queue entries as a single memory allocation so
@@ -39,7 +44,7 @@ struct rvt_rwqe {
 	__u64 wr_id;
 	__u8 num_sge;
 	__u8 padding[7];
-	struct ib_sge sg_list[];
+	struct rvt_wqe_sge sg_list[];
 };
 
 /*
-- 
cgit v1.2.3


From 89705e92700170888236555fe91b45e4c1bb0985 Mon Sep 17 00:00:00 2001
From: Danit Goldberg <danitg@mellanox.com>
Date: Fri, 5 Jul 2019 19:21:57 +0300
Subject: IB/mlx5: Report correctly tag matching rendezvous capability

Userspace expects the IB_TM_CAP_RC bit to indicate that the device
supports RC transport tag matching with rendezvous offload. However the
firmware splits this into two capabilities for eager and rendezvous tag
matching.

Only if the FW supports both modes should userspace be told the tag
matching capability is available.

Cc: <stable@vger.kernel.org> # 4.13
Fixes: eb761894351d ("IB/mlx5: Fill XRQ capabilities")
Signed-off-by: Danit Goldberg <danitg@mellanox.com>
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c | 8 ++++++--
 include/rdma/ib_verbs.h           | 4 ++--
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 7581571bd9cd..56d4b1e9dd23 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1046,15 +1046,19 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 	}
 
 	if (MLX5_CAP_GEN(mdev, tag_matching)) {
-		props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
 		props->tm_caps.max_num_tags =
 			(1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
-		props->tm_caps.flags = IB_TM_CAP_RC;
 		props->tm_caps.max_ops =
 			1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
 		props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
 	}
 
+	if (MLX5_CAP_GEN(mdev, tag_matching) &&
+	    MLX5_CAP_GEN(mdev, rndv_offload_rc)) {
+		props->tm_caps.flags = IB_TM_CAP_RNDV_RC;
+		props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
+	}
+
 	if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
 		props->cq_caps.max_cq_moderation_count =
 						MLX5_MAX_CQ_COUNT;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 50806bef9f20..4053be51b7fa 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -307,8 +307,8 @@ struct ib_rss_caps {
 };
 
 enum ib_tm_cap_flags {
-	/*  Support tag matching on RC transport */
-	IB_TM_CAP_RC		    = 1 << 0,
+	/*  Support tag matching with rendezvous offload for RC transport */
+	IB_TM_CAP_RNDV_RC = 1 << 0,
 };
 
 struct ib_tm_caps {
-- 
cgit v1.2.3


From f4915455dcf07c4f237d6160a4b6adb0575d2909 Mon Sep 17 00:00:00 2001
From: Yamin Friedman <yaminf@mellanox.com>
Date: Mon, 8 Jul 2019 13:59:02 +0300
Subject: linux/dim: Implement RDMA adaptive moderation (DIM)

RDMA DIM implements a different algorithm from net DIM and is based on
completions which is how we can implement interrupt moderation in RDMA.

The algorithm optimizes for number of completions and ratio between
completions and events. In order to avoid long latencies, the
implementation performs fast reduction of moderation level when the
traffic changes.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/dim.h |  36 ++++++++++++++++++
 lib/dim/Makefile    |   6 +--
 lib/dim/rdma_dim.c  | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 146 insertions(+), 4 deletions(-)
 create mode 100644 lib/dim/rdma_dim.c

(limited to 'include')

diff --git a/include/linux/dim.h b/include/linux/dim.h
index aa9bdd47a648..aa69730c3b8d 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -82,6 +82,7 @@ struct dim_stats {
  * @prev_stats: Measured rates from previous iteration (for comparison)
  * @start_sample: Sampled data at start of current iteration
  * @work: Work to perform on action required
+ * @priv: A pointer to the struct that points to dim
  * @profile_ix: Current moderation profile
  * @mode: CQ period count mode
  * @tune_state: Algorithm tuning state (see below)
@@ -95,6 +96,7 @@ struct dim {
 	struct dim_sample start_sample;
 	struct dim_sample measuring_sample;
 	struct work_struct work;
+	void *priv;
 	u8 profile_ix;
 	u8 mode;
 	u8 tune_state;
@@ -363,4 +365,38 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode);
  */
 void net_dim(struct dim *dim, struct dim_sample end_sample);
 
+/* RDMA DIM */
+
+/*
+ * RDMA DIM profile:
+ * profile size must be of RDMA_DIM_PARAMS_NUM_PROFILES.
+ */
+#define RDMA_DIM_PARAMS_NUM_PROFILES 9
+#define RDMA_DIM_START_PROFILE 0
+
+static const struct dim_cq_moder
+rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
+	{1,   0, 1,  0},
+	{1,   0, 4,  0},
+	{2,   0, 4,  0},
+	{2,   0, 8,  0},
+	{4,   0, 8,  0},
+	{16,  0, 8,  0},
+	{16,  0, 16, 0},
+	{32,  0, 16, 0},
+	{32,  0, 32, 0},
+};
+
+/**
+ * rdma_dim - Runs the adaptive moderation.
+ * @dim: The moderation struct.
+ * @completions: The number of completions collected in this round.
+ *
+ * Each call to rdma_dim takes the latest amount of completions that
+ * have been collected and counts them as a new event.
+ * Once enough events have been collected the algorithm decides a new
+ * moderation level.
+ */
+void rdma_dim(struct dim *dim, u64 completions);
+
 #endif /* DIM_H */
diff --git a/lib/dim/Makefile b/lib/dim/Makefile
index 160afe288df0..1d6858a108cb 100644
--- a/lib/dim/Makefile
+++ b/lib/dim/Makefile
@@ -2,8 +2,6 @@
 # DIM Dynamic Interrupt Moderation library
 #
 
-obj-$(CONFIG_DIMLIB) = net_dim.o
+obj-$(CONFIG_DIMLIB) += dim.o
 
-net_dim-y = \
-	dim.o		\
-	net_dim.o
+dim-y := dim.o net_dim.o rdma_dim.o
diff --git a/lib/dim/rdma_dim.c b/lib/dim/rdma_dim.c
new file mode 100644
index 000000000000..f7e26c7b4749
--- /dev/null
+++ b/lib/dim/rdma_dim.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <linux/dim.h>
+
+static int rdma_dim_step(struct dim *dim)
+{
+	if (dim->tune_state == DIM_GOING_RIGHT) {
+		if (dim->profile_ix == (RDMA_DIM_PARAMS_NUM_PROFILES - 1))
+			return DIM_ON_EDGE;
+		dim->profile_ix++;
+		dim->steps_right++;
+	}
+	if (dim->tune_state == DIM_GOING_LEFT) {
+		if (dim->profile_ix == 0)
+			return DIM_ON_EDGE;
+		dim->profile_ix--;
+		dim->steps_left++;
+	}
+
+	return DIM_STEPPED;
+}
+
+static int rdma_dim_stats_compare(struct dim_stats *curr,
+				  struct dim_stats *prev)
+{
+	/* first stat */
+	if (!prev->cpms)
+		return DIM_STATS_SAME;
+
+	if (IS_SIGNIFICANT_DIFF(curr->cpms, prev->cpms))
+		return (curr->cpms > prev->cpms) ? DIM_STATS_BETTER :
+						DIM_STATS_WORSE;
+
+	if (IS_SIGNIFICANT_DIFF(curr->cpe_ratio, prev->cpe_ratio))
+		return (curr->cpe_ratio > prev->cpe_ratio) ? DIM_STATS_BETTER :
+						DIM_STATS_WORSE;
+
+	return DIM_STATS_SAME;
+}
+
+static bool rdma_dim_decision(struct dim_stats *curr_stats, struct dim *dim)
+{
+	int prev_ix = dim->profile_ix;
+	u8 state = dim->tune_state;
+	int stats_res;
+	int step_res;
+
+	if (state != DIM_PARKING_ON_TOP && state != DIM_PARKING_TIRED) {
+		stats_res = rdma_dim_stats_compare(curr_stats,
+						   &dim->prev_stats);
+
+		switch (stats_res) {
+		case DIM_STATS_SAME:
+			if (curr_stats->cpe_ratio <= 50 * prev_ix)
+				dim->profile_ix = 0;
+			break;
+		case DIM_STATS_WORSE:
+			dim_turn(dim);
+			/* fall through */
+		case DIM_STATS_BETTER:
+			step_res = rdma_dim_step(dim);
+			if (step_res == DIM_ON_EDGE)
+				dim_turn(dim);
+			break;
+		}
+	}
+
+	dim->prev_stats = *curr_stats;
+
+	return dim->profile_ix != prev_ix;
+}
+
+void rdma_dim(struct dim *dim, u64 completions)
+{
+	struct dim_sample *curr_sample = &dim->measuring_sample;
+	struct dim_stats curr_stats;
+	u32 nevents;
+
+	dim_update_sample_with_comps(curr_sample->event_ctr + 1, 0, 0,
+				     curr_sample->comp_ctr + completions,
+				     &dim->measuring_sample);
+
+	switch (dim->state) {
+	case DIM_MEASURE_IN_PROGRESS:
+		nevents = curr_sample->event_ctr - dim->start_sample.event_ctr;
+		if (nevents < DIM_NEVENTS)
+			break;
+		dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats);
+		if (rdma_dim_decision(&curr_stats, dim)) {
+			dim->state = DIM_APPLY_NEW_PROFILE;
+			schedule_work(&dim->work);
+			break;
+		}
+		/* fall through */
+	case DIM_START_MEASURE:
+		dim->state = DIM_MEASURE_IN_PROGRESS;
+		dim_update_sample_with_comps(curr_sample->event_ctr, 0, 0,
+					     curr_sample->comp_ctr,
+					     &dim->start_sample);
+		break;
+	case DIM_APPLY_NEW_PROFILE:
+		break;
+	}
+}
+EXPORT_SYMBOL(rdma_dim);
-- 
cgit v1.2.3


From da6629793aa6944db6c8a908ca1a52d87f1489aa Mon Sep 17 00:00:00 2001
From: Yamin Friedman <yaminf@mellanox.com>
Date: Mon, 8 Jul 2019 13:59:03 +0300
Subject: RDMA/core: Provide RDMA DIM support for ULPs

Added the interface in the infiniband driver that applies the rdma_dim
adaptive moderation. There is now a special function for allocating an
ib_cq that uses rdma_dim.

Performance improvement (ConnectX-5 100GbE, x86) running FIO benchmark over
NVMf between two equal end-hosts with 56 cores across a Mellanox switch
using null_blk device:

READS without DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 3.8GiB/s | 7.7M | 1401  usec               | 2442  usec
4k       | 7.0GiB/s | 1.8M | 4817  usec               | 6587  usec
64k      | 10.7GiB/s| 175k | 9896  usec               | 10028 usec

IO WRITES without DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 3.6GiB/s | 7.5M | 1434  usec               | 2474  usec
4k       | 6.3GiB/s | 1.6M | 938   usec               | 1221  usec
64k      | 10.7GiB/s| 175k | 8979  usec               | 12780 usec

IO READS with DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 4GiB/s   | 8.2M | 816    usec              | 889   usec
4k       | 10.1GiB/s| 2.65M| 3359   usec              | 5080  usec
64k      | 10.7GiB/s| 175k | 9896   usec              | 10028 usec

IO WRITES with DIM:
blk size | BW       | IOPS  | 99th percentile latency | 99.99th latency
512B     | 3.9GiB/s | 8.1M  | 799   usec              | 922   usec
4k       | 9.6GiB/s | 2.5M  | 717   usec              | 1004  usec
64k      | 10.7GiB/s| 176k  | 8586  usec              | 12256 usec

The rdma_dim algorithm was designed to measure the effectiveness of
moderation on the flow in a general way and thus should be appropriate
for all RDMA storage protocols.

rdma_dim is configured to be the default option based on performance
improvement seen after extensive tests.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cq.c | 45 ++++++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h      |  4 ++++
 2 files changed, 49 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index 00d70f166209..ffd6e24109d5 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -18,6 +18,40 @@
 #define IB_POLL_FLAGS \
 	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
+static void ib_cq_rdma_dim_work(struct work_struct *w)
+{
+	struct dim *dim = container_of(w, struct dim, work);
+	struct ib_cq *cq = dim->priv;
+
+	u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+	u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+
+	dim->state = DIM_START_MEASURE;
+
+	cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static void rdma_dim_init(struct ib_cq *cq)
+{
+	struct dim *dim;
+
+	if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
+	    cq->poll_ctx == IB_POLL_DIRECT)
+		return;
+
+	dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
+	if (!dim)
+		return;
+
+	dim->state = DIM_START_MEASURE;
+	dim->tune_state = DIM_GOING_RIGHT;
+	dim->profile_ix = RDMA_DIM_START_PROFILE;
+	dim->priv = cq;
+	cq->dim = dim;
+
+	INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
+}
+
 static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
 			   int batch)
 {
@@ -78,6 +112,7 @@ static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
 static int ib_poll_handler(struct irq_poll *iop, int budget)
 {
 	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+	struct dim *dim = cq->dim;
 	int completed;
 
 	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
@@ -87,6 +122,9 @@ static int ib_poll_handler(struct irq_poll *iop, int budget)
 			irq_poll_sched(&cq->iop);
 	}
 
+	if (dim)
+		rdma_dim(dim, completed);
+
 	return completed;
 }
 
@@ -105,6 +143,8 @@ static void ib_cq_poll_work(struct work_struct *work)
 	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
 	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
 		queue_work(cq->comp_wq, &cq->work);
+	else if (cq->dim)
+		rdma_dim(cq->dim, completed);
 }
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
@@ -161,6 +201,8 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 
 	rdma_restrack_kadd(&cq->res);
 
+	rdma_dim_init(cq);
+
 	switch (cq->poll_ctx) {
 	case IB_POLL_DIRECT:
 		cq->comp_handler = ib_cq_completion_direct;
@@ -223,6 +265,9 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 
 	rdma_restrack_del(&cq->res);
 	cq->device->ops.destroy_cq(cq, udata);
+	if (cq->dim)
+		cancel_work_sync(&cq->dim->work);
+	kfree(cq->dim);
 	kfree(cq->wc);
 	kfree(cq);
 }
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 4053be51b7fa..c5f8a9f17063 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -61,6 +61,7 @@
 #include <linux/cgroup_rdma.h>
 #include <linux/irqflags.h>
 #include <linux/preempt.h>
+#include <linux/dim.h>
 #include <uapi/rdma/ib_user_verbs.h>
 #include <rdma/rdma_counter.h>
 #include <rdma/restrack.h>
@@ -1509,6 +1510,7 @@ struct ib_cq {
 		struct work_struct	work;
 	};
 	struct workqueue_struct *comp_wq;
+	struct dim *dim;
 	/*
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
@@ -2576,6 +2578,8 @@ struct ib_device {
 	u16                          is_switch:1;
 	/* Indicates kernel verbs support, should not be used in drivers */
 	u16                          kverbs_provider:1;
+	/* CQ adaptive moderation (RDMA DIM) */
+	u16                          use_cq_dim:1;
 	u8                           node_type;
 	u8                           phys_port_cnt;
 	struct ib_device_attr        attrs;
-- 
cgit v1.2.3


From f8fc8cd9c612c31f92b19b72f619fa043ec76e5e Mon Sep 17 00:00:00 2001
From: Yamin Friedman <yaminf@mellanox.com>
Date: Mon, 8 Jul 2019 13:59:04 +0300
Subject: RDMA/nldev: Added configuration of RDMA dynamic interrupt moderation
 to netlink

Added parameter in ib_device for enabling dynamic interrupt moderation so
that it can be configured in userspace using rdma tool.

In order to set adaptive-moderation for an ib device the command is:
rdma dev set [DEV] adaptive-moderation [on|off]
Please set on/off.

rdma dev show
0: mlx5_0: node_type ca fw 16.26.0055 node_guid 248a:0703:00a5:29d0
sys_image_guid 248a:0703:00a5:29d0 adaptive-moderation on

rdma resource show cq
dev mlx5_0 cqn 0 cqe 1023 users 4 poll-ctx UNBOUND_WORKQUEUE
adaptive-moderation off comm [ib_core]

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/Kconfig          |  1 +
 drivers/infiniband/core/core_priv.h |  1 +
 drivers/infiniband/core/device.c    |  9 +++++++++
 drivers/infiniband/core/nldev.c     | 14 ++++++++++++++
 include/uapi/rdma/rdma_netlink.h    |  5 +++++
 5 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index f277cb7aea29..85e103b147cc 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -7,6 +7,7 @@ menuconfig INFINIBAND
 	depends on m || IPV6 != m
 	depends on !ALPHA
 	select IRQ_POLL
+	select DIMLIB
 	---help---
 	  Core support for InfiniBand (IB).  Make sure to also select
 	  any protocols you wish to use as well as drivers for your
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index a953c2fa2e78..888d89ce81df 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -60,6 +60,7 @@ extern bool ib_devices_shared_netns;
 int ib_device_register_sysfs(struct ib_device *device);
 void ib_device_unregister_sysfs(struct ib_device *device);
 int ib_device_rename(struct ib_device *ibdev, const char *name);
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim);
 
 typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
 	      struct net_device *idev, void *cookie);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index bdf61499e6d5..7f4affe8a10d 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -448,6 +448,15 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
 	return 0;
 }
 
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
+{
+	if (use_dim > 1)
+		return -EINVAL;
+	ibdev->use_cq_dim = use_dim;
+
+	return 0;
+}
+
 static int alloc_name(struct ib_device *ibdev, const char *name)
 {
 	struct ib_device *device;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index a4431ed566b6..d9f2a30e6467 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -52,6 +52,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
 	[RDMA_NLDEV_ATTR_CHARDEV_TYPE]		= { .type = NLA_NUL_STRING,
 					.len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE },
+	[RDMA_NLDEV_ATTR_DEV_DIM]               = { .type = NLA_U8 },
 	[RDMA_NLDEV_ATTR_DEV_INDEX]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_DEV_NAME]		= { .type = NLA_NUL_STRING,
 					.len = IB_DEVICE_NAME_MAX },
@@ -252,6 +253,8 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
 		return -EMSGSIZE;
 	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type))
 		return -EMSGSIZE;
+	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim))
+		return -EMSGSIZE;
 
 	/*
 	 * Link type is determined on first port and mlx4 device
@@ -552,6 +555,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	    nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
 		goto err;
 
+	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL)))
+		goto err;
+
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
 		goto err;
 	if (!rdma_is_kernel_res(res) &&
@@ -870,6 +876,14 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto put_done;
 	}
 
+	if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) {
+		u8 use_dim;
+
+		use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]);
+		err = ib_device_set_dim(device,  use_dim);
+		goto done;
+	}
+
 done:
 	ib_device_put(device);
 put_done:
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index ce6fd66e7aa3..8e277783fa96 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -520,6 +520,11 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,	/* string */
 	RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,	/* u64 */
 
+	/*
+	 * CQ adaptive moderatio (DIM)
+	 */
+	RDMA_NLDEV_ATTR_DEV_DIM,                /* u8 */
+
 	/*
 	 * Always the end
 	 */
-- 
cgit v1.2.3


From 67d874c3b2c69d65274fa5ce44ba939788d5729d Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 8 Jul 2019 16:27:52 +0530
Subject: cpufreq: Register notifiers with the PM QoS framework

Register notifiers for min/max frequency constraints with the PM QoS
framework. The constraints are also taken into consideration in
cpufreq_set_policy().

This also relocates cpufreq_policy_put_kobj() as it is required to be
called from cpufreq_policy_alloc() now.

refresh_frequency_limits() is updated to avoid calling
cpufreq_set_policy() for inactive policies and handle_update() is
updated to have proper locking in place.

No constraints are added until now though.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 135 +++++++++++++++++++++++++++++++++++-----------
 include/linux/cpufreq.h   |   3 ++
 2 files changed, 108 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index ceb57af15ca0..b96ef6db1bfe 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -26,6 +26,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/pm_qos.h>
 #include <linux/slab.h>
 #include <linux/suspend.h>
 #include <linux/syscore_ops.h>
@@ -999,7 +1000,7 @@ static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu)
 {
 	struct device *dev = get_cpu_device(cpu);
 
-	if (!dev)
+	if (unlikely(!dev))
 		return;
 
 	if (cpumask_test_and_set_cpu(cpu, policy->real_cpus))
@@ -1117,14 +1118,16 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
 
 static void refresh_frequency_limits(struct cpufreq_policy *policy)
 {
-	struct cpufreq_policy new_policy = *policy;
-
-	pr_debug("updating policy for CPU %u\n", policy->cpu);
+	struct cpufreq_policy new_policy;
 
-	new_policy.min = policy->user_policy.min;
-	new_policy.max = policy->user_policy.max;
+	if (!policy_is_inactive(policy)) {
+		new_policy = *policy;
+		pr_debug("updating policy for CPU %u\n", policy->cpu);
 
-	cpufreq_set_policy(policy, &new_policy);
+		new_policy.min = policy->user_policy.min;
+		new_policy.max = policy->user_policy.max;
+		cpufreq_set_policy(policy, &new_policy);
+	}
 }
 
 static void handle_update(struct work_struct *work)
@@ -1133,14 +1136,60 @@ static void handle_update(struct work_struct *work)
 		container_of(work, struct cpufreq_policy, update);
 
 	pr_debug("handle_update for cpu %u called\n", policy->cpu);
+	down_write(&policy->rwsem);
 	refresh_frequency_limits(policy);
+	up_write(&policy->rwsem);
+}
+
+static int cpufreq_notifier_min(struct notifier_block *nb, unsigned long freq,
+				void *data)
+{
+	struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_min);
+
+	schedule_work(&policy->update);
+	return 0;
+}
+
+static int cpufreq_notifier_max(struct notifier_block *nb, unsigned long freq,
+				void *data)
+{
+	struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_max);
+
+	schedule_work(&policy->update);
+	return 0;
+}
+
+static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
+{
+	struct kobject *kobj;
+	struct completion *cmp;
+
+	down_write(&policy->rwsem);
+	cpufreq_stats_free_table(policy);
+	kobj = &policy->kobj;
+	cmp = &policy->kobj_unregister;
+	up_write(&policy->rwsem);
+	kobject_put(kobj);
+
+	/*
+	 * We need to make sure that the underlying kobj is
+	 * actually not referenced anymore by anybody before we
+	 * proceed with unloading.
+	 */
+	pr_debug("waiting for dropping of refcount\n");
+	wait_for_completion(cmp);
+	pr_debug("wait complete\n");
 }
 
 static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 {
 	struct cpufreq_policy *policy;
+	struct device *dev = get_cpu_device(cpu);
 	int ret;
 
+	if (!dev)
+		return NULL;
+
 	policy = kzalloc(sizeof(*policy), GFP_KERNEL);
 	if (!policy)
 		return NULL;
@@ -1157,7 +1206,7 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
 				   cpufreq_global_kobject, "policy%u", cpu);
 	if (ret) {
-		pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret);
+		dev_err(dev, "%s: failed to init policy->kobj: %d\n", __func__, ret);
 		/*
 		 * The entire policy object will be freed below, but the extra
 		 * memory allocated for the kobject name needs to be freed by
@@ -1167,6 +1216,25 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 		goto err_free_real_cpus;
 	}
 
+	policy->nb_min.notifier_call = cpufreq_notifier_min;
+	policy->nb_max.notifier_call = cpufreq_notifier_max;
+
+	ret = dev_pm_qos_add_notifier(dev, &policy->nb_min,
+				      DEV_PM_QOS_MIN_FREQUENCY);
+	if (ret) {
+		dev_err(dev, "Failed to register MIN QoS notifier: %d (%*pbl)\n",
+			ret, cpumask_pr_args(policy->cpus));
+		goto err_kobj_remove;
+	}
+
+	ret = dev_pm_qos_add_notifier(dev, &policy->nb_max,
+				      DEV_PM_QOS_MAX_FREQUENCY);
+	if (ret) {
+		dev_err(dev, "Failed to register MAX QoS notifier: %d (%*pbl)\n",
+			ret, cpumask_pr_args(policy->cpus));
+		goto err_min_qos_notifier;
+	}
+
 	INIT_LIST_HEAD(&policy->policy_list);
 	init_rwsem(&policy->rwsem);
 	spin_lock_init(&policy->transition_lock);
@@ -1177,6 +1245,11 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
 	policy->cpu = cpu;
 	return policy;
 
+err_min_qos_notifier:
+	dev_pm_qos_remove_notifier(dev, &policy->nb_min,
+				   DEV_PM_QOS_MIN_FREQUENCY);
+err_kobj_remove:
+	cpufreq_policy_put_kobj(policy);
 err_free_real_cpus:
 	free_cpumask_var(policy->real_cpus);
 err_free_rcpumask:
@@ -1189,30 +1262,9 @@ err_free_policy:
 	return NULL;
 }
 
-static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
-{
-	struct kobject *kobj;
-	struct completion *cmp;
-
-	down_write(&policy->rwsem);
-	cpufreq_stats_free_table(policy);
-	kobj = &policy->kobj;
-	cmp = &policy->kobj_unregister;
-	up_write(&policy->rwsem);
-	kobject_put(kobj);
-
-	/*
-	 * We need to make sure that the underlying kobj is
-	 * actually not referenced anymore by anybody before we
-	 * proceed with unloading.
-	 */
-	pr_debug("waiting for dropping of refcount\n");
-	wait_for_completion(cmp);
-	pr_debug("wait complete\n");
-}
-
 static void cpufreq_policy_free(struct cpufreq_policy *policy)
 {
+	struct device *dev = get_cpu_device(policy->cpu);
 	unsigned long flags;
 	int cpu;
 
@@ -1224,6 +1276,11 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy)
 		per_cpu(cpufreq_cpu_data, cpu) = NULL;
 	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
+	dev_pm_qos_remove_notifier(dev, &policy->nb_max,
+				   DEV_PM_QOS_MAX_FREQUENCY);
+	dev_pm_qos_remove_notifier(dev, &policy->nb_min,
+				   DEV_PM_QOS_MIN_FREQUENCY);
+
 	cpufreq_policy_put_kobj(policy);
 	free_cpumask_var(policy->real_cpus);
 	free_cpumask_var(policy->related_cpus);
@@ -2283,6 +2340,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
 		       struct cpufreq_policy *new_policy)
 {
 	struct cpufreq_governor *old_gov;
+	struct device *cpu_dev = get_cpu_device(policy->cpu);
+	unsigned long min, max;
 	int ret;
 
 	pr_debug("setting new policy for CPU %u: %u - %u kHz\n",
@@ -2297,11 +2356,27 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
 	if (new_policy->min > new_policy->max)
 		return -EINVAL;
 
+	/*
+	 * PM QoS framework collects all the requests from users and provide us
+	 * the final aggregated value here.
+	 */
+	min = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MIN_FREQUENCY);
+	max = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MAX_FREQUENCY);
+
+	if (min > new_policy->min)
+		new_policy->min = min;
+	if (max < new_policy->max)
+		new_policy->max = max;
+
 	/* verify the cpu speed can be set within this limit */
 	ret = cpufreq_driver->verify(new_policy);
 	if (ret)
 		return ret;
 
+	/*
+	 * The notifier-chain shall be removed once all the users of
+	 * CPUFREQ_ADJUST are moved to use the QoS framework.
+	 */
 	/* adjust if necessary - all reasons */
 	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
 			CPUFREQ_ADJUST, new_policy);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index a1467aa7f58b..95425941f46d 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -147,6 +147,9 @@ struct cpufreq_policy {
 
 	/* Pointer to the cooling device if used for thermal mitigation */
 	struct thermal_cooling_device *cdev;
+
+	struct notifier_block nb_min;
+	struct notifier_block nb_max;
 };
 
 struct cpufreq_freqs {
-- 
cgit v1.2.3


From c57b25bdf7cd374af106992356536bf5df7c255b Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 4 Jul 2019 13:06:22 +0530
Subject: cpufreq: intel_pstate: Reuse refresh_frequency_limits()

The implementation of intel_pstate_update_max_freq() is quite similar to
refresh_frequency_limits(), lets reuse it.

Finding minimum of policy->user_policy.max and policy->cpuinfo.max_freq
in intel_pstate_update_max_freq() is redundant as cpufreq_set_policy()
will call the ->verify() callback of intel-pstate driver, which will do
this comparison anyway and so dropping it from
intel_pstate_update_max_freq() doesn't harm.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c      | 3 ++-
 drivers/cpufreq/intel_pstate.c | 7 +------
 include/linux/cpufreq.h        | 1 +
 3 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index b96ef6db1bfe..79bac52919a5 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1116,7 +1116,7 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
 	return ret;
 }
 
-static void refresh_frequency_limits(struct cpufreq_policy *policy)
+void refresh_frequency_limits(struct cpufreq_policy *policy)
 {
 	struct cpufreq_policy new_policy;
 
@@ -1129,6 +1129,7 @@ static void refresh_frequency_limits(struct cpufreq_policy *policy)
 		cpufreq_set_policy(policy, &new_policy);
 	}
 }
+EXPORT_SYMBOL(refresh_frequency_limits);
 
 static void handle_update(struct work_struct *work)
 {
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index f2ff5de988c1..cc27d4c59dca 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -898,7 +898,6 @@ static void intel_pstate_update_policies(void)
 static void intel_pstate_update_max_freq(unsigned int cpu)
 {
 	struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
-	struct cpufreq_policy new_policy;
 	struct cpudata *cpudata;
 
 	if (!policy)
@@ -908,11 +907,7 @@ static void intel_pstate_update_max_freq(unsigned int cpu)
 	policy->cpuinfo.max_freq = global.turbo_disabled_mf ?
 			cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
 
-	memcpy(&new_policy, policy, sizeof(*policy));
-	new_policy.max = min(policy->user_policy.max, policy->cpuinfo.max_freq);
-	new_policy.min = min(policy->user_policy.min, new_policy.max);
-
-	cpufreq_set_policy(policy, &new_policy);
+	refresh_frequency_limits(policy);
 
 	cpufreq_cpu_release(policy);
 }
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 95425941f46d..1fa37b675a80 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -207,6 +207,7 @@ void cpufreq_cpu_release(struct cpufreq_policy *policy);
 int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
 int cpufreq_set_policy(struct cpufreq_policy *policy,
 		       struct cpufreq_policy *new_policy);
+void refresh_frequency_limits(struct cpufreq_policy *policy);
 void cpufreq_update_policy(unsigned int cpu);
 void cpufreq_update_limits(unsigned int cpu);
 bool have_governor_per_policy(void);
-- 
cgit v1.2.3


From 18c49926c4bf4915e5194d1de3299c0537229f9f Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 5 Jul 2019 16:21:24 +0530
Subject: cpufreq: Add QoS requests for userspace constraints

This implements QoS requests to manage userspace configuration of min
and max frequency.

Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: syzbot <syzbot+de771ae9390dffed7266@syzkaller.appspotmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 98 ++++++++++++++++++++++++++---------------------
 include/linux/cpufreq.h   |  8 +---
 2 files changed, 57 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 79bac52919a5..99aa7d20b458 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -718,23 +718,15 @@ static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf)
 static ssize_t store_##file_name					\
 (struct cpufreq_policy *policy, const char *buf, size_t count)		\
 {									\
-	int ret, temp;							\
-	struct cpufreq_policy new_policy;				\
+	unsigned long val;						\
+	int ret;							\
 									\
-	memcpy(&new_policy, policy, sizeof(*policy));			\
-	new_policy.min = policy->user_policy.min;			\
-	new_policy.max = policy->user_policy.max;			\
-									\
-	ret = sscanf(buf, "%u", &new_policy.object);			\
+	ret = sscanf(buf, "%lu", &val);					\
 	if (ret != 1)							\
 		return -EINVAL;						\
 									\
-	temp = new_policy.object;					\
-	ret = cpufreq_set_policy(policy, &new_policy);		\
-	if (!ret)							\
-		policy->user_policy.object = temp;			\
-									\
-	return ret ? ret : count;					\
+	ret = dev_pm_qos_update_request(policy->object##_freq_req, val);\
+	return ret >= 0 ? count : ret;					\
 }
 
 store_one(scaling_min_freq, min);
@@ -1124,8 +1116,6 @@ void refresh_frequency_limits(struct cpufreq_policy *policy)
 		new_policy = *policy;
 		pr_debug("updating policy for CPU %u\n", policy->cpu);
 
-		new_policy.min = policy->user_policy.min;
-		new_policy.max = policy->user_policy.max;
 		cpufreq_set_policy(policy, &new_policy);
 	}
 }
@@ -1281,6 +1271,9 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy)
 				   DEV_PM_QOS_MAX_FREQUENCY);
 	dev_pm_qos_remove_notifier(dev, &policy->nb_min,
 				   DEV_PM_QOS_MIN_FREQUENCY);
+	dev_pm_qos_remove_request(policy->max_freq_req);
+	dev_pm_qos_remove_request(policy->min_freq_req);
+	kfree(policy->min_freq_req);
 
 	cpufreq_policy_put_kobj(policy);
 	free_cpumask_var(policy->real_cpus);
@@ -1359,16 +1352,50 @@ static int cpufreq_online(unsigned int cpu)
 	cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);
 
 	if (new_policy) {
-		policy->user_policy.min = policy->min;
-		policy->user_policy.max = policy->max;
+		struct device *dev = get_cpu_device(cpu);
 
 		for_each_cpu(j, policy->related_cpus) {
 			per_cpu(cpufreq_cpu_data, j) = policy;
 			add_cpu_dev_symlink(policy, j);
 		}
-	} else {
-		policy->min = policy->user_policy.min;
-		policy->max = policy->user_policy.max;
+
+		policy->min_freq_req = kzalloc(2 * sizeof(*policy->min_freq_req),
+					       GFP_KERNEL);
+		if (!policy->min_freq_req)
+			goto out_destroy_policy;
+
+		ret = dev_pm_qos_add_request(dev, policy->min_freq_req,
+					     DEV_PM_QOS_MIN_FREQUENCY,
+					     policy->min);
+		if (ret < 0) {
+			/*
+			 * So we don't call dev_pm_qos_remove_request() for an
+			 * uninitialized request.
+			 */
+			kfree(policy->min_freq_req);
+			policy->min_freq_req = NULL;
+
+			dev_err(dev, "Failed to add min-freq constraint (%d)\n",
+				ret);
+			goto out_destroy_policy;
+		}
+
+		/*
+		 * This must be initialized right here to avoid calling
+		 * dev_pm_qos_remove_request() on uninitialized request in case
+		 * of errors.
+		 */
+		policy->max_freq_req = policy->min_freq_req + 1;
+
+		ret = dev_pm_qos_add_request(dev, policy->max_freq_req,
+					     DEV_PM_QOS_MAX_FREQUENCY,
+					     policy->max);
+		if (ret < 0) {
+			policy->max_freq_req = NULL;
+			dev_err(dev, "Failed to add max-freq constraint (%d)\n",
+				ret);
+			goto out_destroy_policy;
+		}
 	}
 
 	if (cpufreq_driver->get && has_target()) {
@@ -2342,7 +2369,6 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
 {
 	struct cpufreq_governor *old_gov;
 	struct device *cpu_dev = get_cpu_device(policy->cpu);
-	unsigned long min, max;
 	int ret;
 
 	pr_debug("setting new policy for CPU %u: %u - %u kHz\n",
@@ -2350,24 +2376,12 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
 
 	memcpy(&new_policy->cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));
 
-	/*
-	* This check works well when we store new min/max freq attributes,
-	* because new_policy is a copy of policy with one field updated.
-	*/
-	if (new_policy->min > new_policy->max)
-		return -EINVAL;
-
 	/*
 	 * PM QoS framework collects all the requests from users and provide us
 	 * the final aggregated value here.
 	 */
-	min = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MIN_FREQUENCY);
-	max = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MAX_FREQUENCY);
-
-	if (min > new_policy->min)
-		new_policy->min = min;
-	if (max < new_policy->max)
-		new_policy->max = max;
+	new_policy->min = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MIN_FREQUENCY);
+	new_policy->max = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MAX_FREQUENCY);
 
 	/* verify the cpu speed can be set within this limit */
 	ret = cpufreq_driver->verify(new_policy);
@@ -2456,10 +2470,9 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
  * @cpu: CPU to re-evaluate the policy for.
  *
  * Update the current frequency for the cpufreq policy of @cpu and use
- * cpufreq_set_policy() to re-apply the min and max limits saved in the
- * user_policy sub-structure of that policy, which triggers the evaluation
- * of policy notifiers and the cpufreq driver's ->verify() callback for the
- * policy in question, among other things.
+ * cpufreq_set_policy() to re-apply the min and max limits, which triggers the
+ * evaluation of policy notifiers and the cpufreq driver's ->verify() callback
+ * for the policy in question, among other things.
  */
 void cpufreq_update_policy(unsigned int cpu)
 {
@@ -2519,10 +2532,9 @@ static int cpufreq_boost_set_sw(int state)
 			break;
 		}
 
-		down_write(&policy->rwsem);
-		policy->user_policy.max = policy->max;
-		cpufreq_governor_limits(policy);
-		up_write(&policy->rwsem);
+		ret = dev_pm_qos_update_request(policy->max_freq_req, policy->max);
+		if (ret)
+			break;
 	}
 
 	return ret;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 1fa37b675a80..afc683021ac5 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -50,11 +50,6 @@ struct cpufreq_cpuinfo {
 	unsigned int		transition_latency;
 };
 
-struct cpufreq_user_policy {
-	unsigned int		min;    /* in kHz */
-	unsigned int		max;    /* in kHz */
-};
-
 struct cpufreq_policy {
 	/* CPUs sharing clock, require sw coordination */
 	cpumask_var_t		cpus;	/* Online CPUs only */
@@ -84,7 +79,8 @@ struct cpufreq_policy {
 	struct work_struct	update; /* if update_policy() needs to be
 					 * called, but you're in IRQ context */
 
-	struct cpufreq_user_policy user_policy;
+	struct dev_pm_qos_request *min_freq_req;
+	struct dev_pm_qos_request *max_freq_req;
 	struct cpufreq_frequency_table	*freq_table;
 	enum cpufreq_table_sorting freq_table_sorted;
 
-- 
cgit v1.2.3


From 87b512def792579641499d9bef1d640994ea9c18 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 27 Jun 2019 20:50:46 -0500
Subject: objtool: Add support for C jump tables

Objtool doesn't know how to read C jump tables, so it has to whitelist
functions which use them, causing missing ORC unwinder data for such
functions, e.g. ___bpf_prog_run().

C jump tables are very similar to GCC switch jump tables, which objtool
already knows how to read.  So adding support for C jump tables is easy.
It just needs to be able to find the tables and distinguish them from
other data.

To allow the jump tables to be found, create an __annotate_jump_table
macro which can be used to annotate them.

The annotation is done by placing the jump table in an
.rodata..c_jump_table section.  The '.rodata' prefix ensures that the data
will be placed in the rodata section by the vmlinux linker script.  The
double periods are part of an existing convention which distinguishes
kernel sections from GCC sections.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Kairui Song <kasong@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lkml.kernel.org/r/0ba2ca30442b16b97165992381ce643dc27b3d1a.1561685471.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler.h |  5 +++++
 tools/objtool/check.c    | 27 ++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 8aaf7cd026b0..f0fd5636fddb 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -116,9 +116,14 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	".pushsection .discard.unreachable\n\t"				\
 	".long 999b - .\n\t"						\
 	".popsection\n\t"
+
+/* Annotate a C jump table to allow objtool to follow the code flow */
+#define __annotate_jump_table __section(".rodata..c_jump_table")
+
 #else
 #define annotate_reachable()
 #define annotate_unreachable()
+#define __annotate_jump_table
 #endif
 
 #ifndef ASM_UNREACHABLE
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 172f99195726..27818a93f0b1 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -18,6 +18,8 @@
 
 #define FAKE_JUMP_OFFSET -1
 
+#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table"
+
 struct alternative {
 	struct list_head list;
 	struct instruction *insn;
@@ -1035,9 +1037,15 @@ static struct rela *find_switch_table(struct objtool_file *file,
 
 		/*
 		 * Make sure the .rodata address isn't associated with a
-		 * symbol.  gcc jump tables are anonymous data.
+		 * symbol.  GCC jump tables are anonymous data.
+		 *
+		 * Also support C jump tables which are in the same format as
+		 * switch jump tables.  For objtool to recognize them, they
+		 * need to be placed in the C_JUMP_TABLE_SECTION section.  They
+		 * have symbols associated with them.
 		 */
-		if (find_symbol_containing(rodata_sec, table_offset))
+		if (find_symbol_containing(rodata_sec, table_offset) &&
+		    strcmp(rodata_sec->name, C_JUMP_TABLE_SECTION))
 			continue;
 
 		rodata_rela = find_rela_by_dest(rodata_sec, table_offset);
@@ -1277,13 +1285,18 @@ static void mark_rodata(struct objtool_file *file)
 	bool found = false;
 
 	/*
-	 * This searches for the .rodata section or multiple .rodata.func_name
-	 * sections if -fdata-sections is being used. The .str.1.1 and .str.1.8
-	 * rodata sections are ignored as they don't contain jump tables.
+	 * Search for the following rodata sections, each of which can
+	 * potentially contain jump tables:
+	 *
+	 * - .rodata: can contain GCC switch tables
+	 * - .rodata.<func>: same, if -fdata-sections is being used
+	 * - .rodata..c_jump_table: contains C annotated jump tables
+	 *
+	 * .rodata.str1.* sections are ignored; they don't contain jump tables.
 	 */
 	for_each_sec(file, sec) {
-		if (!strncmp(sec->name, ".rodata", 7) &&
-		    !strstr(sec->name, ".str1.")) {
+		if ((!strncmp(sec->name, ".rodata", 7) && !strstr(sec->name, ".str1.")) ||
+		    !strcmp(sec->name, C_JUMP_TABLE_SECTION)) {
 			sec->rodata = true;
 			found = true;
 		}
-- 
cgit v1.2.3


From f6b6aefee70aa5261deec7feab80c249bf58397f Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 30 May 2019 08:05:58 -0500
Subject: PCI: Fix typos and whitespace errors

Fix typos in drivers/pci.  Comment and whitespace changes only.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
---
 drivers/pci/ats.c                            |  2 +-
 drivers/pci/controller/dwc/pcie-armada8k.c   |  2 +-
 drivers/pci/controller/dwc/pcie-kirin.c      |  2 +-
 drivers/pci/controller/pci-aardvark.c        |  2 +-
 drivers/pci/controller/pcie-iproc-platform.c |  2 +-
 drivers/pci/controller/pcie-iproc.c          |  2 +-
 drivers/pci/controller/vmd.c                 |  2 +-
 drivers/pci/mmap.c                           |  2 +-
 drivers/pci/msi.c                            | 43 ++++++++++++++--------------
 drivers/pci/p2pdma.c                         |  6 ++--
 drivers/pci/pci-bridge-emul.c                |  2 +-
 drivers/pci/pci-pf-stub.c                    |  2 +-
 drivers/pci/pci.c                            |  2 +-
 drivers/pci/pcie/aer_inject.c                |  2 +-
 include/linux/pci.h                          |  2 +-
 include/linux/pci_ids.h                      |  6 ++--
 16 files changed, 41 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 97c08146534a..e18499243f84 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -432,7 +432,7 @@ EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required);
  * @pdev: PCI device structure
  *
  * Returns negative value when PASID capability is not present.
- * Otherwise it returns the numer of supported PASIDs.
+ * Otherwise it returns the number of supported PASIDs.
  */
 int pci_max_pasids(struct pci_dev *pdev)
 {
diff --git a/drivers/pci/controller/dwc/pcie-armada8k.c b/drivers/pci/controller/dwc/pcie-armada8k.c
index 0c389a30ef5d..9012d5f60be9 100644
--- a/drivers/pci/controller/dwc/pcie-armada8k.c
+++ b/drivers/pci/controller/dwc/pcie-armada8k.c
@@ -55,7 +55,7 @@ struct armada8k_pcie {
 #define PCIE_ARUSER_REG			(PCIE_VENDOR_REGS_OFFSET + 0x5C)
 #define PCIE_AWUSER_REG			(PCIE_VENDOR_REGS_OFFSET + 0x60)
 /*
- * AR/AW Cache defauls: Normal memory, Write-Back, Read / Write
+ * AR/AW Cache defaults: Normal memory, Write-Back, Read / Write
  * allocate
  */
 #define ARCACHE_DEFAULT_VALUE		0x3511
diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c
index 9b599296205d..8df1914226be 100644
--- a/drivers/pci/controller/dwc/pcie-kirin.c
+++ b/drivers/pci/controller/dwc/pcie-kirin.c
@@ -2,7 +2,7 @@
 /*
  * PCIe host controller driver for Kirin Phone SoCs
  *
- * Copyright (C) 2017 Hilisicon Electronics Co., Ltd.
+ * Copyright (C) 2017 HiSilicon Electronics Co., Ltd.
  *		http://www.huawei.com
  *
  * Author: Xiaowei Song <songxiaowei@huawei.com>
diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c
index 134e0306ff00..fc0fe4d4de49 100644
--- a/drivers/pci/controller/pci-aardvark.c
+++ b/drivers/pci/controller/pci-aardvark.c
@@ -308,7 +308,7 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie)
 
 	advk_writel(pcie, PCIE_ISR1_ALL_MASK, PCIE_ISR1_MASK_REG);
 
-	/* Unmask all MSI's */
+	/* Unmask all MSIs */
 	advk_writel(pcie, 0, PCIE_MSI_MASK_REG);
 
 	/* Enable summary interrupt for GIC SPI source */
diff --git a/drivers/pci/controller/pcie-iproc-platform.c b/drivers/pci/controller/pcie-iproc-platform.c
index f30f5f3fb5c1..5a3550b6bb29 100644
--- a/drivers/pci/controller/pcie-iproc-platform.c
+++ b/drivers/pci/controller/pcie-iproc-platform.c
@@ -87,7 +87,7 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev)
 
 	/*
 	 * DT nodes are not used by all platforms that use the iProc PCIe
-	 * core driver. For platforms that require explict inbound mapping
+	 * core driver. For platforms that require explicit inbound mapping
 	 * configuration, "dma-ranges" would have been present in DT
 	 */
 	pcie->need_ib_cfg = of_property_read_bool(np, "dma-ranges");
diff --git a/drivers/pci/controller/pcie-iproc.c b/drivers/pci/controller/pcie-iproc.c
index e3ca46497470..2d457bfdaf66 100644
--- a/drivers/pci/controller/pcie-iproc.c
+++ b/drivers/pci/controller/pcie-iproc.c
@@ -163,7 +163,7 @@ enum iproc_pcie_ib_map_type {
  * @size_unit: inbound mapping region size unit, could be SZ_1K, SZ_1M, or
  * SZ_1G
  * @region_sizes: list of supported inbound mapping region sizes in KB, MB, or
- * GB, depedning on the size unit
+ * GB, depending on the size unit
  * @nr_sizes: number of supported inbound mapping region sizes
  * @nr_windows: number of supported inbound mapping windows for the region
  * @imap_addr_offset: register offset between the upper and lower 32-bit
diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 999a5509e57e..4575e0c6dc4b 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -627,7 +627,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
 	 * 32-bit resources.  __pci_assign_resource() enforces that
 	 * artificial restriction to make sure everything will fit.
 	 *
-	 * The only way we could use a 64-bit non-prefechable MEMBAR is
+	 * The only way we could use a 64-bit non-prefetchable MEMBAR is
 	 * if its address is <4GB so that we can convert it to a 32-bit
 	 * resource.  To be visible to the host OS, all VMD endpoints must
 	 * be initially configured by platform BIOS, which includes setting
diff --git a/drivers/pci/mmap.c b/drivers/pci/mmap.c
index 24505b08de40..b8c9011987f4 100644
--- a/drivers/pci/mmap.c
+++ b/drivers/pci/mmap.c
@@ -73,7 +73,7 @@ int pci_mmap_resource_range(struct pci_dev *pdev, int bar,
 #elif defined(HAVE_PCI_MMAP) /* && !ARCH_GENERIC_PCI_MMAP_RESOURCE */
 
 /*
- * Legacy setup: Impement pci_mmap_resource_range() as a wrapper around
+ * Legacy setup: Implement pci_mmap_resource_range() as a wrapper around
  * the architecture's pci_mmap_page_range(), converting to "user visible"
  * addresses as necessary.
  */
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index e039b740fe74..59a6d232f77a 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -237,7 +237,7 @@ static void msi_set_mask_bit(struct irq_data *data, u32 flag)
 }
 
 /**
- * pci_msi_mask_irq - Generic irq chip callback to mask PCI/MSI interrupts
+ * pci_msi_mask_irq - Generic IRQ chip callback to mask PCI/MSI interrupts
  * @data:	pointer to irqdata associated to that interrupt
  */
 void pci_msi_mask_irq(struct irq_data *data)
@@ -247,7 +247,7 @@ void pci_msi_mask_irq(struct irq_data *data)
 EXPORT_SYMBOL_GPL(pci_msi_mask_irq);
 
 /**
- * pci_msi_unmask_irq - Generic irq chip callback to unmask PCI/MSI interrupts
+ * pci_msi_unmask_irq - Generic IRQ chip callback to unmask PCI/MSI interrupts
  * @data:	pointer to irqdata associated to that interrupt
  */
 void pci_msi_unmask_irq(struct irq_data *data)
@@ -588,11 +588,11 @@ static int msi_verify_entries(struct pci_dev *dev)
  * msi_capability_init - configure device's MSI capability structure
  * @dev: pointer to the pci_dev data structure of MSI device function
  * @nvec: number of interrupts to allocate
- * @affd: description of automatic irq affinity assignments (may be %NULL)
+ * @affd: description of automatic IRQ affinity assignments (may be %NULL)
  *
  * Setup the MSI capability structure of the device with the requested
  * number of interrupts.  A return value of zero indicates the successful
- * setup of an entry with the new MSI irq.  A negative return value indicates
+ * setup of an entry with the new MSI IRQ.  A negative return value indicates
  * an error, and a positive return value indicates the number of interrupts
  * which could have been allocated.
  */
@@ -609,7 +609,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec,
 	if (!entry)
 		return -ENOMEM;
 
-	/* All MSIs are unmasked by default, Mask them all */
+	/* All MSIs are unmasked by default; mask them all */
 	mask = msi_mask(entry->msi_attrib.multi_cap);
 	msi_mask_irq(entry, mask, mask);
 
@@ -637,7 +637,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec,
 		return ret;
 	}
 
-	/* Set MSI enabled bits	 */
+	/* Set MSI enabled bits	*/
 	pci_intx_for_msi(dev, 0);
 	pci_msi_set_enable(dev, 1);
 	dev->msi_enabled = 1;
@@ -729,11 +729,11 @@ static void msix_program_entries(struct pci_dev *dev,
  * @dev: pointer to the pci_dev data structure of MSI-X device function
  * @entries: pointer to an array of struct msix_entry entries
  * @nvec: number of @entries
- * @affd: Optional pointer to enable automatic affinity assignement
+ * @affd: Optional pointer to enable automatic affinity assignment
  *
  * Setup the MSI-X capability structure of device function with a
- * single MSI-X irq. A return of zero indicates the successful setup of
- * requested MSI-X entries with allocated irqs or non-zero for otherwise.
+ * single MSI-X IRQ. A return of zero indicates the successful setup of
+ * requested MSI-X entries with allocated IRQs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
 				int nvec, struct irq_affinity *affd)
@@ -789,7 +789,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
 out_avail:
 	if (ret < 0) {
 		/*
-		 * If we had some success, report the number of irqs
+		 * If we had some success, report the number of IRQs
 		 * we succeeded in setting up.
 		 */
 		struct msi_desc *entry;
@@ -812,7 +812,7 @@ out_free:
 /**
  * pci_msi_supported - check whether MSI may be enabled on a device
  * @dev: pointer to the pci_dev data structure of MSI device function
- * @nvec: how many MSIs have been requested ?
+ * @nvec: how many MSIs have been requested?
  *
  * Look at global flags, the device itself, and its parent buses
  * to determine if MSI/-X are supported for the device. If MSI/-X is
@@ -896,7 +896,7 @@ static void pci_msi_shutdown(struct pci_dev *dev)
 	/* Keep cached state to be restored */
 	__pci_msi_desc_mask_irq(desc, mask, ~mask);
 
-	/* Restore dev->irq to its default pin-assertion irq */
+	/* Restore dev->irq to its default pin-assertion IRQ */
 	dev->irq = desc->msi_attrib.default_irq;
 	pcibios_alloc_irq(dev);
 }
@@ -958,7 +958,7 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
 		}
 	}
 
-	/* Check whether driver already requested for MSI irq */
+	/* Check whether driver already requested for MSI IRQ */
 	if (dev->msi_enabled) {
 		pci_info(dev, "can't enable MSI-X (MSI IRQ already assigned)\n");
 		return -EINVAL;
@@ -1026,7 +1026,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
 	if (!pci_msi_supported(dev, minvec))
 		return -EINVAL;
 
-	/* Check whether driver already requested MSI-X irqs */
+	/* Check whether driver already requested MSI-X IRQs */
 	if (dev->msix_enabled) {
 		pci_info(dev, "can't enable MSI (MSI-X already enabled)\n");
 		return -EINVAL;
@@ -1113,8 +1113,8 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
  * pci_enable_msix_range - configure device's MSI-X capability structure
  * @dev: pointer to the pci_dev data structure of MSI-X device function
  * @entries: pointer to an array of MSI-X entries
- * @minvec: minimum number of MSI-X irqs requested
- * @maxvec: maximum number of MSI-X irqs requested
+ * @minvec: minimum number of MSI-X IRQs requested
+ * @maxvec: maximum number of MSI-X IRQs requested
  *
  * Setup the MSI-X capability structure of device function with a maximum
  * possible number of interrupts in the range between @minvec and @maxvec
@@ -1179,7 +1179,7 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 			return msi_vecs;
 	}
 
-	/* use legacy irq if allowed */
+	/* use legacy IRQ if allowed */
 	if (flags & PCI_IRQ_LEGACY) {
 		if (min_vecs == 1 && dev->irq) {
 			/*
@@ -1248,7 +1248,7 @@ int pci_irq_vector(struct pci_dev *dev, unsigned int nr)
 EXPORT_SYMBOL(pci_irq_vector);
 
 /**
- * pci_irq_get_affinity - return the affinity of a particular msi vector
+ * pci_irq_get_affinity - return the affinity of a particular MSI vector
  * @dev:	PCI device to operate on
  * @nr:		device-relative interrupt vector index (0-based).
  */
@@ -1280,7 +1280,7 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 EXPORT_SYMBOL(pci_irq_get_affinity);
 
 /**
- * pci_irq_get_node - return the numa node of a particular msi vector
+ * pci_irq_get_node - return the NUMA node of a particular MSI vector
  * @pdev:	PCI device to operate on
  * @vec:	device-relative interrupt vector index (0-based).
  */
@@ -1330,7 +1330,7 @@ void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg)
 /**
  * pci_msi_domain_calc_hwirq - Generate a unique ID for an MSI source
  * @dev:	Pointer to the PCI device
- * @desc:	Pointer to the msi descriptor
+ * @desc:	Pointer to the MSI descriptor
  *
  * The ID number is only used within the irqdomain.
  */
@@ -1348,7 +1348,8 @@ static inline bool pci_msi_desc_is_multi_msi(struct msi_desc *desc)
 }
 
 /**
- * pci_msi_domain_check_cap - Verify that @domain supports the capabilities for @dev
+ * pci_msi_domain_check_cap - Verify that @domain supports the capabilities
+ * 			      for @dev
  * @domain:	The interrupt domain to check
  * @info:	The domain info for verification
  * @dev:	The device to check
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 742928d0053e..d953cc7d9a54 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -223,7 +223,7 @@ EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource);
 
 /*
  * Note this function returns the parent PCI device with a
- * reference taken. It is the caller's responsibily to drop
+ * reference taken. It is the caller's responsibility to drop
  * the reference.
  */
 static struct pci_dev *find_parent_pci_dev(struct device *dev)
@@ -380,7 +380,7 @@ static int upstream_bridge_distance(struct pci_dev *provider,
 
 	/*
 	 * Allow the connection if both devices are on a whitelisted root
-	 * complex, but add an arbitary large value to the distance.
+	 * complex, but add an arbitrary large value to the distance.
 	 */
 	if (root_complex_whitelist(provider) &&
 	    root_complex_whitelist(client))
@@ -439,7 +439,7 @@ static int upstream_bridge_distance_warn(struct pci_dev *provider,
 }
 
 /**
- * pci_p2pdma_distance_many - Determive the cumulative distance between
+ * pci_p2pdma_distance_many - Determine the cumulative distance between
  *	a p2pdma provider and the clients in use.
  * @provider: p2pdma provider to check against the client list
  * @clients: array of devices to check (NULL-terminated)
diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c
index 83fb077d0b41..06083b86d4f4 100644
--- a/drivers/pci/pci-bridge-emul.c
+++ b/drivers/pci/pci-bridge-emul.c
@@ -305,7 +305,7 @@ int pci_bridge_emul_init(struct pci_bridge_emul *bridge,
 }
 
 /*
- * Cleanup a pci_bridge_emul structure that was previously initilized
+ * Cleanup a pci_bridge_emul structure that was previously initialized
  * using pci_bridge_emul_init().
  */
 void pci_bridge_emul_cleanup(struct pci_bridge_emul *bridge)
diff --git a/drivers/pci/pci-pf-stub.c b/drivers/pci/pci-pf-stub.c
index 9795649fc6f9..ef293e735c55 100644
--- a/drivers/pci/pci-pf-stub.c
+++ b/drivers/pci/pci-pf-stub.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* pci-pf-stub - simple stub driver for PCI SR-IOV PF device
  *
- * This driver is meant to act as a "whitelist" for devices that provde
+ * This driver is meant to act as a "whitelist" for devices that provide
  * SR-IOV functionality while at the same time not actually needing a
  * driver of their own.
  */
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8abc843b1615..3fd4eaa32b21 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4501,7 +4501,7 @@ static int pci_af_flr(struct pci_dev *dev, int probe)
 
 	/*
 	 * Wait for Transaction Pending bit to clear.  A word-aligned test
-	 * is used, so we use the conrol offset rather than status and shift
+	 * is used, so we use the control offset rather than status and shift
 	 * the test bit to match.
 	 */
 	if (!pci_wait_for_pending(dev, pos + PCI_AF_CTRL,
diff --git a/drivers/pci/pcie/aer_inject.c b/drivers/pci/pcie/aer_inject.c
index 043b8b0cfcc5..6988fe7389b9 100644
--- a/drivers/pci/pcie/aer_inject.c
+++ b/drivers/pci/pcie/aer_inject.c
@@ -2,7 +2,7 @@
 /*
  * PCIe AER software error injection support.
  *
- * Debuging PCIe AER code is quite difficult because it is hard to
+ * Debugging PCIe AER code is quite difficult because it is hard to
  * trigger various real hardware errors. Software based error
  * injection can fake almost all kinds of errors with the help of a
  * user space helper tool aer-inject, which can be gotten from:
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4a5a84d7bdd4..fb207a22d686 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -382,7 +382,7 @@ struct pci_dev {
 
 	unsigned int	is_busmaster:1;		/* Is busmaster */
 	unsigned int	no_msi:1;		/* May not use MSI */
-	unsigned int	no_64bit_msi:1; 	/* May only use 32-bit MSIs */
+	unsigned int	no_64bit_msi:1;		/* May only use 32-bit MSIs */
 	unsigned int	block_cfg_access:1;	/* Config space access blocked */
 	unsigned int	broken_parity_status:1;	/* Generates false positive parity */
 	unsigned int	irq_reroute_variant:2;	/* Needs IRQ rerouting variant */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 70e86148cb1e..0dd239f11e91 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1112,7 +1112,7 @@
 
 #define PCI_VENDOR_ID_AL		0x10b9
 #define PCI_DEVICE_ID_AL_M1533		0x1533
-#define PCI_DEVICE_ID_AL_M1535 		0x1535
+#define PCI_DEVICE_ID_AL_M1535		0x1535
 #define PCI_DEVICE_ID_AL_M1541		0x1541
 #define PCI_DEVICE_ID_AL_M1563		0x1563
 #define PCI_DEVICE_ID_AL_M1621		0x1621
@@ -1752,7 +1752,7 @@
 #define PCI_VENDOR_ID_STALLION		0x124d
 
 /* Allied Telesyn */
-#define PCI_VENDOR_ID_AT    		0x1259
+#define PCI_VENDOR_ID_AT		0x1259
 #define PCI_SUBDEVICE_ID_AT_2700FX	0x2701
 #define PCI_SUBDEVICE_ID_AT_2701FX	0x2703
 
@@ -2550,7 +2550,7 @@
 #define PCI_DEVICE_ID_KORENIX_JETCARDF2	0x1700
 #define PCI_DEVICE_ID_KORENIX_JETCARDF3	0x17ff
 
-#define PCI_VENDOR_ID_HUAWEI         	0x19e5
+#define PCI_VENDOR_ID_HUAWEI		0x19e5
 
 #define PCI_VENDOR_ID_NETRONOME		0x19ee
 #define PCI_DEVICE_ID_NETRONOME_NFP4000	0x4000
-- 
cgit v1.2.3


From 390d57728d8e6f7283030cb20d3b5459771a32f1 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 9 Jul 2019 09:44:47 -0300
Subject: RDMA/core: Make rdma_counter.h compile stand alone

5.4-rc1 will have new compile time debugging to test that headers can be
compiled stand alone. Many rdma headers are already broken and excluded
from the mechanism, however to avoid compile failures during the merge
window fix enough so that the newly added header compiles clean.

Fixes: 413d3347503b ("RDMA/counter: Add set/clear per-port auto mode support")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Mark Zhang <markz@mellanox.com>
---
 include/rdma/rdma_counter.h | 2 +-
 include/rdma/restrack.h     | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index 68827700ba95..eb99856e8b30 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -9,10 +9,10 @@
 #include <linux/mutex.h>
 #include <linux/pid_namespace.h>
 
-#include <rdma/ib_verbs.h>
 #include <rdma/restrack.h>
 #include <rdma/rdma_netlink.h>
 
+struct ib_device;
 struct ib_qp;
 
 struct auto_mode_param {
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index 4041a4d96524..b0fc6b26bdf5 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -14,6 +14,9 @@
 #include <uapi/rdma/rdma_netlink.h>
 #include <linux/xarray.h>
 
+struct ib_device;
+struct sk_buff;
+
 /**
  * enum rdma_restrack_type - HW objects to track
  */
@@ -52,8 +55,6 @@ enum rdma_restrack_type {
 	RDMA_RESTRACK_MAX
 };
 
-struct ib_device;
-
 /**
  * struct rdma_restrack_entry - metadata per-entry
  */
-- 
cgit v1.2.3


From 1310051c720a83c5717658bcbff710b260f2bff9 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 19 Jun 2019 10:32:43 -0400
Subject: xprtrdma: Replace use of xdr_stream_pos in rpcrdma_marshal_req

This is a latent bug. xdr_stream_pos works by subtracting
xdr_stream::nwords from xdr_buf::len. But xdr_stream::nwords is not
initialized by xdr_init_encode().

It works today only because all fields in rpcrdma_req::rl_stream
are initialized to zero by rpcrdma_req_create, making the
subtraction in xdr_stream_pos always a no-op.

I found this issue via code inspection. It was introduced by commit
39f4cd9e9982 ("xprtrdma: Harden chunk list encoding against send
buffer overflow"), but the code has changed enough since then that
this fix can't be automatically applied to stable.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h | 9 +++++----
 net/sunrpc/xprtrdma/rpc_rdma.c | 6 +++---
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index f0678e3ac2d4..59492a93fe1d 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -470,13 +470,12 @@ TRACE_DEFINE_ENUM(rpcrdma_replych);
 
 TRACE_EVENT(xprtrdma_marshal,
 	TP_PROTO(
-		const struct rpc_rqst *rqst,
-		unsigned int hdrlen,
+		const struct rpcrdma_req *req,
 		unsigned int rtype,
 		unsigned int wtype
 	),
 
-	TP_ARGS(rqst, hdrlen, rtype, wtype),
+	TP_ARGS(req, rtype, wtype),
 
 	TP_STRUCT__entry(
 		__field(unsigned int, task_id)
@@ -491,10 +490,12 @@ TRACE_EVENT(xprtrdma_marshal,
 	),
 
 	TP_fast_assign(
+		const struct rpc_rqst *rqst = &req->rl_slot;
+
 		__entry->task_id = rqst->rq_task->tk_pid;
 		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
-		__entry->hdrlen = hdrlen;
+		__entry->hdrlen = req->rl_hdrbuf.len;
 		__entry->headlen = rqst->rq_snd_buf.head[0].iov_len;
 		__entry->pagelen = rqst->rq_snd_buf.page_len;
 		__entry->taillen = rqst->rq_snd_buf.tail[0].iov_len;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 85115a2e2639..97bfb804b6c6 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -867,12 +867,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
 	if (ret)
 		goto out_err;
 
-	trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype);
-
-	ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr),
+	ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
 					&rqst->rq_snd_buf, rtype);
 	if (ret)
 		goto out_err;
+
+	trace_xprtrdma_marshal(req, rtype, wtype);
 	return 0;
 
 out_err:
-- 
cgit v1.2.3


From 05eb06d86685e7d9dac60e6bbb46d7f4c30b056e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 19 Jun 2019 10:32:48 -0400
Subject: xprtrdma: Fix occasional transport deadlock

Under high I/O workloads, I've noticed that an RPC/RDMA transport
occasionally deadlocks (IOPS goes to zero, and doesn't recover).
Diagnosis shows that the sendctx queue is empty, but when sendctxs
are returned to the queue, the xprt_write_space wake-up never
occurs. The wake-up logic in rpcrdma_sendctx_put_locked is racy.

I noticed that both EMPTY_SCQ and XPRT_WRITE_SPACE are implemented
via an atomic bit. Just one of those is sufficient. Removing
EMPTY_SCQ in favor of the generic bit mechanism makes the deadlock
un-reproducible.

Without EMPTY_SCQ, rpcrdma_buffer::rb_flags is no longer used and
is therefore removed.

Unfortunately this patch does not apply cleanly to stable. If
needed, someone will have to port it and test it.

Fixes: 2fad659209d5 ("xprtrdma: Wait on empty sendctx queue")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h  | 27 +++++++++++++++++++++++++++
 net/sunrpc/xprtrdma/frwr_ops.c  |  6 +++++-
 net/sunrpc/xprtrdma/rpc_rdma.c  | 26 ++++++++++++--------------
 net/sunrpc/xprtrdma/verbs.c     | 11 +++--------
 net/sunrpc/xprtrdma/xprt_rdma.h |  6 ------
 5 files changed, 47 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 59492a93fe1d..2fb415136f7b 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -539,6 +539,33 @@ TRACE_EVENT(xprtrdma_marshal_failed,
 	)
 );
 
+TRACE_EVENT(xprtrdma_prepsend_failed,
+	TP_PROTO(const struct rpc_rqst *rqst,
+		 int ret
+	),
+
+	TP_ARGS(rqst, ret),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = rqst->rq_task->tk_pid;
+		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__entry->ret = ret;
+	),
+
+	TP_printk("task:%u@%u xid=0x%08x: ret=%d",
+		__entry->task_id, __entry->client_id, __entry->xid,
+		__entry->ret
+	)
+);
+
 TRACE_EVENT(xprtrdma_post_send,
 	TP_PROTO(
 		const struct rpcrdma_req *req,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 794ba4ca0994..ac47314fb751 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -391,7 +391,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 			rpcrdma_mr_recycle(mr);
 		mr = rpcrdma_mr_get(r_xprt);
 		if (!mr)
-			return ERR_PTR(-EAGAIN);
+			goto out_getmr_err;
 	} while (mr->frwr.fr_state != FRWR_IS_INVALID);
 	frwr = &mr->frwr;
 	frwr->fr_state = FRWR_IS_VALID;
@@ -448,6 +448,10 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 	*out = mr;
 	return seg;
 
+out_getmr_err:
+	xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
+	return ERR_PTR(-EAGAIN);
+
 out_dmamap_err:
 	mr->mr_dir = DMA_NONE;
 	trace_xprtrdma_frwr_sgerr(mr, i);
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 97bfb804b6c6..59b214ba8813 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -699,22 +699,28 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 			  struct rpcrdma_req *req, u32 hdrlen,
 			  struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
 {
+	int ret;
+
+	ret = -EAGAIN;
 	req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
 	if (!req->rl_sendctx)
-		return -EAGAIN;
+		goto err;
 	req->rl_sendctx->sc_wr.num_sge = 0;
 	req->rl_sendctx->sc_unmap_count = 0;
 	req->rl_sendctx->sc_req = req;
 	__clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
 
+	ret = -EIO;
 	if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen))
-		return -EIO;
-
+		goto err;
 	if (rtype != rpcrdma_areadch)
 		if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype))
-			return -EIO;
-
+			goto err;
 	return 0;
+
+err:
+	trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
+	return ret;
 }
 
 /**
@@ -877,15 +883,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
 
 out_err:
 	trace_xprtrdma_marshal_failed(rqst, ret);
-	switch (ret) {
-	case -EAGAIN:
-		xprt_wait_for_buffer_space(rqst->rq_xprt);
-		break;
-	case -ENOBUFS:
-		break;
-	default:
-		r_xprt->rx_stats.failed_marshal_count++;
-	}
+	r_xprt->rx_stats.failed_marshal_count++;
 	return ret;
 }
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e71315e9dca2..0be5a36cacb6 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -901,7 +901,7 @@ out_emptyq:
 	 * completions recently. This is a sign the Send Queue is
 	 * backing up. Cause the caller to pause and try again.
 	 */
-	set_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags);
+	xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
 	r_xprt->rx_stats.empty_sendctx_q++;
 	return NULL;
 }
@@ -936,10 +936,7 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
 	/* Paired with READ_ONCE */
 	smp_store_release(&buf->rb_sc_tail, next_tail);
 
-	if (test_and_clear_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags)) {
-		smp_mb__after_atomic();
-		xprt_write_space(&sc->sc_xprt->rx_xprt);
-	}
+	xprt_write_space(&sc->sc_xprt->rx_xprt);
 }
 
 static void
@@ -977,8 +974,6 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
 	r_xprt->rx_stats.mrs_allocated += count;
 	spin_unlock(&buf->rb_mrlock);
 	trace_xprtrdma_createmrs(r_xprt, count);
-
-	xprt_write_space(&r_xprt->rx_xprt);
 }
 
 static void
@@ -990,6 +985,7 @@ rpcrdma_mr_refresh_worker(struct work_struct *work)
 						   rx_buf);
 
 	rpcrdma_mrs_create(r_xprt);
+	xprt_write_space(&r_xprt->rx_xprt);
 }
 
 /**
@@ -1089,7 +1085,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	int i, rc;
 
-	buf->rb_flags = 0;
 	buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests;
 	buf->rb_bc_srv_max_requests = 0;
 	spin_lock_init(&buf->rb_mrlock);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index d1e0749bcbc4..2c6c5d8c3de1 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -391,7 +391,6 @@ struct rpcrdma_buffer {
 	struct list_head	rb_recv_bufs;
 	struct list_head	rb_allreqs;
 
-	unsigned long		rb_flags;
 	u32			rb_max_requests;
 	u32			rb_credits;	/* most recent credit grant */
 
@@ -402,11 +401,6 @@ struct rpcrdma_buffer {
 	struct delayed_work	rb_refresh_worker;
 };
 
-/* rb_flags */
-enum {
-	RPCRDMA_BUF_F_EMPTY_SCQ = 0,
-};
-
 /*
  * Statistics for RPCRDMA
  */
-- 
cgit v1.2.3


From 847568942f93e0af77e4bb8a098899f310cb3a88 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 19 Jun 2019 10:32:59 -0400
Subject: xprtrdma: Remove fr_state

Now that both the Send and Receive completions are handled in
process context, it is safe to DMA unmap and return MRs to the
free or recycle lists directly in the completion handlers.

Doing this means rpcrdma_frwr no longer needs to track the state of
each MR, meaning that a VALID or FLUSHED MR can no longer appear on
an xprt's MR free list. Thus there is no longer a need to track the
MR's registration state in rpcrdma_frwr.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h  |  19 +---
 net/sunrpc/xprtrdma/frwr_ops.c  | 204 ++++++++++++++++++----------------------
 net/sunrpc/xprtrdma/rpc_rdma.c  |   2 +-
 net/sunrpc/xprtrdma/xprt_rdma.h |  11 +--
 4 files changed, 96 insertions(+), 140 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 2fb415136f7b..1d2547084998 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -181,18 +181,6 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event,
 				),					\
 				TP_ARGS(task, mr, nsegs))
 
-TRACE_DEFINE_ENUM(FRWR_IS_INVALID);
-TRACE_DEFINE_ENUM(FRWR_IS_VALID);
-TRACE_DEFINE_ENUM(FRWR_FLUSHED_FR);
-TRACE_DEFINE_ENUM(FRWR_FLUSHED_LI);
-
-#define xprtrdma_show_frwr_state(x)					\
-		__print_symbolic(x,					\
-				{ FRWR_IS_INVALID, "INVALID" },		\
-				{ FRWR_IS_VALID, "VALID" },		\
-				{ FRWR_FLUSHED_FR, "FLUSHED_FR" },	\
-				{ FRWR_FLUSHED_LI, "FLUSHED_LI" })
-
 DECLARE_EVENT_CLASS(xprtrdma_frwr_done,
 	TP_PROTO(
 		const struct ib_wc *wc,
@@ -203,22 +191,19 @@ DECLARE_EVENT_CLASS(xprtrdma_frwr_done,
 
 	TP_STRUCT__entry(
 		__field(const void *, mr)
-		__field(unsigned int, state)
 		__field(unsigned int, status)
 		__field(unsigned int, vendor_err)
 	),
 
 	TP_fast_assign(
 		__entry->mr = container_of(frwr, struct rpcrdma_mr, frwr);
-		__entry->state = frwr->fr_state;
 		__entry->status = wc->status;
 		__entry->vendor_err = __entry->status ? wc->vendor_err : 0;
 	),
 
 	TP_printk(
-		"mr=%p state=%s: %s (%u/0x%x)",
-		__entry->mr, xprtrdma_show_frwr_state(__entry->state),
-		rdma_show_wc_status(__entry->status),
+		"mr=%p: %s (%u/0x%x)",
+		__entry->mr, rdma_show_wc_status(__entry->status),
 		__entry->status, __entry->vendor_err
 	)
 );
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index ac47314fb751..5c480bc13075 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -168,7 +168,6 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
 		goto out_list_err;
 
 	mr->frwr.fr_mr = frmr;
-	mr->frwr.fr_state = FRWR_IS_INVALID;
 	mr->mr_dir = DMA_NONE;
 	INIT_LIST_HEAD(&mr->mr_list);
 	INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
@@ -297,65 +296,6 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt)
 		     (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth);
 }
 
-/**
- * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
- * @cq:	completion queue (ignored)
- * @wc:	completed WR
- *
- */
-static void
-frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
-{
-	struct ib_cqe *cqe = wc->wr_cqe;
-	struct rpcrdma_frwr *frwr =
-			container_of(cqe, struct rpcrdma_frwr, fr_cqe);
-
-	/* WARNING: Only wr_cqe and status are reliable at this point */
-	if (wc->status != IB_WC_SUCCESS)
-		frwr->fr_state = FRWR_FLUSHED_FR;
-	trace_xprtrdma_wc_fastreg(wc, frwr);
-}
-
-/**
- * frwr_wc_localinv - Invoked by RDMA provider for a flushed LocalInv WC
- * @cq:	completion queue (ignored)
- * @wc:	completed WR
- *
- */
-static void
-frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
-{
-	struct ib_cqe *cqe = wc->wr_cqe;
-	struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr,
-						 fr_cqe);
-
-	/* WARNING: Only wr_cqe and status are reliable at this point */
-	if (wc->status != IB_WC_SUCCESS)
-		frwr->fr_state = FRWR_FLUSHED_LI;
-	trace_xprtrdma_wc_li(wc, frwr);
-}
-
-/**
- * frwr_wc_localinv_wake - Invoked by RDMA provider for a signaled LocalInv WC
- * @cq:	completion queue (ignored)
- * @wc:	completed WR
- *
- * Awaken anyone waiting for an MR to finish being fenced.
- */
-static void
-frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
-{
-	struct ib_cqe *cqe = wc->wr_cqe;
-	struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr,
-						 fr_cqe);
-
-	/* WARNING: Only wr_cqe and status are reliable at this point */
-	if (wc->status != IB_WC_SUCCESS)
-		frwr->fr_state = FRWR_FLUSHED_LI;
-	trace_xprtrdma_wc_li_wake(wc, frwr);
-	complete(&frwr->fr_linv_done);
-}
-
 /**
  * frwr_map - Register a memory region
  * @r_xprt: controlling transport
@@ -378,23 +318,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
-	struct rpcrdma_frwr *frwr;
 	struct rpcrdma_mr *mr;
 	struct ib_mr *ibmr;
 	struct ib_reg_wr *reg_wr;
 	int i, n;
 	u8 key;
 
-	mr = NULL;
-	do {
-		if (mr)
-			rpcrdma_mr_recycle(mr);
-		mr = rpcrdma_mr_get(r_xprt);
-		if (!mr)
-			goto out_getmr_err;
-	} while (mr->frwr.fr_state != FRWR_IS_INVALID);
-	frwr = &mr->frwr;
-	frwr->fr_state = FRWR_IS_VALID;
+	mr = rpcrdma_mr_get(r_xprt);
+	if (!mr)
+		goto out_getmr_err;
 
 	if (nsegs > ia->ri_max_frwr_depth)
 		nsegs = ia->ri_max_frwr_depth;
@@ -423,7 +355,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 	if (!mr->mr_nents)
 		goto out_dmamap_err;
 
-	ibmr = frwr->fr_mr;
+	ibmr = mr->frwr.fr_mr;
 	n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
 	if (unlikely(n != mr->mr_nents))
 		goto out_mapmr_err;
@@ -433,7 +365,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 	key = (u8)(ibmr->rkey & 0x000000FF);
 	ib_update_fast_reg_key(ibmr, ++key);
 
-	reg_wr = &frwr->fr_regwr;
+	reg_wr = &mr->frwr.fr_regwr;
 	reg_wr->mr = ibmr;
 	reg_wr->key = ibmr->rkey;
 	reg_wr->access = writing ?
@@ -464,6 +396,23 @@ out_mapmr_err:
 	return ERR_PTR(-EIO);
 }
 
+/**
+ * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
+ * @cq:	completion queue (ignored)
+ * @wc:	completed WR
+ *
+ */
+static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_frwr *frwr =
+		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+
+	/* WARNING: Only wr_cqe and status are reliable at this point */
+	trace_xprtrdma_wc_fastreg(wc, frwr);
+	/* The MR will get recycled when the associated req is retransmitted */
+}
+
 /**
  * frwr_send - post Send WR containing the RPC Call message
  * @ia: interface adapter
@@ -516,31 +465,72 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
 		if (mr->mr_handle == rep->rr_inv_rkey) {
 			list_del_init(&mr->mr_list);
 			trace_xprtrdma_mr_remoteinv(mr);
-			mr->frwr.fr_state = FRWR_IS_INVALID;
 			rpcrdma_mr_unmap_and_put(mr);
 			break;	/* only one invalidated MR per RPC */
 		}
 }
 
+static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
+{
+	if (wc->status != IB_WC_SUCCESS)
+		rpcrdma_mr_recycle(mr);
+	else
+		rpcrdma_mr_unmap_and_put(mr);
+}
+
 /**
- * frwr_unmap_sync - invalidate memory regions that were registered for @req
- * @r_xprt: controlling transport
- * @mrs: list of MRs to process
+ * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
+ * @cq:	completion queue (ignored)
+ * @wc:	completed WR
  *
- * Sleeps until it is safe for the host CPU to access the
- * previously mapped memory regions.
+ */
+static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_frwr *frwr =
+		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+
+	/* WARNING: Only wr_cqe and status are reliable at this point */
+	trace_xprtrdma_wc_li(wc, frwr);
+	__frwr_release_mr(wc, mr);
+}
+
+/**
+ * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
+ * @cq:	completion queue (ignored)
+ * @wc:	completed WR
  *
- * Caller ensures that @mrs is not empty before the call. This
- * function empties the list.
+ * Awaken anyone waiting for an MR to finish being fenced.
  */
-void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
+static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_frwr *frwr =
+		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+
+	/* WARNING: Only wr_cqe and status are reliable at this point */
+	trace_xprtrdma_wc_li_wake(wc, frwr);
+	complete(&frwr->fr_linv_done);
+	__frwr_release_mr(wc, mr);
+}
+
+/**
+ * frwr_unmap_sync - invalidate memory regions that were registered for @req
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req with a non-empty list of MRs to process
+ *
+ * Sleeps until it is safe for the host CPU to access the previously mapped
+ * memory regions.
+ */
+void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
 	struct ib_send_wr *first, **prev, *last;
 	const struct ib_send_wr *bad_wr;
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_frwr *frwr;
 	struct rpcrdma_mr *mr;
-	int count, rc;
+	int rc;
 
 	/* ORDER: Invalidate all of the MRs first
 	 *
@@ -548,33 +538,32 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
 	 * a single ib_post_send() call.
 	 */
 	frwr = NULL;
-	count = 0;
 	prev = &first;
-	list_for_each_entry(mr, mrs, mr_list) {
-		mr->frwr.fr_state = FRWR_IS_INVALID;
+	while (!list_empty(&req->rl_registered)) {
+		mr = rpcrdma_mr_pop(&req->rl_registered);
 
-		frwr = &mr->frwr;
 		trace_xprtrdma_mr_localinv(mr);
+		r_xprt->rx_stats.local_inv_needed++;
 
+		frwr = &mr->frwr;
 		frwr->fr_cqe.done = frwr_wc_localinv;
 		last = &frwr->fr_invwr;
-		memset(last, 0, sizeof(*last));
+		last->next = NULL;
 		last->wr_cqe = &frwr->fr_cqe;
+		last->sg_list = NULL;
+		last->num_sge = 0;
 		last->opcode = IB_WR_LOCAL_INV;
+		last->send_flags = IB_SEND_SIGNALED;
 		last->ex.invalidate_rkey = mr->mr_handle;
-		count++;
 
 		*prev = last;
 		prev = &last->next;
 	}
-	if (!frwr)
-		goto unmap;
 
 	/* Strong send queue ordering guarantees that when the
 	 * last WR in the chain completes, all WRs in the chain
 	 * are complete.
 	 */
-	last->send_flags = IB_SEND_SIGNALED;
 	frwr->fr_cqe.done = frwr_wc_localinv_wake;
 	reinit_completion(&frwr->fr_linv_done);
 
@@ -582,29 +571,20 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
 	 * replaces the QP. The RPC reply handler won't call us
 	 * unless ri_id->qp is a valid pointer.
 	 */
-	r_xprt->rx_stats.local_inv_needed++;
 	bad_wr = NULL;
-	rc = ib_post_send(ia->ri_id->qp, first, &bad_wr);
-	if (bad_wr != first)
-		wait_for_completion(&frwr->fr_linv_done);
-	if (rc)
-		goto out_release;
+	rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+	trace_xprtrdma_post_send(req, rc);
 
-	/* ORDER: Now DMA unmap all of the MRs, and return
-	 * them to the free MR list.
+	/* The final LOCAL_INV WR in the chain is supposed to
+	 * do the wake. If it was never posted, the wake will
+	 * not happen, so don't wait in that case.
 	 */
-unmap:
-	while (!list_empty(mrs)) {
-		mr = rpcrdma_mr_pop(mrs);
-		rpcrdma_mr_unmap_and_put(mr);
-	}
-	return;
-
-out_release:
-	pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc);
+	if (bad_wr != first)
+		wait_for_completion(&frwr->fr_linv_done);
+	if (!rc)
+		return;
 
-	/* Unmap and release the MRs in the LOCAL_INV WRs that did not
-	 * get posted.
+	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
 	 */
 	while (bad_wr) {
 		frwr = container_of(bad_wr, struct rpcrdma_frwr,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index fbc0a9ff14b1..f23450b176dd 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1277,7 +1277,7 @@ void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	 * RPC has relinquished all its Send Queue entries.
 	 */
 	if (!list_empty(&req->rl_registered))
-		frwr_unmap_sync(r_xprt, &req->rl_registered);
+		frwr_unmap_sync(r_xprt, req);
 
 	/* Ensure that any DMA mapped pages associated with
 	 * the Send of the RPC Call have been unmapped before
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 3c39aa3c113c..a9de116a5c1a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -240,17 +240,9 @@ struct rpcrdma_sendctx {
  * An external memory region is any buffer or page that is registered
  * on the fly (ie, not pre-registered).
  */
-enum rpcrdma_frwr_state {
-	FRWR_IS_INVALID,	/* ready to be used */
-	FRWR_IS_VALID,		/* in use */
-	FRWR_FLUSHED_FR,	/* flushed FASTREG WR */
-	FRWR_FLUSHED_LI,	/* flushed LOCALINV WR */
-};
-
 struct rpcrdma_frwr {
 	struct ib_mr			*fr_mr;
 	struct ib_cqe			fr_cqe;
-	enum rpcrdma_frwr_state		fr_state;
 	struct completion		fr_linv_done;
 	union {
 		struct ib_reg_wr	fr_regwr;
@@ -567,8 +559,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 				struct rpcrdma_mr **mr);
 int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
 void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
-void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt,
-		     struct list_head *mrs);
+void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
-- 
cgit v1.2.3


From d8099feda4833bab96b1bf312e9e6aad6b771570 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 19 Jun 2019 10:33:10 -0400
Subject: xprtrdma: Reduce context switching due to Local Invalidation

Since commit ba69cd122ece ("xprtrdma: Remove support for FMR memory
registration"), FRWR is the only supported memory registration mode.

We can take advantage of the asynchronous nature of FRWR's LOCAL_INV
Work Requests to get rid of the completion wait by having the
LOCAL_INV completion handler take care of DMA unmapping MRs and
waking the upper layer RPC waiter.

This eliminates two context switches when local invalidation is
necessary. As a side benefit, we will no longer need the per-xprt
deferred completion work queue.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h  |   1 +
 net/sunrpc/xprtrdma/frwr_ops.c  | 103 +++++++++++++++++++++++++++++++++++++++-
 net/sunrpc/xprtrdma/rpc_rdma.c  |  61 ++++++++++++------------
 net/sunrpc/xprtrdma/verbs.c     |  17 -------
 net/sunrpc/xprtrdma/xprt_rdma.h |   8 ++--
 5 files changed, 137 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 1d2547084998..98023d91a72d 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -712,6 +712,7 @@ TRACE_EVENT(xprtrdma_wc_receive,
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg);
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li);
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake);
+DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_done);
 
 TRACE_EVENT(xprtrdma_frwr_alloc,
 	TP_PROTO(
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 524cac0a0715..0b6dad7580a1 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -542,7 +542,10 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
  * @req: rpcrdma_req with a non-empty list of MRs to process
  *
  * Sleeps until it is safe for the host CPU to access the previously mapped
- * memory regions.
+ * memory regions. This guarantees that registered MRs are properly fenced
+ * from the server before the RPC consumer accesses the data in them. It
+ * also ensures proper Send flow control: waking the next RPC waits until
+ * this RPC has relinquished all its Send Queue entries.
  */
 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
@@ -616,3 +619,101 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 		rpcrdma_mr_recycle(mr);
 	}
 }
+
+/**
+ * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
+ * @cq:	completion queue (ignored)
+ * @wc:	completed WR
+ *
+ */
+static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_frwr *frwr =
+		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+
+	/* WARNING: Only wr_cqe and status are reliable at this point */
+	trace_xprtrdma_wc_li_done(wc, frwr);
+	rpcrdma_complete_rqst(frwr->fr_req->rl_reply);
+	__frwr_release_mr(wc, mr);
+}
+
+/**
+ * frwr_unmap_async - invalidate memory regions that were registered for @req
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req with a non-empty list of MRs to process
+ *
+ * This guarantees that registered MRs are properly fenced from the
+ * server before the RPC consumer accesses the data in them. It also
+ * ensures proper Send flow control: waking the next RPC waits until
+ * this RPC has relinquished all its Send Queue entries.
+ */
+void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+	struct ib_send_wr *first, *last, **prev;
+	const struct ib_send_wr *bad_wr;
+	struct rpcrdma_frwr *frwr;
+	struct rpcrdma_mr *mr;
+	int rc;
+
+	/* Chain the LOCAL_INV Work Requests and post them with
+	 * a single ib_post_send() call.
+	 */
+	frwr = NULL;
+	prev = &first;
+	while (!list_empty(&req->rl_registered)) {
+		mr = rpcrdma_mr_pop(&req->rl_registered);
+
+		trace_xprtrdma_mr_localinv(mr);
+		r_xprt->rx_stats.local_inv_needed++;
+
+		frwr = &mr->frwr;
+		frwr->fr_cqe.done = frwr_wc_localinv;
+		frwr->fr_req = req;
+		last = &frwr->fr_invwr;
+		last->next = NULL;
+		last->wr_cqe = &frwr->fr_cqe;
+		last->sg_list = NULL;
+		last->num_sge = 0;
+		last->opcode = IB_WR_LOCAL_INV;
+		last->send_flags = IB_SEND_SIGNALED;
+		last->ex.invalidate_rkey = mr->mr_handle;
+
+		*prev = last;
+		prev = &last->next;
+	}
+
+	/* Strong send queue ordering guarantees that when the
+	 * last WR in the chain completes, all WRs in the chain
+	 * are complete. The last completion will wake up the
+	 * RPC waiter.
+	 */
+	frwr->fr_cqe.done = frwr_wc_localinv_done;
+
+	/* Transport disconnect drains the receive CQ before it
+	 * replaces the QP. The RPC reply handler won't call us
+	 * unless ri_id->qp is a valid pointer.
+	 */
+	bad_wr = NULL;
+	rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+	trace_xprtrdma_post_send(req, rc);
+	if (!rc)
+		return;
+
+	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
+	 */
+	while (bad_wr) {
+		frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
+		mr = container_of(frwr, struct rpcrdma_mr, frwr);
+		bad_wr = bad_wr->next;
+
+		rpcrdma_mr_recycle(mr);
+	}
+
+	/* The final LOCAL_INV WR in the chain is supposed to
+	 * do the wake. If it was never posted, the wake will
+	 * not happen, so wake here in that case.
+	 */
+	rpcrdma_complete_rqst(req->rl_reply);
+}
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 67d72d68ca6c..33b6e6a03f68 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1268,24 +1268,15 @@ out_badheader:
 	goto out;
 }
 
-void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+/* Ensure that any DMA mapped pages associated with
+ * the Send of the RPC Call have been unmapped before
+ * allowing the RPC to complete. This protects argument
+ * memory not controlled by the RPC client from being
+ * re-used before we're done with it.
+ */
+static void rpcrdma_release_tx(struct rpcrdma_xprt *r_xprt,
+			       struct rpcrdma_req *req)
 {
-	/* Invalidate and unmap the data payloads before waking
-	 * the waiting application. This guarantees the memory
-	 * regions are properly fenced from the server before the
-	 * application accesses the data. It also ensures proper
-	 * send flow control: waking the next RPC waits until this
-	 * RPC has relinquished all its Send Queue entries.
-	 */
-	if (!list_empty(&req->rl_registered))
-		frwr_unmap_sync(r_xprt, req);
-
-	/* Ensure that any DMA mapped pages associated with
-	 * the Send of the RPC Call have been unmapped before
-	 * allowing the RPC to complete. This protects argument
-	 * memory not controlled by the RPC client from being
-	 * re-used before we're done with it.
-	 */
 	if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
 		r_xprt->rx_stats.reply_waits_for_send++;
 		out_of_line_wait_on_bit(&req->rl_flags,
@@ -1295,24 +1286,23 @@ void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	}
 }
 
-/* Reply handling runs in the poll worker thread. Anything that
- * might wait is deferred to a separate workqueue.
+/**
+ * rpcrdma_release_rqst - Release hardware resources
+ * @r_xprt: controlling transport instance
+ * @req: request with resources to release
+ *
  */
-void rpcrdma_deferred_completion(struct work_struct *work)
+void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
-	struct rpcrdma_rep *rep =
-			container_of(work, struct rpcrdma_rep, rr_work);
-	struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst);
-	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+	if (!list_empty(&req->rl_registered))
+		frwr_unmap_sync(r_xprt, req);
 
-	trace_xprtrdma_defer_cmp(rep);
-	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
-		frwr_reminv(rep, &req->rl_registered);
-	rpcrdma_release_rqst(r_xprt, req);
-	rpcrdma_complete_rqst(rep);
+	rpcrdma_release_tx(r_xprt, req);
 }
 
-/* Process received RPC/RDMA messages.
+/**
+ * rpcrdma_reply_handler - Process received RPC/RDMA messages
+ * @rep: Incoming rpcrdma_rep object to process
  *
  * Errors must result in the RPC task either being awakened, or
  * allowed to timeout, to discover the errors at that time.
@@ -1374,7 +1364,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	rep->rr_rqst = rqst;
 
 	trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
-	queue_work(buf->rb_completion_wq, &rep->rr_work);
+
+	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
+		frwr_reminv(rep, &req->rl_registered);
+	if (!list_empty(&req->rl_registered)) {
+		frwr_unmap_async(r_xprt, req);
+		/* LocalInv completion will complete the RPC */
+	} else {
+		rpcrdma_release_tx(r_xprt, req);
+		rpcrdma_complete_rqst(rep);
+	}
 	return;
 
 out_badversion:
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 0be5a36cacb6..c50a4b295bd7 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -89,14 +89,12 @@ static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
  */
 static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
 {
-	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
 	/* Flush Receives, then wait for deferred Reply work
 	 * to complete.
 	 */
 	ib_drain_rq(ia->ri_id->qp);
-	drain_workqueue(buf->rb_completion_wq);
 
 	/* Deferred Reply processing might have scheduled
 	 * local invalidations.
@@ -1056,7 +1054,6 @@ static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp)
 
 	rep->rr_cqe.done = rpcrdma_wc_receive;
 	rep->rr_rxprt = r_xprt;
-	INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion);
 	rep->rr_recv_wr.next = NULL;
 	rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
 	rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
@@ -1117,15 +1114,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 	if (rc)
 		goto out;
 
-	buf->rb_completion_wq = alloc_workqueue("rpcrdma-%s",
-						WQ_MEM_RECLAIM | WQ_HIGHPRI,
-						0,
-			r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]);
-	if (!buf->rb_completion_wq) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
 	return 0;
 out:
 	rpcrdma_buffer_destroy(buf);
@@ -1199,11 +1187,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
 	cancel_delayed_work_sync(&buf->rb_refresh_worker);
 
-	if (buf->rb_completion_wq) {
-		destroy_workqueue(buf->rb_completion_wq);
-		buf->rb_completion_wq = NULL;
-	}
-
 	rpcrdma_sendctxs_destroy(buf);
 
 	while (!list_empty(&buf->rb_recv_bufs)) {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index a39652884308..e465221c9c96 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -202,10 +202,9 @@ struct rpcrdma_rep {
 	bool			rr_temp;
 	struct rpcrdma_regbuf	*rr_rdmabuf;
 	struct rpcrdma_xprt	*rr_rxprt;
-	struct work_struct	rr_work;
+	struct rpc_rqst		*rr_rqst;
 	struct xdr_buf		rr_hdrbuf;
 	struct xdr_stream	rr_stream;
-	struct rpc_rqst		*rr_rqst;
 	struct list_head	rr_list;
 	struct ib_recv_wr	rr_recv_wr;
 };
@@ -240,10 +239,12 @@ struct rpcrdma_sendctx {
  * An external memory region is any buffer or page that is registered
  * on the fly (ie, not pre-registered).
  */
+struct rpcrdma_req;
 struct rpcrdma_frwr {
 	struct ib_mr			*fr_mr;
 	struct ib_cqe			fr_cqe;
 	struct completion		fr_linv_done;
+	struct rpcrdma_req		*fr_req;
 	union {
 		struct ib_reg_wr	fr_regwr;
 		struct ib_send_wr	fr_invwr;
@@ -388,7 +389,6 @@ struct rpcrdma_buffer {
 	u32			rb_bc_srv_max_requests;
 	u32			rb_bc_max_requests;
 
-	struct workqueue_struct *rb_completion_wq;
 	struct delayed_work	rb_refresh_worker;
 };
 
@@ -561,6 +561,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
 void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
@@ -585,7 +586,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
 void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
 void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt,
 			  struct rpcrdma_req *req);
-void rpcrdma_deferred_completion(struct work_struct *work);
 
 static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
 {
-- 
cgit v1.2.3


From 675dd90ad0932f2c03912a5252458d792bd7033a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 19 Jun 2019 10:33:42 -0400
Subject: xprtrdma: Modernize ops->connect

Adapt and apply changes that were made to the TCP socket connect
code. See the following commits for details on the purpose of
these changes:

Commit 7196dbb02ea0 ("SUNRPC: Allow changing of the TCP timeout parameters on the fly")
Commit 3851f1cdb2b8 ("SUNRPC: Limit the reconnect backoff timer to the max RPC message timeout")
Commit 02910177aede ("SUNRPC: Fix reconnection timeouts")

Some common transport code is moved to xprt.c to satisfy the code
duplication police.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xprt.h     |  3 ++
 include/trace/events/rpcrdma.h  | 31 +++++++++++++++++++
 net/sunrpc/sched.c              |  1 +
 net/sunrpc/xprt.c               | 32 ++++++++++++++++++++
 net/sunrpc/xprtrdma/transport.c | 66 +++++++++++++++++++++++++++++++----------
 net/sunrpc/xprtrdma/xprt_rdma.h |  1 +
 net/sunrpc/xprtsock.c           | 23 ++------------
 7 files changed, 121 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a6d9fce7f20e..cc78fd38ea7d 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -334,6 +334,9 @@ struct xprt_class {
  */
 struct rpc_xprt		*xprt_create_transport(struct xprt_create *args);
 void			xprt_connect(struct rpc_task *task);
+unsigned long		xprt_reconnect_delay(const struct rpc_xprt *xprt);
+void			xprt_reconnect_backoff(struct rpc_xprt *xprt,
+					       unsigned long init_to);
 void			xprt_reserve(struct rpc_task *task);
 void			xprt_retry_reserve(struct rpc_task *task);
 int			xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task);
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 98023d91a72d..f6a4eaa85a3e 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -375,6 +375,37 @@ DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc);
 DEFINE_RXPRT_EVENT(xprtrdma_op_close);
 DEFINE_RXPRT_EVENT(xprtrdma_op_connect);
 
+TRACE_EVENT(xprtrdma_op_set_cto,
+	TP_PROTO(
+		const struct rpcrdma_xprt *r_xprt,
+		unsigned long connect,
+		unsigned long reconnect
+	),
+
+	TP_ARGS(r_xprt, connect, reconnect),
+
+	TP_STRUCT__entry(
+		__field(const void *, r_xprt)
+		__field(unsigned long, connect)
+		__field(unsigned long, reconnect)
+		__string(addr, rpcrdma_addrstr(r_xprt))
+		__string(port, rpcrdma_portstr(r_xprt))
+	),
+
+	TP_fast_assign(
+		__entry->r_xprt = r_xprt;
+		__entry->connect = connect;
+		__entry->reconnect = reconnect;
+		__assign_str(addr, rpcrdma_addrstr(r_xprt));
+		__assign_str(port, rpcrdma_portstr(r_xprt));
+	),
+
+	TP_printk("peer=[%s]:%s r_xprt=%p: connect=%lu reconnect=%lu",
+		__get_str(addr), __get_str(port), __entry->r_xprt,
+		__entry->connect / HZ, __entry->reconnect / HZ
+	)
+);
+
 TRACE_EVENT(xprtrdma_qp_event,
 	TP_PROTO(
 		const struct rpcrdma_xprt *r_xprt,
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index bb04ae52803a..5ad5dead7bfc 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -58,6 +58,7 @@ static struct rpc_wait_queue delay_queue;
  */
 struct workqueue_struct *rpciod_workqueue __read_mostly;
 struct workqueue_struct *xprtiod_workqueue __read_mostly;
+EXPORT_SYMBOL_GPL(xprtiod_workqueue);
 
 unsigned long
 rpc_task_timeout(const struct rpc_task *task)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index ad21880d5601..b1f54b7ccc0c 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -850,6 +850,38 @@ void xprt_connect(struct rpc_task *task)
 	xprt_release_write(xprt, task);
 }
 
+/**
+ * xprt_reconnect_delay - compute the wait before scheduling a connect
+ * @xprt: transport instance
+ *
+ */
+unsigned long xprt_reconnect_delay(const struct rpc_xprt *xprt)
+{
+	unsigned long start, now = jiffies;
+
+	start = xprt->stat.connect_start + xprt->reestablish_timeout;
+	if (time_after(start, now))
+		return start - now;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xprt_reconnect_delay);
+
+/**
+ * xprt_reconnect_backoff - compute the new re-establish timeout
+ * @xprt: transport instance
+ * @init_to: initial reestablish timeout
+ *
+ */
+void xprt_reconnect_backoff(struct rpc_xprt *xprt, unsigned long init_to)
+{
+	xprt->reestablish_timeout <<= 1;
+	if (xprt->reestablish_timeout > xprt->max_reconnect_timeout)
+		xprt->reestablish_timeout = xprt->max_reconnect_timeout;
+	if (xprt->reestablish_timeout < init_to)
+		xprt->reestablish_timeout = init_to;
+}
+EXPORT_SYMBOL_GPL(xprt_reconnect_backoff);
+
 enum xprt_xid_rb_cmp {
 	XID_RB_EQUAL,
 	XID_RB_LEFT,
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 3688e0782587..4993aa49ecbe 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -298,6 +298,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
 	module_put(THIS_MODULE);
 }
 
+/* 60 second timeout, no retries */
 static const struct rpc_timeout xprt_rdma_default_timeout = {
 	.to_initval = 60 * HZ,
 	.to_maxval = 60 * HZ,
@@ -323,8 +324,9 @@ xprt_setup_rdma(struct xprt_create *args)
 	if (!xprt)
 		return ERR_PTR(-ENOMEM);
 
-	/* 60 second timeout, no retries */
 	xprt->timeout = &xprt_rdma_default_timeout;
+	xprt->connect_timeout = xprt->timeout->to_initval;
+	xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
 	xprt->bind_timeout = RPCRDMA_BIND_TO;
 	xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
 	xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
@@ -487,31 +489,64 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
 }
 
 /**
- * xprt_rdma_connect - try to establish a transport connection
+ * xprt_rdma_set_connect_timeout - set timeouts for establishing a connection
+ * @xprt: controlling transport instance
+ * @connect_timeout: reconnect timeout after client disconnects
+ * @reconnect_timeout: reconnect timeout after server disconnects
+ *
+ */
+static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt,
+					      unsigned long connect_timeout,
+					      unsigned long reconnect_timeout)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	trace_xprtrdma_op_set_cto(r_xprt, connect_timeout, reconnect_timeout);
+
+	spin_lock(&xprt->transport_lock);
+
+	if (connect_timeout < xprt->connect_timeout) {
+		struct rpc_timeout to;
+		unsigned long initval;
+
+		to = *xprt->timeout;
+		initval = connect_timeout;
+		if (initval < RPCRDMA_INIT_REEST_TO << 1)
+			initval = RPCRDMA_INIT_REEST_TO << 1;
+		to.to_initval = initval;
+		to.to_maxval = initval;
+		r_xprt->rx_timeout = to;
+		xprt->timeout = &r_xprt->rx_timeout;
+		xprt->connect_timeout = connect_timeout;
+	}
+
+	if (reconnect_timeout < xprt->max_reconnect_timeout)
+		xprt->max_reconnect_timeout = reconnect_timeout;
+
+	spin_unlock(&xprt->transport_lock);
+}
+
+/**
+ * xprt_rdma_connect - schedule an attempt to reconnect
  * @xprt: transport state
- * @task: RPC scheduler context
+ * @task: RPC scheduler context (unused)
  *
  */
 static void
 xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 {
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	unsigned long delay;
 
 	trace_xprtrdma_op_connect(r_xprt);
+
+	delay = 0;
 	if (r_xprt->rx_ep.rep_connected != 0) {
-		/* Reconnect */
-		schedule_delayed_work(&r_xprt->rx_connect_worker,
-				      xprt->reestablish_timeout);
-		xprt->reestablish_timeout <<= 1;
-		if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
-			xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
-		else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
-			xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
-	} else {
-		schedule_delayed_work(&r_xprt->rx_connect_worker, 0);
-		if (!RPC_IS_ASYNC(task))
-			flush_delayed_work(&r_xprt->rx_connect_worker);
+		delay = xprt_reconnect_delay(xprt);
+		xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
 	}
+	queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker,
+			   delay);
 }
 
 /**
@@ -769,6 +804,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
 	.send_request		= xprt_rdma_send_request,
 	.close			= xprt_rdma_close,
 	.destroy		= xprt_rdma_destroy,
+	.set_connect_timeout	= xprt_rdma_tcp_set_connect_timeout,
 	.print_stats		= xprt_rdma_print_stats,
 	.enable_swap		= xprt_rdma_enable_swap,
 	.disable_swap		= xprt_rdma_disable_swap,
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 117e32816e4f..8378f45d2da7 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -432,6 +432,7 @@ struct rpcrdma_xprt {
 	struct rpcrdma_ep	rx_ep;
 	struct rpcrdma_buffer	rx_buf;
 	struct delayed_work	rx_connect_worker;
+	struct rpc_timeout	rx_timeout;
 	struct rpcrdma_stats	rx_stats;
 };
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index c69951ed2ebc..b154600085d6 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2402,25 +2402,6 @@ out:
 	xprt_wake_pending_tasks(xprt, status);
 }
 
-static unsigned long xs_reconnect_delay(const struct rpc_xprt *xprt)
-{
-	unsigned long start, now = jiffies;
-
-	start = xprt->stat.connect_start + xprt->reestablish_timeout;
-	if (time_after(start, now))
-		return start - now;
-	return 0;
-}
-
-static void xs_reconnect_backoff(struct rpc_xprt *xprt)
-{
-	xprt->reestablish_timeout <<= 1;
-	if (xprt->reestablish_timeout > xprt->max_reconnect_timeout)
-		xprt->reestablish_timeout = xprt->max_reconnect_timeout;
-	if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
-		xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
-}
-
 /**
  * xs_connect - connect a socket to a remote endpoint
  * @xprt: pointer to transport structure
@@ -2450,8 +2431,8 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 		/* Start by resetting any existing state */
 		xs_reset_transport(transport);
 
-		delay = xs_reconnect_delay(xprt);
-		xs_reconnect_backoff(xprt);
+		delay = xprt_reconnect_delay(xprt);
+		xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO);
 
 	} else
 		dprintk("RPC:       xs_connect scheduled xprt %p\n", xprt);
-- 
cgit v1.2.3


From 0fa03c624d8fc9932d0f27c39a9deca6a37e0e17 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 19 Apr 2019 13:34:07 -0600
Subject: io_uring: add support for sendmsg()

This is done through IORING_OP_SENDMSG. There's a new sqe->msg_flags
for the flags argument, and the msghdr struct is passed in the
sqe->addr field.

We use MSG_DONTWAIT to force an inline fast path if sendmsg() doesn't
block, and punt to async execution if it would have.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/socket.h        |  4 ++++
 include/uapi/linux/io_uring.h |  2 ++
 net/socket.c                  |  7 +++++++
 4 files changed, 53 insertions(+)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9f0ef4956f87..5d4cd8c4132d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1390,6 +1390,43 @@ static int io_sync_file_range(struct io_kiocb *req,
 	return 0;
 }
 
+static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	struct socket *sock;
+	int ret;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+
+	sock = sock_from_file(req->file, &ret);
+	if (sock) {
+		struct user_msghdr __user *msg;
+		unsigned flags;
+
+		flags = READ_ONCE(sqe->msg_flags);
+		if (flags & MSG_DONTWAIT)
+			req->flags |= REQ_F_NOWAIT;
+		else if (force_nonblock)
+			flags |= MSG_DONTWAIT;
+
+		msg = (struct user_msghdr __user *) (unsigned long)
+			READ_ONCE(sqe->addr);
+
+		ret = __sys_sendmsg_sock(sock, msg, flags);
+		if (force_nonblock && ret == -EAGAIN)
+			return ret;
+	}
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_put_req(req);
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
 static void io_poll_remove_one(struct io_kiocb *req)
 {
 	struct io_poll_iocb *poll = &req->poll;
@@ -1675,6 +1712,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_SYNC_FILE_RANGE:
 		ret = io_sync_file_range(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_SENDMSG:
+		ret = io_sendmsg(req, s->sqe, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index b57cd8bf96e2..9d770ef3ced5 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -12,6 +12,7 @@
 
 struct pid;
 struct cred;
+struct socket;
 
 #define __sockaddr_check_size(size)	\
 	BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
@@ -374,6 +375,9 @@ extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
 extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
 			  unsigned int vlen, unsigned int flags,
 			  bool forbid_cmsg_compat);
+extern long __sys_sendmsg_sock(struct socket *sock,
+			       struct user_msghdr __user *msg,
+			       unsigned int flags);
 
 /* helpers which do the actual work for syscalls */
 extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 10b7c45f6d57..d74742d6269f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -27,6 +27,7 @@ struct io_uring_sqe {
 		__u32		fsync_flags;
 		__u16		poll_events;
 		__u32		sync_range_flags;
+		__u32		msg_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -58,6 +59,7 @@ struct io_uring_sqe {
 #define IORING_OP_POLL_ADD	6
 #define IORING_OP_POLL_REMOVE	7
 #define IORING_OP_SYNC_FILE_RANGE	8
+#define IORING_OP_SENDMSG	9
 
 /*
  * sqe->fsync_flags
diff --git a/net/socket.c b/net/socket.c
index bffec466b4f1..b9536940255e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2313,6 +2313,13 @@ out_freeiov:
 /*
  *	BSD sendmsg interface
  */
+long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
+			unsigned int flags)
+{
+	struct msghdr msg_sys;
+
+	return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
+}
 
 long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
 		   bool forbid_cmsg_compat)
-- 
cgit v1.2.3


From aa1fa28fc73ea6b740ee7b62bf3b07141883dbb8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 19 Apr 2019 13:38:09 -0600
Subject: io_uring: add support for recvmsg()

This is done through IORING_OP_RECVMSG. This opcode uses the same
sqe->msg_flags that IORING_OP_SENDMSG added, and we pass in the
msghdr struct in the sqe->addr field as well.

We use MSG_DONTWAIT to force an inline fast path if recvmsg() doesn't
block, and punt to async execution if it would have.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 31 +++++++++++++++++++++++++++----
 include/linux/socket.h        |  3 +++
 include/uapi/linux/io_uring.h |  1 +
 net/socket.c                  |  8 ++++++++
 4 files changed, 39 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5d4cd8c4132d..8d86e31b0762 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1390,10 +1390,12 @@ static int io_sync_file_range(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-		      bool force_nonblock)
-{
 #if defined(CONFIG_NET)
+static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			   bool force_nonblock,
+		   long (*fn)(struct socket *, struct user_msghdr __user *,
+				unsigned int))
+{
 	struct socket *sock;
 	int ret;
 
@@ -1414,7 +1416,7 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		msg = (struct user_msghdr __user *) (unsigned long)
 			READ_ONCE(sqe->addr);
 
-		ret = __sys_sendmsg_sock(sock, msg, flags);
+		ret = fn(sock, msg, flags);
 		if (force_nonblock && ret == -EAGAIN)
 			return ret;
 	}
@@ -1422,6 +1424,24 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
 	io_put_req(req);
 	return 0;
+}
+#endif
+
+static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
 #else
 	return -EOPNOTSUPP;
 #endif
@@ -1715,6 +1735,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_SENDMSG:
 		ret = io_sendmsg(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_RECVMSG:
+		ret = io_recvmsg(req, s->sqe, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 9d770ef3ced5..97523818cb14 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -378,6 +378,9 @@ extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
 extern long __sys_sendmsg_sock(struct socket *sock,
 			       struct user_msghdr __user *msg,
 			       unsigned int flags);
+extern long __sys_recvmsg_sock(struct socket *sock,
+			       struct user_msghdr __user *msg,
+			       unsigned int flags);
 
 /* helpers which do the actual work for syscalls */
 extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d74742d6269f..1e1652f25cc1 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -60,6 +60,7 @@ struct io_uring_sqe {
 #define IORING_OP_POLL_REMOVE	7
 #define IORING_OP_SYNC_FILE_RANGE	8
 #define IORING_OP_SENDMSG	9
+#define IORING_OP_RECVMSG	10
 
 /*
  * sqe->fsync_flags
diff --git a/net/socket.c b/net/socket.c
index b9536940255e..98354cc18840 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2494,6 +2494,14 @@ out_freeiov:
  *	BSD recvmsg interface
  */
 
+long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
+			unsigned int flags)
+{
+	struct msghdr msg_sys;
+
+	return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+}
+
 long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
 		   bool forbid_cmsg_compat)
 {
-- 
cgit v1.2.3


From 6605bdd59c21bb34c8f14ac4d6f2d419185f3528 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 28 Jun 2019 09:53:29 -0700
Subject: nvme: add I/O characteristics fields

Several new fields have been introduced in version 1.4 of the NVMe spec
at offsets that were defined as reserved in version 1.3d of the NVMe
spec. Update the definition of the nvme_id_ns data structure such that
it is in sync with version 1.4 of the NVMe spec. This change preserves
backwards compatibility.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index d98b2d8baf4e..01aa6a6c241d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -315,7 +315,7 @@ struct nvme_id_ns {
 	__u8			nmic;
 	__u8			rescap;
 	__u8			fpi;
-	__u8			rsvd33;
+	__u8			dlfeat;
 	__le16			nawun;
 	__le16			nawupf;
 	__le16			nacwu;
@@ -324,11 +324,17 @@ struct nvme_id_ns {
 	__le16			nabspf;
 	__le16			noiob;
 	__u8			nvmcap[16];
-	__u8			rsvd64[28];
+	__le16			npwg;
+	__le16			npwa;
+	__le16			npdg;
+	__le16			npda;
+	__le16			nows;
+	__u8			rsvd74[18];
 	__le32			anagrpid;
 	__u8			rsvd96[3];
 	__u8			nsattr;
-	__u8			rsvd100[4];
+	__le16			nvmsetid;
+	__le16			endgid;
 	__u8			nguid[16];
 	__u8			eui64[8];
 	struct nvme_lbaf	lbaf[16];
-- 
cgit v1.2.3


From 13990cf8a180cc070f0b1266140e799db8754289 Mon Sep 17 00:00:00 2001
From: Amol Surati <suratiamol@gmail.com>
Date: Sun, 7 Jul 2019 14:27:29 +0530
Subject: ide: use BIT() macro for defining bit-flags

The BIT() macro is available for defining the required bit-flags.

Since it operates on an unsigned value and expands to an unsigned result,
using it, instead of an expression like (1 << x), also fixes the problem
of shifting a signed 32-bit value by 31 bits (e.g. 1 << 31).

Signed-off-by: Amol Surati <suratiamol@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ide.h | 272 ++++++++++++++++++++++++++--------------------------
 1 file changed, 136 insertions(+), 136 deletions(-)

(limited to 'include')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index 971cf76a78a0..46b771d6999e 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -253,9 +253,9 @@ static inline void ide_std_init_ports(struct ide_hw *hw,
  * Special Driver Flags
  */
 enum {
-	IDE_SFLAG_SET_GEOMETRY		= (1 << 0),
-	IDE_SFLAG_RECALIBRATE		= (1 << 1),
-	IDE_SFLAG_SET_MULTMODE		= (1 << 2),
+	IDE_SFLAG_SET_GEOMETRY		= BIT(0),
+	IDE_SFLAG_RECALIBRATE		= BIT(1),
+	IDE_SFLAG_SET_MULTMODE		= BIT(2),
 };
 
 /*
@@ -267,13 +267,13 @@ typedef enum {
 } ide_startstop_t;
 
 enum {
-	IDE_VALID_ERROR 		= (1 << 1),
+	IDE_VALID_ERROR 		= BIT(1),
 	IDE_VALID_FEATURE		= IDE_VALID_ERROR,
-	IDE_VALID_NSECT 		= (1 << 2),
-	IDE_VALID_LBAL			= (1 << 3),
-	IDE_VALID_LBAM			= (1 << 4),
-	IDE_VALID_LBAH			= (1 << 5),
-	IDE_VALID_DEVICE		= (1 << 6),
+	IDE_VALID_NSECT 		= BIT(2),
+	IDE_VALID_LBAL			= BIT(3),
+	IDE_VALID_LBAM			= BIT(4),
+	IDE_VALID_LBAH			= BIT(5),
+	IDE_VALID_DEVICE		= BIT(6),
 	IDE_VALID_LBA			= IDE_VALID_LBAL |
 					  IDE_VALID_LBAM |
 					  IDE_VALID_LBAH,
@@ -289,24 +289,24 @@ enum {
 };
 
 enum {
-	IDE_TFLAG_LBA48			= (1 << 0),
-	IDE_TFLAG_WRITE			= (1 << 1),
-	IDE_TFLAG_CUSTOM_HANDLER	= (1 << 2),
-	IDE_TFLAG_DMA_PIO_FALLBACK	= (1 << 3),
+	IDE_TFLAG_LBA48			= BIT(0),
+	IDE_TFLAG_WRITE			= BIT(1),
+	IDE_TFLAG_CUSTOM_HANDLER	= BIT(2),
+	IDE_TFLAG_DMA_PIO_FALLBACK	= BIT(3),
 	/* force 16-bit I/O operations */
-	IDE_TFLAG_IO_16BIT		= (1 << 4),
+	IDE_TFLAG_IO_16BIT		= BIT(4),
 	/* struct ide_cmd was allocated using kmalloc() */
-	IDE_TFLAG_DYN			= (1 << 5),
-	IDE_TFLAG_FS			= (1 << 6),
-	IDE_TFLAG_MULTI_PIO		= (1 << 7),
-	IDE_TFLAG_SET_XFER		= (1 << 8),
+	IDE_TFLAG_DYN			= BIT(5),
+	IDE_TFLAG_FS			= BIT(6),
+	IDE_TFLAG_MULTI_PIO		= BIT(7),
+	IDE_TFLAG_SET_XFER		= BIT(8),
 };
 
 enum {
-	IDE_FTFLAG_FLAGGED		= (1 << 0),
-	IDE_FTFLAG_SET_IN_FLAGS		= (1 << 1),
-	IDE_FTFLAG_OUT_DATA		= (1 << 2),
-	IDE_FTFLAG_IN_DATA		= (1 << 3),
+	IDE_FTFLAG_FLAGGED		= BIT(0),
+	IDE_FTFLAG_SET_IN_FLAGS		= BIT(1),
+	IDE_FTFLAG_OUT_DATA		= BIT(2),
+	IDE_FTFLAG_IN_DATA		= BIT(3),
 };
 
 struct ide_taskfile {
@@ -357,13 +357,13 @@ struct ide_cmd {
 /* ATAPI packet command flags */
 enum {
 	/* set when an error is considered normal - no retry (ide-tape) */
-	PC_FLAG_ABORT			= (1 << 0),
-	PC_FLAG_SUPPRESS_ERROR		= (1 << 1),
-	PC_FLAG_WAIT_FOR_DSC		= (1 << 2),
-	PC_FLAG_DMA_OK			= (1 << 3),
-	PC_FLAG_DMA_IN_PROGRESS		= (1 << 4),
-	PC_FLAG_DMA_ERROR		= (1 << 5),
-	PC_FLAG_WRITING			= (1 << 6),
+	PC_FLAG_ABORT			= BIT(0),
+	PC_FLAG_SUPPRESS_ERROR		= BIT(1),
+	PC_FLAG_WAIT_FOR_DSC		= BIT(2),
+	PC_FLAG_DMA_OK			= BIT(3),
+	PC_FLAG_DMA_IN_PROGRESS		= BIT(4),
+	PC_FLAG_DMA_ERROR		= BIT(5),
+	PC_FLAG_WRITING			= BIT(6),
 };
 
 #define ATAPI_WAIT_PC		(60 * HZ)
@@ -417,111 +417,111 @@ struct ide_disk_ops {
 
 /* ATAPI device flags */
 enum {
-	IDE_AFLAG_DRQ_INTERRUPT		= (1 << 0),
+	IDE_AFLAG_DRQ_INTERRUPT		= BIT(0),
 
 	/* ide-cd */
 	/* Drive cannot eject the disc. */
-	IDE_AFLAG_NO_EJECT		= (1 << 1),
+	IDE_AFLAG_NO_EJECT		= BIT(1),
 	/* Drive is a pre ATAPI 1.2 drive. */
-	IDE_AFLAG_PRE_ATAPI12		= (1 << 2),
+	IDE_AFLAG_PRE_ATAPI12		= BIT(2),
 	/* TOC addresses are in BCD. */
-	IDE_AFLAG_TOCADDR_AS_BCD	= (1 << 3),
+	IDE_AFLAG_TOCADDR_AS_BCD	= BIT(3),
 	/* TOC track numbers are in BCD. */
-	IDE_AFLAG_TOCTRACKS_AS_BCD	= (1 << 4),
+	IDE_AFLAG_TOCTRACKS_AS_BCD	= BIT(4),
 	/* Saved TOC information is current. */
-	IDE_AFLAG_TOC_VALID		= (1 << 6),
+	IDE_AFLAG_TOC_VALID		= BIT(6),
 	/* We think that the drive door is locked. */
-	IDE_AFLAG_DOOR_LOCKED		= (1 << 7),
+	IDE_AFLAG_DOOR_LOCKED		= BIT(7),
 	/* SET_CD_SPEED command is unsupported. */
-	IDE_AFLAG_NO_SPEED_SELECT	= (1 << 8),
-	IDE_AFLAG_VERTOS_300_SSD	= (1 << 9),
-	IDE_AFLAG_VERTOS_600_ESD	= (1 << 10),
-	IDE_AFLAG_SANYO_3CD		= (1 << 11),
-	IDE_AFLAG_FULL_CAPS_PAGE	= (1 << 12),
-	IDE_AFLAG_PLAY_AUDIO_OK		= (1 << 13),
-	IDE_AFLAG_LE_SPEED_FIELDS	= (1 << 14),
+	IDE_AFLAG_NO_SPEED_SELECT	= BIT(8),
+	IDE_AFLAG_VERTOS_300_SSD	= BIT(9),
+	IDE_AFLAG_VERTOS_600_ESD	= BIT(10),
+	IDE_AFLAG_SANYO_3CD		= BIT(11),
+	IDE_AFLAG_FULL_CAPS_PAGE	= BIT(12),
+	IDE_AFLAG_PLAY_AUDIO_OK		= BIT(13),
+	IDE_AFLAG_LE_SPEED_FIELDS	= BIT(14),
 
 	/* ide-floppy */
 	/* Avoid commands not supported in Clik drive */
-	IDE_AFLAG_CLIK_DRIVE		= (1 << 15),
+	IDE_AFLAG_CLIK_DRIVE		= BIT(15),
 	/* Requires BH algorithm for packets */
-	IDE_AFLAG_ZIP_DRIVE		= (1 << 16),
+	IDE_AFLAG_ZIP_DRIVE		= BIT(16),
 	/* Supports format progress report */
-	IDE_AFLAG_SRFP			= (1 << 17),
+	IDE_AFLAG_SRFP			= BIT(17),
 
 	/* ide-tape */
-	IDE_AFLAG_IGNORE_DSC		= (1 << 18),
+	IDE_AFLAG_IGNORE_DSC		= BIT(18),
 	/* 0 When the tape position is unknown */
-	IDE_AFLAG_ADDRESS_VALID		= (1 <<	19),
+	IDE_AFLAG_ADDRESS_VALID		= BIT(19),
 	/* Device already opened */
-	IDE_AFLAG_BUSY			= (1 << 20),
+	IDE_AFLAG_BUSY			= BIT(20),
 	/* Attempt to auto-detect the current user block size */
-	IDE_AFLAG_DETECT_BS		= (1 << 21),
+	IDE_AFLAG_DETECT_BS		= BIT(21),
 	/* Currently on a filemark */
-	IDE_AFLAG_FILEMARK		= (1 << 22),
+	IDE_AFLAG_FILEMARK		= BIT(22),
 	/* 0 = no tape is loaded, so we don't rewind after ejecting */
-	IDE_AFLAG_MEDIUM_PRESENT	= (1 << 23),
+	IDE_AFLAG_MEDIUM_PRESENT	= BIT(23),
 
-	IDE_AFLAG_NO_AUTOCLOSE		= (1 << 24),
+	IDE_AFLAG_NO_AUTOCLOSE		= BIT(24),
 };
 
 /* device flags */
 enum {
 	/* restore settings after device reset */
-	IDE_DFLAG_KEEP_SETTINGS		= (1 << 0),
+	IDE_DFLAG_KEEP_SETTINGS		= BIT(0),
 	/* device is using DMA for read/write */
-	IDE_DFLAG_USING_DMA		= (1 << 1),
+	IDE_DFLAG_USING_DMA		= BIT(1),
 	/* okay to unmask other IRQs */
-	IDE_DFLAG_UNMASK		= (1 << 2),
+	IDE_DFLAG_UNMASK		= BIT(2),
 	/* don't attempt flushes */
-	IDE_DFLAG_NOFLUSH		= (1 << 3),
+	IDE_DFLAG_NOFLUSH		= BIT(3),
 	/* DSC overlap */
-	IDE_DFLAG_DSC_OVERLAP		= (1 << 4),
+	IDE_DFLAG_DSC_OVERLAP		= BIT(4),
 	/* give potential excess bandwidth */
-	IDE_DFLAG_NICE1			= (1 << 5),
+	IDE_DFLAG_NICE1			= BIT(5),
 	/* device is physically present */
-	IDE_DFLAG_PRESENT		= (1 << 6),
+	IDE_DFLAG_PRESENT		= BIT(6),
 	/* disable Host Protected Area */
-	IDE_DFLAG_NOHPA			= (1 << 7),
+	IDE_DFLAG_NOHPA			= BIT(7),
 	/* id read from device (synthetic if not set) */
-	IDE_DFLAG_ID_READ		= (1 << 8),
-	IDE_DFLAG_NOPROBE		= (1 << 9),
+	IDE_DFLAG_ID_READ		= BIT(8),
+	IDE_DFLAG_NOPROBE		= BIT(9),
 	/* need to do check_media_change() */
-	IDE_DFLAG_REMOVABLE		= (1 << 10),
+	IDE_DFLAG_REMOVABLE		= BIT(10),
 	/* needed for removable devices */
-	IDE_DFLAG_ATTACH		= (1 << 11),
-	IDE_DFLAG_FORCED_GEOM		= (1 << 12),
+	IDE_DFLAG_ATTACH		= BIT(11),
+	IDE_DFLAG_FORCED_GEOM		= BIT(12),
 	/* disallow setting unmask bit */
-	IDE_DFLAG_NO_UNMASK		= (1 << 13),
+	IDE_DFLAG_NO_UNMASK		= BIT(13),
 	/* disallow enabling 32-bit I/O */
-	IDE_DFLAG_NO_IO_32BIT		= (1 << 14),
+	IDE_DFLAG_NO_IO_32BIT		= BIT(14),
 	/* for removable only: door lock/unlock works */
-	IDE_DFLAG_DOORLOCKING		= (1 << 15),
+	IDE_DFLAG_DOORLOCKING		= BIT(15),
 	/* disallow DMA */
-	IDE_DFLAG_NODMA			= (1 << 16),
+	IDE_DFLAG_NODMA			= BIT(16),
 	/* powermanagement told us not to do anything, so sleep nicely */
-	IDE_DFLAG_BLOCKED		= (1 << 17),
+	IDE_DFLAG_BLOCKED		= BIT(17),
 	/* sleeping & sleep field valid */
-	IDE_DFLAG_SLEEPING		= (1 << 18),
-	IDE_DFLAG_POST_RESET		= (1 << 19),
-	IDE_DFLAG_UDMA33_WARNED		= (1 << 20),
-	IDE_DFLAG_LBA48			= (1 << 21),
+	IDE_DFLAG_SLEEPING		= BIT(18),
+	IDE_DFLAG_POST_RESET		= BIT(19),
+	IDE_DFLAG_UDMA33_WARNED		= BIT(20),
+	IDE_DFLAG_LBA48			= BIT(21),
 	/* status of write cache */
-	IDE_DFLAG_WCACHE		= (1 << 22),
+	IDE_DFLAG_WCACHE		= BIT(22),
 	/* used for ignoring ATA_DF */
-	IDE_DFLAG_NOWERR		= (1 << 23),
+	IDE_DFLAG_NOWERR		= BIT(23),
 	/* retrying in PIO */
-	IDE_DFLAG_DMA_PIO_RETRY		= (1 << 24),
-	IDE_DFLAG_LBA			= (1 << 25),
+	IDE_DFLAG_DMA_PIO_RETRY		= BIT(24),
+	IDE_DFLAG_LBA			= BIT(25),
 	/* don't unload heads */
-	IDE_DFLAG_NO_UNLOAD		= (1 << 26),
+	IDE_DFLAG_NO_UNLOAD		= BIT(26),
 	/* heads unloaded, please don't reset port */
-	IDE_DFLAG_PARKED		= (1 << 27),
-	IDE_DFLAG_MEDIA_CHANGED		= (1 << 28),
+	IDE_DFLAG_PARKED		= BIT(27),
+	IDE_DFLAG_MEDIA_CHANGED		= BIT(28),
 	/* write protect */
-	IDE_DFLAG_WP			= (1 << 29),
-	IDE_DFLAG_FORMAT_IN_PROGRESS	= (1 << 30),
-	IDE_DFLAG_NIEN_QUIRK		= (1 << 31),
+	IDE_DFLAG_WP			= BIT(29),
+	IDE_DFLAG_FORMAT_IN_PROGRESS	= BIT(30),
+	IDE_DFLAG_NIEN_QUIRK		= BIT(31),
 };
 
 struct ide_drive_s {
@@ -709,7 +709,7 @@ struct ide_dma_ops {
 };
 
 enum {
-	IDE_PFLAG_PROBING		= (1 << 0),
+	IDE_PFLAG_PROBING		= BIT(0),
 };
 
 struct ide_host;
@@ -862,7 +862,7 @@ extern struct mutex ide_setting_mtx;
  * configurable drive settings
  */
 
-#define DS_SYNC	(1 << 0)
+#define DS_SYNC	BIT(0)
 
 struct ide_devset {
 	int		(*get)(ide_drive_t *);
@@ -1000,15 +1000,15 @@ static inline void ide_proc_unregister_driver(ide_drive_t *drive,
 
 enum {
 	/* enter/exit functions */
-	IDE_DBG_FUNC =			(1 << 0),
+	IDE_DBG_FUNC =			BIT(0),
 	/* sense key/asc handling */
-	IDE_DBG_SENSE =			(1 << 1),
+	IDE_DBG_SENSE =			BIT(1),
 	/* packet commands handling */
-	IDE_DBG_PC =			(1 << 2),
+	IDE_DBG_PC =			BIT(2),
 	/* request handling */
-	IDE_DBG_RQ =			(1 << 3),
+	IDE_DBG_RQ =			BIT(3),
 	/* driver probing/setup */
-	IDE_DBG_PROBE =			(1 << 4),
+	IDE_DBG_PROBE =			BIT(4),
 };
 
 /* DRV_NAME has to be defined in the driver before using the macro below */
@@ -1171,10 +1171,10 @@ ssize_t ide_park_store(struct device *dev, struct device_attribute *attr,
  * the tail of our block device request queue and wait for their completion.
  */
 enum {
-	REQ_IDETAPE_PC1		= (1 << 0), /* packet command (first stage) */
-	REQ_IDETAPE_PC2		= (1 << 1), /* packet command (second stage) */
-	REQ_IDETAPE_READ	= (1 << 2),
-	REQ_IDETAPE_WRITE	= (1 << 3),
+	REQ_IDETAPE_PC1		= BIT(0), /* packet command (first stage) */
+	REQ_IDETAPE_PC2		= BIT(1), /* packet command (second stage) */
+	REQ_IDETAPE_READ	= BIT(2),
+	REQ_IDETAPE_WRITE	= BIT(3),
 };
 
 int ide_queue_pc_tail(ide_drive_t *, struct gendisk *, struct ide_atapi_pc *,
@@ -1264,71 +1264,71 @@ struct ide_pci_enablebit {
 
 enum {
 	/* Uses ISA control ports not PCI ones. */
-	IDE_HFLAG_ISA_PORTS		= (1 << 0),
+	IDE_HFLAG_ISA_PORTS		= BIT(0),
 	/* single port device */
-	IDE_HFLAG_SINGLE		= (1 << 1),
+	IDE_HFLAG_SINGLE		= BIT(1),
 	/* don't use legacy PIO blacklist */
-	IDE_HFLAG_PIO_NO_BLACKLIST	= (1 << 2),
+	IDE_HFLAG_PIO_NO_BLACKLIST	= BIT(2),
 	/* set for the second port of QD65xx */
-	IDE_HFLAG_QD_2ND_PORT		= (1 << 3),
+	IDE_HFLAG_QD_2ND_PORT		= BIT(3),
 	/* use PIO8/9 for prefetch off/on */
-	IDE_HFLAG_ABUSE_PREFETCH	= (1 << 4),
+	IDE_HFLAG_ABUSE_PREFETCH	= BIT(4),
 	/* use PIO6/7 for fast-devsel off/on */
-	IDE_HFLAG_ABUSE_FAST_DEVSEL	= (1 << 5),
+	IDE_HFLAG_ABUSE_FAST_DEVSEL	= BIT(5),
 	/* use 100-102 and 200-202 PIO values to set DMA modes */
-	IDE_HFLAG_ABUSE_DMA_MODES	= (1 << 6),
+	IDE_HFLAG_ABUSE_DMA_MODES	= BIT(6),
 	/*
 	 * keep DMA setting when programming PIO mode, may be used only
 	 * for hosts which have separate PIO and DMA timings (ie. PMAC)
 	 */
-	IDE_HFLAG_SET_PIO_MODE_KEEP_DMA	= (1 << 7),
+	IDE_HFLAG_SET_PIO_MODE_KEEP_DMA	= BIT(7),
 	/* program host for the transfer mode after programming device */
-	IDE_HFLAG_POST_SET_MODE		= (1 << 8),
+	IDE_HFLAG_POST_SET_MODE		= BIT(8),
 	/* don't program host/device for the transfer mode ("smart" hosts) */
-	IDE_HFLAG_NO_SET_MODE		= (1 << 9),
+	IDE_HFLAG_NO_SET_MODE		= BIT(9),
 	/* trust BIOS for programming chipset/device for DMA */
-	IDE_HFLAG_TRUST_BIOS_FOR_DMA	= (1 << 10),
+	IDE_HFLAG_TRUST_BIOS_FOR_DMA	= BIT(10),
 	/* host is CS5510/CS5520 */
-	IDE_HFLAG_CS5520		= (1 << 11),
+	IDE_HFLAG_CS5520		= BIT(11),
 	/* ATAPI DMA is unsupported */
-	IDE_HFLAG_NO_ATAPI_DMA		= (1 << 12),
+	IDE_HFLAG_NO_ATAPI_DMA		= BIT(12),
 	/* set if host is a "non-bootable" controller */
-	IDE_HFLAG_NON_BOOTABLE		= (1 << 13),
+	IDE_HFLAG_NON_BOOTABLE		= BIT(13),
 	/* host doesn't support DMA */
-	IDE_HFLAG_NO_DMA		= (1 << 14),
+	IDE_HFLAG_NO_DMA		= BIT(14),
 	/* check if host is PCI IDE device before allowing DMA */
-	IDE_HFLAG_NO_AUTODMA		= (1 << 15),
+	IDE_HFLAG_NO_AUTODMA		= BIT(15),
 	/* host uses MMIO */
-	IDE_HFLAG_MMIO			= (1 << 16),
+	IDE_HFLAG_MMIO			= BIT(16),
 	/* no LBA48 */
-	IDE_HFLAG_NO_LBA48		= (1 << 17),
+	IDE_HFLAG_NO_LBA48		= BIT(17),
 	/* no LBA48 DMA */
-	IDE_HFLAG_NO_LBA48_DMA		= (1 << 18),
+	IDE_HFLAG_NO_LBA48_DMA		= BIT(18),
 	/* data FIFO is cleared by an error */
-	IDE_HFLAG_ERROR_STOPS_FIFO	= (1 << 19),
+	IDE_HFLAG_ERROR_STOPS_FIFO	= BIT(19),
 	/* serialize ports */
-	IDE_HFLAG_SERIALIZE		= (1 << 20),
+	IDE_HFLAG_SERIALIZE		= BIT(20),
 	/* host is DTC2278 */
-	IDE_HFLAG_DTC2278		= (1 << 21),
+	IDE_HFLAG_DTC2278		= BIT(21),
 	/* 4 devices on a single set of I/O ports */
-	IDE_HFLAG_4DRIVES		= (1 << 22),
+	IDE_HFLAG_4DRIVES		= BIT(22),
 	/* host is TRM290 */
-	IDE_HFLAG_TRM290		= (1 << 23),
+	IDE_HFLAG_TRM290		= BIT(23),
 	/* use 32-bit I/O ops */
-	IDE_HFLAG_IO_32BIT		= (1 << 24),
+	IDE_HFLAG_IO_32BIT		= BIT(24),
 	/* unmask IRQs */
-	IDE_HFLAG_UNMASK_IRQS		= (1 << 25),
-	IDE_HFLAG_BROKEN_ALTSTATUS	= (1 << 26),
+	IDE_HFLAG_UNMASK_IRQS		= BIT(25),
+	IDE_HFLAG_BROKEN_ALTSTATUS	= BIT(26),
 	/* serialize ports if DMA is possible (for sl82c105) */
-	IDE_HFLAG_SERIALIZE_DMA		= (1 << 27),
+	IDE_HFLAG_SERIALIZE_DMA		= BIT(27),
 	/* force host out of "simplex" mode */
-	IDE_HFLAG_CLEAR_SIMPLEX		= (1 << 28),
+	IDE_HFLAG_CLEAR_SIMPLEX		= BIT(28),
 	/* DSC overlap is unsupported */
-	IDE_HFLAG_NO_DSC		= (1 << 29),
+	IDE_HFLAG_NO_DSC		= BIT(29),
 	/* never use 32-bit I/O ops */
-	IDE_HFLAG_NO_IO_32BIT		= (1 << 30),
+	IDE_HFLAG_NO_IO_32BIT		= BIT(30),
 	/* never unmask IRQs */
-	IDE_HFLAG_NO_UNMASK_IRQS	= (1 << 31),
+	IDE_HFLAG_NO_UNMASK_IRQS	= BIT(31),
 };
 
 #ifdef CONFIG_BLK_DEV_OFFBOARD
@@ -1536,16 +1536,16 @@ struct ide_timing {
 };
 
 enum {
-	IDE_TIMING_SETUP	= (1 << 0),
-	IDE_TIMING_ACT8B	= (1 << 1),
-	IDE_TIMING_REC8B	= (1 << 2),
-	IDE_TIMING_CYC8B	= (1 << 3),
+	IDE_TIMING_SETUP	= BIT(0),
+	IDE_TIMING_ACT8B	= BIT(1),
+	IDE_TIMING_REC8B	= BIT(2),
+	IDE_TIMING_CYC8B	= BIT(3),
 	IDE_TIMING_8BIT		= IDE_TIMING_ACT8B | IDE_TIMING_REC8B |
 				  IDE_TIMING_CYC8B,
-	IDE_TIMING_ACTIVE	= (1 << 4),
-	IDE_TIMING_RECOVER	= (1 << 5),
-	IDE_TIMING_CYCLE	= (1 << 6),
-	IDE_TIMING_UDMA		= (1 << 7),
+	IDE_TIMING_ACTIVE	= BIT(4),
+	IDE_TIMING_RECOVER	= BIT(5),
+	IDE_TIMING_CYCLE	= BIT(6),
+	IDE_TIMING_UDMA		= BIT(7),
 	IDE_TIMING_ALL		= IDE_TIMING_SETUP | IDE_TIMING_8BIT |
 				  IDE_TIMING_ACTIVE | IDE_TIMING_RECOVER |
 				  IDE_TIMING_CYCLE | IDE_TIMING_UDMA,
-- 
cgit v1.2.3


From 73a2b047c81046a7f734a5759ab5fdedbb6968fd Mon Sep 17 00:00:00 2001
From: Alastair D'Silva <alastair@d-silva.org>
Date: Wed, 5 Jun 2019 13:15:45 +0200
Subject: ocxl: Update for AFU descriptor template version 1.1

The OpenCAPI discovery and configuration specification has been
updated and introduces version 1.1 of the AFU descriptor template,
with new fields to better define the memory layout of an OpenCAPI
adapter.

The ocxl driver doesn't do much yet to support LPC memory but as we
start seeing (non-LPC) AFU images using the new template, this patch
updates the config space parsing code to avoid spitting a warning.

Signed-off-by: Alastair D'Silva <alastair@d-silva.org>
Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
Reviewed-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
Acked-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190605111545.19762-1-fbarrat@linux.ibm.com
---
 drivers/misc/ocxl/config.c | 181 ++++++++++++++++++++++++++++++++++++++++-----
 include/misc/ocxl.h        |   5 +-
 2 files changed, 165 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
index 5e65acb8e134..c8e19bfb5ef9 100644
--- a/drivers/misc/ocxl/config.c
+++ b/drivers/misc/ocxl/config.c
@@ -20,11 +20,14 @@
 #define OCXL_DVSEC_TEMPL_MMIO_GLOBAL_SZ  0x28
 #define OCXL_DVSEC_TEMPL_MMIO_PP         0x30
 #define OCXL_DVSEC_TEMPL_MMIO_PP_SZ      0x38
-#define OCXL_DVSEC_TEMPL_MEM_SZ          0x3C
-#define OCXL_DVSEC_TEMPL_WWID            0x40
+#define OCXL_DVSEC_TEMPL_ALL_MEM_SZ      0x3C
+#define OCXL_DVSEC_TEMPL_LPC_MEM_START   0x40
+#define OCXL_DVSEC_TEMPL_WWID            0x48
+#define OCXL_DVSEC_TEMPL_LPC_MEM_SZ      0x58
 
 #define OCXL_MAX_AFU_PER_FUNCTION 64
-#define OCXL_TEMPL_LEN            0x58
+#define OCXL_TEMPL_LEN_1_0        0x58
+#define OCXL_TEMPL_LEN_1_1        0x60
 #define OCXL_TEMPL_NAME_LEN       24
 #define OCXL_CFG_TIMEOUT     3
 
@@ -269,34 +272,72 @@ static int read_afu_info(struct pci_dev *dev, struct ocxl_fn_config *fn,
 	return 0;
 }
 
+/**
+ * Read the template version from the AFU
+ * dev: the device for the AFU
+ * fn: the AFU offsets
+ * len: outputs the template length
+ * version: outputs the major<<8,minor version
+ *
+ * Returns 0 on success, negative on failure
+ */
+static int read_template_version(struct pci_dev *dev, struct ocxl_fn_config *fn,
+		u16 *len, u16 *version)
+{
+	u32 val32;
+	u8 major, minor;
+	int rc;
+
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_VERSION, &val32);
+	if (rc)
+		return rc;
+
+	*len = EXTRACT_BITS(val32, 16, 31);
+	major = EXTRACT_BITS(val32, 8, 15);
+	minor = EXTRACT_BITS(val32, 0, 7);
+	*version = (major << 8) + minor;
+	return 0;
+}
+
 int ocxl_config_check_afu_index(struct pci_dev *dev,
 				struct ocxl_fn_config *fn, int afu_idx)
 {
-	u32 val;
-	int rc, templ_major, templ_minor, len;
+	int rc;
+	u16 templ_version;
+	u16 len, expected_len;
 
 	pci_write_config_byte(dev,
 			fn->dvsec_afu_info_pos + OCXL_DVSEC_AFU_INFO_AFU_IDX,
 			afu_idx);
-	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_VERSION, &val);
+
+	rc = read_template_version(dev, fn, &len, &templ_version);
 	if (rc)
 		return rc;
 
-	/* AFU index map can have holes */
-	if (!val)
+	/* AFU index map can have holes, in which case we read all 0's */
+	if (!templ_version && !len)
 		return 0;
 
-	templ_major = EXTRACT_BITS(val, 8, 15);
-	templ_minor = EXTRACT_BITS(val, 0, 7);
 	dev_dbg(&dev->dev, "AFU descriptor template version %d.%d\n",
-		templ_major, templ_minor);
-
-	len = EXTRACT_BITS(val, 16, 31);
-	if (len != OCXL_TEMPL_LEN) {
-		dev_warn(&dev->dev,
-			"Unexpected template length in AFU information (%#x)\n",
-			len);
+		templ_version >> 8, templ_version & 0xFF);
+
+	switch (templ_version) {
+	case 0x0005: // v0.5 was used prior to the spec approval
+	case 0x0100:
+		expected_len = OCXL_TEMPL_LEN_1_0;
+		break;
+	case 0x0101:
+		expected_len = OCXL_TEMPL_LEN_1_1;
+		break;
+	default:
+		dev_warn(&dev->dev, "Unknown AFU template version %#x\n",
+			templ_version);
+		expected_len = len;
 	}
+	if (len != expected_len)
+		dev_warn(&dev->dev,
+			"Unexpected template length %#x in AFU information, expected %#x for version %#x\n",
+			len, expected_len, templ_version);
 	return 1;
 }
 
@@ -434,6 +475,102 @@ static int validate_afu(struct pci_dev *dev, struct ocxl_afu_config *afu)
 	return 0;
 }
 
+/**
+ * Populate AFU metadata regarding LPC memory
+ * dev: the device for the AFU
+ * fn: the AFU offsets
+ * afu: the AFU struct to populate the LPC metadata into
+ *
+ * Returns 0 on success, negative on failure
+ */
+static int read_afu_lpc_memory_info(struct pci_dev *dev,
+				struct ocxl_fn_config *fn,
+				struct ocxl_afu_config *afu)
+{
+	int rc;
+	u32 val32;
+	u16 templ_version;
+	u16 templ_len;
+	u64 total_mem_size = 0;
+	u64 lpc_mem_size = 0;
+
+	afu->lpc_mem_offset = 0;
+	afu->lpc_mem_size = 0;
+	afu->special_purpose_mem_offset = 0;
+	afu->special_purpose_mem_size = 0;
+	/*
+	 * For AFUs following template v1.0, the LPC memory covers the
+	 * total memory. Its size is a power of 2.
+	 *
+	 * For AFUs with template >= v1.01, the total memory size is
+	 * still a power of 2, but it is split in 2 parts:
+	 * - the LPC memory, whose size can now be anything
+	 * - the remainder memory is a special purpose memory, whose
+	 *   definition is AFU-dependent. It is not accessible through
+	 *   the usual commands for LPC memory
+	 */
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_ALL_MEM_SZ, &val32);
+	if (rc)
+		return rc;
+
+	val32 = EXTRACT_BITS(val32, 0, 7);
+	if (!val32)
+		return 0; /* No LPC memory */
+
+	/*
+	 * The configuration space spec allows for a memory size of up
+	 * to 2^255 bytes.
+	 *
+	 * Current generation hardware uses 56-bit physical addresses,
+	 * but we won't be able to get near close to that, as we won't
+	 * have a hole big enough in the memory map.  Let it pass in
+	 * the driver for now. We'll get an error from the firmware
+	 * when trying to configure something too big.
+	 */
+	total_mem_size = 1ull << val32;
+
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_LPC_MEM_START, &val32);
+	if (rc)
+		return rc;
+
+	afu->lpc_mem_offset = val32;
+
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_LPC_MEM_START + 4, &val32);
+	if (rc)
+		return rc;
+
+	afu->lpc_mem_offset |= (u64) val32 << 32;
+
+	rc = read_template_version(dev, fn, &templ_len, &templ_version);
+	if (rc)
+		return rc;
+
+	if (templ_version >= 0x0101) {
+		rc = read_afu_info(dev, fn,
+				OCXL_DVSEC_TEMPL_LPC_MEM_SZ, &val32);
+		if (rc)
+			return rc;
+		lpc_mem_size = val32;
+
+		rc = read_afu_info(dev, fn,
+				OCXL_DVSEC_TEMPL_LPC_MEM_SZ + 4, &val32);
+		if (rc)
+			return rc;
+		lpc_mem_size |= (u64) val32 << 32;
+	} else {
+		lpc_mem_size = total_mem_size;
+	}
+	afu->lpc_mem_size = lpc_mem_size;
+
+	if (lpc_mem_size < total_mem_size) {
+		afu->special_purpose_mem_offset =
+			afu->lpc_mem_offset + lpc_mem_size;
+		afu->special_purpose_mem_size =
+			total_mem_size - lpc_mem_size;
+	}
+	return 0;
+}
+
 int ocxl_config_read_afu(struct pci_dev *dev, struct ocxl_fn_config *fn,
 			struct ocxl_afu_config *afu, u8 afu_idx)
 {
@@ -467,10 +604,9 @@ int ocxl_config_read_afu(struct pci_dev *dev, struct ocxl_fn_config *fn,
 	if (rc)
 		return rc;
 
-	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_MEM_SZ, &val32);
+	rc = read_afu_lpc_memory_info(dev, fn, afu);
 	if (rc)
 		return rc;
-	afu->log_mem_size = EXTRACT_BITS(val32, 0, 7);
 
 	rc = read_afu_control(dev, afu);
 	if (rc)
@@ -487,7 +623,12 @@ int ocxl_config_read_afu(struct pci_dev *dev, struct ocxl_fn_config *fn,
 	dev_dbg(&dev->dev, "  pp mmio bar = %hhu\n", afu->pp_mmio_bar);
 	dev_dbg(&dev->dev, "  pp mmio offset = %#llx\n", afu->pp_mmio_offset);
 	dev_dbg(&dev->dev, "  pp mmio stride = %#x\n", afu->pp_mmio_stride);
-	dev_dbg(&dev->dev, "  mem size (log) = %hhu\n", afu->log_mem_size);
+	dev_dbg(&dev->dev, "  lpc_mem offset = %#llx\n", afu->lpc_mem_offset);
+	dev_dbg(&dev->dev, "  lpc_mem size = %#llx\n", afu->lpc_mem_size);
+	dev_dbg(&dev->dev, "  special purpose mem offset = %#llx\n",
+		afu->special_purpose_mem_offset);
+	dev_dbg(&dev->dev, "  special purpose mem size = %#llx\n",
+		afu->special_purpose_mem_size);
 	dev_dbg(&dev->dev, "  pasid supported (log) = %u\n",
 		afu->pasid_supported_log);
 	dev_dbg(&dev->dev, "  actag supported = %u\n",
diff --git a/include/misc/ocxl.h b/include/misc/ocxl.h
index 5c4b4916e6be..06dd5839e438 100644
--- a/include/misc/ocxl.h
+++ b/include/misc/ocxl.h
@@ -32,7 +32,10 @@ struct ocxl_afu_config {
 	u8 pp_mmio_bar;         /* per-process MMIO area */
 	u64 pp_mmio_offset;
 	u32 pp_mmio_stride;
-	u8 log_mem_size;
+	u64 lpc_mem_offset;
+	u64 lpc_mem_size;
+	u64 special_purpose_mem_offset;
+	u64 special_purpose_mem_size;
 	u8 pasid_supported_log;
 	u16 actag_supported;
 };
-- 
cgit v1.2.3


From 9b0eb69b75bccada2d341d7e7ca342f0cb1c9a6a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2019 13:39:48 -0700
Subject: cgroup, blkcg: Prepare some symbols for module and !CONFIG_CGROUP
 usages

btrfs is going to use css_put() and wbc helpers to improve cgroup
writeback support.  Add dummy css_get() definition and export wbc
helpers to prepare for module and !CONFIG_CGROUP builds.

Reported-by: kbuild test robot <lkp@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c     | 1 +
 fs/fs-writeback.c      | 3 +++
 include/linux/cgroup.h | 1 +
 3 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8afa52b0d148..ad7a91dec934 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -48,6 +48,7 @@ struct blkcg blkcg_root;
 EXPORT_SYMBOL_GPL(blkcg_root);
 
 struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+EXPORT_SYMBOL_GPL(blkcg_root_css);
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9ebfb1b28430..a8a40bc26c2f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -270,6 +270,7 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
 	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
 		wb_put(wb);
 }
+EXPORT_SYMBOL_GPL(__inode_attach_wb);
 
 /**
  * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
@@ -582,6 +583,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 	if (unlikely(wb_dying(wbc->wb)))
 		inode_switch_wbs(inode, wbc->wb_id);
 }
+EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
 
 /**
  * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
@@ -701,6 +703,7 @@ void wbc_detach_inode(struct writeback_control *wbc)
 	wb_put(wbc->wb);
 	wbc->wb = NULL;
 }
+EXPORT_SYMBOL_GPL(wbc_detach_inode);
 
 /**
  * wbc_account_io - account IO issued during writeback
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3745ecdad925..852d885df10a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -699,6 +699,7 @@ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
 struct cgroup_subsys_state;
 struct cgroup;
 
+static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
 					 struct task_struct *t) { return 0; }
-- 
cgit v1.2.3


From 34e51a5e1a6e939ed7d99c38173821ab86d577f4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2019 13:39:49 -0700
Subject: blkcg, writeback: Rename wbc_account_io() to
 wbc_account_cgroup_owner()

wbc_account_io() does a very specific job - try to see which cgroup is
actually dirtying an inode and transfer its ownership to the majority
dirtier if needed.  The name is too generic and confusing.  Let's
rename it to something more specific.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/admin-guide/cgroup-v2.rst | 2 +-
 fs/btrfs/extent_io.c                    | 4 ++--
 fs/buffer.c                             | 2 +-
 fs/ext4/page-io.c                       | 2 +-
 fs/f2fs/data.c                          | 4 ++--
 fs/fs-writeback.c                       | 8 ++++----
 fs/mpage.c                              | 2 +-
 include/linux/writeback.h               | 8 ++++----
 8 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index a5c845338d6d..6223f485f7e1 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2114,7 +2114,7 @@ following two functions.
 	a queue (device) has been associated with the bio and
 	before submission.
 
-  wbc_account_io(@wbc, @page, @bytes)
+  wbc_account_cgroup_owner(@wbc, @page, @bytes)
 	Should be called for each data segment being written out.
 	While this function doesn't care exactly when it's called
 	during the writeback session, it's the easiest and most
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index db337e53aab3..5106008f5e28 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2911,7 +2911,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
 			bio = NULL;
 		} else {
 			if (wbc)
-				wbc_account_io(wbc, page, page_size);
+				wbc_account_cgroup_owner(wbc, page, page_size);
 			return 0;
 		}
 	}
@@ -2924,7 +2924,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
 	bio->bi_opf = opf;
 	if (wbc) {
 		wbc_init_bio(wbc, bio);
-		wbc_account_io(wbc, page, page_size);
+		wbc_account_cgroup_owner(wbc, page, page_size);
 	}
 
 	*bio_ret = bio;
diff --git a/fs/buffer.c b/fs/buffer.c
index e450c55f6434..40547bbbea94 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3093,7 +3093,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
 
 	if (wbc) {
 		wbc_init_bio(wbc, bio);
-		wbc_account_io(wbc, bh->b_page, bh->b_size);
+		wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
 	}
 
 	submit_bio(bio);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4690618a92e9..56e287f5ee50 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -404,7 +404,7 @@ submit_and_retry:
 	ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
 	if (ret != bh->b_size)
 		goto submit_and_retry;
-	wbc_account_io(io->io_wbc, page, bh->b_size);
+	wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
 	io->io_next_block++;
 	return 0;
 }
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index eda4181d2092..e1cab1717ac7 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -470,7 +470,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 	}
 
 	if (fio->io_wbc && !is_read_io(fio->op))
-		wbc_account_io(fio->io_wbc, page, PAGE_SIZE);
+		wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
 
 	bio_set_op_attrs(bio, fio->op, fio->op_flags);
 
@@ -537,7 +537,7 @@ alloc_new:
 	}
 
 	if (fio->io_wbc)
-		wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE);
+		wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE);
 
 	io->last_block_in_bio = fio->new_blkaddr;
 	f2fs_trace_ios(fio, 0);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a8a40bc26c2f..0aef79e934bb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -706,7 +706,7 @@ void wbc_detach_inode(struct writeback_control *wbc)
 EXPORT_SYMBOL_GPL(wbc_detach_inode);
 
 /**
- * wbc_account_io - account IO issued during writeback
+ * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
  * @wbc: writeback_control of the writeback in progress
  * @page: page being written out
  * @bytes: number of bytes being written out
@@ -715,8 +715,8 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode);
  * controlled by @wbc.  Keep the book for foreign inode detection.  See
  * wbc_detach_inode().
  */
-void wbc_account_io(struct writeback_control *wbc, struct page *page,
-		    size_t bytes)
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+			      size_t bytes)
 {
 	struct cgroup_subsys_state *css;
 	int id;
@@ -753,7 +753,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
 	else
 		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
 }
-EXPORT_SYMBOL_GPL(wbc_account_io);
+EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
 
 /**
  * inode_congested - test whether an inode is congested
diff --git a/fs/mpage.c b/fs/mpage.c
index 436a85260394..a63620cdb73a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -647,7 +647,7 @@ alloc_new:
 	 * the confused fail path above (OOM) will be very confused when
 	 * it finds all bh marked clean (i.e. it will not write anything)
 	 */
-	wbc_account_io(wbc, page, PAGE_SIZE);
+	wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
 	length = first_unmapped << blkbits;
 	if (bio_add_page(bio, page, length, 0) < length) {
 		bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 738a0c24874f..dda5cf228172 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -188,8 +188,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 				 struct inode *inode)
 	__releases(&inode->i_lock);
 void wbc_detach_inode(struct writeback_control *wbc);
-void wbc_account_io(struct writeback_control *wbc, struct page *page,
-		    size_t bytes);
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+			      size_t bytes);
 void cgroup_writeback_umount(void);
 
 /**
@@ -291,8 +291,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
 {
 }
 
-static inline void wbc_account_io(struct writeback_control *wbc,
-				  struct page *page, size_t bytes)
+static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
+					    struct page *page, size_t bytes)
 {
 }
 
-- 
cgit v1.2.3


From 27b36d8fa81fa8274fb72f4eb1484026f6b6daa8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2019 13:39:50 -0700
Subject: blkcg, writeback: Add wbc->no_cgroup_owner

When writeback IOs are bounced through async layers, the IOs should
only be accounted against the wbc from the original bdi writeback to
avoid confusing cgroup inode ownership arbitration.  Add
wbc->no_cgroup_owner to allow disabling wbc cgroup owner accounting.
This will be used make btrfs compression work well with cgroup IO
control.

v2: Renamed from no_wbc_acct to no_cgroup_owner and added comment as
    per Jan.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/fs-writeback.c         | 2 +-
 include/linux/writeback.h | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0aef79e934bb..542b02d170f8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -727,7 +727,7 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
 	 * regular writeback instead of writing things out itself.
 	 */
-	if (!wbc->wb)
+	if (!wbc->wb || wbc->no_cgroup_owner)
 		return;
 
 	css = mem_cgroup_css_from_page(page);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index dda5cf228172..33a50fa09fac 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -68,6 +68,15 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
+
+	/*
+	 * When writeback IOs are bounced through async layers, only the
+	 * initial synchronous phase should be accounted towards inode
+	 * cgroup ownership arbitration to avoid confusion.  Later stages
+	 * can set the following flag to disable the accounting.
+	 */
+	unsigned no_cgroup_owner:1;
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback *wb;	/* wb this writeback is issued under */
 	struct inode *inode;		/* inode being written out */
-- 
cgit v1.2.3


From 653c45c6b90c9659facbef10546d1f3a8e37d0cf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2019 13:39:51 -0700
Subject: blkcg, writeback: Implement wbc_blkcg_css()

Add a helper to determine the target blkcg from wbc.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/writeback.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 33a50fa09fac..e056a22075cf 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -11,6 +11,7 @@
 #include <linux/flex_proportions.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/blk_types.h>
+#include <linux/blk-cgroup.h>
 
 struct bio;
 
@@ -101,6 +102,16 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc)
 	return 0;
 }
 
+static inline struct cgroup_subsys_state *
+wbc_blkcg_css(struct writeback_control *wbc)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+	if (wbc->wb)
+		return wbc->wb->blkcg_css;
+#endif
+	return blkcg_root_css;
+}
+
 /*
  * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
  * and are measured against each other in.  There always is one global
-- 
cgit v1.2.3


From d3f77dfdc71835f8db71ca57d272b1fbec9dfc18 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2019 13:39:52 -0700
Subject: blkcg: implement REQ_CGROUP_PUNT

When a shared kthread needs to issue a bio for a cgroup, doing so
synchronously can lead to priority inversions as the kthread can be
trapped waiting for that cgroup.  This patch implements
REQ_CGROUP_PUNT flag which makes submit_bio() punt the actual issuing
to a dedicated per-blkcg work item to avoid such priority inversions.

This will be used to fix priority inversions in btrfs compression and
should be generally useful as we grow filesystem support for
comprehensive IO control.

Cc: Chris Mason <clm@fb.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c          | 53 +++++++++++++++++++++++++++++++++++++++++++++
 block/blk-core.c            |  3 +++
 include/linux/backing-dev.h |  1 +
 include/linux/blk-cgroup.h  | 16 +++++++++++++-
 include/linux/blk_types.h   | 10 +++++++++
 include/linux/writeback.h   | 13 ++++++++---
 6 files changed, 92 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ad7a91dec934..24ed26957367 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -55,6 +55,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
 static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
 
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
@@ -89,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
 
+	WARN_ON(!bio_list_empty(&blkg->async_bios));
+
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
 	if (blkg->parent)
@@ -114,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
 	call_rcu(&blkg->rcu_head, __blkg_release);
 }
 
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+					     async_bio_work);
+	struct bio_list bios = BIO_EMPTY_LIST;
+	struct bio *bio;
+
+	/* as long as there are pending bios, @blkg can't go away */
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_merge(&bios, &blkg->async_bios);
+	bio_list_init(&blkg->async_bios);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	while ((bio = bio_list_pop(&bios)))
+		submit_bio(bio);
+}
+
 /**
  * blkg_alloc - allocate a blkg
  * @blkcg: block cgroup the new blkg is associated with
@@ -142,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
+	spin_lock_init(&blkg->async_bio_lock);
+	bio_list_init(&blkg->async_bios);
+	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
 	blkg->blkcg = blkcg;
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1528,6 +1551,25 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+
+	/* consume the flag first */
+	bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+	/* never bounce for the root cgroup */
+	if (!blkg->parent)
+		return false;
+
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_add(&blkg->async_bios, bio);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	return true;
+}
+
 /*
  * Scale the accumulated delay based on how long it has been since we updated
  * the delay.  We only call this when we are adding delay, in case it's been a
@@ -1729,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
 	atomic64_add(delta, &blkg->delay_nsec);
 }
 
+static int __init blkcg_init(void)
+{
+	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					    WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!blkcg_punt_bio_wq)
+		return -ENOMEM;
+	return 0;
+}
+subsys_initcall(blkcg_init);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c
index edd009213f5b..260e36a2c343 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1128,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
  */
 blk_qc_t submit_bio(struct bio *bio)
 {
+	if (blkcg_punt_bio_submit(bio))
+		return BLK_QC_T_NONE;
+
 	/*
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f9b029180241..35b31d176f74 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -48,6 +48,7 @@ extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 
 extern struct workqueue_struct *bdi_wq;
+extern struct workqueue_struct *bdi_async_bio_wq;
 
 static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 33f23a858438..689a58231288 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -132,13 +132,17 @@ struct blkcg_gq {
 
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
-	struct rcu_head			rcu_head;
+	spinlock_t			async_bio_lock;
+	struct bio_list			async_bios;
+	struct work_struct		async_bio_work;
 
 	atomic_t			use_delay;
 	atomic64_t			delay_nsec;
 	atomic64_t			delay_start;
 	u64				last_delay;
 	int				last_use;
+
+	struct rcu_head			rcu_head;
 };
 
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
 				  struct bio *bio) { return false; }
 #endif
 
+bool __blkcg_punt_bio_submit(struct bio *bio);
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio)
+{
+	if (bio->bi_opf & REQ_CGROUP_PUNT)
+		return __blkcg_punt_bio_submit(bio);
+	else
+		return false;
+}
 
 static inline void blkcg_bio_issue_init(struct bio *bio)
 {
@@ -848,6 +861,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
 
+static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio) { return true; }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6a53799c3fe2..feff3fe4467e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -311,6 +311,14 @@ enum req_flag_bits {
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
 	__REQ_NOWAIT,           /* Don't wait if request will block */
+	/*
+	 * When a shared kthread needs to issue a bio for a cgroup, doing
+	 * so synchronously can lead to priority inversions as the kthread
+	 * can be trapped waiting for that cgroup.  CGROUP_PUNT flag makes
+	 * submit_bio() punt the actual issuing to a dedicated per-blkcg
+	 * work item to avoid such priority inversions.
+	 */
+	__REQ_CGROUP_PUNT,
 
 	/* command specific flags for REQ_OP_WRITE_ZEROES: */
 	__REQ_NOUNMAP,		/* do not free blocks when zeroing */
@@ -337,6 +345,8 @@ enum req_flag_bits {
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT		(1ULL << __REQ_NOWAIT)
+#define REQ_CGROUP_PUNT		(1ULL << __REQ_CGROUP_PUNT)
+
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
 #define REQ_HIPRI		(1ULL << __REQ_HIPRI)
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index e056a22075cf..8945aac31392 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -78,6 +78,8 @@ struct writeback_control {
 	 */
 	unsigned no_cgroup_owner:1;
 
+	unsigned punt_to_cgroup:1;	/* cgrp punting, see __REQ_CGROUP_PUNT */
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback *wb;	/* wb this writeback is issued under */
 	struct inode *inode;		/* inode being written out */
@@ -94,12 +96,17 @@ struct writeback_control {
 
 static inline int wbc_to_write_flags(struct writeback_control *wbc)
 {
+	int flags = 0;
+
+	if (wbc->punt_to_cgroup)
+		flags = REQ_CGROUP_PUNT;
+
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		return REQ_SYNC;
+		flags |= REQ_SYNC;
 	else if (wbc->for_kupdate || wbc->for_background)
-		return REQ_BACKGROUND;
+		flags |= REQ_BACKGROUND;
 
-	return 0;
+	return flags;
 }
 
 static inline struct cgroup_subsys_state *
-- 
cgit v1.2.3


From 113ab72ed4794c193509a97d7c6d32a6886e1682 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 10 Jul 2019 13:53:10 +0900
Subject: block: Fix potential overflow in blk_report_zones()

For large values of the number of zones reported and/or large zone
sizes, the sector increment calculated with

blk_queue_zone_sectors(q) * n

in blk_report_zones() loop can overflow the unsigned int type used for
the calculation as both "n" and blk_queue_zone_sectors() value are
unsigned int. E.g. for a device with 256 MB zones (524288 sectors),
overflow happens with 8192 or more zones reported.

Changing the return type of blk_queue_zone_sectors() to sector_t, fixes
this problem and avoids overflow problem for all other callers of this
helper too. The same change is also applied to the bdev_zone_sectors()
helper.

Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 2 +-
 include/linux/blkdev.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ae7e91bd0618..3249738242b4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
 static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
 					     sector_t nr_sectors)
 {
-	unsigned long zone_sectors = blk_queue_zone_sectors(q);
+	sector_t zone_sectors = blk_queue_zone_sectors(q);
 
 	return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0c482371c8b3..259bd7ad8312 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -681,7 +681,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
 	}
 }
 
-static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
+static inline sector_t blk_queue_zone_sectors(struct request_queue *q)
 {
 	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
@@ -1418,7 +1418,7 @@ static inline bool bdev_is_zoned(struct block_device *bdev)
 	return false;
 }
 
-static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
+static inline sector_t bdev_zone_sectors(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 
-- 
cgit v1.2.3


From 36847a005489cfb74dc6388952da73346f867dca Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 11 Jul 2019 00:56:08 +0900
Subject: block: Remove unused definitions

The ELV_MQUEUE_XXX definitions in include/linux/elevator.h are unused
since the removal of elevator_may_queue_fn in kernel 5.0. Remove these
definitions and also remove the documentation of elevator_may_queue_fn
in Documentiation/block/biodoc.txt.

Acked-by: Marcos Paulo de Souza <marcos.souza.org@gmail.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/block/biodoc.txt | 5 -----
 include/linux/elevator.h       | 9 ---------
 2 files changed, 14 deletions(-)

(limited to 'include')

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 31c177663ed5..5a4a799fe61b 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -843,11 +843,6 @@ elevator_latter_req_fn		These return the request before or after the
 
 elevator_completed_req_fn	called when a request is completed.
 
-elevator_may_queue_fn		returns true if the scheduler wants to allow the
-				current context to queue a new request even if
-				it is over the queue limit. This must be used
-				very carefully!!
-
 elevator_set_req_fn
 elevator_put_req_fn		Must be used to allocate and free any elevator
 				specific storage for a request.
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 169bb2e02516..38590c30a11d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -160,15 +160,6 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 #define ELEVATOR_INSERT_FLUSH	5
 #define ELEVATOR_INSERT_SORT_MERGE	6
 
-/*
- * return values from elevator_may_queue_fn
- */
-enum {
-	ELV_MQUEUE_MAY,
-	ELV_MQUEUE_NO,
-	ELV_MQUEUE_MUST,
-};
-
 #define rq_end_sector(rq)	(blk_rq_pos(rq) + blk_rq_sectors(rq))
 #define rb_entry_rq(node)	rb_entry((node), struct request, rb_node)
 
-- 
cgit v1.2.3


From 9305d5d721f2bd5e2eeb670035159b560ca211ca Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 11 Jul 2019 00:57:41 +0900
Subject: block: Fix elevator name declaration

The elevator_name field in struct elevator_type is declared as an array
of characters (ELV_NAME_MAX size) but in practice used as a string
pointer with its initialization done statically within each
elevator elevator_type structure declaration.

Change the declaration of elevator_name to the more appropriate
"const char *" type.

Acked-by: Marcos Paulo de Souza <marcos.souza.org@gmail.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/elevator.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 38590c30a11d..17cd0078377c 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -75,7 +75,7 @@ struct elevator_type
 	size_t icq_size;	/* see iocontext.h */
 	size_t icq_align;	/* ditto */
 	struct elv_fs_entry *elevator_attrs;
-	char elevator_name[ELV_NAME_MAX];
+	const char *elevator_name;
 	const char *elevator_alias;
 	struct module *elevator_owner;
 #ifdef CONFIG_BLK_DEBUG_FS
-- 
cgit v1.2.3


From 9c1f2a5dc2948b9f1170d4202c84745f0b0ff0c9 Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Tue, 4 Jun 2019 12:01:46 -0500
Subject: mailbox: omap: Add support for TI K3 SoCs

The TI K3 AM65x and J721E family of SoCs have a new Mailbox IP that
is based on the existing Mailbox IP present in OMAP architecture based
SoCs. Each instance of the legacy OMAP Mailbox IP is now a single cluster
within the newer Mailbox IP instance on K3 architecture based SoCs. A
single K3 Mailbox IP instance has multiple clusters with each cluster
providing the same functionality as the existing OMAP Mailbox IP.

Reuse the existing OMAP Mailbox driver to extend the support for this
newer IP present within the Main NavSS block on K3 SoCs. The K3 family
of SoCs use 64-bit ARMv8 processors for running Linux, so the driver is
also enhanced to deal with the differences between the 32-bit message
payloads and the 64-bit pointers used by the client drivers.

Signed-off-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/Kconfig        |  2 +-
 drivers/mailbox/omap-mailbox.c | 43 +++++++++++++++++++++++++-----------------
 include/linux/omap-mailbox.h   |  4 +++-
 3 files changed, 30 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index b709481a8de6..ab4eb750bbdd 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -54,7 +54,7 @@ config ARMADA_37XX_RWTM_MBOX
 
 config OMAP2PLUS_MBOX
 	tristate "OMAP2+ Mailbox framework support"
-	depends on ARCH_OMAP2PLUS
+	depends on ARCH_OMAP2PLUS || ARCH_K3
 	help
 	  Mailbox implementation for OMAP family chips with hardware for
 	  interprocessor communication involving DSP, IVA1.0 and IVA2 in
diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index ca50177a33f2..a3cd63583cf7 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -3,7 +3,7 @@
  * OMAP mailbox driver
  *
  * Copyright (C) 2006-2009 Nokia Corporation. All rights reserved.
- * Copyright (C) 2013-2016 Texas Instruments Incorporated - http://www.ti.com
+ * Copyright (C) 2013-2019 Texas Instruments Incorporated - http://www.ti.com
  *
  * Contact: Hiroshi DOYU <Hiroshi.DOYU@nokia.com>
  *          Suman Anna <s-anna@ti.com>
@@ -141,14 +141,14 @@ void mbox_write_reg(struct omap_mbox_device *mdev, u32 val, size_t ofs)
 }
 
 /* Mailbox FIFO handle functions */
-static mbox_msg_t mbox_fifo_read(struct omap_mbox *mbox)
+static u32 mbox_fifo_read(struct omap_mbox *mbox)
 {
 	struct omap_mbox_fifo *fifo = &mbox->rx_fifo;
 
-	return (mbox_msg_t)mbox_read_reg(mbox->parent, fifo->msg);
+	return mbox_read_reg(mbox->parent, fifo->msg);
 }
 
-static void mbox_fifo_write(struct omap_mbox *mbox, mbox_msg_t msg)
+static void mbox_fifo_write(struct omap_mbox *mbox, u32 msg)
 {
 	struct omap_mbox_fifo *fifo = &mbox->tx_fifo;
 
@@ -256,14 +256,16 @@ static void mbox_rx_work(struct work_struct *work)
 {
 	struct omap_mbox_queue *mq =
 			container_of(work, struct omap_mbox_queue, work);
-	mbox_msg_t msg;
+	mbox_msg_t data;
+	u32 msg;
 	int len;
 
 	while (kfifo_len(&mq->fifo) >= sizeof(msg)) {
 		len = kfifo_out(&mq->fifo, (unsigned char *)&msg, sizeof(msg));
 		WARN_ON(len != sizeof(msg));
+		data = msg;
 
-		mbox_chan_received_data(mq->mbox->chan, (void *)msg);
+		mbox_chan_received_data(mq->mbox->chan, (void *)data);
 		spin_lock_irq(&mq->lock);
 		if (mq->full) {
 			mq->full = false;
@@ -286,7 +288,7 @@ static void __mbox_tx_interrupt(struct omap_mbox *mbox)
 static void __mbox_rx_interrupt(struct omap_mbox *mbox)
 {
 	struct omap_mbox_queue *mq = mbox->rxq;
-	mbox_msg_t msg;
+	u32 msg;
 	int len;
 
 	while (!mbox_fifo_empty(mbox)) {
@@ -540,13 +542,13 @@ static void omap_mbox_chan_shutdown(struct mbox_chan *chan)
 	mutex_unlock(&mdev->cfg_lock);
 }
 
-static int omap_mbox_chan_send_noirq(struct omap_mbox *mbox, void *data)
+static int omap_mbox_chan_send_noirq(struct omap_mbox *mbox, u32 msg)
 {
 	int ret = -EBUSY;
 
 	if (!mbox_fifo_full(mbox)) {
 		_omap_mbox_enable_irq(mbox, IRQ_RX);
-		mbox_fifo_write(mbox, (mbox_msg_t)data);
+		mbox_fifo_write(mbox, msg);
 		ret = 0;
 		_omap_mbox_disable_irq(mbox, IRQ_RX);
 
@@ -558,12 +560,12 @@ static int omap_mbox_chan_send_noirq(struct omap_mbox *mbox, void *data)
 	return ret;
 }
 
-static int omap_mbox_chan_send(struct omap_mbox *mbox, void *data)
+static int omap_mbox_chan_send(struct omap_mbox *mbox, u32 msg)
 {
 	int ret = -EBUSY;
 
 	if (!mbox_fifo_full(mbox)) {
-		mbox_fifo_write(mbox, (mbox_msg_t)data);
+		mbox_fifo_write(mbox, msg);
 		ret = 0;
 	}
 
@@ -576,14 +578,15 @@ static int omap_mbox_chan_send_data(struct mbox_chan *chan, void *data)
 {
 	struct omap_mbox *mbox = mbox_chan_to_omap_mbox(chan);
 	int ret;
+	u32 msg = omap_mbox_message(data);
 
 	if (!mbox)
 		return -EINVAL;
 
 	if (mbox->send_no_irq)
-		ret = omap_mbox_chan_send_noirq(mbox, data);
+		ret = omap_mbox_chan_send_noirq(mbox, msg);
 	else
-		ret = omap_mbox_chan_send(mbox, data);
+		ret = omap_mbox_chan_send(mbox, msg);
 
 	return ret;
 }
@@ -656,6 +659,10 @@ static const struct of_device_id omap_mailbox_of_match[] = {
 		.compatible	= "ti,omap4-mailbox",
 		.data		= &omap4_data,
 	},
+	{
+		.compatible	= "ti,am654-mailbox",
+		.data		= &omap4_data,
+	},
 	{
 		/* end */
 	},
@@ -830,7 +837,10 @@ static int omap_mbox_probe(struct platform_device *pdev)
 	mdev->intr_type = intr_type;
 	mdev->mboxes = list;
 
-	/* OMAP does not have a Tx-Done IRQ, but rather a Tx-Ready IRQ */
+	/*
+	 * OMAP/K3 Mailbox IP does not have a Tx-Done IRQ, but rather a Tx-Ready
+	 * IRQ and is needed to run the Tx state machine
+	 */
 	mdev->controller.txdone_irq = true;
 	mdev->controller.dev = mdev->dev;
 	mdev->controller.ops = &omap_mbox_chan_ops;
@@ -899,9 +909,8 @@ static int __init omap_mbox_init(void)
 		return err;
 
 	/* kfifo size sanity check: alignment and minimal size */
-	mbox_kfifo_size = ALIGN(mbox_kfifo_size, sizeof(mbox_msg_t));
-	mbox_kfifo_size = max_t(unsigned int, mbox_kfifo_size,
-							sizeof(mbox_msg_t));
+	mbox_kfifo_size = ALIGN(mbox_kfifo_size, sizeof(u32));
+	mbox_kfifo_size = max_t(unsigned int, mbox_kfifo_size, sizeof(u32));
 
 	err = platform_driver_register(&omap_mbox_driver);
 	if (err)
diff --git a/include/linux/omap-mailbox.h b/include/linux/omap-mailbox.h
index 6dbcd2da0332..8aa984ec1f38 100644
--- a/include/linux/omap-mailbox.h
+++ b/include/linux/omap-mailbox.h
@@ -6,7 +6,9 @@
 #ifndef OMAP_MAILBOX_H
 #define OMAP_MAILBOX_H
 
-typedef u32 mbox_msg_t;
+typedef uintptr_t mbox_msg_t;
+
+#define omap_mbox_message(data) (u32)(mbox_msg_t)(data)
 
 typedef int __bitwise omap_mbox_irq_t;
 #define IRQ_TX ((__force omap_mbox_irq_t) 1)
-- 
cgit v1.2.3


From ff956826a403f5cf189978d5ff6b3eb53aa11610 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:24 +0800
Subject: intel_rapl: introduce intel_rapl.h

Create a new header file for the common definitions that might be used
by different RAPL Interface.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 MAINTAINERS                   |   1 +
 drivers/powercap/intel_rapl.c | 101 +------------------------------------
 include/linux/intel_rapl.h    | 113 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+), 99 deletions(-)
 create mode 100644 include/linux/intel_rapl.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 28a36f1efe02..9ded49d371da 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12679,6 +12679,7 @@ F:	drivers/base/power/
 F:	include/linux/pm.h
 F:	include/linux/pm_*
 F:	include/linux/powercap.h
+F:	include/linux/intel_rapl.h
 F:	drivers/powercap/
 F:	kernel/configs/nopm.config
 
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 9be9f20ff056..adb35ec9f939 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -18,8 +18,9 @@
 #include <linux/cpu.h>
 #include <linux/powercap.h>
 #include <linux/suspend.h>
-#include <asm/iosf_mbi.h>
+#include <linux/intel_rapl.h>
 
+#include <asm/iosf_mbi.h>
 #include <asm/processor.h>
 #include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
@@ -74,59 +75,9 @@ enum unit_type {
 	TIME_UNIT,
 };
 
-enum rapl_domain_type {
-	RAPL_DOMAIN_PACKAGE, /* entire package/socket */
-	RAPL_DOMAIN_PP0, /* core power plane */
-	RAPL_DOMAIN_PP1, /* graphics uncore */
-	RAPL_DOMAIN_DRAM,/* DRAM control_type */
-	RAPL_DOMAIN_PLATFORM, /* PSys control_type */
-	RAPL_DOMAIN_MAX,
-};
-
-enum rapl_domain_reg_id {
-	RAPL_DOMAIN_REG_LIMIT,
-	RAPL_DOMAIN_REG_STATUS,
-	RAPL_DOMAIN_REG_PERF,
-	RAPL_DOMAIN_REG_POLICY,
-	RAPL_DOMAIN_REG_INFO,
-	RAPL_DOMAIN_REG_MAX,
-};
-
 /* per domain data, some are optional */
-enum rapl_primitives {
-	ENERGY_COUNTER,
-	POWER_LIMIT1,
-	POWER_LIMIT2,
-	FW_LOCK,
-
-	PL1_ENABLE,  /* power limit 1, aka long term */
-	PL1_CLAMP,   /* allow frequency to go below OS request */
-	PL2_ENABLE,  /* power limit 2, aka short term, instantaneous */
-	PL2_CLAMP,
-
-	TIME_WINDOW1, /* long term */
-	TIME_WINDOW2, /* short term */
-	THERMAL_SPEC_POWER,
-	MAX_POWER,
-
-	MIN_POWER,
-	MAX_TIME_WINDOW,
-	THROTTLED_TIME,
-	PRIORITY_LEVEL,
-
-	/* below are not raw primitive data */
-	AVERAGE_POWER,
-	NR_RAPL_PRIMITIVES,
-};
-
 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
 
-/* Can be expanded to include events, etc.*/
-struct rapl_domain_data {
-	u64 primitives[NR_RAPL_PRIMITIVES];
-	unsigned long timestamp;
-};
-
 struct msrl_action {
 	u32 msr_no;
 	u64 clear_mask;
@@ -138,60 +89,12 @@ struct msrl_action {
 #define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
 #define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
 
-#define NR_POWER_LIMITS (2)
-struct rapl_power_limit {
-	struct powercap_zone_constraint *constraint;
-	int prim_id; /* primitive ID used to enable */
-	struct rapl_domain *domain;
-	const char *name;
-	u64 last_power_limit;
-};
-
 static const char pl1_name[] = "long_term";
 static const char pl2_name[] = "short_term";
 
-struct rapl_package;
-struct rapl_domain {
-	const char *name;
-	enum rapl_domain_type id;
-	int regs[RAPL_DOMAIN_REG_MAX];
-	struct powercap_zone power_zone;
-	struct rapl_domain_data rdd;
-	struct rapl_power_limit rpl[NR_POWER_LIMITS];
-	u64 attr_map; /* track capabilities */
-	unsigned int state;
-	unsigned int domain_energy_unit;
-	struct rapl_package *rp;
-};
 #define power_zone_to_rapl_domain(_zone) \
 	container_of(_zone, struct rapl_domain, power_zone)
 
-/* maximum rapl package domain name: package-%d-die-%d */
-#define PACKAGE_DOMAIN_NAME_LENGTH 30
-
-
-/* Each rapl package contains multiple domains, these are the common
- * data across RAPL domains within a package.
- */
-struct rapl_package {
-	unsigned int id; /* logical die id, equals physical 1-die systems */
-	unsigned int nr_domains;
-	unsigned long domain_map; /* bit map of active domains */
-	unsigned int power_unit;
-	unsigned int energy_unit;
-	unsigned int time_unit;
-	struct rapl_domain *domains; /* array of domains, sized at runtime */
-	struct powercap_zone *power_zone; /* keep track of parent zone */
-	unsigned long power_limit_irq; /* keep track of package power limit
-					* notify interrupt enable status.
-					*/
-	struct list_head plist;
-	int lead_cpu; /* one active cpu per package for access */
-	/* Track active cpus */
-	struct cpumask cpumask;
-	char name[PACKAGE_DOMAIN_NAME_LENGTH];
-};
-
 struct rapl_defaults {
 	u8 floor_freq_reg_addr;
 	int (*check_unit)(struct rapl_package *rp, int cpu);
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
new file mode 100644
index 000000000000..94716036d829
--- /dev/null
+++ b/include/linux/intel_rapl.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Data types and headers for RAPL support
+ *
+ *  Copyright (C) 2019  Intel Corporation.
+ *
+ *  Author: Zhang Rui <rui.zhang@intel.com>
+ */
+
+#ifndef __INTEL_RAPL_H__
+#define __INTEL_RAPL_H__
+
+#include <linux/types.h>
+#include <linux/powercap.h>
+
+enum rapl_domain_type {
+	RAPL_DOMAIN_PACKAGE,	/* entire package/socket */
+	RAPL_DOMAIN_PP0,	/* core power plane */
+	RAPL_DOMAIN_PP1,	/* graphics uncore */
+	RAPL_DOMAIN_DRAM,	/* DRAM control_type */
+	RAPL_DOMAIN_PLATFORM,	/* PSys control_type */
+	RAPL_DOMAIN_MAX,
+};
+
+enum rapl_domain_reg_id {
+	RAPL_DOMAIN_REG_LIMIT,
+	RAPL_DOMAIN_REG_STATUS,
+	RAPL_DOMAIN_REG_PERF,
+	RAPL_DOMAIN_REG_POLICY,
+	RAPL_DOMAIN_REG_INFO,
+	RAPL_DOMAIN_REG_MAX,
+};
+
+struct rapl_package;
+
+enum rapl_primitives {
+	ENERGY_COUNTER,
+	POWER_LIMIT1,
+	POWER_LIMIT2,
+	FW_LOCK,
+
+	PL1_ENABLE,		/* power limit 1, aka long term */
+	PL1_CLAMP,		/* allow frequency to go below OS request */
+	PL2_ENABLE,		/* power limit 2, aka short term, instantaneous */
+	PL2_CLAMP,
+
+	TIME_WINDOW1,		/* long term */
+	TIME_WINDOW2,		/* short term */
+	THERMAL_SPEC_POWER,
+	MAX_POWER,
+
+	MIN_POWER,
+	MAX_TIME_WINDOW,
+	THROTTLED_TIME,
+	PRIORITY_LEVEL,
+
+	/* below are not raw primitive data */
+	AVERAGE_POWER,
+	NR_RAPL_PRIMITIVES,
+};
+
+struct rapl_domain_data {
+	u64 primitives[NR_RAPL_PRIMITIVES];
+	unsigned long timestamp;
+};
+
+#define NR_POWER_LIMITS (2)
+struct rapl_power_limit {
+	struct powercap_zone_constraint *constraint;
+	int prim_id;		/* primitive ID used to enable */
+	struct rapl_domain *domain;
+	const char *name;
+	u64 last_power_limit;
+};
+
+struct rapl_package;
+
+struct rapl_domain {
+	const char *name;
+	enum rapl_domain_type id;
+	int regs[RAPL_DOMAIN_REG_MAX];
+	struct powercap_zone power_zone;
+	struct rapl_domain_data rdd;
+	struct rapl_power_limit rpl[NR_POWER_LIMITS];
+	u64 attr_map;		/* track capabilities */
+	unsigned int state;
+	unsigned int domain_energy_unit;
+	struct rapl_package *rp;
+};
+
+/* maximum rapl package domain name: package-%d-die-%d */
+#define PACKAGE_DOMAIN_NAME_LENGTH 30
+
+struct rapl_package {
+	unsigned int id;	/* logical die id, equals physical 1-die systems */
+	unsigned int nr_domains;
+	unsigned long domain_map;	/* bit map of active domains */
+	unsigned int power_unit;
+	unsigned int energy_unit;
+	unsigned int time_unit;
+	struct rapl_domain *domains;	/* array of domains, sized at runtime */
+	struct powercap_zone *power_zone;	/* keep track of parent zone */
+	unsigned long power_limit_irq;	/* keep track of package power limit
+					 * notify interrupt enable status.
+					 */
+	struct list_head plist;
+	int lead_cpu;		/* one active cpu per package for access */
+	/* Track active cpus */
+	struct cpumask cpumask;
+	char name[PACKAGE_DOMAIN_NAME_LENGTH];
+};
+
+#endif /* __INTEL_RAPL_H__ */
-- 
cgit v1.2.3


From 7ebf8eff63b4f349e7b2ded6aa5036d94bdf94b9 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:25 +0800
Subject: intel_rapl: introduce struct rapl_if_private

Introduce a new structure, rapl_if_private, to save the private data
for different RAPL Interface.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 59 +++++++++++++++++++++----------------------
 include/linux/intel_rapl.h    | 15 +++++++++++
 2 files changed, 44 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index adb35ec9f939..e05d92d67525 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -75,6 +75,9 @@ enum unit_type {
 	TIME_UNIT,
 };
 
+/* private data for RAPL MSR Interface */
+static struct rapl_if_priv rapl_msr_priv;
+
 /* per domain data, some are optional */
 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
 
@@ -155,17 +158,14 @@ static const char * const rapl_domain_names[] = {
 	"psys",
 };
 
-static struct powercap_control_type *control_type; /* PowerCap Controller */
-static struct rapl_domain *platform_rapl_domain; /* Platform (PSys) domain */
-
 /* caller to ensure CPU hotplug lock is held */
-static struct rapl_package *rapl_find_package_domain(int cpu)
+static struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
 {
 	int id = topology_logical_die_id(cpu);
 	struct rapl_package *rp;
 
 	list_for_each_entry(rp, &rapl_packages, plist) {
-		if (rp->id == id)
+		if (rp->id == id && rp->priv->control_type == priv->control_type)
 			return rp;
 	}
 
@@ -1090,12 +1090,12 @@ static void rapl_update_domain_data(struct rapl_package *rp)
 
 static void rapl_unregister_powercap(void)
 {
-	if (platform_rapl_domain) {
-		powercap_unregister_zone(control_type,
-					 &platform_rapl_domain->power_zone);
-		kfree(platform_rapl_domain);
+	if (&rapl_msr_priv.platform_rapl_domain) {
+		powercap_unregister_zone(rapl_msr_priv.control_type,
+					 &rapl_msr_priv.platform_rapl_domain->power_zone);
+		kfree(rapl_msr_priv.platform_rapl_domain);
 	}
-	powercap_unregister_control_type(control_type);
+	powercap_unregister_control_type(rapl_msr_priv.control_type);
 }
 
 static int rapl_package_register_powercap(struct rapl_package *rp)
@@ -1113,7 +1113,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
 			nr_pl = find_nr_power_limit(rd);
 			pr_debug("register package domain %s\n", rp->name);
 			power_zone = powercap_register_zone(&rd->power_zone,
-							control_type,
+							rp->priv->control_type,
 							rp->name, NULL,
 							&zone_ops[rd->id],
 							nr_pl,
@@ -1140,7 +1140,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
 		/* number of power limits per domain varies */
 		nr_pl = find_nr_power_limit(rd);
 		power_zone = powercap_register_zone(&rd->power_zone,
-						control_type, rd->name,
+						rp->priv->control_type, rd->name,
 						rp->power_zone,
 						&zone_ops[rd->id], nr_pl,
 						&constraint_ops);
@@ -1161,7 +1161,7 @@ err_cleanup:
 	 */
 	while (--rd >= rp->domains) {
 		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
-		powercap_unregister_zone(control_type, &rd->power_zone);
+		powercap_unregister_zone(rp->priv->control_type, &rd->power_zone);
 	}
 
 	return ret;
@@ -1191,9 +1191,9 @@ static int __init rapl_register_psys(void)
 	rd->rpl[0].name = pl1_name;
 	rd->rpl[1].prim_id = PL2_ENABLE;
 	rd->rpl[1].name = pl2_name;
-	rd->rp = rapl_find_package_domain(0);
+	rd->rp = rapl_find_package_domain(0, &rapl_msr_priv);
 
-	power_zone = powercap_register_zone(&rd->power_zone, control_type,
+	power_zone = powercap_register_zone(&rd->power_zone, rapl_msr_priv.control_type,
 					    "psys", NULL,
 					    &zone_ops[RAPL_DOMAIN_PLATFORM],
 					    2, &constraint_ops);
@@ -1203,17 +1203,17 @@ static int __init rapl_register_psys(void)
 		return PTR_ERR(power_zone);
 	}
 
-	platform_rapl_domain = rd;
+	rapl_msr_priv.platform_rapl_domain = rd;
 
 	return 0;
 }
 
 static int __init rapl_register_powercap(void)
 {
-	control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
-	if (IS_ERR(control_type)) {
+	rapl_msr_priv.control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
+	if (IS_ERR(rapl_msr_priv.control_type)) {
 		pr_debug("failed to register powercap control_type.\n");
-		return PTR_ERR(control_type);
+		return PTR_ERR(rapl_msr_priv.control_type);
 	}
 	return 0;
 }
@@ -1338,16 +1338,16 @@ static void rapl_remove_package(struct rapl_package *rp)
 		}
 		pr_debug("remove package, undo power limit on %s: %s\n",
 			 rp->name, rd->name);
-		powercap_unregister_zone(control_type, &rd->power_zone);
+		powercap_unregister_zone(rp->priv->control_type, &rd->power_zone);
 	}
 	/* do parent zone last */
-	powercap_unregister_zone(control_type, &rd_package->power_zone);
+	powercap_unregister_zone(rp->priv->control_type, &rd_package->power_zone);
 	list_del(&rp->plist);
 	kfree(rp);
 }
 
 /* called from CPU hotplug notifier, hotplug lock held */
-static struct rapl_package *rapl_add_package(int cpu)
+static struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
 {
 	int id = topology_logical_die_id(cpu);
 	struct rapl_package *rp;
@@ -1361,6 +1361,7 @@ static struct rapl_package *rapl_add_package(int cpu)
 	/* add the new package to the list */
 	rp->id = id;
 	rp->lead_cpu = cpu;
+	rp->priv = priv;
 
 	if (topology_max_die_per_package() > 1)
 		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
@@ -1399,9 +1400,9 @@ static int rapl_cpu_online(unsigned int cpu)
 {
 	struct rapl_package *rp;
 
-	rp = rapl_find_package_domain(cpu);
+	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
 	if (!rp) {
-		rp = rapl_add_package(cpu);
+		rp = rapl_add_package(cpu, &rapl_msr_priv);
 		if (IS_ERR(rp))
 			return PTR_ERR(rp);
 	}
@@ -1414,7 +1415,7 @@ static int rapl_cpu_down_prep(unsigned int cpu)
 	struct rapl_package *rp;
 	int lead_cpu;
 
-	rp = rapl_find_package_domain(cpu);
+	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
 	if (!rp)
 		return 0;
 
@@ -1427,8 +1428,6 @@ static int rapl_cpu_down_prep(unsigned int cpu)
 	return 0;
 }
 
-static enum cpuhp_state pcap_rapl_online;
-
 static void power_limit_state_save(void)
 {
 	struct rapl_package *rp;
@@ -1538,7 +1537,7 @@ static int __init rapl_init(void)
 				rapl_cpu_online, rapl_cpu_down_prep);
 	if (ret < 0)
 		goto err_unreg;
-	pcap_rapl_online = ret;
+	rapl_msr_priv.pcap_rapl_online = ret;
 
 	/* Don't bail out if PSys is not supported */
 	rapl_register_psys();
@@ -1550,7 +1549,7 @@ static int __init rapl_init(void)
 	return 0;
 
 err_unreg_all:
-	cpuhp_remove_state(pcap_rapl_online);
+	cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
 
 err_unreg:
 	rapl_unregister_powercap();
@@ -1560,7 +1559,7 @@ err_unreg:
 static void __exit rapl_exit(void)
 {
 	unregister_pm_notifier(&rapl_pm_notifier);
-	cpuhp_remove_state(pcap_rapl_online);
+	cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
 	rapl_unregister_powercap();
 }
 
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 94716036d829..7bf1683e4a63 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -88,6 +88,20 @@ struct rapl_domain {
 	struct rapl_package *rp;
 };
 
+/**
+ * struct rapl_if_priv: private data for different RAPL interfaces
+ * @control_type:		Each RAPL interface must have its own powercap
+ *				control type.
+ * @platform_rapl_domain:	Optional. Some RAPL interface may have platform
+ *				level RAPL control.
+ * @pcap_rapl_online:		CPU hotplug state for each RAPL interface.
+ */
+struct rapl_if_priv {
+	struct powercap_control_type *control_type;
+	struct rapl_domain *platform_rapl_domain;
+	enum cpuhp_state pcap_rapl_online;
+};
+
 /* maximum rapl package domain name: package-%d-die-%d */
 #define PACKAGE_DOMAIN_NAME_LENGTH 30
 
@@ -108,6 +122,7 @@ struct rapl_package {
 	/* Track active cpus */
 	struct cpumask cpumask;
 	char name[PACKAGE_DOMAIN_NAME_LENGTH];
+	struct rapl_if_priv *priv;
 };
 
 #endif /* __INTEL_RAPL_H__ */
-- 
cgit v1.2.3


From 7fde2712a7adab721eaabafbd8ff93dff3262d35 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:26 +0800
Subject: intel_rapl: abstract register address

MSR and MMIO RAPL interface have different sets of registers, thus the
RAPL register address should be obtained from interface specific
structure, i.e. struct rapl_if_private, instead.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 73 +++++++++++++++++++------------------------
 include/linux/intel_rapl.h    |  4 +++
 2 files changed, 37 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index e05d92d67525..9f22aed49f24 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -76,7 +76,19 @@ enum unit_type {
 };
 
 /* private data for RAPL MSR Interface */
-static struct rapl_if_priv rapl_msr_priv;
+static struct rapl_if_priv rapl_msr_priv = {
+	.reg_unit = MSR_RAPL_POWER_UNIT,
+	.regs[RAPL_DOMAIN_PACKAGE] = {
+		MSR_PKG_POWER_LIMIT, MSR_PKG_ENERGY_STATUS, MSR_PKG_PERF_STATUS, 0, MSR_PKG_POWER_INFO },
+	.regs[RAPL_DOMAIN_PP0] = {
+		MSR_PP0_POWER_LIMIT, MSR_PP0_ENERGY_STATUS, 0, MSR_PP0_POLICY, 0 },
+	.regs[RAPL_DOMAIN_PP1] = {
+		MSR_PP1_POWER_LIMIT, MSR_PP1_ENERGY_STATUS, 0, MSR_PP1_POLICY, 0 },
+	.regs[RAPL_DOMAIN_DRAM] = {
+		MSR_DRAM_POWER_LIMIT, MSR_DRAM_ENERGY_STATUS, MSR_DRAM_PERF_STATUS, 0, MSR_DRAM_POWER_INFO },
+	.regs[RAPL_DOMAIN_PLATFORM] = {
+		MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0},
+};
 
 /* per domain data, some are optional */
 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
@@ -541,15 +553,17 @@ static void rapl_init_domains(struct rapl_package *rp)
 
 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
 		unsigned int mask = rp->domain_map & (1 << i);
+
+		rd->regs[RAPL_DOMAIN_REG_LIMIT] = rp->priv->regs[i][RAPL_DOMAIN_REG_LIMIT];
+		rd->regs[RAPL_DOMAIN_REG_STATUS] = rp->priv->regs[i][RAPL_DOMAIN_REG_STATUS];
+		rd->regs[RAPL_DOMAIN_REG_PERF] = rp->priv->regs[i][RAPL_DOMAIN_REG_PERF];
+		rd->regs[RAPL_DOMAIN_REG_POLICY] = rp->priv->regs[i][RAPL_DOMAIN_REG_POLICY];
+		rd->regs[RAPL_DOMAIN_REG_INFO] = rp->priv->regs[i][RAPL_DOMAIN_REG_INFO];
+
 		switch (mask) {
 		case BIT(RAPL_DOMAIN_PACKAGE):
 			rd->name = rapl_domain_names[RAPL_DOMAIN_PACKAGE];
 			rd->id = RAPL_DOMAIN_PACKAGE;
-			rd->regs[RAPL_DOMAIN_REG_LIMIT] = MSR_PKG_POWER_LIMIT;
-			rd->regs[RAPL_DOMAIN_REG_STATUS] = MSR_PKG_ENERGY_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_PERF] = MSR_PKG_PERF_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_POLICY] = 0;
-			rd->regs[RAPL_DOMAIN_REG_INFO] = MSR_PKG_POWER_INFO;
 			rd->rpl[0].prim_id = PL1_ENABLE;
 			rd->rpl[0].name = pl1_name;
 			rd->rpl[1].prim_id = PL2_ENABLE;
@@ -558,33 +572,18 @@ static void rapl_init_domains(struct rapl_package *rp)
 		case BIT(RAPL_DOMAIN_PP0):
 			rd->name = rapl_domain_names[RAPL_DOMAIN_PP0];
 			rd->id = RAPL_DOMAIN_PP0;
-			rd->regs[RAPL_DOMAIN_REG_LIMIT] = MSR_PP0_POWER_LIMIT;
-			rd->regs[RAPL_DOMAIN_REG_STATUS] = MSR_PP0_ENERGY_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_PERF] = 0;
-			rd->regs[RAPL_DOMAIN_REG_POLICY] = MSR_PP0_POLICY;
-			rd->regs[RAPL_DOMAIN_REG_INFO] = 0;
 			rd->rpl[0].prim_id = PL1_ENABLE;
 			rd->rpl[0].name = pl1_name;
 			break;
 		case BIT(RAPL_DOMAIN_PP1):
 			rd->name = rapl_domain_names[RAPL_DOMAIN_PP1];
 			rd->id = RAPL_DOMAIN_PP1;
-			rd->regs[RAPL_DOMAIN_REG_LIMIT] = MSR_PP1_POWER_LIMIT;
-			rd->regs[RAPL_DOMAIN_REG_STATUS] = MSR_PP1_ENERGY_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_PERF] = 0;
-			rd->regs[RAPL_DOMAIN_REG_POLICY] = MSR_PP1_POLICY;
-			rd->regs[RAPL_DOMAIN_REG_INFO] = 0;
 			rd->rpl[0].prim_id = PL1_ENABLE;
 			rd->rpl[0].name = pl1_name;
 			break;
 		case BIT(RAPL_DOMAIN_DRAM):
 			rd->name = rapl_domain_names[RAPL_DOMAIN_DRAM];
 			rd->id = RAPL_DOMAIN_DRAM;
-			rd->regs[RAPL_DOMAIN_REG_LIMIT] = MSR_DRAM_POWER_LIMIT;
-			rd->regs[RAPL_DOMAIN_REG_STATUS] = MSR_DRAM_ENERGY_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_PERF] = MSR_DRAM_PERF_STATUS;
-			rd->regs[RAPL_DOMAIN_REG_POLICY] = 0;
-			rd->regs[RAPL_DOMAIN_REG_INFO] = MSR_DRAM_POWER_INFO;
 			rd->rpl[0].prim_id = PL1_ENABLE;
 			rd->rpl[0].name = pl1_name;
 			rd->domain_energy_unit =
@@ -806,9 +805,9 @@ static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
 	u64 msr_val;
 	u32 value;
 
-	if (rdmsrl_safe_on_cpu(cpu, MSR_RAPL_POWER_UNIT, &msr_val)) {
+	if (rdmsrl_safe_on_cpu(cpu, rp->priv->reg_unit, &msr_val)) {
 		pr_err("Failed to read power unit MSR 0x%x on CPU %d, exit.\n",
-			MSR_RAPL_POWER_UNIT, cpu);
+			rp->priv->reg_unit, cpu);
 		return -ENODEV;
 	}
 
@@ -832,9 +831,9 @@ static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
 	u64 msr_val;
 	u32 value;
 
-	if (rdmsrl_safe_on_cpu(cpu, MSR_RAPL_POWER_UNIT, &msr_val)) {
+	if (rdmsrl_safe_on_cpu(cpu, rp->priv->reg_unit, &msr_val)) {
 		pr_err("Failed to read power unit MSR 0x%x on CPU %d, exit.\n",
-			MSR_RAPL_POWER_UNIT, cpu);
+			rp->priv->reg_unit, cpu);
 		return -ENODEV;
 	}
 	value = (msr_val & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
@@ -1173,10 +1172,10 @@ static int __init rapl_register_psys(void)
 	struct powercap_zone *power_zone;
 	u64 val;
 
-	if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_ENERGY_STATUS, &val) || !val)
+	if (rdmsrl_safe_on_cpu(0, rapl_msr_priv.regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS], &val) || !val)
 		return -ENODEV;
 
-	if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_POWER_LIMIT, &val) || !val)
+	if (rdmsrl_safe_on_cpu(0, rapl_msr_priv.regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT], &val) || !val)
 		return -ENODEV;
 
 	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
@@ -1185,8 +1184,8 @@ static int __init rapl_register_psys(void)
 
 	rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
 	rd->id = RAPL_DOMAIN_PLATFORM;
-	rd->regs[RAPL_DOMAIN_REG_LIMIT] = MSR_PLATFORM_POWER_LIMIT;
-	rd->regs[RAPL_DOMAIN_REG_STATUS] = MSR_PLATFORM_ENERGY_STATUS;
+	rd->regs[RAPL_DOMAIN_REG_LIMIT] = rapl_msr_priv.regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
+	rd->regs[RAPL_DOMAIN_REG_STATUS] = rapl_msr_priv.regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
 	rd->rpl[0].prim_id = PL1_ENABLE;
 	rd->rpl[0].name = pl1_name;
 	rd->rpl[1].prim_id = PL2_ENABLE;
@@ -1218,23 +1217,17 @@ static int __init rapl_register_powercap(void)
 	return 0;
 }
 
-static int rapl_check_domain(int cpu, int domain)
+static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
 {
-	unsigned msr;
+	u32 reg;
 	u64 val = 0;
 
 	switch (domain) {
 	case RAPL_DOMAIN_PACKAGE:
-		msr = MSR_PKG_ENERGY_STATUS;
-		break;
 	case RAPL_DOMAIN_PP0:
-		msr = MSR_PP0_ENERGY_STATUS;
-		break;
 	case RAPL_DOMAIN_PP1:
-		msr = MSR_PP1_ENERGY_STATUS;
-		break;
 	case RAPL_DOMAIN_DRAM:
-		msr = MSR_DRAM_ENERGY_STATUS;
+		reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
 		break;
 	case RAPL_DOMAIN_PLATFORM:
 		/* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
@@ -1246,7 +1239,7 @@ static int rapl_check_domain(int cpu, int domain)
 	/* make sure domain counters are available and contains non-zero
 	 * values, otherwise skip it.
 	 */
-	if (rdmsrl_safe_on_cpu(cpu, msr, &val) || !val)
+	if (rdmsrl_safe_on_cpu(cpu, reg, &val) || !val)
 		return -ENODEV;
 
 	return 0;
@@ -1293,7 +1286,7 @@ static int rapl_detect_domains(struct rapl_package *rp, int cpu)
 
 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
 		/* use physical package id to read counters */
-		if (!rapl_check_domain(cpu, i)) {
+		if (!rapl_check_domain(cpu, i, rp)) {
 			rp->domain_map |= 1 << i;
 			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
 		}
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 7bf1683e4a63..ec2c9e83274f 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -95,11 +95,15 @@ struct rapl_domain {
  * @platform_rapl_domain:	Optional. Some RAPL interface may have platform
  *				level RAPL control.
  * @pcap_rapl_online:		CPU hotplug state for each RAPL interface.
+ * @reg_unit:			Register for getting energy/power/time unit.
+ * @regs:			Register sets for different RAPL Domains.
  */
 struct rapl_if_priv {
 	struct powercap_control_type *control_type;
 	struct rapl_domain *platform_rapl_domain;
 	enum cpuhp_state pcap_rapl_online;
+	u32 reg_unit;
+	u32 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
 };
 
 /* maximum rapl package domain name: package-%d-die-%d */
-- 
cgit v1.2.3


From beea8df821d928e7755917da6c1e45d6afde5148 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:27 +0800
Subject: intel_rapl: abstract register access operations

MSR and MMIO RAPL interfaces have different ways to access the registers,
thus in order to abstract the register access operations, two callbacks,
.read_raw()/.write_raw() are introduced, and they should be implemented by
MSR RAPL and MMIO RAPL interface driver respectly.

This patch implements them for the MSR I/F only.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 110 ++++++++++++++++++++++--------------------
 include/linux/intel_rapl.h    |  13 +++++
 2 files changed, 70 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 9f22aed49f24..d3b9d1cf4d48 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -93,13 +93,6 @@ static struct rapl_if_priv rapl_msr_priv = {
 /* per domain data, some are optional */
 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
 
-struct msrl_action {
-	u32 msr_no;
-	u64 clear_mask;
-	u64 set_mask;
-	int err;
-};
-
 #define	DOMAIN_STATE_INACTIVE           BIT(0)
 #define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
 #define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
@@ -692,16 +685,16 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
 			enum rapl_primitives prim,
 			bool xlate, u64 *data)
 {
-	u64 value, final;
-	u32 msr;
+	u64 value;
 	struct rapl_primitive_info *rp = &rpi[prim];
+	struct reg_action ra;
 	int cpu;
 
 	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
 		return -EINVAL;
 
-	msr = rd->regs[rp->id];
-	if (!msr)
+	ra.reg = rd->regs[rp->id];
+	if (!ra.reg)
 		return -EINVAL;
 
 	cpu = rd->rp->lead_cpu;
@@ -717,47 +710,23 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
 		return 0;
 	}
 
-	if (rdmsrl_safe_on_cpu(cpu, msr, &value)) {
-		pr_debug("failed to read msr 0x%x on cpu %d\n", msr, cpu);
+	ra.mask = rp->mask;
+
+	if (rd->rp->priv->read_raw(cpu, &ra)) {
+		pr_debug("failed to read reg 0x%x on cpu %d\n", ra.reg, cpu);
 		return -EIO;
 	}
 
-	final = value & rp->mask;
-	final = final >> rp->shift;
+	value = ra.value >> rp->shift;
+
 	if (xlate)
-		*data = rapl_unit_xlate(rd, rp->unit, final, 0);
+		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
 	else
-		*data = final;
+		*data = value;
 
 	return 0;
 }
 
-
-static int msrl_update_safe(u32 msr_no, u64 clear_mask, u64 set_mask)
-{
-	int err;
-	u64 val;
-
-	err = rdmsrl_safe(msr_no, &val);
-	if (err)
-		goto out;
-
-	val &= ~clear_mask;
-	val |= set_mask;
-
-	err = wrmsrl_safe(msr_no, val);
-
-out:
-	return err;
-}
-
-static void msrl_update_func(void *info)
-{
-	struct msrl_action *ma = info;
-
-	ma->err = msrl_update_safe(ma->msr_no, ma->clear_mask, ma->set_mask);
-}
-
 /* Similar use of primitive info in the read counterpart */
 static int rapl_write_data_raw(struct rapl_domain *rd,
 			enum rapl_primitives prim,
@@ -766,7 +735,7 @@ static int rapl_write_data_raw(struct rapl_domain *rd,
 	struct rapl_primitive_info *rp = &rpi[prim];
 	int cpu;
 	u64 bits;
-	struct msrl_action ma;
+	struct reg_action ra;
 	int ret;
 
 	cpu = rd->rp->lead_cpu;
@@ -774,17 +743,13 @@ static int rapl_write_data_raw(struct rapl_domain *rd,
 	bits <<= rp->shift;
 	bits &= rp->mask;
 
-	memset(&ma, 0, sizeof(ma));
+	memset(&ra, 0, sizeof(ra));
 
-	ma.msr_no = rd->regs[rp->id];
-	ma.clear_mask = rp->mask;
-	ma.set_mask = bits;
+	ra.reg = rd->regs[rp->id];
+	ra.mask = rp->mask;
+	ra.value = bits;
 
-	ret = smp_call_function_single(cpu, msrl_update_func, &ma, 1);
-	if (ret)
-		WARN_ON_ONCE(ret);
-	else
-		ret = ma.err;
+	ret = rd->rp->priv->write_raw(cpu, &ra);
 
 	return ret;
 }
@@ -1507,6 +1472,43 @@ static struct notifier_block rapl_pm_notifier = {
 	.notifier_call = rapl_pm_callback,
 };
 
+static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
+{
+	if (rdmsrl_safe_on_cpu(cpu, ra->reg, &ra->value)) {
+		pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg, cpu);
+		return -EIO;
+	}
+	ra->value &= ra->mask;
+	return 0;
+}
+
+static void rapl_msr_update_func(void *info)
+{
+	struct reg_action *ra = info;
+	u64 val;
+
+	ra->err = rdmsrl_safe(ra->reg, &val);
+	if (ra->err)
+		return;
+
+	val &= ~ra->mask;
+	val |= ra->value;
+
+	ra->err = wrmsrl_safe(ra->reg, val);
+}
+
+
+static int rapl_msr_write_raw(int cpu, struct reg_action *ra)
+{
+	int ret;
+
+	ret = smp_call_function_single(cpu, rapl_msr_update_func, ra, 1);
+	if (WARN_ON_ONCE(ret))
+		return ret;
+
+	return ra->err;
+}
+
 static int __init rapl_init(void)
 {
 	const struct x86_cpu_id *id;
@@ -1522,6 +1524,8 @@ static int __init rapl_init(void)
 
 	rapl_defaults = (struct rapl_defaults *)id->driver_data;
 
+	rapl_msr_priv.read_raw = rapl_msr_read_raw;
+	rapl_msr_priv.write_raw = rapl_msr_write_raw;
 	ret = rapl_register_powercap();
 	if (ret)
 		return ret;
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index ec2c9e83274f..ff215d64d114 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -88,6 +88,13 @@ struct rapl_domain {
 	struct rapl_package *rp;
 };
 
+struct reg_action {
+	u32 reg;
+	u64 mask;
+	u64 value;
+	int err;
+};
+
 /**
  * struct rapl_if_priv: private data for different RAPL interfaces
  * @control_type:		Each RAPL interface must have its own powercap
@@ -97,6 +104,10 @@ struct rapl_domain {
  * @pcap_rapl_online:		CPU hotplug state for each RAPL interface.
  * @reg_unit:			Register for getting energy/power/time unit.
  * @regs:			Register sets for different RAPL Domains.
+ * @read_raw:			Callback for reading RAPL interface specific
+ *				registers.
+ * @write_raw:			Callback for writing RAPL interface specific
+ *				registers.
  */
 struct rapl_if_priv {
 	struct powercap_control_type *control_type;
@@ -104,6 +115,8 @@ struct rapl_if_priv {
 	enum cpuhp_state pcap_rapl_online;
 	u32 reg_unit;
 	u32 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
+	int (*read_raw)(int cpu, struct reg_action *ra);
+	int (*write_raw)(int cpu, struct reg_action *ra);
 };
 
 /* maximum rapl package domain name: package-%d-die-%d */
-- 
cgit v1.2.3


From 3382388d714891fc0f575926189f33d22e7c960b Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:30 +0800
Subject: intel_rapl: abstract RAPL common code

Split intel_rapl.c to intel_rapl_common.c and intel_rapl_msr.c, where
intel_rapl_common.c contains the common code that can be used by both MSR
and MMIO interface.
intel_rapl_msr.c contains the implementation of RAPL MSR interface.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/Kconfig             |   11 +-
 drivers/powercap/Makefile            |    3 +-
 drivers/powercap/intel_rapl.c        | 1574 ----------------------------------
 drivers/powercap/intel_rapl_common.c | 1469 +++++++++++++++++++++++++++++++
 drivers/powercap/intel_rapl_msr.c    |  163 ++++
 include/linux/intel_rapl.h           |    7 +
 6 files changed, 1648 insertions(+), 1579 deletions(-)
 delete mode 100644 drivers/powercap/intel_rapl.c
 create mode 100644 drivers/powercap/intel_rapl_common.c
 create mode 100644 drivers/powercap/intel_rapl_msr.c

(limited to 'include')

diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig
index 42d3798c88f0..dc1c1381d7fa 100644
--- a/drivers/powercap/Kconfig
+++ b/drivers/powercap/Kconfig
@@ -16,14 +16,17 @@ menuconfig POWERCAP
 
 if POWERCAP
 # Client driver configurations go here.
+config INTEL_RAPL_CORE
+	tristate
+
 config INTEL_RAPL
-	tristate "Intel RAPL Support"
+	tristate "Intel RAPL Support via MSR Interface"
 	depends on X86 && IOSF_MBI
-	default n
+	select INTEL_RAPL_CORE
 	---help---
 	  This enables support for the Intel Running Average Power Limit (RAPL)
-	  technology which allows power limits to be enforced and monitored on
-	  modern Intel processors (Sandy Bridge and later).
+	  technology via MSR interface, which allows power limits to be enforced
+	  and monitored on modern Intel processors (Sandy Bridge and later).
 
 	  In RAPL, the platform level settings are divided into domains for
 	  fine grained control. These domains include processor package, DRAM
diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile
index 81c8ccaba6e7..7255c94ec61c 100644
--- a/drivers/powercap/Makefile
+++ b/drivers/powercap/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_POWERCAP)	+= powercap_sys.o
-obj-$(CONFIG_INTEL_RAPL) += intel_rapl.o
+obj-$(CONFIG_INTEL_RAPL_CORE) += intel_rapl_common.o
+obj-$(CONFIG_INTEL_RAPL) += intel_rapl_msr.o
 obj-$(CONFIG_IDLE_INJECT) += idle_inject.o
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
deleted file mode 100644
index aa54c06ed518..000000000000
--- a/drivers/powercap/intel_rapl.c
+++ /dev/null
@@ -1,1574 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Running Average Power Limit (RAPL) Driver
- * Copyright (c) 2013, Intel Corporation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/device.h>
-#include <linux/slab.h>
-#include <linux/log2.h>
-#include <linux/bitmap.h>
-#include <linux/delay.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
-#include <linux/powercap.h>
-#include <linux/suspend.h>
-#include <linux/intel_rapl.h>
-
-#include <asm/iosf_mbi.h>
-#include <asm/processor.h>
-#include <asm/cpu_device_id.h>
-#include <asm/intel-family.h>
-
-/* Local defines */
-#define MSR_PLATFORM_POWER_LIMIT	0x0000065C
-
-/* bitmasks for RAPL MSRs, used by primitive access functions */
-#define ENERGY_STATUS_MASK      0xffffffff
-
-#define POWER_LIMIT1_MASK       0x7FFF
-#define POWER_LIMIT1_ENABLE     BIT(15)
-#define POWER_LIMIT1_CLAMP      BIT(16)
-
-#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
-#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
-#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
-#define POWER_PACKAGE_LOCK      BIT_ULL(63)
-#define POWER_PP_LOCK           BIT(31)
-
-#define TIME_WINDOW1_MASK       (0x7FULL<<17)
-#define TIME_WINDOW2_MASK       (0x7FULL<<49)
-
-#define POWER_UNIT_OFFSET	0
-#define POWER_UNIT_MASK		0x0F
-
-#define ENERGY_UNIT_OFFSET	0x08
-#define ENERGY_UNIT_MASK	0x1F00
-
-#define TIME_UNIT_OFFSET	0x10
-#define TIME_UNIT_MASK		0xF0000
-
-#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
-#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
-#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
-#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff
-
-#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
-#define PP_POLICY_MASK         0x1F
-
-/* Non HW constants */
-#define RAPL_PRIMITIVE_DERIVED       BIT(1) /* not from raw data */
-#define RAPL_PRIMITIVE_DUMMY         BIT(2)
-
-#define TIME_WINDOW_MAX_MSEC 40000
-#define TIME_WINDOW_MIN_MSEC 250
-#define ENERGY_UNIT_SCALE    1000 /* scale from driver unit to powercap unit */
-enum unit_type {
-	ARBITRARY_UNIT, /* no translation */
-	POWER_UNIT,
-	ENERGY_UNIT,
-	TIME_UNIT,
-};
-
-/* private data for RAPL MSR Interface */
-static struct rapl_if_priv rapl_msr_priv = {
-	.reg_unit = MSR_RAPL_POWER_UNIT,
-	.regs[RAPL_DOMAIN_PACKAGE] = {
-		MSR_PKG_POWER_LIMIT, MSR_PKG_ENERGY_STATUS, MSR_PKG_PERF_STATUS, 0, MSR_PKG_POWER_INFO },
-	.regs[RAPL_DOMAIN_PP0] = {
-		MSR_PP0_POWER_LIMIT, MSR_PP0_ENERGY_STATUS, 0, MSR_PP0_POLICY, 0 },
-	.regs[RAPL_DOMAIN_PP1] = {
-		MSR_PP1_POWER_LIMIT, MSR_PP1_ENERGY_STATUS, 0, MSR_PP1_POLICY, 0 },
-	.regs[RAPL_DOMAIN_DRAM] = {
-		MSR_DRAM_POWER_LIMIT, MSR_DRAM_ENERGY_STATUS, MSR_DRAM_PERF_STATUS, 0, MSR_DRAM_POWER_INFO },
-	.regs[RAPL_DOMAIN_PLATFORM] = {
-		MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0},
-};
-
-/* per domain data, some are optional */
-#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
-
-#define	DOMAIN_STATE_INACTIVE           BIT(0)
-#define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
-#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
-
-static const char pl1_name[] = "long_term";
-static const char pl2_name[] = "short_term";
-
-#define power_zone_to_rapl_domain(_zone) \
-	container_of(_zone, struct rapl_domain, power_zone)
-
-struct rapl_defaults {
-	u8 floor_freq_reg_addr;
-	int (*check_unit)(struct rapl_package *rp, int cpu);
-	void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
-	u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
-				bool to_raw);
-	unsigned int dram_domain_energy_unit;
-};
-static struct rapl_defaults *rapl_defaults;
-
-/* Sideband MBI registers */
-#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
-#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)
-
-#define PACKAGE_PLN_INT_SAVED   BIT(0)
-#define MAX_PRIM_NAME (32)
-
-/* per domain data. used to describe individual knobs such that access function
- * can be consolidated into one instead of many inline functions.
- */
-struct rapl_primitive_info {
-	const char *name;
-	u64 mask;
-	int shift;
-	enum rapl_domain_reg_id id;
-	enum unit_type unit;
-	u32 flag;
-};
-
-#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) {	\
-		.name = #p,			\
-		.mask = m,			\
-		.shift = s,			\
-		.id = i,			\
-		.unit = u,			\
-		.flag = f			\
-	}
-
-static void rapl_init_domains(struct rapl_package *rp);
-static int rapl_read_data_raw(struct rapl_domain *rd,
-			enum rapl_primitives prim,
-			bool xlate, u64 *data);
-static int rapl_write_data_raw(struct rapl_domain *rd,
-			enum rapl_primitives prim,
-			unsigned long long value);
-static u64 rapl_unit_xlate(struct rapl_domain *rd,
-			enum unit_type type, u64 value,
-			int to_raw);
-static void package_power_limit_irq_save(struct rapl_package *rp);
-
-static LIST_HEAD(rapl_packages); /* guarded by CPU hotplug lock */
-
-static const char * const rapl_domain_names[] = {
-	"package",
-	"core",
-	"uncore",
-	"dram",
-	"psys",
-};
-
-/* caller to ensure CPU hotplug lock is held */
-static struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
-{
-	int id = topology_logical_die_id(cpu);
-	struct rapl_package *rp;
-
-	list_for_each_entry(rp, &rapl_packages, plist) {
-		if (rp->id == id && rp->priv->control_type == priv->control_type)
-			return rp;
-	}
-
-	return NULL;
-}
-
-static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw)
-{
-	struct rapl_domain *rd;
-	u64 energy_now;
-
-	/* prevent CPU hotplug, make sure the RAPL domain does not go
-	 * away while reading the counter.
-	 */
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-
-	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
-		*energy_raw = energy_now;
-		put_online_cpus();
-
-		return 0;
-	}
-	put_online_cpus();
-
-	return -EIO;
-}
-
-static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
-{
-	struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
-
-	*energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
-	return 0;
-}
-
-static int release_zone(struct powercap_zone *power_zone)
-{
-	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
-	struct rapl_package *rp = rd->rp;
-
-	/* package zone is the last zone of a package, we can free
-	 * memory here since all children has been unregistered.
-	 */
-	if (rd->id == RAPL_DOMAIN_PACKAGE) {
-		kfree(rd);
-		rp->domains = NULL;
-	}
-
-	return 0;
-
-}
-
-static int find_nr_power_limit(struct rapl_domain *rd)
-{
-	int i, nr_pl = 0;
-
-	for (i = 0; i < NR_POWER_LIMITS; i++) {
-		if (rd->rpl[i].name)
-			nr_pl++;
-	}
-
-	return nr_pl;
-}
-
-static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
-{
-	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
-
-	if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
-		return -EACCES;
-
-	get_online_cpus();
-	rapl_write_data_raw(rd, PL1_ENABLE, mode);
-	if (rapl_defaults->set_floor_freq)
-		rapl_defaults->set_floor_freq(rd, mode);
-	put_online_cpus();
-
-	return 0;
-}
-
-static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
-{
-	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
-	u64 val;
-
-	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
-		*mode = false;
-		return 0;
-	}
-	get_online_cpus();
-	if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
-		put_online_cpus();
-		return -EIO;
-	}
-	*mode = val;
-	put_online_cpus();
-
-	return 0;
-}
-
-/* per RAPL domain ops, in the order of rapl_domain_type */
-static const struct powercap_zone_ops zone_ops[] = {
-	/* RAPL_DOMAIN_PACKAGE */
-	{
-		.get_energy_uj = get_energy_counter,
-		.get_max_energy_range_uj = get_max_energy_counter,
-		.release = release_zone,
-		.set_enable = set_domain_enable,
-		.get_enable = get_domain_enable,
-	},
-	/* RAPL_DOMAIN_PP0 */
-	{
-		.get_energy_uj = get_energy_counter,
-		.get_max_energy_range_uj = get_max_energy_counter,
-		.release = release_zone,
-		.set_enable = set_domain_enable,
-		.get_enable = get_domain_enable,
-	},
-	/* RAPL_DOMAIN_PP1 */
-	{
-		.get_energy_uj = get_energy_counter,
-		.get_max_energy_range_uj = get_max_energy_counter,
-		.release = release_zone,
-		.set_enable = set_domain_enable,
-		.get_enable = get_domain_enable,
-	},
-	/* RAPL_DOMAIN_DRAM */
-	{
-		.get_energy_uj = get_energy_counter,
-		.get_max_energy_range_uj = get_max_energy_counter,
-		.release = release_zone,
-		.set_enable = set_domain_enable,
-		.get_enable = get_domain_enable,
-	},
-	/* RAPL_DOMAIN_PLATFORM */
-	{
-		.get_energy_uj = get_energy_counter,
-		.get_max_energy_range_uj = get_max_energy_counter,
-		.release = release_zone,
-		.set_enable = set_domain_enable,
-		.get_enable = get_domain_enable,
-	},
-};
-
-
-/*
- * Constraint index used by powercap can be different than power limit (PL)
- * index in that some  PLs maybe missing due to non-existant MSRs. So we
- * need to convert here by finding the valid PLs only (name populated).
- */
-static int contraint_to_pl(struct rapl_domain *rd, int cid)
-{
-	int i, j;
-
-	for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
-		if ((rd->rpl[i].name) && j++ == cid) {
-			pr_debug("%s: index %d\n", __func__, i);
-			return i;
-		}
-	}
-	pr_err("Cannot find matching power limit for constraint %d\n", cid);
-
-	return -EINVAL;
-}
-
-static int set_power_limit(struct powercap_zone *power_zone, int cid,
-			u64 power_limit)
-{
-	struct rapl_domain *rd;
-	struct rapl_package *rp;
-	int ret = 0;
-	int id;
-
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-	id = contraint_to_pl(rd, cid);
-	if (id < 0) {
-		ret = id;
-		goto set_exit;
-	}
-
-	rp = rd->rp;
-
-	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
-		dev_warn(&power_zone->dev, "%s locked by BIOS, monitoring only\n",
-			rd->name);
-		ret = -EACCES;
-		goto set_exit;
-	}
-
-	switch (rd->rpl[id].prim_id) {
-	case PL1_ENABLE:
-		rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
-		break;
-	case PL2_ENABLE:
-		rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-	if (!ret)
-		package_power_limit_irq_save(rp);
-set_exit:
-	put_online_cpus();
-	return ret;
-}
-
-static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
-					u64 *data)
-{
-	struct rapl_domain *rd;
-	u64 val;
-	int prim;
-	int ret = 0;
-	int id;
-
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-	id = contraint_to_pl(rd, cid);
-	if (id < 0) {
-		ret = id;
-		goto get_exit;
-	}
-
-	switch (rd->rpl[id].prim_id) {
-	case PL1_ENABLE:
-		prim = POWER_LIMIT1;
-		break;
-	case PL2_ENABLE:
-		prim = POWER_LIMIT2;
-		break;
-	default:
-		put_online_cpus();
-		return -EINVAL;
-	}
-	if (rapl_read_data_raw(rd, prim, true, &val))
-		ret = -EIO;
-	else
-		*data = val;
-
-get_exit:
-	put_online_cpus();
-
-	return ret;
-}
-
-static int set_time_window(struct powercap_zone *power_zone, int cid,
-								u64 window)
-{
-	struct rapl_domain *rd;
-	int ret = 0;
-	int id;
-
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-	id = contraint_to_pl(rd, cid);
-	if (id < 0) {
-		ret = id;
-		goto set_time_exit;
-	}
-
-	switch (rd->rpl[id].prim_id) {
-	case PL1_ENABLE:
-		rapl_write_data_raw(rd, TIME_WINDOW1, window);
-		break;
-	case PL2_ENABLE:
-		rapl_write_data_raw(rd, TIME_WINDOW2, window);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-
-set_time_exit:
-	put_online_cpus();
-	return ret;
-}
-
-static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data)
-{
-	struct rapl_domain *rd;
-	u64 val;
-	int ret = 0;
-	int id;
-
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-	id = contraint_to_pl(rd, cid);
-	if (id < 0) {
-		ret = id;
-		goto get_time_exit;
-	}
-
-	switch (rd->rpl[id].prim_id) {
-	case PL1_ENABLE:
-		ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
-		break;
-	case PL2_ENABLE:
-		ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
-		break;
-	default:
-		put_online_cpus();
-		return -EINVAL;
-	}
-	if (!ret)
-		*data = val;
-
-get_time_exit:
-	put_online_cpus();
-
-	return ret;
-}
-
-static const char *get_constraint_name(struct powercap_zone *power_zone, int cid)
-{
-	struct rapl_domain *rd;
-	int id;
-
-	rd = power_zone_to_rapl_domain(power_zone);
-	id = contraint_to_pl(rd, cid);
-	if (id >= 0)
-		return rd->rpl[id].name;
-
-	return NULL;
-}
-
-
-static int get_max_power(struct powercap_zone *power_zone, int id,
-					u64 *data)
-{
-	struct rapl_domain *rd;
-	u64 val;
-	int prim;
-	int ret = 0;
-
-	get_online_cpus();
-	rd = power_zone_to_rapl_domain(power_zone);
-	switch (rd->rpl[id].prim_id) {
-	case PL1_ENABLE:
-		prim = THERMAL_SPEC_POWER;
-		break;
-	case PL2_ENABLE:
-		prim = MAX_POWER;
-		break;
-	default:
-		put_online_cpus();
-		return -EINVAL;
-	}
-	if (rapl_read_data_raw(rd, prim, true, &val))
-		ret = -EIO;
-	else
-		*data = val;
-
-	put_online_cpus();
-
-	return ret;
-}
-
-static const struct powercap_zone_constraint_ops constraint_ops = {
-	.set_power_limit_uw = set_power_limit,
-	.get_power_limit_uw = get_current_power_limit,
-	.set_time_window_us = set_time_window,
-	.get_time_window_us = get_time_window,
-	.get_max_power_uw = get_max_power,
-	.get_name = get_constraint_name,
-};
-
-/* called after domain detection and package level data are set */
-static void rapl_init_domains(struct rapl_package *rp)
-{
-	int i;
-	struct rapl_domain *rd = rp->domains;
-
-	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
-		unsigned int mask = rp->domain_map & (1 << i);
-
-		rd->regs[RAPL_DOMAIN_REG_LIMIT] = rp->priv->regs[i][RAPL_DOMAIN_REG_LIMIT];
-		rd->regs[RAPL_DOMAIN_REG_STATUS] = rp->priv->regs[i][RAPL_DOMAIN_REG_STATUS];
-		rd->regs[RAPL_DOMAIN_REG_PERF] = rp->priv->regs[i][RAPL_DOMAIN_REG_PERF];
-		rd->regs[RAPL_DOMAIN_REG_POLICY] = rp->priv->regs[i][RAPL_DOMAIN_REG_POLICY];
-		rd->regs[RAPL_DOMAIN_REG_INFO] = rp->priv->regs[i][RAPL_DOMAIN_REG_INFO];
-
-		switch (mask) {
-		case BIT(RAPL_DOMAIN_PACKAGE):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PACKAGE];
-			rd->id = RAPL_DOMAIN_PACKAGE;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			rd->rpl[1].prim_id = PL2_ENABLE;
-			rd->rpl[1].name = pl2_name;
-			break;
-		case BIT(RAPL_DOMAIN_PP0):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PP0];
-			rd->id = RAPL_DOMAIN_PP0;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			break;
-		case BIT(RAPL_DOMAIN_PP1):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PP1];
-			rd->id = RAPL_DOMAIN_PP1;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			break;
-		case BIT(RAPL_DOMAIN_DRAM):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_DRAM];
-			rd->id = RAPL_DOMAIN_DRAM;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			rd->domain_energy_unit =
-				rapl_defaults->dram_domain_energy_unit;
-			if (rd->domain_energy_unit)
-				pr_info("DRAM domain energy unit %dpj\n",
-					rd->domain_energy_unit);
-			break;
-		}
-		if (mask) {
-			rd->rp = rp;
-			rd++;
-		}
-	}
-}
-
-static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
-			u64 value, int to_raw)
-{
-	u64 units = 1;
-	struct rapl_package *rp = rd->rp;
-	u64 scale = 1;
-
-	switch (type) {
-	case POWER_UNIT:
-		units = rp->power_unit;
-		break;
-	case ENERGY_UNIT:
-		scale = ENERGY_UNIT_SCALE;
-		/* per domain unit takes precedence */
-		if (rd->domain_energy_unit)
-			units = rd->domain_energy_unit;
-		else
-			units = rp->energy_unit;
-		break;
-	case TIME_UNIT:
-		return rapl_defaults->compute_time_window(rp, value, to_raw);
-	case ARBITRARY_UNIT:
-	default:
-		return value;
-	};
-
-	if (to_raw)
-		return div64_u64(value, units) * scale;
-
-	value *= units;
-
-	return div64_u64(value, scale);
-}
-
-/* in the order of enum rapl_primitives */
-static struct rapl_primitive_info rpi[] = {
-	/* name, mask, shift, msr index, unit divisor */
-	PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
-				RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
-				RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
-				RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_PP_LOCK, 31,
-				RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
-				RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
-				RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
-				RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
-				RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
-	PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
-				RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
-	PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
-				RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
-	PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
-				0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
-				RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
-				RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
-				RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
-	PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
-				RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
-	PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
-				RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
-	/* non-hardware */
-	PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
-				RAPL_PRIMITIVE_DERIVED),
-	{NULL, 0, 0, 0},
-};
-
-/* Read primitive data based on its related struct rapl_primitive_info.
- * if xlate flag is set, return translated data based on data units, i.e.
- * time, energy, and power.
- * RAPL MSRs are non-architectual and are laid out not consistently across
- * domains. Here we use primitive info to allow writing consolidated access
- * functions.
- * For a given primitive, it is processed by MSR mask and shift. Unit conversion
- * is pre-assigned based on RAPL unit MSRs read at init time.
- * 63-------------------------- 31--------------------------- 0
- * |                           xxxxx (mask)                   |
- * |                                |<- shift ----------------|
- * 63-------------------------- 31--------------------------- 0
- */
-static int rapl_read_data_raw(struct rapl_domain *rd,
-			enum rapl_primitives prim,
-			bool xlate, u64 *data)
-{
-	u64 value;
-	struct rapl_primitive_info *rp = &rpi[prim];
-	struct reg_action ra;
-	int cpu;
-
-	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
-		return -EINVAL;
-
-	ra.reg = rd->regs[rp->id];
-	if (!ra.reg)
-		return -EINVAL;
-
-	cpu = rd->rp->lead_cpu;
-
-	/* special-case package domain, which uses a different bit*/
-	if (prim == FW_LOCK && rd->id == RAPL_DOMAIN_PACKAGE) {
-		rp->mask = POWER_PACKAGE_LOCK;
-		rp->shift = 63;
-	}
-	/* non-hardware data are collected by the polling thread */
-	if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
-		*data = rd->rdd.primitives[prim];
-		return 0;
-	}
-
-	ra.mask = rp->mask;
-
-	if (rd->rp->priv->read_raw(cpu, &ra)) {
-		pr_debug("failed to read reg 0x%x on cpu %d\n", ra.reg, cpu);
-		return -EIO;
-	}
-
-	value = ra.value >> rp->shift;
-
-	if (xlate)
-		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
-	else
-		*data = value;
-
-	return 0;
-}
-
-/* Similar use of primitive info in the read counterpart */
-static int rapl_write_data_raw(struct rapl_domain *rd,
-			enum rapl_primitives prim,
-			unsigned long long value)
-{
-	struct rapl_primitive_info *rp = &rpi[prim];
-	int cpu;
-	u64 bits;
-	struct reg_action ra;
-	int ret;
-
-	cpu = rd->rp->lead_cpu;
-	bits = rapl_unit_xlate(rd, rp->unit, value, 1);
-	bits <<= rp->shift;
-	bits &= rp->mask;
-
-	memset(&ra, 0, sizeof(ra));
-
-	ra.reg = rd->regs[rp->id];
-	ra.mask = rp->mask;
-	ra.value = bits;
-
-	ret = rd->rp->priv->write_raw(cpu, &ra);
-
-	return ret;
-}
-
-/*
- * Raw RAPL data stored in MSRs are in certain scales. We need to
- * convert them into standard units based on the units reported in
- * the RAPL unit MSRs. This is specific to CPUs as the method to
- * calculate units differ on different CPUs.
- * We convert the units to below format based on CPUs.
- * i.e.
- * energy unit: picoJoules  : Represented in picoJoules by default
- * power unit : microWatts  : Represented in milliWatts by default
- * time unit  : microseconds: Represented in seconds by default
- */
-static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
-{
-	struct reg_action ra;
-	u32 value;
-
-	ra.reg = rp->priv->reg_unit;
-	ra.mask = ~0;
-	if (rp->priv->read_raw(cpu, &ra)) {
-		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
-			rp->priv->reg_unit, cpu);
-		return -ENODEV;
-	}
-
-	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
-	rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
-
-	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
-	rp->power_unit = 1000000 / (1 << value);
-
-	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
-	rp->time_unit = 1000000 / (1 << value);
-
-	pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
-		rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
-
-	return 0;
-}
-
-static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
-{
-	struct reg_action ra;
-	u32 value;
-
-	ra.reg = rp->priv->reg_unit;
-	ra.mask = ~0;
-	if (rp->priv->read_raw(cpu, &ra)) {
-		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
-			rp->priv->reg_unit, cpu);
-		return -ENODEV;
-	}
-
-	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
-	rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
-
-	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
-	rp->power_unit = (1 << value) * 1000;
-
-	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
-	rp->time_unit = 1000000 / (1 << value);
-
-	pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
-		rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
-
-	return 0;
-}
-
-static void power_limit_irq_save_cpu(void *info)
-{
-	u32 l, h = 0;
-	struct rapl_package *rp = (struct rapl_package *)info;
-
-	/* save the state of PLN irq mask bit before disabling it */
-	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
-	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
-		rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
-		rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
-	}
-	l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
-	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
-}
-
-
-/* REVISIT:
- * When package power limit is set artificially low by RAPL, LVT
- * thermal interrupt for package power limit should be ignored
- * since we are not really exceeding the real limit. The intention
- * is to avoid excessive interrupts while we are trying to save power.
- * A useful feature might be routing the package_power_limit interrupt
- * to userspace via eventfd. once we have a usecase, this is simple
- * to do by adding an atomic notifier.
- */
-
-static void package_power_limit_irq_save(struct rapl_package *rp)
-{
-	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
-		return;
-
-	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
-}
-
-/*
- * Restore per package power limit interrupt enable state. Called from cpu
- * hotplug code on package removal.
- */
-static void package_power_limit_irq_restore(struct rapl_package *rp)
-{
-	u32 l, h;
-
-	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
-		return;
-
-	/* irq enable state not saved, nothing to restore */
-	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
-		return;
-
-	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
-
-	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
-		l |= PACKAGE_THERM_INT_PLN_ENABLE;
-	else
-		l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
-
-	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
-}
-
-static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
-{
-	int nr_powerlimit = find_nr_power_limit(rd);
-
-	/* always enable clamp such that p-state can go below OS requested
-	 * range. power capping priority over guranteed frequency.
-	 */
-	rapl_write_data_raw(rd, PL1_CLAMP, mode);
-
-	/* some domains have pl2 */
-	if (nr_powerlimit > 1) {
-		rapl_write_data_raw(rd, PL2_ENABLE, mode);
-		rapl_write_data_raw(rd, PL2_CLAMP, mode);
-	}
-}
-
-static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
-{
-	static u32 power_ctrl_orig_val;
-	u32 mdata;
-
-	if (!rapl_defaults->floor_freq_reg_addr) {
-		pr_err("Invalid floor frequency config register\n");
-		return;
-	}
-
-	if (!power_ctrl_orig_val)
-		iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
-			      rapl_defaults->floor_freq_reg_addr,
-			      &power_ctrl_orig_val);
-	mdata = power_ctrl_orig_val;
-	if (enable) {
-		mdata &= ~(0x7f << 8);
-		mdata |= 1 << 8;
-	}
-	iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
-		       rapl_defaults->floor_freq_reg_addr, mdata);
-}
-
-static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
-					bool to_raw)
-{
-	u64 f, y; /* fraction and exp. used for time unit */
-
-	/*
-	 * Special processing based on 2^Y*(1+F/4), refer
-	 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
-	 */
-	if (!to_raw) {
-		f = (value & 0x60) >> 5;
-		y = value & 0x1f;
-		value = (1 << y) * (4 + f) * rp->time_unit / 4;
-	} else {
-		do_div(value, rp->time_unit);
-		y = ilog2(value);
-		f = div64_u64(4 * (value - (1 << y)), 1 << y);
-		value = (y & 0x1f) | ((f & 0x3) << 5);
-	}
-	return value;
-}
-
-static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
-					bool to_raw)
-{
-	/*
-	 * Atom time unit encoding is straight forward val * time_unit,
-	 * where time_unit is default to 1 sec. Never 0.
-	 */
-	if (!to_raw)
-		return (value) ? value *= rp->time_unit : rp->time_unit;
-	else
-		value = div64_u64(value, rp->time_unit);
-
-	return value;
-}
-
-static const struct rapl_defaults rapl_defaults_core = {
-	.floor_freq_reg_addr = 0,
-	.check_unit = rapl_check_unit_core,
-	.set_floor_freq = set_floor_freq_default,
-	.compute_time_window = rapl_compute_time_window_core,
-};
-
-static const struct rapl_defaults rapl_defaults_hsw_server = {
-	.check_unit = rapl_check_unit_core,
-	.set_floor_freq = set_floor_freq_default,
-	.compute_time_window = rapl_compute_time_window_core,
-	.dram_domain_energy_unit = 15300,
-};
-
-static const struct rapl_defaults rapl_defaults_byt = {
-	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
-	.check_unit = rapl_check_unit_atom,
-	.set_floor_freq = set_floor_freq_atom,
-	.compute_time_window = rapl_compute_time_window_atom,
-};
-
-static const struct rapl_defaults rapl_defaults_tng = {
-	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
-	.check_unit = rapl_check_unit_atom,
-	.set_floor_freq = set_floor_freq_atom,
-	.compute_time_window = rapl_compute_time_window_atom,
-};
-
-static const struct rapl_defaults rapl_defaults_ann = {
-	.floor_freq_reg_addr = 0,
-	.check_unit = rapl_check_unit_atom,
-	.set_floor_freq = NULL,
-	.compute_time_window = rapl_compute_time_window_atom,
-};
-
-static const struct rapl_defaults rapl_defaults_cht = {
-	.floor_freq_reg_addr = 0,
-	.check_unit = rapl_check_unit_atom,
-	.set_floor_freq = NULL,
-	.compute_time_window = rapl_compute_time_window_atom,
-};
-
-static const struct x86_cpu_id rapl_ids[] __initconst = {
-	INTEL_CPU_FAM6(SANDYBRIDGE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(SANDYBRIDGE_X,		rapl_defaults_core),
-
-	INTEL_CPU_FAM6(IVYBRIDGE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(IVYBRIDGE_X,		rapl_defaults_core),
-
-	INTEL_CPU_FAM6(HASWELL_CORE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(HASWELL_ULT,		rapl_defaults_core),
-	INTEL_CPU_FAM6(HASWELL_GT3E,		rapl_defaults_core),
-	INTEL_CPU_FAM6(HASWELL_X,		rapl_defaults_hsw_server),
-
-	INTEL_CPU_FAM6(BROADWELL_CORE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(BROADWELL_GT3E,		rapl_defaults_core),
-	INTEL_CPU_FAM6(BROADWELL_XEON_D,	rapl_defaults_core),
-	INTEL_CPU_FAM6(BROADWELL_X,		rapl_defaults_hsw_server),
-
-	INTEL_CPU_FAM6(SKYLAKE_DESKTOP,		rapl_defaults_core),
-	INTEL_CPU_FAM6(SKYLAKE_MOBILE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(SKYLAKE_X,		rapl_defaults_hsw_server),
-	INTEL_CPU_FAM6(KABYLAKE_MOBILE,		rapl_defaults_core),
-	INTEL_CPU_FAM6(KABYLAKE_DESKTOP,	rapl_defaults_core),
-	INTEL_CPU_FAM6(CANNONLAKE_MOBILE,	rapl_defaults_core),
-	INTEL_CPU_FAM6(ICELAKE_MOBILE,		rapl_defaults_core),
-
-	INTEL_CPU_FAM6(ATOM_SILVERMONT,		rapl_defaults_byt),
-	INTEL_CPU_FAM6(ATOM_AIRMONT,		rapl_defaults_cht),
-	INTEL_CPU_FAM6(ATOM_SILVERMONT_MID,	rapl_defaults_tng),
-	INTEL_CPU_FAM6(ATOM_AIRMONT_MID,	rapl_defaults_ann),
-	INTEL_CPU_FAM6(ATOM_GOLDMONT,		rapl_defaults_core),
-	INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS,	rapl_defaults_core),
-	INTEL_CPU_FAM6(ATOM_GOLDMONT_X,		rapl_defaults_core),
-	INTEL_CPU_FAM6(ATOM_TREMONT_X,		rapl_defaults_core),
-
-	INTEL_CPU_FAM6(XEON_PHI_KNL,		rapl_defaults_hsw_server),
-	INTEL_CPU_FAM6(XEON_PHI_KNM,		rapl_defaults_hsw_server),
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
-
-/* Read once for all raw primitive data for domains */
-static void rapl_update_domain_data(struct rapl_package *rp)
-{
-	int dmn, prim;
-	u64 val;
-
-	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
-		pr_debug("update %s domain %s data\n", rp->name,
-			 rp->domains[dmn].name);
-		/* exclude non-raw primitives */
-		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
-			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
-						rpi[prim].unit, &val))
-				rp->domains[dmn].rdd.primitives[prim] =	val;
-		}
-	}
-
-}
-
-static int rapl_package_register_powercap(struct rapl_package *rp)
-{
-	struct rapl_domain *rd;
-	struct powercap_zone *power_zone = NULL;
-	int nr_pl, ret;
-
-	/* Update the domain data of the new package */
-	rapl_update_domain_data(rp);
-
-	/* first we register package domain as the parent zone*/
-	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
-		if (rd->id == RAPL_DOMAIN_PACKAGE) {
-			nr_pl = find_nr_power_limit(rd);
-			pr_debug("register package domain %s\n", rp->name);
-			power_zone = powercap_register_zone(&rd->power_zone,
-							rp->priv->control_type,
-							rp->name, NULL,
-							&zone_ops[rd->id],
-							nr_pl,
-							&constraint_ops);
-			if (IS_ERR(power_zone)) {
-				pr_debug("failed to register power zone %s\n",
-					rp->name);
-				return PTR_ERR(power_zone);
-			}
-			/* track parent zone in per package/socket data */
-			rp->power_zone = power_zone;
-			/* done, only one package domain per socket */
-			break;
-		}
-	}
-	if (!power_zone) {
-		pr_err("no package domain found, unknown topology!\n");
-		return -ENODEV;
-	}
-	/* now register domains as children of the socket/package*/
-	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
-		if (rd->id == RAPL_DOMAIN_PACKAGE)
-			continue;
-		/* number of power limits per domain varies */
-		nr_pl = find_nr_power_limit(rd);
-		power_zone = powercap_register_zone(&rd->power_zone,
-						rp->priv->control_type, rd->name,
-						rp->power_zone,
-						&zone_ops[rd->id], nr_pl,
-						&constraint_ops);
-
-		if (IS_ERR(power_zone)) {
-			pr_debug("failed to register power_zone, %s:%s\n",
-				rp->name, rd->name);
-			ret = PTR_ERR(power_zone);
-			goto err_cleanup;
-		}
-	}
-	return 0;
-
-err_cleanup:
-	/*
-	 * Clean up previously initialized domains within the package if we
-	 * failed after the first domain setup.
-	 */
-	while (--rd >= rp->domains) {
-		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
-		powercap_unregister_zone(rp->priv->control_type, &rd->power_zone);
-	}
-
-	return ret;
-}
-
-static int __init rapl_add_platform_domain(struct rapl_if_priv *priv)
-{
-	struct rapl_domain *rd;
-	struct powercap_zone *power_zone;
-	struct reg_action ra;
-	int ret;
-
-	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
-	ra.mask = ~0;
-	ret = priv->read_raw(0, &ra);
-	if (ret || !ra.value)
-		return -ENODEV;
-
-	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
-	ra.mask = ~0;
-	ret = priv->read_raw(0, &ra);
-	if (ret || !ra.value)
-		return -ENODEV;
-
-	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
-	if (!rd)
-		return -ENOMEM;
-
-	rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
-	rd->id = RAPL_DOMAIN_PLATFORM;
-	rd->regs[RAPL_DOMAIN_REG_LIMIT] = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
-	rd->regs[RAPL_DOMAIN_REG_STATUS] = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
-	rd->rpl[0].prim_id = PL1_ENABLE;
-	rd->rpl[0].name = pl1_name;
-	rd->rpl[1].prim_id = PL2_ENABLE;
-	rd->rpl[1].name = pl2_name;
-	rd->rp = rapl_find_package_domain(0, priv);
-
-	power_zone = powercap_register_zone(&rd->power_zone, priv->control_type,
-					    "psys", NULL,
-					    &zone_ops[RAPL_DOMAIN_PLATFORM],
-					    2, &constraint_ops);
-
-	if (IS_ERR(power_zone)) {
-		kfree(rd);
-		return PTR_ERR(power_zone);
-	}
-
-	priv->platform_rapl_domain = rd;
-
-	return 0;
-}
-
-static void rapl_remove_platform_domain(struct rapl_if_priv *priv)
-{
-	if (priv->platform_rapl_domain) {
-		powercap_unregister_zone(priv->control_type,
-			&priv->platform_rapl_domain->power_zone);
-		kfree(priv->platform_rapl_domain);
-	}
-}
-
-static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
-{
-	struct reg_action ra;
-
-	switch (domain) {
-	case RAPL_DOMAIN_PACKAGE:
-	case RAPL_DOMAIN_PP0:
-	case RAPL_DOMAIN_PP1:
-	case RAPL_DOMAIN_DRAM:
-		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
-		break;
-	case RAPL_DOMAIN_PLATFORM:
-		/* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
-		return -EINVAL;
-	default:
-		pr_err("invalid domain id %d\n", domain);
-		return -EINVAL;
-	}
-	/* make sure domain counters are available and contains non-zero
-	 * values, otherwise skip it.
-	 */
-
-	ra.mask = ~0;
-	if (rp->priv->read_raw(cpu, &ra) || !ra.value)
-		return -ENODEV;
-
-	return 0;
-}
-
-
-/*
- * Check if power limits are available. Two cases when they are not available:
- * 1. Locked by BIOS, in this case we still provide read-only access so that
- *    users can see what limit is set by the BIOS.
- * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
- *    exist at all. In this case, we do not show the contraints in powercap.
- *
- * Called after domains are detected and initialized.
- */
-static void rapl_detect_powerlimit(struct rapl_domain *rd)
-{
-	u64 val64;
-	int i;
-
-	/* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
-	if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
-		if (val64) {
-			pr_info("RAPL %s domain %s locked by BIOS\n",
-				rd->rp->name, rd->name);
-			rd->state |= DOMAIN_STATE_BIOS_LOCKED;
-		}
-	}
-	/* check if power limit MSRs exists, otherwise domain is monitoring only */
-	for (i = 0; i < NR_POWER_LIMITS; i++) {
-		int prim = rd->rpl[i].prim_id;
-		if (rapl_read_data_raw(rd, prim, false, &val64))
-			rd->rpl[i].name = NULL;
-	}
-}
-
-/* Detect active and valid domains for the given CPU, caller must
- * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
- */
-static int rapl_detect_domains(struct rapl_package *rp, int cpu)
-{
-	struct rapl_domain *rd;
-	int i;
-
-	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
-		/* use physical package id to read counters */
-		if (!rapl_check_domain(cpu, i, rp)) {
-			rp->domain_map |= 1 << i;
-			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
-		}
-	}
-	rp->nr_domains = bitmap_weight(&rp->domain_map,	RAPL_DOMAIN_MAX);
-	if (!rp->nr_domains) {
-		pr_debug("no valid rapl domains found in %s\n", rp->name);
-		return -ENODEV;
-	}
-	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
-
-	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
-			GFP_KERNEL);
-	if (!rp->domains)
-		return -ENOMEM;
-
-	rapl_init_domains(rp);
-
-	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
-		rapl_detect_powerlimit(rd);
-
-	return 0;
-}
-
-/* called from CPU hotplug notifier, hotplug lock held */
-static void rapl_remove_package(struct rapl_package *rp)
-{
-	struct rapl_domain *rd, *rd_package = NULL;
-
-	package_power_limit_irq_restore(rp);
-
-	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
-		rapl_write_data_raw(rd, PL1_ENABLE, 0);
-		rapl_write_data_raw(rd, PL1_CLAMP, 0);
-		if (find_nr_power_limit(rd) > 1) {
-			rapl_write_data_raw(rd, PL2_ENABLE, 0);
-			rapl_write_data_raw(rd, PL2_CLAMP, 0);
-		}
-		if (rd->id == RAPL_DOMAIN_PACKAGE) {
-			rd_package = rd;
-			continue;
-		}
-		pr_debug("remove package, undo power limit on %s: %s\n",
-			 rp->name, rd->name);
-		powercap_unregister_zone(rp->priv->control_type, &rd->power_zone);
-	}
-	/* do parent zone last */
-	powercap_unregister_zone(rp->priv->control_type, &rd_package->power_zone);
-	list_del(&rp->plist);
-	kfree(rp);
-}
-
-/* called from CPU hotplug notifier, hotplug lock held */
-static struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
-{
-	int id = topology_logical_die_id(cpu);
-	struct rapl_package *rp;
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	int ret;
-
-	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
-	if (!rp)
-		return ERR_PTR(-ENOMEM);
-
-	/* add the new package to the list */
-	rp->id = id;
-	rp->lead_cpu = cpu;
-	rp->priv = priv;
-
-	if (topology_max_die_per_package() > 1)
-		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
-			"package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
-	else
-		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
-			c->phys_proc_id);
-
-	/* check if the package contains valid domains */
-	if (rapl_detect_domains(rp, cpu) ||
-		rapl_defaults->check_unit(rp, cpu)) {
-		ret = -ENODEV;
-		goto err_free_package;
-	}
-	ret = rapl_package_register_powercap(rp);
-	if (!ret) {
-		INIT_LIST_HEAD(&rp->plist);
-		list_add(&rp->plist, &rapl_packages);
-		return rp;
-	}
-
-err_free_package:
-	kfree(rp->domains);
-	kfree(rp);
-	return ERR_PTR(ret);
-}
-
-/* Handles CPU hotplug on multi-socket systems.
- * If a CPU goes online as the first CPU of the physical package
- * we add the RAPL package to the system. Similarly, when the last
- * CPU of the package is removed, we remove the RAPL package and its
- * associated domains. Cooling devices are handled accordingly at
- * per-domain level.
- */
-static int rapl_cpu_online(unsigned int cpu)
-{
-	struct rapl_package *rp;
-
-	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
-	if (!rp) {
-		rp = rapl_add_package(cpu, &rapl_msr_priv);
-		if (IS_ERR(rp))
-			return PTR_ERR(rp);
-	}
-	cpumask_set_cpu(cpu, &rp->cpumask);
-	return 0;
-}
-
-static int rapl_cpu_down_prep(unsigned int cpu)
-{
-	struct rapl_package *rp;
-	int lead_cpu;
-
-	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
-	if (!rp)
-		return 0;
-
-	cpumask_clear_cpu(cpu, &rp->cpumask);
-	lead_cpu = cpumask_first(&rp->cpumask);
-	if (lead_cpu >= nr_cpu_ids)
-		rapl_remove_package(rp);
-	else if (rp->lead_cpu == cpu)
-		rp->lead_cpu = lead_cpu;
-	return 0;
-}
-
-static void power_limit_state_save(void)
-{
-	struct rapl_package *rp;
-	struct rapl_domain *rd;
-	int nr_pl, ret, i;
-
-	get_online_cpus();
-	list_for_each_entry(rp, &rapl_packages, plist) {
-		if (!rp->power_zone)
-			continue;
-		rd = power_zone_to_rapl_domain(rp->power_zone);
-		nr_pl = find_nr_power_limit(rd);
-		for (i = 0; i < nr_pl; i++) {
-			switch (rd->rpl[i].prim_id) {
-			case PL1_ENABLE:
-				ret = rapl_read_data_raw(rd,
-						POWER_LIMIT1,
-						true,
-						&rd->rpl[i].last_power_limit);
-				if (ret)
-					rd->rpl[i].last_power_limit = 0;
-				break;
-			case PL2_ENABLE:
-				ret = rapl_read_data_raw(rd,
-						POWER_LIMIT2,
-						true,
-						&rd->rpl[i].last_power_limit);
-				if (ret)
-					rd->rpl[i].last_power_limit = 0;
-				break;
-			}
-		}
-	}
-	put_online_cpus();
-}
-
-static void power_limit_state_restore(void)
-{
-	struct rapl_package *rp;
-	struct rapl_domain *rd;
-	int nr_pl, i;
-
-	get_online_cpus();
-	list_for_each_entry(rp, &rapl_packages, plist) {
-		if (!rp->power_zone)
-			continue;
-		rd = power_zone_to_rapl_domain(rp->power_zone);
-		nr_pl = find_nr_power_limit(rd);
-		for (i = 0; i < nr_pl; i++) {
-			switch (rd->rpl[i].prim_id) {
-			case PL1_ENABLE:
-				if (rd->rpl[i].last_power_limit)
-					rapl_write_data_raw(rd,
-						POWER_LIMIT1,
-						rd->rpl[i].last_power_limit);
-				break;
-			case PL2_ENABLE:
-				if (rd->rpl[i].last_power_limit)
-					rapl_write_data_raw(rd,
-						POWER_LIMIT2,
-						rd->rpl[i].last_power_limit);
-				break;
-			}
-		}
-	}
-	put_online_cpus();
-}
-
-static int rapl_pm_callback(struct notifier_block *nb,
-	unsigned long mode, void *_unused)
-{
-	switch (mode) {
-	case PM_SUSPEND_PREPARE:
-		power_limit_state_save();
-		break;
-	case PM_POST_SUSPEND:
-		power_limit_state_restore();
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block rapl_pm_notifier = {
-	.notifier_call = rapl_pm_callback,
-};
-
-static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
-{
-	if (rdmsrl_safe_on_cpu(cpu, ra->reg, &ra->value)) {
-		pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg, cpu);
-		return -EIO;
-	}
-	ra->value &= ra->mask;
-	return 0;
-}
-
-static void rapl_msr_update_func(void *info)
-{
-	struct reg_action *ra = info;
-	u64 val;
-
-	ra->err = rdmsrl_safe(ra->reg, &val);
-	if (ra->err)
-		return;
-
-	val &= ~ra->mask;
-	val |= ra->value;
-
-	ra->err = wrmsrl_safe(ra->reg, val);
-}
-
-
-static int rapl_msr_write_raw(int cpu, struct reg_action *ra)
-{
-	int ret;
-
-	ret = smp_call_function_single(cpu, rapl_msr_update_func, ra, 1);
-	if (WARN_ON_ONCE(ret))
-		return ret;
-
-	return ra->err;
-}
-
-static int __init rapl_init(void)
-{
-	const struct x86_cpu_id *id;
-	int ret;
-
-	id = x86_match_cpu(rapl_ids);
-	if (!id) {
-		pr_err("driver does not support CPU family %d model %d\n",
-			boot_cpu_data.x86, boot_cpu_data.x86_model);
-
-		return -ENODEV;
-	}
-
-	rapl_defaults = (struct rapl_defaults *)id->driver_data;
-
-	rapl_msr_priv.read_raw = rapl_msr_read_raw;
-	rapl_msr_priv.write_raw = rapl_msr_write_raw;
-
-	rapl_msr_priv.control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
-	if (IS_ERR(rapl_msr_priv.control_type)) {
-		pr_debug("failed to register powercap control_type.\n");
-		return PTR_ERR(rapl_msr_priv.control_type);
-	}
-
-	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online",
-				rapl_cpu_online, rapl_cpu_down_prep);
-	if (ret < 0)
-		goto err_unreg;
-	rapl_msr_priv.pcap_rapl_online = ret;
-
-	/* Don't bail out if PSys is not supported */
-	rapl_add_platform_domain(&rapl_msr_priv);
-
-	ret = register_pm_notifier(&rapl_pm_notifier);
-	if (ret)
-		goto err_unreg_all;
-
-	return 0;
-
-err_unreg_all:
-	cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
-
-err_unreg:
-	powercap_unregister_control_type(rapl_msr_priv.control_type);
-	return ret;
-}
-
-static void __exit rapl_exit(void)
-{
-	unregister_pm_notifier(&rapl_pm_notifier);
-	cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
-	rapl_remove_platform_domain(&rapl_msr_priv);
-	powercap_unregister_control_type(rapl_msr_priv.control_type);
-}
-
-module_init(rapl_init);
-module_exit(rapl_exit);
-
-MODULE_DESCRIPTION("Driver for Intel RAPL (Running Average Power Limit)");
-MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
new file mode 100644
index 000000000000..34a82531a7cf
--- /dev/null
+++ b/drivers/powercap/intel_rapl_common.c
@@ -0,0 +1,1469 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Common code for Intel Running Average Power Limit (RAPL) support.
+ * Copyright (c) 2019, Intel Corporation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/bitmap.h>
+#include <linux/delay.h>
+#include <linux/sysfs.h>
+#include <linux/cpu.h>
+#include <linux/powercap.h>
+#include <linux/suspend.h>
+#include <asm/iosf_mbi.h>
+#include <linux/intel_rapl.h>
+
+#include <linux/processor.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+/* Local defines */
+#define MSR_PLATFORM_POWER_LIMIT	0x0000065C
+
+/* bitmasks for RAPL MSRs, used by primitive access functions */
+#define ENERGY_STATUS_MASK      0xffffffff
+
+#define POWER_LIMIT1_MASK       0x7FFF
+#define POWER_LIMIT1_ENABLE     BIT(15)
+#define POWER_LIMIT1_CLAMP      BIT(16)
+
+#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
+#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
+#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
+#define POWER_PACKAGE_LOCK      BIT_ULL(63)
+#define POWER_PP_LOCK           BIT(31)
+
+#define TIME_WINDOW1_MASK       (0x7FULL<<17)
+#define TIME_WINDOW2_MASK       (0x7FULL<<49)
+
+#define POWER_UNIT_OFFSET	0
+#define POWER_UNIT_MASK		0x0F
+
+#define ENERGY_UNIT_OFFSET	0x08
+#define ENERGY_UNIT_MASK	0x1F00
+
+#define TIME_UNIT_OFFSET	0x10
+#define TIME_UNIT_MASK		0xF0000
+
+#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
+#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
+#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
+#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff
+
+#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
+#define PP_POLICY_MASK         0x1F
+
+/* Non HW constants */
+#define RAPL_PRIMITIVE_DERIVED       BIT(1)	/* not from raw data */
+#define RAPL_PRIMITIVE_DUMMY         BIT(2)
+
+#define TIME_WINDOW_MAX_MSEC 40000
+#define TIME_WINDOW_MIN_MSEC 250
+#define ENERGY_UNIT_SCALE    1000	/* scale from driver unit to powercap unit */
+enum unit_type {
+	ARBITRARY_UNIT,		/* no translation */
+	POWER_UNIT,
+	ENERGY_UNIT,
+	TIME_UNIT,
+};
+
+/* per domain data, some are optional */
+#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
+
+#define	DOMAIN_STATE_INACTIVE           BIT(0)
+#define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
+#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
+
+static const char pl1_name[] = "long_term";
+static const char pl2_name[] = "short_term";
+
+#define power_zone_to_rapl_domain(_zone) \
+	container_of(_zone, struct rapl_domain, power_zone)
+
+struct rapl_defaults {
+	u8 floor_freq_reg_addr;
+	int (*check_unit)(struct rapl_package *rp, int cpu);
+	void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
+	u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
+				    bool to_raw);
+	unsigned int dram_domain_energy_unit;
+};
+static struct rapl_defaults *rapl_defaults;
+
+/* Sideband MBI registers */
+#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
+#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)
+
+#define PACKAGE_PLN_INT_SAVED   BIT(0)
+#define MAX_PRIM_NAME (32)
+
+/* per domain data. used to describe individual knobs such that access function
+ * can be consolidated into one instead of many inline functions.
+ */
+struct rapl_primitive_info {
+	const char *name;
+	u64 mask;
+	int shift;
+	enum rapl_domain_reg_id id;
+	enum unit_type unit;
+	u32 flag;
+};
+
+#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) {	\
+		.name = #p,			\
+		.mask = m,			\
+		.shift = s,			\
+		.id = i,			\
+		.unit = u,			\
+		.flag = f			\
+	}
+
+static void rapl_init_domains(struct rapl_package *rp);
+static int rapl_read_data_raw(struct rapl_domain *rd,
+			      enum rapl_primitives prim,
+			      bool xlate, u64 *data);
+static int rapl_write_data_raw(struct rapl_domain *rd,
+			       enum rapl_primitives prim,
+			       unsigned long long value);
+static u64 rapl_unit_xlate(struct rapl_domain *rd,
+			   enum unit_type type, u64 value, int to_raw);
+static void package_power_limit_irq_save(struct rapl_package *rp);
+static int rapl_init_core(void);
+static void rapl_remove_core(void);
+
+static LIST_HEAD(rapl_packages);	/* guarded by CPU hotplug lock */
+
+static const char *const rapl_domain_names[] = {
+	"package",
+	"core",
+	"uncore",
+	"dram",
+	"psys",
+};
+
+static int get_energy_counter(struct powercap_zone *power_zone,
+			      u64 *energy_raw)
+{
+	struct rapl_domain *rd;
+	u64 energy_now;
+
+	/* prevent CPU hotplug, make sure the RAPL domain does not go
+	 * away while reading the counter.
+	 */
+	get_online_cpus();
+	rd = power_zone_to_rapl_domain(power_zone);
+
+	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
+		*energy_raw = energy_now;
+		put_online_cpus();
+
+		return 0;
+	}
+	put_online_cpus();
+
+	return -EIO;
+}
+
+static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
+{
+	struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
+
+	*energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
+	return 0;
+}
+
+static int release_zone(struct powercap_zone *power_zone)
+{
+	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
+	struct rapl_package *rp = rd->rp;
+
+	/* package zone is the last zone of a package, we can free
+	 * memory here since all children has been unregistered.
+	 */
+	if (rd->id == RAPL_DOMAIN_PACKAGE) {
+		kfree(rd);
+		rp->domains = NULL;
+	}
+
+	return 0;
+
+}
+
+static int find_nr_power_limit(struct rapl_domain *rd)
+{
+	int i, nr_pl = 0;
+
+	for (i = 0; i < NR_POWER_LIMITS; i++) {
+		if (rd->rpl[i].name)
+			nr_pl++;
+	}
+
+	return nr_pl;
+}
+
+static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
+{
+	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
+
+	if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
+		return -EACCES;
+
+	get_online_cpus();
+	rapl_write_data_raw(rd, PL1_ENABLE, mode);
+	if (rapl_defaults->set_floor_freq)
+		rapl_defaults->set_floor_freq(rd, mode);
+	put_online_cpus();
+
+	return 0;
+}
+
+static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
+{
+	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
+	u64 val;
+
+	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
+		*mode = false;
+		return 0;
+	}
+	get_online_cpus();
+	if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
+		put_online_cpus();
+		return -EIO;
+	}
+	*mode = val;
+	put_online_cpus();
+
+	return 0;
+}
+
+/* per RAPL domain ops, in the order of rapl_domain_type */
+static const struct powercap_zone_ops zone_ops[] = {
+	/* RAPL_DOMAIN_PACKAGE */
+	{
+	 .get_energy_uj = get_energy_counter,
+	 .get_max_energy_range_uj = get_max_energy_counter,
+	 .release = release_zone,
+	 .set_enable = set_domain_enable,
+	 .get_enable = get_domain_enable,
+	 },
+	/* RAPL_DOMAIN_PP0 */
+	{
+	 .get_energy_uj = get_energy_counter,
+	 .get_max_energy_range_uj = get_max_energy_counter,
+	 .release = release_zone,
+	 .set_enable = set_domain_enable,
+	 .get_enable = get_domain_enable,
+	 },
+	/* RAPL_DOMAIN_PP1 */
+	{
+	 .get_energy_uj = get_energy_counter,
+	 .get_max_energy_range_uj = get_max_energy_counter,
+	 .release = release_zone,
+	 .set_enable = set_domain_enable,
+	 .get_enable = get_domain_enable,
+	 },
+	/* RAPL_DOMAIN_DRAM */
+	{
+	 .get_energy_uj = get_energy_counter,
+	 .get_max_energy_range_uj = get_max_energy_counter,
+	 .release = release_zone,
+	 .set_enable = set_domain_enable,
+	 .get_enable = get_domain_enable,
+	 },
+	/* RAPL_DOMAIN_PLATFORM */
+	{
+	 .get_energy_uj = get_energy_counter,
+	 .get_max_energy_range_uj = get_max_energy_counter,
+	 .release = release_zone,
+	 .set_enable = set_domain_enable,
+	 .get_enable = get_domain_enable,
+	 },
+};
+
+/*
+ * Constraint index used by powercap can be different than power limit (PL)
+ * index in that some  PLs maybe missing due to non-existent MSRs. So we
+ * need to convert here by finding the valid PLs only (name populated).
+ */
+static int contraint_to_pl(struct rapl_domain *rd, int cid)
+{
+	int i, j;
+
+	for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
+		if ((rd->rpl[i].name) && j++ == cid) {
+			pr_debug("%s: index %d\n", __func__, i);
+			return i;
+		}
+	}
+	pr_err("Cannot find matching power limit for constraint %d\n", cid);
+
+	return -EINVAL;
+}
+
+static int set_power_limit(struct powercap_zone *power_zone, int cid,
+			   u64 power_limit)
+{
+	struct rapl_domain *rd;
+	struct rapl_package *rp;
+	int ret = 0;
+	int id;
+
+	get_online_cpus();
+	rd = power_zone_to_rapl_domain(power_zone);
+	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto set_exit;
+	}
+
+	rp = rd->rp;
+
+	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
+		dev_warn(&power_zone->dev,
+			 "%s locked by BIOS, monitoring only\n", rd->name);
+		ret = -EACCES;
+		goto set_exit;
+	}
+
+	switch (rd->rpl[id].prim_id) {
+	case PL1_ENABLE:
+		rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
+		break;
+	case PL2_ENABLE:
+		rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	if (!ret)
+		package_power_limit_irq_save(rp);
+set_exit:
+	put_online_cpus();
+	return ret;
+}
+
+static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
+				   u64 *data)
+{
+	struct rapl_domain *rd;
+	u64 val;
+	int prim;
+	int ret = 0;
+	int id;
+
+	get_online_cpus();
+	rd = power_zone_to_rapl_domain(power_zone);
+	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto get_exit;
+	}
+
+	switch (rd->rpl[id].prim_id) {
+	case PL1_ENABLE:
+		prim = POWER_LIMIT1;
+		break;
+	case PL2_ENABLE:
+		prim = POWER_LIMIT2;
+		break;
+	default:
+		put_online_cpus();
+		return -EINVAL;
+	}
+	if (rapl_read_data_raw(rd, prim, true, &val))
+		ret = -EIO;
+	else
+		*data = val;
+
+get_exit:
+	put_online_cpus();
+
+	return ret;
+}
+
+static int set_time_window(struct powercap_zone *power_zone, int cid,
+			   u64 window)
+{
+	struct rapl_domain *rd;
+	int ret = 0;
+	int id;
+
+	get_online_cpus();
+	rd = power_zone_to_rapl_domain(power_zone);
+	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto set_time_exit;
+	}
+
+	switch (rd->rpl[id].prim_id) {
+	case PL1_ENABLE:
+		rapl_write_data_raw(rd, TIME_WINDOW1, window);
+		break;
+	case PL2_ENABLE:
+		rapl_write_data_raw(rd, TIME_WINDOW2, window);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+set_time_exit:
+	put_online_cpus();
+	return ret;
+}
+
+static int get_time_window(struct powercap_zone *power_zone, int cid,
+			   u64 *data)
+{
+	struct rapl_domain *rd;
+	u64 val;
+	int ret = 0;
+	int id;
+
+	get_online_cpus();
+	rd = power_zone_to_rapl_domain(power_zone);
+	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto get_time_exit;
+	}
+
+	switch (rd->rpl[id].prim_id) {
+	case PL1_ENABLE:
+		ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
+		break;
+	case PL2_ENABLE:
+		ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
+		break;
+	default:
+		put_online_cpus();
+		return -EINVAL;
+	}
+	if (!ret)
+		*data = val;
+
+get_time_exit:
+	put_online_cpus();
+
+	return ret;
+}
+
+static const char *get_constraint_name(struct powercap_zone *power_zone,
+				       int cid)
+{
+	struct rapl_domain *rd;
+	int id;
+
+	rd = power_zone_to_rapl_domain(power_zone);
+	id = contraint_to_pl(rd, cid);
+	if (id >= 0)
+		return rd->rpl[id].name;
+
+	return NULL;
+}
+
+static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
+{
+	struct rapl_domain *rd;
+	u64 val;
+	int prim;
+	int ret = 0;
+
+	get_online_cpus();
+	rd = power_zone_to_rapl_domain(power_zone);
+	switch (rd->rpl[id].prim_id) {
+	case PL1_ENABLE:
+		prim = THERMAL_SPEC_POWER;
+		break;
+	case PL2_ENABLE:
+		prim = MAX_POWER;
+		break;
+	default:
+		put_online_cpus();
+		return -EINVAL;
+	}
+	if (rapl_read_data_raw(rd, prim, true, &val))
+		ret = -EIO;
+	else
+		*data = val;
+
+	put_online_cpus();
+
+	return ret;
+}
+
+static const struct powercap_zone_constraint_ops constraint_ops = {
+	.set_power_limit_uw = set_power_limit,
+	.get_power_limit_uw = get_current_power_limit,
+	.set_time_window_us = set_time_window,
+	.get_time_window_us = get_time_window,
+	.get_max_power_uw = get_max_power,
+	.get_name = get_constraint_name,
+};
+
+/* called after domain detection and package level data are set */
+static void rapl_init_domains(struct rapl_package *rp)
+{
+	int i;
+	struct rapl_domain *rd = rp->domains;
+
+	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
+		unsigned int mask = rp->domain_map & (1 << i);
+
+		rd->regs[RAPL_DOMAIN_REG_LIMIT] =
+		    rp->priv->regs[i][RAPL_DOMAIN_REG_LIMIT];
+		rd->regs[RAPL_DOMAIN_REG_STATUS] =
+		    rp->priv->regs[i][RAPL_DOMAIN_REG_STATUS];
+		rd->regs[RAPL_DOMAIN_REG_PERF] =
+		    rp->priv->regs[i][RAPL_DOMAIN_REG_PERF];
+		rd->regs[RAPL_DOMAIN_REG_POLICY] =
+		    rp->priv->regs[i][RAPL_DOMAIN_REG_POLICY];
+		rd->regs[RAPL_DOMAIN_REG_INFO] =
+		    rp->priv->regs[i][RAPL_DOMAIN_REG_INFO];
+
+		switch (mask) {
+		case BIT(RAPL_DOMAIN_PACKAGE):
+			rd->name = rapl_domain_names[RAPL_DOMAIN_PACKAGE];
+			rd->id = RAPL_DOMAIN_PACKAGE;
+			rd->rpl[0].prim_id = PL1_ENABLE;
+			rd->rpl[0].name = pl1_name;
+			rd->rpl[1].prim_id = PL2_ENABLE;
+			rd->rpl[1].name = pl2_name;
+			break;
+		case BIT(RAPL_DOMAIN_PP0):
+			rd->name = rapl_domain_names[RAPL_DOMAIN_PP0];
+			rd->id = RAPL_DOMAIN_PP0;
+			rd->rpl[0].prim_id = PL1_ENABLE;
+			rd->rpl[0].name = pl1_name;
+			break;
+		case BIT(RAPL_DOMAIN_PP1):
+			rd->name = rapl_domain_names[RAPL_DOMAIN_PP1];
+			rd->id = RAPL_DOMAIN_PP1;
+			rd->rpl[0].prim_id = PL1_ENABLE;
+			rd->rpl[0].name = pl1_name;
+			break;
+		case BIT(RAPL_DOMAIN_DRAM):
+			rd->name = rapl_domain_names[RAPL_DOMAIN_DRAM];
+			rd->id = RAPL_DOMAIN_DRAM;
+			rd->rpl[0].prim_id = PL1_ENABLE;
+			rd->rpl[0].name = pl1_name;
+			rd->domain_energy_unit =
+			    rapl_defaults->dram_domain_energy_unit;
+			if (rd->domain_energy_unit)
+				pr_info("DRAM domain energy unit %dpj\n",
+					rd->domain_energy_unit);
+			break;
+		}
+		if (mask) {
+			rd->rp = rp;
+			rd++;
+		}
+	}
+}
+
+static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
+			   u64 value, int to_raw)
+{
+	u64 units = 1;
+	struct rapl_package *rp = rd->rp;
+	u64 scale = 1;
+
+	switch (type) {
+	case POWER_UNIT:
+		units = rp->power_unit;
+		break;
+	case ENERGY_UNIT:
+		scale = ENERGY_UNIT_SCALE;
+		/* per domain unit takes precedence */
+		if (rd->domain_energy_unit)
+			units = rd->domain_energy_unit;
+		else
+			units = rp->energy_unit;
+		break;
+	case TIME_UNIT:
+		return rapl_defaults->compute_time_window(rp, value, to_raw);
+	case ARBITRARY_UNIT:
+	default:
+		return value;
+	};
+
+	if (to_raw)
+		return div64_u64(value, units) * scale;
+
+	value *= units;
+
+	return div64_u64(value, scale);
+}
+
+/* in the order of enum rapl_primitives */
+static struct rapl_primitive_info rpi[] = {
+	/* name, mask, shift, msr index, unit divisor */
+	PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
+			    RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
+			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
+			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_PP_LOCK, 31,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
+			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
+	PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
+			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
+	PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
+			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
+	PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
+			    0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
+			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
+			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
+	PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
+			    RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
+	PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
+			    RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
+	PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
+			    RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
+	/* non-hardware */
+	PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
+			    RAPL_PRIMITIVE_DERIVED),
+	{NULL, 0, 0, 0},
+};
+
+/* Read primitive data based on its related struct rapl_primitive_info.
+ * if xlate flag is set, return translated data based on data units, i.e.
+ * time, energy, and power.
+ * RAPL MSRs are non-architectual and are laid out not consistently across
+ * domains. Here we use primitive info to allow writing consolidated access
+ * functions.
+ * For a given primitive, it is processed by MSR mask and shift. Unit conversion
+ * is pre-assigned based on RAPL unit MSRs read at init time.
+ * 63-------------------------- 31--------------------------- 0
+ * |                           xxxxx (mask)                   |
+ * |                                |<- shift ----------------|
+ * 63-------------------------- 31--------------------------- 0
+ */
+static int rapl_read_data_raw(struct rapl_domain *rd,
+			      enum rapl_primitives prim, bool xlate, u64 *data)
+{
+	u64 value;
+	struct rapl_primitive_info *rp = &rpi[prim];
+	struct reg_action ra;
+	int cpu;
+
+	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
+		return -EINVAL;
+
+	ra.reg = rd->regs[rp->id];
+	if (!ra.reg)
+		return -EINVAL;
+
+	cpu = rd->rp->lead_cpu;
+
+	/* special-case package domain, which uses a different bit */
+	if (prim == FW_LOCK && rd->id == RAPL_DOMAIN_PACKAGE) {
+		rp->mask = POWER_PACKAGE_LOCK;
+		rp->shift = 63;
+	}
+	/* non-hardware data are collected by the polling thread */
+	if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
+		*data = rd->rdd.primitives[prim];
+		return 0;
+	}
+
+	ra.mask = rp->mask;
+
+	if (rd->rp->priv->read_raw(cpu, &ra)) {
+		pr_debug("failed to read reg 0x%x on cpu %d\n", ra.reg, cpu);
+		return -EIO;
+	}
+
+	value = ra.value >> rp->shift;
+
+	if (xlate)
+		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
+	else
+		*data = value;
+
+	return 0;
+}
+
+/* Similar use of primitive info in the read counterpart */
+static int rapl_write_data_raw(struct rapl_domain *rd,
+			       enum rapl_primitives prim,
+			       unsigned long long value)
+{
+	struct rapl_primitive_info *rp = &rpi[prim];
+	int cpu;
+	u64 bits;
+	struct reg_action ra;
+	int ret;
+
+	cpu = rd->rp->lead_cpu;
+	bits = rapl_unit_xlate(rd, rp->unit, value, 1);
+	bits <<= rp->shift;
+	bits &= rp->mask;
+
+	memset(&ra, 0, sizeof(ra));
+
+	ra.reg = rd->regs[rp->id];
+	ra.mask = rp->mask;
+	ra.value = bits;
+
+	ret = rd->rp->priv->write_raw(cpu, &ra);
+
+	return ret;
+}
+
+/*
+ * Raw RAPL data stored in MSRs are in certain scales. We need to
+ * convert them into standard units based on the units reported in
+ * the RAPL unit MSRs. This is specific to CPUs as the method to
+ * calculate units differ on different CPUs.
+ * We convert the units to below format based on CPUs.
+ * i.e.
+ * energy unit: picoJoules  : Represented in picoJoules by default
+ * power unit : microWatts  : Represented in milliWatts by default
+ * time unit  : microseconds: Represented in seconds by default
+ */
+static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
+{
+	struct reg_action ra;
+	u32 value;
+
+	ra.reg = rp->priv->reg_unit;
+	ra.mask = ~0;
+	if (rp->priv->read_raw(cpu, &ra)) {
+		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
+		       rp->priv->reg_unit, cpu);
+		return -ENODEV;
+	}
+
+	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
+	rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
+
+	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
+	rp->power_unit = 1000000 / (1 << value);
+
+	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
+	rp->time_unit = 1000000 / (1 << value);
+
+	pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
+		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
+
+	return 0;
+}
+
+static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
+{
+	struct reg_action ra;
+	u32 value;
+
+	ra.reg = rp->priv->reg_unit;
+	ra.mask = ~0;
+	if (rp->priv->read_raw(cpu, &ra)) {
+		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
+		       rp->priv->reg_unit, cpu);
+		return -ENODEV;
+	}
+
+	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
+	rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
+
+	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
+	rp->power_unit = (1 << value) * 1000;
+
+	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
+	rp->time_unit = 1000000 / (1 << value);
+
+	pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
+		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
+
+	return 0;
+}
+
+static void power_limit_irq_save_cpu(void *info)
+{
+	u32 l, h = 0;
+	struct rapl_package *rp = (struct rapl_package *)info;
+
+	/* save the state of PLN irq mask bit before disabling it */
+	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
+	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
+		rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
+		rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
+	}
+	l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
+	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+}
+
+/* REVISIT:
+ * When package power limit is set artificially low by RAPL, LVT
+ * thermal interrupt for package power limit should be ignored
+ * since we are not really exceeding the real limit. The intention
+ * is to avoid excessive interrupts while we are trying to save power.
+ * A useful feature might be routing the package_power_limit interrupt
+ * to userspace via eventfd. once we have a usecase, this is simple
+ * to do by adding an atomic notifier.
+ */
+
+static void package_power_limit_irq_save(struct rapl_package *rp)
+{
+	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
+		return;
+
+	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
+}
+
+/*
+ * Restore per package power limit interrupt enable state. Called from cpu
+ * hotplug code on package removal.
+ */
+static void package_power_limit_irq_restore(struct rapl_package *rp)
+{
+	u32 l, h;
+
+	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
+		return;
+
+	/* irq enable state not saved, nothing to restore */
+	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
+		return;
+
+	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
+
+	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
+		l |= PACKAGE_THERM_INT_PLN_ENABLE;
+	else
+		l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
+
+	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+}
+
+static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
+{
+	int nr_powerlimit = find_nr_power_limit(rd);
+
+	/* always enable clamp such that p-state can go below OS requested
+	 * range. power capping priority over guranteed frequency.
+	 */
+	rapl_write_data_raw(rd, PL1_CLAMP, mode);
+
+	/* some domains have pl2 */
+	if (nr_powerlimit > 1) {
+		rapl_write_data_raw(rd, PL2_ENABLE, mode);
+		rapl_write_data_raw(rd, PL2_CLAMP, mode);
+	}
+}
+
+static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
+{
+	static u32 power_ctrl_orig_val;
+	u32 mdata;
+
+	if (!rapl_defaults->floor_freq_reg_addr) {
+		pr_err("Invalid floor frequency config register\n");
+		return;
+	}
+
+	if (!power_ctrl_orig_val)
+		iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
+			      rapl_defaults->floor_freq_reg_addr,
+			      &power_ctrl_orig_val);
+	mdata = power_ctrl_orig_val;
+	if (enable) {
+		mdata &= ~(0x7f << 8);
+		mdata |= 1 << 8;
+	}
+	iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
+		       rapl_defaults->floor_freq_reg_addr, mdata);
+}
+
+static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
+					 bool to_raw)
+{
+	u64 f, y;		/* fraction and exp. used for time unit */
+
+	/*
+	 * Special processing based on 2^Y*(1+F/4), refer
+	 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
+	 */
+	if (!to_raw) {
+		f = (value & 0x60) >> 5;
+		y = value & 0x1f;
+		value = (1 << y) * (4 + f) * rp->time_unit / 4;
+	} else {
+		do_div(value, rp->time_unit);
+		y = ilog2(value);
+		f = div64_u64(4 * (value - (1 << y)), 1 << y);
+		value = (y & 0x1f) | ((f & 0x3) << 5);
+	}
+	return value;
+}
+
+static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
+					 bool to_raw)
+{
+	/*
+	 * Atom time unit encoding is straight forward val * time_unit,
+	 * where time_unit is default to 1 sec. Never 0.
+	 */
+	if (!to_raw)
+		return (value) ? value *= rp->time_unit : rp->time_unit;
+
+	value = div64_u64(value, rp->time_unit);
+
+	return value;
+}
+
+static const struct rapl_defaults rapl_defaults_core = {
+	.floor_freq_reg_addr = 0,
+	.check_unit = rapl_check_unit_core,
+	.set_floor_freq = set_floor_freq_default,
+	.compute_time_window = rapl_compute_time_window_core,
+};
+
+static const struct rapl_defaults rapl_defaults_hsw_server = {
+	.check_unit = rapl_check_unit_core,
+	.set_floor_freq = set_floor_freq_default,
+	.compute_time_window = rapl_compute_time_window_core,
+	.dram_domain_energy_unit = 15300,
+};
+
+static const struct rapl_defaults rapl_defaults_byt = {
+	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
+	.check_unit = rapl_check_unit_atom,
+	.set_floor_freq = set_floor_freq_atom,
+	.compute_time_window = rapl_compute_time_window_atom,
+};
+
+static const struct rapl_defaults rapl_defaults_tng = {
+	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
+	.check_unit = rapl_check_unit_atom,
+	.set_floor_freq = set_floor_freq_atom,
+	.compute_time_window = rapl_compute_time_window_atom,
+};
+
+static const struct rapl_defaults rapl_defaults_ann = {
+	.floor_freq_reg_addr = 0,
+	.check_unit = rapl_check_unit_atom,
+	.set_floor_freq = NULL,
+	.compute_time_window = rapl_compute_time_window_atom,
+};
+
+static const struct rapl_defaults rapl_defaults_cht = {
+	.floor_freq_reg_addr = 0,
+	.check_unit = rapl_check_unit_atom,
+	.set_floor_freq = NULL,
+	.compute_time_window = rapl_compute_time_window_atom,
+};
+
+static const struct x86_cpu_id rapl_ids[] __initconst = {
+	INTEL_CPU_FAM6(SANDYBRIDGE, rapl_defaults_core),
+	INTEL_CPU_FAM6(SANDYBRIDGE_X, rapl_defaults_core),
+
+	INTEL_CPU_FAM6(IVYBRIDGE, rapl_defaults_core),
+	INTEL_CPU_FAM6(IVYBRIDGE_X, rapl_defaults_core),
+
+	INTEL_CPU_FAM6(HASWELL_CORE, rapl_defaults_core),
+	INTEL_CPU_FAM6(HASWELL_ULT, rapl_defaults_core),
+	INTEL_CPU_FAM6(HASWELL_GT3E, rapl_defaults_core),
+	INTEL_CPU_FAM6(HASWELL_X, rapl_defaults_hsw_server),
+
+	INTEL_CPU_FAM6(BROADWELL_CORE, rapl_defaults_core),
+	INTEL_CPU_FAM6(BROADWELL_GT3E, rapl_defaults_core),
+	INTEL_CPU_FAM6(BROADWELL_XEON_D, rapl_defaults_core),
+	INTEL_CPU_FAM6(BROADWELL_X, rapl_defaults_hsw_server),
+
+	INTEL_CPU_FAM6(SKYLAKE_DESKTOP, rapl_defaults_core),
+	INTEL_CPU_FAM6(SKYLAKE_MOBILE, rapl_defaults_core),
+	INTEL_CPU_FAM6(SKYLAKE_X, rapl_defaults_hsw_server),
+	INTEL_CPU_FAM6(KABYLAKE_MOBILE, rapl_defaults_core),
+	INTEL_CPU_FAM6(KABYLAKE_DESKTOP, rapl_defaults_core),
+	INTEL_CPU_FAM6(CANNONLAKE_MOBILE, rapl_defaults_core),
+	INTEL_CPU_FAM6(ICELAKE_MOBILE, rapl_defaults_core),
+
+	INTEL_CPU_FAM6(ATOM_SILVERMONT, rapl_defaults_byt),
+	INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht),
+	INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, rapl_defaults_tng),
+	INTEL_CPU_FAM6(ATOM_AIRMONT_MID, rapl_defaults_ann),
+	INTEL_CPU_FAM6(ATOM_GOLDMONT, rapl_defaults_core),
+	INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, rapl_defaults_core),
+	INTEL_CPU_FAM6(ATOM_GOLDMONT_X, rapl_defaults_core),
+	INTEL_CPU_FAM6(ATOM_TREMONT_X, rapl_defaults_core),
+
+	INTEL_CPU_FAM6(XEON_PHI_KNL, rapl_defaults_hsw_server),
+	INTEL_CPU_FAM6(XEON_PHI_KNM, rapl_defaults_hsw_server),
+	{}
+};
+
+MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
+
+/* Read once for all raw primitive data for domains */
+static void rapl_update_domain_data(struct rapl_package *rp)
+{
+	int dmn, prim;
+	u64 val;
+
+	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
+		pr_debug("update %s domain %s data\n", rp->name,
+			 rp->domains[dmn].name);
+		/* exclude non-raw primitives */
+		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
+			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
+						rpi[prim].unit, &val))
+				rp->domains[dmn].rdd.primitives[prim] = val;
+		}
+	}
+
+}
+
+static int rapl_package_register_powercap(struct rapl_package *rp)
+{
+	struct rapl_domain *rd;
+	struct powercap_zone *power_zone = NULL;
+	int nr_pl, ret;
+
+	/* Update the domain data of the new package */
+	rapl_update_domain_data(rp);
+
+	/* first we register package domain as the parent zone */
+	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
+		if (rd->id == RAPL_DOMAIN_PACKAGE) {
+			nr_pl = find_nr_power_limit(rd);
+			pr_debug("register package domain %s\n", rp->name);
+			power_zone = powercap_register_zone(&rd->power_zone,
+					    rp->priv->control_type, rp->name,
+					    NULL, &zone_ops[rd->id], nr_pl,
+					    &constraint_ops);
+			if (IS_ERR(power_zone)) {
+				pr_debug("failed to register power zone %s\n",
+					 rp->name);
+				return PTR_ERR(power_zone);
+			}
+			/* track parent zone in per package/socket data */
+			rp->power_zone = power_zone;
+			/* done, only one package domain per socket */
+			break;
+		}
+	}
+	if (!power_zone) {
+		pr_err("no package domain found, unknown topology!\n");
+		return -ENODEV;
+	}
+	/* now register domains as children of the socket/package */
+	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
+		if (rd->id == RAPL_DOMAIN_PACKAGE)
+			continue;
+		/* number of power limits per domain varies */
+		nr_pl = find_nr_power_limit(rd);
+		power_zone = powercap_register_zone(&rd->power_zone,
+						    rp->priv->control_type,
+						    rd->name, rp->power_zone,
+						    &zone_ops[rd->id], nr_pl,
+						    &constraint_ops);
+
+		if (IS_ERR(power_zone)) {
+			pr_debug("failed to register power_zone, %s:%s\n",
+				 rp->name, rd->name);
+			ret = PTR_ERR(power_zone);
+			goto err_cleanup;
+		}
+	}
+	return 0;
+
+err_cleanup:
+	/*
+	 * Clean up previously initialized domains within the package if we
+	 * failed after the first domain setup.
+	 */
+	while (--rd >= rp->domains) {
+		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
+		powercap_unregister_zone(rp->priv->control_type,
+					 &rd->power_zone);
+	}
+
+	return ret;
+}
+
+int rapl_add_platform_domain(struct rapl_if_priv *priv)
+{
+	struct rapl_domain *rd;
+	struct powercap_zone *power_zone;
+	struct reg_action ra;
+	int ret;
+
+	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
+	ra.mask = ~0;
+	ret = priv->read_raw(0, &ra);
+	if (ret || !ra.value)
+		return -ENODEV;
+
+	ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
+	ra.mask = ~0;
+	ret = priv->read_raw(0, &ra);
+	if (ret || !ra.value)
+		return -ENODEV;
+
+	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return -ENOMEM;
+
+	rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
+	rd->id = RAPL_DOMAIN_PLATFORM;
+	rd->regs[RAPL_DOMAIN_REG_LIMIT] =
+	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
+	rd->regs[RAPL_DOMAIN_REG_STATUS] =
+	    priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
+	rd->rpl[0].prim_id = PL1_ENABLE;
+	rd->rpl[0].name = pl1_name;
+	rd->rpl[1].prim_id = PL2_ENABLE;
+	rd->rpl[1].name = pl2_name;
+	rd->rp = rapl_find_package_domain(0, priv);
+
+	power_zone = powercap_register_zone(&rd->power_zone, priv->control_type,
+					    "psys", NULL,
+					    &zone_ops[RAPL_DOMAIN_PLATFORM],
+					    2, &constraint_ops);
+
+	if (IS_ERR(power_zone)) {
+		kfree(rd);
+		return PTR_ERR(power_zone);
+	}
+
+	priv->platform_rapl_domain = rd;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rapl_add_platform_domain);
+
+void rapl_remove_platform_domain(struct rapl_if_priv *priv)
+{
+	if (priv->platform_rapl_domain) {
+		powercap_unregister_zone(priv->control_type,
+				 &priv->platform_rapl_domain->power_zone);
+		kfree(priv->platform_rapl_domain);
+	}
+}
+EXPORT_SYMBOL_GPL(rapl_remove_platform_domain);
+
+static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
+{
+	struct reg_action ra;
+
+	switch (domain) {
+	case RAPL_DOMAIN_PACKAGE:
+	case RAPL_DOMAIN_PP0:
+	case RAPL_DOMAIN_PP1:
+	case RAPL_DOMAIN_DRAM:
+		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
+		break;
+	case RAPL_DOMAIN_PLATFORM:
+		/* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
+		return -EINVAL;
+	default:
+		pr_err("invalid domain id %d\n", domain);
+		return -EINVAL;
+	}
+	/* make sure domain counters are available and contains non-zero
+	 * values, otherwise skip it.
+	 */
+
+	ra.mask = ~0;
+	if (rp->priv->read_raw(cpu, &ra) || !ra.value)
+		return -ENODEV;
+
+	return 0;
+}
+
+/*
+ * Check if power limits are available. Two cases when they are not available:
+ * 1. Locked by BIOS, in this case we still provide read-only access so that
+ *    users can see what limit is set by the BIOS.
+ * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
+ *    exist at all. In this case, we do not show the constraints in powercap.
+ *
+ * Called after domains are detected and initialized.
+ */
+static void rapl_detect_powerlimit(struct rapl_domain *rd)
+{
+	u64 val64;
+	int i;
+
+	/* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
+	if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
+		if (val64) {
+			pr_info("RAPL %s domain %s locked by BIOS\n",
+				rd->rp->name, rd->name);
+			rd->state |= DOMAIN_STATE_BIOS_LOCKED;
+		}
+	}
+	/* check if power limit MSR exists, otherwise domain is monitoring only */
+	for (i = 0; i < NR_POWER_LIMITS; i++) {
+		int prim = rd->rpl[i].prim_id;
+
+		if (rapl_read_data_raw(rd, prim, false, &val64))
+			rd->rpl[i].name = NULL;
+	}
+}
+
+/* Detect active and valid domains for the given CPU, caller must
+ * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
+ */
+static int rapl_detect_domains(struct rapl_package *rp, int cpu)
+{
+	struct rapl_domain *rd;
+	int i;
+
+	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
+		/* use physical package id to read counters */
+		if (!rapl_check_domain(cpu, i, rp)) {
+			rp->domain_map |= 1 << i;
+			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
+		}
+	}
+	rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
+	if (!rp->nr_domains) {
+		pr_debug("no valid rapl domains found in %s\n", rp->name);
+		return -ENODEV;
+	}
+	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
+
+	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
+			      GFP_KERNEL);
+	if (!rp->domains)
+		return -ENOMEM;
+
+	rapl_init_domains(rp);
+
+	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
+		rapl_detect_powerlimit(rd);
+
+	return 0;
+}
+
+/* called from CPU hotplug notifier, hotplug lock held */
+void rapl_remove_package(struct rapl_package *rp)
+{
+	struct rapl_domain *rd, *rd_package = NULL;
+
+	package_power_limit_irq_restore(rp);
+
+	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
+		rapl_write_data_raw(rd, PL1_ENABLE, 0);
+		rapl_write_data_raw(rd, PL1_CLAMP, 0);
+		if (find_nr_power_limit(rd) > 1) {
+			rapl_write_data_raw(rd, PL2_ENABLE, 0);
+			rapl_write_data_raw(rd, PL2_CLAMP, 0);
+		}
+		if (rd->id == RAPL_DOMAIN_PACKAGE) {
+			rd_package = rd;
+			continue;
+		}
+		pr_debug("remove package, undo power limit on %s: %s\n",
+			 rp->name, rd->name);
+		powercap_unregister_zone(rp->priv->control_type,
+					 &rd->power_zone);
+	}
+	/* do parent zone last */
+	powercap_unregister_zone(rp->priv->control_type,
+				 &rd_package->power_zone);
+	list_del(&rp->plist);
+	if (list_empty(&rapl_packages))
+		rapl_remove_core();
+	kfree(rp);
+}
+EXPORT_SYMBOL_GPL(rapl_remove_package);
+
+/* caller to ensure CPU hotplug lock is held */
+struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
+{
+	int id = topology_logical_die_id(cpu);
+	struct rapl_package *rp;
+
+	list_for_each_entry(rp, &rapl_packages, plist) {
+		if (rp->id == id
+		    && rp->priv->control_type == priv->control_type)
+			return rp;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(rapl_find_package_domain);
+
+/* called from CPU hotplug notifier, hotplug lock held */
+struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
+{
+	int id = topology_logical_die_id(cpu);
+	struct rapl_package *rp;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	int ret;
+
+	ret = rapl_init_core();
+	if (ret)
+		return ERR_PTR(ret);
+
+	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
+	if (!rp)
+		return ERR_PTR(-ENOMEM);
+
+	/* add the new package to the list */
+	rp->id = id;
+	rp->lead_cpu = cpu;
+	rp->priv = priv;
+
+	if (topology_max_die_per_package() > 1)
+		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
+			 "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
+	else
+		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
+			 c->phys_proc_id);
+
+	/* check if the package contains valid domains */
+	if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
+		ret = -ENODEV;
+		goto err_free_package;
+	}
+	ret = rapl_package_register_powercap(rp);
+	if (!ret) {
+		INIT_LIST_HEAD(&rp->plist);
+		list_add(&rp->plist, &rapl_packages);
+		return rp;
+	}
+
+err_free_package:
+	kfree(rp->domains);
+	kfree(rp);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(rapl_add_package);
+
+static void power_limit_state_save(void)
+{
+	struct rapl_package *rp;
+	struct rapl_domain *rd;
+	int nr_pl, ret, i;
+
+	get_online_cpus();
+	list_for_each_entry(rp, &rapl_packages, plist) {
+		if (!rp->power_zone)
+			continue;
+		rd = power_zone_to_rapl_domain(rp->power_zone);
+		nr_pl = find_nr_power_limit(rd);
+		for (i = 0; i < nr_pl; i++) {
+			switch (rd->rpl[i].prim_id) {
+			case PL1_ENABLE:
+				ret = rapl_read_data_raw(rd,
+						 POWER_LIMIT1, true,
+						 &rd->rpl[i].last_power_limit);
+				if (ret)
+					rd->rpl[i].last_power_limit = 0;
+				break;
+			case PL2_ENABLE:
+				ret = rapl_read_data_raw(rd,
+						 POWER_LIMIT2, true,
+						 &rd->rpl[i].last_power_limit);
+				if (ret)
+					rd->rpl[i].last_power_limit = 0;
+				break;
+			}
+		}
+	}
+	put_online_cpus();
+}
+
+static void power_limit_state_restore(void)
+{
+	struct rapl_package *rp;
+	struct rapl_domain *rd;
+	int nr_pl, i;
+
+	get_online_cpus();
+	list_for_each_entry(rp, &rapl_packages, plist) {
+		if (!rp->power_zone)
+			continue;
+		rd = power_zone_to_rapl_domain(rp->power_zone);
+		nr_pl = find_nr_power_limit(rd);
+		for (i = 0; i < nr_pl; i++) {
+			switch (rd->rpl[i].prim_id) {
+			case PL1_ENABLE:
+				if (rd->rpl[i].last_power_limit)
+					rapl_write_data_raw(rd, POWER_LIMIT1,
+					    rd->rpl[i].last_power_limit);
+				break;
+			case PL2_ENABLE:
+				if (rd->rpl[i].last_power_limit)
+					rapl_write_data_raw(rd, POWER_LIMIT2,
+					    rd->rpl[i].last_power_limit);
+				break;
+			}
+		}
+	}
+	put_online_cpus();
+}
+
+static int rapl_pm_callback(struct notifier_block *nb,
+			    unsigned long mode, void *_unused)
+{
+	switch (mode) {
+	case PM_SUSPEND_PREPARE:
+		power_limit_state_save();
+		break;
+	case PM_POST_SUSPEND:
+		power_limit_state_restore();
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block rapl_pm_notifier = {
+	.notifier_call = rapl_pm_callback,
+};
+
+static int rapl_init_core(void)
+{
+	const struct x86_cpu_id *id;
+	int ret;
+
+	if (rapl_defaults)
+		return 0;
+
+	id = x86_match_cpu(rapl_ids);
+	if (!id) {
+		pr_err("driver does not support CPU family %d model %d\n",
+		       boot_cpu_data.x86, boot_cpu_data.x86_model);
+
+		return -ENODEV;
+	}
+
+	rapl_defaults = (struct rapl_defaults *)id->driver_data;
+
+	ret = register_pm_notifier(&rapl_pm_notifier);
+
+	return 0;
+}
+
+static void rapl_remove_core(void)
+{
+	unregister_pm_notifier(&rapl_pm_notifier);
+	rapl_defaults = NULL;
+}
+
+MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
+MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
new file mode 100644
index 000000000000..89645222e3e0
--- /dev/null
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel Running Average Power Limit (RAPL) Driver via MSR interface
+ * Copyright (c) 2019, Intel Corporation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/bitmap.h>
+#include <linux/delay.h>
+#include <linux/sysfs.h>
+#include <linux/cpu.h>
+#include <linux/powercap.h>
+#include <linux/suspend.h>
+#include <linux/intel_rapl.h>
+#include <linux/processor.h>
+
+#include <asm/iosf_mbi.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+/* Local defines */
+#define MSR_PLATFORM_POWER_LIMIT	0x0000065C
+
+/* private data for RAPL MSR Interface */
+static struct rapl_if_priv rapl_msr_priv = {
+	.reg_unit = MSR_RAPL_POWER_UNIT,
+	.regs[RAPL_DOMAIN_PACKAGE] = {
+		MSR_PKG_POWER_LIMIT, MSR_PKG_ENERGY_STATUS, MSR_PKG_PERF_STATUS, 0, MSR_PKG_POWER_INFO },
+	.regs[RAPL_DOMAIN_PP0] = {
+		MSR_PP0_POWER_LIMIT, MSR_PP0_ENERGY_STATUS, 0, MSR_PP0_POLICY, 0 },
+	.regs[RAPL_DOMAIN_PP1] = {
+		MSR_PP1_POWER_LIMIT, MSR_PP1_ENERGY_STATUS, 0, MSR_PP1_POLICY, 0 },
+	.regs[RAPL_DOMAIN_DRAM] = {
+		MSR_DRAM_POWER_LIMIT, MSR_DRAM_ENERGY_STATUS, MSR_DRAM_PERF_STATUS, 0, MSR_DRAM_POWER_INFO },
+	.regs[RAPL_DOMAIN_PLATFORM] = {
+		MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0},
+};
+
+/* Handles CPU hotplug on multi-socket systems.
+ * If a CPU goes online as the first CPU of the physical package
+ * we add the RAPL package to the system. Similarly, when the last
+ * CPU of the package is removed, we remove the RAPL package and its
+ * associated domains. Cooling devices are handled accordingly at
+ * per-domain level.
+ */
+static int rapl_cpu_online(unsigned int cpu)
+{
+	struct rapl_package *rp;
+
+	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
+	if (!rp) {
+		rp = rapl_add_package(cpu, &rapl_msr_priv);
+		if (IS_ERR(rp))
+			return PTR_ERR(rp);
+	}
+	cpumask_set_cpu(cpu, &rp->cpumask);
+	return 0;
+}
+
+static int rapl_cpu_down_prep(unsigned int cpu)
+{
+	struct rapl_package *rp;
+	int lead_cpu;
+
+	rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
+	if (!rp)
+		return 0;
+
+	cpumask_clear_cpu(cpu, &rp->cpumask);
+	lead_cpu = cpumask_first(&rp->cpumask);
+	if (lead_cpu >= nr_cpu_ids)
+		rapl_remove_package(rp);
+	else if (rp->lead_cpu == cpu)
+		rp->lead_cpu = lead_cpu;
+	return 0;
+}
+
+static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
+{
+	if (rdmsrl_safe_on_cpu(cpu, ra->reg, &ra->value)) {
+		pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg, cpu);
+		return -EIO;
+	}
+	ra->value &= ra->mask;
+	return 0;
+}
+
+static void rapl_msr_update_func(void *info)
+{
+	struct reg_action *ra = info;
+	u64 val;
+
+	ra->err = rdmsrl_safe(ra->reg, &val);
+	if (ra->err)
+		return;
+
+	val &= ~ra->mask;
+	val |= ra->value;
+
+	ra->err = wrmsrl_safe(ra->reg, val);
+}
+
+static int rapl_msr_write_raw(int cpu, struct reg_action *ra)
+{
+	int ret;
+
+	ret = smp_call_function_single(cpu, rapl_msr_update_func, ra, 1);
+	if (WARN_ON_ONCE(ret))
+		return ret;
+
+	return ra->err;
+}
+
+static int __init rapl_msr_init(void)
+{
+	int ret;
+
+	rapl_msr_priv.read_raw = rapl_msr_read_raw;
+	rapl_msr_priv.write_raw = rapl_msr_write_raw;
+
+	rapl_msr_priv.control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
+	if (IS_ERR(rapl_msr_priv.control_type)) {
+		pr_debug("failed to register powercap control_type.\n");
+		return PTR_ERR(rapl_msr_priv.control_type);
+	}
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online",
+				rapl_cpu_online, rapl_cpu_down_prep);
+	if (ret < 0)
+		goto out;
+	rapl_msr_priv.pcap_rapl_online = ret;
+
+	/* Don't bail out if PSys is not supported */
+	rapl_add_platform_domain(&rapl_msr_priv);
+
+	return 0;
+
+out:
+	if (ret)
+		powercap_unregister_control_type(rapl_msr_priv.control_type);
+	return ret;
+}
+
+static void __exit rapl_msr_exit(void)
+{
+	cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
+	rapl_remove_platform_domain(&rapl_msr_priv);
+	powercap_unregister_control_type(rapl_msr_priv.control_type);
+}
+
+module_init(rapl_msr_init);
+module_exit(rapl_msr_exit);
+
+MODULE_DESCRIPTION("Driver for Intel RAPL (Running Average Power Limit) control via MSR interface");
+MODULE_AUTHOR("Zhang Rui <rui.zhang@intel.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index ff215d64d114..9579f458fe4d 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -142,4 +142,11 @@ struct rapl_package {
 	struct rapl_if_priv *priv;
 };
 
+struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv);
+struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv);
+void rapl_remove_package(struct rapl_package *rp);
+
+int rapl_add_platform_domain(struct rapl_if_priv *priv);
+void rapl_remove_platform_domain(struct rapl_if_priv *priv);
+
 #endif /* __INTEL_RAPL_H__ */
-- 
cgit v1.2.3


From d978e755aabe215cb67bf713e103ed3916ec306d Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:31 +0800
Subject: intel_rapl: support 64 bit register

RAPL MMIO interface uses 64 bit registers, thus force use 64 bit register
for all the RAPL code.

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_common.c |  6 +++---
 drivers/powercap/intel_rapl_msr.c    | 11 +++++++----
 include/linux/intel_rapl.h           |  8 ++++----
 3 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 34a82531a7cf..8e4de036f6d0 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -689,7 +689,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
 	ra.mask = rp->mask;
 
 	if (rd->rp->priv->read_raw(cpu, &ra)) {
-		pr_debug("failed to read reg 0x%x on cpu %d\n", ra.reg, cpu);
+		pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
 		return -EIO;
 	}
 
@@ -749,7 +749,7 @@ static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
 	ra.reg = rp->priv->reg_unit;
 	ra.mask = ~0;
 	if (rp->priv->read_raw(cpu, &ra)) {
-		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
+		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
 		       rp->priv->reg_unit, cpu);
 		return -ENODEV;
 	}
@@ -777,7 +777,7 @@ static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
 	ra.reg = rp->priv->reg_unit;
 	ra.mask = ~0;
 	if (rp->priv->read_raw(cpu, &ra)) {
-		pr_err("Failed to read power unit REG 0x%x on CPU %d, exit.\n",
+		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
 		       rp->priv->reg_unit, cpu);
 		return -ENODEV;
 	}
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
index 89645222e3e0..6cd8a8fb9238 100644
--- a/drivers/powercap/intel_rapl_msr.c
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -84,8 +84,10 @@ static int rapl_cpu_down_prep(unsigned int cpu)
 
 static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
 {
-	if (rdmsrl_safe_on_cpu(cpu, ra->reg, &ra->value)) {
-		pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg, cpu);
+	u32 msr = (u32)ra->reg;
+
+	if (rdmsrl_safe_on_cpu(cpu, msr, &ra->value)) {
+		pr_debug("failed to read msr 0x%x on cpu %d\n", msr, cpu);
 		return -EIO;
 	}
 	ra->value &= ra->mask;
@@ -95,16 +97,17 @@ static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
 static void rapl_msr_update_func(void *info)
 {
 	struct reg_action *ra = info;
+	u32 msr = (u32)ra->reg;
 	u64 val;
 
-	ra->err = rdmsrl_safe(ra->reg, &val);
+	ra->err = rdmsrl_safe(msr, &val);
 	if (ra->err)
 		return;
 
 	val &= ~ra->mask;
 	val |= ra->value;
 
-	ra->err = wrmsrl_safe(ra->reg, val);
+	ra->err = wrmsrl_safe(msr, val);
 }
 
 static int rapl_msr_write_raw(int cpu, struct reg_action *ra)
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 9579f458fe4d..649e19981eb0 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -78,7 +78,7 @@ struct rapl_package;
 struct rapl_domain {
 	const char *name;
 	enum rapl_domain_type id;
-	int regs[RAPL_DOMAIN_REG_MAX];
+	u64 regs[RAPL_DOMAIN_REG_MAX];
 	struct powercap_zone power_zone;
 	struct rapl_domain_data rdd;
 	struct rapl_power_limit rpl[NR_POWER_LIMITS];
@@ -89,7 +89,7 @@ struct rapl_domain {
 };
 
 struct reg_action {
-	u32 reg;
+	u64 reg;
 	u64 mask;
 	u64 value;
 	int err;
@@ -113,8 +113,8 @@ struct rapl_if_priv {
 	struct powercap_control_type *control_type;
 	struct rapl_domain *platform_rapl_domain;
 	enum cpuhp_state pcap_rapl_online;
-	u32 reg_unit;
-	u32 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
+	u64 reg_unit;
+	u64 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
 	int (*read_raw)(int cpu, struct reg_action *ra);
 	int (*write_raw)(int cpu, struct reg_action *ra);
 };
-- 
cgit v1.2.3


From 0c2ddedd8bcb88c4100acb9e0fc5ac8752d09501 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 10 Jul 2019 21:44:32 +0800
Subject: intel_rapl: support two power limits for every RAPL domain

RAPL MSR interface supports 2 power limits for package domain, and 1 power
limit for other domains, while RAPL MMIO interface supports 2 power limits
for both package and dram domains.
And when 2 power limits are supported, the FW_LOCK bit is in bit 63 of the
register, instead of bit 31.

Remove the assumption that only pakcage domain supports 2 power limits.
And allow the RAPL interface driver to specify the number of power limits
supported, for every single RAPL domain it owns..

Reviewed-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Tested-by: Pandruvada, Srinivas <srinivas.pandruvada@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_common.c | 72 +++++++++++++-----------------------
 drivers/powercap/intel_rapl_msr.c    |  1 +
 include/linux/intel_rapl.h           |  2 +
 3 files changed, 28 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 8e4de036f6d0..db8df19d8133 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -38,8 +38,8 @@
 #define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
 #define POWER_LIMIT2_ENABLE     BIT_ULL(47)
 #define POWER_LIMIT2_CLAMP      BIT_ULL(48)
-#define POWER_PACKAGE_LOCK      BIT_ULL(63)
-#define POWER_PP_LOCK           BIT(31)
+#define POWER_HIGH_LOCK         BIT_ULL(63)
+#define POWER_LOW_LOCK          BIT(31)
 
 #define TIME_WINDOW1_MASK       (0x7FULL<<17)
 #define TIME_WINDOW2_MASK       (0x7FULL<<49)
@@ -513,60 +513,38 @@ static const struct powercap_zone_constraint_ops constraint_ops = {
 /* called after domain detection and package level data are set */
 static void rapl_init_domains(struct rapl_package *rp)
 {
-	int i;
+	enum rapl_domain_type i;
+	enum rapl_domain_reg_id j;
 	struct rapl_domain *rd = rp->domains;
 
 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
 		unsigned int mask = rp->domain_map & (1 << i);
 
-		rd->regs[RAPL_DOMAIN_REG_LIMIT] =
-		    rp->priv->regs[i][RAPL_DOMAIN_REG_LIMIT];
-		rd->regs[RAPL_DOMAIN_REG_STATUS] =
-		    rp->priv->regs[i][RAPL_DOMAIN_REG_STATUS];
-		rd->regs[RAPL_DOMAIN_REG_PERF] =
-		    rp->priv->regs[i][RAPL_DOMAIN_REG_PERF];
-		rd->regs[RAPL_DOMAIN_REG_POLICY] =
-		    rp->priv->regs[i][RAPL_DOMAIN_REG_POLICY];
-		rd->regs[RAPL_DOMAIN_REG_INFO] =
-		    rp->priv->regs[i][RAPL_DOMAIN_REG_INFO];
-
-		switch (mask) {
-		case BIT(RAPL_DOMAIN_PACKAGE):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PACKAGE];
-			rd->id = RAPL_DOMAIN_PACKAGE;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
+		if (!mask)
+			continue;
+
+		rd->rp = rp;
+		rd->name = rapl_domain_names[i];
+		rd->id = i;
+		rd->rpl[0].prim_id = PL1_ENABLE;
+		rd->rpl[0].name = pl1_name;
+		/* some domain may support two power limits */
+		if (rp->priv->limits[i] == 2) {
 			rd->rpl[1].prim_id = PL2_ENABLE;
 			rd->rpl[1].name = pl2_name;
-			break;
-		case BIT(RAPL_DOMAIN_PP0):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PP0];
-			rd->id = RAPL_DOMAIN_PP0;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			break;
-		case BIT(RAPL_DOMAIN_PP1):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_PP1];
-			rd->id = RAPL_DOMAIN_PP1;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
-			break;
-		case BIT(RAPL_DOMAIN_DRAM):
-			rd->name = rapl_domain_names[RAPL_DOMAIN_DRAM];
-			rd->id = RAPL_DOMAIN_DRAM;
-			rd->rpl[0].prim_id = PL1_ENABLE;
-			rd->rpl[0].name = pl1_name;
+		}
+
+		for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
+			rd->regs[j] = rp->priv->regs[i][j];
+
+		if (i == RAPL_DOMAIN_DRAM) {
 			rd->domain_energy_unit =
 			    rapl_defaults->dram_domain_energy_unit;
 			if (rd->domain_energy_unit)
 				pr_info("DRAM domain energy unit %dpj\n",
 					rd->domain_energy_unit);
-			break;
-		}
-		if (mask) {
-			rd->rp = rp;
-			rd++;
 		}
+		rd++;
 	}
 }
 
@@ -613,7 +591,7 @@ static struct rapl_primitive_info rpi[] = {
 			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
 	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
 			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
-	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_PP_LOCK, 31,
+	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
 	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
 			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
@@ -675,9 +653,9 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
 
 	cpu = rd->rp->lead_cpu;
 
-	/* special-case package domain, which uses a different bit */
-	if (prim == FW_LOCK && rd->id == RAPL_DOMAIN_PACKAGE) {
-		rp->mask = POWER_PACKAGE_LOCK;
+	/* domain with 2 limits has different bit */
+	if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
+		rp->mask = POWER_HIGH_LOCK;
 		rp->shift = 63;
 	}
 	/* non-hardware data are collected by the polling thread */
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
index 6cd8a8fb9238..bc14a4579acb 100644
--- a/drivers/powercap/intel_rapl_msr.c
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -41,6 +41,7 @@ static struct rapl_if_priv rapl_msr_priv = {
 		MSR_DRAM_POWER_LIMIT, MSR_DRAM_ENERGY_STATUS, MSR_DRAM_PERF_STATUS, 0, MSR_DRAM_POWER_INFO },
 	.regs[RAPL_DOMAIN_PLATFORM] = {
 		MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0},
+	.limits[RAPL_DOMAIN_PACKAGE] = 2,
 };
 
 /* Handles CPU hotplug on multi-socket systems.
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 649e19981eb0..0c179d92d110 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -104,6 +104,7 @@ struct reg_action {
  * @pcap_rapl_online:		CPU hotplug state for each RAPL interface.
  * @reg_unit:			Register for getting energy/power/time unit.
  * @regs:			Register sets for different RAPL Domains.
+ * @limits:			Number of power limits supported by each domain.
  * @read_raw:			Callback for reading RAPL interface specific
  *				registers.
  * @write_raw:			Callback for writing RAPL interface specific
@@ -115,6 +116,7 @@ struct rapl_if_priv {
 	enum cpuhp_state pcap_rapl_online;
 	u64 reg_unit;
 	u64 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
+	int limits[RAPL_DOMAIN_MAX];
 	int (*read_raw)(int cpu, struct reg_action *ra);
 	int (*write_raw)(int cpu, struct reg_action *ra);
 };
-- 
cgit v1.2.3


From bedc0fd0f9b517698193d644f914b33951856fd2 Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Thu, 11 Jul 2019 09:55:56 -0400
Subject: RDMA/core: Fix -Wunused-const-variable warnings

The commit below introduced a few compilation warnings.

In file included from ./include/rdma/ib_verbs.h:64,
                 from ./include/linux/mlx5/device.h:37,
                 from ./include/linux/mlx5/driver.h:51,
                 from drivers/net/ethernet/mellanox/mlx5/core/uar.c:36:
./include/linux/dim.h:378:1: warning: 'rdma_dim_prof' defined but not
used [-Wunused-const-variable=]
 rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
 ^~~~~~~~~~~~~
In file included from ./include/rdma/ib_verbs.h:64,
                 from ./include/linux/mlx5/device.h:37,
                 from ./include/linux/mlx5/driver.h:51,
                 from
drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c:37:
./include/linux/dim.h:378:1: warning: 'rdma_dim_prof' defined but not
used [-Wunused-const-variable=]
 rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
 ^~~~~~~~~~~~~

Since only ib_cq_rdma_dim_work() in drivers/infiniband/core/cq.c uses it,
just move the definition over there.

Fixes: f4915455dcf0 ("linux/dim: Implement RDMA adaptive moderation (DIM)")
Signed-off-by: Qian Cai <cai@lca.pw>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cq.c | 13 +++++++++++++
 include/linux/dim.h          | 13 -------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index ffd6e24109d5..7c599878ccf7 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -18,6 +18,19 @@
 #define IB_POLL_FLAGS \
 	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
+static const struct dim_cq_moder
+rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
+	{1,   0, 1,  0},
+	{1,   0, 4,  0},
+	{2,   0, 4,  0},
+	{2,   0, 8,  0},
+	{4,   0, 8,  0},
+	{16,  0, 8,  0},
+	{16,  0, 16, 0},
+	{32,  0, 16, 0},
+	{32,  0, 32, 0},
+};
+
 static void ib_cq_rdma_dim_work(struct work_struct *w)
 {
 	struct dim *dim = container_of(w, struct dim, work);
diff --git a/include/linux/dim.h b/include/linux/dim.h
index aa69730c3b8d..d3a0fbfff2bb 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -374,19 +374,6 @@ void net_dim(struct dim *dim, struct dim_sample end_sample);
 #define RDMA_DIM_PARAMS_NUM_PROFILES 9
 #define RDMA_DIM_START_PROFILE 0
 
-static const struct dim_cq_moder
-rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
-	{1,   0, 1,  0},
-	{1,   0, 4,  0},
-	{2,   0, 4,  0},
-	{2,   0, 8,  0},
-	{4,   0, 8,  0},
-	{16,  0, 8,  0},
-	{16,  0, 16, 0},
-	{32,  0, 16, 0},
-	{32,  0, 32, 0},
-};
-
 /**
  * rdma_dim - Runs the adaptive moderation.
  * @dim: The moderation struct.
-- 
cgit v1.2.3


From b516ea586d717472178e6ef1c152e85608b0ce32 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Mon, 8 Jul 2019 13:17:44 +0800
Subject: PCI: Enable NVIDIA HDA controllers

Many NVIDIA GPUs can be configured as either a single-function video device
or a multi-function device with video at function 0 and an HDA audio
controller at function 1.  The HDA controller can be enabled or disabled by
a bit in the function 0 config space.

Some BIOSes leave the HDA disabled, which means the HDMI connector from the
NVIDIA GPU may not work.  Sometimes the BIOS enables the HDA if an HDMI
cable is connected at boot time, but that doesn't handle hotplug cases.

Enable the HDA controller on device enumeration and resume and re-read the
header type, which tells us whether the GPU is a multi-function device.

This quirk is limited to NVIDIA PCI devices with the VGA Controller device
class.  This is expected to correspond to product configurations where the
NVIDIA GPU has connectors attached.  Other products where the device class
is 3D Controller are expected to correspond to configurations where the
NVIDIA GPU is dedicated (dGPU) and has no connectors.  See original post
(URL below) for more details.

This commit takes inspiration from an earlier patch by Daniel Drake.

Link: https://lore.kernel.org/r/20190708051744.24039-1-drake@endlessm.com v2
Link: https://lore.kernel.org/r/20190613063514.15317-1-drake@endlessm.com v1
Link: https://devtalk.nvidia.com/default/topic/1024022
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=75985
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Daniel Drake <drake@endlessm.com>
[bhelgaas: commit log, log message, return early if already enabled]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Aaron Plattner <aplattner@nvidia.com>
Cc: Peter Wu <peter@lekensteyn.nl>
Cc: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: Karol Herbst <kherbst@redhat.com>
Cc: Maik Freudenberg <hhfeuer@gmx.de>
---
 drivers/pci/quirks.c    | 30 ++++++++++++++++++++++++++++++
 include/linux/pci_ids.h |  1 +
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index c66c0ca446c4..208aacf39329 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -5011,6 +5011,36 @@ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
 			      PCI_CLASS_SERIAL_UNKNOWN, 8,
 			      quirk_gpu_usb_typec_ucsi);
 
+/*
+ * Enable the NVIDIA GPU integrated HDA controller if the BIOS left it
+ * disabled.  https://devtalk.nvidia.com/default/topic/1024022
+ */
+static void quirk_nvidia_hda(struct pci_dev *gpu)
+{
+	u8 hdr_type;
+	u32 val;
+
+	/* There was no integrated HDA controller before MCP89 */
+	if (gpu->device < PCI_DEVICE_ID_NVIDIA_GEFORCE_320M)
+		return;
+
+	/* Bit 25 at offset 0x488 enables the HDA controller */
+	pci_read_config_dword(gpu, 0x488, &val);
+	if (val & BIT(25))
+		return;
+
+	pci_info(gpu, "Enabling HDA controller\n");
+	pci_write_config_dword(gpu, 0x488, val | BIT(25));
+
+	/* The GPU becomes a multi-function device when the HDA is enabled */
+	pci_read_config_byte(gpu, PCI_HEADER_TYPE, &hdr_type);
+	gpu->multifunction = !!(hdr_type & 0x80);
+}
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
+			       PCI_BASE_CLASS_DISPLAY, 16, quirk_nvidia_hda);
+DECLARE_PCI_FIXUP_CLASS_RESUME_EARLY(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
+			       PCI_BASE_CLASS_DISPLAY, 16, quirk_nvidia_hda);
+
 /*
  * Some IDT switches incorrectly flag an ACS Source Validation error on
  * completions for config read requests even though PCIe r4.0, sec
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 70e86148cb1e..66898463b81f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1336,6 +1336,7 @@
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP78S_SMBUS    0x0752
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE       0x0759
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_SMBUS     0x07D8
+#define PCI_DEVICE_ID_NVIDIA_GEFORCE_320M           0x08A0
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP79_SMBUS     0x0AA2
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP89_SATA	    0x0D85
 
-- 
cgit v1.2.3


From db849faa9bef993a1379dc510623f750a72fa7ce Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Fri, 3 May 2019 13:14:59 -0700
Subject: net/mlx5e: Rx, Fix checksum calculation for new hardware

CQE checksum full mode in new HW, provides a full checksum of rx frame.
Covering bytes starting from eth protocol up to last byte in the received
frame (frame_size - ETH_HLEN), as expected by the stack.

Fixing up skb->csum by the driver is not required in such case. This fix
is to avoid wrong checksum calculation in drivers which already support
the new hardware with the new checksum mode.

Fixes: 85327a9c4150 ("net/mlx5: Update the list of the PCI supported devices")
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 7 ++++++-
 include/linux/mlx5/mlx5_ifc.h                     | 3 ++-
 4 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index cc6797e24571..cc227a7aa79f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -294,6 +294,7 @@ enum {
 	MLX5E_RQ_STATE_ENABLED,
 	MLX5E_RQ_STATE_AM,
 	MLX5E_RQ_STATE_NO_CSUM_COMPLETE,
+	MLX5E_RQ_STATE_CSUM_FULL, /* cqe_csum_full hw bit is set */
 };
 
 struct mlx5e_cq {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index a8e8350b38aa..98d75271fc73 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -855,6 +855,9 @@ static int mlx5e_open_rq(struct mlx5e_channel *c,
 	if (err)
 		goto err_destroy_rq;
 
+	if (MLX5_CAP_ETH(c->mdev, cqe_checksum_full))
+		__set_bit(MLX5E_RQ_STATE_CSUM_FULL, &c->rq.state);
+
 	if (params->rx_dim_enabled)
 		__set_bit(MLX5E_RQ_STATE_AM, &c->rq.state);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 13133e7f088e..8a5f9411cac6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -873,8 +873,14 @@ static inline void mlx5e_handle_csum(struct net_device *netdev,
 		if (unlikely(get_ip_proto(skb, network_depth, proto) == IPPROTO_SCTP))
 			goto csum_unnecessary;
 
+		stats->csum_complete++;
 		skb->ip_summed = CHECKSUM_COMPLETE;
 		skb->csum = csum_unfold((__force __sum16)cqe->check_sum);
+
+		if (test_bit(MLX5E_RQ_STATE_CSUM_FULL, &rq->state))
+			return; /* CQE csum covers all received bytes */
+
+		/* csum might need some fixups ...*/
 		if (network_depth > ETH_HLEN)
 			/* CQE csum is calculated from the IP header and does
 			 * not cover VLAN headers (if present). This will add
@@ -885,7 +891,6 @@ static inline void mlx5e_handle_csum(struct net_device *netdev,
 						 skb->csum);
 
 		mlx5e_skb_padding_csum(skb, network_depth, proto, stats);
-		stats->csum_complete++;
 		return;
 	}
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 5e74305e2e57..7e42efa143a0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -749,7 +749,8 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         swp[0x1];
 	u8         swp_csum[0x1];
 	u8         swp_lso[0x1];
-	u8         reserved_at_23[0xd];
+	u8         cqe_checksum_full[0x1];
+	u8         reserved_at_24[0xc];
 	u8         max_vxlan_udp_ports[0x8];
 	u8         reserved_at_38[0x6];
 	u8         max_geneve_opt_len[0x1];
-- 
cgit v1.2.3


From 63f9ba1bf8b6550365dc17a65d544cd75e68bf48 Mon Sep 17 00:00:00 2001
From: Petar Penkov <ppenkov@google.com>
Date: Fri, 5 Jul 2019 11:46:43 -0700
Subject: net: fib_rules: do not flow dissect local packets

Rules matching on loopback iif do not need early flow dissection as the
packet originates from the host. Stop counting such rules in
fib_rule_requires_fldissect

Signed-off-by: Petar Penkov <ppenkov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index eba8465e1d86..20dcadd8eed9 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -180,9 +180,9 @@ static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a,
 
 static inline bool fib_rule_requires_fldissect(struct fib_rule *rule)
 {
-	return rule->ip_proto ||
+	return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto ||
 		fib_rule_port_range_set(&rule->sport_range) ||
-		fib_rule_port_range_set(&rule->dport_range);
+		fib_rule_port_range_set(&rule->dport_range));
 }
 
 struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *,
-- 
cgit v1.2.3


From bd976e52725965ddcceb9abecbcc7ca46863665c Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Mon, 1 Jul 2019 14:09:16 +0900
Subject: block: Kill gfp_t argument of blkdev_report_zones()

Only GFP_KERNEL and GFP_NOIO are used with blkdev_report_zones(). In
preparation of using vmalloc() for large report buffer and zone array
allocations used by this function, remove its "gfp_t gfp_mask" argument
and rely on the caller context to use memalloc_noio_save/restore() where
necessary (block layer zone revalidation and dm-zoned I/O error path).

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c              | 31 +++++++++++++++++++------------
 drivers/block/null_blk.h       |  3 +--
 drivers/block/null_blk_zoned.c |  3 +--
 drivers/md/dm-flakey.c         |  5 ++---
 drivers/md/dm-linear.c         |  5 ++---
 drivers/md/dm-zoned-metadata.c | 16 ++++++++++++----
 drivers/md/dm.c                |  6 ++----
 drivers/scsi/sd.h              |  3 +--
 drivers/scsi/sd_zbc.c          |  6 ++----
 fs/f2fs/super.c                |  4 +---
 include/linux/blkdev.h         |  5 ++---
 include/linux/device-mapper.h  |  3 +--
 12 files changed, 46 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 3249738242b4..58ced170b424 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -14,6 +14,7 @@
 #include <linux/rbtree.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
+#include <linux/sched/mm.h>
 
 #include "blk.h"
 
@@ -117,8 +118,7 @@ static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep)
 }
 
 static int blk_report_zones(struct gendisk *disk, sector_t sector,
-			    struct blk_zone *zones, unsigned int *nr_zones,
-			    gfp_t gfp_mask)
+			    struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct request_queue *q = disk->queue;
 	unsigned int z = 0, n, nrz = *nr_zones;
@@ -127,8 +127,7 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
 
 	while (z < nrz && sector < capacity) {
 		n = nrz - z;
-		ret = disk->fops->report_zones(disk, sector, &zones[z], &n,
-					       gfp_mask);
+		ret = disk->fops->report_zones(disk, sector, &zones[z], &n);
 		if (ret)
 			return ret;
 		if (!n)
@@ -149,17 +148,18 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
  * @sector:	Sector from which to report zones
  * @zones:	Array of zone structures where to return the zones information
  * @nr_zones:	Number of zone structures in the zone array
- * @gfp_mask:	Memory allocation flags (for bio_alloc)
  *
  * Description:
  *    Get zone information starting from the zone containing @sector.
  *    The number of zone information reported may be less than the number
  *    requested by @nr_zones. The number of zones actually reported is
  *    returned in @nr_zones.
+ *    The caller must use memalloc_noXX_save/restore() calls to control
+ *    memory allocations done within this function (zone array and command
+ *    buffer allocation by the device driver).
  */
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
-			struct blk_zone *zones, unsigned int *nr_zones,
-			gfp_t gfp_mask)
+			struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 	unsigned int i, nrz;
@@ -184,7 +184,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 	nrz = min(*nr_zones,
 		  __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector));
 	ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector,
-			       zones, &nrz, gfp_mask);
+			       zones, &nrz);
 	if (ret)
 		return ret;
 
@@ -305,9 +305,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 	if (!zones)
 		return -ENOMEM;
 
-	ret = blkdev_report_zones(bdev, rep.sector,
-				  zones, &rep.nr_zones,
-				  GFP_KERNEL);
+	ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones);
 	if (ret)
 		goto out;
 
@@ -415,6 +413,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 	unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
 	unsigned int i, rep_nr_zones = 0, z = 0, nrz;
 	struct blk_zone *zones = NULL;
+	unsigned int noio_flag;
 	sector_t sector = 0;
 	int ret = 0;
 
@@ -427,6 +426,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 		return 0;
 	}
 
+	/*
+	 * Ensure that all memory allocations in this context are done as
+	 * if GFP_NOIO was specified.
+	 */
+	noio_flag = memalloc_noio_save();
+
 	if (!blk_queue_is_zoned(q) || !nr_zones) {
 		nr_zones = 0;
 		goto update;
@@ -449,7 +454,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 
 	while (z < nr_zones) {
 		nrz = min(nr_zones - z, rep_nr_zones);
-		ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO);
+		ret = blk_report_zones(disk, sector, zones, &nrz);
 		if (ret)
 			goto out;
 		if (!nrz)
@@ -480,6 +485,8 @@ update:
 	blk_mq_unfreeze_queue(q);
 
 out:
+	memalloc_noio_restore(noio_flag);
+
 	free_pages((unsigned long)zones,
 		   get_order(rep_nr_zones * sizeof(struct blk_zone)));
 	kfree(seq_zones_wlock);
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index 34b22d6523ba..4b9bbe3bb5a1 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -89,8 +89,7 @@ struct nullb {
 int null_zone_init(struct nullb_device *dev);
 void null_zone_exit(struct nullb_device *dev);
 int null_zone_report(struct gendisk *disk, sector_t sector,
-		     struct blk_zone *zones, unsigned int *nr_zones,
-		     gfp_t gfp_mask);
+		     struct blk_zone *zones, unsigned int *nr_zones);
 void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
 			unsigned int nr_sectors);
 void null_zone_reset(struct nullb_cmd *cmd, sector_t sector);
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
index fca0c97ff1aa..cb28d93f2bd1 100644
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -67,8 +67,7 @@ void null_zone_exit(struct nullb_device *dev)
 }
 
 int null_zone_report(struct gendisk *disk, sector_t sector,
-		     struct blk_zone *zones, unsigned int *nr_zones,
-		     gfp_t gfp_mask)
+		     struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct nullb *nullb = disk->private_data;
 	struct nullb_device *dev = nullb->dev;
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index a9bc518156f2..2900fbde89b3 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -461,15 +461,14 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
 
 #ifdef CONFIG_BLK_DEV_ZONED
 static int flakey_report_zones(struct dm_target *ti, sector_t sector,
-			       struct blk_zone *zones, unsigned int *nr_zones,
-			       gfp_t gfp_mask)
+			       struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct flakey_c *fc = ti->private;
 	int ret;
 
 	/* Do report and remap it */
 	ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector),
-				  zones, nr_zones, gfp_mask);
+				  zones, nr_zones);
 	if (ret != 0)
 		return ret;
 
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index ad980a38fb1e..ecefe6703736 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -137,15 +137,14 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
 
 #ifdef CONFIG_BLK_DEV_ZONED
 static int linear_report_zones(struct dm_target *ti, sector_t sector,
-			       struct blk_zone *zones, unsigned int *nr_zones,
-			       gfp_t gfp_mask)
+			       struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct linear_c *lc = (struct linear_c *) ti->private;
 	int ret;
 
 	/* Do report and remap it */
 	ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector),
-				  zones, nr_zones, gfp_mask);
+				  zones, nr_zones);
 	if (ret != 0)
 		return ret;
 
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index d8334cd45d7c..9faf3e49c7af 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/crc32.h>
+#include <linux/sched/mm.h>
 
 #define	DM_MSG_PREFIX		"zoned metadata"
 
@@ -1162,8 +1163,7 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
 	while (sector < dev->capacity) {
 		/* Get zone information */
 		nr_blkz = DMZ_REPORT_NR_ZONES;
-		ret = blkdev_report_zones(dev->bdev, sector, blkz,
-					  &nr_blkz, GFP_KERNEL);
+		ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz);
 		if (ret) {
 			dmz_dev_err(dev, "Report zones failed %d", ret);
 			goto out;
@@ -1201,12 +1201,20 @@ out:
 static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
 {
 	unsigned int nr_blkz = 1;
+	unsigned int noio_flag;
 	struct blk_zone blkz;
 	int ret;
 
-	/* Get zone information from disk */
+	/*
+	 * Get zone information from disk. Since blkdev_report_zones() uses
+	 * GFP_KERNEL by default for memory allocations, set the per-task
+	 * PF_MEMALLOC_NOIO flag so that all allocations are done as if
+	 * GFP_NOIO was specified.
+	 */
+	noio_flag = memalloc_noio_save();
 	ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
-				  &blkz, &nr_blkz, GFP_NOIO);
+				  &blkz, &nr_blkz);
+	memalloc_noio_restore(noio_flag);
 	if (!nr_blkz)
 		ret = -EIO;
 	if (ret) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5475081dcbd6..61f1152b74e9 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -441,8 +441,7 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 }
 
 static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
-			       struct blk_zone *zones, unsigned int *nr_zones,
-			       gfp_t gfp_mask)
+			       struct blk_zone *zones, unsigned int *nr_zones)
 {
 #ifdef CONFIG_BLK_DEV_ZONED
 	struct mapped_device *md = disk->private_data;
@@ -480,8 +479,7 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
 	 * So there is no need to loop here trying to fill the entire array
 	 * of zones.
 	 */
-	ret = tgt->type->report_zones(tgt, sector, zones,
-				      nr_zones, gfp_mask);
+	ret = tgt->type->report_zones(tgt, sector, zones, nr_zones);
 
 out:
 	dm_put_live_table(md, srcu_idx);
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 5796ace76225..38c50946fc42 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -213,8 +213,7 @@ extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
 extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
 			    struct scsi_sense_hdr *sshdr);
 extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
-			       struct blk_zone *zones, unsigned int *nr_zones,
-			       gfp_t gfp_mask);
+			       struct blk_zone *zones, unsigned int *nr_zones);
 
 #else /* CONFIG_BLK_DEV_ZONED */
 
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 7334024b64f1..ec3764c8f3f1 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -109,13 +109,11 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
  * @sector: Start 512B sector of the report
  * @zones: Array of zone descriptors
  * @nr_zones: Number of descriptors in the array
- * @gfp_mask: Memory allocation mask
  *
  * Execute a report zones command on the target disk.
  */
 int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
-			struct blk_zone *zones, unsigned int *nr_zones,
-			gfp_t gfp_mask)
+			struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct scsi_disk *sdkp = scsi_disk(disk);
 	unsigned int i, buflen, nrz = *nr_zones;
@@ -134,7 +132,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
 	 */
 	buflen = min(queue_max_hw_sectors(disk->queue) << 9,
 		     roundup((nrz + 1) * 64, 512));
-	buf = kmalloc(buflen, gfp_mask);
+	buf = kmalloc(buflen, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 6b959bbb336a..4e91ba6c8a2e 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2841,9 +2841,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
 	while (zones && sector < nr_sectors) {
 
 		nr_zones = F2FS_REPORT_NR_ZONES;
-		err = blkdev_report_zones(bdev, sector,
-					  zones, &nr_zones,
-					  GFP_KERNEL);
+		err = blkdev_report_zones(bdev, sector, zones, &nr_zones);
 		if (err)
 			break;
 		if (!nr_zones) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 259bd7ad8312..05036e3e3458 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -347,7 +347,7 @@ struct queue_limits {
 extern unsigned int blkdev_nr_zones(struct block_device *bdev);
 extern int blkdev_report_zones(struct block_device *bdev,
 			       sector_t sector, struct blk_zone *zones,
-			       unsigned int *nr_zones, gfp_t gfp_mask);
+			       unsigned int *nr_zones);
 extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
 			      sector_t nr_sectors, gfp_t gfp_mask);
 extern int blk_revalidate_disk_zones(struct gendisk *disk);
@@ -1673,8 +1673,7 @@ struct block_device_operations {
 	/* this callback is with swap_lock and sometimes page table lock held */
 	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
 	int (*report_zones)(struct gendisk *, sector_t sector,
-			    struct blk_zone *zones, unsigned int *nr_zones,
-			    gfp_t gfp_mask);
+			    struct blk_zone *zones, unsigned int *nr_zones);
 	struct module *owner;
 	const struct pr_ops *pr_ops;
 };
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e1f51d607cc5..3b470cb03b66 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -95,8 +95,7 @@ typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **
 
 typedef int (*dm_report_zones_fn) (struct dm_target *ti, sector_t sector,
 				   struct blk_zone *zones,
-				   unsigned int *nr_zones,
-				   gfp_t gfp_mask);
+				   unsigned int *nr_zones);
 
 /*
  * These iteration functions are typically used to check (and combine)
-- 
cgit v1.2.3


From 26202928fafad8bda8b478edb7e62c885be623d7 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Mon, 1 Jul 2019 14:09:18 +0900
Subject: block: Limit zone array allocation size

Limit the size of the struct blk_zone array used in
blk_revalidate_disk_zones() to avoid memory allocation failures leading
to disk revalidation failure. Also further reduce the likelyhood of
such failures by using kvcalloc() (that is vmalloc()) instead of
allocating contiguous pages with alloc_pages().

Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 36 ++++++++++++++++++++----------------
 include/linux/blkdev.h |  5 +++++
 2 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 58ced170b424..6c503824ba3f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -14,6 +14,8 @@
 #include <linux/rbtree.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
 #include <linux/sched/mm.h>
 
 #include "blk.h"
@@ -371,22 +373,25 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
  * Allocate an array of struct blk_zone to get nr_zones zone information.
  * The allocated array may be smaller than nr_zones.
  */
-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
+static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
 {
-	size_t size = *nr_zones * sizeof(struct blk_zone);
-	struct page *page;
-	int order;
-
-	for (order = get_order(size); order >= 0; order--) {
-		page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
-		if (page) {
-			*nr_zones = min_t(unsigned int, *nr_zones,
-				(PAGE_SIZE << order) / sizeof(struct blk_zone));
-			return page_address(page);
-		}
+	struct blk_zone *zones;
+	size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
+
+	/*
+	 * GFP_KERNEL here is meaningless as the caller task context has
+	 * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
+	 * with memalloc_noio_save().
+	 */
+	zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
+	if (!zones) {
+		*nr_zones = 0;
+		return NULL;
 	}
 
-	return NULL;
+	*nr_zones = nrz;
+
+	return zones;
 }
 
 void blk_queue_free_zone_bitmaps(struct request_queue *q)
@@ -448,7 +453,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 
 	/* Get zone information and initialize seq_zones_bitmap */
 	rep_nr_zones = nr_zones;
-	zones = blk_alloc_zones(q->node, &rep_nr_zones);
+	zones = blk_alloc_zones(&rep_nr_zones);
 	if (!zones)
 		goto out;
 
@@ -487,8 +492,7 @@ update:
 out:
 	memalloc_noio_restore(noio_flag);
 
-	free_pages((unsigned long)zones,
-		   get_order(rep_nr_zones * sizeof(struct blk_zone)));
+	kvfree(zones);
 	kfree(seq_zones_wlock);
 	kfree(seq_zones_bitmap);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 05036e3e3458..1ef375dafb1c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -344,6 +344,11 @@ struct queue_limits {
 
 #ifdef CONFIG_BLK_DEV_ZONED
 
+/*
+ * Maximum number of zones to report with a single report zones command.
+ */
+#define BLK_ZONED_REPORT_MAX_ZONES	8192U
+
 extern unsigned int blkdev_nr_zones(struct block_device *bdev);
 extern int blkdev_report_zones(struct block_device *bdev,
 			       sector_t sector, struct blk_zone *zones,
-- 
cgit v1.2.3


From a101b043c44dfcb63bed7f29a675e9fa0259005e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 11 Jul 2019 16:33:12 -0400
Subject: SUNRPC: Fix transport accounting when caller specifies an rpc_xprt

Ensure that we do the required accounting for the round robin queue
when the caller to rpc_init_task() has passed in a transport to be
used.

Reported-by: Olga Kornievskaia <aglo@umich.edu>
Reported-by: Neil Brown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/clnt.c           | 42 ++++++++++++++++++++----------------------
 net/sunrpc/sched.c          |  3 ++-
 3 files changed, 24 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 4619098affa3..4e070e00c143 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -164,6 +164,8 @@ void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
 void		rpc_task_release_transport(struct rpc_task *);
 void		rpc_task_release_client(struct rpc_task *);
+struct rpc_xprt	*rpc_task_get_xprt(struct rpc_clnt *clnt,
+		struct rpc_xprt *xprt);
 
 int		rpcb_create_local(struct net *);
 void		rpcb_put_local(struct net *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index d599fab8adcb..383555d2b522 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -978,11 +978,10 @@ out:
 }
 EXPORT_SYMBOL_GPL(rpc_bind_new_program);
 
-static struct rpc_xprt *
-rpc_task_get_xprt(struct rpc_clnt *clnt)
+struct rpc_xprt *
+rpc_task_get_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
 {
 	struct rpc_xprt_switch *xps;
-	struct rpc_xprt *xprt= xprt_iter_get_next(&clnt->cl_xpi);
 
 	if (!xprt)
 		return NULL;
@@ -995,24 +994,6 @@ rpc_task_get_xprt(struct rpc_clnt *clnt)
 	return xprt;
 }
 
-static struct rpc_xprt *
-rpc_task_get_first_xprt(struct rpc_clnt *clnt)
-{
-	struct rpc_xprt_switch *xps;
-	struct rpc_xprt *xprt;
-
-	rcu_read_lock();
-	xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
-	if (xprt) {
-		atomic_long_inc(&xprt->queuelen);
-		xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
-		atomic_long_inc(&xps->xps_queuelen);
-	}
-	rcu_read_unlock();
-
-	return xprt;
-}
-
 static void
 rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
 {
@@ -1057,6 +1038,23 @@ void rpc_task_release_client(struct rpc_task *task)
 	}
 }
 
+static struct rpc_xprt *
+rpc_task_get_first_xprt(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt *xprt;
+
+	rcu_read_lock();
+	xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+	rcu_read_unlock();
+	return rpc_task_get_xprt(clnt, xprt);
+}
+
+static struct rpc_xprt *
+rpc_task_get_next_xprt(struct rpc_clnt *clnt)
+{
+	return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi));
+}
+
 static
 void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
 {
@@ -1065,7 +1063,7 @@ void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
 	if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
 		task->tk_xprt = rpc_task_get_first_xprt(clnt);
 	else
-		task->tk_xprt = rpc_task_get_xprt(clnt);
+		task->tk_xprt = rpc_task_get_next_xprt(clnt);
 }
 
 static
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 8a0779e963f9..1f275aba786f 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1092,7 +1092,8 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 	/* Initialize workqueue for async tasks */
 	task->tk_workqueue = task_setup_data->workqueue;
 
-	task->tk_xprt = xprt_get(task_setup_data->rpc_xprt);
+	task->tk_xprt = rpc_task_get_xprt(task_setup_data->rpc_client,
+			xprt_get(task_setup_data->rpc_xprt));
 
 	task->tk_op_cred = get_rpccred(task_setup_data->rpc_op_cred);
 
-- 
cgit v1.2.3


From 1df379924304b687263942452836db1d725155df Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 2 Jul 2019 12:03:50 +1000
Subject: clk: consoldiate the __clk_get_hw() declarations

Without this we were getting errors like:

In file included from drivers/clk/clkdev.c:22:0:
drivers/clk/clk.h:36:23: error: static declaration of '__clk_get_hw' follows non-static declaration
include/linux/clk-provider.h:808:16: note: previous declaration of '__clk_get_hw' was here

Fixes: 59fcdce425b7 ("clk: Remove ifdef for COMMON_CLK in clk-provider.h")
fixes: 73e0e496afda ("clkdev: Always allocate a struct clk and call __clk_get() w/ CCF")
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk.h             | 4 ----
 drivers/clk/imx/clk-imx6q.c   | 1 +
 drivers/clk/imx/clk-imx6sll.c | 1 +
 drivers/clk/imx/clk-imx6sx.c  | 1 +
 drivers/clk/imx/clk-imx6ul.c  | 1 +
 drivers/clk/imx/clk-imx7d.c   | 1 +
 drivers/clk/imx/clk.c         | 1 +
 include/linux/clk-provider.h  | 7 +++++++
 8 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/clk.h b/drivers/clk/clk.h
index d8400d623b34..2d801900cad5 100644
--- a/drivers/clk/clk.h
+++ b/drivers/clk/clk.h
@@ -33,10 +33,6 @@ clk_hw_create_clk(struct device *dev, struct clk_hw *hw, const char *dev_id,
 {
 	return (struct clk *)hw;
 }
-static struct clk_hw *__clk_get_hw(struct clk *clk)
-{
-	return (struct clk_hw *)clk;
-}
 static inline void __clk_put(struct clk *clk) { }
 
 #endif
diff --git a/drivers/clk/imx/clk-imx6q.c b/drivers/clk/imx/clk-imx6q.c
index 708e7c5590dd..fa5ef3cc2240 100644
--- a/drivers/clk/imx/clk-imx6q.c
+++ b/drivers/clk/imx/clk-imx6q.c
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/io.h>
 #include <linux/of.h>
diff --git a/drivers/clk/imx/clk-imx6sll.c b/drivers/clk/imx/clk-imx6sll.c
index 7eea448cb9a9..a9548c4b6d78 100644
--- a/drivers/clk/imx/clk-imx6sll.c
+++ b/drivers/clk/imx/clk-imx6sll.c
@@ -7,6 +7,7 @@
 #include <dt-bindings/clock/imx6sll-clock.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/io.h>
diff --git a/drivers/clk/imx/clk-imx6sx.c b/drivers/clk/imx/clk-imx6sx.c
index 91558b09bf9e..77748d6d4ccc 100644
--- a/drivers/clk/imx/clk-imx6sx.c
+++ b/drivers/clk/imx/clk-imx6sx.c
@@ -12,6 +12,7 @@
 #include <dt-bindings/clock/imx6sx-clock.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/io.h>
diff --git a/drivers/clk/imx/clk-imx6ul.c b/drivers/clk/imx/clk-imx6ul.c
index fd60d1549f71..e0e4625aacd0 100644
--- a/drivers/clk/imx/clk-imx6ul.c
+++ b/drivers/clk/imx/clk-imx6ul.c
@@ -12,6 +12,7 @@
 #include <dt-bindings/clock/imx6ul-clock.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/io.h>
diff --git a/drivers/clk/imx/clk-imx7d.c b/drivers/clk/imx/clk-imx7d.c
index 5b8a0c729f90..0ff3eb14d3af 100644
--- a/drivers/clk/imx/clk-imx7d.c
+++ b/drivers/clk/imx/clk-imx7d.c
@@ -12,6 +12,7 @@
 #include <dt-bindings/clock/imx7d-clock.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/io.h>
diff --git a/drivers/clk/imx/clk.c b/drivers/clk/imx/clk.c
index 1efed86217f7..588d1f45325d 100644
--- a/drivers/clk/imx/clk.c
+++ b/drivers/clk/imx/clk.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/clk.h>
+#include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/of.h>
 #include <linux/slab.h>
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 9ba000e3a50d..55d48140b0d0 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -805,7 +805,14 @@ void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw);
 /* helper functions */
 const char *__clk_get_name(const struct clk *clk);
 const char *clk_hw_get_name(const struct clk_hw *hw);
+#ifdef CONFIG_COMMON_CLK
 struct clk_hw *__clk_get_hw(struct clk *clk);
+#else
+static inline struct clk_hw *__clk_get_hw(struct clk *clk)
+{
+	return (struct clk_hw *)clk;
+}
+#endif
 unsigned int clk_hw_get_num_parents(const struct clk_hw *hw);
 struct clk_hw *clk_hw_get_parent(const struct clk_hw *hw);
 struct clk_hw *clk_hw_get_parent_by_index(const struct clk_hw *hw,
-- 
cgit v1.2.3


From c1a970d06f8cf390354a4a426976ed7f960b71f1 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Wed, 10 Jul 2019 20:12:29 +0300
Subject: net: sched: Fix NULL-pointer dereference in tc_indr_block_ing_cmd()

After recent refactoring of block offlads infrastructure, indr_dev->block
pointer is dereferenced before it is verified to be non-NULL. Example stack
trace where this behavior leads to NULL-pointer dereference error when
creating vxlan dev on system with mlx5 NIC with offloads enabled:

[ 1157.852938] ==================================================================
[ 1157.866877] BUG: KASAN: null-ptr-deref in tc_indr_block_ing_cmd.isra.41+0x9c/0x160
[ 1157.880877] Read of size 4 at addr 0000000000000090 by task ip/3829
[ 1157.901637] CPU: 22 PID: 3829 Comm: ip Not tainted 5.2.0-rc6+ #488
[ 1157.914438] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017
[ 1157.929031] Call Trace:
[ 1157.938318]  dump_stack+0x9a/0xeb
[ 1157.948362]  ? tc_indr_block_ing_cmd.isra.41+0x9c/0x160
[ 1157.960262]  ? tc_indr_block_ing_cmd.isra.41+0x9c/0x160
[ 1157.972082]  __kasan_report+0x176/0x192
[ 1157.982513]  ? tc_indr_block_ing_cmd.isra.41+0x9c/0x160
[ 1157.994348]  kasan_report+0xe/0x20
[ 1158.004324]  tc_indr_block_ing_cmd.isra.41+0x9c/0x160
[ 1158.015950]  ? tcf_block_setup+0x430/0x430
[ 1158.026558]  ? kasan_unpoison_shadow+0x30/0x40
[ 1158.037464]  __tc_indr_block_cb_register+0x5f5/0xf20
[ 1158.049288]  ? mlx5e_rep_indr_tc_block_unbind+0xa0/0xa0 [mlx5_core]
[ 1158.062344]  ? tc_indr_block_dev_put.part.47+0x5c0/0x5c0
[ 1158.074498]  ? rdma_roce_rescan_device+0x20/0x20 [ib_core]
[ 1158.086580]  ? br_device_event+0x98/0x480 [bridge]
[ 1158.097870]  ? strcmp+0x30/0x50
[ 1158.107578]  mlx5e_nic_rep_netdevice_event+0xdd/0x180 [mlx5_core]
[ 1158.120212]  notifier_call_chain+0x6d/0xa0
[ 1158.130753]  register_netdevice+0x6fc/0x7e0
[ 1158.141322]  ? netdev_change_features+0xa0/0xa0
[ 1158.152218]  ? vxlan_config_apply+0x210/0x310 [vxlan]
[ 1158.163593]  __vxlan_dev_create+0x2ad/0x520 [vxlan]
[ 1158.174770]  ? vxlan_changelink+0x490/0x490 [vxlan]
[ 1158.185870]  ? rcu_read_unlock+0x60/0x60 [vxlan]
[ 1158.196798]  vxlan_newlink+0x99/0xf0 [vxlan]
[ 1158.207303]  ? __vxlan_dev_create+0x520/0x520 [vxlan]
[ 1158.218601]  ? rtnl_create_link+0x3d0/0x450
[ 1158.228900]  __rtnl_newlink+0x8a7/0xb00
[ 1158.238701]  ? stack_access_ok+0x35/0x80
[ 1158.248450]  ? rtnl_link_unregister+0x1a0/0x1a0
[ 1158.258735]  ? find_held_lock+0x6d/0xd0
[ 1158.268379]  ? is_bpf_text_address+0x67/0xf0
[ 1158.278330]  ? lock_acquire+0xc1/0x1f0
[ 1158.287686]  ? is_bpf_text_address+0x5/0xf0
[ 1158.297449]  ? is_bpf_text_address+0x86/0xf0
[ 1158.307310]  ? kernel_text_address+0xec/0x100
[ 1158.317155]  ? arch_stack_walk+0x92/0xe0
[ 1158.326497]  ? __kernel_text_address+0xe/0x30
[ 1158.336213]  ? unwind_get_return_address+0x2f/0x50
[ 1158.346267]  ? create_prof_cpu_mask+0x20/0x20
[ 1158.355936]  ? arch_stack_walk+0x92/0xe0
[ 1158.365117]  ? stack_trace_save+0x8a/0xb0
[ 1158.374272]  ? stack_trace_consume_entry+0x80/0x80
[ 1158.384226]  ? match_held_lock+0x33/0x210
[ 1158.393216]  ? kasan_unpoison_shadow+0x30/0x40
[ 1158.402593]  rtnl_newlink+0x53/0x80
[ 1158.410925]  rtnetlink_rcv_msg+0x3a5/0x600
[ 1158.419777]  ? validate_linkmsg+0x400/0x400
[ 1158.428620]  ? find_held_lock+0x6d/0xd0
[ 1158.437117]  ? match_held_lock+0x1b/0x210
[ 1158.445760]  ? validate_linkmsg+0x400/0x400
[ 1158.454642]  netlink_rcv_skb+0xc7/0x1f0
[ 1158.463150]  ? netlink_ack+0x470/0x470
[ 1158.471538]  ? netlink_deliver_tap+0x1f3/0x5a0
[ 1158.480607]  netlink_unicast+0x2ae/0x350
[ 1158.489099]  ? netlink_attachskb+0x340/0x340
[ 1158.497935]  ? _copy_from_iter_full+0xde/0x3b0
[ 1158.506945]  ? __virt_addr_valid+0xb6/0xf0
[ 1158.515578]  ? __check_object_size+0x159/0x240
[ 1158.524515]  netlink_sendmsg+0x4d3/0x630
[ 1158.532879]  ? netlink_unicast+0x350/0x350
[ 1158.541400]  ? netlink_unicast+0x350/0x350
[ 1158.549805]  sock_sendmsg+0x94/0xa0
[ 1158.557561]  ___sys_sendmsg+0x49d/0x570
[ 1158.565625]  ? copy_msghdr_from_user+0x210/0x210
[ 1158.574457]  ? __fput+0x1e2/0x330
[ 1158.581948]  ? __kasan_slab_free+0x130/0x180
[ 1158.590407]  ? kmem_cache_free+0xb6/0x2d0
[ 1158.598574]  ? mark_lock+0xc7/0x790
[ 1158.606177]  ? task_work_run+0xcf/0x100
[ 1158.614165]  ? exit_to_usermode_loop+0x102/0x110
[ 1158.622954]  ? __lock_acquire+0x963/0x1ee0
[ 1158.631199]  ? lockdep_hardirqs_on+0x260/0x260
[ 1158.639777]  ? match_held_lock+0x1b/0x210
[ 1158.647918]  ? lockdep_hardirqs_on+0x260/0x260
[ 1158.656501]  ? match_held_lock+0x1b/0x210
[ 1158.664643]  ? __fget_light+0xa6/0xe0
[ 1158.672423]  ? __sys_sendmsg+0xd2/0x150
[ 1158.680334]  __sys_sendmsg+0xd2/0x150
[ 1158.688063]  ? __ia32_sys_shutdown+0x30/0x30
[ 1158.696435]  ? lock_downgrade+0x2e0/0x2e0
[ 1158.704541]  ? mark_held_locks+0x1a/0x90
[ 1158.712611]  ? mark_held_locks+0x1a/0x90
[ 1158.720619]  ? do_syscall_64+0x1e/0x2c0
[ 1158.728530]  do_syscall_64+0x78/0x2c0
[ 1158.736254]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1158.745414] RIP: 0033:0x7f62d505cb87
[ 1158.753070] Code: 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 80 00 00 00 00 8b 05 6a 2b 2c 00 48 63 d2 48 63 ff 85 c0 75 18 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 59 f3 c3 0f 1f 80 00 00[87/1817]
 48 89 f3 48
[ 1158.780924] RSP: 002b:00007fffd9832268 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
[ 1158.793204] RAX: ffffffffffffffda RBX: 000000005d26048f RCX: 00007f62d505cb87
[ 1158.805111] RDX: 0000000000000000 RSI: 00007fffd98322d0 RDI: 0000000000000003
[ 1158.817055] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000006
[ 1158.828987] R10: 00007f62d50ce260 R11: 0000000000000246 R12: 0000000000000001
[ 1158.840909] R13: 000000000067e540 R14: 0000000000000000 R15: 000000000067ed20
[ 1158.852873] ==================================================================

Introduce new function tcf_block_non_null_shared() that verifies block
pointer before dereferencing it to obtain index. Use the function in
tc_indr_block_ing_cmd() to prevent NULL pointer dereference.

Fixes: 955bcb6ea0df ("drivers: net: use flow block API")
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 10 ++++++++++
 net/sched/cls_api.c   |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index b03d466182db..841faadceb6e 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -60,6 +60,11 @@ static inline bool tcf_block_shared(struct tcf_block *block)
 	return block->index;
 }
 
+static inline bool tcf_block_non_null_shared(struct tcf_block *block)
+{
+	return block && block->index;
+}
+
 static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 {
 	WARN_ON(tcf_block_shared(block));
@@ -84,6 +89,11 @@ static inline bool tcf_block_shared(struct tcf_block *block)
 	return false;
 }
 
+static inline bool tcf_block_non_null_shared(struct tcf_block *block)
+{
+	return false;
+}
+
 static inline
 int tcf_block_get(struct tcf_block **p_block,
 		  struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 638c1bc1ea1b..278014e26aec 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -684,7 +684,7 @@ static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
 		.command	= command,
 		.binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
 		.net		= dev_net(indr_dev->dev),
-		.block_shared	= tcf_block_shared(indr_dev->block),
+		.block_shared	= tcf_block_non_null_shared(indr_dev->block),
 	};
 	INIT_LIST_HEAD(&bo.cb_list);
 
-- 
cgit v1.2.3


From 54638c6eaf445ecf901128599cfeb4620be47d2f Mon Sep 17 00:00:00 2001
From: Denis Efremov <efremov@linux.com>
Date: Wed, 10 Jul 2019 21:03:24 +0300
Subject: net: phy: make exported variables non-static

The variables phy_basic_ports_array, phy_fibre_port_array and
phy_all_ports_features_array are declared static and marked
EXPORT_SYMBOL_GPL(), which is at best an odd combination.
Because the variables were decided to be a part of API, this commit
removes the static attributes and adds the declarations to the header.

Fixes: 3c1bcc8614db ("net: ethernet: Convert phydev advertize and supported from u32 to link mode")
Signed-off-by: Denis Efremov <efremov@linux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 6 +++---
 include/linux/phy.h          | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 53878908adf4..6b5cb87f3866 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -56,19 +56,19 @@ EXPORT_SYMBOL_GPL(phy_10gbit_features);
 __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_fec_features) __ro_after_init;
 EXPORT_SYMBOL_GPL(phy_10gbit_fec_features);
 
-static const int phy_basic_ports_array[] = {
+const int phy_basic_ports_array[3] = {
 	ETHTOOL_LINK_MODE_Autoneg_BIT,
 	ETHTOOL_LINK_MODE_TP_BIT,
 	ETHTOOL_LINK_MODE_MII_BIT,
 };
 EXPORT_SYMBOL_GPL(phy_basic_ports_array);
 
-static const int phy_fibre_port_array[] = {
+const int phy_fibre_port_array[1] = {
 	ETHTOOL_LINK_MODE_FIBRE_BIT,
 };
 EXPORT_SYMBOL_GPL(phy_fibre_port_array);
 
-static const int phy_all_ports_features_array[] = {
+const int phy_all_ports_features_array[7] = {
 	ETHTOOL_LINK_MODE_Autoneg_BIT,
 	ETHTOOL_LINK_MODE_TP_BIT,
 	ETHTOOL_LINK_MODE_MII_BIT,
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 1739c6dc470e..462b90b73f93 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -55,6 +55,9 @@ extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_ini
 #define PHY_10GBIT_FEC_FEATURES ((unsigned long *)&phy_10gbit_fec_features)
 #define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features)
 
+extern const int phy_basic_ports_array[3];
+extern const int phy_fibre_port_array[1];
+extern const int phy_all_ports_features_array[7];
 extern const int phy_10_100_features_array[4];
 extern const int phy_basic_t1_features_array[2];
 extern const int phy_gbit_features_array[2];
-- 
cgit v1.2.3


From 8a58ddae23796c733c5dfbd717538d89d036c5bd Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Mon, 1 Jul 2019 14:07:55 +0300
Subject: perf/core: Fix exclusive events' grouping

So far, we tried to disallow grouping exclusive events for the fear of
complications they would cause with moving between contexts. Specifically,
moving a software group to a hardware context would violate the exclusivity
rules if both groups contain matching exclusive events.

This attempt was, however, unsuccessful: the check that we have in the
perf_event_open() syscall is both wrong (looks at wrong PMU) and
insufficient (group leader may still be exclusive), as can be illustrated
by running:

  $ perf record -e '{intel_pt//,cycles}' uname
  $ perf record -e '{cycles,intel_pt//}' uname

ultimately successfully.

Furthermore, we are completely free to trigger the exclusivity violation
by:

   perf -e '{cycles,intel_pt//}' -e '{intel_pt//,instructions}'

even though the helpful perf record will not allow that, the ABI will.

The warning later in the perf_event_open() path will also not trigger, because
it's also wrong.

Fix all this by validating the original group before moving, getting rid
of broken safeguards and placing a useful one to perf_install_in_context().

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: mathieu.poirier@linaro.org
Cc: will.deacon@arm.com
Fixes: bed5b25ad9c8a ("perf: Add a pmu capability for "exclusive" events")
Link: https://lkml.kernel.org/r/20190701110755.24646-1-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  5 +++++
 kernel/events/core.c       | 34 ++++++++++++++++++++++------------
 2 files changed, 27 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 16e38c286d46..e8ad3c590a23 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1055,6 +1055,11 @@ static inline int in_software_context(struct perf_event *event)
 	return event->ctx->pmu->task_ctx_nr == perf_sw_context;
 }
 
+static inline int is_exclusive_pmu(struct pmu *pmu)
+{
+	return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE;
+}
+
 extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5dd19bedbf64..eea9d52b010c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2553,6 +2553,9 @@ unlock:
 	return ret;
 }
 
+static bool exclusive_event_installable(struct perf_event *event,
+					struct perf_event_context *ctx);
+
 /*
  * Attach a performance event to a context.
  *
@@ -2567,6 +2570,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 
 	lockdep_assert_held(&ctx->mutex);
 
+	WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
+
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
@@ -4360,7 +4365,7 @@ static int exclusive_event_init(struct perf_event *event)
 {
 	struct pmu *pmu = event->pmu;
 
-	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+	if (!is_exclusive_pmu(pmu))
 		return 0;
 
 	/*
@@ -4391,7 +4396,7 @@ static void exclusive_event_destroy(struct perf_event *event)
 {
 	struct pmu *pmu = event->pmu;
 
-	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+	if (!is_exclusive_pmu(pmu))
 		return;
 
 	/* see comment in exclusive_event_init() */
@@ -4411,14 +4416,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
 	return false;
 }
 
-/* Called under the same ctx::mutex as perf_install_in_context() */
 static bool exclusive_event_installable(struct perf_event *event,
 					struct perf_event_context *ctx)
 {
 	struct perf_event *iter_event;
 	struct pmu *pmu = event->pmu;
 
-	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+	lockdep_assert_held(&ctx->mutex);
+
+	if (!is_exclusive_pmu(pmu))
 		return true;
 
 	list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
@@ -10947,11 +10953,6 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_alloc;
 	}
 
-	if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
-		err = -EBUSY;
-		goto err_context;
-	}
-
 	/*
 	 * Look up the group leader (we will attach this event to it):
 	 */
@@ -11039,6 +11040,18 @@ SYSCALL_DEFINE5(perf_event_open,
 				move_group = 0;
 			}
 		}
+
+		/*
+		 * Failure to create exclusive events returns -EBUSY.
+		 */
+		err = -EBUSY;
+		if (!exclusive_event_installable(group_leader, ctx))
+			goto err_locked;
+
+		for_each_sibling_event(sibling, group_leader) {
+			if (!exclusive_event_installable(sibling, ctx))
+				goto err_locked;
+		}
 	} else {
 		mutex_lock(&ctx->mutex);
 	}
@@ -11075,9 +11088,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	 * because we need to serialize with concurrent event creation.
 	 */
 	if (!exclusive_event_installable(event, ctx)) {
-		/* exclusive and group stuff are assumed mutually exclusive */
-		WARN_ON_ONCE(move_group);
-
 		err = -EBUSY;
 		goto err_locked;
 	}
-- 
cgit v1.2.3


From 028b6e8a89de9133a869bb4cd1bc72445b1ec8ca Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Sun, 14 Jul 2019 19:20:47 +0300
Subject: clone: fix CLONE_PIDFD support

The introduction of clone3 syscall accidentally broke CLONE_PIDFD
support in traditional clone syscall on compat x86 and those
architectures that use do_fork to implement clone syscall.

This bug was found by strace test suite.

Link: https://strace.io/logs/strace/2019-07-12
Fixes: 7f192e3cd316 ("fork: add clone3")
Bisected-and-tested-by: Anatoly Pugachev <matorola@gmail.com>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Link: https://lore.kernel.org/r/20190714162047.GB10389@altlinux.org
Signed-off-by: Christian Brauner <christian@brauner.io>
---
 arch/x86/ia32/sys_ia32.c   |  4 ++++
 include/linux/sched/task.h |  1 +
 kernel/fork.c              | 17 +++++++++++++++--
 3 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 64a6c952091e..21790307121e 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -239,6 +239,7 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
 {
 	struct kernel_clone_args args = {
 		.flags		= (clone_flags & ~CSIGNAL),
+		.pidfd		= parent_tidptr,
 		.child_tid	= child_tidptr,
 		.parent_tid	= parent_tidptr,
 		.exit_signal	= (clone_flags & CSIGNAL),
@@ -246,5 +247,8 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
 		.tls		= tls_val,
 	};
 
+	if (!legacy_clone_args_valid(&args))
+		return -EINVAL;
+
 	return _do_fork(&args);
 }
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 109a0df5af39..0497091e40c1 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -89,6 +89,7 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern long _do_fork(struct kernel_clone_args *kargs);
+extern bool legacy_clone_args_valid(const struct kernel_clone_args *kargs);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8f3e2d97d771..ef1e05a68827 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2406,6 +2406,16 @@ long _do_fork(struct kernel_clone_args *args)
 	return nr;
 }
 
+bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
+{
+	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+	if ((kargs->flags & CLONE_PIDFD) &&
+	    (kargs->flags & CLONE_PARENT_SETTID))
+		return false;
+
+	return true;
+}
+
 #ifndef CONFIG_HAVE_COPY_THREAD_TLS
 /* For compatibility with architectures that call do_fork directly rather than
  * using the syscall entry points below. */
@@ -2417,6 +2427,7 @@ long do_fork(unsigned long clone_flags,
 {
 	struct kernel_clone_args args = {
 		.flags		= (clone_flags & ~CSIGNAL),
+		.pidfd		= parent_tidptr,
 		.child_tid	= child_tidptr,
 		.parent_tid	= parent_tidptr,
 		.exit_signal	= (clone_flags & CSIGNAL),
@@ -2424,6 +2435,9 @@ long do_fork(unsigned long clone_flags,
 		.stack_size	= stack_size,
 	};
 
+	if (!legacy_clone_args_valid(&args))
+		return -EINVAL;
+
 	return _do_fork(&args);
 }
 #endif
@@ -2505,8 +2519,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		.tls		= tls,
 	};
 
-	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
-	if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+	if (!legacy_clone_args_valid(&args))
 		return -EINVAL;
 
 	return _do_fork(&args);
-- 
cgit v1.2.3


From 05a70a8ec287c4381ee1441ca779deab7bae124d Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Sun, 14 Jul 2019 21:22:05 +0200
Subject: unistd: protect clone3 via __ARCH_WANT_SYS_CLONE3

This lets us catch new architectures that implicitly make use of clone3
without setting __ARCH_WANT_SYS_CLONE3.
Failing on missing __ARCH_WANT_SYS_CLONE3 is a good indicator that they
either did not really want this syscall or haven't really thought about
whether it needs special treatment and just accidently included it in
their entrypoints by e.g. generating their syscall table automatically
via asm-generic/unistd.h

This patch has been compile-tested for the h8300 architecture which is
one of the architectures that does not yet implement clone3 and
generates its syscall table via asm-generic/unistd.h.

Signed-off-by: Christian Brauner <christian@brauner.io>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20190714192205.27190-3-christian@brauner.io
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Christian Brauner <christian@brauner.io>
---
 include/uapi/asm-generic/unistd.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 9acfff0cd153..1be0e798e362 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -846,8 +846,10 @@ __SYSCALL(__NR_fsmount, sys_fsmount)
 __SYSCALL(__NR_fspick, sys_fspick)
 #define __NR_pidfd_open 434
 __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
+#ifdef __ARCH_WANT_SYS_CLONE3
 #define __NR_clone3 435
 __SYSCALL(__NR_clone3, sys_clone3)
+#endif
 
 #undef __NR_syscalls
 #define __NR_syscalls 436
-- 
cgit v1.2.3


From 6dfc43d3a19174faead54575c204aee106225f43 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 15 Jul 2019 15:16:20 +1000
Subject: mm: adjust apply_to_pfn_range interface for dropped token.

mm/pgtable: drop pgtable_t variable from pte_fn_t functions
drops the token came in via the hmm tree, this caused lots of
conflicts, but applying this cleanup patch should reduce it
to something easier to handle. Just accept the token is unused
at this point.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 include/linux/mm.h    | 2 +-
 mm/as_dirty_helpers.c | 6 ++----
 mm/memory.c           | 6 +++---
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 798cdda9560e..c45f936bd81c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2687,7 +2687,7 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
 
 struct pfn_range_apply;
-typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
+typedef int (*pter_fn_t)(pte_t *pte, unsigned long addr,
 			 struct pfn_range_apply *closure);
 struct pfn_range_apply {
 	struct mm_struct *mm;
diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c
index f600e31534fb..6352a3729408 100644
--- a/mm/as_dirty_helpers.c
+++ b/mm/as_dirty_helpers.c
@@ -26,7 +26,6 @@ struct apply_as {
 /**
  * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
  * @pte: Pointer to the pte
- * @token: Page table token, see apply_to_pfn_range()
  * @addr: The virtual page address
  * @closure: Pointer to a struct pfn_range_apply embedded in a
  * struct apply_as
@@ -36,7 +35,7 @@ struct apply_as {
  *
  * Return: Always zero.
  */
-static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
+static int apply_pt_wrprotect(pte_t *pte,
 			      unsigned long addr,
 			      struct pfn_range_apply *closure)
 {
@@ -78,7 +77,6 @@ struct apply_as_clean {
 /**
  * apply_pt_clean - Leaf pte callback to clean a pte
  * @pte: Pointer to the pte
- * @token: Page table token, see apply_to_pfn_range()
  * @addr: The virtual page address
  * @closure: Pointer to a struct pfn_range_apply embedded in a
  * struct apply_as_clean
@@ -91,7 +89,7 @@ struct apply_as_clean {
  *
  * Return: Always zero.
  */
-static int apply_pt_clean(pte_t *pte, pgtable_t token,
+static int apply_pt_clean(pte_t *pte,
 			  unsigned long addr,
 			  struct pfn_range_apply *closure)
 {
diff --git a/mm/memory.c b/mm/memory.c
index 462aa47f8878..b8218e962231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2053,7 +2053,7 @@ static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
 	token = pmd_pgtable(*pmd);
 
 	do {
-		err = closure->ptefn(pte++, token, addr, closure);
+		err = closure->ptefn(pte++, addr, closure);
 		if (err)
 			break;
 	} while (addr += PAGE_SIZE, addr != end);
@@ -2194,14 +2194,14 @@ struct page_range_apply {
  * Callback wrapper to enable use of apply_to_pfn_range for
  * the apply_to_page_range interface
  */
-static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
+static int apply_to_page_range_wrapper(pte_t *pte,
 				       unsigned long addr,
 				       struct pfn_range_apply *pter)
 {
 	struct page_range_apply *pra =
 		container_of(pter, typeof(*pra), pter);
 
-	return pra->fn(pte, token, addr, pra->data);
+	return pra->fn(pte, NULL, addr, pra->data);
 }
 
 /*
-- 
cgit v1.2.3


From dfd6f9ad36368b8dbd5f5a2b2f0a4705ae69a323 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 12 Jul 2019 11:01:21 +0200
Subject: ACPI: fix false-positive -Wuninitialized warning

clang gets confused by an uninitialized variable in what looks
to it like a never executed code path:

arch/x86/kernel/acpi/boot.c:618:13: error: variable 'polarity' is uninitialized when used here [-Werror,-Wuninitialized]
        polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
                   ^~~~~~~~
arch/x86/kernel/acpi/boot.c:606:32: note: initialize the variable 'polarity' to silence this warning
        int rc, irq, trigger, polarity;
                                      ^
                                       = 0
arch/x86/kernel/acpi/boot.c:617:12: error: variable 'trigger' is uninitialized when used here [-Werror,-Wuninitialized]
        trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
                  ^~~~~~~
arch/x86/kernel/acpi/boot.c:606:22: note: initialize the variable 'trigger' to silence this warning
        int rc, irq, trigger, polarity;
                            ^
                             = 0

This is unfortunately a design decision in clang and won't be fixed.

Changing the acpi_get_override_irq() macro to an inline function
reliably avoids the issue.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 451e7b544342..8309923eafe1 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -324,7 +324,10 @@ struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags,
 #ifdef CONFIG_X86_IO_APIC
 extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
 #else
-#define acpi_get_override_irq(gsi, trigger, polarity) (-1)
+static inline int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
+{
+	return -1;
+}
 #endif
 /*
  * This function undoes the effect of one call to acpi_register_gsi().
-- 
cgit v1.2.3


From 8da04e05cdfc715d414a1c5f8318c03030eb68fb Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 15 Jul 2019 09:56:30 +1000
Subject: intel_rapl: need linux/cpuhotplug.h for enum cpuhp_state

Fixes: 7ebf8eff63b4 ("intel_rapl: introduce struct rapl_if_private")
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/intel_rapl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 0c179d92d110..efb3ce892c20 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -12,6 +12,7 @@
 
 #include <linux/types.h>
 #include <linux/powercap.h>
+#include <linux/cpuhotplug.h>
 
 enum rapl_domain_type {
 	RAPL_DOMAIN_PACKAGE,	/* entire package/socket */
-- 
cgit v1.2.3


From 387b14684f94483cbbb72843db406ec9a8d0d6d2 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Wed, 10 Apr 2019 08:32:41 -0300
Subject: docs: locking: convert docs to ReST and rename to *.rst

Convert the locking documents to ReST and add them to the
kernel development book where it belongs.

Most of the stuff here is just to make Sphinx to properly
parse the text file, as they're already in good shape,
not requiring massive changes in order to be parsed.

The conversion is actually:
  - add blank lines and identation in order to identify paragraphs;
  - fix tables markups;
  - add some lists markups;
  - mark literal blocks;
  - adjust title markups.

At its new index.rst, let's add a :orphan: while this is not linked to
the main index.rst file, in order to avoid build warnings.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Federico Vaga <federico.vaga@vaga.pv.it>
---
 Documentation/kernel-hacking/locking.rst           |   2 +-
 Documentation/locking/index.rst                    |  24 +
 Documentation/locking/lockdep-design.rst           | 394 ++++++++++++++
 Documentation/locking/lockdep-design.txt           | 389 --------------
 Documentation/locking/lockstat.rst                 | 204 ++++++++
 Documentation/locking/lockstat.txt                 | 183 -------
 Documentation/locking/locktorture.rst              | 170 ++++++
 Documentation/locking/locktorture.txt              | 145 ------
 Documentation/locking/mutex-design.rst             | 152 ++++++
 Documentation/locking/mutex-design.txt             | 142 -----
 Documentation/locking/rt-mutex-design.rst          | 574 +++++++++++++++++++++
 Documentation/locking/rt-mutex-design.txt          | 559 --------------------
 Documentation/locking/rt-mutex.rst                 |  77 +++
 Documentation/locking/rt-mutex.txt                 |  73 ---
 Documentation/locking/spinlocks.rst                | 177 +++++++
 Documentation/locking/spinlocks.txt                | 167 ------
 Documentation/locking/ww-mutex-design.rst          | 393 ++++++++++++++
 Documentation/locking/ww-mutex-design.txt          | 383 --------------
 Documentation/pi-futex.txt                         |   2 +-
 .../translations/it_IT/kernel-hacking/locking.rst  |   2 +-
 drivers/gpu/drm/drm_modeset_lock.c                 |   2 +-
 include/linux/lockdep.h                            |   2 +-
 include/linux/mutex.h                              |   2 +-
 include/linux/rwsem.h                              |   2 +-
 kernel/locking/mutex.c                             |   2 +-
 kernel/locking/rtmutex.c                           |   2 +-
 lib/Kconfig.debug                                  |   4 +-
 27 files changed, 2176 insertions(+), 2052 deletions(-)
 create mode 100644 Documentation/locking/index.rst
 create mode 100644 Documentation/locking/lockdep-design.rst
 delete mode 100644 Documentation/locking/lockdep-design.txt
 create mode 100644 Documentation/locking/lockstat.rst
 delete mode 100644 Documentation/locking/lockstat.txt
 create mode 100644 Documentation/locking/locktorture.rst
 delete mode 100644 Documentation/locking/locktorture.txt
 create mode 100644 Documentation/locking/mutex-design.rst
 delete mode 100644 Documentation/locking/mutex-design.txt
 create mode 100644 Documentation/locking/rt-mutex-design.rst
 delete mode 100644 Documentation/locking/rt-mutex-design.txt
 create mode 100644 Documentation/locking/rt-mutex.rst
 delete mode 100644 Documentation/locking/rt-mutex.txt
 create mode 100644 Documentation/locking/spinlocks.rst
 delete mode 100644 Documentation/locking/spinlocks.txt
 create mode 100644 Documentation/locking/ww-mutex-design.rst
 delete mode 100644 Documentation/locking/ww-mutex-design.txt

(limited to 'include')

diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst
index dc698ea456e0..a8518ac0d31d 100644
--- a/Documentation/kernel-hacking/locking.rst
+++ b/Documentation/kernel-hacking/locking.rst
@@ -1364,7 +1364,7 @@ Futex API reference
 Further reading
 ===============
 
--  ``Documentation/locking/spinlocks.txt``: Linus Torvalds' spinlocking
+-  ``Documentation/locking/spinlocks.rst``: Linus Torvalds' spinlocking
    tutorial in the kernel sources.
 
 -  Unix Systems for Modern Architectures: Symmetric Multiprocessing and
diff --git a/Documentation/locking/index.rst b/Documentation/locking/index.rst
new file mode 100644
index 000000000000..ef5da7fe9aac
--- /dev/null
+++ b/Documentation/locking/index.rst
@@ -0,0 +1,24 @@
+:orphan:
+
+=======
+locking
+=======
+
+.. toctree::
+    :maxdepth: 1
+
+    lockdep-design
+    lockstat
+    locktorture
+    mutex-design
+    rt-mutex-design
+    rt-mutex
+    spinlocks
+    ww-mutex-design
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/locking/lockdep-design.rst b/Documentation/locking/lockdep-design.rst
new file mode 100644
index 000000000000..23fcbc4d3fc0
--- /dev/null
+++ b/Documentation/locking/lockdep-design.rst
@@ -0,0 +1,394 @@
+Runtime locking correctness validator
+=====================================
+
+started by Ingo Molnar <mingo@redhat.com>
+
+additions by Arjan van de Ven <arjan@linux.intel.com>
+
+Lock-class
+----------
+
+The basic object the validator operates upon is a 'class' of locks.
+
+A class of locks is a group of locks that are logically the same with
+respect to locking rules, even if the locks may have multiple (possibly
+tens of thousands of) instantiations. For example a lock in the inode
+struct is one class, while each inode has its own instantiation of that
+lock class.
+
+The validator tracks the 'usage state' of lock-classes, and it tracks
+the dependencies between different lock-classes. Lock usage indicates
+how a lock is used with regard to its IRQ contexts, while lock
+dependency can be understood as lock order, where L1 -> L2 suggests that
+a task is attempting to acquire L2 while holding L1. From lockdep's
+perspective, the two locks (L1 and L2) are not necessarily related; that
+dependency just means the order ever happened. The validator maintains a
+continuing effort to prove lock usages and dependencies are correct or
+the validator will shoot a splat if incorrect.
+
+A lock-class's behavior is constructed by its instances collectively:
+when the first instance of a lock-class is used after bootup the class
+gets registered, then all (subsequent) instances will be mapped to the
+class and hence their usages and dependecies will contribute to those of
+the class. A lock-class does not go away when a lock instance does, but
+it can be removed if the memory space of the lock class (static or
+dynamic) is reclaimed, this happens for example when a module is
+unloaded or a workqueue is destroyed.
+
+State
+-----
+
+The validator tracks lock-class usage history and divides the usage into
+(4 usages * n STATEs + 1) categories:
+
+where the 4 usages can be:
+- 'ever held in STATE context'
+- 'ever held as readlock in STATE context'
+- 'ever held with STATE enabled'
+- 'ever held as readlock with STATE enabled'
+
+where the n STATEs are coded in kernel/locking/lockdep_states.h and as of
+now they include:
+- hardirq
+- softirq
+
+where the last 1 category is:
+- 'ever used'                                       [ == !unused        ]
+
+When locking rules are violated, these usage bits are presented in the
+locking error messages, inside curlies, with a total of 2 * n STATEs bits.
+A contrived example::
+
+   modprobe/2287 is trying to acquire lock:
+    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+
+   but task is already holding lock:
+    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+
+
+For a given lock, the bit positions from left to right indicate the usage
+of the lock and readlock (if exists), for each of the n STATEs listed
+above respectively, and the character displayed at each bit position
+indicates:
+
+   ===  ===================================================
+   '.'  acquired while irqs disabled and not in irq context
+   '-'  acquired in irq context
+   '+'  acquired with irqs enabled
+   '?'  acquired in irq context with irqs enabled.
+   ===  ===================================================
+
+The bits are illustrated with an example::
+
+    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+                         ||||
+                         ||| \-> softirq disabled and not in softirq context
+                         || \--> acquired in softirq context
+                         | \---> hardirq disabled and not in hardirq context
+                          \----> acquired in hardirq context
+
+
+For a given STATE, whether the lock is ever acquired in that STATE
+context and whether that STATE is enabled yields four possible cases as
+shown in the table below. The bit character is able to indicate which
+exact case is for the lock as of the reporting time.
+
+  +--------------+-------------+--------------+
+  |              | irq enabled | irq disabled |
+  +--------------+-------------+--------------+
+  | ever in irq  |      ?      |       -      |
+  +--------------+-------------+--------------+
+  | never in irq |      +      |       .      |
+  +--------------+-------------+--------------+
+
+The character '-' suggests irq is disabled because if otherwise the
+charactor '?' would have been shown instead. Similar deduction can be
+applied for '+' too.
+
+Unused locks (e.g., mutexes) cannot be part of the cause of an error.
+
+
+Single-lock state rules:
+------------------------
+
+A lock is irq-safe means it was ever used in an irq context, while a lock
+is irq-unsafe means it was ever acquired with irq enabled.
+
+A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The
+following states must be exclusive: only one of them is allowed to be set
+for any lock-class based on its usage::
+
+ <hardirq-safe> or <hardirq-unsafe>
+ <softirq-safe> or <softirq-unsafe>
+
+This is because if a lock can be used in irq context (irq-safe) then it
+cannot be ever acquired with irq enabled (irq-unsafe). Otherwise, a
+deadlock may happen. For example, in the scenario that after this lock
+was acquired but before released, if the context is interrupted this
+lock will be attempted to acquire twice, which creates a deadlock,
+referred to as lock recursion deadlock.
+
+The validator detects and reports lock usage that violates these
+single-lock state rules.
+
+Multi-lock dependency rules:
+----------------------------
+
+The same lock-class must not be acquired twice, because this could lead
+to lock recursion deadlocks.
+
+Furthermore, two locks can not be taken in inverse order::
+
+ <L1> -> <L2>
+ <L2> -> <L1>
+
+because this could lead to a deadlock - referred to as lock inversion
+deadlock - as attempts to acquire the two locks form a circle which
+could lead to the two contexts waiting for each other permanently. The
+validator will find such dependency circle in arbitrary complexity,
+i.e., there can be any other locking sequence between the acquire-lock
+operations; the validator will still find whether these locks can be
+acquired in a circular fashion.
+
+Furthermore, the following usage based lock dependencies are not allowed
+between any two lock-classes::
+
+   <hardirq-safe>   ->  <hardirq-unsafe>
+   <softirq-safe>   ->  <softirq-unsafe>
+
+The first rule comes from the fact that a hardirq-safe lock could be
+taken by a hardirq context, interrupting a hardirq-unsafe lock - and
+thus could result in a lock inversion deadlock. Likewise, a softirq-safe
+lock could be taken by an softirq context, interrupting a softirq-unsafe
+lock.
+
+The above rules are enforced for any locking sequence that occurs in the
+kernel: when acquiring a new lock, the validator checks whether there is
+any rule violation between the new lock and any of the held locks.
+
+When a lock-class changes its state, the following aspects of the above
+dependency rules are enforced:
+
+- if a new hardirq-safe lock is discovered, we check whether it
+  took any hardirq-unsafe lock in the past.
+
+- if a new softirq-safe lock is discovered, we check whether it took
+  any softirq-unsafe lock in the past.
+
+- if a new hardirq-unsafe lock is discovered, we check whether any
+  hardirq-safe lock took it in the past.
+
+- if a new softirq-unsafe lock is discovered, we check whether any
+  softirq-safe lock took it in the past.
+
+(Again, we do these checks too on the basis that an interrupt context
+could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which
+could lead to a lock inversion deadlock - even if that lock scenario did
+not trigger in practice yet.)
+
+Exception: Nested data dependencies leading to nested locking
+-------------------------------------------------------------
+
+There are a few cases where the Linux kernel acquires more than one
+instance of the same lock-class. Such cases typically happen when there
+is some sort of hierarchy within objects of the same type. In these
+cases there is an inherent "natural" ordering between the two objects
+(defined by the properties of the hierarchy), and the kernel grabs the
+locks in this fixed order on each of the objects.
+
+An example of such an object hierarchy that results in "nested locking"
+is that of a "whole disk" block-dev object and a "partition" block-dev
+object; the partition is "part of" the whole device and as long as one
+always takes the whole disk lock as a higher lock than the partition
+lock, the lock ordering is fully correct. The validator does not
+automatically detect this natural ordering, as the locking rule behind
+the ordering is not static.
+
+In order to teach the validator about this correct usage model, new
+versions of the various locking primitives were added that allow you to
+specify a "nesting level". An example call, for the block device mutex,
+looks like this::
+
+  enum bdev_bd_mutex_lock_class
+  {
+       BD_MUTEX_NORMAL,
+       BD_MUTEX_WHOLE,
+       BD_MUTEX_PARTITION
+  };
+
+mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION);
+
+In this case the locking is done on a bdev object that is known to be a
+partition.
+
+The validator treats a lock that is taken in such a nested fashion as a
+separate (sub)class for the purposes of validation.
+
+Note: When changing code to use the _nested() primitives, be careful and
+check really thoroughly that the hierarchy is correctly mapped; otherwise
+you can get false positives or false negatives.
+
+Annotations
+-----------
+
+Two constructs can be used to annotate and check where and if certain locks
+must be held: lockdep_assert_held*(&lock) and lockdep_*pin_lock(&lock).
+
+As the name suggests, lockdep_assert_held* family of macros assert that a
+particular lock is held at a certain time (and generate a WARN() otherwise).
+This annotation is largely used all over the kernel, e.g. kernel/sched/
+core.c::
+
+  void update_rq_clock(struct rq *rq)
+  {
+	s64 delta;
+
+	lockdep_assert_held(&rq->lock);
+	[...]
+  }
+
+where holding rq->lock is required to safely update a rq's clock.
+
+The other family of macros is lockdep_*pin_lock(), which is admittedly only
+used for rq->lock ATM. Despite their limited adoption these annotations
+generate a WARN() if the lock of interest is "accidentally" unlocked. This turns
+out to be especially helpful to debug code with callbacks, where an upper
+layer assumes a lock remains taken, but a lower layer thinks it can maybe drop
+and reacquire the lock ("unwittingly" introducing races). lockdep_pin_lock()
+returns a 'struct pin_cookie' that is then used by lockdep_unpin_lock() to check
+that nobody tampered with the lock, e.g. kernel/sched/sched.h::
+
+  static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
+  {
+	rf->cookie = lockdep_pin_lock(&rq->lock);
+	[...]
+  }
+
+  static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
+  {
+	[...]
+	lockdep_unpin_lock(&rq->lock, rf->cookie);
+  }
+
+While comments about locking requirements might provide useful information,
+the runtime checks performed by annotations are invaluable when debugging
+locking problems and they carry the same level of details when inspecting
+code.  Always prefer annotations when in doubt!
+
+Proof of 100% correctness:
+--------------------------
+
+The validator achieves perfect, mathematical 'closure' (proof of locking
+correctness) in the sense that for every simple, standalone single-task
+locking sequence that occurred at least once during the lifetime of the
+kernel, the validator proves it with a 100% certainty that no
+combination and timing of these locking sequences can cause any class of
+lock related deadlock. [1]_
+
+I.e. complex multi-CPU and multi-task locking scenarios do not have to
+occur in practice to prove a deadlock: only the simple 'component'
+locking chains have to occur at least once (anytime, in any
+task/context) for the validator to be able to prove correctness. (For
+example, complex deadlocks that would normally need more than 3 CPUs and
+a very unlikely constellation of tasks, irq-contexts and timings to
+occur, can be detected on a plain, lightly loaded single-CPU system as
+well!)
+
+This radically decreases the complexity of locking related QA of the
+kernel: what has to be done during QA is to trigger as many "simple"
+single-task locking dependencies in the kernel as possible, at least
+once, to prove locking correctness - instead of having to trigger every
+possible combination of locking interaction between CPUs, combined with
+every possible hardirq and softirq nesting scenario (which is impossible
+to do in practice).
+
+.. [1]
+
+    assuming that the validator itself is 100% correct, and no other
+    part of the system corrupts the state of the validator in any way.
+    We also assume that all NMI/SMM paths [which could interrupt
+    even hardirq-disabled codepaths] are correct and do not interfere
+    with the validator. We also assume that the 64-bit 'chain hash'
+    value is unique for every lock-chain in the system. Also, lock
+    recursion must not be higher than 20.
+
+Performance:
+------------
+
+The above rules require **massive** amounts of runtime checking. If we did
+that for every lock taken and for every irqs-enable event, it would
+render the system practically unusably slow. The complexity of checking
+is O(N^2), so even with just a few hundred lock-classes we'd have to do
+tens of thousands of checks for every event.
+
+This problem is solved by checking any given 'locking scenario' (unique
+sequence of locks taken after each other) only once. A simple stack of
+held locks is maintained, and a lightweight 64-bit hash value is
+calculated, which hash is unique for every lock chain. The hash value,
+when the chain is validated for the first time, is then put into a hash
+table, which hash-table can be checked in a lockfree manner. If the
+locking chain occurs again later on, the hash table tells us that we
+don't have to validate the chain again.
+
+Troubleshooting:
+----------------
+
+The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes.
+Exceeding this number will trigger the following lockdep warning:
+
+	(DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+
+By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical
+desktop systems have less than 1,000 lock classes, so this warning
+normally results from lock-class leakage or failure to properly
+initialize locks.  These two problems are illustrated below:
+
+1.	Repeated module loading and unloading while running the validator
+	will result in lock-class leakage.  The issue here is that each
+	load of the module will create a new set of lock classes for
+	that module's locks, but module unloading does not remove old
+	classes (see below discussion of reuse of lock classes for why).
+	Therefore, if that module is loaded and unloaded repeatedly,
+	the number of lock classes will eventually reach the maximum.
+
+2.	Using structures such as arrays that have large numbers of
+	locks that are not explicitly initialized.  For example,
+	a hash table with 8192 buckets where each bucket has its own
+	spinlock_t will consume 8192 lock classes -unless- each spinlock
+	is explicitly initialized at runtime, for example, using the
+	run-time spin_lock_init() as opposed to compile-time initializers
+	such as __SPIN_LOCK_UNLOCKED().  Failure to properly initialize
+	the per-bucket spinlocks would guarantee lock-class overflow.
+	In contrast, a loop that called spin_lock_init() on each lock
+	would place all 8192 locks into a single lock class.
+
+	The moral of this story is that you should always explicitly
+	initialize your locks.
+
+One might argue that the validator should be modified to allow
+lock classes to be reused.  However, if you are tempted to make this
+argument, first review the code and think through the changes that would
+be required, keeping in mind that the lock classes to be removed are
+likely to be linked into the lock-dependency graph.  This turns out to
+be harder to do than to say.
+
+Of course, if you do run out of lock classes, the next thing to do is
+to find the offending lock classes.  First, the following command gives
+you the number of lock classes currently in use along with the maximum::
+
+	grep "lock-classes" /proc/lockdep_stats
+
+This command produces the following output on a modest system::
+
+	lock-classes:                          748 [max: 8191]
+
+If the number allocated (748 above) increases continually over time,
+then there is likely a leak.  The following command can be used to
+identify the leaking lock classes::
+
+	grep "BD" /proc/lockdep
+
+Run the command and save the output, then compare against the output from
+a later run of this command to identify the leakers.  This same output
+can also help you find situations where runtime lock initialization has
+been omitted.
diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt
deleted file mode 100644
index f189d130e543..000000000000
--- a/Documentation/locking/lockdep-design.txt
+++ /dev/null
@@ -1,389 +0,0 @@
-Runtime locking correctness validator
-=====================================
-
-started by Ingo Molnar <mingo@redhat.com>
-additions by Arjan van de Ven <arjan@linux.intel.com>
-
-Lock-class
-----------
-
-The basic object the validator operates upon is a 'class' of locks.
-
-A class of locks is a group of locks that are logically the same with
-respect to locking rules, even if the locks may have multiple (possibly
-tens of thousands of) instantiations. For example a lock in the inode
-struct is one class, while each inode has its own instantiation of that
-lock class.
-
-The validator tracks the 'usage state' of lock-classes, and it tracks
-the dependencies between different lock-classes. Lock usage indicates
-how a lock is used with regard to its IRQ contexts, while lock
-dependency can be understood as lock order, where L1 -> L2 suggests that
-a task is attempting to acquire L2 while holding L1. From lockdep's
-perspective, the two locks (L1 and L2) are not necessarily related; that
-dependency just means the order ever happened. The validator maintains a
-continuing effort to prove lock usages and dependencies are correct or
-the validator will shoot a splat if incorrect.
-
-A lock-class's behavior is constructed by its instances collectively:
-when the first instance of a lock-class is used after bootup the class
-gets registered, then all (subsequent) instances will be mapped to the
-class and hence their usages and dependecies will contribute to those of
-the class. A lock-class does not go away when a lock instance does, but
-it can be removed if the memory space of the lock class (static or
-dynamic) is reclaimed, this happens for example when a module is
-unloaded or a workqueue is destroyed.
-
-State
------
-
-The validator tracks lock-class usage history and divides the usage into
-(4 usages * n STATEs + 1) categories:
-
-where the 4 usages can be:
-- 'ever held in STATE context'
-- 'ever held as readlock in STATE context'
-- 'ever held with STATE enabled'
-- 'ever held as readlock with STATE enabled'
-
-where the n STATEs are coded in kernel/locking/lockdep_states.h and as of
-now they include:
-- hardirq
-- softirq
-
-where the last 1 category is:
-- 'ever used'                                       [ == !unused        ]
-
-When locking rules are violated, these usage bits are presented in the
-locking error messages, inside curlies, with a total of 2 * n STATEs bits.
-A contrived example:
-
-   modprobe/2287 is trying to acquire lock:
-    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
-
-   but task is already holding lock:
-    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
-
-
-For a given lock, the bit positions from left to right indicate the usage
-of the lock and readlock (if exists), for each of the n STATEs listed
-above respectively, and the character displayed at each bit position
-indicates:
-
-   '.'  acquired while irqs disabled and not in irq context
-   '-'  acquired in irq context
-   '+'  acquired with irqs enabled
-   '?'  acquired in irq context with irqs enabled.
-
-The bits are illustrated with an example:
-
-    (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
-                         ||||
-                         ||| \-> softirq disabled and not in softirq context
-                         || \--> acquired in softirq context
-                         | \---> hardirq disabled and not in hardirq context
-                          \----> acquired in hardirq context
-
-
-For a given STATE, whether the lock is ever acquired in that STATE
-context and whether that STATE is enabled yields four possible cases as
-shown in the table below. The bit character is able to indicate which
-exact case is for the lock as of the reporting time.
-
-   -------------------------------------------
-  |              | irq enabled | irq disabled |
-  |-------------------------------------------|
-  | ever in irq  |      ?      |       -      |
-  |-------------------------------------------|
-  | never in irq |      +      |       .      |
-   -------------------------------------------
-
-The character '-' suggests irq is disabled because if otherwise the
-charactor '?' would have been shown instead. Similar deduction can be
-applied for '+' too.
-
-Unused locks (e.g., mutexes) cannot be part of the cause of an error.
-
-
-Single-lock state rules:
-------------------------
-
-A lock is irq-safe means it was ever used in an irq context, while a lock
-is irq-unsafe means it was ever acquired with irq enabled.
-
-A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The
-following states must be exclusive: only one of them is allowed to be set
-for any lock-class based on its usage:
-
- <hardirq-safe> or <hardirq-unsafe>
- <softirq-safe> or <softirq-unsafe>
-
-This is because if a lock can be used in irq context (irq-safe) then it
-cannot be ever acquired with irq enabled (irq-unsafe). Otherwise, a
-deadlock may happen. For example, in the scenario that after this lock
-was acquired but before released, if the context is interrupted this
-lock will be attempted to acquire twice, which creates a deadlock,
-referred to as lock recursion deadlock.
-
-The validator detects and reports lock usage that violates these
-single-lock state rules.
-
-Multi-lock dependency rules:
-----------------------------
-
-The same lock-class must not be acquired twice, because this could lead
-to lock recursion deadlocks.
-
-Furthermore, two locks can not be taken in inverse order:
-
- <L1> -> <L2>
- <L2> -> <L1>
-
-because this could lead to a deadlock - referred to as lock inversion
-deadlock - as attempts to acquire the two locks form a circle which
-could lead to the two contexts waiting for each other permanently. The
-validator will find such dependency circle in arbitrary complexity,
-i.e., there can be any other locking sequence between the acquire-lock
-operations; the validator will still find whether these locks can be
-acquired in a circular fashion.
-
-Furthermore, the following usage based lock dependencies are not allowed
-between any two lock-classes:
-
-   <hardirq-safe>   ->  <hardirq-unsafe>
-   <softirq-safe>   ->  <softirq-unsafe>
-
-The first rule comes from the fact that a hardirq-safe lock could be
-taken by a hardirq context, interrupting a hardirq-unsafe lock - and
-thus could result in a lock inversion deadlock. Likewise, a softirq-safe
-lock could be taken by an softirq context, interrupting a softirq-unsafe
-lock.
-
-The above rules are enforced for any locking sequence that occurs in the
-kernel: when acquiring a new lock, the validator checks whether there is
-any rule violation between the new lock and any of the held locks.
-
-When a lock-class changes its state, the following aspects of the above
-dependency rules are enforced:
-
-- if a new hardirq-safe lock is discovered, we check whether it
-  took any hardirq-unsafe lock in the past.
-
-- if a new softirq-safe lock is discovered, we check whether it took
-  any softirq-unsafe lock in the past.
-
-- if a new hardirq-unsafe lock is discovered, we check whether any
-  hardirq-safe lock took it in the past.
-
-- if a new softirq-unsafe lock is discovered, we check whether any
-  softirq-safe lock took it in the past.
-
-(Again, we do these checks too on the basis that an interrupt context
-could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which
-could lead to a lock inversion deadlock - even if that lock scenario did
-not trigger in practice yet.)
-
-Exception: Nested data dependencies leading to nested locking
--------------------------------------------------------------
-
-There are a few cases where the Linux kernel acquires more than one
-instance of the same lock-class. Such cases typically happen when there
-is some sort of hierarchy within objects of the same type. In these
-cases there is an inherent "natural" ordering between the two objects
-(defined by the properties of the hierarchy), and the kernel grabs the
-locks in this fixed order on each of the objects.
-
-An example of such an object hierarchy that results in "nested locking"
-is that of a "whole disk" block-dev object and a "partition" block-dev
-object; the partition is "part of" the whole device and as long as one
-always takes the whole disk lock as a higher lock than the partition
-lock, the lock ordering is fully correct. The validator does not
-automatically detect this natural ordering, as the locking rule behind
-the ordering is not static.
-
-In order to teach the validator about this correct usage model, new
-versions of the various locking primitives were added that allow you to
-specify a "nesting level". An example call, for the block device mutex,
-looks like this:
-
-enum bdev_bd_mutex_lock_class
-{
-       BD_MUTEX_NORMAL,
-       BD_MUTEX_WHOLE,
-       BD_MUTEX_PARTITION
-};
-
- mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION);
-
-In this case the locking is done on a bdev object that is known to be a
-partition.
-
-The validator treats a lock that is taken in such a nested fashion as a
-separate (sub)class for the purposes of validation.
-
-Note: When changing code to use the _nested() primitives, be careful and
-check really thoroughly that the hierarchy is correctly mapped; otherwise
-you can get false positives or false negatives.
-
-Annotations
------------
-
-Two constructs can be used to annotate and check where and if certain locks
-must be held: lockdep_assert_held*(&lock) and lockdep_*pin_lock(&lock).
-
-As the name suggests, lockdep_assert_held* family of macros assert that a
-particular lock is held at a certain time (and generate a WARN() otherwise).
-This annotation is largely used all over the kernel, e.g. kernel/sched/
-core.c
-
-  void update_rq_clock(struct rq *rq)
-  {
-	s64 delta;
-
-	lockdep_assert_held(&rq->lock);
-	[...]
-  }
-
-where holding rq->lock is required to safely update a rq's clock.
-
-The other family of macros is lockdep_*pin_lock(), which is admittedly only
-used for rq->lock ATM. Despite their limited adoption these annotations
-generate a WARN() if the lock of interest is "accidentally" unlocked. This turns
-out to be especially helpful to debug code with callbacks, where an upper
-layer assumes a lock remains taken, but a lower layer thinks it can maybe drop
-and reacquire the lock ("unwittingly" introducing races). lockdep_pin_lock()
-returns a 'struct pin_cookie' that is then used by lockdep_unpin_lock() to check
-that nobody tampered with the lock, e.g. kernel/sched/sched.h
-
-  static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
-  {
-	rf->cookie = lockdep_pin_lock(&rq->lock);
-	[...]
-  }
-
-  static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
-  {
-	[...]
-	lockdep_unpin_lock(&rq->lock, rf->cookie);
-  }
-
-While comments about locking requirements might provide useful information,
-the runtime checks performed by annotations are invaluable when debugging
-locking problems and they carry the same level of details when inspecting
-code.  Always prefer annotations when in doubt!
-
-Proof of 100% correctness:
---------------------------
-
-The validator achieves perfect, mathematical 'closure' (proof of locking
-correctness) in the sense that for every simple, standalone single-task
-locking sequence that occurred at least once during the lifetime of the
-kernel, the validator proves it with a 100% certainty that no
-combination and timing of these locking sequences can cause any class of
-lock related deadlock. [*]
-
-I.e. complex multi-CPU and multi-task locking scenarios do not have to
-occur in practice to prove a deadlock: only the simple 'component'
-locking chains have to occur at least once (anytime, in any
-task/context) for the validator to be able to prove correctness. (For
-example, complex deadlocks that would normally need more than 3 CPUs and
-a very unlikely constellation of tasks, irq-contexts and timings to
-occur, can be detected on a plain, lightly loaded single-CPU system as
-well!)
-
-This radically decreases the complexity of locking related QA of the
-kernel: what has to be done during QA is to trigger as many "simple"
-single-task locking dependencies in the kernel as possible, at least
-once, to prove locking correctness - instead of having to trigger every
-possible combination of locking interaction between CPUs, combined with
-every possible hardirq and softirq nesting scenario (which is impossible
-to do in practice).
-
-[*] assuming that the validator itself is 100% correct, and no other
-    part of the system corrupts the state of the validator in any way.
-    We also assume that all NMI/SMM paths [which could interrupt
-    even hardirq-disabled codepaths] are correct and do not interfere
-    with the validator. We also assume that the 64-bit 'chain hash'
-    value is unique for every lock-chain in the system. Also, lock
-    recursion must not be higher than 20.
-
-Performance:
-------------
-
-The above rules require _massive_ amounts of runtime checking. If we did
-that for every lock taken and for every irqs-enable event, it would
-render the system practically unusably slow. The complexity of checking
-is O(N^2), so even with just a few hundred lock-classes we'd have to do
-tens of thousands of checks for every event.
-
-This problem is solved by checking any given 'locking scenario' (unique
-sequence of locks taken after each other) only once. A simple stack of
-held locks is maintained, and a lightweight 64-bit hash value is
-calculated, which hash is unique for every lock chain. The hash value,
-when the chain is validated for the first time, is then put into a hash
-table, which hash-table can be checked in a lockfree manner. If the
-locking chain occurs again later on, the hash table tells us that we
-don't have to validate the chain again.
-
-Troubleshooting:
-----------------
-
-The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes.
-Exceeding this number will trigger the following lockdep warning:
-
-	(DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
-
-By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical
-desktop systems have less than 1,000 lock classes, so this warning
-normally results from lock-class leakage or failure to properly
-initialize locks.  These two problems are illustrated below:
-
-1.	Repeated module loading and unloading while running the validator
-	will result in lock-class leakage.  The issue here is that each
-	load of the module will create a new set of lock classes for
-	that module's locks, but module unloading does not remove old
-	classes (see below discussion of reuse of lock classes for why).
-	Therefore, if that module is loaded and unloaded repeatedly,
-	the number of lock classes will eventually reach the maximum.
-
-2.	Using structures such as arrays that have large numbers of
-	locks that are not explicitly initialized.  For example,
-	a hash table with 8192 buckets where each bucket has its own
-	spinlock_t will consume 8192 lock classes -unless- each spinlock
-	is explicitly initialized at runtime, for example, using the
-	run-time spin_lock_init() as opposed to compile-time initializers
-	such as __SPIN_LOCK_UNLOCKED().  Failure to properly initialize
-	the per-bucket spinlocks would guarantee lock-class overflow.
-	In contrast, a loop that called spin_lock_init() on each lock
-	would place all 8192 locks into a single lock class.
-
-	The moral of this story is that you should always explicitly
-	initialize your locks.
-
-One might argue that the validator should be modified to allow
-lock classes to be reused.  However, if you are tempted to make this
-argument, first review the code and think through the changes that would
-be required, keeping in mind that the lock classes to be removed are
-likely to be linked into the lock-dependency graph.  This turns out to
-be harder to do than to say.
-
-Of course, if you do run out of lock classes, the next thing to do is
-to find the offending lock classes.  First, the following command gives
-you the number of lock classes currently in use along with the maximum:
-
-	grep "lock-classes" /proc/lockdep_stats
-
-This command produces the following output on a modest system:
-
-	 lock-classes:                          748 [max: 8191]
-
-If the number allocated (748 above) increases continually over time,
-then there is likely a leak.  The following command can be used to
-identify the leaking lock classes:
-
-	grep "BD" /proc/lockdep
-
-Run the command and save the output, then compare against the output from
-a later run of this command to identify the leakers.  This same output
-can also help you find situations where runtime lock initialization has
-been omitted.
diff --git a/Documentation/locking/lockstat.rst b/Documentation/locking/lockstat.rst
new file mode 100644
index 000000000000..536eab8dbd99
--- /dev/null
+++ b/Documentation/locking/lockstat.rst
@@ -0,0 +1,204 @@
+===============
+Lock Statistics
+===============
+
+What
+====
+
+As the name suggests, it provides statistics on locks.
+
+
+Why
+===
+
+Because things like lock contention can severely impact performance.
+
+How
+===
+
+Lockdep already has hooks in the lock functions and maps lock instances to
+lock classes. We build on that (see Documentation/locking/lockdep-design.rst).
+The graph below shows the relation between the lock functions and the various
+hooks therein::
+
+        __acquire
+            |
+           lock _____
+            |        \
+            |    __contended
+            |         |
+            |       <wait>
+            | _______/
+            |/
+            |
+       __acquired
+            |
+            .
+          <hold>
+            .
+            |
+       __release
+            |
+         unlock
+
+  lock, unlock	- the regular lock functions
+  __*		- the hooks
+  <> 		- states
+
+With these hooks we provide the following statistics:
+
+ con-bounces
+	- number of lock contention that involved x-cpu data
+ contentions
+	- number of lock acquisitions that had to wait
+ wait time
+     min
+	- shortest (non-0) time we ever had to wait for a lock
+     max
+	- longest time we ever had to wait for a lock
+     total
+	- total time we spend waiting on this lock
+     avg
+	- average time spent waiting on this lock
+ acq-bounces
+	- number of lock acquisitions that involved x-cpu data
+ acquisitions
+	- number of times we took the lock
+ hold time
+     min
+	- shortest (non-0) time we ever held the lock
+     max
+	- longest time we ever held the lock
+     total
+	- total time this lock was held
+     avg
+	- average time this lock was held
+
+These numbers are gathered per lock class, per read/write state (when
+applicable).
+
+It also tracks 4 contention points per class. A contention point is a call site
+that had to wait on lock acquisition.
+
+Configuration
+-------------
+
+Lock statistics are enabled via CONFIG_LOCK_STAT.
+
+Usage
+-----
+
+Enable collection of statistics::
+
+	# echo 1 >/proc/sys/kernel/lock_stat
+
+Disable collection of statistics::
+
+	# echo 0 >/proc/sys/kernel/lock_stat
+
+Look at the current lock statistics::
+
+  ( line numbers not part of actual output, done for clarity in the explanation
+    below )
+
+  # less /proc/lock_stat
+
+  01 lock_stat version 0.4
+  02-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+  03                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total   waittime-avg    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total   holdtime-avg
+  04-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+  05
+  06                         &mm->mmap_sem-W:            46             84           0.26         939.10       16371.53         194.90          47291        2922365           0.16     2220301.69 17464026916.32        5975.99
+  07                         &mm->mmap_sem-R:            37            100           1.31      299502.61      325629.52        3256.30         212344       34316685           0.10        7744.91    95016910.20           2.77
+  08                         ---------------
+  09                           &mm->mmap_sem              1          [<ffffffff811502a7>] khugepaged_scan_mm_slot+0x57/0x280
+  10                           &mm->mmap_sem             96          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
+  11                           &mm->mmap_sem             34          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
+  12                           &mm->mmap_sem             17          [<ffffffff81127e71>] vm_munmap+0x41/0x80
+  13                         ---------------
+  14                           &mm->mmap_sem              1          [<ffffffff81046fda>] dup_mmap+0x2a/0x3f0
+  15                           &mm->mmap_sem             60          [<ffffffff81129e29>] SyS_mprotect+0xe9/0x250
+  16                           &mm->mmap_sem             41          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
+  17                           &mm->mmap_sem             68          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
+  18
+  19.............................................................................................................................................................................................................................
+  20
+  21                         unix_table_lock:           110            112           0.21          49.24         163.91           1.46          21094          66312           0.12         624.42       31589.81           0.48
+  22                         ---------------
+  23                         unix_table_lock             45          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
+  24                         unix_table_lock             47          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
+  25                         unix_table_lock             15          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
+  26                         unix_table_lock              5          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
+  27                         ---------------
+  28                         unix_table_lock             39          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
+  29                         unix_table_lock             49          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
+  30                         unix_table_lock             20          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
+  31                         unix_table_lock              4          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
+
+
+This excerpt shows the first two lock class statistics. Line 01 shows the
+output version - each time the format changes this will be updated. Line 02-04
+show the header with column descriptions. Lines 05-18 and 20-31 show the actual
+statistics. These statistics come in two parts; the actual stats separated by a
+short separator (line 08, 13) from the contention points.
+
+Lines 09-12 show the first 4 recorded contention points (the code
+which tries to get the lock) and lines 14-17 show the first 4 recorded
+contended points (the lock holder). It is possible that the max
+con-bounces point is missing in the statistics.
+
+The first lock (05-18) is a read/write lock, and shows two lines above the
+short separator. The contention points don't match the column descriptors,
+they have two: contentions and [<IP>] symbol. The second set of contention
+points are the points we're contending with.
+
+The integer part of the time values is in us.
+
+Dealing with nested locks, subclasses may appear::
+
+  32...........................................................................................................................................................................................................................
+  33
+  34                               &rq->lock:       13128          13128           0.43         190.53      103881.26           7.91          97454        3453404           0.00         401.11    13224683.11           3.82
+  35                               ---------
+  36                               &rq->lock          645          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+  37                               &rq->lock          297          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+  38                               &rq->lock          360          [<ffffffff8103c4c5>] select_task_rq_fair+0x1f0/0x74a
+  39                               &rq->lock          428          [<ffffffff81045f98>] scheduler_tick+0x46/0x1fb
+  40                               ---------
+  41                               &rq->lock           77          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+  42                               &rq->lock          174          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+  43                               &rq->lock         4715          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+  44                               &rq->lock          893          [<ffffffff81340524>] schedule+0x157/0x7b8
+  45
+  46...........................................................................................................................................................................................................................
+  47
+  48                             &rq->lock/1:        1526          11488           0.33         388.73      136294.31          11.86          21461          38404           0.00          37.93      109388.53           2.84
+  49                             -----------
+  50                             &rq->lock/1        11526          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+  51                             -----------
+  52                             &rq->lock/1         5645          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+  53                             &rq->lock/1         1224          [<ffffffff81340524>] schedule+0x157/0x7b8
+  54                             &rq->lock/1         4336          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+  55                             &rq->lock/1          181          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+
+Line 48 shows statistics for the second subclass (/1) of &rq->lock class
+(subclass starts from 0), since in this case, as line 50 suggests,
+double_rq_lock actually acquires a nested lock of two spinlocks.
+
+View the top contending locks::
+
+  # grep : /proc/lock_stat | head
+			clockevents_lock:       2926159        2947636           0.15       46882.81  1784540466.34         605.41        3381345        3879161           0.00        2260.97    53178395.68          13.71
+		     tick_broadcast_lock:        346460         346717           0.18        2257.43    39364622.71         113.54        3642919        4242696           0.00        2263.79    49173646.60          11.59
+		  &mapping->i_mmap_mutex:        203896         203899           3.36      645530.05 31767507988.39      155800.21        3361776        8893984           0.17        2254.15    14110121.02           1.59
+			       &rq->lock:        135014         136909           0.18         606.09      842160.68           6.15        1540728       10436146           0.00         728.72    17606683.41           1.69
+	       &(&zone->lru_lock)->rlock:         93000          94934           0.16          59.18      188253.78           1.98        1199912        3809894           0.15         391.40     3559518.81           0.93
+			 tasklist_lock-W:         40667          41130           0.23        1189.42      428980.51          10.43         270278         510106           0.16         653.51     3939674.91           7.72
+			 tasklist_lock-R:         21298          21305           0.20        1310.05      215511.12          10.12         186204         241258           0.14        1162.33     1179779.23           4.89
+			      rcu_node_1:         47656          49022           0.16         635.41      193616.41           3.95         844888        1865423           0.00         764.26     1656226.96           0.89
+       &(&dentry->d_lockref.lock)->rlock:         39791          40179           0.15        1302.08       88851.96           2.21        2790851       12527025           0.10        1910.75     3379714.27           0.27
+			      rcu_node_0:         29203          30064           0.16         786.55     1555573.00          51.74          88963         244254           0.00         398.87      428872.51           1.76
+
+Clear the statistics::
+
+  # echo 0 > /proc/lock_stat
diff --git a/Documentation/locking/lockstat.txt b/Documentation/locking/lockstat.txt
deleted file mode 100644
index fdbeb0c45ef3..000000000000
--- a/Documentation/locking/lockstat.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-
-LOCK STATISTICS
-
-- WHAT
-
-As the name suggests, it provides statistics on locks.
-
-- WHY
-
-Because things like lock contention can severely impact performance.
-
-- HOW
-
-Lockdep already has hooks in the lock functions and maps lock instances to
-lock classes. We build on that (see Documentation/locking/lockdep-design.txt).
-The graph below shows the relation between the lock functions and the various
-hooks therein.
-
-        __acquire
-            |
-           lock _____
-            |        \
-            |    __contended
-            |         |
-            |       <wait>
-            | _______/
-            |/
-            |
-       __acquired
-            |
-            .
-          <hold>
-            .
-            |
-       __release
-            |
-         unlock
-
-lock, unlock	- the regular lock functions
-__*		- the hooks
-<> 		- states
-
-With these hooks we provide the following statistics:
-
- con-bounces       - number of lock contention that involved x-cpu data
- contentions       - number of lock acquisitions that had to wait
- wait time min     - shortest (non-0) time we ever had to wait for a lock
-           max     - longest time we ever had to wait for a lock
-	   total   - total time we spend waiting on this lock
-	   avg     - average time spent waiting on this lock
- acq-bounces       - number of lock acquisitions that involved x-cpu data
- acquisitions      - number of times we took the lock
- hold time min     - shortest (non-0) time we ever held the lock
-	   max     - longest time we ever held the lock
-	   total   - total time this lock was held
-	   avg     - average time this lock was held
-
-These numbers are gathered per lock class, per read/write state (when
-applicable).
-
-It also tracks 4 contention points per class. A contention point is a call site
-that had to wait on lock acquisition.
-
- - CONFIGURATION
-
-Lock statistics are enabled via CONFIG_LOCK_STAT.
-
- - USAGE
-
-Enable collection of statistics:
-
-# echo 1 >/proc/sys/kernel/lock_stat
-
-Disable collection of statistics:
-
-# echo 0 >/proc/sys/kernel/lock_stat
-
-Look at the current lock statistics:
-
-( line numbers not part of actual output, done for clarity in the explanation
-  below )
-
-# less /proc/lock_stat
-
-01 lock_stat version 0.4
-02-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-03                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total   waittime-avg    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total   holdtime-avg
-04-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-05
-06                         &mm->mmap_sem-W:            46             84           0.26         939.10       16371.53         194.90          47291        2922365           0.16     2220301.69 17464026916.32        5975.99
-07                         &mm->mmap_sem-R:            37            100           1.31      299502.61      325629.52        3256.30         212344       34316685           0.10        7744.91    95016910.20           2.77
-08                         ---------------
-09                           &mm->mmap_sem              1          [<ffffffff811502a7>] khugepaged_scan_mm_slot+0x57/0x280
-10                           &mm->mmap_sem             96          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
-11                           &mm->mmap_sem             34          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
-12                           &mm->mmap_sem             17          [<ffffffff81127e71>] vm_munmap+0x41/0x80
-13                         ---------------
-14                           &mm->mmap_sem              1          [<ffffffff81046fda>] dup_mmap+0x2a/0x3f0
-15                           &mm->mmap_sem             60          [<ffffffff81129e29>] SyS_mprotect+0xe9/0x250
-16                           &mm->mmap_sem             41          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
-17                           &mm->mmap_sem             68          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
-18
-19.............................................................................................................................................................................................................................
-20
-21                         unix_table_lock:           110            112           0.21          49.24         163.91           1.46          21094          66312           0.12         624.42       31589.81           0.48
-22                         ---------------
-23                         unix_table_lock             45          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
-24                         unix_table_lock             47          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
-25                         unix_table_lock             15          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
-26                         unix_table_lock              5          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
-27                         ---------------
-28                         unix_table_lock             39          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
-29                         unix_table_lock             49          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
-30                         unix_table_lock             20          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
-31                         unix_table_lock              4          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
-
-
-This excerpt shows the first two lock class statistics. Line 01 shows the
-output version - each time the format changes this will be updated. Line 02-04
-show the header with column descriptions. Lines 05-18 and 20-31 show the actual
-statistics. These statistics come in two parts; the actual stats separated by a
-short separator (line 08, 13) from the contention points.
-
-Lines 09-12 show the first 4 recorded contention points (the code
-which tries to get the lock) and lines 14-17 show the first 4 recorded
-contended points (the lock holder). It is possible that the max
-con-bounces point is missing in the statistics.
-
-The first lock (05-18) is a read/write lock, and shows two lines above the
-short separator. The contention points don't match the column descriptors,
-they have two: contentions and [<IP>] symbol. The second set of contention
-points are the points we're contending with.
-
-The integer part of the time values is in us.
-
-Dealing with nested locks, subclasses may appear:
-
-32...........................................................................................................................................................................................................................
-33
-34                               &rq->lock:       13128          13128           0.43         190.53      103881.26           7.91          97454        3453404           0.00         401.11    13224683.11           3.82
-35                               ---------
-36                               &rq->lock          645          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
-37                               &rq->lock          297          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-38                               &rq->lock          360          [<ffffffff8103c4c5>] select_task_rq_fair+0x1f0/0x74a
-39                               &rq->lock          428          [<ffffffff81045f98>] scheduler_tick+0x46/0x1fb
-40                               ---------
-41                               &rq->lock           77          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
-42                               &rq->lock          174          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-43                               &rq->lock         4715          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
-44                               &rq->lock          893          [<ffffffff81340524>] schedule+0x157/0x7b8
-45
-46...........................................................................................................................................................................................................................
-47
-48                             &rq->lock/1:        1526          11488           0.33         388.73      136294.31          11.86          21461          38404           0.00          37.93      109388.53           2.84
-49                             -----------
-50                             &rq->lock/1        11526          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
-51                             -----------
-52                             &rq->lock/1         5645          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
-53                             &rq->lock/1         1224          [<ffffffff81340524>] schedule+0x157/0x7b8
-54                             &rq->lock/1         4336          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
-55                             &rq->lock/1          181          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-
-Line 48 shows statistics for the second subclass (/1) of &rq->lock class
-(subclass starts from 0), since in this case, as line 50 suggests,
-double_rq_lock actually acquires a nested lock of two spinlocks.
-
-View the top contending locks:
-
-# grep : /proc/lock_stat | head
-			clockevents_lock:       2926159        2947636           0.15       46882.81  1784540466.34         605.41        3381345        3879161           0.00        2260.97    53178395.68          13.71
-		     tick_broadcast_lock:        346460         346717           0.18        2257.43    39364622.71         113.54        3642919        4242696           0.00        2263.79    49173646.60          11.59
-		  &mapping->i_mmap_mutex:        203896         203899           3.36      645530.05 31767507988.39      155800.21        3361776        8893984           0.17        2254.15    14110121.02           1.59
-			       &rq->lock:        135014         136909           0.18         606.09      842160.68           6.15        1540728       10436146           0.00         728.72    17606683.41           1.69
-	       &(&zone->lru_lock)->rlock:         93000          94934           0.16          59.18      188253.78           1.98        1199912        3809894           0.15         391.40     3559518.81           0.93
-			 tasklist_lock-W:         40667          41130           0.23        1189.42      428980.51          10.43         270278         510106           0.16         653.51     3939674.91           7.72
-			 tasklist_lock-R:         21298          21305           0.20        1310.05      215511.12          10.12         186204         241258           0.14        1162.33     1179779.23           4.89
-			      rcu_node_1:         47656          49022           0.16         635.41      193616.41           3.95         844888        1865423           0.00         764.26     1656226.96           0.89
-       &(&dentry->d_lockref.lock)->rlock:         39791          40179           0.15        1302.08       88851.96           2.21        2790851       12527025           0.10        1910.75     3379714.27           0.27
-			      rcu_node_0:         29203          30064           0.16         786.55     1555573.00          51.74          88963         244254           0.00         398.87      428872.51           1.76
-
-Clear the statistics:
-
-# echo 0 > /proc/lock_stat
diff --git a/Documentation/locking/locktorture.rst b/Documentation/locking/locktorture.rst
new file mode 100644
index 000000000000..e79eeeca3ac6
--- /dev/null
+++ b/Documentation/locking/locktorture.rst
@@ -0,0 +1,170 @@
+==================================
+Kernel Lock Torture Test Operation
+==================================
+
+CONFIG_LOCK_TORTURE_TEST
+========================
+
+The CONFIG LOCK_TORTURE_TEST config option provides a kernel module
+that runs torture tests on core kernel locking primitives. The kernel
+module, 'locktorture', may be built after the fact on the running
+kernel to be tested, if desired. The tests periodically output status
+messages via printk(), which can be examined via the dmesg (perhaps
+grepping for "torture").  The test is started when the module is loaded,
+and stops when the module is unloaded. This program is based on how RCU
+is tortured, via rcutorture.
+
+This torture test consists of creating a number of kernel threads which
+acquire the lock and hold it for specific amount of time, thus simulating
+different critical region behaviors. The amount of contention on the lock
+can be simulated by either enlarging this critical region hold time and/or
+creating more kthreads.
+
+
+Module Parameters
+=================
+
+This module has the following parameters:
+
+
+Locktorture-specific
+--------------------
+
+nwriters_stress
+		  Number of kernel threads that will stress exclusive lock
+		  ownership (writers). The default value is twice the number
+		  of online CPUs.
+
+nreaders_stress
+		  Number of kernel threads that will stress shared lock
+		  ownership (readers). The default is the same amount of writer
+		  locks. If the user did not specify nwriters_stress, then
+		  both readers and writers be the amount of online CPUs.
+
+torture_type
+		  Type of lock to torture. By default, only spinlocks will
+		  be tortured. This module can torture the following locks,
+		  with string values as follows:
+
+		     - "lock_busted":
+				Simulates a buggy lock implementation.
+
+		     - "spin_lock":
+				spin_lock() and spin_unlock() pairs.
+
+		     - "spin_lock_irq":
+				spin_lock_irq() and spin_unlock_irq() pairs.
+
+		     - "rw_lock":
+				read/write lock() and unlock() rwlock pairs.
+
+		     - "rw_lock_irq":
+				read/write lock_irq() and unlock_irq()
+				rwlock pairs.
+
+		     - "mutex_lock":
+				mutex_lock() and mutex_unlock() pairs.
+
+		     - "rtmutex_lock":
+				rtmutex_lock() and rtmutex_unlock() pairs.
+				Kernel must have CONFIG_RT_MUTEX=y.
+
+		     - "rwsem_lock":
+				read/write down() and up() semaphore pairs.
+
+
+Torture-framework (RCU + locking)
+---------------------------------
+
+shutdown_secs
+		  The number of seconds to run the test before terminating
+		  the test and powering off the system.  The default is
+		  zero, which disables test termination and system shutdown.
+		  This capability is useful for automated testing.
+
+onoff_interval
+		  The number of seconds between each attempt to execute a
+		  randomly selected CPU-hotplug operation.  Defaults
+		  to zero, which disables CPU hotplugging.  In
+		  CONFIG_HOTPLUG_CPU=n kernels, locktorture will silently
+		  refuse to do any CPU-hotplug operations regardless of
+		  what value is specified for onoff_interval.
+
+onoff_holdoff
+		  The number of seconds to wait until starting CPU-hotplug
+		  operations.  This would normally only be used when
+		  locktorture was built into the kernel and started
+		  automatically at boot time, in which case it is useful
+		  in order to avoid confusing boot-time code with CPUs
+		  coming and going. This parameter is only useful if
+		  CONFIG_HOTPLUG_CPU is enabled.
+
+stat_interval
+		  Number of seconds between statistics-related printk()s.
+		  By default, locktorture will report stats every 60 seconds.
+		  Setting the interval to zero causes the statistics to
+		  be printed -only- when the module is unloaded, and this
+		  is the default.
+
+stutter
+		  The length of time to run the test before pausing for this
+		  same period of time.  Defaults to "stutter=5", so as
+		  to run and pause for (roughly) five-second intervals.
+		  Specifying "stutter=0" causes the test to run continuously
+		  without pausing, which is the old default behavior.
+
+shuffle_interval
+		  The number of seconds to keep the test threads affinitied
+		  to a particular subset of the CPUs, defaults to 3 seconds.
+		  Used in conjunction with test_no_idle_hz.
+
+verbose
+		  Enable verbose debugging printing, via printk(). Enabled
+		  by default. This extra information is mostly related to
+		  high-level errors and reports from the main 'torture'
+		  framework.
+
+
+Statistics
+==========
+
+Statistics are printed in the following format::
+
+  spin_lock-torture: Writes:  Total: 93746064  Max/Min: 0/0   Fail: 0
+     (A)		    (B)		   (C)		  (D)	       (E)
+
+  (A): Lock type that is being tortured -- torture_type parameter.
+
+  (B): Number of writer lock acquisitions. If dealing with a read/write
+       primitive a second "Reads" statistics line is printed.
+
+  (C): Number of times the lock was acquired.
+
+  (D): Min and max number of times threads failed to acquire the lock.
+
+  (E): true/false values if there were errors acquiring the lock. This should
+       -only- be positive if there is a bug in the locking primitive's
+       implementation. Otherwise a lock should never fail (i.e., spin_lock()).
+       Of course, the same applies for (C), above. A dummy example of this is
+       the "lock_busted" type.
+
+Usage
+=====
+
+The following script may be used to torture locks::
+
+	#!/bin/sh
+
+	modprobe locktorture
+	sleep 3600
+	rmmod locktorture
+	dmesg | grep torture:
+
+The output can be manually inspected for the error flag of "!!!".
+One could of course create a more elaborate script that automatically
+checked for such errors.  The "rmmod" command forces a "SUCCESS",
+"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed.  The first
+two are self-explanatory, while the last indicates that while there
+were no locking failures, CPU-hotplug problems were detected.
+
+Also see: Documentation/RCU/torture.txt
diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt
deleted file mode 100644
index 6a8df4cd19bf..000000000000
--- a/Documentation/locking/locktorture.txt
+++ /dev/null
@@ -1,145 +0,0 @@
-Kernel Lock Torture Test Operation
-
-CONFIG_LOCK_TORTURE_TEST
-
-The CONFIG LOCK_TORTURE_TEST config option provides a kernel module
-that runs torture tests on core kernel locking primitives. The kernel
-module, 'locktorture', may be built after the fact on the running
-kernel to be tested, if desired. The tests periodically output status
-messages via printk(), which can be examined via the dmesg (perhaps
-grepping for "torture").  The test is started when the module is loaded,
-and stops when the module is unloaded. This program is based on how RCU
-is tortured, via rcutorture.
-
-This torture test consists of creating a number of kernel threads which
-acquire the lock and hold it for specific amount of time, thus simulating
-different critical region behaviors. The amount of contention on the lock
-can be simulated by either enlarging this critical region hold time and/or
-creating more kthreads.
-
-
-MODULE PARAMETERS
-
-This module has the following parameters:
-
-
-	    ** Locktorture-specific **
-
-nwriters_stress   Number of kernel threads that will stress exclusive lock
-		  ownership (writers). The default value is twice the number
-		  of online CPUs.
-
-nreaders_stress   Number of kernel threads that will stress shared lock
-		  ownership (readers). The default is the same amount of writer
-		  locks. If the user did not specify nwriters_stress, then
-		  both readers and writers be the amount of online CPUs.
-
-torture_type	  Type of lock to torture. By default, only spinlocks will
-		  be tortured. This module can torture the following locks,
-		  with string values as follows:
-
-		     o "lock_busted": Simulates a buggy lock implementation.
-
-		     o "spin_lock": spin_lock() and spin_unlock() pairs.
-
-		     o "spin_lock_irq": spin_lock_irq() and spin_unlock_irq()
-					pairs.
-
-		     o "rw_lock": read/write lock() and unlock() rwlock pairs.
-
-		     o "rw_lock_irq": read/write lock_irq() and unlock_irq()
-				      rwlock pairs.
-
-		     o "mutex_lock": mutex_lock() and mutex_unlock() pairs.
-
-		     o "rtmutex_lock": rtmutex_lock() and rtmutex_unlock()
-				       pairs. Kernel must have CONFIG_RT_MUTEX=y.
-
-		     o "rwsem_lock": read/write down() and up() semaphore pairs.
-
-
-	    ** Torture-framework (RCU + locking) **
-
-shutdown_secs	  The number of seconds to run the test before terminating
-		  the test and powering off the system.  The default is
-		  zero, which disables test termination and system shutdown.
-		  This capability is useful for automated testing.
-
-onoff_interval	  The number of seconds between each attempt to execute a
-		  randomly selected CPU-hotplug operation.  Defaults
-		  to zero, which disables CPU hotplugging.  In
-		  CONFIG_HOTPLUG_CPU=n kernels, locktorture will silently
-		  refuse to do any CPU-hotplug operations regardless of
-		  what value is specified for onoff_interval.
-
-onoff_holdoff	  The number of seconds to wait until starting CPU-hotplug
-		  operations.  This would normally only be used when
-		  locktorture was built into the kernel and started
-		  automatically at boot time, in which case it is useful
-		  in order to avoid confusing boot-time code with CPUs
-		  coming and going. This parameter is only useful if
-		  CONFIG_HOTPLUG_CPU is enabled.
-
-stat_interval	  Number of seconds between statistics-related printk()s.
-		  By default, locktorture will report stats every 60 seconds.
-		  Setting the interval to zero causes the statistics to
-		  be printed -only- when the module is unloaded, and this
-		  is the default.
-
-stutter		  The length of time to run the test before pausing for this
-		  same period of time.  Defaults to "stutter=5", so as
-		  to run and pause for (roughly) five-second intervals.
-		  Specifying "stutter=0" causes the test to run continuously
-		  without pausing, which is the old default behavior.
-
-shuffle_interval  The number of seconds to keep the test threads affinitied
-		  to a particular subset of the CPUs, defaults to 3 seconds.
-		  Used in conjunction with test_no_idle_hz.
-
-verbose		  Enable verbose debugging printing, via printk(). Enabled
-		  by default. This extra information is mostly related to
-		  high-level errors and reports from the main 'torture'
-		  framework.
-
-
-STATISTICS
-
-Statistics are printed in the following format:
-
-spin_lock-torture: Writes:  Total: 93746064  Max/Min: 0/0   Fail: 0
-   (A)		    (B)		   (C)		  (D)	       (E)
-
-(A): Lock type that is being tortured -- torture_type parameter.
-
-(B): Number of writer lock acquisitions. If dealing with a read/write primitive
-     a second "Reads" statistics line is printed.
-
-(C): Number of times the lock was acquired.
-
-(D): Min and max number of times threads failed to acquire the lock.
-
-(E): true/false values if there were errors acquiring the lock. This should
-     -only- be positive if there is a bug in the locking primitive's
-     implementation. Otherwise a lock should never fail (i.e., spin_lock()).
-     Of course, the same applies for (C), above. A dummy example of this is
-     the "lock_busted" type.
-
-USAGE
-
-The following script may be used to torture locks:
-
-	#!/bin/sh
-
-	modprobe locktorture
-	sleep 3600
-	rmmod locktorture
-	dmesg | grep torture:
-
-The output can be manually inspected for the error flag of "!!!".
-One could of course create a more elaborate script that automatically
-checked for such errors.  The "rmmod" command forces a "SUCCESS",
-"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed.  The first
-two are self-explanatory, while the last indicates that while there
-were no locking failures, CPU-hotplug problems were detected.
-
-Also see: Documentation/RCU/torture.txt
diff --git a/Documentation/locking/mutex-design.rst b/Documentation/locking/mutex-design.rst
new file mode 100644
index 000000000000..4d8236b81fa5
--- /dev/null
+++ b/Documentation/locking/mutex-design.rst
@@ -0,0 +1,152 @@
+=======================
+Generic Mutex Subsystem
+=======================
+
+started by Ingo Molnar <mingo@redhat.com>
+
+updated by Davidlohr Bueso <davidlohr@hp.com>
+
+What are mutexes?
+-----------------
+
+In the Linux kernel, mutexes refer to a particular locking primitive
+that enforces serialization on shared memory systems, and not only to
+the generic term referring to 'mutual exclusion' found in academia
+or similar theoretical text books. Mutexes are sleeping locks which
+behave similarly to binary semaphores, and were introduced in 2006[1]
+as an alternative to these. This new data structure provided a number
+of advantages, including simpler interfaces, and at that time smaller
+code (see Disadvantages).
+
+[1] http://lwn.net/Articles/164802/
+
+Implementation
+--------------
+
+Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h
+and implemented in kernel/locking/mutex.c. These locks use an atomic variable
+(->owner) to keep track of the lock state during its lifetime.  Field owner
+actually contains `struct task_struct *` to the current lock owner and it is
+therefore NULL if not currently owned. Since task_struct pointers are aligned
+at at least L1_CACHE_BYTES, low bits (3) are used to store extra state (e.g.,
+if waiter list is non-empty).  In its most basic form it also includes a
+wait-queue and a spinlock that serializes access to it. Furthermore,
+CONFIG_MUTEX_SPIN_ON_OWNER=y systems use a spinner MCS lock (->osq), described
+below in (ii).
+
+When acquiring a mutex, there are three possible paths that can be
+taken, depending on the state of the lock:
+
+(i) fastpath: tries to atomically acquire the lock by cmpxchg()ing the owner with
+    the current task. This only works in the uncontended case (cmpxchg() checks
+    against 0UL, so all 3 state bits above have to be 0). If the lock is
+    contended it goes to the next possible path.
+
+(ii) midpath: aka optimistic spinning, tries to spin for acquisition
+     while the lock owner is running and there are no other tasks ready
+     to run that have higher priority (need_resched). The rationale is
+     that if the lock owner is running, it is likely to release the lock
+     soon. The mutex spinners are queued up using MCS lock so that only
+     one spinner can compete for the mutex.
+
+     The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock
+     with the desirable properties of being fair and with each cpu trying
+     to acquire the lock spinning on a local variable. It avoids expensive
+     cacheline bouncing that common test-and-set spinlock implementations
+     incur. An MCS-like lock is specially tailored for optimistic spinning
+     for sleeping lock implementation. An important feature of the customized
+     MCS lock is that it has the extra property that spinners are able to exit
+     the MCS spinlock queue when they need to reschedule. This further helps
+     avoid situations where MCS spinners that need to reschedule would continue
+     waiting to spin on mutex owner, only to go directly to slowpath upon
+     obtaining the MCS lock.
+
+
+(iii) slowpath: last resort, if the lock is still unable to be acquired,
+      the task is added to the wait-queue and sleeps until woken up by the
+      unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE.
+
+While formally kernel mutexes are sleepable locks, it is path (ii) that
+makes them more practically a hybrid type. By simply not interrupting a
+task and busy-waiting for a few cycles instead of immediately sleeping,
+the performance of this lock has been seen to significantly improve a
+number of workloads. Note that this technique is also used for rw-semaphores.
+
+Semantics
+---------
+
+The mutex subsystem checks and enforces the following rules:
+
+    - Only one task can hold the mutex at a time.
+    - Only the owner can unlock the mutex.
+    - Multiple unlocks are not permitted.
+    - Recursive locking/unlocking is not permitted.
+    - A mutex must only be initialized via the API (see below).
+    - A task may not exit with a mutex held.
+    - Memory areas where held locks reside must not be freed.
+    - Held mutexes must not be reinitialized.
+    - Mutexes may not be used in hardware or software interrupt
+      contexts such as tasklets and timers.
+
+These semantics are fully enforced when CONFIG DEBUG_MUTEXES is enabled.
+In addition, the mutex debugging code also implements a number of other
+features that make lock debugging easier and faster:
+
+    - Uses symbolic names of mutexes, whenever they are printed
+      in debug output.
+    - Point-of-acquire tracking, symbolic lookup of function names,
+      list of all locks held in the system, printout of them.
+    - Owner tracking.
+    - Detects self-recursing locks and prints out all relevant info.
+    - Detects multi-task circular deadlocks and prints out all affected
+      locks and tasks (and only those tasks).
+
+
+Interfaces
+----------
+Statically define the mutex::
+
+   DEFINE_MUTEX(name);
+
+Dynamically initialize the mutex::
+
+   mutex_init(mutex);
+
+Acquire the mutex, uninterruptible::
+
+   void mutex_lock(struct mutex *lock);
+   void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
+   int  mutex_trylock(struct mutex *lock);
+
+Acquire the mutex, interruptible::
+
+   int mutex_lock_interruptible_nested(struct mutex *lock,
+				       unsigned int subclass);
+   int mutex_lock_interruptible(struct mutex *lock);
+
+Acquire the mutex, interruptible, if dec to 0::
+
+   int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
+
+Unlock the mutex::
+
+   void mutex_unlock(struct mutex *lock);
+
+Test if the mutex is taken::
+
+   int mutex_is_locked(struct mutex *lock);
+
+Disadvantages
+-------------
+
+Unlike its original design and purpose, 'struct mutex' is among the largest
+locks in the kernel. E.g: on x86-64 it is 32 bytes, where 'struct semaphore'
+is 24 bytes and rw_semaphore is 40 bytes. Larger structure sizes mean more CPU
+cache and memory footprint.
+
+When to use mutexes
+-------------------
+
+Unless the strict semantics of mutexes are unsuitable and/or the critical
+region prevents the lock from being shared, always prefer them to any other
+locking primitive.
diff --git a/Documentation/locking/mutex-design.txt b/Documentation/locking/mutex-design.txt
deleted file mode 100644
index 818aca19612f..000000000000
--- a/Documentation/locking/mutex-design.txt
+++ /dev/null
@@ -1,142 +0,0 @@
-Generic Mutex Subsystem
-
-started by Ingo Molnar <mingo@redhat.com>
-updated by Davidlohr Bueso <davidlohr@hp.com>
-
-What are mutexes?
------------------
-
-In the Linux kernel, mutexes refer to a particular locking primitive
-that enforces serialization on shared memory systems, and not only to
-the generic term referring to 'mutual exclusion' found in academia
-or similar theoretical text books. Mutexes are sleeping locks which
-behave similarly to binary semaphores, and were introduced in 2006[1]
-as an alternative to these. This new data structure provided a number
-of advantages, including simpler interfaces, and at that time smaller
-code (see Disadvantages).
-
-[1] http://lwn.net/Articles/164802/
-
-Implementation
---------------
-
-Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h
-and implemented in kernel/locking/mutex.c. These locks use an atomic variable
-(->owner) to keep track of the lock state during its lifetime.  Field owner
-actually contains 'struct task_struct *' to the current lock owner and it is
-therefore NULL if not currently owned. Since task_struct pointers are aligned
-at at least L1_CACHE_BYTES, low bits (3) are used to store extra state (e.g.,
-if waiter list is non-empty).  In its most basic form it also includes a
-wait-queue and a spinlock that serializes access to it. Furthermore,
-CONFIG_MUTEX_SPIN_ON_OWNER=y systems use a spinner MCS lock (->osq), described
-below in (ii).
-
-When acquiring a mutex, there are three possible paths that can be
-taken, depending on the state of the lock:
-
-(i) fastpath: tries to atomically acquire the lock by cmpxchg()ing the owner with
-    the current task. This only works in the uncontended case (cmpxchg() checks
-    against 0UL, so all 3 state bits above have to be 0). If the lock is
-    contended it goes to the next possible path.
-
-(ii) midpath: aka optimistic spinning, tries to spin for acquisition
-     while the lock owner is running and there are no other tasks ready
-     to run that have higher priority (need_resched). The rationale is
-     that if the lock owner is running, it is likely to release the lock
-     soon. The mutex spinners are queued up using MCS lock so that only
-     one spinner can compete for the mutex.
-
-     The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock
-     with the desirable properties of being fair and with each cpu trying
-     to acquire the lock spinning on a local variable. It avoids expensive
-     cacheline bouncing that common test-and-set spinlock implementations
-     incur. An MCS-like lock is specially tailored for optimistic spinning
-     for sleeping lock implementation. An important feature of the customized
-     MCS lock is that it has the extra property that spinners are able to exit
-     the MCS spinlock queue when they need to reschedule. This further helps
-     avoid situations where MCS spinners that need to reschedule would continue
-     waiting to spin on mutex owner, only to go directly to slowpath upon
-     obtaining the MCS lock.
-
-
-(iii) slowpath: last resort, if the lock is still unable to be acquired,
-      the task is added to the wait-queue and sleeps until woken up by the
-      unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE.
-
-While formally kernel mutexes are sleepable locks, it is path (ii) that
-makes them more practically a hybrid type. By simply not interrupting a
-task and busy-waiting for a few cycles instead of immediately sleeping,
-the performance of this lock has been seen to significantly improve a
-number of workloads. Note that this technique is also used for rw-semaphores.
-
-Semantics
----------
-
-The mutex subsystem checks and enforces the following rules:
-
-    - Only one task can hold the mutex at a time.
-    - Only the owner can unlock the mutex.
-    - Multiple unlocks are not permitted.
-    - Recursive locking/unlocking is not permitted.
-    - A mutex must only be initialized via the API (see below).
-    - A task may not exit with a mutex held.
-    - Memory areas where held locks reside must not be freed.
-    - Held mutexes must not be reinitialized.
-    - Mutexes may not be used in hardware or software interrupt
-      contexts such as tasklets and timers.
-
-These semantics are fully enforced when CONFIG DEBUG_MUTEXES is enabled.
-In addition, the mutex debugging code also implements a number of other
-features that make lock debugging easier and faster:
-
-    - Uses symbolic names of mutexes, whenever they are printed
-      in debug output.
-    - Point-of-acquire tracking, symbolic lookup of function names,
-      list of all locks held in the system, printout of them.
-    - Owner tracking.
-    - Detects self-recursing locks and prints out all relevant info.
-    - Detects multi-task circular deadlocks and prints out all affected
-      locks and tasks (and only those tasks).
-
-
-Interfaces
-----------
-Statically define the mutex:
-   DEFINE_MUTEX(name);
-
-Dynamically initialize the mutex:
-   mutex_init(mutex);
-
-Acquire the mutex, uninterruptible:
-   void mutex_lock(struct mutex *lock);
-   void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
-   int  mutex_trylock(struct mutex *lock);
-
-Acquire the mutex, interruptible:
-   int mutex_lock_interruptible_nested(struct mutex *lock,
-				       unsigned int subclass);
-   int mutex_lock_interruptible(struct mutex *lock);
-
-Acquire the mutex, interruptible, if dec to 0:
-   int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
-
-Unlock the mutex:
-   void mutex_unlock(struct mutex *lock);
-
-Test if the mutex is taken:
-   int mutex_is_locked(struct mutex *lock);
-
-Disadvantages
--------------
-
-Unlike its original design and purpose, 'struct mutex' is among the largest
-locks in the kernel. E.g: on x86-64 it is 32 bytes, where 'struct semaphore'
-is 24 bytes and rw_semaphore is 40 bytes. Larger structure sizes mean more CPU
-cache and memory footprint.
-
-When to use mutexes
--------------------
-
-Unless the strict semantics of mutexes are unsuitable and/or the critical
-region prevents the lock from being shared, always prefer them to any other
-locking primitive.
diff --git a/Documentation/locking/rt-mutex-design.rst b/Documentation/locking/rt-mutex-design.rst
new file mode 100644
index 000000000000..59c2a64efb21
--- /dev/null
+++ b/Documentation/locking/rt-mutex-design.rst
@@ -0,0 +1,574 @@
+==============================
+RT-mutex implementation design
+==============================
+
+Copyright (c) 2006 Steven Rostedt
+
+Licensed under the GNU Free Documentation License, Version 1.2
+
+
+This document tries to describe the design of the rtmutex.c implementation.
+It doesn't describe the reasons why rtmutex.c exists. For that please see
+Documentation/locking/rt-mutex.rst.  Although this document does explain problems
+that happen without this code, but that is in the concept to understand
+what the code actually is doing.
+
+The goal of this document is to help others understand the priority
+inheritance (PI) algorithm that is used, as well as reasons for the
+decisions that were made to implement PI in the manner that was done.
+
+
+Unbounded Priority Inversion
+----------------------------
+
+Priority inversion is when a lower priority process executes while a higher
+priority process wants to run.  This happens for several reasons, and
+most of the time it can't be helped.  Anytime a high priority process wants
+to use a resource that a lower priority process has (a mutex for example),
+the high priority process must wait until the lower priority process is done
+with the resource.  This is a priority inversion.  What we want to prevent
+is something called unbounded priority inversion.  That is when the high
+priority process is prevented from running by a lower priority process for
+an undetermined amount of time.
+
+The classic example of unbounded priority inversion is where you have three
+processes, let's call them processes A, B, and C, where A is the highest
+priority process, C is the lowest, and B is in between. A tries to grab a lock
+that C owns and must wait and lets C run to release the lock. But in the
+meantime, B executes, and since B is of a higher priority than C, it preempts C,
+but by doing so, it is in fact preempting A which is a higher priority process.
+Now there's no way of knowing how long A will be sleeping waiting for C
+to release the lock, because for all we know, B is a CPU hog and will
+never give C a chance to release the lock.  This is called unbounded priority
+inversion.
+
+Here's a little ASCII art to show the problem::
+
+     grab lock L1 (owned by C)
+       |
+  A ---+
+          C preempted by B
+            |
+  C    +----+
+
+  B         +-------->
+                  B now keeps A from running.
+
+
+Priority Inheritance (PI)
+-------------------------
+
+There are several ways to solve this issue, but other ways are out of scope
+for this document.  Here we only discuss PI.
+
+PI is where a process inherits the priority of another process if the other
+process blocks on a lock owned by the current process.  To make this easier
+to understand, let's use the previous example, with processes A, B, and C again.
+
+This time, when A blocks on the lock owned by C, C would inherit the priority
+of A.  So now if B becomes runnable, it would not preempt C, since C now has
+the high priority of A.  As soon as C releases the lock, it loses its
+inherited priority, and A then can continue with the resource that C had.
+
+Terminology
+-----------
+
+Here I explain some terminology that is used in this document to help describe
+the design that is used to implement PI.
+
+PI chain
+         - The PI chain is an ordered series of locks and processes that cause
+           processes to inherit priorities from a previous process that is
+           blocked on one of its locks.  This is described in more detail
+           later in this document.
+
+mutex
+         - In this document, to differentiate from locks that implement
+           PI and spin locks that are used in the PI code, from now on
+           the PI locks will be called a mutex.
+
+lock
+         - In this document from now on, I will use the term lock when
+           referring to spin locks that are used to protect parts of the PI
+           algorithm.  These locks disable preemption for UP (when
+           CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from
+           entering critical sections simultaneously.
+
+spin lock
+         - Same as lock above.
+
+waiter
+         - A waiter is a struct that is stored on the stack of a blocked
+           process.  Since the scope of the waiter is within the code for
+           a process being blocked on the mutex, it is fine to allocate
+           the waiter on the process's stack (local variable).  This
+           structure holds a pointer to the task, as well as the mutex that
+           the task is blocked on.  It also has rbtree node structures to
+           place the task in the waiters rbtree of a mutex as well as the
+           pi_waiters rbtree of a mutex owner task (described below).
+
+           waiter is sometimes used in reference to the task that is waiting
+           on a mutex. This is the same as waiter->task.
+
+waiters
+         - A list of processes that are blocked on a mutex.
+
+top waiter
+         - The highest priority process waiting on a specific mutex.
+
+top pi waiter
+              - The highest priority process waiting on one of the mutexes
+                that a specific process owns.
+
+Note:
+       task and process are used interchangeably in this document, mostly to
+       differentiate between two processes that are being described together.
+
+
+PI chain
+--------
+
+The PI chain is a list of processes and mutexes that may cause priority
+inheritance to take place.  Multiple chains may converge, but a chain
+would never diverge, since a process can't be blocked on more than one
+mutex at a time.
+
+Example::
+
+   Process:  A, B, C, D, E
+   Mutexes:  L1, L2, L3, L4
+
+   A owns: L1
+           B blocked on L1
+           B owns L2
+                  C blocked on L2
+                  C owns L3
+                         D blocked on L3
+                         D owns L4
+                                E blocked on L4
+
+The chain would be::
+
+   E->L4->D->L3->C->L2->B->L1->A
+
+To show where two chains merge, we could add another process F and
+another mutex L5 where B owns L5 and F is blocked on mutex L5.
+
+The chain for F would be::
+
+   F->L5->B->L1->A
+
+Since a process may own more than one mutex, but never be blocked on more than
+one, the chains merge.
+
+Here we show both chains::
+
+   E->L4->D->L3->C->L2-+
+                       |
+                       +->B->L1->A
+                       |
+                 F->L5-+
+
+For PI to work, the processes at the right end of these chains (or we may
+also call it the Top of the chain) must be equal to or higher in priority
+than the processes to the left or below in the chain.
+
+Also since a mutex may have more than one process blocked on it, we can
+have multiple chains merge at mutexes.  If we add another process G that is
+blocked on mutex L2::
+
+  G->L2->B->L1->A
+
+And once again, to show how this can grow I will show the merging chains
+again::
+
+   E->L4->D->L3->C-+
+                   +->L2-+
+                   |     |
+                 G-+     +->B->L1->A
+                         |
+                   F->L5-+
+
+If process G has the highest priority in the chain, then all the tasks up
+the chain (A and B in this example), must have their priorities increased
+to that of G.
+
+Mutex Waiters Tree
+------------------
+
+Every mutex keeps track of all the waiters that are blocked on itself. The
+mutex has a rbtree to store these waiters by priority.  This tree is protected
+by a spin lock that is located in the struct of the mutex. This lock is called
+wait_lock.
+
+
+Task PI Tree
+------------
+
+To keep track of the PI chains, each process has its own PI rbtree.  This is
+a tree of all top waiters of the mutexes that are owned by the process.
+Note that this tree only holds the top waiters and not all waiters that are
+blocked on mutexes owned by the process.
+
+The top of the task's PI tree is always the highest priority task that
+is waiting on a mutex that is owned by the task.  So if the task has
+inherited a priority, it will always be the priority of the task that is
+at the top of this tree.
+
+This tree is stored in the task structure of a process as a rbtree called
+pi_waiters.  It is protected by a spin lock also in the task structure,
+called pi_lock.  This lock may also be taken in interrupt context, so when
+locking the pi_lock, interrupts must be disabled.
+
+
+Depth of the PI Chain
+---------------------
+
+The maximum depth of the PI chain is not dynamic, and could actually be
+defined.  But is very complex to figure it out, since it depends on all
+the nesting of mutexes.  Let's look at the example where we have 3 mutexes,
+L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
+The following shows a locking order of L1->L2->L3, but may not actually
+be directly nested that way::
+
+  void func1(void)
+  {
+	mutex_lock(L1);
+
+	/* do anything */
+
+	mutex_unlock(L1);
+  }
+
+  void func2(void)
+  {
+	mutex_lock(L1);
+	mutex_lock(L2);
+
+	/* do something */
+
+	mutex_unlock(L2);
+	mutex_unlock(L1);
+  }
+
+  void func3(void)
+  {
+	mutex_lock(L2);
+	mutex_lock(L3);
+
+	/* do something else */
+
+	mutex_unlock(L3);
+	mutex_unlock(L2);
+  }
+
+  void func4(void)
+  {
+	mutex_lock(L3);
+
+	/* do something again */
+
+	mutex_unlock(L3);
+  }
+
+Now we add 4 processes that run each of these functions separately.
+Processes A, B, C, and D which run functions func1, func2, func3 and func4
+respectively, and such that D runs first and A last.  With D being preempted
+in func4 in the "do something again" area, we have a locking that follows::
+
+  D owns L3
+         C blocked on L3
+         C owns L2
+                B blocked on L2
+                B owns L1
+                       A blocked on L1
+
+  And thus we have the chain A->L1->B->L2->C->L3->D.
+
+This gives us a PI depth of 4 (four processes), but looking at any of the
+functions individually, it seems as though they only have at most a locking
+depth of two.  So, although the locking depth is defined at compile time,
+it still is very difficult to find the possibilities of that depth.
+
+Now since mutexes can be defined by user-land applications, we don't want a DOS
+type of application that nests large amounts of mutexes to create a large
+PI chain, and have the code holding spin locks while looking at a large
+amount of data.  So to prevent this, the implementation not only implements
+a maximum lock depth, but also only holds at most two different locks at a
+time, as it walks the PI chain.  More about this below.
+
+
+Mutex owner and flags
+---------------------
+
+The mutex structure contains a pointer to the owner of the mutex.  If the
+mutex is not owned, this owner is set to NULL.  Since all architectures
+have the task structure on at least a two byte alignment (and if this is
+not true, the rtmutex.c code will be broken!), this allows for the least
+significant bit to be used as a flag.  Bit 0 is used as the "Has Waiters"
+flag. It's set whenever there are waiters on a mutex.
+
+See Documentation/locking/rt-mutex.rst for further details.
+
+cmpxchg Tricks
+--------------
+
+Some architectures implement an atomic cmpxchg (Compare and Exchange).  This
+is used (when applicable) to keep the fast path of grabbing and releasing
+mutexes short.
+
+cmpxchg is basically the following function performed atomically::
+
+  unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
+  {
+	unsigned long T = *A;
+	if (*A == *B) {
+		*A = *C;
+	}
+	return T;
+  }
+  #define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
+
+This is really nice to have, since it allows you to only update a variable
+if the variable is what you expect it to be.  You know if it succeeded if
+the return value (the old value of A) is equal to B.
+
+The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
+the architecture does not support CMPXCHG, then this macro is simply set
+to fail every time.  But if CMPXCHG is supported, then this will
+help out extremely to keep the fast path short.
+
+The use of rt_mutex_cmpxchg with the flags in the owner field help optimize
+the system for architectures that support it.  This will also be explained
+later in this document.
+
+
+Priority adjustments
+--------------------
+
+The implementation of the PI code in rtmutex.c has several places that a
+process must adjust its priority.  With the help of the pi_waiters of a
+process this is rather easy to know what needs to be adjusted.
+
+The functions implementing the task adjustments are rt_mutex_adjust_prio
+and rt_mutex_setprio. rt_mutex_setprio is only used in rt_mutex_adjust_prio.
+
+rt_mutex_adjust_prio examines the priority of the task, and the highest
+priority process that is waiting any of mutexes owned by the task. Since
+the pi_waiters of a task holds an order by priority of all the top waiters
+of all the mutexes that the task owns, we simply need to compare the top
+pi waiter to its own normal/deadline priority and take the higher one.
+Then rt_mutex_setprio is called to adjust the priority of the task to the
+new priority. Note that rt_mutex_setprio is defined in kernel/sched/core.c
+to implement the actual change in priority.
+
+Note:
+	For the "prio" field in task_struct, the lower the number, the
+	higher the priority. A "prio" of 5 is of higher priority than a
+	"prio" of 10.
+
+It is interesting to note that rt_mutex_adjust_prio can either increase
+or decrease the priority of the task.  In the case that a higher priority
+process has just blocked on a mutex owned by the task, rt_mutex_adjust_prio
+would increase/boost the task's priority.  But if a higher priority task
+were for some reason to leave the mutex (timeout or signal), this same function
+would decrease/unboost the priority of the task.  That is because the pi_waiters
+always contains the highest priority task that is waiting on a mutex owned
+by the task, so we only need to compare the priority of that top pi waiter
+to the normal priority of the given task.
+
+
+High level overview of the PI chain walk
+----------------------------------------
+
+The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
+
+The implementation has gone through several iterations, and has ended up
+with what we believe is the best.  It walks the PI chain by only grabbing
+at most two locks at a time, and is very efficient.
+
+The rt_mutex_adjust_prio_chain can be used either to boost or lower process
+priorities.
+
+rt_mutex_adjust_prio_chain is called with a task to be checked for PI
+(de)boosting (the owner of a mutex that a process is blocking on), a flag to
+check for deadlocking, the mutex that the task owns, a pointer to a waiter
+that is the process's waiter struct that is blocked on the mutex (although this
+parameter may be NULL for deboosting), a pointer to the mutex on which the task
+is blocked, and a top_task as the top waiter of the mutex.
+
+For this explanation, I will not mention deadlock detection. This explanation
+will try to stay at a high level.
+
+When this function is called, there are no locks held.  That also means
+that the state of the owner and lock can change when entered into this function.
+
+Before this function is called, the task has already had rt_mutex_adjust_prio
+performed on it.  This means that the task is set to the priority that it
+should be at, but the rbtree nodes of the task's waiter have not been updated
+with the new priorities, and this task may not be in the proper locations
+in the pi_waiters and waiters trees that the task is blocked on. This function
+solves all that.
+
+The main operation of this function is summarized by Thomas Gleixner in
+rtmutex.c. See the 'Chain walk basics and protection scope' comment for further
+details.
+
+Taking of a mutex (The walk through)
+------------------------------------
+
+OK, now let's take a look at the detailed walk through of what happens when
+taking a mutex.
+
+The first thing that is tried is the fast taking of the mutex.  This is
+done when we have CMPXCHG enabled (otherwise the fast taking automatically
+fails).  Only when the owner field of the mutex is NULL can the lock be
+taken with the CMPXCHG and nothing else needs to be done.
+
+If there is contention on the lock, we go about the slow path
+(rt_mutex_slowlock).
+
+The slow path function is where the task's waiter structure is created on
+the stack.  This is because the waiter structure is only needed for the
+scope of this function.  The waiter structure holds the nodes to store
+the task on the waiters tree of the mutex, and if need be, the pi_waiters
+tree of the owner.
+
+The wait_lock of the mutex is taken since the slow path of unlocking the
+mutex also takes this lock.
+
+We then call try_to_take_rt_mutex.  This is where the architecture that
+does not implement CMPXCHG would always grab the lock (if there's no
+contention).
+
+try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
+slow path.  The first thing that is done here is an atomic setting of
+the "Has Waiters" flag of the mutex's owner field. By setting this flag
+now, the current owner of the mutex being contended for can't release the mutex
+without going into the slow unlock path, and it would then need to grab the
+wait_lock, which this code currently holds. So setting the "Has Waiters" flag
+forces the current owner to synchronize with this code.
+
+The lock is taken if the following are true:
+
+   1) The lock has no owner
+   2) The current task is the highest priority against all other
+      waiters of the lock
+
+If the task succeeds to acquire the lock, then the task is set as the
+owner of the lock, and if the lock still has waiters, the top_waiter
+(highest priority task waiting on the lock) is added to this task's
+pi_waiters tree.
+
+If the lock is not taken by try_to_take_rt_mutex(), then the
+task_blocks_on_rt_mutex() function is called. This will add the task to
+the lock's waiter tree and propagate the pi chain of the lock as well
+as the lock's owner's pi_waiters tree. This is described in the next
+section.
+
+Task blocks on mutex
+--------------------
+
+The accounting of a mutex and process is done with the waiter structure of
+the process.  The "task" field is set to the process, and the "lock" field
+to the mutex.  The rbtree node of waiter are initialized to the processes
+current priority.
+
+Since the wait_lock was taken at the entry of the slow lock, we can safely
+add the waiter to the task waiter tree.  If the current process is the
+highest priority process currently waiting on this mutex, then we remove the
+previous top waiter process (if it exists) from the pi_waiters of the owner,
+and add the current process to that tree.  Since the pi_waiter of the owner
+has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
+should adjust its priority accordingly.
+
+If the owner is also blocked on a lock, and had its pi_waiters changed
+(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
+and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
+
+Now all locks are released, and if the current process is still blocked on a
+mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
+
+Waking up in the loop
+---------------------
+
+The task can then wake up for a couple of reasons:
+  1) The previous lock owner released the lock, and the task now is top_waiter
+  2) we received a signal or timeout
+
+In both cases, the task will try again to acquire the lock. If it
+does, then it will take itself off the waiters tree and set itself back
+to the TASK_RUNNING state.
+
+In first case, if the lock was acquired by another task before this task
+could get the lock, then it will go back to sleep and wait to be woken again.
+
+The second case is only applicable for tasks that are grabbing a mutex
+that can wake up before getting the lock, either due to a signal or
+a timeout (i.e. rt_mutex_timed_futex_lock()). When woken, it will try to
+take the lock again, if it succeeds, then the task will return with the
+lock held, otherwise it will return with -EINTR if the task was woken
+by a signal, or -ETIMEDOUT if it timed out.
+
+
+Unlocking the Mutex
+-------------------
+
+The unlocking of a mutex also has a fast path for those architectures with
+CMPXCHG.  Since the taking of a mutex on contention always sets the
+"Has Waiters" flag of the mutex's owner, we use this to know if we need to
+take the slow path when unlocking the mutex.  If the mutex doesn't have any
+waiters, the owner field of the mutex would equal the current process and
+the mutex can be unlocked by just replacing the owner field with NULL.
+
+If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
+the slow unlock path is taken.
+
+The first thing done in the slow unlock path is to take the wait_lock of the
+mutex.  This synchronizes the locking and unlocking of the mutex.
+
+A check is made to see if the mutex has waiters or not.  On architectures that
+do not have CMPXCHG, this is the location that the owner of the mutex will
+determine if a waiter needs to be awoken or not.  On architectures that
+do have CMPXCHG, that check is done in the fast path, but it is still needed
+in the slow path too.  If a waiter of a mutex woke up because of a signal
+or timeout between the time the owner failed the fast path CMPXCHG check and
+the grabbing of the wait_lock, the mutex may not have any waiters, thus the
+owner still needs to make this check. If there are no waiters then the mutex
+owner field is set to NULL, the wait_lock is released and nothing more is
+needed.
+
+If there are waiters, then we need to wake one up.
+
+On the wake up code, the pi_lock of the current owner is taken.  The top
+waiter of the lock is found and removed from the waiters tree of the mutex
+as well as the pi_waiters tree of the current owner. The "Has Waiters" bit is
+marked to prevent lower priority tasks from stealing the lock.
+
+Finally we unlock the pi_lock of the pending owner and wake it up.
+
+
+Contact
+-------
+
+For updates on this document, please email Steven Rostedt <rostedt@goodmis.org>
+
+
+Credits
+-------
+
+Author:  Steven Rostedt <rostedt@goodmis.org>
+
+Updated: Alex Shi <alex.shi@linaro.org>	- 7/6/2017
+
+Original Reviewers:
+		     Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and
+		     Randy Dunlap
+
+Update (7/6/2017) Reviewers: Steven Rostedt and Sebastian Siewior
+
+Updates
+-------
+
+This document was originally written for 2.6.17-rc3-mm1
+was updated on 4.12
diff --git a/Documentation/locking/rt-mutex-design.txt b/Documentation/locking/rt-mutex-design.txt
deleted file mode 100644
index 3d7b865539cc..000000000000
--- a/Documentation/locking/rt-mutex-design.txt
+++ /dev/null
@@ -1,559 +0,0 @@
-#
-# Copyright (c) 2006 Steven Rostedt
-# Licensed under the GNU Free Documentation License, Version 1.2
-#
-
-RT-mutex implementation design
-------------------------------
-
-This document tries to describe the design of the rtmutex.c implementation.
-It doesn't describe the reasons why rtmutex.c exists. For that please see
-Documentation/locking/rt-mutex.txt.  Although this document does explain problems
-that happen without this code, but that is in the concept to understand
-what the code actually is doing.
-
-The goal of this document is to help others understand the priority
-inheritance (PI) algorithm that is used, as well as reasons for the
-decisions that were made to implement PI in the manner that was done.
-
-
-Unbounded Priority Inversion
-----------------------------
-
-Priority inversion is when a lower priority process executes while a higher
-priority process wants to run.  This happens for several reasons, and
-most of the time it can't be helped.  Anytime a high priority process wants
-to use a resource that a lower priority process has (a mutex for example),
-the high priority process must wait until the lower priority process is done
-with the resource.  This is a priority inversion.  What we want to prevent
-is something called unbounded priority inversion.  That is when the high
-priority process is prevented from running by a lower priority process for
-an undetermined amount of time.
-
-The classic example of unbounded priority inversion is where you have three
-processes, let's call them processes A, B, and C, where A is the highest
-priority process, C is the lowest, and B is in between. A tries to grab a lock
-that C owns and must wait and lets C run to release the lock. But in the
-meantime, B executes, and since B is of a higher priority than C, it preempts C,
-but by doing so, it is in fact preempting A which is a higher priority process.
-Now there's no way of knowing how long A will be sleeping waiting for C
-to release the lock, because for all we know, B is a CPU hog and will
-never give C a chance to release the lock.  This is called unbounded priority
-inversion.
-
-Here's a little ASCII art to show the problem.
-
-   grab lock L1 (owned by C)
-     |
-A ---+
-        C preempted by B
-          |
-C    +----+
-
-B         +-------->
-                B now keeps A from running.
-
-
-Priority Inheritance (PI)
--------------------------
-
-There are several ways to solve this issue, but other ways are out of scope
-for this document.  Here we only discuss PI.
-
-PI is where a process inherits the priority of another process if the other
-process blocks on a lock owned by the current process.  To make this easier
-to understand, let's use the previous example, with processes A, B, and C again.
-
-This time, when A blocks on the lock owned by C, C would inherit the priority
-of A.  So now if B becomes runnable, it would not preempt C, since C now has
-the high priority of A.  As soon as C releases the lock, it loses its
-inherited priority, and A then can continue with the resource that C had.
-
-Terminology
------------
-
-Here I explain some terminology that is used in this document to help describe
-the design that is used to implement PI.
-
-PI chain - The PI chain is an ordered series of locks and processes that cause
-           processes to inherit priorities from a previous process that is
-           blocked on one of its locks.  This is described in more detail
-           later in this document.
-
-mutex    - In this document, to differentiate from locks that implement
-           PI and spin locks that are used in the PI code, from now on
-           the PI locks will be called a mutex.
-
-lock     - In this document from now on, I will use the term lock when
-           referring to spin locks that are used to protect parts of the PI
-           algorithm.  These locks disable preemption for UP (when
-           CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from
-           entering critical sections simultaneously.
-
-spin lock - Same as lock above.
-
-waiter   - A waiter is a struct that is stored on the stack of a blocked
-           process.  Since the scope of the waiter is within the code for
-           a process being blocked on the mutex, it is fine to allocate
-           the waiter on the process's stack (local variable).  This
-           structure holds a pointer to the task, as well as the mutex that
-           the task is blocked on.  It also has rbtree node structures to
-           place the task in the waiters rbtree of a mutex as well as the
-           pi_waiters rbtree of a mutex owner task (described below).
-
-           waiter is sometimes used in reference to the task that is waiting
-           on a mutex. This is the same as waiter->task.
-
-waiters  - A list of processes that are blocked on a mutex.
-
-top waiter - The highest priority process waiting on a specific mutex.
-
-top pi waiter - The highest priority process waiting on one of the mutexes
-                that a specific process owns.
-
-Note:  task and process are used interchangeably in this document, mostly to
-       differentiate between two processes that are being described together.
-
-
-PI chain
---------
-
-The PI chain is a list of processes and mutexes that may cause priority
-inheritance to take place.  Multiple chains may converge, but a chain
-would never diverge, since a process can't be blocked on more than one
-mutex at a time.
-
-Example:
-
-   Process:  A, B, C, D, E
-   Mutexes:  L1, L2, L3, L4
-
-   A owns: L1
-           B blocked on L1
-           B owns L2
-                  C blocked on L2
-                  C owns L3
-                         D blocked on L3
-                         D owns L4
-                                E blocked on L4
-
-The chain would be:
-
-   E->L4->D->L3->C->L2->B->L1->A
-
-To show where two chains merge, we could add another process F and
-another mutex L5 where B owns L5 and F is blocked on mutex L5.
-
-The chain for F would be:
-
-   F->L5->B->L1->A
-
-Since a process may own more than one mutex, but never be blocked on more than
-one, the chains merge.
-
-Here we show both chains:
-
-   E->L4->D->L3->C->L2-+
-                       |
-                       +->B->L1->A
-                       |
-                 F->L5-+
-
-For PI to work, the processes at the right end of these chains (or we may
-also call it the Top of the chain) must be equal to or higher in priority
-than the processes to the left or below in the chain.
-
-Also since a mutex may have more than one process blocked on it, we can
-have multiple chains merge at mutexes.  If we add another process G that is
-blocked on mutex L2:
-
-  G->L2->B->L1->A
-
-And once again, to show how this can grow I will show the merging chains
-again.
-
-   E->L4->D->L3->C-+
-                   +->L2-+
-                   |     |
-                 G-+     +->B->L1->A
-                         |
-                   F->L5-+
-
-If process G has the highest priority in the chain, then all the tasks up
-the chain (A and B in this example), must have their priorities increased
-to that of G.
-
-Mutex Waiters Tree
------------------
-
-Every mutex keeps track of all the waiters that are blocked on itself. The
-mutex has a rbtree to store these waiters by priority.  This tree is protected
-by a spin lock that is located in the struct of the mutex. This lock is called
-wait_lock.
-
-
-Task PI Tree
-------------
-
-To keep track of the PI chains, each process has its own PI rbtree.  This is
-a tree of all top waiters of the mutexes that are owned by the process.
-Note that this tree only holds the top waiters and not all waiters that are
-blocked on mutexes owned by the process.
-
-The top of the task's PI tree is always the highest priority task that
-is waiting on a mutex that is owned by the task.  So if the task has
-inherited a priority, it will always be the priority of the task that is
-at the top of this tree.
-
-This tree is stored in the task structure of a process as a rbtree called
-pi_waiters.  It is protected by a spin lock also in the task structure,
-called pi_lock.  This lock may also be taken in interrupt context, so when
-locking the pi_lock, interrupts must be disabled.
-
-
-Depth of the PI Chain
----------------------
-
-The maximum depth of the PI chain is not dynamic, and could actually be
-defined.  But is very complex to figure it out, since it depends on all
-the nesting of mutexes.  Let's look at the example where we have 3 mutexes,
-L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
-The following shows a locking order of L1->L2->L3, but may not actually
-be directly nested that way.
-
-void func1(void)
-{
-	mutex_lock(L1);
-
-	/* do anything */
-
-	mutex_unlock(L1);
-}
-
-void func2(void)
-{
-	mutex_lock(L1);
-	mutex_lock(L2);
-
-	/* do something */
-
-	mutex_unlock(L2);
-	mutex_unlock(L1);
-}
-
-void func3(void)
-{
-	mutex_lock(L2);
-	mutex_lock(L3);
-
-	/* do something else */
-
-	mutex_unlock(L3);
-	mutex_unlock(L2);
-}
-
-void func4(void)
-{
-	mutex_lock(L3);
-
-	/* do something again */
-
-	mutex_unlock(L3);
-}
-
-Now we add 4 processes that run each of these functions separately.
-Processes A, B, C, and D which run functions func1, func2, func3 and func4
-respectively, and such that D runs first and A last.  With D being preempted
-in func4 in the "do something again" area, we have a locking that follows:
-
-D owns L3
-       C blocked on L3
-       C owns L2
-              B blocked on L2
-              B owns L1
-                     A blocked on L1
-
-And thus we have the chain A->L1->B->L2->C->L3->D.
-
-This gives us a PI depth of 4 (four processes), but looking at any of the
-functions individually, it seems as though they only have at most a locking
-depth of two.  So, although the locking depth is defined at compile time,
-it still is very difficult to find the possibilities of that depth.
-
-Now since mutexes can be defined by user-land applications, we don't want a DOS
-type of application that nests large amounts of mutexes to create a large
-PI chain, and have the code holding spin locks while looking at a large
-amount of data.  So to prevent this, the implementation not only implements
-a maximum lock depth, but also only holds at most two different locks at a
-time, as it walks the PI chain.  More about this below.
-
-
-Mutex owner and flags
----------------------
-
-The mutex structure contains a pointer to the owner of the mutex.  If the
-mutex is not owned, this owner is set to NULL.  Since all architectures
-have the task structure on at least a two byte alignment (and if this is
-not true, the rtmutex.c code will be broken!), this allows for the least
-significant bit to be used as a flag.  Bit 0 is used as the "Has Waiters"
-flag. It's set whenever there are waiters on a mutex.
-
-See Documentation/locking/rt-mutex.txt for further details.
-
-cmpxchg Tricks
---------------
-
-Some architectures implement an atomic cmpxchg (Compare and Exchange).  This
-is used (when applicable) to keep the fast path of grabbing and releasing
-mutexes short.
-
-cmpxchg is basically the following function performed atomically:
-
-unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
-{
-	unsigned long T = *A;
-	if (*A == *B) {
-		*A = *C;
-	}
-	return T;
-}
-#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
-
-This is really nice to have, since it allows you to only update a variable
-if the variable is what you expect it to be.  You know if it succeeded if
-the return value (the old value of A) is equal to B.
-
-The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
-the architecture does not support CMPXCHG, then this macro is simply set
-to fail every time.  But if CMPXCHG is supported, then this will
-help out extremely to keep the fast path short.
-
-The use of rt_mutex_cmpxchg with the flags in the owner field help optimize
-the system for architectures that support it.  This will also be explained
-later in this document.
-
-
-Priority adjustments
---------------------
-
-The implementation of the PI code in rtmutex.c has several places that a
-process must adjust its priority.  With the help of the pi_waiters of a
-process this is rather easy to know what needs to be adjusted.
-
-The functions implementing the task adjustments are rt_mutex_adjust_prio
-and rt_mutex_setprio. rt_mutex_setprio is only used in rt_mutex_adjust_prio.
-
-rt_mutex_adjust_prio examines the priority of the task, and the highest
-priority process that is waiting any of mutexes owned by the task. Since
-the pi_waiters of a task holds an order by priority of all the top waiters
-of all the mutexes that the task owns, we simply need to compare the top
-pi waiter to its own normal/deadline priority and take the higher one.
-Then rt_mutex_setprio is called to adjust the priority of the task to the
-new priority. Note that rt_mutex_setprio is defined in kernel/sched/core.c
-to implement the actual change in priority.
-
-(Note:  For the "prio" field in task_struct, the lower the number, the
-	higher the priority. A "prio" of 5 is of higher priority than a
-	"prio" of 10.)
-
-It is interesting to note that rt_mutex_adjust_prio can either increase
-or decrease the priority of the task.  In the case that a higher priority
-process has just blocked on a mutex owned by the task, rt_mutex_adjust_prio
-would increase/boost the task's priority.  But if a higher priority task
-were for some reason to leave the mutex (timeout or signal), this same function
-would decrease/unboost the priority of the task.  That is because the pi_waiters
-always contains the highest priority task that is waiting on a mutex owned
-by the task, so we only need to compare the priority of that top pi waiter
-to the normal priority of the given task.
-
-
-High level overview of the PI chain walk
-----------------------------------------
-
-The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
-
-The implementation has gone through several iterations, and has ended up
-with what we believe is the best.  It walks the PI chain by only grabbing
-at most two locks at a time, and is very efficient.
-
-The rt_mutex_adjust_prio_chain can be used either to boost or lower process
-priorities.
-
-rt_mutex_adjust_prio_chain is called with a task to be checked for PI
-(de)boosting (the owner of a mutex that a process is blocking on), a flag to
-check for deadlocking, the mutex that the task owns, a pointer to a waiter
-that is the process's waiter struct that is blocked on the mutex (although this
-parameter may be NULL for deboosting), a pointer to the mutex on which the task
-is blocked, and a top_task as the top waiter of the mutex.
-
-For this explanation, I will not mention deadlock detection. This explanation
-will try to stay at a high level.
-
-When this function is called, there are no locks held.  That also means
-that the state of the owner and lock can change when entered into this function.
-
-Before this function is called, the task has already had rt_mutex_adjust_prio
-performed on it.  This means that the task is set to the priority that it
-should be at, but the rbtree nodes of the task's waiter have not been updated
-with the new priorities, and this task may not be in the proper locations
-in the pi_waiters and waiters trees that the task is blocked on. This function
-solves all that.
-
-The main operation of this function is summarized by Thomas Gleixner in
-rtmutex.c. See the 'Chain walk basics and protection scope' comment for further
-details.
-
-Taking of a mutex (The walk through)
-------------------------------------
-
-OK, now let's take a look at the detailed walk through of what happens when
-taking a mutex.
-
-The first thing that is tried is the fast taking of the mutex.  This is
-done when we have CMPXCHG enabled (otherwise the fast taking automatically
-fails).  Only when the owner field of the mutex is NULL can the lock be
-taken with the CMPXCHG and nothing else needs to be done.
-
-If there is contention on the lock, we go about the slow path
-(rt_mutex_slowlock).
-
-The slow path function is where the task's waiter structure is created on
-the stack.  This is because the waiter structure is only needed for the
-scope of this function.  The waiter structure holds the nodes to store
-the task on the waiters tree of the mutex, and if need be, the pi_waiters
-tree of the owner.
-
-The wait_lock of the mutex is taken since the slow path of unlocking the
-mutex also takes this lock.
-
-We then call try_to_take_rt_mutex.  This is where the architecture that
-does not implement CMPXCHG would always grab the lock (if there's no
-contention).
-
-try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
-slow path.  The first thing that is done here is an atomic setting of
-the "Has Waiters" flag of the mutex's owner field. By setting this flag
-now, the current owner of the mutex being contended for can't release the mutex
-without going into the slow unlock path, and it would then need to grab the
-wait_lock, which this code currently holds. So setting the "Has Waiters" flag
-forces the current owner to synchronize with this code.
-
-The lock is taken if the following are true:
-   1) The lock has no owner
-   2) The current task is the highest priority against all other
-      waiters of the lock
-
-If the task succeeds to acquire the lock, then the task is set as the
-owner of the lock, and if the lock still has waiters, the top_waiter
-(highest priority task waiting on the lock) is added to this task's
-pi_waiters tree.
-
-If the lock is not taken by try_to_take_rt_mutex(), then the
-task_blocks_on_rt_mutex() function is called. This will add the task to
-the lock's waiter tree and propagate the pi chain of the lock as well
-as the lock's owner's pi_waiters tree. This is described in the next
-section.
-
-Task blocks on mutex
---------------------
-
-The accounting of a mutex and process is done with the waiter structure of
-the process.  The "task" field is set to the process, and the "lock" field
-to the mutex.  The rbtree node of waiter are initialized to the processes
-current priority.
-
-Since the wait_lock was taken at the entry of the slow lock, we can safely
-add the waiter to the task waiter tree.  If the current process is the
-highest priority process currently waiting on this mutex, then we remove the
-previous top waiter process (if it exists) from the pi_waiters of the owner,
-and add the current process to that tree.  Since the pi_waiter of the owner
-has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
-should adjust its priority accordingly.
-
-If the owner is also blocked on a lock, and had its pi_waiters changed
-(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
-and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
-
-Now all locks are released, and if the current process is still blocked on a
-mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
-
-Waking up in the loop
----------------------
-
-The task can then wake up for a couple of reasons:
-  1) The previous lock owner released the lock, and the task now is top_waiter
-  2) we received a signal or timeout
-
-In both cases, the task will try again to acquire the lock. If it
-does, then it will take itself off the waiters tree and set itself back
-to the TASK_RUNNING state.
-
-In first case, if the lock was acquired by another task before this task
-could get the lock, then it will go back to sleep and wait to be woken again.
-
-The second case is only applicable for tasks that are grabbing a mutex
-that can wake up before getting the lock, either due to a signal or
-a timeout (i.e. rt_mutex_timed_futex_lock()). When woken, it will try to
-take the lock again, if it succeeds, then the task will return with the
-lock held, otherwise it will return with -EINTR if the task was woken
-by a signal, or -ETIMEDOUT if it timed out.
-
-
-Unlocking the Mutex
--------------------
-
-The unlocking of a mutex also has a fast path for those architectures with
-CMPXCHG.  Since the taking of a mutex on contention always sets the
-"Has Waiters" flag of the mutex's owner, we use this to know if we need to
-take the slow path when unlocking the mutex.  If the mutex doesn't have any
-waiters, the owner field of the mutex would equal the current process and
-the mutex can be unlocked by just replacing the owner field with NULL.
-
-If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
-the slow unlock path is taken.
-
-The first thing done in the slow unlock path is to take the wait_lock of the
-mutex.  This synchronizes the locking and unlocking of the mutex.
-
-A check is made to see if the mutex has waiters or not.  On architectures that
-do not have CMPXCHG, this is the location that the owner of the mutex will
-determine if a waiter needs to be awoken or not.  On architectures that
-do have CMPXCHG, that check is done in the fast path, but it is still needed
-in the slow path too.  If a waiter of a mutex woke up because of a signal
-or timeout between the time the owner failed the fast path CMPXCHG check and
-the grabbing of the wait_lock, the mutex may not have any waiters, thus the
-owner still needs to make this check. If there are no waiters then the mutex
-owner field is set to NULL, the wait_lock is released and nothing more is
-needed.
-
-If there are waiters, then we need to wake one up.
-
-On the wake up code, the pi_lock of the current owner is taken.  The top
-waiter of the lock is found and removed from the waiters tree of the mutex
-as well as the pi_waiters tree of the current owner. The "Has Waiters" bit is
-marked to prevent lower priority tasks from stealing the lock.
-
-Finally we unlock the pi_lock of the pending owner and wake it up.
-
-
-Contact
--------
-
-For updates on this document, please email Steven Rostedt <rostedt@goodmis.org>
-
-
-Credits
--------
-
-Author:  Steven Rostedt <rostedt@goodmis.org>
-Updated: Alex Shi <alex.shi@linaro.org>	- 7/6/2017
-
-Original Reviewers:  Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and
-		     Randy Dunlap
-Update (7/6/2017) Reviewers: Steven Rostedt and Sebastian Siewior
-
-Updates
--------
-
-This document was originally written for 2.6.17-rc3-mm1
-was updated on 4.12
diff --git a/Documentation/locking/rt-mutex.rst b/Documentation/locking/rt-mutex.rst
new file mode 100644
index 000000000000..c365dc302081
--- /dev/null
+++ b/Documentation/locking/rt-mutex.rst
@@ -0,0 +1,77 @@
+==================================
+RT-mutex subsystem with PI support
+==================================
+
+RT-mutexes with priority inheritance are used to support PI-futexes,
+which enable pthread_mutex_t priority inheritance attributes
+(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
+about PI-futexes.]
+
+This technology was developed in the -rt tree and streamlined for
+pthread_mutex support.
+
+Basic principles:
+-----------------
+
+RT-mutexes extend the semantics of simple mutexes by the priority
+inheritance protocol.
+
+A low priority owner of a rt-mutex inherits the priority of a higher
+priority waiter until the rt-mutex is released. If the temporarily
+boosted owner blocks on a rt-mutex itself it propagates the priority
+boosting to the owner of the other rt_mutex it gets blocked on. The
+priority boosting is immediately removed once the rt_mutex has been
+unlocked.
+
+This approach allows us to shorten the block of high-prio tasks on
+mutexes which protect shared resources. Priority inheritance is not a
+magic bullet for poorly designed applications, but it allows
+well-designed applications to use userspace locks in critical parts of
+an high priority thread, without losing determinism.
+
+The enqueueing of the waiters into the rtmutex waiter tree is done in
+priority order. For same priorities FIFO order is chosen. For each
+rtmutex, only the top priority waiter is enqueued into the owner's
+priority waiters tree. This tree too queues in priority order. Whenever
+the top priority waiter of a task changes (for example it timed out or
+got a signal), the priority of the owner task is readjusted. The
+priority enqueueing is handled by "pi_waiters".
+
+RT-mutexes are optimized for fastpath operations and have no internal
+locking overhead when locking an uncontended mutex or unlocking a mutex
+without waiters. The optimized fastpath operations require cmpxchg
+support. [If that is not available then the rt-mutex internal spinlock
+is used]
+
+The state of the rt-mutex is tracked via the owner field of the rt-mutex
+structure:
+
+lock->owner holds the task_struct pointer of the owner. Bit 0 is used to
+keep track of the "lock has waiters" state:
+
+ ============ ======= ================================================
+ owner        bit0    Notes
+ ============ ======= ================================================
+ NULL         0       lock is free (fast acquire possible)
+ NULL         1       lock is free and has waiters and the top waiter
+		      is going to take the lock [1]_
+ taskpointer  0       lock is held (fast release possible)
+ taskpointer  1       lock is held and has waiters [2]_
+ ============ ======= ================================================
+
+The fast atomic compare exchange based acquire and release is only
+possible when bit 0 of lock->owner is 0.
+
+.. [1] It also can be a transitional state when grabbing the lock
+       with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
+       we need to set the bit0 before looking at the lock, and the owner may
+       be NULL in this small time, hence this can be a transitional state.
+
+.. [2] There is a small time when bit 0 is set but there are no
+       waiters. This can happen when grabbing the lock in the slow path.
+       To prevent a cmpxchg of the owner releasing the lock, we need to
+       set this bit before looking at the lock.
+
+BTW, there is still technically a "Pending Owner", it's just not called
+that anymore. The pending owner happens to be the top_waiter of a lock
+that has no owner and has been woken up to grab the lock.
diff --git a/Documentation/locking/rt-mutex.txt b/Documentation/locking/rt-mutex.txt
deleted file mode 100644
index 35793e003041..000000000000
--- a/Documentation/locking/rt-mutex.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-RT-mutex subsystem with PI support
-----------------------------------
-
-RT-mutexes with priority inheritance are used to support PI-futexes,
-which enable pthread_mutex_t priority inheritance attributes
-(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
-about PI-futexes.]
-
-This technology was developed in the -rt tree and streamlined for
-pthread_mutex support.
-
-Basic principles:
------------------
-
-RT-mutexes extend the semantics of simple mutexes by the priority
-inheritance protocol.
-
-A low priority owner of a rt-mutex inherits the priority of a higher
-priority waiter until the rt-mutex is released. If the temporarily
-boosted owner blocks on a rt-mutex itself it propagates the priority
-boosting to the owner of the other rt_mutex it gets blocked on. The
-priority boosting is immediately removed once the rt_mutex has been
-unlocked.
-
-This approach allows us to shorten the block of high-prio tasks on
-mutexes which protect shared resources. Priority inheritance is not a
-magic bullet for poorly designed applications, but it allows
-well-designed applications to use userspace locks in critical parts of
-an high priority thread, without losing determinism.
-
-The enqueueing of the waiters into the rtmutex waiter tree is done in
-priority order. For same priorities FIFO order is chosen. For each
-rtmutex, only the top priority waiter is enqueued into the owner's
-priority waiters tree. This tree too queues in priority order. Whenever
-the top priority waiter of a task changes (for example it timed out or
-got a signal), the priority of the owner task is readjusted. The
-priority enqueueing is handled by "pi_waiters".
-
-RT-mutexes are optimized for fastpath operations and have no internal
-locking overhead when locking an uncontended mutex or unlocking a mutex
-without waiters. The optimized fastpath operations require cmpxchg
-support. [If that is not available then the rt-mutex internal spinlock
-is used]
-
-The state of the rt-mutex is tracked via the owner field of the rt-mutex
-structure:
-
-lock->owner holds the task_struct pointer of the owner. Bit 0 is used to
-keep track of the "lock has waiters" state.
-
- owner        bit0
- NULL         0       lock is free (fast acquire possible)
- NULL         1       lock is free and has waiters and the top waiter
-			is going to take the lock*
- taskpointer  0       lock is held (fast release possible)
- taskpointer  1       lock is held and has waiters**
-
-The fast atomic compare exchange based acquire and release is only
-possible when bit 0 of lock->owner is 0.
-
-(*) It also can be a transitional state when grabbing the lock
-with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
-we need to set the bit0 before looking at the lock, and the owner may be
-NULL in this small time, hence this can be a transitional state.
-
-(**) There is a small time when bit 0 is set but there are no
-waiters. This can happen when grabbing the lock in the slow path.
-To prevent a cmpxchg of the owner releasing the lock, we need to
-set this bit before looking at the lock.
-
-BTW, there is still technically a "Pending Owner", it's just not called
-that anymore. The pending owner happens to be the top_waiter of a lock
-that has no owner and has been woken up to grab the lock.
diff --git a/Documentation/locking/spinlocks.rst b/Documentation/locking/spinlocks.rst
new file mode 100644
index 000000000000..098107fb7d86
--- /dev/null
+++ b/Documentation/locking/spinlocks.rst
@@ -0,0 +1,177 @@
+===============
+Locking lessons
+===============
+
+Lesson 1: Spin locks
+====================
+
+The most basic primitive for locking is spinlock::
+
+  static DEFINE_SPINLOCK(xxx_lock);
+
+	unsigned long flags;
+
+	spin_lock_irqsave(&xxx_lock, flags);
+	... critical section here ..
+	spin_unlock_irqrestore(&xxx_lock, flags);
+
+The above is always safe. It will disable interrupts _locally_, but the
+spinlock itself will guarantee the global lock, so it will guarantee that
+there is only one thread-of-control within the region(s) protected by that
+lock. This works well even under UP also, so the code does _not_ need to
+worry about UP vs SMP issues: the spinlocks work correctly under both.
+
+   NOTE! Implications of spin_locks for memory are further described in:
+
+     Documentation/memory-barriers.txt
+
+       (5) LOCK operations.
+
+       (6) UNLOCK operations.
+
+The above is usually pretty simple (you usually need and want only one
+spinlock for most things - using more than one spinlock can make things a
+lot more complex and even slower and is usually worth it only for
+sequences that you **know** need to be split up: avoid it at all cost if you
+aren't sure).
+
+This is really the only really hard part about spinlocks: once you start
+using spinlocks they tend to expand to areas you might not have noticed
+before, because you have to make sure the spinlocks correctly protect the
+shared data structures **everywhere** they are used. The spinlocks are most
+easily added to places that are completely independent of other code (for
+example, internal driver data structures that nobody else ever touches).
+
+   NOTE! The spin-lock is safe only when you **also** use the lock itself
+   to do locking across CPU's, which implies that EVERYTHING that
+   touches a shared variable has to agree about the spinlock they want
+   to use.
+
+----
+
+Lesson 2: reader-writer spinlocks.
+==================================
+
+If your data accesses have a very natural pattern where you usually tend
+to mostly read from the shared variables, the reader-writer locks
+(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
+readers to be in the same critical region at once, but if somebody wants
+to change the variables it has to get an exclusive write lock.
+
+   NOTE! reader-writer locks require more atomic memory operations than
+   simple spinlocks.  Unless the reader critical section is long, you
+   are better off just using spinlocks.
+
+The routines look the same as above::
+
+   rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
+
+	unsigned long flags;
+
+	read_lock_irqsave(&xxx_lock, flags);
+	.. critical section that only reads the info ...
+	read_unlock_irqrestore(&xxx_lock, flags);
+
+	write_lock_irqsave(&xxx_lock, flags);
+	.. read and write exclusive access to the info ...
+	write_unlock_irqrestore(&xxx_lock, flags);
+
+The above kind of lock may be useful for complex data structures like
+linked lists, especially searching for entries without changing the list
+itself.  The read lock allows many concurrent readers.  Anything that
+**changes** the list will have to get the write lock.
+
+   NOTE! RCU is better for list traversal, but requires careful
+   attention to design detail (see Documentation/RCU/listRCU.txt).
+
+Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
+time need to do any changes (even if you don't do it every time), you have
+to get the write-lock at the very beginning.
+
+   NOTE! We are working hard to remove reader-writer spinlocks in most
+   cases, so please don't add a new one without consensus.  (Instead, see
+   Documentation/RCU/rcu.txt for complete information.)
+
+----
+
+Lesson 3: spinlocks revisited.
+==============================
+
+The single spin-lock primitives above are by no means the only ones. They
+are the most safe ones, and the ones that work under all circumstances,
+but partly **because** they are safe they are also fairly slow. They are slower
+than they'd need to be, because they do have to disable interrupts
+(which is just a single instruction on a x86, but it's an expensive one -
+and on other architectures it can be worse).
+
+If you have a case where you have to protect a data structure across
+several CPU's and you want to use spinlocks you can potentially use
+cheaper versions of the spinlocks. IFF you know that the spinlocks are
+never used in interrupt handlers, you can use the non-irq versions::
+
+	spin_lock(&lock);
+	...
+	spin_unlock(&lock);
+
+(and the equivalent read-write versions too, of course). The spinlock will
+guarantee the same kind of exclusive access, and it will be much faster.
+This is useful if you know that the data in question is only ever
+manipulated from a "process context", ie no interrupts involved.
+
+The reasons you mustn't use these versions if you have interrupts that
+play with the spinlock is that you can get deadlocks::
+
+	spin_lock(&lock);
+	...
+		<- interrupt comes in:
+			spin_lock(&lock);
+
+where an interrupt tries to lock an already locked variable. This is ok if
+the other interrupt happens on another CPU, but it is _not_ ok if the
+interrupt happens on the same CPU that already holds the lock, because the
+lock will obviously never be released (because the interrupt is waiting
+for the lock, and the lock-holder is interrupted by the interrupt and will
+not continue until the interrupt has been processed).
+
+(This is also the reason why the irq-versions of the spinlocks only need
+to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
+on other CPU's, because an interrupt on another CPU doesn't interrupt the
+CPU that holds the lock, so the lock-holder can continue and eventually
+releases the lock).
+
+Note that you can be clever with read-write locks and interrupts. For
+example, if you know that the interrupt only ever gets a read-lock, then
+you can use a non-irq version of read locks everywhere - because they
+don't block on each other (and thus there is no dead-lock wrt interrupts.
+But when you do the write-lock, you have to use the irq-safe version.
+
+For an example of being clever with rw-locks, see the "waitqueue_lock"
+handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
+within an interrupt, they only read the queue in order to know whom to
+wake up. So read-locks are safe (which is good: they are very common
+indeed), while write-locks need to protect themselves against interrupts.
+
+		Linus
+
+----
+
+Reference information:
+======================
+
+For dynamic initialization, use spin_lock_init() or rwlock_init() as
+appropriate::
+
+   spinlock_t xxx_lock;
+   rwlock_t xxx_rw_lock;
+
+   static int __init xxx_init(void)
+   {
+	spin_lock_init(&xxx_lock);
+	rwlock_init(&xxx_rw_lock);
+	...
+   }
+
+   module_init(xxx_init);
+
+For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
+__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
diff --git a/Documentation/locking/spinlocks.txt b/Documentation/locking/spinlocks.txt
deleted file mode 100644
index ff35e40bdf5b..000000000000
--- a/Documentation/locking/spinlocks.txt
+++ /dev/null
@@ -1,167 +0,0 @@
-Lesson 1: Spin locks
-
-The most basic primitive for locking is spinlock.
-
-static DEFINE_SPINLOCK(xxx_lock);
-
-	unsigned long flags;
-
-	spin_lock_irqsave(&xxx_lock, flags);
-	... critical section here ..
-	spin_unlock_irqrestore(&xxx_lock, flags);
-
-The above is always safe. It will disable interrupts _locally_, but the
-spinlock itself will guarantee the global lock, so it will guarantee that
-there is only one thread-of-control within the region(s) protected by that
-lock. This works well even under UP also, so the code does _not_ need to
-worry about UP vs SMP issues: the spinlocks work correctly under both.
-
-   NOTE! Implications of spin_locks for memory are further described in:
-
-     Documentation/memory-barriers.txt
-       (5) LOCK operations.
-       (6) UNLOCK operations.
-
-The above is usually pretty simple (you usually need and want only one
-spinlock for most things - using more than one spinlock can make things a
-lot more complex and even slower and is usually worth it only for
-sequences that you _know_ need to be split up: avoid it at all cost if you
-aren't sure).
-
-This is really the only really hard part about spinlocks: once you start
-using spinlocks they tend to expand to areas you might not have noticed
-before, because you have to make sure the spinlocks correctly protect the
-shared data structures _everywhere_ they are used. The spinlocks are most
-easily added to places that are completely independent of other code (for
-example, internal driver data structures that nobody else ever touches).
-
-   NOTE! The spin-lock is safe only when you _also_ use the lock itself
-   to do locking across CPU's, which implies that EVERYTHING that
-   touches a shared variable has to agree about the spinlock they want
-   to use.
-
-----
-
-Lesson 2: reader-writer spinlocks.
-
-If your data accesses have a very natural pattern where you usually tend
-to mostly read from the shared variables, the reader-writer locks
-(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
-readers to be in the same critical region at once, but if somebody wants
-to change the variables it has to get an exclusive write lock.
-
-   NOTE! reader-writer locks require more atomic memory operations than
-   simple spinlocks.  Unless the reader critical section is long, you
-   are better off just using spinlocks.
-
-The routines look the same as above:
-
-   rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
-
-	unsigned long flags;
-
-	read_lock_irqsave(&xxx_lock, flags);
-	.. critical section that only reads the info ...
-	read_unlock_irqrestore(&xxx_lock, flags);
-
-	write_lock_irqsave(&xxx_lock, flags);
-	.. read and write exclusive access to the info ...
-	write_unlock_irqrestore(&xxx_lock, flags);
-
-The above kind of lock may be useful for complex data structures like
-linked lists, especially searching for entries without changing the list
-itself.  The read lock allows many concurrent readers.  Anything that
-_changes_ the list will have to get the write lock.
-
-   NOTE! RCU is better for list traversal, but requires careful
-   attention to design detail (see Documentation/RCU/listRCU.txt).
-
-Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
-time need to do any changes (even if you don't do it every time), you have
-to get the write-lock at the very beginning.
-
-   NOTE! We are working hard to remove reader-writer spinlocks in most
-   cases, so please don't add a new one without consensus.  (Instead, see
-   Documentation/RCU/rcu.txt for complete information.)
-
-----
-
-Lesson 3: spinlocks revisited.
-
-The single spin-lock primitives above are by no means the only ones. They
-are the most safe ones, and the ones that work under all circumstances,
-but partly _because_ they are safe they are also fairly slow. They are slower
-than they'd need to be, because they do have to disable interrupts
-(which is just a single instruction on a x86, but it's an expensive one -
-and on other architectures it can be worse).
-
-If you have a case where you have to protect a data structure across
-several CPU's and you want to use spinlocks you can potentially use
-cheaper versions of the spinlocks. IFF you know that the spinlocks are
-never used in interrupt handlers, you can use the non-irq versions:
-
-	spin_lock(&lock);
-	...
-	spin_unlock(&lock);
-
-(and the equivalent read-write versions too, of course). The spinlock will
-guarantee the same kind of exclusive access, and it will be much faster.
-This is useful if you know that the data in question is only ever
-manipulated from a "process context", ie no interrupts involved.
-
-The reasons you mustn't use these versions if you have interrupts that
-play with the spinlock is that you can get deadlocks:
-
-	spin_lock(&lock);
-	...
-		<- interrupt comes in:
-			spin_lock(&lock);
-
-where an interrupt tries to lock an already locked variable. This is ok if
-the other interrupt happens on another CPU, but it is _not_ ok if the
-interrupt happens on the same CPU that already holds the lock, because the
-lock will obviously never be released (because the interrupt is waiting
-for the lock, and the lock-holder is interrupted by the interrupt and will
-not continue until the interrupt has been processed).
-
-(This is also the reason why the irq-versions of the spinlocks only need
-to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
-on other CPU's, because an interrupt on another CPU doesn't interrupt the
-CPU that holds the lock, so the lock-holder can continue and eventually
-releases the lock).
-
-Note that you can be clever with read-write locks and interrupts. For
-example, if you know that the interrupt only ever gets a read-lock, then
-you can use a non-irq version of read locks everywhere - because they
-don't block on each other (and thus there is no dead-lock wrt interrupts.
-But when you do the write-lock, you have to use the irq-safe version.
-
-For an example of being clever with rw-locks, see the "waitqueue_lock"
-handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
-within an interrupt, they only read the queue in order to know whom to
-wake up. So read-locks are safe (which is good: they are very common
-indeed), while write-locks need to protect themselves against interrupts.
-
-		Linus
-
-----
-
-Reference information:
-
-For dynamic initialization, use spin_lock_init() or rwlock_init() as
-appropriate:
-
-   spinlock_t xxx_lock;
-   rwlock_t xxx_rw_lock;
-
-   static int __init xxx_init(void)
-   {
-	spin_lock_init(&xxx_lock);
-	rwlock_init(&xxx_rw_lock);
-	...
-   }
-
-   module_init(xxx_init);
-
-For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
-__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
diff --git a/Documentation/locking/ww-mutex-design.rst b/Documentation/locking/ww-mutex-design.rst
new file mode 100644
index 000000000000..1846c199da23
--- /dev/null
+++ b/Documentation/locking/ww-mutex-design.rst
@@ -0,0 +1,393 @@
+======================================
+Wound/Wait Deadlock-Proof Mutex Design
+======================================
+
+Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
+
+Motivation for WW-Mutexes
+-------------------------
+
+GPU's do operations that commonly involve many buffers.  Those buffers
+can be shared across contexts/processes, exist in different memory
+domains (for example VRAM vs system memory), and so on.  And with
+PRIME / dmabuf, they can even be shared across devices.  So there are
+a handful of situations where the driver needs to wait for buffers to
+become ready.  If you think about this in terms of waiting on a buffer
+mutex for it to become available, this presents a problem because
+there is no way to guarantee that buffers appear in a execbuf/batch in
+the same order in all contexts.  That is directly under control of
+userspace, and a result of the sequence of GL calls that an application
+makes.	Which results in the potential for deadlock.  The problem gets
+more complex when you consider that the kernel may need to migrate the
+buffer(s) into VRAM before the GPU operates on the buffer(s), which
+may in turn require evicting some other buffers (and you don't want to
+evict other buffers which are already queued up to the GPU), but for a
+simplified understanding of the problem you can ignore this.
+
+The algorithm that the TTM graphics subsystem came up with for dealing with
+this problem is quite simple.  For each group of buffers (execbuf) that need
+to be locked, the caller would be assigned a unique reservation id/ticket,
+from a global counter.  In case of deadlock while locking all the buffers
+associated with a execbuf, the one with the lowest reservation ticket (i.e.
+the oldest task) wins, and the one with the higher reservation id (i.e. the
+younger task) unlocks all of the buffers that it has already locked, and then
+tries again.
+
+In the RDBMS literature, a reservation ticket is associated with a transaction.
+and the deadlock handling approach is called Wait-Die. The name is based on
+the actions of a locking thread when it encounters an already locked mutex.
+If the transaction holding the lock is younger, the locking transaction waits.
+If the transaction holding the lock is older, the locking transaction backs off
+and dies. Hence Wait-Die.
+There is also another algorithm called Wound-Wait:
+If the transaction holding the lock is younger, the locking transaction
+wounds the transaction holding the lock, requesting it to die.
+If the transaction holding the lock is older, it waits for the other
+transaction. Hence Wound-Wait.
+The two algorithms are both fair in that a transaction will eventually succeed.
+However, the Wound-Wait algorithm is typically stated to generate fewer backoffs
+compared to Wait-Die, but is, on the other hand, associated with more work than
+Wait-Die when recovering from a backoff. Wound-Wait is also a preemptive
+algorithm in that transactions are wounded by other transactions, and that
+requires a reliable way to pick up up the wounded condition and preempt the
+running transaction. Note that this is not the same as process preemption. A
+Wound-Wait transaction is considered preempted when it dies (returning
+-EDEADLK) following a wound.
+
+Concepts
+--------
+
+Compared to normal mutexes two additional concepts/objects show up in the lock
+interface for w/w mutexes:
+
+Acquire context: To ensure eventual forward progress it is important the a task
+trying to acquire locks doesn't grab a new reservation id, but keeps the one it
+acquired when starting the lock acquisition. This ticket is stored in the
+acquire context. Furthermore the acquire context keeps track of debugging state
+to catch w/w mutex interface abuse. An acquire context is representing a
+transaction.
+
+W/w class: In contrast to normal mutexes the lock class needs to be explicit for
+w/w mutexes, since it is required to initialize the acquire context. The lock
+class also specifies what algorithm to use, Wound-Wait or Wait-Die.
+
+Furthermore there are three different class of w/w lock acquire functions:
+
+* Normal lock acquisition with a context, using ww_mutex_lock.
+
+* Slowpath lock acquisition on the contending lock, used by the task that just
+  killed its transaction after having dropped all already acquired locks.
+  These functions have the _slow postfix.
+
+  From a simple semantics point-of-view the _slow functions are not strictly
+  required, since simply calling the normal ww_mutex_lock functions on the
+  contending lock (after having dropped all other already acquired locks) will
+  work correctly. After all if no other ww mutex has been acquired yet there's
+  no deadlock potential and hence the ww_mutex_lock call will block and not
+  prematurely return -EDEADLK. The advantage of the _slow functions is in
+  interface safety:
+
+  - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow
+    has a void return type. Note that since ww mutex code needs loops/retries
+    anyway the __must_check doesn't result in spurious warnings, even though the
+    very first lock operation can never fail.
+  - When full debugging is enabled ww_mutex_lock_slow checks that all acquired
+    ww mutex have been released (preventing deadlocks) and makes sure that we
+    block on the contending lock (preventing spinning through the -EDEADLK
+    slowpath until the contended lock can be acquired).
+
+* Functions to only acquire a single w/w mutex, which results in the exact same
+  semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL
+  context.
+
+  Again this is not strictly required. But often you only want to acquire a
+  single lock in which case it's pointless to set up an acquire context (and so
+  better to avoid grabbing a deadlock avoidance ticket).
+
+Of course, all the usual variants for handling wake-ups due to signals are also
+provided.
+
+Usage
+-----
+
+The algorithm (Wait-Die vs Wound-Wait) is chosen by using either
+DEFINE_WW_CLASS() (Wound-Wait) or DEFINE_WD_CLASS() (Wait-Die)
+As a rough rule of thumb, use Wound-Wait iff you
+expect the number of simultaneous competing transactions to be typically small,
+and you want to reduce the number of rollbacks.
+
+Three different ways to acquire locks within the same w/w class. Common
+definitions for methods #1 and #2::
+
+  static DEFINE_WW_CLASS(ww_class);
+
+  struct obj {
+	struct ww_mutex lock;
+	/* obj data */
+  };
+
+  struct obj_entry {
+	struct list_head head;
+	struct obj *obj;
+  };
+
+Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
+This is useful if a list of required objects is already tracked somewhere.
+Furthermore the lock helper can use propagate the -EALREADY return code back to
+the caller as a signal that an object is twice on the list. This is useful if
+the list is constructed from userspace input and the ABI requires userspace to
+not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl)::
+
+  int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj *res_obj = NULL;
+	struct obj_entry *contended_entry = NULL;
+	struct obj_entry *entry;
+
+	ww_acquire_init(ctx, &ww_class);
+
+  retry:
+	list_for_each_entry (entry, list, head) {
+		if (entry->obj == res_obj) {
+			res_obj = NULL;
+			continue;
+		}
+		ret = ww_mutex_lock(&entry->obj->lock, ctx);
+		if (ret < 0) {
+			contended_entry = entry;
+			goto err;
+		}
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+
+  err:
+	list_for_each_entry_continue_reverse (entry, list, head)
+		ww_mutex_unlock(&entry->obj->lock);
+
+	if (res_obj)
+		ww_mutex_unlock(&res_obj->lock);
+
+	if (ret == -EDEADLK) {
+		/* we lost out in a seqno race, lock and retry.. */
+		ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
+		res_obj = contended_entry->obj;
+		goto retry;
+	}
+	ww_acquire_fini(ctx);
+
+	return ret;
+  }
+
+Method 2, using a list in execbuf->buffers that can be reordered. Same semantics
+of duplicate entry detection using -EALREADY as method 1 above. But the
+list-reordering allows for a bit more idiomatic code::
+
+  int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj_entry *entry, *entry2;
+
+	ww_acquire_init(ctx, &ww_class);
+
+	list_for_each_entry (entry, list, head) {
+		ret = ww_mutex_lock(&entry->obj->lock, ctx);
+		if (ret < 0) {
+			entry2 = entry;
+
+			list_for_each_entry_continue_reverse (entry2, list, head)
+				ww_mutex_unlock(&entry2->obj->lock);
+
+			if (ret != -EDEADLK) {
+				ww_acquire_fini(ctx);
+				return ret;
+			}
+
+			/* we lost out in a seqno race, lock and retry.. */
+			ww_mutex_lock_slow(&entry->obj->lock, ctx);
+
+			/*
+			 * Move buf to head of the list, this will point
+			 * buf->next to the first unlocked entry,
+			 * restarting the for loop.
+			 */
+			list_del(&entry->head);
+			list_add(&entry->head, list);
+		}
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+  }
+
+Unlocking works the same way for both methods #1 and #2::
+
+  void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj_entry *entry;
+
+	list_for_each_entry (entry, list, head)
+		ww_mutex_unlock(&entry->obj->lock);
+
+	ww_acquire_fini(ctx);
+  }
+
+Method 3 is useful if the list of objects is constructed ad-hoc and not upfront,
+e.g. when adjusting edges in a graph where each node has its own ww_mutex lock,
+and edges can only be changed when holding the locks of all involved nodes. w/w
+mutexes are a natural fit for such a case for two reasons:
+
+- They can handle lock-acquisition in any order which allows us to start walking
+  a graph from a starting point and then iteratively discovering new edges and
+  locking down the nodes those edges connect to.
+- Due to the -EALREADY return code signalling that a given objects is already
+  held there's no need for additional book-keeping to break cycles in the graph
+  or keep track off which looks are already held (when using more than one node
+  as a starting point).
+
+Note that this approach differs in two important ways from the above methods:
+
+- Since the list of objects is dynamically constructed (and might very well be
+  different when retrying due to hitting the -EDEADLK die condition) there's
+  no need to keep any object on a persistent list when it's not locked. We can
+  therefore move the list_head into the object itself.
+- On the other hand the dynamic object list construction also means that the -EALREADY return
+  code can't be propagated.
+
+Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a
+list of starting nodes (passed in from userspace) using one of the above
+methods. And then lock any additional objects affected by the operations using
+method #3 below. The backoff/retry procedure will be a bit more involved, since
+when the dynamic locking step hits -EDEADLK we also need to unlock all the
+objects acquired with the fixed list. But the w/w mutex debug checks will catch
+any interface misuse for these cases.
+
+Also, method 3 can't fail the lock acquisition step since it doesn't return
+-EALREADY. Of course this would be different when using the _interruptible
+variants, but that's outside of the scope of these examples here::
+
+  struct obj {
+	struct ww_mutex ww_mutex;
+	struct list_head locked_list;
+  };
+
+  static DEFINE_WW_CLASS(ww_class);
+
+  void __unlock_objs(struct list_head *list)
+  {
+	struct obj *entry, *temp;
+
+	list_for_each_entry_safe (entry, temp, list, locked_list) {
+		/* need to do that before unlocking, since only the current lock holder is
+		allowed to use object */
+		list_del(&entry->locked_list);
+		ww_mutex_unlock(entry->ww_mutex)
+	}
+  }
+
+  void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	struct obj *obj;
+
+	ww_acquire_init(ctx, &ww_class);
+
+  retry:
+	/* re-init loop start state */
+	loop {
+		/* magic code which walks over a graph and decides which objects
+		 * to lock */
+
+		ret = ww_mutex_lock(obj->ww_mutex, ctx);
+		if (ret == -EALREADY) {
+			/* we have that one already, get to the next object */
+			continue;
+		}
+		if (ret == -EDEADLK) {
+			__unlock_objs(list);
+
+			ww_mutex_lock_slow(obj, ctx);
+			list_add(&entry->locked_list, list);
+			goto retry;
+		}
+
+		/* locked a new object, add it to the list */
+		list_add_tail(&entry->locked_list, list);
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+  }
+
+  void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+  {
+	__unlock_objs(list);
+	ww_acquire_fini(ctx);
+  }
+
+Method 4: Only lock one single objects. In that case deadlock detection and
+prevention is obviously overkill, since with grabbing just one lock you can't
+produce a deadlock within just one class. To simplify this case the w/w mutex
+api can be used with a NULL context.
+
+Implementation Details
+----------------------
+
+Design:
+^^^^^^^
+
+  ww_mutex currently encapsulates a struct mutex, this means no extra overhead for
+  normal mutex locks, which are far more common. As such there is only a small
+  increase in code size if wait/wound mutexes are not used.
+
+  We maintain the following invariants for the wait list:
+
+  (1) Waiters with an acquire context are sorted by stamp order; waiters
+      without an acquire context are interspersed in FIFO order.
+  (2) For Wait-Die, among waiters with contexts, only the first one can have
+      other locks acquired already (ctx->acquired > 0). Note that this waiter
+      may come after other waiters without contexts in the list.
+
+  The Wound-Wait preemption is implemented with a lazy-preemption scheme:
+  The wounded status of the transaction is checked only when there is
+  contention for a new lock and hence a true chance of deadlock. In that
+  situation, if the transaction is wounded, it backs off, clears the
+  wounded status and retries. A great benefit of implementing preemption in
+  this way is that the wounded transaction can identify a contending lock to
+  wait for before restarting the transaction. Just blindly restarting the
+  transaction would likely make the transaction end up in a situation where
+  it would have to back off again.
+
+  In general, not much contention is expected. The locks are typically used to
+  serialize access to resources for devices, and optimization focus should
+  therefore be directed towards the uncontended cases.
+
+Lockdep:
+^^^^^^^^
+
+  Special care has been taken to warn for as many cases of api abuse
+  as possible. Some common api abuses will be caught with
+  CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended.
+
+  Some of the errors which will be warned about:
+   - Forgetting to call ww_acquire_fini or ww_acquire_init.
+   - Attempting to lock more mutexes after ww_acquire_done.
+   - Attempting to lock the wrong mutex after -EDEADLK and
+     unlocking all mutexes.
+   - Attempting to lock the right mutex after -EDEADLK,
+     before unlocking all mutexes.
+
+   - Calling ww_mutex_lock_slow before -EDEADLK was returned.
+
+   - Unlocking mutexes with the wrong unlock function.
+   - Calling one of the ww_acquire_* twice on the same context.
+   - Using a different ww_class for the mutex than for the ww_acquire_ctx.
+   - Normal lockdep errors that can result in deadlocks.
+
+  Some of the lockdep errors that can result in deadlocks:
+   - Calling ww_acquire_init to initialize a second ww_acquire_ctx before
+     having called ww_acquire_fini on the first.
+   - 'normal' deadlocks that can occur.
+
+FIXME:
+  Update this section once we have the TASK_DEADLOCK task state flag magic
+  implemented.
diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt
deleted file mode 100644
index f0ed7c30e695..000000000000
--- a/Documentation/locking/ww-mutex-design.txt
+++ /dev/null
@@ -1,383 +0,0 @@
-Wound/Wait Deadlock-Proof Mutex Design
-======================================
-
-Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
-
-Motivation for WW-Mutexes
--------------------------
-
-GPU's do operations that commonly involve many buffers.  Those buffers
-can be shared across contexts/processes, exist in different memory
-domains (for example VRAM vs system memory), and so on.  And with
-PRIME / dmabuf, they can even be shared across devices.  So there are
-a handful of situations where the driver needs to wait for buffers to
-become ready.  If you think about this in terms of waiting on a buffer
-mutex for it to become available, this presents a problem because
-there is no way to guarantee that buffers appear in a execbuf/batch in
-the same order in all contexts.  That is directly under control of
-userspace, and a result of the sequence of GL calls that an application
-makes.	Which results in the potential for deadlock.  The problem gets
-more complex when you consider that the kernel may need to migrate the
-buffer(s) into VRAM before the GPU operates on the buffer(s), which
-may in turn require evicting some other buffers (and you don't want to
-evict other buffers which are already queued up to the GPU), but for a
-simplified understanding of the problem you can ignore this.
-
-The algorithm that the TTM graphics subsystem came up with for dealing with
-this problem is quite simple.  For each group of buffers (execbuf) that need
-to be locked, the caller would be assigned a unique reservation id/ticket,
-from a global counter.  In case of deadlock while locking all the buffers
-associated with a execbuf, the one with the lowest reservation ticket (i.e.
-the oldest task) wins, and the one with the higher reservation id (i.e. the
-younger task) unlocks all of the buffers that it has already locked, and then
-tries again.
-
-In the RDBMS literature, a reservation ticket is associated with a transaction.
-and the deadlock handling approach is called Wait-Die. The name is based on
-the actions of a locking thread when it encounters an already locked mutex.
-If the transaction holding the lock is younger, the locking transaction waits.
-If the transaction holding the lock is older, the locking transaction backs off
-and dies. Hence Wait-Die.
-There is also another algorithm called Wound-Wait:
-If the transaction holding the lock is younger, the locking transaction
-wounds the transaction holding the lock, requesting it to die.
-If the transaction holding the lock is older, it waits for the other
-transaction. Hence Wound-Wait.
-The two algorithms are both fair in that a transaction will eventually succeed.
-However, the Wound-Wait algorithm is typically stated to generate fewer backoffs
-compared to Wait-Die, but is, on the other hand, associated with more work than
-Wait-Die when recovering from a backoff. Wound-Wait is also a preemptive
-algorithm in that transactions are wounded by other transactions, and that
-requires a reliable way to pick up up the wounded condition and preempt the
-running transaction. Note that this is not the same as process preemption. A
-Wound-Wait transaction is considered preempted when it dies (returning
--EDEADLK) following a wound.
-
-Concepts
---------
-
-Compared to normal mutexes two additional concepts/objects show up in the lock
-interface for w/w mutexes:
-
-Acquire context: To ensure eventual forward progress it is important the a task
-trying to acquire locks doesn't grab a new reservation id, but keeps the one it
-acquired when starting the lock acquisition. This ticket is stored in the
-acquire context. Furthermore the acquire context keeps track of debugging state
-to catch w/w mutex interface abuse. An acquire context is representing a
-transaction.
-
-W/w class: In contrast to normal mutexes the lock class needs to be explicit for
-w/w mutexes, since it is required to initialize the acquire context. The lock
-class also specifies what algorithm to use, Wound-Wait or Wait-Die.
-
-Furthermore there are three different class of w/w lock acquire functions:
-
-* Normal lock acquisition with a context, using ww_mutex_lock.
-
-* Slowpath lock acquisition on the contending lock, used by the task that just
-  killed its transaction after having dropped all already acquired locks.
-  These functions have the _slow postfix.
-
-  From a simple semantics point-of-view the _slow functions are not strictly
-  required, since simply calling the normal ww_mutex_lock functions on the
-  contending lock (after having dropped all other already acquired locks) will
-  work correctly. After all if no other ww mutex has been acquired yet there's
-  no deadlock potential and hence the ww_mutex_lock call will block and not
-  prematurely return -EDEADLK. The advantage of the _slow functions is in
-  interface safety:
-  - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow
-    has a void return type. Note that since ww mutex code needs loops/retries
-    anyway the __must_check doesn't result in spurious warnings, even though the
-    very first lock operation can never fail.
-  - When full debugging is enabled ww_mutex_lock_slow checks that all acquired
-    ww mutex have been released (preventing deadlocks) and makes sure that we
-    block on the contending lock (preventing spinning through the -EDEADLK
-    slowpath until the contended lock can be acquired).
-
-* Functions to only acquire a single w/w mutex, which results in the exact same
-  semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL
-  context.
-
-  Again this is not strictly required. But often you only want to acquire a
-  single lock in which case it's pointless to set up an acquire context (and so
-  better to avoid grabbing a deadlock avoidance ticket).
-
-Of course, all the usual variants for handling wake-ups due to signals are also
-provided.
-
-Usage
------
-
-The algorithm (Wait-Die vs Wound-Wait) is chosen by using either
-DEFINE_WW_CLASS() (Wound-Wait) or DEFINE_WD_CLASS() (Wait-Die)
-As a rough rule of thumb, use Wound-Wait iff you
-expect the number of simultaneous competing transactions to be typically small,
-and you want to reduce the number of rollbacks.
-
-Three different ways to acquire locks within the same w/w class. Common
-definitions for methods #1 and #2:
-
-static DEFINE_WW_CLASS(ww_class);
-
-struct obj {
-	struct ww_mutex lock;
-	/* obj data */
-};
-
-struct obj_entry {
-	struct list_head head;
-	struct obj *obj;
-};
-
-Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
-This is useful if a list of required objects is already tracked somewhere.
-Furthermore the lock helper can use propagate the -EALREADY return code back to
-the caller as a signal that an object is twice on the list. This is useful if
-the list is constructed from userspace input and the ABI requires userspace to
-not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl).
-
-int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj *res_obj = NULL;
-	struct obj_entry *contended_entry = NULL;
-	struct obj_entry *entry;
-
-	ww_acquire_init(ctx, &ww_class);
-
-retry:
-	list_for_each_entry (entry, list, head) {
-		if (entry->obj == res_obj) {
-			res_obj = NULL;
-			continue;
-		}
-		ret = ww_mutex_lock(&entry->obj->lock, ctx);
-		if (ret < 0) {
-			contended_entry = entry;
-			goto err;
-		}
-	}
-
-	ww_acquire_done(ctx);
-	return 0;
-
-err:
-	list_for_each_entry_continue_reverse (entry, list, head)
-		ww_mutex_unlock(&entry->obj->lock);
-
-	if (res_obj)
-		ww_mutex_unlock(&res_obj->lock);
-
-	if (ret == -EDEADLK) {
-		/* we lost out in a seqno race, lock and retry.. */
-		ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
-		res_obj = contended_entry->obj;
-		goto retry;
-	}
-	ww_acquire_fini(ctx);
-
-	return ret;
-}
-
-Method 2, using a list in execbuf->buffers that can be reordered. Same semantics
-of duplicate entry detection using -EALREADY as method 1 above. But the
-list-reordering allows for a bit more idiomatic code.
-
-int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj_entry *entry, *entry2;
-
-	ww_acquire_init(ctx, &ww_class);
-
-	list_for_each_entry (entry, list, head) {
-		ret = ww_mutex_lock(&entry->obj->lock, ctx);
-		if (ret < 0) {
-			entry2 = entry;
-
-			list_for_each_entry_continue_reverse (entry2, list, head)
-				ww_mutex_unlock(&entry2->obj->lock);
-
-			if (ret != -EDEADLK) {
-				ww_acquire_fini(ctx);
-				return ret;
-			}
-
-			/* we lost out in a seqno race, lock and retry.. */
-			ww_mutex_lock_slow(&entry->obj->lock, ctx);
-
-			/*
-			 * Move buf to head of the list, this will point
-			 * buf->next to the first unlocked entry,
-			 * restarting the for loop.
-			 */
-			list_del(&entry->head);
-			list_add(&entry->head, list);
-		}
-	}
-
-	ww_acquire_done(ctx);
-	return 0;
-}
-
-Unlocking works the same way for both methods #1 and #2:
-
-void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj_entry *entry;
-
-	list_for_each_entry (entry, list, head)
-		ww_mutex_unlock(&entry->obj->lock);
-
-	ww_acquire_fini(ctx);
-}
-
-Method 3 is useful if the list of objects is constructed ad-hoc and not upfront,
-e.g. when adjusting edges in a graph where each node has its own ww_mutex lock,
-and edges can only be changed when holding the locks of all involved nodes. w/w
-mutexes are a natural fit for such a case for two reasons:
-- They can handle lock-acquisition in any order which allows us to start walking
-  a graph from a starting point and then iteratively discovering new edges and
-  locking down the nodes those edges connect to.
-- Due to the -EALREADY return code signalling that a given objects is already
-  held there's no need for additional book-keeping to break cycles in the graph
-  or keep track off which looks are already held (when using more than one node
-  as a starting point).
-
-Note that this approach differs in two important ways from the above methods:
-- Since the list of objects is dynamically constructed (and might very well be
-  different when retrying due to hitting the -EDEADLK die condition) there's
-  no need to keep any object on a persistent list when it's not locked. We can
-  therefore move the list_head into the object itself.
-- On the other hand the dynamic object list construction also means that the -EALREADY return
-  code can't be propagated.
-
-Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a
-list of starting nodes (passed in from userspace) using one of the above
-methods. And then lock any additional objects affected by the operations using
-method #3 below. The backoff/retry procedure will be a bit more involved, since
-when the dynamic locking step hits -EDEADLK we also need to unlock all the
-objects acquired with the fixed list. But the w/w mutex debug checks will catch
-any interface misuse for these cases.
-
-Also, method 3 can't fail the lock acquisition step since it doesn't return
--EALREADY. Of course this would be different when using the _interruptible
-variants, but that's outside of the scope of these examples here.
-
-struct obj {
-	struct ww_mutex ww_mutex;
-	struct list_head locked_list;
-};
-
-static DEFINE_WW_CLASS(ww_class);
-
-void __unlock_objs(struct list_head *list)
-{
-	struct obj *entry, *temp;
-
-	list_for_each_entry_safe (entry, temp, list, locked_list) {
-		/* need to do that before unlocking, since only the current lock holder is
-		allowed to use object */
-		list_del(&entry->locked_list);
-		ww_mutex_unlock(entry->ww_mutex)
-	}
-}
-
-void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	struct obj *obj;
-
-	ww_acquire_init(ctx, &ww_class);
-
-retry:
-	/* re-init loop start state */
-	loop {
-		/* magic code which walks over a graph and decides which objects
-		 * to lock */
-
-		ret = ww_mutex_lock(obj->ww_mutex, ctx);
-		if (ret == -EALREADY) {
-			/* we have that one already, get to the next object */
-			continue;
-		}
-		if (ret == -EDEADLK) {
-			__unlock_objs(list);
-
-			ww_mutex_lock_slow(obj, ctx);
-			list_add(&entry->locked_list, list);
-			goto retry;
-		}
-
-		/* locked a new object, add it to the list */
-		list_add_tail(&entry->locked_list, list);
-	}
-
-	ww_acquire_done(ctx);
-	return 0;
-}
-
-void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
-	__unlock_objs(list);
-	ww_acquire_fini(ctx);
-}
-
-Method 4: Only lock one single objects. In that case deadlock detection and
-prevention is obviously overkill, since with grabbing just one lock you can't
-produce a deadlock within just one class. To simplify this case the w/w mutex
-api can be used with a NULL context.
-
-Implementation Details
-----------------------
-
-Design:
-  ww_mutex currently encapsulates a struct mutex, this means no extra overhead for
-  normal mutex locks, which are far more common. As such there is only a small
-  increase in code size if wait/wound mutexes are not used.
-
-  We maintain the following invariants for the wait list:
-  (1) Waiters with an acquire context are sorted by stamp order; waiters
-      without an acquire context are interspersed in FIFO order.
-  (2) For Wait-Die, among waiters with contexts, only the first one can have
-      other locks acquired already (ctx->acquired > 0). Note that this waiter
-      may come after other waiters without contexts in the list.
-
-  The Wound-Wait preemption is implemented with a lazy-preemption scheme:
-  The wounded status of the transaction is checked only when there is
-  contention for a new lock and hence a true chance of deadlock. In that
-  situation, if the transaction is wounded, it backs off, clears the
-  wounded status and retries. A great benefit of implementing preemption in
-  this way is that the wounded transaction can identify a contending lock to
-  wait for before restarting the transaction. Just blindly restarting the
-  transaction would likely make the transaction end up in a situation where
-  it would have to back off again.
-
-  In general, not much contention is expected. The locks are typically used to
-  serialize access to resources for devices, and optimization focus should
-  therefore be directed towards the uncontended cases.
-
-Lockdep:
-  Special care has been taken to warn for as many cases of api abuse
-  as possible. Some common api abuses will be caught with
-  CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended.
-
-  Some of the errors which will be warned about:
-   - Forgetting to call ww_acquire_fini or ww_acquire_init.
-   - Attempting to lock more mutexes after ww_acquire_done.
-   - Attempting to lock the wrong mutex after -EDEADLK and
-     unlocking all mutexes.
-   - Attempting to lock the right mutex after -EDEADLK,
-     before unlocking all mutexes.
-
-   - Calling ww_mutex_lock_slow before -EDEADLK was returned.
-
-   - Unlocking mutexes with the wrong unlock function.
-   - Calling one of the ww_acquire_* twice on the same context.
-   - Using a different ww_class for the mutex than for the ww_acquire_ctx.
-   - Normal lockdep errors that can result in deadlocks.
-
-  Some of the lockdep errors that can result in deadlocks:
-   - Calling ww_acquire_init to initialize a second ww_acquire_ctx before
-     having called ww_acquire_fini on the first.
-   - 'normal' deadlocks that can occur.
-
-FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic
-implemented.
diff --git a/Documentation/pi-futex.txt b/Documentation/pi-futex.txt
index b154f6c0c36e..c33ba2befbf8 100644
--- a/Documentation/pi-futex.txt
+++ b/Documentation/pi-futex.txt
@@ -119,4 +119,4 @@ properties of futexes, and all four combinations are possible: futex,
 robust-futex, PI-futex, robust+PI-futex.
 
 More details about priority inheritance can be found in
-Documentation/locking/rt-mutex.txt.
+Documentation/locking/rt-mutex.rst.
diff --git a/Documentation/translations/it_IT/kernel-hacking/locking.rst b/Documentation/translations/it_IT/kernel-hacking/locking.rst
index 5fd8a1abd2be..b9a6be4b8499 100644
--- a/Documentation/translations/it_IT/kernel-hacking/locking.rst
+++ b/Documentation/translations/it_IT/kernel-hacking/locking.rst
@@ -1404,7 +1404,7 @@ Riferimento per l'API dei Futex
 Approfondimenti
 ===============
 
--  ``Documentation/locking/spinlocks.txt``: la guida di Linus Torvalds agli
+-  ``Documentation/locking/spinlocks.rst``: la guida di Linus Torvalds agli
    spinlock del kernel.
 
 -  Unix Systems for Modern Architectures: Symmetric Multiprocessing and
diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c
index 81dd11901ffd..cb5671d32ada 100644
--- a/drivers/gpu/drm/drm_modeset_lock.c
+++ b/drivers/gpu/drm/drm_modeset_lock.c
@@ -36,7 +36,7 @@
  * of extra utility/tracking out of our acquire-ctx.  This is provided
  * by &struct drm_modeset_lock and &struct drm_modeset_acquire_ctx.
  *
- * For basic principles of &ww_mutex, see: Documentation/locking/ww-mutex-design.txt
+ * For basic principles of &ww_mutex, see: Documentation/locking/ww-mutex-design.rst
  *
  * The basic usage pattern is to::
  *
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 57baa27f238c..0b0d7259276d 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -5,7 +5,7 @@
  *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  *
- * see Documentation/locking/lockdep-design.txt for more details.
+ * see Documentation/locking/lockdep-design.rst for more details.
  */
 #ifndef __LINUX_LOCKDEP_H
 #define __LINUX_LOCKDEP_H
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3093dd162424..dcd03fee6e01 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -151,7 +151,7 @@ static inline bool mutex_is_locked(struct mutex *lock)
 
 /*
  * See kernel/locking/mutex.c for detailed documentation of these APIs.
- * Also see Documentation/locking/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.rst.
  */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index e401358c4e7e..9d9c663987d8 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -160,7 +160,7 @@ extern void downgrade_write(struct rw_semaphore *sem);
  * static then another method for expressing nested locking is
  * the explicit definition of lock class keys and the use of
  * lockdep_set_class() at lock initialization time.
- * See Documentation/locking/lockdep-design.txt for more details.)
+ * See Documentation/locking/lockdep-design.rst for more details.)
  */
 extern void down_read_nested(struct rw_semaphore *sem, int subclass);
 extern void down_write_nested(struct rw_semaphore *sem, int subclass);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 0c601ae072b3..edd1c082dbf5 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -16,7 +16,7 @@
  *    by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
  *    and Sven Dietrich.
  *
- * Also see Documentation/locking/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.rst.
  */
 #include <linux/mutex.h>
 #include <linux/ww_mutex.h>
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 38fbf9fa7f1b..fa83d36e30c6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -9,7 +9,7 @@
  *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
  *  Copyright (C) 2006 Esben Nielsen
  *
- *  See Documentation/locking/rt-mutex-design.txt for details.
+ *  See Documentation/locking/rt-mutex-design.rst for details.
  */
 #include <linux/spinlock.h>
 #include <linux/export.h>
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4ac4ca21a30a..a858b55e8ac7 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1139,7 +1139,7 @@ config PROVE_LOCKING
 	 the proof of observed correctness is also maintained for an
 	 arbitrary combination of these separate locking variants.
 
-	 For more details, see Documentation/locking/lockdep-design.txt.
+	 For more details, see Documentation/locking/lockdep-design.rst.
 
 config LOCK_STAT
 	bool "Lock usage statistics"
@@ -1153,7 +1153,7 @@ config LOCK_STAT
 	help
 	 This feature enables tracking lock contention points
 
-	 For more details, see Documentation/locking/lockstat.txt
+	 For more details, see Documentation/locking/lockstat.rst
 
 	 This also enables lock events required by "perf lock",
 	 subcommand of perf.
-- 
cgit v1.2.3


From 720594f691e5c8fb0624f3653b20b24ba8e57742 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Sat, 13 Apr 2019 22:54:53 -0300
Subject: docs: connector: convert to ReST and rename to connector.rst

As it has some function definitions, move them to connector.h.

The remaining conversion is actually:
  - add blank lines and identation in order to identify paragraphs;
  - fix tables markups;
  - add some lists markups;
  - mark literal blocks;
  - adjust title markups.

At its new index.rst, let's add a :orphan: while this is not linked to
the main index.rst file, in order to avoid build warnings.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/connector/connector.rst | 156 +++++++++++++++++++++++++++
 Documentation/connector/connector.txt | 196 ----------------------------------
 drivers/w1/Kconfig                    |   2 +-
 include/linux/connector.h             |  63 ++++++++++-
 samples/Kconfig                       |   2 +-
 5 files changed, 220 insertions(+), 199 deletions(-)
 create mode 100644 Documentation/connector/connector.rst
 delete mode 100644 Documentation/connector/connector.txt

(limited to 'include')

diff --git a/Documentation/connector/connector.rst b/Documentation/connector/connector.rst
new file mode 100644
index 000000000000..24e26dc22dbf
--- /dev/null
+++ b/Documentation/connector/connector.rst
@@ -0,0 +1,156 @@
+:orphan:
+
+================
+Kernel Connector
+================
+
+Kernel connector - new netlink based userspace <-> kernel space easy
+to use communication module.
+
+The Connector driver makes it easy to connect various agents using a
+netlink based network.  One must register a callback and an identifier.
+When the driver receives a special netlink message with the appropriate
+identifier, the appropriate callback will be called.
+
+From the userspace point of view it's quite straightforward:
+
+	- socket();
+	- bind();
+	- send();
+	- recv();
+
+But if kernelspace wants to use the full power of such connections, the
+driver writer must create special sockets, must know about struct sk_buff
+handling, etc...  The Connector driver allows any kernelspace agents to use
+netlink based networking for inter-process communication in a significantly
+easier way::
+
+  int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
+  void cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask);
+  void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask);
+
+  struct cb_id
+  {
+	__u32			idx;
+	__u32			val;
+  };
+
+idx and val are unique identifiers which must be registered in the
+connector.h header for in-kernel usage.  `void (*callback) (void *)` is a
+callback function which will be called when a message with above idx.val
+is received by the connector core.  The argument for that function must
+be dereferenced to `struct cn_msg *`::
+
+  struct cn_msg
+  {
+	struct cb_id		id;
+
+	__u32			seq;
+	__u32			ack;
+
+	__u32			len;	/* Length of the following data */
+	__u8			data[0];
+  };
+
+Connector interfaces
+====================
+
+ .. kernel-doc:: include/linux/connector.h
+
+ Note:
+   When registering new callback user, connector core assigns
+   netlink group to the user which is equal to its id.idx.
+
+Protocol description
+====================
+
+The current framework offers a transport layer with fixed headers.  The
+recommended protocol which uses such a header is as following:
+
+msg->seq and msg->ack are used to determine message genealogy.  When
+someone sends a message, they use a locally unique sequence and random
+acknowledge number.  The sequence number may be copied into
+nlmsghdr->nlmsg_seq too.
+
+The sequence number is incremented with each message sent.
+
+If you expect a reply to the message, then the sequence number in the
+received message MUST be the same as in the original message, and the
+acknowledge number MUST be the same + 1.
+
+If we receive a message and its sequence number is not equal to one we
+are expecting, then it is a new message.  If we receive a message and
+its sequence number is the same as one we are expecting, but its
+acknowledge is not equal to the sequence number in the original
+message + 1, then it is a new message.
+
+Obviously, the protocol header contains the above id.
+
+The connector allows event notification in the following form: kernel
+driver or userspace process can ask connector to notify it when
+selected ids will be turned on or off (registered or unregistered its
+callback).  It is done by sending a special command to the connector
+driver (it also registers itself with id={-1, -1}).
+
+As example of this usage can be found in the cn_test.c module which
+uses the connector to request notification and to send messages.
+
+Reliability
+===========
+
+Netlink itself is not a reliable protocol.  That means that messages can
+be lost due to memory pressure or process' receiving queue overflowed,
+so caller is warned that it must be prepared.  That is why the struct
+cn_msg [main connector's message header] contains u32 seq and u32 ack
+fields.
+
+Userspace usage
+===============
+
+2.6.14 has a new netlink socket implementation, which by default does not
+allow people to send data to netlink groups other than 1.
+So, if you wish to use a netlink socket (for example using connector)
+with a different group number, the userspace application must subscribe to
+that group first.  It can be achieved by the following pseudocode::
+
+  s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+
+  l_local.nl_family = AF_NETLINK;
+  l_local.nl_groups = 12345;
+  l_local.nl_pid = 0;
+
+  if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
+	perror("bind");
+	close(s);
+	return -1;
+  }
+
+  {
+	int on = l_local.nl_groups;
+	setsockopt(s, 270, 1, &on, sizeof(on));
+  }
+
+Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket
+option.  To drop a multicast subscription, one should call the above socket
+option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0.
+
+2.6.14 netlink code only allows to select a group which is less or equal to
+the maximum group number, which is used at netlink_kernel_create() time.
+In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use
+group number 12345, you must increment CN_NETLINK_USERS to that number.
+Additional 0xf numbers are allocated to be used by non-in-kernel users.
+
+Due to this limitation, group 0xffffffff does not work now, so one can
+not use add/remove connector's group notifications, but as far as I know,
+only cn_test.c test module used it.
+
+Some work in netlink area is still being done, so things can be changed in
+2.6.15 timeframe, if it will happen, documentation will be updated for that
+kernel.
+
+Code samples
+============
+
+Sample code for a connector test module and user space can be found
+in samples/connector/. To build this code, enable CONFIG_CONNECTOR
+and CONFIG_SAMPLES.
diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt
deleted file mode 100644
index ab7ca897fab7..000000000000
--- a/Documentation/connector/connector.txt
+++ /dev/null
@@ -1,196 +0,0 @@
-/*****************************************/
-Kernel Connector.
-/*****************************************/
-
-Kernel connector - new netlink based userspace <-> kernel space easy
-to use communication module.
-
-The Connector driver makes it easy to connect various agents using a
-netlink based network.  One must register a callback and an identifier.
-When the driver receives a special netlink message with the appropriate
-identifier, the appropriate callback will be called.
-
-From the userspace point of view it's quite straightforward:
-
-	socket();
-	bind();
-	send();
-	recv();
-
-But if kernelspace wants to use the full power of such connections, the
-driver writer must create special sockets, must know about struct sk_buff
-handling, etc...  The Connector driver allows any kernelspace agents to use
-netlink based networking for inter-process communication in a significantly
-easier way:
-
-int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
-void cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask);
-void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask);
-
-struct cb_id
-{
-	__u32			idx;
-	__u32			val;
-};
-
-idx and val are unique identifiers which must be registered in the
-connector.h header for in-kernel usage.  void (*callback) (void *) is a
-callback function which will be called when a message with above idx.val
-is received by the connector core.  The argument for that function must
-be dereferenced to struct cn_msg *.
-
-struct cn_msg
-{
-	struct cb_id		id;
-
-	__u32			seq;
-	__u32			ack;
-
-	__u32			len;		/* Length of the following data */
-	__u8			data[0];
-};
-
-/*****************************************/
-Connector interfaces.
-/*****************************************/
-
-int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
-
- Registers new callback with connector core.
-
- struct cb_id *id		- unique connector's user identifier.
-				  It must be registered in connector.h for legal in-kernel users.
- char *name			- connector's callback symbolic name.
- void (*callback) (struct cn..)	- connector's callback.
-				  cn_msg and the sender's credentials
-
-
-void cn_del_callback(struct cb_id *id);
-
- Unregisters new callback with connector core.
-
- struct cb_id *id		- unique connector's user identifier.
-
-
-int cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __groups, int gfp_mask);
-int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __groups, int gfp_mask);
-
- Sends message to the specified groups.  It can be safely called from
- softirq context, but may silently fail under strong memory pressure.
- If there are no listeners for given group -ESRCH can be returned.
-
- struct cn_msg *		- message header(with attached data).
- u16 len			- for *_multi multiple cn_msg messages can be sent
- u32 port			- destination port.
- 				  If non-zero the message will be sent to the
-				  given port, which should be set to the
-				  original sender.
- u32 __group			- destination group.
-				  If port and __group is zero, then appropriate group will
-				  be searched through all registered connector users,
-				  and message will be delivered to the group which was
-				  created for user with the same ID as in msg.
-				  If __group is not zero, then message will be delivered
-				  to the specified group.
- int gfp_mask			- GFP mask.
-
- Note: When registering new callback user, connector core assigns
- netlink group to the user which is equal to its id.idx.
-
-/*****************************************/
-Protocol description.
-/*****************************************/
-
-The current framework offers a transport layer with fixed headers.  The
-recommended protocol which uses such a header is as following:
-
-msg->seq and msg->ack are used to determine message genealogy.  When
-someone sends a message, they use a locally unique sequence and random
-acknowledge number.  The sequence number may be copied into
-nlmsghdr->nlmsg_seq too.
-
-The sequence number is incremented with each message sent.
-
-If you expect a reply to the message, then the sequence number in the
-received message MUST be the same as in the original message, and the
-acknowledge number MUST be the same + 1.
-
-If we receive a message and its sequence number is not equal to one we
-are expecting, then it is a new message.  If we receive a message and
-its sequence number is the same as one we are expecting, but its
-acknowledge is not equal to the sequence number in the original
-message + 1, then it is a new message.
-
-Obviously, the protocol header contains the above id.
-
-The connector allows event notification in the following form: kernel
-driver or userspace process can ask connector to notify it when
-selected ids will be turned on or off (registered or unregistered its
-callback).  It is done by sending a special command to the connector
-driver (it also registers itself with id={-1, -1}).
-
-As example of this usage can be found in the cn_test.c module which
-uses the connector to request notification and to send messages.
-
-/*****************************************/
-Reliability.
-/*****************************************/
-
-Netlink itself is not a reliable protocol.  That means that messages can
-be lost due to memory pressure or process' receiving queue overflowed,
-so caller is warned that it must be prepared.  That is why the struct
-cn_msg [main connector's message header] contains u32 seq and u32 ack
-fields.
-
-/*****************************************/
-Userspace usage.
-/*****************************************/
-
-2.6.14 has a new netlink socket implementation, which by default does not
-allow people to send data to netlink groups other than 1.
-So, if you wish to use a netlink socket (for example using connector)
-with a different group number, the userspace application must subscribe to
-that group first.  It can be achieved by the following pseudocode:
-
-s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
-
-l_local.nl_family = AF_NETLINK;
-l_local.nl_groups = 12345;
-l_local.nl_pid = 0;
-
-if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
-	perror("bind");
-	close(s);
-	return -1;
-}
-
-{
-	int on = l_local.nl_groups;
-	setsockopt(s, 270, 1, &on, sizeof(on));
-}
-
-Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket
-option.  To drop a multicast subscription, one should call the above socket
-option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0.
-
-2.6.14 netlink code only allows to select a group which is less or equal to
-the maximum group number, which is used at netlink_kernel_create() time.
-In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use
-group number 12345, you must increment CN_NETLINK_USERS to that number.
-Additional 0xf numbers are allocated to be used by non-in-kernel users.
-
-Due to this limitation, group 0xffffffff does not work now, so one can
-not use add/remove connector's group notifications, but as far as I know, 
-only cn_test.c test module used it.
-
-Some work in netlink area is still being done, so things can be changed in
-2.6.15 timeframe, if it will happen, documentation will be updated for that
-kernel.
-
-/*****************************************/
-Code samples
-/*****************************************/
-
-Sample code for a connector test module and user space can be found
-in samples/connector/. To build this code, enable CONFIG_CONNECTOR
-and CONFIG_SAMPLES.
diff --git a/drivers/w1/Kconfig b/drivers/w1/Kconfig
index 03dd57581df7..160053c0baea 100644
--- a/drivers/w1/Kconfig
+++ b/drivers/w1/Kconfig
@@ -19,7 +19,7 @@ config W1_CON
 	default y
 	---help---
 	  This allows to communicate with userspace using connector. For more
-	  information see <file:Documentation/connector/connector.txt>.
+	  information see <file:Documentation/connector/connector.rst>.
 	  There are three types of messages between w1 core and userspace:
 	  1. Events. They are generated each time new master or slave device found
 		either due to automatic or requested search.
diff --git a/include/linux/connector.h b/include/linux/connector.h
index 1d72ef76f24f..6b6c7396a584 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -55,10 +55,71 @@ struct cn_dev {
 	struct cn_queue_dev *cbdev;
 };
 
+/**
+ * cn_add_callback() - Registers new callback with connector core.
+ *
+ * @id:		unique connector's user identifier.
+ *		It must be registered in connector.h for legal
+ *		in-kernel users.
+ * @name:	connector's callback symbolic name.
+ * @callback:	connector's callback.
+ * 		parameters are %cn_msg and the sender's credentials
+ */
 int cn_add_callback(struct cb_id *id, const char *name,
 		    void (*callback)(struct cn_msg *, struct netlink_skb_parms *));
-void cn_del_callback(struct cb_id *);
+/**
+ * cn_del_callback() - Unregisters new callback with connector core.
+ *
+ * @id:		unique connector's user identifier.
+ */
+void cn_del_callback(struct cb_id *id);
+
+
+/**
+ * cn_netlink_send_mult - Sends message to the specified groups.
+ *
+ * @msg: 	message header(with attached data).
+ * @len:	Number of @msg to be sent.
+ * @portid:	destination port.
+ *		If non-zero the message will be sent to the given port,
+ *		which should be set to the original sender.
+ * @group:	destination group.
+ * 		If @portid and @group is zero, then appropriate group will
+ *		be searched through all registered connector users, and
+ *		message will be delivered to the group which was created
+ *		for user with the same ID as in @msg.
+ *		If @group is not zero, then message will be delivered
+ *		to the specified group.
+ * @gfp_mask:	GFP mask.
+ *
+ * It can be safely called from softirq context, but may silently
+ * fail under strong memory pressure.
+ *
+ * If there are no listeners for given group %-ESRCH can be returned.
+ */
 int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 group, gfp_t gfp_mask);
+
+/**
+ * cn_netlink_send_mult - Sends message to the specified groups.
+ *
+ * @msg:	message header(with attached data).
+ * @portid:	destination port.
+ *		If non-zero the message will be sent to the given port,
+ *		which should be set to the original sender.
+ * @group:	destination group.
+ * 		If @portid and @group is zero, then appropriate group will
+ *		be searched through all registered connector users, and
+ *		message will be delivered to the group which was created
+ *		for user with the same ID as in @msg.
+ *		If @group is not zero, then message will be delivered
+ *		to the specified group.
+ * @gfp_mask:	GFP mask.
+ *
+ * It can be safely called from softirq context, but may silently
+ * fail under strong memory pressure.
+ *
+ * If there are no listeners for given group %-ESRCH can be returned.
+ */
 int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 group, gfp_t gfp_mask);
 
 int cn_queue_add_callback(struct cn_queue_dev *dev, const char *name,
diff --git a/samples/Kconfig b/samples/Kconfig
index 71b5e833dd9e..155da47dc6a4 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -99,7 +99,7 @@ config SAMPLE_CONNECTOR
 	  When enabled, this builds both a sample kernel module for
 	  the connector interface and a user space tool to communicate
 	  with it.
-	  See also Documentation/connector/connector.txt
+	  See also Documentation/connector/connector.rst
 
 config SAMPLE_HIDRAW
 	bool "hidraw sample"
-- 
cgit v1.2.3


From 08536105d93fe371743709b85350db141bafc51f Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Thu, 18 Apr 2019 11:21:26 -0300
Subject: docs: ioctl-number.txt: convert it to ReST format

The conversion itself is simple: add a markup for the
title of this file and add markups for both tables.

Yet, the big table here with IOCTL numbers is badly formatted:
on several lines, the "Include File" column has some values that
are bigger than the reserved space there.

Also, on several places, a comment was misplaced at the "Include
File" space.

So, most of the work here is to actually ensure that each field
will be properly fixed.

Also worth to mention that some URLs have the asterisk character
on it. Well, Sphinx has an issue with asterisks in the middle
of an string. As this is URL, use the alternate format: %2A.

As a side effect of this patch, it is now a lot easier to see that
some reserved ioctl numbers are missing the include files
where it is supposed to be used.

PS.: While this is part of a subdir, I opted to convert this
single file alone, as this file has a potential of conflicts,
as most subsystem maintainers touch it.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/ioctl/ioctl-number.rst               | 363 +++++++++++++++++++++
 Documentation/ioctl/ioctl-number.txt               | 351 --------------------
 Documentation/process/submit-checklist.rst         |   2 +-
 .../it_IT/process/submit-checklist.rst             |   2 +-
 .../zh_CN/process/submit-checklist.rst             |   2 +-
 include/uapi/rdma/rdma_user_ioctl_cmds.h           |   2 +-
 6 files changed, 367 insertions(+), 355 deletions(-)
 create mode 100644 Documentation/ioctl/ioctl-number.rst
 delete mode 100644 Documentation/ioctl/ioctl-number.txt

(limited to 'include')

diff --git a/Documentation/ioctl/ioctl-number.rst b/Documentation/ioctl/ioctl-number.rst
new file mode 100644
index 000000000000..fcf9623a599f
--- /dev/null
+++ b/Documentation/ioctl/ioctl-number.rst
@@ -0,0 +1,363 @@
+:orphan:
+
+=============
+Ioctl Numbers
+=============
+
+19 October 1999
+
+Michael Elizabeth Chastain
+<mec@shout.net>
+
+If you are adding new ioctl's to the kernel, you should use the _IO
+macros defined in <linux/ioctl.h>:
+
+    ====== == ============================================
+    _IO    an ioctl with no parameters
+    _IOW   an ioctl with write parameters (copy_from_user)
+    _IOR   an ioctl with read parameters  (copy_to_user)
+    _IOWR  an ioctl with both write and read parameters.
+    ====== == ============================================
+
+'Write' and 'read' are from the user's point of view, just like the
+system calls 'write' and 'read'.  For example, a SET_FOO ioctl would
+be _IOW, although the kernel would actually read data from user space;
+a GET_FOO ioctl would be _IOR, although the kernel would actually write
+data to user space.
+
+The first argument to _IO, _IOW, _IOR, or _IOWR is an identifying letter
+or number from the table below.  Because of the large number of drivers,
+many drivers share a partial letter with other drivers.
+
+If you are writing a driver for a new device and need a letter, pick an
+unused block with enough room for expansion: 32 to 256 ioctl commands.
+You can register the block by patching this file and submitting the
+patch to Linus Torvalds.  Or you can e-mail me at <mec@shout.net> and
+I'll register one for you.
+
+The second argument to _IO, _IOW, _IOR, or _IOWR is a sequence number
+to distinguish ioctls from each other.  The third argument to _IOW,
+_IOR, or _IOWR is the type of the data going into the kernel or coming
+out of the kernel (e.g.  'int' or 'struct foo').  NOTE!  Do NOT use
+sizeof(arg) as the third argument as this results in your ioctl thinking
+it passes an argument of type size_t.
+
+Some devices use their major number as the identifier; this is OK, as
+long as it is unique.  Some devices are irregular and don't follow any
+convention at all.
+
+Following this convention is good because:
+
+(1) Keeping the ioctl's globally unique helps error checking:
+    if a program calls an ioctl on the wrong device, it will get an
+    error rather than some unexpected behaviour.
+
+(2) The 'strace' build procedure automatically finds ioctl numbers
+    defined with _IO, _IOW, _IOR, or _IOWR.
+
+(3) 'strace' can decode numbers back into useful names when the
+    numbers are unique.
+
+(4) People looking for ioctls can grep for them more easily when
+    this convention is used to define the ioctl numbers.
+
+(5) When following the convention, the driver code can use generic
+    code to copy the parameters between user and kernel space.
+
+This table lists ioctls visible from user land for Linux/x86.  It contains
+most drivers up to 2.6.31, but I know I am missing some.  There has been
+no attempt to list non-X86 architectures or ioctls from drivers/staging/.
+
+====  =====  ======================================================= ================================================================
+Code  Seq#    Include File                                           Comments
+      (hex)
+====  =====  ======================================================= ================================================================
+0x00  00-1F  linux/fs.h                                              conflict!
+0x00  00-1F  scsi/scsi_ioctl.h                                       conflict!
+0x00  00-1F  linux/fb.h                                              conflict!
+0x00  00-1F  linux/wavefront.h                                       conflict!
+0x02  all    linux/fd.h
+0x03  all    linux/hdreg.h
+0x04  D2-DC  linux/umsdos_fs.h                                       Dead since 2.6.11, but don't reuse these.
+0x06  all    linux/lp.h
+0x09  all    linux/raid/md_u.h
+0x10  00-0F  drivers/char/s390/vmcp.h
+0x10  10-1F  arch/s390/include/uapi/sclp_ctl.h
+0x10  20-2F  arch/s390/include/uapi/asm/hypfs.h
+0x12  all    linux/fs.h
+             linux/blkpg.h
+0x1b  all                                                            InfiniBand Subsystem
+                                                                     <http://infiniband.sourceforge.net/>
+0x20  all    drivers/cdrom/cm206.h
+0x22  all    scsi/sg.h
+'!'   00-1F  uapi/linux/seccomp.h
+'#'   00-3F                                                          IEEE 1394 Subsystem
+                                                                     Block for the entire subsystem
+'$'   00-0F  linux/perf_counter.h, linux/perf_event.h
+'%'   00-0F  include/uapi/linux/stm.h                                System Trace Module subsystem
+                                                                     <mailto:alexander.shishkin@linux.intel.com>
+'&'   00-07  drivers/firewire/nosy-user.h
+'1'   00-1F  linux/timepps.h                                         PPS kit from Ulrich Windl
+                                                                     <ftp://ftp.de.kernel.org/pub/linux/daemons/ntp/PPS/>
+'2'   01-04  linux/i2o.h
+'3'   00-0F  drivers/s390/char/raw3270.h                             conflict!
+'3'   00-1F  linux/suspend_ioctls.h,                                 conflict!
+             kernel/power/user.c
+'8'   all                                                            SNP8023 advanced NIC card
+                                                                     <mailto:mcr@solidum.com>
+';'   64-7F  linux/vfio.h
+'@'   00-0F  linux/radeonfb.h                                        conflict!
+'@'   00-0F  drivers/video/aty/aty128fb.c                            conflict!
+'A'   00-1F  linux/apm_bios.h                                        conflict!
+'A'   00-0F  linux/agpgart.h,                                        conflict!
+             drivers/char/agp/compat_ioctl.h
+'A'   00-7F  sound/asound.h                                          conflict!
+'B'   00-1F  linux/cciss_ioctl.h                                     conflict!
+'B'   00-0F  include/linux/pmu.h                                     conflict!
+'B'   C0-FF  advanced bbus                                           <mailto:maassen@uni-freiburg.de>
+'C'   all    linux/soundcard.h                                       conflict!
+'C'   01-2F  linux/capi.h                                            conflict!
+'C'   F0-FF  drivers/net/wan/cosa.h                                  conflict!
+'D'   all    arch/s390/include/asm/dasd.h
+'D'   40-5F  drivers/scsi/dpt/dtpi_ioctl.h
+'D'   05     drivers/scsi/pmcraid.h
+'E'   all    linux/input.h                                           conflict!
+'E'   00-0F  xen/evtchn.h                                            conflict!
+'F'   all    linux/fb.h                                              conflict!
+'F'   01-02  drivers/scsi/pmcraid.h                                  conflict!
+'F'   20     drivers/video/fsl-diu-fb.h                              conflict!
+'F'   20     drivers/video/intelfb/intelfb.h                         conflict!
+'F'   20     linux/ivtvfb.h                                          conflict!
+'F'   20     linux/matroxfb.h                                        conflict!
+'F'   20     drivers/video/aty/atyfb_base.c                          conflict!
+'F'   00-0F  video/da8xx-fb.h                                        conflict!
+'F'   80-8F  linux/arcfb.h                                           conflict!
+'F'   DD     video/sstfb.h                                           conflict!
+'G'   00-3F  drivers/misc/sgi-gru/grulib.h                           conflict!
+'G'   00-0F  linux/gigaset_dev.h                                     conflict!
+'H'   00-7F  linux/hiddev.h                                          conflict!
+'H'   00-0F  linux/hidraw.h                                          conflict!
+'H'   01     linux/mei.h                                             conflict!
+'H'   02     linux/mei.h                                             conflict!
+'H'   03     linux/mei.h                                             conflict!
+'H'   00-0F  sound/asound.h                                          conflict!
+'H'   20-40  sound/asound_fm.h                                       conflict!
+'H'   80-8F  sound/sfnt_info.h                                       conflict!
+'H'   10-8F  sound/emu10k1.h                                         conflict!
+'H'   10-1F  sound/sb16_csp.h                                        conflict!
+'H'   10-1F  sound/hda_hwdep.h                                       conflict!
+'H'   40-4F  sound/hdspm.h                                           conflict!
+'H'   40-4F  sound/hdsp.h                                            conflict!
+'H'   90     sound/usb/usx2y/usb_stream.h
+'H'   A0     uapi/linux/usb/cdc-wdm.h
+'H'   C0-F0  net/bluetooth/hci.h                                     conflict!
+'H'   C0-DF  net/bluetooth/hidp/hidp.h                               conflict!
+'H'   C0-DF  net/bluetooth/cmtp/cmtp.h                               conflict!
+'H'   C0-DF  net/bluetooth/bnep/bnep.h                               conflict!
+'H'   F1     linux/hid-roccat.h                                      <mailto:erazor_de@users.sourceforge.net>
+'H'   F8-FA  sound/firewire.h
+'I'   all    linux/isdn.h                                            conflict!
+'I'   00-0F  drivers/isdn/divert/isdn_divert.h                       conflict!
+'I'   40-4F  linux/mISDNif.h                                         conflict!
+'J'   00-1F  drivers/scsi/gdth_ioctl.h
+'K'   all    linux/kd.h
+'L'   00-1F  linux/loop.h                                            conflict!
+'L'   10-1F  drivers/scsi/mpt3sas/mpt3sas_ctl.h                      conflict!
+'L'   20-2F  linux/lightnvm.h
+'L'   E0-FF  linux/ppdd.h                                            encrypted disk device driver
+                                                                     <http://linux01.gwdg.de/~alatham/ppdd.html>
+'M'   all    linux/soundcard.h                                       conflict!
+'M'   01-16  mtd/mtd-abi.h                                           conflict!
+      and    drivers/mtd/mtdchar.c
+'M'   01-03  drivers/scsi/megaraid/megaraid_sas.h
+'M'   00-0F  drivers/video/fsl-diu-fb.h                              conflict!
+'N'   00-1F  drivers/usb/scanner.h
+'N'   40-7F  drivers/block/nvme.c
+'O'   00-06  mtd/ubi-user.h                                          UBI
+'P'   all    linux/soundcard.h                                       conflict!
+'P'   60-6F  sound/sscape_ioctl.h                                    conflict!
+'P'   00-0F  drivers/usb/class/usblp.c                               conflict!
+'P'   01-09  drivers/misc/pci_endpoint_test.c                        conflict!
+'Q'   all    linux/soundcard.h
+'R'   00-1F  linux/random.h                                          conflict!
+'R'   01     linux/rfkill.h                                          conflict!
+'R'   C0-DF  net/bluetooth/rfcomm.h
+'S'   all    linux/cdrom.h                                           conflict!
+'S'   80-81  scsi/scsi_ioctl.h                                       conflict!
+'S'   82-FF  scsi/scsi.h                                             conflict!
+'S'   00-7F  sound/asequencer.h                                      conflict!
+'T'   all    linux/soundcard.h                                       conflict!
+'T'   00-AF  sound/asound.h                                          conflict!
+'T'   all    arch/x86/include/asm/ioctls.h                           conflict!
+'T'   C0-DF  linux/if_tun.h                                          conflict!
+'U'   all    sound/asound.h                                          conflict!
+'U'   00-CF  linux/uinput.h                                          conflict!
+'U'   00-EF  linux/usbdevice_fs.h
+'U'   C0-CF  drivers/bluetooth/hci_uart.h
+'V'   all    linux/vt.h                                              conflict!
+'V'   all    linux/videodev2.h                                       conflict!
+'V'   C0     linux/ivtvfb.h                                          conflict!
+'V'   C0     linux/ivtv.h                                            conflict!
+'V'   C0     media/davinci/vpfe_capture.h                            conflict!
+'V'   C0     media/si4713.h                                          conflict!
+'W'   00-1F  linux/watchdog.h                                        conflict!
+'W'   00-1F  linux/wanrouter.h                                       conflict! (pre 3.9)
+'W'   00-3F  sound/asound.h                                          conflict!
+'W'   40-5F  drivers/pci/switch/switchtec.c
+'X'   all    fs/xfs/xfs_fs.h,                                        conflict!
+             fs/xfs/linux-2.6/xfs_ioctl32.h,
+             include/linux/falloc.h,
+             linux/fs.h,
+'X'   all    fs/ocfs2/ocfs_fs.h                                      conflict!
+'X'   01     linux/pktcdvd.h                                         conflict!
+'Y'   all    linux/cyclades.h
+'Z'   14-15  drivers/message/fusion/mptctl.h
+'['   00-3F  linux/usb/tmc.h                                         USB Test and Measurement Devices
+                                                                     <mailto:gregkh@linuxfoundation.org>
+'a'   all    linux/atm*.h, linux/sonet.h                             ATM on linux
+                                                                     <http://lrcwww.epfl.ch/>
+'a'   00-0F  drivers/crypto/qat/qat_common/adf_cfg_common.h          conflict! qat driver
+'b'   00-FF                                                          conflict! bit3 vme host bridge
+                                                                     <mailto:natalia@nikhefk.nikhef.nl>
+'c'   all    linux/cm4000_cs.h                                       conflict!
+'c'   00-7F  linux/comstats.h                                        conflict!
+'c'   00-7F  linux/coda.h                                            conflict!
+'c'   00-1F  linux/chio.h                                            conflict!
+'c'   80-9F  arch/s390/include/asm/chsc.h                            conflict!
+'c'   A0-AF  arch/x86/include/asm/msr.h conflict!
+'d'   00-FF  linux/char/drm/drm.h                                    conflict!
+'d'   02-40  pcmcia/ds.h                                             conflict!
+'d'   F0-FF  linux/digi1.h
+'e'   all    linux/digi1.h                                           conflict!
+'f'   00-1F  linux/ext2_fs.h                                         conflict!
+'f'   00-1F  linux/ext3_fs.h                                         conflict!
+'f'   00-0F  fs/jfs/jfs_dinode.h                                     conflict!
+'f'   00-0F  fs/ext4/ext4.h                                          conflict!
+'f'   00-0F  linux/fs.h                                              conflict!
+'f'   00-0F  fs/ocfs2/ocfs2_fs.h                                     conflict!
+'g'   00-0F  linux/usb/gadgetfs.h
+'g'   20-2F  linux/usb/g_printer.h
+'h'   00-7F                                                          conflict! Charon filesystem
+                                                                     <mailto:zapman@interlan.net>
+'h'   00-1F  linux/hpet.h                                            conflict!
+'h'   80-8F  fs/hfsplus/ioctl.c
+'i'   00-3F  linux/i2o-dev.h                                         conflict!
+'i'   0B-1F  linux/ipmi.h                                            conflict!
+'i'   80-8F  linux/i8k.h
+'j'   00-3F  linux/joystick.h
+'k'   00-0F  linux/spi/spidev.h                                      conflict!
+'k'   00-05  video/kyro.h                                            conflict!
+'k'   10-17  linux/hsi/hsi_char.h                                    HSI character device
+'l'   00-3F  linux/tcfs_fs.h                                         transparent cryptographic file system
+                                                                     <http://web.archive.org/web/%2A/http://mikonos.dia.unisa.it/tcfs>
+'l'   40-7F  linux/udf_fs_i.h                                        in development:
+                                                                     <http://sourceforge.net/projects/linux-udf/>
+'m'   00-09  linux/mmtimer.h                                         conflict!
+'m'   all    linux/mtio.h                                            conflict!
+'m'   all    linux/soundcard.h                                       conflict!
+'m'   all    linux/synclink.h                                        conflict!
+'m'   00-19  drivers/message/fusion/mptctl.h                         conflict!
+'m'   00     drivers/scsi/megaraid/megaraid_ioctl.h                  conflict!
+'n'   00-7F  linux/ncp_fs.h and fs/ncpfs/ioctl.c
+'n'   80-8F  uapi/linux/nilfs2_api.h                                 NILFS2
+'n'   E0-FF  linux/matroxfb.h                                        matroxfb
+'o'   00-1F  fs/ocfs2/ocfs2_fs.h                                     OCFS2
+'o'   00-03  mtd/ubi-user.h                                          conflict! (OCFS2 and UBI overlaps)
+'o'   40-41  mtd/ubi-user.h                                          UBI
+'o'   01-A1  `linux/dvb/*.h`                                         DVB
+'p'   00-0F  linux/phantom.h                                         conflict! (OpenHaptics needs this)
+'p'   00-1F  linux/rtc.h                                             conflict!
+'p'   00-3F  linux/mc146818rtc.h                                     conflict!
+'p'   40-7F  linux/nvram.h
+'p'   80-9F  linux/ppdev.h                                           user-space parport
+                                                                     <mailto:tim@cyberelk.net>
+'p'   A1-A5  linux/pps.h                                             LinuxPPS
+                                                                     <mailto:giometti@linux.it>
+'q'   00-1F  linux/serio.h
+'q'   80-FF  linux/telephony.h                                       Internet PhoneJACK, Internet LineJACK
+             linux/ixjuser.h                                         <http://web.archive.org/web/%2A/http://www.quicknet.net>
+'r'   00-1F  linux/msdos_fs.h and fs/fat/dir.c
+'s'   all    linux/cdk.h
+'t'   00-7F  linux/ppp-ioctl.h
+'t'   80-8F  linux/isdn_ppp.h
+'t'   90-91  linux/toshiba.h                                         toshiba and toshiba_acpi SMM
+'u'   00-1F  linux/smb_fs.h                                          gone
+'u'   20-3F  linux/uvcvideo.h                                        USB video class host driver
+'u'   40-4f  linux/udmabuf.h                                         userspace dma-buf misc device
+'v'   00-1F  linux/ext2_fs.h                                         conflict!
+'v'   00-1F  linux/fs.h                                              conflict!
+'v'   00-0F  linux/sonypi.h                                          conflict!
+'v'   00-0F  media/v4l2-subdev.h                                     conflict!
+'v'   C0-FF  linux/meye.h                                            conflict!
+'w'   all                                                            CERN SCI driver
+'y'   00-1F                                                          packet based user level communications
+                                                                     <mailto:zapman@interlan.net>
+'z'   00-3F                                                          CAN bus card conflict!
+                                                                     <mailto:hdstich@connectu.ulm.circular.de>
+'z'   40-7F                                                          CAN bus card conflict!
+                                                                     <mailto:oe@port.de>
+'z'   10-4F  drivers/s390/crypto/zcrypt_api.h                        conflict!
+'|'   00-7F  linux/media.h
+0x80  00-1F  linux/fb.h
+0x89  00-06  arch/x86/include/asm/sockios.h
+0x89  0B-DF  linux/sockios.h
+0x89  E0-EF  linux/sockios.h                                         SIOCPROTOPRIVATE range
+0x89  E0-EF  linux/dn.h                                              PROTOPRIVATE range
+0x89  F0-FF  linux/sockios.h                                         SIOCDEVPRIVATE range
+0x8B  all    linux/wireless.h
+0x8C  00-3F                                                          WiNRADiO driver
+                                                                     <http://www.winradio.com.au/>
+0x90  00     drivers/cdrom/sbpcd.h
+0x92  00-0F  drivers/usb/mon/mon_bin.c
+0x93  60-7F  linux/auto_fs.h
+0x94  all    fs/btrfs/ioctl.h                                        Btrfs filesystem
+             and linux/fs.h                                          some lifted to vfs/generic
+0x97  00-7F  fs/ceph/ioctl.h                                         Ceph file system
+0x99  00-0F                                                          537-Addinboard driver
+                                                                     <mailto:buk@buks.ipn.de>
+0xA0  all    linux/sdp/sdp.h                                         Industrial Device Project
+                                                                     <mailto:kenji@bitgate.com>
+0xA1  0      linux/vtpm_proxy.h                                      TPM Emulator Proxy Driver
+0xA3  80-8F                                                          Port ACL  in development:
+                                                                     <mailto:tlewis@mindspring.com>
+0xA3  90-9F  linux/dtlk.h
+0xA4  00-1F  uapi/linux/tee.h                                        Generic TEE subsystem
+0xAA  00-3F  linux/uapi/linux/userfaultfd.h
+0xAB  00-1F  linux/nbd.h
+0xAC  00-1F  linux/raw.h
+0xAD  00                                                             Netfilter device in development:
+                                                                     <mailto:rusty@rustcorp.com.au>
+0xAE  all    linux/kvm.h                                             Kernel-based Virtual Machine
+                                                                     <mailto:kvm@vger.kernel.org>
+0xAF  00-1F  linux/fsl_hypervisor.h                                  Freescale hypervisor
+0xB0  all                                                            RATIO devices in development:
+                                                                     <mailto:vgo@ratio.de>
+0xB1  00-1F                                                          PPPoX
+                                                                     <mailto:mostrows@styx.uwaterloo.ca>
+0xB3  00     linux/mmc/ioctl.h
+0xB4  00-0F  linux/gpio.h                                            <mailto:linux-gpio@vger.kernel.org>
+0xB5  00-0F  uapi/linux/rpmsg.h                                      <mailto:linux-remoteproc@vger.kernel.org>
+0xB6  all    linux/fpga-dfl.h
+0xC0  00-0F  linux/usb/iowarrior.h
+0xCA  00-0F  uapi/misc/cxl.h
+0xCA  10-2F  uapi/misc/ocxl.h
+0xCA  80-BF  uapi/scsi/cxlflash_ioctl.h
+0xCB  00-1F                                                          CBM serial IEC bus in development:
+                                                                     <mailto:michael.klein@puffin.lb.shuttle.de>
+0xCC  00-0F  drivers/misc/ibmvmc.h                                   pseries VMC driver
+0xCD  01     linux/reiserfs_fs.h
+0xCF  02     fs/cifs/ioctl.c
+0xDB  00-0F  drivers/char/mwave/mwavepub.h
+0xDD  00-3F                                                          ZFCP device driver see drivers/s390/scsi/
+                                                                     <mailto:aherrman@de.ibm.com>
+0xE5  00-3F  linux/fuse.h
+0xEC  00-01  drivers/platform/chrome/cros_ec_dev.h                   ChromeOS EC driver
+0xF3  00-3F  drivers/usb/misc/sisusbvga/sisusb.h                     sisfb (in development)
+                                                                     <mailto:thomas@winischhofer.net>
+0xF4  00-1F  video/mbxfb.h                                           mbxfb
+                                                                     <mailto:raph@8d.com>
+0xF6  all                                                            LTTng Linux Trace Toolkit Next Generation
+                                                                     <mailto:mathieu.desnoyers@efficios.com>
+0xFD  all    linux/dm-ioctl.h
+0xFE  all    linux/isst_if.h
+====  =====  ======================================================= ================================================================
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
deleted file mode 100644
index ab0b3f686454..000000000000
--- a/Documentation/ioctl/ioctl-number.txt
+++ /dev/null
@@ -1,351 +0,0 @@
-Ioctl Numbers
-19 October 1999
-Michael Elizabeth Chastain
-<mec@shout.net>
-
-If you are adding new ioctl's to the kernel, you should use the _IO
-macros defined in <linux/ioctl.h>:
-
-    _IO    an ioctl with no parameters
-    _IOW   an ioctl with write parameters (copy_from_user)
-    _IOR   an ioctl with read parameters  (copy_to_user)
-    _IOWR  an ioctl with both write and read parameters.
-
-'Write' and 'read' are from the user's point of view, just like the
-system calls 'write' and 'read'.  For example, a SET_FOO ioctl would
-be _IOW, although the kernel would actually read data from user space;
-a GET_FOO ioctl would be _IOR, although the kernel would actually write
-data to user space.
-
-The first argument to _IO, _IOW, _IOR, or _IOWR is an identifying letter
-or number from the table below.  Because of the large number of drivers,
-many drivers share a partial letter with other drivers.
-
-If you are writing a driver for a new device and need a letter, pick an
-unused block with enough room for expansion: 32 to 256 ioctl commands.
-You can register the block by patching this file and submitting the
-patch to Linus Torvalds.  Or you can e-mail me at <mec@shout.net> and
-I'll register one for you.
-
-The second argument to _IO, _IOW, _IOR, or _IOWR is a sequence number
-to distinguish ioctls from each other.  The third argument to _IOW,
-_IOR, or _IOWR is the type of the data going into the kernel or coming
-out of the kernel (e.g.  'int' or 'struct foo').  NOTE!  Do NOT use
-sizeof(arg) as the third argument as this results in your ioctl thinking
-it passes an argument of type size_t.
-
-Some devices use their major number as the identifier; this is OK, as
-long as it is unique.  Some devices are irregular and don't follow any
-convention at all.
-
-Following this convention is good because:
-
-(1) Keeping the ioctl's globally unique helps error checking:
-    if a program calls an ioctl on the wrong device, it will get an
-    error rather than some unexpected behaviour.
-
-(2) The 'strace' build procedure automatically finds ioctl numbers
-    defined with _IO, _IOW, _IOR, or _IOWR.
-
-(3) 'strace' can decode numbers back into useful names when the
-    numbers are unique.
-
-(4) People looking for ioctls can grep for them more easily when
-    this convention is used to define the ioctl numbers.
-
-(5) When following the convention, the driver code can use generic
-    code to copy the parameters between user and kernel space.
-
-This table lists ioctls visible from user land for Linux/x86.  It contains
-most drivers up to 2.6.31, but I know I am missing some.  There has been
-no attempt to list non-X86 architectures or ioctls from drivers/staging/.
-
-Code  Seq#(hex)	Include File		Comments
-========================================================
-0x00	00-1F	linux/fs.h		conflict!
-0x00	00-1F	scsi/scsi_ioctl.h	conflict!
-0x00	00-1F	linux/fb.h		conflict!
-0x00	00-1F	linux/wavefront.h	conflict!
-0x02	all	linux/fd.h
-0x03	all	linux/hdreg.h
-0x04	D2-DC	linux/umsdos_fs.h	Dead since 2.6.11, but don't reuse these.
-0x06	all	linux/lp.h
-0x09	all	linux/raid/md_u.h
-0x10	00-0F	drivers/char/s390/vmcp.h
-0x10	10-1F	arch/s390/include/uapi/sclp_ctl.h
-0x10	20-2F	arch/s390/include/uapi/asm/hypfs.h
-0x12	all	linux/fs.h
-		linux/blkpg.h
-0x1b	all	InfiniBand Subsystem	<http://infiniband.sourceforge.net/>
-0x20	all	drivers/cdrom/cm206.h
-0x22	all	scsi/sg.h
-'!'	00-1F	uapi/linux/seccomp.h
-'#'	00-3F	IEEE 1394 Subsystem	Block for the entire subsystem
-'$'	00-0F	linux/perf_counter.h, linux/perf_event.h
-'%'	00-0F	include/uapi/linux/stm.h
-					System Trace Module subsystem
-					<mailto:alexander.shishkin@linux.intel.com>
-'&'	00-07	drivers/firewire/nosy-user.h
-'1'	00-1F	<linux/timepps.h>	PPS kit from Ulrich Windl
-					<ftp://ftp.de.kernel.org/pub/linux/daemons/ntp/PPS/>
-'2'	01-04	linux/i2o.h
-'3'	00-0F	drivers/s390/char/raw3270.h	conflict!
-'3'	00-1F	linux/suspend_ioctls.h	conflict!
-		and kernel/power/user.c
-'8'	all				SNP8023 advanced NIC card
-					<mailto:mcr@solidum.com>
-';'	64-7F	linux/vfio.h
-'@'	00-0F	linux/radeonfb.h	conflict!
-'@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
-'A'	00-1F	linux/apm_bios.h	conflict!
-'A'	00-0F	linux/agpgart.h		conflict!
-		and drivers/char/agp/compat_ioctl.h
-'A'	00-7F	sound/asound.h		conflict!
-'B'	00-1F	linux/cciss_ioctl.h	conflict!
-'B'	00-0F	include/linux/pmu.h	conflict!
-'B'	C0-FF				advanced bbus
-					<mailto:maassen@uni-freiburg.de>
-'C'	all	linux/soundcard.h	conflict!
-'C'	01-2F	linux/capi.h		conflict!
-'C'	F0-FF	drivers/net/wan/cosa.h	conflict!
-'D'	all	arch/s390/include/asm/dasd.h
-'D'	40-5F	drivers/scsi/dpt/dtpi_ioctl.h
-'D'	05	drivers/scsi/pmcraid.h
-'E'	all	linux/input.h		conflict!
-'E'	00-0F	xen/evtchn.h		conflict!
-'F'	all	linux/fb.h		conflict!
-'F'	01-02	drivers/scsi/pmcraid.h	conflict!
-'F'	20	drivers/video/fsl-diu-fb.h	conflict!
-'F'	20	drivers/video/intelfb/intelfb.h	conflict!
-'F'	20	linux/ivtvfb.h		conflict!
-'F'	20	linux/matroxfb.h	conflict!
-'F'	20	drivers/video/aty/atyfb_base.c	conflict!
-'F'	00-0F	video/da8xx-fb.h	conflict!
-'F'	80-8F	linux/arcfb.h		conflict!
-'F'	DD	video/sstfb.h		conflict!
-'G'	00-3F	drivers/misc/sgi-gru/grulib.h	conflict!
-'G'	00-0F	linux/gigaset_dev.h	conflict!
-'H'	00-7F	linux/hiddev.h		conflict!
-'H'	00-0F	linux/hidraw.h		conflict!
-'H'	01	linux/mei.h		conflict!
-'H'	02	linux/mei.h		conflict!
-'H'	03	linux/mei.h		conflict!
-'H'	00-0F	sound/asound.h		conflict!
-'H'	20-40	sound/asound_fm.h	conflict!
-'H'	80-8F	sound/sfnt_info.h	conflict!
-'H'	10-8F	sound/emu10k1.h		conflict!
-'H'	10-1F	sound/sb16_csp.h	conflict!
-'H'	10-1F	sound/hda_hwdep.h	conflict!
-'H'	40-4F	sound/hdspm.h		conflict!
-'H'	40-4F	sound/hdsp.h		conflict!
-'H'	90	sound/usb/usx2y/usb_stream.h
-'H'	A0	uapi/linux/usb/cdc-wdm.h
-'H'	C0-F0	net/bluetooth/hci.h	conflict!
-'H'	C0-DF	net/bluetooth/hidp/hidp.h	conflict!
-'H'	C0-DF	net/bluetooth/cmtp/cmtp.h	conflict!
-'H'	C0-DF	net/bluetooth/bnep/bnep.h	conflict!
-'H'	F1	linux/hid-roccat.h	<mailto:erazor_de@users.sourceforge.net>
-'H'	F8-FA	sound/firewire.h
-'I'	all	linux/isdn.h		conflict!
-'I'	00-0F	drivers/isdn/divert/isdn_divert.h	conflict!
-'I'	40-4F	linux/mISDNif.h		conflict!
-'J'	00-1F	drivers/scsi/gdth_ioctl.h
-'K'	all	linux/kd.h
-'L'	00-1F	linux/loop.h		conflict!
-'L'	10-1F	drivers/scsi/mpt3sas/mpt3sas_ctl.h	conflict!
-'L'	20-2F	linux/lightnvm.h
-'L'	E0-FF	linux/ppdd.h		encrypted disk device driver
-					<http://linux01.gwdg.de/~alatham/ppdd.html>
-'M'	all	linux/soundcard.h	conflict!
-'M'	01-16	mtd/mtd-abi.h		conflict!
-		and drivers/mtd/mtdchar.c
-'M'	01-03	drivers/scsi/megaraid/megaraid_sas.h
-'M'	00-0F	drivers/video/fsl-diu-fb.h	conflict!
-'N'	00-1F	drivers/usb/scanner.h
-'N'	40-7F	drivers/block/nvme.c
-'O'     00-06   mtd/ubi-user.h		UBI
-'P'	all	linux/soundcard.h	conflict!
-'P'	60-6F	sound/sscape_ioctl.h	conflict!
-'P'	00-0F	drivers/usb/class/usblp.c	conflict!
-'P'	01-09	drivers/misc/pci_endpoint_test.c	conflict!
-'Q'	all	linux/soundcard.h
-'R'	00-1F	linux/random.h		conflict!
-'R'	01	linux/rfkill.h		conflict!
-'R'	C0-DF	net/bluetooth/rfcomm.h
-'S'	all	linux/cdrom.h		conflict!
-'S'	80-81	scsi/scsi_ioctl.h	conflict!
-'S'	82-FF	scsi/scsi.h		conflict!
-'S'	00-7F	sound/asequencer.h	conflict!
-'T'	all	linux/soundcard.h	conflict!
-'T'	00-AF	sound/asound.h		conflict!
-'T'	all	arch/x86/include/asm/ioctls.h	conflict!
-'T'	C0-DF	linux/if_tun.h		conflict!
-'U'	all	sound/asound.h		conflict!
-'U'	00-CF	linux/uinput.h		conflict!
-'U'	00-EF	linux/usbdevice_fs.h
-'U'	C0-CF	drivers/bluetooth/hci_uart.h
-'V'	all	linux/vt.h		conflict!
-'V'	all	linux/videodev2.h	conflict!
-'V'	C0	linux/ivtvfb.h		conflict!
-'V'	C0	linux/ivtv.h		conflict!
-'V'	C0	media/davinci/vpfe_capture.h	conflict!
-'V'	C0	media/si4713.h		conflict!
-'W'	00-1F	linux/watchdog.h	conflict!
-'W'	00-1F	linux/wanrouter.h	conflict!		(pre 3.9)
-'W'	00-3F	sound/asound.h		conflict!
-'W'	40-5F   drivers/pci/switch/switchtec.c
-'X'	all	fs/xfs/xfs_fs.h		conflict!
-		and fs/xfs/linux-2.6/xfs_ioctl32.h
-		and include/linux/falloc.h
-		and linux/fs.h
-'X'	all	fs/ocfs2/ocfs_fs.h	conflict!
-'X'	01	linux/pktcdvd.h		conflict!
-'Y'	all	linux/cyclades.h
-'Z'	14-15	drivers/message/fusion/mptctl.h
-'['	00-3F	linux/usb/tmc.h		USB Test and Measurement Devices
-					<mailto:gregkh@linuxfoundation.org>
-'a'	all	linux/atm*.h, linux/sonet.h	ATM on linux
-					<http://lrcwww.epfl.ch/>
-'a'	00-0F	drivers/crypto/qat/qat_common/adf_cfg_common.h	conflict! qat driver
-'b'	00-FF				conflict! bit3 vme host bridge
-					<mailto:natalia@nikhefk.nikhef.nl>
-'c'	all	linux/cm4000_cs.h	conflict!
-'c'	00-7F	linux/comstats.h	conflict!
-'c'	00-7F	linux/coda.h		conflict!
-'c'	00-1F	linux/chio.h		conflict!
-'c'	80-9F	arch/s390/include/asm/chsc.h	conflict!
-'c'	A0-AF   arch/x86/include/asm/msr.h	conflict!
-'d'	00-FF	linux/char/drm/drm.h	conflict!
-'d'	02-40	pcmcia/ds.h		conflict!
-'d'	F0-FF	linux/digi1.h
-'e'	all	linux/digi1.h		conflict!
-'f'	00-1F	linux/ext2_fs.h		conflict!
-'f'	00-1F	linux/ext3_fs.h		conflict!
-'f'	00-0F	fs/jfs/jfs_dinode.h	conflict!
-'f'	00-0F	fs/ext4/ext4.h		conflict!
-'f'	00-0F	linux/fs.h		conflict!
-'f'	00-0F	fs/ocfs2/ocfs2_fs.h	conflict!
-'g'	00-0F	linux/usb/gadgetfs.h
-'g'	20-2F	linux/usb/g_printer.h
-'h'	00-7F				conflict! Charon filesystem
-					<mailto:zapman@interlan.net>
-'h'	00-1F	linux/hpet.h		conflict!
-'h'	80-8F	fs/hfsplus/ioctl.c
-'i'	00-3F	linux/i2o-dev.h		conflict!
-'i'	0B-1F	linux/ipmi.h		conflict!
-'i'	80-8F	linux/i8k.h
-'j'	00-3F	linux/joystick.h
-'k'	00-0F	linux/spi/spidev.h	conflict!
-'k'	00-05	video/kyro.h		conflict!
-'k'	10-17	linux/hsi/hsi_char.h	HSI character device
-'l'	00-3F	linux/tcfs_fs.h		transparent cryptographic file system
-					<http://web.archive.org/web/*/http://mikonos.dia.unisa.it/tcfs>
-'l'	40-7F	linux/udf_fs_i.h	in development:
-					<http://sourceforge.net/projects/linux-udf/>
-'m'	00-09	linux/mmtimer.h		conflict!
-'m'	all	linux/mtio.h		conflict!
-'m'	all	linux/soundcard.h	conflict!
-'m'	all	linux/synclink.h	conflict!
-'m'	00-19	drivers/message/fusion/mptctl.h	conflict!
-'m'	00	drivers/scsi/megaraid/megaraid_ioctl.h	conflict!
-'n'	00-7F	linux/ncp_fs.h and fs/ncpfs/ioctl.c
-'n'	80-8F	uapi/linux/nilfs2_api.h	NILFS2
-'n'	E0-FF	linux/matroxfb.h	matroxfb
-'o'	00-1F	fs/ocfs2/ocfs2_fs.h	OCFS2
-'o'     00-03   mtd/ubi-user.h		conflict! (OCFS2 and UBI overlaps)
-'o'     40-41   mtd/ubi-user.h		UBI
-'o'     01-A1   linux/dvb/*.h		DVB
-'p'	00-0F	linux/phantom.h		conflict! (OpenHaptics needs this)
-'p'	00-1F	linux/rtc.h		conflict!
-'p'	00-3F	linux/mc146818rtc.h	conflict!
-'p'	40-7F	linux/nvram.h
-'p'	80-9F	linux/ppdev.h		user-space parport
-					<mailto:tim@cyberelk.net>
-'p'	A1-A5	linux/pps.h		LinuxPPS
-					<mailto:giometti@linux.it>
-'q'	00-1F	linux/serio.h
-'q'	80-FF	linux/telephony.h	Internet PhoneJACK, Internet LineJACK
-		linux/ixjuser.h		<http://web.archive.org/web/*/http://www.quicknet.net>
-'r'	00-1F	linux/msdos_fs.h and fs/fat/dir.c
-'s'	all	linux/cdk.h
-'t'	00-7F	linux/ppp-ioctl.h
-'t'	80-8F	linux/isdn_ppp.h
-'t'	90-91	linux/toshiba.h		toshiba and toshiba_acpi SMM
-'u'	00-1F	linux/smb_fs.h		gone
-'u'	20-3F	linux/uvcvideo.h	USB video class host driver
-'u'	40-4f	linux/udmabuf.h		userspace dma-buf misc device
-'v'	00-1F	linux/ext2_fs.h		conflict!
-'v'	00-1F	linux/fs.h		conflict!
-'v'	00-0F	linux/sonypi.h		conflict!
-'v'	00-0F	media/v4l2-subdev.h	conflict!
-'v'	C0-FF	linux/meye.h		conflict!
-'w'	all				CERN SCI driver
-'y'	00-1F				packet based user level communications
-					<mailto:zapman@interlan.net>
-'z'	00-3F				CAN bus card	conflict!
-					<mailto:hdstich@connectu.ulm.circular.de>
-'z'	40-7F				CAN bus card	conflict!
-					<mailto:oe@port.de>
-'z'	10-4F	drivers/s390/crypto/zcrypt_api.h	conflict!
-'|'	00-7F	linux/media.h
-0x80	00-1F	linux/fb.h
-0x89	00-06	arch/x86/include/asm/sockios.h
-0x89	0B-DF	linux/sockios.h
-0x89	E0-EF	linux/sockios.h		SIOCPROTOPRIVATE range
-0x89	E0-EF	linux/dn.h		PROTOPRIVATE range
-0x89	F0-FF	linux/sockios.h		SIOCDEVPRIVATE range
-0x8B	all	linux/wireless.h
-0x8C	00-3F				WiNRADiO driver
-					<http://www.winradio.com.au/>
-0x90	00	drivers/cdrom/sbpcd.h
-0x92	00-0F	drivers/usb/mon/mon_bin.c
-0x93	60-7F	linux/auto_fs.h
-0x94	all	fs/btrfs/ioctl.h	Btrfs filesystem
-		and linux/fs.h		some lifted to vfs/generic
-0x97	00-7F	fs/ceph/ioctl.h		Ceph file system
-0x99	00-0F				537-Addinboard driver
-					<mailto:buk@buks.ipn.de>
-0xA0	all	linux/sdp/sdp.h		Industrial Device Project
-					<mailto:kenji@bitgate.com>
-0xA1	0	linux/vtpm_proxy.h	TPM Emulator Proxy Driver
-0xA3	80-8F	Port ACL		in development:
-					<mailto:tlewis@mindspring.com>
-0xA3	90-9F	linux/dtlk.h
-0xA4	00-1F	uapi/linux/tee.h	Generic TEE subsystem
-0xAA	00-3F	linux/uapi/linux/userfaultfd.h
-0xAB	00-1F	linux/nbd.h
-0xAC	00-1F	linux/raw.h
-0xAD	00	Netfilter device	in development:
-					<mailto:rusty@rustcorp.com.au>
-0xAE	all	linux/kvm.h		Kernel-based Virtual Machine
-					<mailto:kvm@vger.kernel.org>
-0xAF	00-1F	linux/fsl_hypervisor.h	Freescale hypervisor
-0xB0	all	RATIO devices		in development:
-					<mailto:vgo@ratio.de>
-0xB1	00-1F	PPPoX			<mailto:mostrows@styx.uwaterloo.ca>
-0xB3	00	linux/mmc/ioctl.h
-0xB4	00-0F	linux/gpio.h		<mailto:linux-gpio@vger.kernel.org>
-0xB5	00-0F	uapi/linux/rpmsg.h	<mailto:linux-remoteproc@vger.kernel.org>
-0xB6	all	linux/fpga-dfl.h
-0xC0	00-0F	linux/usb/iowarrior.h
-0xCA	00-0F	uapi/misc/cxl.h
-0xCA	10-2F	uapi/misc/ocxl.h
-0xCA	80-BF	uapi/scsi/cxlflash_ioctl.h
-0xCB	00-1F	CBM serial IEC bus	in development:
-					<mailto:michael.klein@puffin.lb.shuttle.de>
-0xCC	00-0F	drivers/misc/ibmvmc.h    pseries VMC driver
-0xCD	01	linux/reiserfs_fs.h
-0xCF	02	fs/cifs/ioctl.c
-0xDB	00-0F	drivers/char/mwave/mwavepub.h
-0xDD	00-3F	ZFCP device driver	see drivers/s390/scsi/
-					<mailto:aherrman@de.ibm.com>
-0xE5	00-3F	linux/fuse.h
-0xEC	00-01	drivers/platform/chrome/cros_ec_dev.h	ChromeOS EC driver
-0xF3	00-3F	drivers/usb/misc/sisusbvga/sisusb.h	sisfb (in development)
-					<mailto:thomas@winischhofer.net>
-0xF4	00-1F	video/mbxfb.h		mbxfb
-					<mailto:raph@8d.com>
-0xF6	all	LTTng			Linux Trace Toolkit Next Generation
-					<mailto:mathieu.desnoyers@efficios.com>
-0xFD	all	linux/dm-ioctl.h
-0xFE	all	linux/isst_if.h
diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst
index 365efc9e4aa8..8e56337d422d 100644
--- a/Documentation/process/submit-checklist.rst
+++ b/Documentation/process/submit-checklist.rst
@@ -107,7 +107,7 @@ and elsewhere regarding submitting Linux kernel patches.
     and why.
 
 26) If any ioctl's are added by the patch, then also update
-    ``Documentation/ioctl/ioctl-number.txt``.
+    ``Documentation/ioctl/ioctl-number.rst``.
 
 27) If your modified source code depends on or uses any of the kernel
     APIs or features that are related to the following ``Kconfig`` symbols,
diff --git a/Documentation/translations/it_IT/process/submit-checklist.rst b/Documentation/translations/it_IT/process/submit-checklist.rst
index ea74cae958d7..995ee69fab11 100644
--- a/Documentation/translations/it_IT/process/submit-checklist.rst
+++ b/Documentation/translations/it_IT/process/submit-checklist.rst
@@ -117,7 +117,7 @@ sottomissione delle patch, in particolare
     sorgenti che ne spieghi la logica: cosa fanno e perché.
 
 25) Se la patch aggiunge nuove chiamate ioctl, allora aggiornate
-    ``Documentation/ioctl/ioctl-number.txt``.
+    ``Documentation/ioctl/ioctl-number.rst``.
 
 26) Se il codice che avete modificato dipende o usa una qualsiasi interfaccia o
     funzionalità del kernel che è associata a uno dei seguenti simboli
diff --git a/Documentation/translations/zh_CN/process/submit-checklist.rst b/Documentation/translations/zh_CN/process/submit-checklist.rst
index f4785d2b0491..8738c55e42a2 100644
--- a/Documentation/translations/zh_CN/process/submit-checklist.rst
+++ b/Documentation/translations/zh_CN/process/submit-checklist.rst
@@ -97,7 +97,7 @@ Linux内核补丁提交清单
 24) 所有内存屏障例如 ``barrier()``, ``rmb()``, ``wmb()`` 都需要源代码中的注
     释来解释它们正在执行的操作及其原因的逻辑。
 
-25) 如果补丁添加了任何ioctl，那么也要更新 ``Documentation/ioctl/ioctl-number.txt``
+25) 如果补丁添加了任何ioctl，那么也要更新 ``Documentation/ioctl/ioctl-number.rst``
 
 26) 如果修改后的源代码依赖或使用与以下 ``Kconfig`` 符号相关的任何内核API或
     功能，则在禁用相关 ``Kconfig`` 符号和/或 ``=m`` （如果该选项可用）的情况
diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h
index 26213f49f5c8..54e16a589472 100644
--- a/include/uapi/rdma/rdma_user_ioctl_cmds.h
+++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h
@@ -36,7 +36,7 @@
 #include <linux/types.h>
 #include <linux/ioctl.h>
 
-/* Documentation/ioctl/ioctl-number.txt */
+/* Documentation/ioctl/ioctl-number.rst */
 #define RDMA_IOCTL_MAGIC	0x1b
 #define RDMA_VERBS_IOCTL \
 	_IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr)
-- 
cgit v1.2.3


From fe34c89d25429e079ba67416529514120dd715f8 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Tue, 18 Jun 2019 12:34:59 -0300
Subject: docs: driver-model: move it to the driver-api book

The audience for the Kernel driver-model is clearly Kernel hackers.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> # ice driver changes
---
 Documentation/driver-api/driver-model/binding.rst  |  98 +++++
 Documentation/driver-api/driver-model/bus.rst      | 146 +++++++
 Documentation/driver-api/driver-model/class.rst    | 149 +++++++
 .../driver-api/driver-model/design-patterns.rst    | 116 ++++++
 Documentation/driver-api/driver-model/device.rst   | 109 +++++
 Documentation/driver-api/driver-model/devres.rst   | 414 +++++++++++++++++++
 Documentation/driver-api/driver-model/driver.rst   | 223 ++++++++++
 Documentation/driver-api/driver-model/index.rst    |  24 ++
 Documentation/driver-api/driver-model/overview.rst | 124 ++++++
 Documentation/driver-api/driver-model/platform.rst | 246 +++++++++++
 Documentation/driver-api/driver-model/porting.rst  | 448 +++++++++++++++++++++
 Documentation/driver-api/gpio/driver.rst           |   2 +-
 Documentation/driver-api/index.rst                 |   1 +
 Documentation/driver-model/binding.rst             |  98 -----
 Documentation/driver-model/bus.rst                 | 146 -------
 Documentation/driver-model/class.rst               | 149 -------
 Documentation/driver-model/design-patterns.rst     | 116 ------
 Documentation/driver-model/device.rst              | 109 -----
 Documentation/driver-model/devres.rst              | 414 -------------------
 Documentation/driver-model/driver.rst              | 223 ----------
 Documentation/driver-model/index.rst               |  26 --
 Documentation/driver-model/overview.rst            | 124 ------
 Documentation/driver-model/platform.rst            | 246 -----------
 Documentation/driver-model/porting.rst             | 448 ---------------------
 Documentation/eisa.txt                             |   4 +-
 Documentation/filesystems/sysfs.txt                |   2 +-
 Documentation/hwmon/submitting-patches.rst         |   2 +-
 .../translations/zh_CN/filesystems/sysfs.txt       |   2 +-
 drivers/base/platform.c                            |   2 +-
 drivers/gpio/gpio-cs5535.c                         |   2 +-
 drivers/net/ethernet/intel/ice/ice_main.c          |   2 +-
 drivers/staging/unisys/Documentation/overview.txt  |   4 +-
 include/linux/device.h                             |   2 +-
 include/linux/platform_device.h                    |   2 +-
 scripts/coccinelle/free/devm_free.cocci            |   2 +-
 35 files changed, 2112 insertions(+), 2113 deletions(-)
 create mode 100644 Documentation/driver-api/driver-model/binding.rst
 create mode 100644 Documentation/driver-api/driver-model/bus.rst
 create mode 100644 Documentation/driver-api/driver-model/class.rst
 create mode 100644 Documentation/driver-api/driver-model/design-patterns.rst
 create mode 100644 Documentation/driver-api/driver-model/device.rst
 create mode 100644 Documentation/driver-api/driver-model/devres.rst
 create mode 100644 Documentation/driver-api/driver-model/driver.rst
 create mode 100644 Documentation/driver-api/driver-model/index.rst
 create mode 100644 Documentation/driver-api/driver-model/overview.rst
 create mode 100644 Documentation/driver-api/driver-model/platform.rst
 create mode 100644 Documentation/driver-api/driver-model/porting.rst
 delete mode 100644 Documentation/driver-model/binding.rst
 delete mode 100644 Documentation/driver-model/bus.rst
 delete mode 100644 Documentation/driver-model/class.rst
 delete mode 100644 Documentation/driver-model/design-patterns.rst
 delete mode 100644 Documentation/driver-model/device.rst
 delete mode 100644 Documentation/driver-model/devres.rst
 delete mode 100644 Documentation/driver-model/driver.rst
 delete mode 100644 Documentation/driver-model/index.rst
 delete mode 100644 Documentation/driver-model/overview.rst
 delete mode 100644 Documentation/driver-model/platform.rst
 delete mode 100644 Documentation/driver-model/porting.rst

(limited to 'include')

diff --git a/Documentation/driver-api/driver-model/binding.rst b/Documentation/driver-api/driver-model/binding.rst
new file mode 100644
index 000000000000..7ea1d7a41e1d
--- /dev/null
+++ b/Documentation/driver-api/driver-model/binding.rst
@@ -0,0 +1,98 @@
+==============
+Driver Binding
+==============
+
+Driver binding is the process of associating a device with a device
+driver that can control it. Bus drivers have typically handled this
+because there have been bus-specific structures to represent the
+devices and the drivers. With generic device and device driver
+structures, most of the binding can take place using common code.
+
+
+Bus
+~~~
+
+The bus type structure contains a list of all devices that are on that bus
+type in the system. When device_register is called for a device, it is
+inserted into the end of this list. The bus object also contains a
+list of all drivers of that bus type. When driver_register is called
+for a driver, it is inserted at the end of this list. These are the
+two events which trigger driver binding.
+
+
+device_register
+~~~~~~~~~~~~~~~
+
+When a new device is added, the bus's list of drivers is iterated over
+to find one that supports it. In order to determine that, the device
+ID of the device must match one of the device IDs that the driver
+supports. The format and semantics for comparing IDs is bus-specific.
+Instead of trying to derive a complex state machine and matching
+algorithm, it is up to the bus driver to provide a callback to compare
+a device against the IDs of a driver. The bus returns 1 if a match was
+found; 0 otherwise.
+
+int match(struct device * dev, struct device_driver * drv);
+
+If a match is found, the device's driver field is set to the driver
+and the driver's probe callback is called. This gives the driver a
+chance to verify that it really does support the hardware, and that
+it's in a working state.
+
+Device Class
+~~~~~~~~~~~~
+
+Upon the successful completion of probe, the device is registered with
+the class to which it belongs. Device drivers belong to one and only one
+class, and that is set in the driver's devclass field.
+devclass_add_device is called to enumerate the device within the class
+and actually register it with the class, which happens with the
+class's register_dev callback.
+
+
+Driver
+~~~~~~
+
+When a driver is attached to a device, the device is inserted into the
+driver's list of devices.
+
+
+sysfs
+~~~~~
+
+A symlink is created in the bus's 'devices' directory that points to
+the device's directory in the physical hierarchy.
+
+A symlink is created in the driver's 'devices' directory that points
+to the device's directory in the physical hierarchy.
+
+A directory for the device is created in the class's directory. A
+symlink is created in that directory that points to the device's
+physical location in the sysfs tree.
+
+A symlink can be created (though this isn't done yet) in the device's
+physical directory to either its class directory, or the class's
+top-level directory. One can also be created to point to its driver's
+directory also.
+
+
+driver_register
+~~~~~~~~~~~~~~~
+
+The process is almost identical for when a new driver is added.
+The bus's list of devices is iterated over to find a match. Devices
+that already have a driver are skipped. All the devices are iterated
+over, to bind as many devices as possible to the driver.
+
+
+Removal
+~~~~~~~
+
+When a device is removed, the reference count for it will eventually
+go to 0. When it does, the remove callback of the driver is called. It
+is removed from the driver's list of devices and the reference count
+of the driver is decremented. All symlinks between the two are removed.
+
+When a driver is removed, the list of devices that it supports is
+iterated over, and the driver's remove callback is called for each
+one. The device is removed from that list and the symlinks removed.
diff --git a/Documentation/driver-api/driver-model/bus.rst b/Documentation/driver-api/driver-model/bus.rst
new file mode 100644
index 000000000000..016b15a6e8ea
--- /dev/null
+++ b/Documentation/driver-api/driver-model/bus.rst
@@ -0,0 +1,146 @@
+=========
+Bus Types
+=========
+
+Definition
+~~~~~~~~~~
+See the kerneldoc for the struct bus_type.
+
+int bus_register(struct bus_type * bus);
+
+
+Declaration
+~~~~~~~~~~~
+
+Each bus type in the kernel (PCI, USB, etc) should declare one static
+object of this type. They must initialize the name field, and may
+optionally initialize the match callback::
+
+   struct bus_type pci_bus_type = {
+          .name	= "pci",
+          .match	= pci_bus_match,
+   };
+
+The structure should be exported to drivers in a header file:
+
+extern struct bus_type pci_bus_type;
+
+
+Registration
+~~~~~~~~~~~~
+
+When a bus driver is initialized, it calls bus_register. This
+initializes the rest of the fields in the bus object and inserts it
+into a global list of bus types. Once the bus object is registered,
+the fields in it are usable by the bus driver.
+
+
+Callbacks
+~~~~~~~~~
+
+match(): Attaching Drivers to Devices
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The format of device ID structures and the semantics for comparing
+them are inherently bus-specific. Drivers typically declare an array
+of device IDs of devices they support that reside in a bus-specific
+driver structure.
+
+The purpose of the match callback is to give the bus an opportunity to
+determine if a particular driver supports a particular device by
+comparing the device IDs the driver supports with the device ID of a
+particular device, without sacrificing bus-specific functionality or
+type-safety.
+
+When a driver is registered with the bus, the bus's list of devices is
+iterated over, and the match callback is called for each device that
+does not have a driver associated with it.
+
+
+
+Device and Driver Lists
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The lists of devices and drivers are intended to replace the local
+lists that many buses keep. They are lists of struct devices and
+struct device_drivers, respectively. Bus drivers are free to use the
+lists as they please, but conversion to the bus-specific type may be
+necessary.
+
+The LDM core provides helper functions for iterating over each list::
+
+  int bus_for_each_dev(struct bus_type * bus, struct device * start,
+		       void * data,
+		       int (*fn)(struct device *, void *));
+
+  int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
+		       void * data, int (*fn)(struct device_driver *, void *));
+
+These helpers iterate over the respective list, and call the callback
+for each device or driver in the list. All list accesses are
+synchronized by taking the bus's lock (read currently). The reference
+count on each object in the list is incremented before the callback is
+called; it is decremented after the next object has been obtained. The
+lock is not held when calling the callback.
+
+
+sysfs
+~~~~~~~~
+There is a top-level directory named 'bus'.
+
+Each bus gets a directory in the bus directory, along with two default
+directories::
+
+	/sys/bus/pci/
+	|-- devices
+	`-- drivers
+
+Drivers registered with the bus get a directory in the bus's drivers
+directory::
+
+	/sys/bus/pci/
+	|-- devices
+	`-- drivers
+	    |-- Intel ICH
+	    |-- Intel ICH Joystick
+	    |-- agpgart
+	    `-- e100
+
+Each device that is discovered on a bus of that type gets a symlink in
+the bus's devices directory to the device's directory in the physical
+hierarchy::
+
+	/sys/bus/pci/
+	|-- devices
+	|   |-- 00:00.0 -> ../../../root/pci0/00:00.0
+	|   |-- 00:01.0 -> ../../../root/pci0/00:01.0
+	|   `-- 00:02.0 -> ../../../root/pci0/00:02.0
+	`-- drivers
+
+
+Exporting Attributes
+~~~~~~~~~~~~~~~~~~~~
+
+::
+
+  struct bus_attribute {
+	struct attribute	attr;
+	ssize_t (*show)(struct bus_type *, char * buf);
+	ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
+  };
+
+Bus drivers can export attributes using the BUS_ATTR_RW macro that works
+similarly to the DEVICE_ATTR_RW macro for devices. For example, a
+definition like this::
+
+	static BUS_ATTR_RW(debug);
+
+is equivalent to declaring::
+
+	static bus_attribute bus_attr_debug;
+
+This can then be used to add and remove the attribute from the bus's
+sysfs directory using::
+
+	int bus_create_file(struct bus_type *, struct bus_attribute *);
+	void bus_remove_file(struct bus_type *, struct bus_attribute *);
diff --git a/Documentation/driver-api/driver-model/class.rst b/Documentation/driver-api/driver-model/class.rst
new file mode 100644
index 000000000000..fff55b80e86a
--- /dev/null
+++ b/Documentation/driver-api/driver-model/class.rst
@@ -0,0 +1,149 @@
+==============
+Device Classes
+==============
+
+Introduction
+~~~~~~~~~~~~
+A device class describes a type of device, like an audio or network
+device. The following device classes have been identified:
+
+<Insert List of Device Classes Here>
+
+
+Each device class defines a set of semantics and a programming interface
+that devices of that class adhere to. Device drivers are the
+implementation of that programming interface for a particular device on
+a particular bus.
+
+Device classes are agnostic with respect to what bus a device resides
+on.
+
+
+Programming Interface
+~~~~~~~~~~~~~~~~~~~~~
+The device class structure looks like::
+
+
+  typedef int (*devclass_add)(struct device *);
+  typedef void (*devclass_remove)(struct device *);
+
+See the kerneldoc for the struct class.
+
+A typical device class definition would look like::
+
+  struct device_class input_devclass = {
+        .name		= "input",
+        .add_device	= input_add_device,
+	.remove_device	= input_remove_device,
+  };
+
+Each device class structure should be exported in a header file so it
+can be used by drivers, extensions and interfaces.
+
+Device classes are registered and unregistered with the core using::
+
+  int devclass_register(struct device_class * cls);
+  void devclass_unregister(struct device_class * cls);
+
+
+Devices
+~~~~~~~
+As devices are bound to drivers, they are added to the device class
+that the driver belongs to. Before the driver model core, this would
+typically happen during the driver's probe() callback, once the device
+has been initialized. It now happens after the probe() callback
+finishes from the core.
+
+The device is enumerated in the class. Each time a device is added to
+the class, the class's devnum field is incremented and assigned to the
+device. The field is never decremented, so if the device is removed
+from the class and re-added, it will receive a different enumerated
+value.
+
+The class is allowed to create a class-specific structure for the
+device and store it in the device's class_data pointer.
+
+There is no list of devices in the device class. Each driver has a
+list of devices that it supports. The device class has a list of
+drivers of that particular class. To access all of the devices in the
+class, iterate over the device lists of each driver in the class.
+
+
+Device Drivers
+~~~~~~~~~~~~~~
+Device drivers are added to device classes when they are registered
+with the core. A driver specifies the class it belongs to by setting
+the struct device_driver::devclass field.
+
+
+sysfs directory structure
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+There is a top-level sysfs directory named 'class'.
+
+Each class gets a directory in the class directory, along with two
+default subdirectories::
+
+        class/
+        `-- input
+            |-- devices
+            `-- drivers
+
+
+Drivers registered with the class get a symlink in the drivers/ directory
+that points to the driver's directory (under its bus directory)::
+
+   class/
+   `-- input
+       |-- devices
+       `-- drivers
+           `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/
+
+
+Each device gets a symlink in the devices/ directory that points to the
+device's directory in the physical hierarchy::
+
+   class/
+   `-- input
+       |-- devices
+       |   `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
+       `-- drivers
+
+
+Exporting Attributes
+~~~~~~~~~~~~~~~~~~~~
+
+::
+
+  struct devclass_attribute {
+        struct attribute        attr;
+        ssize_t (*show)(struct device_class *, char * buf, size_t count, loff_t off);
+        ssize_t (*store)(struct device_class *, const char * buf, size_t count, loff_t off);
+  };
+
+Class drivers can export attributes using the DEVCLASS_ATTR macro that works
+similarly to the DEVICE_ATTR macro for devices. For example, a definition
+like this::
+
+  static DEVCLASS_ATTR(debug,0644,show_debug,store_debug);
+
+is equivalent to declaring::
+
+  static devclass_attribute devclass_attr_debug;
+
+The bus driver can add and remove the attribute from the class's
+sysfs directory using::
+
+  int devclass_create_file(struct device_class *, struct devclass_attribute *);
+  void devclass_remove_file(struct device_class *, struct devclass_attribute *);
+
+In the example above, the file will be named 'debug' in placed in the
+class's directory in sysfs.
+
+
+Interfaces
+~~~~~~~~~~
+There may exist multiple mechanisms for accessing the same device of a
+particular class type. Device interfaces describe these mechanisms.
+
+When a device is added to a device class, the core attempts to add it
+to every interface that is registered with the device class.
diff --git a/Documentation/driver-api/driver-model/design-patterns.rst b/Documentation/driver-api/driver-model/design-patterns.rst
new file mode 100644
index 000000000000..41eb8f41f7dd
--- /dev/null
+++ b/Documentation/driver-api/driver-model/design-patterns.rst
@@ -0,0 +1,116 @@
+=============================
+Device Driver Design Patterns
+=============================
+
+This document describes a few common design patterns found in device drivers.
+It is likely that subsystem maintainers will ask driver developers to
+conform to these design patterns.
+
+1. State Container
+2. container_of()
+
+
+1. State Container
+~~~~~~~~~~~~~~~~~~
+
+While the kernel contains a few device drivers that assume that they will
+only be probed() once on a certain system (singletons), it is custom to assume
+that the device the driver binds to will appear in several instances. This
+means that the probe() function and all callbacks need to be reentrant.
+
+The most common way to achieve this is to use the state container design
+pattern. It usually has this form::
+
+  struct foo {
+      spinlock_t lock; /* Example member */
+      (...)
+  };
+
+  static int foo_probe(...)
+  {
+      struct foo *foo;
+
+      foo = devm_kzalloc(dev, sizeof(*foo), GFP_KERNEL);
+      if (!foo)
+          return -ENOMEM;
+      spin_lock_init(&foo->lock);
+      (...)
+  }
+
+This will create an instance of struct foo in memory every time probe() is
+called. This is our state container for this instance of the device driver.
+Of course it is then necessary to always pass this instance of the
+state around to all functions that need access to the state and its members.
+
+For example, if the driver is registering an interrupt handler, you would
+pass around a pointer to struct foo like this::
+
+  static irqreturn_t foo_handler(int irq, void *arg)
+  {
+      struct foo *foo = arg;
+      (...)
+  }
+
+  static int foo_probe(...)
+  {
+      struct foo *foo;
+
+      (...)
+      ret = request_irq(irq, foo_handler, 0, "foo", foo);
+  }
+
+This way you always get a pointer back to the correct instance of foo in
+your interrupt handler.
+
+
+2. container_of()
+~~~~~~~~~~~~~~~~~
+
+Continuing on the above example we add an offloaded work::
+
+  struct foo {
+      spinlock_t lock;
+      struct workqueue_struct *wq;
+      struct work_struct offload;
+      (...)
+  };
+
+  static void foo_work(struct work_struct *work)
+  {
+      struct foo *foo = container_of(work, struct foo, offload);
+
+      (...)
+  }
+
+  static irqreturn_t foo_handler(int irq, void *arg)
+  {
+      struct foo *foo = arg;
+
+      queue_work(foo->wq, &foo->offload);
+      (...)
+  }
+
+  static int foo_probe(...)
+  {
+      struct foo *foo;
+
+      foo->wq = create_singlethread_workqueue("foo-wq");
+      INIT_WORK(&foo->offload, foo_work);
+      (...)
+  }
+
+The design pattern is the same for an hrtimer or something similar that will
+return a single argument which is a pointer to a struct member in the
+callback.
+
+container_of() is a macro defined in <linux/kernel.h>
+
+What container_of() does is to obtain a pointer to the containing struct from
+a pointer to a member by a simple subtraction using the offsetof() macro from
+standard C, which allows something similar to object oriented behaviours.
+Notice that the contained member must not be a pointer, but an actual member
+for this to work.
+
+We can see here that we avoid having global pointers to our struct foo *
+instance this way, while still keeping the number of parameters passed to the
+work function to a single pointer.
diff --git a/Documentation/driver-api/driver-model/device.rst b/Documentation/driver-api/driver-model/device.rst
new file mode 100644
index 000000000000..2b868d49d349
--- /dev/null
+++ b/Documentation/driver-api/driver-model/device.rst
@@ -0,0 +1,109 @@
+==========================
+The Basic Device Structure
+==========================
+
+See the kerneldoc for the struct device.
+
+
+Programming Interface
+~~~~~~~~~~~~~~~~~~~~~
+The bus driver that discovers the device uses this to register the
+device with the core::
+
+  int device_register(struct device * dev);
+
+The bus should initialize the following fields:
+
+    - parent
+    - name
+    - bus_id
+    - bus
+
+A device is removed from the core when its reference count goes to
+0. The reference count can be adjusted using::
+
+  struct device * get_device(struct device * dev);
+  void put_device(struct device * dev);
+
+get_device() will return a pointer to the struct device passed to it
+if the reference is not already 0 (if it's in the process of being
+removed already).
+
+A driver can access the lock in the device structure using::
+
+  void lock_device(struct device * dev);
+  void unlock_device(struct device * dev);
+
+
+Attributes
+~~~~~~~~~~
+
+::
+
+  struct device_attribute {
+	struct attribute	attr;
+	ssize_t (*show)(struct device *dev, struct device_attribute *attr,
+			char *buf);
+	ssize_t (*store)(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t count);
+  };
+
+Attributes of devices can be exported by a device driver through sysfs.
+
+Please see Documentation/filesystems/sysfs.txt for more information
+on how sysfs works.
+
+As explained in Documentation/kobject.txt, device attributes must be
+created before the KOBJ_ADD uevent is generated. The only way to realize
+that is by defining an attribute group.
+
+Attributes are declared using a macro called DEVICE_ATTR::
+
+  #define DEVICE_ATTR(name,mode,show,store)
+
+Example:::
+
+  static DEVICE_ATTR(type, 0444, show_type, NULL);
+  static DEVICE_ATTR(power, 0644, show_power, store_power);
+
+This declares two structures of type struct device_attribute with respective
+names 'dev_attr_type' and 'dev_attr_power'. These two attributes can be
+organized as follows into a group::
+
+  static struct attribute *dev_attrs[] = {
+	&dev_attr_type.attr,
+	&dev_attr_power.attr,
+	NULL,
+  };
+
+  static struct attribute_group dev_attr_group = {
+	.attrs = dev_attrs,
+  };
+
+  static const struct attribute_group *dev_attr_groups[] = {
+	&dev_attr_group,
+	NULL,
+  };
+
+This array of groups can then be associated with a device by setting the
+group pointer in struct device before device_register() is invoked::
+
+        dev->groups = dev_attr_groups;
+        device_register(dev);
+
+The device_register() function will use the 'groups' pointer to create the
+device attributes and the device_unregister() function will use this pointer
+to remove the device attributes.
+
+Word of warning:  While the kernel allows device_create_file() and
+device_remove_file() to be called on a device at any time, userspace has
+strict expectations on when attributes get created.  When a new device is
+registered in the kernel, a uevent is generated to notify userspace (like
+udev) that a new device is available.  If attributes are added after the
+device is registered, then userspace won't get notified and userspace will
+not know about the new attributes.
+
+This is important for device driver that need to publish additional
+attributes for a device at driver probe time.  If the device driver simply
+calls device_create_file() on the device structure passed to it, then
+userspace will never be notified of the new attributes.
diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
new file mode 100644
index 000000000000..4ac99122b5f1
--- /dev/null
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -0,0 +1,414 @@
+================================
+Devres - Managed Device Resource
+================================
+
+Tejun Heo	<teheo@suse.de>
+
+First draft	10 January 2007
+
+.. contents
+
+   1. Intro			: Huh? Devres?
+   2. Devres			: Devres in a nutshell
+   3. Devres Group		: Group devres'es and release them together
+   4. Details			: Life time rules, calling context, ...
+   5. Overhead			: How much do we have to pay for this?
+   6. List of managed interfaces: Currently implemented managed interfaces
+
+
+1. Intro
+--------
+
+devres came up while trying to convert libata to use iomap.  Each
+iomapped address should be kept and unmapped on driver detach.  For
+example, a plain SFF ATA controller (that is, good old PCI IDE) in
+native mode makes use of 5 PCI BARs and all of them should be
+maintained.
+
+As with many other device drivers, libata low level drivers have
+sufficient bugs in ->remove and ->probe failure path.  Well, yes,
+that's probably because libata low level driver developers are lazy
+bunch, but aren't all low level driver developers?  After spending a
+day fiddling with braindamaged hardware with no document or
+braindamaged document, if it's finally working, well, it's working.
+
+For one reason or another, low level drivers don't receive as much
+attention or testing as core code, and bugs on driver detach or
+initialization failure don't happen often enough to be noticeable.
+Init failure path is worse because it's much less travelled while
+needs to handle multiple entry points.
+
+So, many low level drivers end up leaking resources on driver detach
+and having half broken failure path implementation in ->probe() which
+would leak resources or even cause oops when failure occurs.  iomap
+adds more to this mix.  So do msi and msix.
+
+
+2. Devres
+---------
+
+devres is basically linked list of arbitrarily sized memory areas
+associated with a struct device.  Each devres entry is associated with
+a release function.  A devres can be released in several ways.  No
+matter what, all devres entries are released on driver detach.  On
+release, the associated release function is invoked and then the
+devres entry is freed.
+
+Managed interface is created for resources commonly used by device
+drivers using devres.  For example, coherent DMA memory is acquired
+using dma_alloc_coherent().  The managed version is called
+dmam_alloc_coherent().  It is identical to dma_alloc_coherent() except
+for the DMA memory allocated using it is managed and will be
+automatically released on driver detach.  Implementation looks like
+the following::
+
+  struct dma_devres {
+	size_t		size;
+	void		*vaddr;
+	dma_addr_t	dma_handle;
+  };
+
+  static void dmam_coherent_release(struct device *dev, void *res)
+  {
+	struct dma_devres *this = res;
+
+	dma_free_coherent(dev, this->size, this->vaddr, this->dma_handle);
+  }
+
+  dmam_alloc_coherent(dev, size, dma_handle, gfp)
+  {
+	struct dma_devres *dr;
+	void *vaddr;
+
+	dr = devres_alloc(dmam_coherent_release, sizeof(*dr), gfp);
+	...
+
+	/* alloc DMA memory as usual */
+	vaddr = dma_alloc_coherent(...);
+	...
+
+	/* record size, vaddr, dma_handle in dr */
+	dr->vaddr = vaddr;
+	...
+
+	devres_add(dev, dr);
+
+	return vaddr;
+  }
+
+If a driver uses dmam_alloc_coherent(), the area is guaranteed to be
+freed whether initialization fails half-way or the device gets
+detached.  If most resources are acquired using managed interface, a
+driver can have much simpler init and exit code.  Init path basically
+looks like the following::
+
+  my_init_one()
+  {
+	struct mydev *d;
+
+	d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	d->ring = dmam_alloc_coherent(...);
+	if (!d->ring)
+		return -ENOMEM;
+
+	if (check something)
+		return -EINVAL;
+	...
+
+	return register_to_upper_layer(d);
+  }
+
+And exit path::
+
+  my_remove_one()
+  {
+	unregister_from_upper_layer(d);
+	shutdown_my_hardware();
+  }
+
+As shown above, low level drivers can be simplified a lot by using
+devres.  Complexity is shifted from less maintained low level drivers
+to better maintained higher layer.  Also, as init failure path is
+shared with exit path, both can get more testing.
+
+Note though that when converting current calls or assignments to
+managed devm_* versions it is up to you to check if internal operations
+like allocating memory, have failed. Managed resources pertains to the
+freeing of these resources *only* - all other checks needed are still
+on you. In some cases this may mean introducing checks that were not
+necessary before moving to the managed devm_* calls.
+
+
+3. Devres group
+---------------
+
+Devres entries can be grouped using devres group.  When a group is
+released, all contained normal devres entries and properly nested
+groups are released.  One usage is to rollback series of acquired
+resources on failure.  For example::
+
+  if (!devres_open_group(dev, NULL, GFP_KERNEL))
+	return -ENOMEM;
+
+  acquire A;
+  if (failed)
+	goto err;
+
+  acquire B;
+  if (failed)
+	goto err;
+  ...
+
+  devres_remove_group(dev, NULL);
+  return 0;
+
+ err:
+  devres_release_group(dev, NULL);
+  return err_code;
+
+As resource acquisition failure usually means probe failure, constructs
+like above are usually useful in midlayer driver (e.g. libata core
+layer) where interface function shouldn't have side effect on failure.
+For LLDs, just returning error code suffices in most cases.
+
+Each group is identified by `void *id`.  It can either be explicitly
+specified by @id argument to devres_open_group() or automatically
+created by passing NULL as @id as in the above example.  In both
+cases, devres_open_group() returns the group's id.  The returned id
+can be passed to other devres functions to select the target group.
+If NULL is given to those functions, the latest open group is
+selected.
+
+For example, you can do something like the following::
+
+  int my_midlayer_create_something()
+  {
+	if (!devres_open_group(dev, my_midlayer_create_something, GFP_KERNEL))
+		return -ENOMEM;
+
+	...
+
+	devres_close_group(dev, my_midlayer_create_something);
+	return 0;
+  }
+
+  void my_midlayer_destroy_something()
+  {
+	devres_release_group(dev, my_midlayer_create_something);
+  }
+
+
+4. Details
+----------
+
+Lifetime of a devres entry begins on devres allocation and finishes
+when it is released or destroyed (removed and freed) - no reference
+counting.
+
+devres core guarantees atomicity to all basic devres operations and
+has support for single-instance devres types (atomic
+lookup-and-add-if-not-found).  Other than that, synchronizing
+concurrent accesses to allocated devres data is caller's
+responsibility.  This is usually non-issue because bus ops and
+resource allocations already do the job.
+
+For an example of single-instance devres type, read pcim_iomap_table()
+in lib/devres.c.
+
+All devres interface functions can be called without context if the
+right gfp mask is given.
+
+
+5. Overhead
+-----------
+
+Each devres bookkeeping info is allocated together with requested data
+area.  With debug option turned off, bookkeeping info occupies 16
+bytes on 32bit machines and 24 bytes on 64bit (three pointers rounded
+up to ull alignment).  If singly linked list is used, it can be
+reduced to two pointers (8 bytes on 32bit, 16 bytes on 64bit).
+
+Each devres group occupies 8 pointers.  It can be reduced to 6 if
+singly linked list is used.
+
+Memory space overhead on ahci controller with two ports is between 300
+and 400 bytes on 32bit machine after naive conversion (we can
+certainly invest a bit more effort into libata core layer).
+
+
+6. List of managed interfaces
+-----------------------------
+
+CLOCK
+  devm_clk_get()
+  devm_clk_get_optional()
+  devm_clk_put()
+  devm_clk_hw_register()
+  devm_of_clk_add_hw_provider()
+  devm_clk_hw_register_clkdev()
+
+DMA
+  dmaenginem_async_device_register()
+  dmam_alloc_coherent()
+  dmam_alloc_attrs()
+  dmam_free_coherent()
+  dmam_pool_create()
+  dmam_pool_destroy()
+
+DRM
+  devm_drm_dev_init()
+
+GPIO
+  devm_gpiod_get()
+  devm_gpiod_get_index()
+  devm_gpiod_get_index_optional()
+  devm_gpiod_get_optional()
+  devm_gpiod_put()
+  devm_gpiod_unhinge()
+  devm_gpiochip_add_data()
+  devm_gpio_request()
+  devm_gpio_request_one()
+  devm_gpio_free()
+
+I2C
+  devm_i2c_new_dummy_device()
+
+IIO
+  devm_iio_device_alloc()
+  devm_iio_device_free()
+  devm_iio_device_register()
+  devm_iio_device_unregister()
+  devm_iio_kfifo_allocate()
+  devm_iio_kfifo_free()
+  devm_iio_triggered_buffer_setup()
+  devm_iio_triggered_buffer_cleanup()
+  devm_iio_trigger_alloc()
+  devm_iio_trigger_free()
+  devm_iio_trigger_register()
+  devm_iio_trigger_unregister()
+  devm_iio_channel_get()
+  devm_iio_channel_release()
+  devm_iio_channel_get_all()
+  devm_iio_channel_release_all()
+
+INPUT
+  devm_input_allocate_device()
+
+IO region
+  devm_release_mem_region()
+  devm_release_region()
+  devm_release_resource()
+  devm_request_mem_region()
+  devm_request_region()
+  devm_request_resource()
+
+IOMAP
+  devm_ioport_map()
+  devm_ioport_unmap()
+  devm_ioremap()
+  devm_ioremap_nocache()
+  devm_ioremap_wc()
+  devm_ioremap_resource() : checks resource, requests memory region, ioremaps
+  devm_iounmap()
+  pcim_iomap()
+  pcim_iomap_regions()	: do request_region() and iomap() on multiple BARs
+  pcim_iomap_table()	: array of mapped addresses indexed by BAR
+  pcim_iounmap()
+
+IRQ
+  devm_free_irq()
+  devm_request_any_context_irq()
+  devm_request_irq()
+  devm_request_threaded_irq()
+  devm_irq_alloc_descs()
+  devm_irq_alloc_desc()
+  devm_irq_alloc_desc_at()
+  devm_irq_alloc_desc_from()
+  devm_irq_alloc_descs_from()
+  devm_irq_alloc_generic_chip()
+  devm_irq_setup_generic_chip()
+  devm_irq_sim_init()
+
+LED
+  devm_led_classdev_register()
+  devm_led_classdev_unregister()
+
+MDIO
+  devm_mdiobus_alloc()
+  devm_mdiobus_alloc_size()
+  devm_mdiobus_free()
+
+MEM
+  devm_free_pages()
+  devm_get_free_pages()
+  devm_kasprintf()
+  devm_kcalloc()
+  devm_kfree()
+  devm_kmalloc()
+  devm_kmalloc_array()
+  devm_kmemdup()
+  devm_kstrdup()
+  devm_kvasprintf()
+  devm_kzalloc()
+
+MFD
+  devm_mfd_add_devices()
+
+MUX
+  devm_mux_chip_alloc()
+  devm_mux_chip_register()
+  devm_mux_control_get()
+
+PER-CPU MEM
+  devm_alloc_percpu()
+  devm_free_percpu()
+
+PCI
+  devm_pci_alloc_host_bridge()  : managed PCI host bridge allocation
+  devm_pci_remap_cfgspace()	: ioremap PCI configuration space
+  devm_pci_remap_cfg_resource()	: ioremap PCI configuration space resource
+  pcim_enable_device()		: after success, all PCI ops become managed
+  pcim_pin_device()		: keep PCI device enabled after release
+
+PHY
+  devm_usb_get_phy()
+  devm_usb_put_phy()
+
+PINCTRL
+  devm_pinctrl_get()
+  devm_pinctrl_put()
+  devm_pinctrl_register()
+  devm_pinctrl_unregister()
+
+POWER
+  devm_reboot_mode_register()
+  devm_reboot_mode_unregister()
+
+PWM
+  devm_pwm_get()
+  devm_pwm_put()
+
+REGULATOR
+  devm_regulator_bulk_get()
+  devm_regulator_get()
+  devm_regulator_put()
+  devm_regulator_register()
+
+RESET
+  devm_reset_control_get()
+  devm_reset_controller_register()
+
+SERDEV
+  devm_serdev_device_open()
+
+SLAVE DMA ENGINE
+  devm_acpi_dma_controller_register()
+
+SPI
+  devm_spi_register_master()
+
+WATCHDOG
+  devm_watchdog_register_device()
diff --git a/Documentation/driver-api/driver-model/driver.rst b/Documentation/driver-api/driver-model/driver.rst
new file mode 100644
index 000000000000..11d281506a04
--- /dev/null
+++ b/Documentation/driver-api/driver-model/driver.rst
@@ -0,0 +1,223 @@
+==============
+Device Drivers
+==============
+
+See the kerneldoc for the struct device_driver.
+
+
+Allocation
+~~~~~~~~~~
+
+Device drivers are statically allocated structures. Though there may
+be multiple devices in a system that a driver supports, struct
+device_driver represents the driver as a whole (not a particular
+device instance).
+
+Initialization
+~~~~~~~~~~~~~~
+
+The driver must initialize at least the name and bus fields. It should
+also initialize the devclass field (when it arrives), so it may obtain
+the proper linkage internally. It should also initialize as many of
+the callbacks as possible, though each is optional.
+
+Declaration
+~~~~~~~~~~~
+
+As stated above, struct device_driver objects are statically
+allocated. Below is an example declaration of the eepro100
+driver. This declaration is hypothetical only; it relies on the driver
+being converted completely to the new model::
+
+  static struct device_driver eepro100_driver = {
+         .name		= "eepro100",
+         .bus		= &pci_bus_type,
+
+         .probe		= eepro100_probe,
+         .remove		= eepro100_remove,
+         .suspend		= eepro100_suspend,
+         .resume		= eepro100_resume,
+  };
+
+Most drivers will not be able to be converted completely to the new
+model because the bus they belong to has a bus-specific structure with
+bus-specific fields that cannot be generalized.
+
+The most common example of this are device ID structures. A driver
+typically defines an array of device IDs that it supports. The format
+of these structures and the semantics for comparing device IDs are
+completely bus-specific. Defining them as bus-specific entities would
+sacrifice type-safety, so we keep bus-specific structures around.
+
+Bus-specific drivers should include a generic struct device_driver in
+the definition of the bus-specific driver. Like this::
+
+  struct pci_driver {
+         const struct pci_device_id *id_table;
+         struct device_driver	  driver;
+  };
+
+A definition that included bus-specific fields would look like
+(using the eepro100 driver again)::
+
+  static struct pci_driver eepro100_driver = {
+         .id_table       = eepro100_pci_tbl,
+         .driver	       = {
+		.name		= "eepro100",
+		.bus		= &pci_bus_type,
+		.probe		= eepro100_probe,
+		.remove		= eepro100_remove,
+		.suspend	= eepro100_suspend,
+		.resume		= eepro100_resume,
+         },
+  };
+
+Some may find the syntax of embedded struct initialization awkward or
+even a bit ugly. So far, it's the best way we've found to do what we want...
+
+Registration
+~~~~~~~~~~~~
+
+::
+
+  int driver_register(struct device_driver *drv);
+
+The driver registers the structure on startup. For drivers that have
+no bus-specific fields (i.e. don't have a bus-specific driver
+structure), they would use driver_register and pass a pointer to their
+struct device_driver object.
+
+Most drivers, however, will have a bus-specific structure and will
+need to register with the bus using something like pci_driver_register.
+
+It is important that drivers register their driver structure as early as
+possible. Registration with the core initializes several fields in the
+struct device_driver object, including the reference count and the
+lock. These fields are assumed to be valid at all times and may be
+used by the device model core or the bus driver.
+
+
+Transition Bus Drivers
+~~~~~~~~~~~~~~~~~~~~~~
+
+By defining wrapper functions, the transition to the new model can be
+made easier. Drivers can ignore the generic structure altogether and
+let the bus wrapper fill in the fields. For the callbacks, the bus can
+define generic callbacks that forward the call to the bus-specific
+callbacks of the drivers.
+
+This solution is intended to be only temporary. In order to get class
+information in the driver, the drivers must be modified anyway. Since
+converting drivers to the new model should reduce some infrastructural
+complexity and code size, it is recommended that they are converted as
+class information is added.
+
+Access
+~~~~~~
+
+Once the object has been registered, it may access the common fields of
+the object, like the lock and the list of devices::
+
+  int driver_for_each_dev(struct device_driver *drv, void *data,
+			  int (*callback)(struct device *dev, void *data));
+
+The devices field is a list of all the devices that have been bound to
+the driver. The LDM core provides a helper function to operate on all
+the devices a driver controls. This helper locks the driver on each
+node access, and does proper reference counting on each device as it
+accesses it.
+
+
+sysfs
+~~~~~
+
+When a driver is registered, a sysfs directory is created in its
+bus's directory. In this directory, the driver can export an interface
+to userspace to control operation of the driver on a global basis;
+e.g. toggling debugging output in the driver.
+
+A future feature of this directory will be a 'devices' directory. This
+directory will contain symlinks to the directories of devices it
+supports.
+
+
+
+Callbacks
+~~~~~~~~~
+
+::
+
+	int	(*probe)	(struct device *dev);
+
+The probe() entry is called in task context, with the bus's rwsem locked
+and the driver partially bound to the device.  Drivers commonly use
+container_of() to convert "dev" to a bus-specific type, both in probe()
+and other routines.  That type often provides device resource data, such
+as pci_dev.resource[] or platform_device.resources, which is used in
+addition to dev->platform_data to initialize the driver.
+
+This callback holds the driver-specific logic to bind the driver to a
+given device.  That includes verifying that the device is present, that
+it's a version the driver can handle, that driver data structures can
+be allocated and initialized, and that any hardware can be initialized.
+Drivers often store a pointer to their state with dev_set_drvdata().
+When the driver has successfully bound itself to that device, then probe()
+returns zero and the driver model code will finish its part of binding
+the driver to that device.
+
+A driver's probe() may return a negative errno value to indicate that
+the driver did not bind to this device, in which case it should have
+released all resources it allocated::
+
+	int 	(*remove)	(struct device *dev);
+
+remove is called to unbind a driver from a device. This may be
+called if a device is physically removed from the system, if the
+driver module is being unloaded, during a reboot sequence, or
+in other cases.
+
+It is up to the driver to determine if the device is present or
+not. It should free any resources allocated specifically for the
+device; i.e. anything in the device's driver_data field.
+
+If the device is still present, it should quiesce the device and place
+it into a supported low-power state::
+
+	int	(*suspend)	(struct device *dev, pm_message_t state);
+
+suspend is called to put the device in a low power state::
+
+	int	(*resume)	(struct device *dev);
+
+Resume is used to bring a device back from a low power state.
+
+
+Attributes
+~~~~~~~~~~
+
+::
+
+  struct driver_attribute {
+          struct attribute        attr;
+          ssize_t (*show)(struct device_driver *driver, char *buf);
+          ssize_t (*store)(struct device_driver *, const char *buf, size_t count);
+  };
+
+Device drivers can export attributes via their sysfs directories.
+Drivers can declare attributes using a DRIVER_ATTR_RW and DRIVER_ATTR_RO
+macro that works identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO
+macros.
+
+Example::
+
+	DRIVER_ATTR_RW(debug);
+
+This is equivalent to declaring::
+
+	struct driver_attribute driver_attr_debug;
+
+This can then be used to add and remove the attribute from the
+driver's directory using::
+
+  int driver_create_file(struct device_driver *, const struct driver_attribute *);
+  void driver_remove_file(struct device_driver *, const struct driver_attribute *);
diff --git a/Documentation/driver-api/driver-model/index.rst b/Documentation/driver-api/driver-model/index.rst
new file mode 100644
index 000000000000..755016422269
--- /dev/null
+++ b/Documentation/driver-api/driver-model/index.rst
@@ -0,0 +1,24 @@
+============
+Driver Model
+============
+
+.. toctree::
+   :maxdepth: 1
+
+   binding
+   bus
+   class
+   design-patterns
+   device
+   devres
+   driver
+   overview
+   platform
+   porting
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/driver-api/driver-model/overview.rst b/Documentation/driver-api/driver-model/overview.rst
new file mode 100644
index 000000000000..d4d1e9b40e0c
--- /dev/null
+++ b/Documentation/driver-api/driver-model/overview.rst
@@ -0,0 +1,124 @@
+=============================
+The Linux Kernel Device Model
+=============================
+
+Patrick Mochel	<mochel@digitalimplant.org>
+
+Drafted 26 August 2002
+Updated 31 January 2006
+
+
+Overview
+~~~~~~~~
+
+The Linux Kernel Driver Model is a unification of all the disparate driver
+models that were previously used in the kernel. It is intended to augment the
+bus-specific drivers for bridges and devices by consolidating a set of data
+and operations into globally accessible data structures.
+
+Traditional driver models implemented some sort of tree-like structure
+(sometimes just a list) for the devices they control. There wasn't any
+uniformity across the different bus types.
+
+The current driver model provides a common, uniform data model for describing
+a bus and the devices that can appear under the bus. The unified bus
+model includes a set of common attributes which all busses carry, and a set
+of common callbacks, such as device discovery during bus probing, bus
+shutdown, bus power management, etc.
+
+The common device and bridge interface reflects the goals of the modern
+computer: namely the ability to do seamless device "plug and play", power
+management, and hot plug. In particular, the model dictated by Intel and
+Microsoft (namely ACPI) ensures that almost every device on almost any bus
+on an x86-compatible system can work within this paradigm.  Of course,
+not every bus is able to support all such operations, although most
+buses support most of those operations.
+
+
+Downstream Access
+~~~~~~~~~~~~~~~~~
+
+Common data fields have been moved out of individual bus layers into a common
+data structure. These fields must still be accessed by the bus layers,
+and sometimes by the device-specific drivers.
+
+Other bus layers are encouraged to do what has been done for the PCI layer.
+struct pci_dev now looks like this::
+
+  struct pci_dev {
+	...
+
+	struct device dev;     /* Generic device interface */
+	...
+  };
+
+Note first that the struct device dev within the struct pci_dev is
+statically allocated. This means only one allocation on device discovery.
+
+Note also that that struct device dev is not necessarily defined at the
+front of the pci_dev structure.  This is to make people think about what
+they're doing when switching between the bus driver and the global driver,
+and to discourage meaningless and incorrect casts between the two.
+
+The PCI bus layer freely accesses the fields of struct device. It knows about
+the structure of struct pci_dev, and it should know the structure of struct
+device. Individual PCI device drivers that have been converted to the current
+driver model generally do not and should not touch the fields of struct device,
+unless there is a compelling reason to do so.
+
+The above abstraction prevents unnecessary pain during transitional phases.
+If it were not done this way, then when a field was renamed or removed, every
+downstream driver would break.  On the other hand, if only the bus layer
+(and not the device layer) accesses the struct device, it is only the bus
+layer that needs to change.
+
+
+User Interface
+~~~~~~~~~~~~~~
+
+By virtue of having a complete hierarchical view of all the devices in the
+system, exporting a complete hierarchical view to userspace becomes relatively
+easy. This has been accomplished by implementing a special purpose virtual
+file system named sysfs.
+
+Almost all mainstream Linux distros mount this filesystem automatically; you
+can see some variation of the following in the output of the "mount" command::
+
+  $ mount
+  ...
+  none on /sys type sysfs (rw,noexec,nosuid,nodev)
+  ...
+  $
+
+The auto-mounting of sysfs is typically accomplished by an entry similar to
+the following in the /etc/fstab file::
+
+  none     	/sys	sysfs    defaults	  	0 0
+
+or something similar in the /lib/init/fstab file on Debian-based systems::
+
+  none            /sys    sysfs    nodev,noexec,nosuid    0 0
+
+If sysfs is not automatically mounted, you can always do it manually with::
+
+	# mount -t sysfs sysfs /sys
+
+Whenever a device is inserted into the tree, a directory is created for it.
+This directory may be populated at each layer of discovery - the global layer,
+the bus layer, or the device layer.
+
+The global layer currently creates two files - 'name' and 'power'. The
+former only reports the name of the device. The latter reports the
+current power state of the device. It will also be used to set the current
+power state.
+
+The bus layer may also create files for the devices it finds while probing the
+bus. For example, the PCI layer currently creates 'irq' and 'resource' files
+for each PCI device.
+
+A device-specific driver may also export files in its directory to expose
+device-specific data or tunable interfaces.
+
+More information about the sysfs directory layout can be found in
+the other documents in this directory and in the file
+Documentation/filesystems/sysfs.txt.
diff --git a/Documentation/driver-api/driver-model/platform.rst b/Documentation/driver-api/driver-model/platform.rst
new file mode 100644
index 000000000000..334dd4071ae4
--- /dev/null
+++ b/Documentation/driver-api/driver-model/platform.rst
@@ -0,0 +1,246 @@
+============================
+Platform Devices and Drivers
+============================
+
+See <linux/platform_device.h> for the driver model interface to the
+platform bus:  platform_device, and platform_driver.  This pseudo-bus
+is used to connect devices on busses with minimal infrastructure,
+like those used to integrate peripherals on many system-on-chip
+processors, or some "legacy" PC interconnects; as opposed to large
+formally specified ones like PCI or USB.
+
+
+Platform devices
+~~~~~~~~~~~~~~~~
+Platform devices are devices that typically appear as autonomous
+entities in the system. This includes legacy port-based devices and
+host bridges to peripheral buses, and most controllers integrated
+into system-on-chip platforms.  What they usually have in common
+is direct addressing from a CPU bus.  Rarely, a platform_device will
+be connected through a segment of some other kind of bus; but its
+registers will still be directly addressable.
+
+Platform devices are given a name, used in driver binding, and a
+list of resources such as addresses and IRQs::
+
+  struct platform_device {
+	const char	*name;
+	u32		id;
+	struct device	dev;
+	u32		num_resources;
+	struct resource	*resource;
+  };
+
+
+Platform drivers
+~~~~~~~~~~~~~~~~
+Platform drivers follow the standard driver model convention, where
+discovery/enumeration is handled outside the drivers, and drivers
+provide probe() and remove() methods.  They support power management
+and shutdown notifications using the standard conventions::
+
+  struct platform_driver {
+	int (*probe)(struct platform_device *);
+	int (*remove)(struct platform_device *);
+	void (*shutdown)(struct platform_device *);
+	int (*suspend)(struct platform_device *, pm_message_t state);
+	int (*suspend_late)(struct platform_device *, pm_message_t state);
+	int (*resume_early)(struct platform_device *);
+	int (*resume)(struct platform_device *);
+	struct device_driver driver;
+  };
+
+Note that probe() should in general verify that the specified device hardware
+actually exists; sometimes platform setup code can't be sure.  The probing
+can use device resources, including clocks, and device platform_data.
+
+Platform drivers register themselves the normal way::
+
+	int platform_driver_register(struct platform_driver *drv);
+
+Or, in common situations where the device is known not to be hot-pluggable,
+the probe() routine can live in an init section to reduce the driver's
+runtime memory footprint::
+
+	int platform_driver_probe(struct platform_driver *drv,
+			  int (*probe)(struct platform_device *))
+
+Kernel modules can be composed of several platform drivers. The platform core
+provides helpers to register and unregister an array of drivers::
+
+	int __platform_register_drivers(struct platform_driver * const *drivers,
+				      unsigned int count, struct module *owner);
+	void platform_unregister_drivers(struct platform_driver * const *drivers,
+					 unsigned int count);
+
+If one of the drivers fails to register, all drivers registered up to that
+point will be unregistered in reverse order. Note that there is a convenience
+macro that passes THIS_MODULE as owner parameter::
+
+	#define platform_register_drivers(drivers, count)
+
+
+Device Enumeration
+~~~~~~~~~~~~~~~~~~
+As a rule, platform specific (and often board-specific) setup code will
+register platform devices::
+
+	int platform_device_register(struct platform_device *pdev);
+
+	int platform_add_devices(struct platform_device **pdevs, int ndev);
+
+The general rule is to register only those devices that actually exist,
+but in some cases extra devices might be registered.  For example, a kernel
+might be configured to work with an external network adapter that might not
+be populated on all boards, or likewise to work with an integrated controller
+that some boards might not hook up to any peripherals.
+
+In some cases, boot firmware will export tables describing the devices
+that are populated on a given board.   Without such tables, often the
+only way for system setup code to set up the correct devices is to build
+a kernel for a specific target board.  Such board-specific kernels are
+common with embedded and custom systems development.
+
+In many cases, the memory and IRQ resources associated with the platform
+device are not enough to let the device's driver work.  Board setup code
+will often provide additional information using the device's platform_data
+field to hold additional information.
+
+Embedded systems frequently need one or more clocks for platform devices,
+which are normally kept off until they're actively needed (to save power).
+System setup also associates those clocks with the device, so that that
+calls to clk_get(&pdev->dev, clock_name) return them as needed.
+
+
+Legacy Drivers:  Device Probing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Some drivers are not fully converted to the driver model, because they take
+on a non-driver role:  the driver registers its platform device, rather than
+leaving that for system infrastructure.  Such drivers can't be hotplugged
+or coldplugged, since those mechanisms require device creation to be in a
+different system component than the driver.
+
+The only "good" reason for this is to handle older system designs which, like
+original IBM PCs, rely on error-prone "probe-the-hardware" models for hardware
+configuration.  Newer systems have largely abandoned that model, in favor of
+bus-level support for dynamic configuration (PCI, USB), or device tables
+provided by the boot firmware (e.g. PNPACPI on x86).  There are too many
+conflicting options about what might be where, and even educated guesses by
+an operating system will be wrong often enough to make trouble.
+
+This style of driver is discouraged.  If you're updating such a driver,
+please try to move the device enumeration to a more appropriate location,
+outside the driver.  This will usually be cleanup, since such drivers
+tend to already have "normal" modes, such as ones using device nodes that
+were created by PNP or by platform device setup.
+
+None the less, there are some APIs to support such legacy drivers.  Avoid
+using these calls except with such hotplug-deficient drivers::
+
+	struct platform_device *platform_device_alloc(
+			const char *name, int id);
+
+You can use platform_device_alloc() to dynamically allocate a device, which
+you will then initialize with resources and platform_device_register().
+A better solution is usually::
+
+	struct platform_device *platform_device_register_simple(
+			const char *name, int id,
+			struct resource *res, unsigned int nres);
+
+You can use platform_device_register_simple() as a one-step call to allocate
+and register a device.
+
+
+Device Naming and Driver Binding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The platform_device.dev.bus_id is the canonical name for the devices.
+It's built from two components:
+
+    * platform_device.name ... which is also used to for driver matching.
+
+    * platform_device.id ... the device instance number, or else "-1"
+      to indicate there's only one.
+
+These are concatenated, so name/id "serial"/0 indicates bus_id "serial.0", and
+"serial/3" indicates bus_id "serial.3"; both would use the platform_driver
+named "serial".  While "my_rtc"/-1 would be bus_id "my_rtc" (no instance id)
+and use the platform_driver called "my_rtc".
+
+Driver binding is performed automatically by the driver core, invoking
+driver probe() after finding a match between device and driver.  If the
+probe() succeeds, the driver and device are bound as usual.  There are
+three different ways to find such a match:
+
+    - Whenever a device is registered, the drivers for that bus are
+      checked for matches.  Platform devices should be registered very
+      early during system boot.
+
+    - When a driver is registered using platform_driver_register(), all
+      unbound devices on that bus are checked for matches.  Drivers
+      usually register later during booting, or by module loading.
+
+    - Registering a driver using platform_driver_probe() works just like
+      using platform_driver_register(), except that the driver won't
+      be probed later if another device registers.  (Which is OK, since
+      this interface is only for use with non-hotpluggable devices.)
+
+
+Early Platform Devices and Drivers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The early platform interfaces provide platform data to platform device
+drivers early on during the system boot. The code is built on top of the
+early_param() command line parsing and can be executed very early on.
+
+Example: "earlyprintk" class early serial console in 6 steps
+
+1. Registering early platform device data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code registers platform device data using the function
+early_platform_add_devices(). In the case of early serial console this
+should be hardware configuration for the serial port. Devices registered
+at this point will later on be matched against early platform drivers.
+
+2. Parsing kernel command line
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code calls parse_early_param() to parse the kernel
+command line. This will execute all matching early_param() callbacks.
+User specified early platform devices will be registered at this point.
+For the early serial console case the user can specify port on the
+kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
+the class string, "serial" is the name of the platform driver and
+0 is the platform device id. If the id is -1 then the dot and the
+id can be omitted.
+
+3. Installing early platform drivers belonging to a certain class
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code may optionally force registration of all early
+platform drivers belonging to a certain class using the function
+early_platform_driver_register_all(). User specified devices from
+step 2 have priority over these. This step is omitted by the serial
+driver example since the early serial driver code should be disabled
+unless the user has specified port on the kernel command line.
+
+4. Early platform driver registration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Compiled-in platform drivers making use of early_platform_init() are
+automatically registered during step 2 or 3. The serial driver example
+should use early_platform_init("earlyprintk", &platform_driver).
+
+5. Probing of early platform drivers belonging to a certain class
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code calls early_platform_driver_probe() to match
+registered early platform devices associated with a certain class with
+registered early platform drivers. Matched devices will get probed().
+This step can be executed at any point during the early boot. As soon
+as possible may be good for the serial port case.
+
+6. Inside the early platform driver probe()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The driver code needs to take special care during early boot, especially
+when it comes to memory allocation and interrupt registration. The code
+in the probe() function can use is_early_platform_device() to check if
+it is called at early platform device or at the regular platform device
+time. The early serial driver performs register_console() at this point.
+
+For further information, see <linux/platform_device.h>.
diff --git a/Documentation/driver-api/driver-model/porting.rst b/Documentation/driver-api/driver-model/porting.rst
new file mode 100644
index 000000000000..931ea879af3f
--- /dev/null
+++ b/Documentation/driver-api/driver-model/porting.rst
@@ -0,0 +1,448 @@
+=======================================
+Porting Drivers to the New Driver Model
+=======================================
+
+Patrick Mochel
+
+7 January 2003
+
+
+Overview
+
+Please refer to `Documentation/driver-api/driver-model/*.rst` for definitions of
+various driver types and concepts.
+
+Most of the work of porting devices drivers to the new model happens
+at the bus driver layer. This was intentional, to minimize the
+negative effect on kernel drivers, and to allow a gradual transition
+of bus drivers.
+
+In a nutshell, the driver model consists of a set of objects that can
+be embedded in larger, bus-specific objects. Fields in these generic
+objects can replace fields in the bus-specific objects.
+
+The generic objects must be registered with the driver model core. By
+doing so, they will exported via the sysfs filesystem. sysfs can be
+mounted by doing::
+
+	# mount -t sysfs sysfs /sys
+
+
+
+The Process
+
+Step 0: Read include/linux/device.h for object and function definitions.
+
+Step 1: Registering the bus driver.
+
+
+- Define a struct bus_type for the bus driver::
+
+    struct bus_type pci_bus_type = {
+          .name           = "pci",
+    };
+
+
+- Register the bus type.
+
+  This should be done in the initialization function for the bus type,
+  which is usually the module_init(), or equivalent, function::
+
+    static int __init pci_driver_init(void)
+    {
+            return bus_register(&pci_bus_type);
+    }
+
+    subsys_initcall(pci_driver_init);
+
+
+  The bus type may be unregistered (if the bus driver may be compiled
+  as a module) by doing::
+
+     bus_unregister(&pci_bus_type);
+
+
+- Export the bus type for others to use.
+
+  Other code may wish to reference the bus type, so declare it in a
+  shared header file and export the symbol.
+
+From include/linux/pci.h::
+
+  extern struct bus_type pci_bus_type;
+
+
+From file the above code appears in::
+
+  EXPORT_SYMBOL(pci_bus_type);
+
+
+
+- This will cause the bus to show up in /sys/bus/pci/ with two
+  subdirectories: 'devices' and 'drivers'::
+
+    # tree -d /sys/bus/pci/
+    /sys/bus/pci/
+    |-- devices
+    `-- drivers
+
+
+
+Step 2: Registering Devices.
+
+struct device represents a single device. It mainly contains metadata
+describing the relationship the device has to other entities.
+
+
+- Embed a struct device in the bus-specific device type::
+
+
+    struct pci_dev {
+           ...
+           struct  device  dev;            /* Generic device interface */
+           ...
+    };
+
+  It is recommended that the generic device not be the first item in
+  the struct to discourage programmers from doing mindless casts
+  between the object types. Instead macros, or inline functions,
+  should be created to convert from the generic object type::
+
+
+    #define to_pci_dev(n) container_of(n, struct pci_dev, dev)
+
+    or
+
+    static inline struct pci_dev * to_pci_dev(struct kobject * kobj)
+    {
+	return container_of(n, struct pci_dev, dev);
+    }
+
+  This allows the compiler to verify type-safety of the operations
+  that are performed (which is Good).
+
+
+- Initialize the device on registration.
+
+  When devices are discovered or registered with the bus type, the
+  bus driver should initialize the generic device. The most important
+  things to initialize are the bus_id, parent, and bus fields.
+
+  The bus_id is an ASCII string that contains the device's address on
+  the bus. The format of this string is bus-specific. This is
+  necessary for representing devices in sysfs.
+
+  parent is the physical parent of the device. It is important that
+  the bus driver sets this field correctly.
+
+  The driver model maintains an ordered list of devices that it uses
+  for power management. This list must be in order to guarantee that
+  devices are shutdown before their physical parents, and vice versa.
+  The order of this list is determined by the parent of registered
+  devices.
+
+  Also, the location of the device's sysfs directory depends on a
+  device's parent. sysfs exports a directory structure that mirrors
+  the device hierarchy. Accurately setting the parent guarantees that
+  sysfs will accurately represent the hierarchy.
+
+  The device's bus field is a pointer to the bus type the device
+  belongs to. This should be set to the bus_type that was declared
+  and initialized before.
+
+  Optionally, the bus driver may set the device's name and release
+  fields.
+
+  The name field is an ASCII string describing the device, like
+
+     "ATI Technologies Inc Radeon QD"
+
+  The release field is a callback that the driver model core calls
+  when the device has been removed, and all references to it have
+  been released. More on this in a moment.
+
+
+- Register the device.
+
+  Once the generic device has been initialized, it can be registered
+  with the driver model core by doing::
+
+       device_register(&dev->dev);
+
+  It can later be unregistered by doing::
+
+       device_unregister(&dev->dev);
+
+  This should happen on buses that support hotpluggable devices.
+  If a bus driver unregisters a device, it should not immediately free
+  it. It should instead wait for the driver model core to call the
+  device's release method, then free the bus-specific object.
+  (There may be other code that is currently referencing the device
+  structure, and it would be rude to free the device while that is
+  happening).
+
+
+  When the device is registered, a directory in sysfs is created.
+  The PCI tree in sysfs looks like::
+
+    /sys/devices/pci0/
+    |-- 00:00.0
+    |-- 00:01.0
+    |   `-- 01:00.0
+    |-- 00:02.0
+    |   `-- 02:1f.0
+    |       `-- 03:00.0
+    |-- 00:1e.0
+    |   `-- 04:04.0
+    |-- 00:1f.0
+    |-- 00:1f.1
+    |   |-- ide0
+    |   |   |-- 0.0
+    |   |   `-- 0.1
+    |   `-- ide1
+    |       `-- 1.0
+    |-- 00:1f.2
+    |-- 00:1f.3
+    `-- 00:1f.5
+
+  Also, symlinks are created in the bus's 'devices' directory
+  that point to the device's directory in the physical hierarchy::
+
+    /sys/bus/pci/devices/
+    |-- 00:00.0 -> ../../../devices/pci0/00:00.0
+    |-- 00:01.0 -> ../../../devices/pci0/00:01.0
+    |-- 00:02.0 -> ../../../devices/pci0/00:02.0
+    |-- 00:1e.0 -> ../../../devices/pci0/00:1e.0
+    |-- 00:1f.0 -> ../../../devices/pci0/00:1f.0
+    |-- 00:1f.1 -> ../../../devices/pci0/00:1f.1
+    |-- 00:1f.2 -> ../../../devices/pci0/00:1f.2
+    |-- 00:1f.3 -> ../../../devices/pci0/00:1f.3
+    |-- 00:1f.5 -> ../../../devices/pci0/00:1f.5
+    |-- 01:00.0 -> ../../../devices/pci0/00:01.0/01:00.0
+    |-- 02:1f.0 -> ../../../devices/pci0/00:02.0/02:1f.0
+    |-- 03:00.0 -> ../../../devices/pci0/00:02.0/02:1f.0/03:00.0
+    `-- 04:04.0 -> ../../../devices/pci0/00:1e.0/04:04.0
+
+
+
+Step 3: Registering Drivers.
+
+struct device_driver is a simple driver structure that contains a set
+of operations that the driver model core may call.
+
+
+- Embed a struct device_driver in the bus-specific driver.
+
+  Just like with devices, do something like::
+
+    struct pci_driver {
+           ...
+           struct device_driver    driver;
+    };
+
+
+- Initialize the generic driver structure.
+
+  When the driver registers with the bus (e.g. doing pci_register_driver()),
+  initialize the necessary fields of the driver: the name and bus
+  fields.
+
+
+- Register the driver.
+
+  After the generic driver has been initialized, call::
+
+	driver_register(&drv->driver);
+
+  to register the driver with the core.
+
+  When the driver is unregistered from the bus, unregister it from the
+  core by doing::
+
+        driver_unregister(&drv->driver);
+
+  Note that this will block until all references to the driver have
+  gone away. Normally, there will not be any.
+
+
+- Sysfs representation.
+
+  Drivers are exported via sysfs in their bus's 'driver's directory.
+  For example::
+
+    /sys/bus/pci/drivers/
+    |-- 3c59x
+    |-- Ensoniq AudioPCI
+    |-- agpgart-amdk7
+    |-- e100
+    `-- serial
+
+
+Step 4: Define Generic Methods for Drivers.
+
+struct device_driver defines a set of operations that the driver model
+core calls. Most of these operations are probably similar to
+operations the bus already defines for drivers, but taking different
+parameters.
+
+It would be difficult and tedious to force every driver on a bus to
+simultaneously convert their drivers to generic format. Instead, the
+bus driver should define single instances of the generic methods that
+forward call to the bus-specific drivers. For instance::
+
+
+  static int pci_device_remove(struct device * dev)
+  {
+          struct pci_dev * pci_dev = to_pci_dev(dev);
+          struct pci_driver * drv = pci_dev->driver;
+
+          if (drv) {
+                  if (drv->remove)
+                          drv->remove(pci_dev);
+                  pci_dev->driver = NULL;
+          }
+          return 0;
+  }
+
+
+The generic driver should be initialized with these methods before it
+is registered::
+
+        /* initialize common driver fields */
+        drv->driver.name = drv->name;
+        drv->driver.bus = &pci_bus_type;
+        drv->driver.probe = pci_device_probe;
+        drv->driver.resume = pci_device_resume;
+        drv->driver.suspend = pci_device_suspend;
+        drv->driver.remove = pci_device_remove;
+
+        /* register with core */
+        driver_register(&drv->driver);
+
+
+Ideally, the bus should only initialize the fields if they are not
+already set. This allows the drivers to implement their own generic
+methods.
+
+
+Step 5: Support generic driver binding.
+
+The model assumes that a device or driver can be dynamically
+registered with the bus at any time. When registration happens,
+devices must be bound to a driver, or drivers must be bound to all
+devices that it supports.
+
+A driver typically contains a list of device IDs that it supports. The
+bus driver compares these IDs to the IDs of devices registered with it.
+The format of the device IDs, and the semantics for comparing them are
+bus-specific, so the generic model does attempt to generalize them.
+
+Instead, a bus may supply a method in struct bus_type that does the
+comparison::
+
+  int (*match)(struct device * dev, struct device_driver * drv);
+
+match should return positive value if the driver supports the device,
+and zero otherwise. It may also return error code (for example
+-EPROBE_DEFER) if determining that given driver supports the device is
+not possible.
+
+When a device is registered, the bus's list of drivers is iterated
+over. bus->match() is called for each one until a match is found.
+
+When a driver is registered, the bus's list of devices is iterated
+over. bus->match() is called for each device that is not already
+claimed by a driver.
+
+When a device is successfully bound to a driver, device->driver is
+set, the device is added to a per-driver list of devices, and a
+symlink is created in the driver's sysfs directory that points to the
+device's physical directory::
+
+  /sys/bus/pci/drivers/
+  |-- 3c59x
+  |   `-- 00:0b.0 -> ../../../../devices/pci0/00:0b.0
+  |-- Ensoniq AudioPCI
+  |-- agpgart-amdk7
+  |   `-- 00:00.0 -> ../../../../devices/pci0/00:00.0
+  |-- e100
+  |   `-- 00:0c.0 -> ../../../../devices/pci0/00:0c.0
+  `-- serial
+
+
+This driver binding should replace the existing driver binding
+mechanism the bus currently uses.
+
+
+Step 6: Supply a hotplug callback.
+
+Whenever a device is registered with the driver model core, the
+userspace program /sbin/hotplug is called to notify userspace.
+Users can define actions to perform when a device is inserted or
+removed.
+
+The driver model core passes several arguments to userspace via
+environment variables, including
+
+- ACTION: set to 'add' or 'remove'
+- DEVPATH: set to the device's physical path in sysfs.
+
+A bus driver may also supply additional parameters for userspace to
+consume. To do this, a bus must implement the 'hotplug' method in
+struct bus_type::
+
+     int (*hotplug) (struct device *dev, char **envp,
+                     int num_envp, char *buffer, int buffer_size);
+
+This is called immediately before /sbin/hotplug is executed.
+
+
+Step 7: Cleaning up the bus driver.
+
+The generic bus, device, and driver structures provide several fields
+that can replace those defined privately to the bus driver.
+
+- Device list.
+
+struct bus_type contains a list of all devices registered with the bus
+type. This includes all devices on all instances of that bus type.
+An internal list that the bus uses may be removed, in favor of using
+this one.
+
+The core provides an iterator to access these devices::
+
+  int bus_for_each_dev(struct bus_type * bus, struct device * start,
+                       void * data, int (*fn)(struct device *, void *));
+
+
+- Driver list.
+
+struct bus_type also contains a list of all drivers registered with
+it. An internal list of drivers that the bus driver maintains may
+be removed in favor of using the generic one.
+
+The drivers may be iterated over, like devices::
+
+  int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
+                       void * data, int (*fn)(struct device_driver *, void *));
+
+
+Please see drivers/base/bus.c for more information.
+
+
+- rwsem
+
+struct bus_type contains an rwsem that protects all core accesses to
+the device and driver lists. This can be used by the bus driver
+internally, and should be used when accessing the device or driver
+lists the bus maintains.
+
+
+- Device and driver fields.
+
+Some of the fields in struct device and struct device_driver duplicate
+fields in the bus-specific representations of these objects. Feel free
+to remove the bus-specific ones and favor the generic ones. Note
+though, that this will likely mean fixing up all the drivers that
+reference the bus-specific fields (though those should all be 1-line
+changes).
diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst
index 349f2dc33029..921c71a3d683 100644
--- a/Documentation/driver-api/gpio/driver.rst
+++ b/Documentation/driver-api/gpio/driver.rst
@@ -399,7 +399,7 @@ symbol:
   will pass the struct gpio_chip* for the chip to all IRQ callbacks, so the
   callbacks need to embed the gpio_chip in its state container and obtain a
   pointer to the container using container_of().
-  (See Documentation/driver-model/design-patterns.rst)
+  (See Documentation/driver-api/driver-model/design-patterns.rst)
 
 - gpiochip_irqchip_add_nested(): adds a nested cascaded irqchip to a gpiochip,
   as discussed above regarding different types of cascaded irqchips. The
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index b4c993ff7655..9fb03b7bdeb1 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -14,6 +14,7 @@ available subsections can be seen below.
 .. toctree::
    :maxdepth: 2
 
+   driver-model/index
    basics
    infrastructure
    early-userspace/index
diff --git a/Documentation/driver-model/binding.rst b/Documentation/driver-model/binding.rst
deleted file mode 100644
index 7ea1d7a41e1d..000000000000
--- a/Documentation/driver-model/binding.rst
+++ /dev/null
@@ -1,98 +0,0 @@
-==============
-Driver Binding
-==============
-
-Driver binding is the process of associating a device with a device
-driver that can control it. Bus drivers have typically handled this
-because there have been bus-specific structures to represent the
-devices and the drivers. With generic device and device driver
-structures, most of the binding can take place using common code.
-
-
-Bus
-~~~
-
-The bus type structure contains a list of all devices that are on that bus
-type in the system. When device_register is called for a device, it is
-inserted into the end of this list. The bus object also contains a
-list of all drivers of that bus type. When driver_register is called
-for a driver, it is inserted at the end of this list. These are the
-two events which trigger driver binding.
-
-
-device_register
-~~~~~~~~~~~~~~~
-
-When a new device is added, the bus's list of drivers is iterated over
-to find one that supports it. In order to determine that, the device
-ID of the device must match one of the device IDs that the driver
-supports. The format and semantics for comparing IDs is bus-specific.
-Instead of trying to derive a complex state machine and matching
-algorithm, it is up to the bus driver to provide a callback to compare
-a device against the IDs of a driver. The bus returns 1 if a match was
-found; 0 otherwise.
-
-int match(struct device * dev, struct device_driver * drv);
-
-If a match is found, the device's driver field is set to the driver
-and the driver's probe callback is called. This gives the driver a
-chance to verify that it really does support the hardware, and that
-it's in a working state.
-
-Device Class
-~~~~~~~~~~~~
-
-Upon the successful completion of probe, the device is registered with
-the class to which it belongs. Device drivers belong to one and only one
-class, and that is set in the driver's devclass field.
-devclass_add_device is called to enumerate the device within the class
-and actually register it with the class, which happens with the
-class's register_dev callback.
-
-
-Driver
-~~~~~~
-
-When a driver is attached to a device, the device is inserted into the
-driver's list of devices.
-
-
-sysfs
-~~~~~
-
-A symlink is created in the bus's 'devices' directory that points to
-the device's directory in the physical hierarchy.
-
-A symlink is created in the driver's 'devices' directory that points
-to the device's directory in the physical hierarchy.
-
-A directory for the device is created in the class's directory. A
-symlink is created in that directory that points to the device's
-physical location in the sysfs tree.
-
-A symlink can be created (though this isn't done yet) in the device's
-physical directory to either its class directory, or the class's
-top-level directory. One can also be created to point to its driver's
-directory also.
-
-
-driver_register
-~~~~~~~~~~~~~~~
-
-The process is almost identical for when a new driver is added.
-The bus's list of devices is iterated over to find a match. Devices
-that already have a driver are skipped. All the devices are iterated
-over, to bind as many devices as possible to the driver.
-
-
-Removal
-~~~~~~~
-
-When a device is removed, the reference count for it will eventually
-go to 0. When it does, the remove callback of the driver is called. It
-is removed from the driver's list of devices and the reference count
-of the driver is decremented. All symlinks between the two are removed.
-
-When a driver is removed, the list of devices that it supports is
-iterated over, and the driver's remove callback is called for each
-one. The device is removed from that list and the symlinks removed.
diff --git a/Documentation/driver-model/bus.rst b/Documentation/driver-model/bus.rst
deleted file mode 100644
index 016b15a6e8ea..000000000000
--- a/Documentation/driver-model/bus.rst
+++ /dev/null
@@ -1,146 +0,0 @@
-=========
-Bus Types
-=========
-
-Definition
-~~~~~~~~~~
-See the kerneldoc for the struct bus_type.
-
-int bus_register(struct bus_type * bus);
-
-
-Declaration
-~~~~~~~~~~~
-
-Each bus type in the kernel (PCI, USB, etc) should declare one static
-object of this type. They must initialize the name field, and may
-optionally initialize the match callback::
-
-   struct bus_type pci_bus_type = {
-          .name	= "pci",
-          .match	= pci_bus_match,
-   };
-
-The structure should be exported to drivers in a header file:
-
-extern struct bus_type pci_bus_type;
-
-
-Registration
-~~~~~~~~~~~~
-
-When a bus driver is initialized, it calls bus_register. This
-initializes the rest of the fields in the bus object and inserts it
-into a global list of bus types. Once the bus object is registered,
-the fields in it are usable by the bus driver.
-
-
-Callbacks
-~~~~~~~~~
-
-match(): Attaching Drivers to Devices
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The format of device ID structures and the semantics for comparing
-them are inherently bus-specific. Drivers typically declare an array
-of device IDs of devices they support that reside in a bus-specific
-driver structure.
-
-The purpose of the match callback is to give the bus an opportunity to
-determine if a particular driver supports a particular device by
-comparing the device IDs the driver supports with the device ID of a
-particular device, without sacrificing bus-specific functionality or
-type-safety.
-
-When a driver is registered with the bus, the bus's list of devices is
-iterated over, and the match callback is called for each device that
-does not have a driver associated with it.
-
-
-
-Device and Driver Lists
-~~~~~~~~~~~~~~~~~~~~~~~
-
-The lists of devices and drivers are intended to replace the local
-lists that many buses keep. They are lists of struct devices and
-struct device_drivers, respectively. Bus drivers are free to use the
-lists as they please, but conversion to the bus-specific type may be
-necessary.
-
-The LDM core provides helper functions for iterating over each list::
-
-  int bus_for_each_dev(struct bus_type * bus, struct device * start,
-		       void * data,
-		       int (*fn)(struct device *, void *));
-
-  int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
-		       void * data, int (*fn)(struct device_driver *, void *));
-
-These helpers iterate over the respective list, and call the callback
-for each device or driver in the list. All list accesses are
-synchronized by taking the bus's lock (read currently). The reference
-count on each object in the list is incremented before the callback is
-called; it is decremented after the next object has been obtained. The
-lock is not held when calling the callback.
-
-
-sysfs
-~~~~~~~~
-There is a top-level directory named 'bus'.
-
-Each bus gets a directory in the bus directory, along with two default
-directories::
-
-	/sys/bus/pci/
-	|-- devices
-	`-- drivers
-
-Drivers registered with the bus get a directory in the bus's drivers
-directory::
-
-	/sys/bus/pci/
-	|-- devices
-	`-- drivers
-	    |-- Intel ICH
-	    |-- Intel ICH Joystick
-	    |-- agpgart
-	    `-- e100
-
-Each device that is discovered on a bus of that type gets a symlink in
-the bus's devices directory to the device's directory in the physical
-hierarchy::
-
-	/sys/bus/pci/
-	|-- devices
-	|   |-- 00:00.0 -> ../../../root/pci0/00:00.0
-	|   |-- 00:01.0 -> ../../../root/pci0/00:01.0
-	|   `-- 00:02.0 -> ../../../root/pci0/00:02.0
-	`-- drivers
-
-
-Exporting Attributes
-~~~~~~~~~~~~~~~~~~~~
-
-::
-
-  struct bus_attribute {
-	struct attribute	attr;
-	ssize_t (*show)(struct bus_type *, char * buf);
-	ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
-  };
-
-Bus drivers can export attributes using the BUS_ATTR_RW macro that works
-similarly to the DEVICE_ATTR_RW macro for devices. For example, a
-definition like this::
-
-	static BUS_ATTR_RW(debug);
-
-is equivalent to declaring::
-
-	static bus_attribute bus_attr_debug;
-
-This can then be used to add and remove the attribute from the bus's
-sysfs directory using::
-
-	int bus_create_file(struct bus_type *, struct bus_attribute *);
-	void bus_remove_file(struct bus_type *, struct bus_attribute *);
diff --git a/Documentation/driver-model/class.rst b/Documentation/driver-model/class.rst
deleted file mode 100644
index fff55b80e86a..000000000000
--- a/Documentation/driver-model/class.rst
+++ /dev/null
@@ -1,149 +0,0 @@
-==============
-Device Classes
-==============
-
-Introduction
-~~~~~~~~~~~~
-A device class describes a type of device, like an audio or network
-device. The following device classes have been identified:
-
-<Insert List of Device Classes Here>
-
-
-Each device class defines a set of semantics and a programming interface
-that devices of that class adhere to. Device drivers are the
-implementation of that programming interface for a particular device on
-a particular bus.
-
-Device classes are agnostic with respect to what bus a device resides
-on.
-
-
-Programming Interface
-~~~~~~~~~~~~~~~~~~~~~
-The device class structure looks like::
-
-
-  typedef int (*devclass_add)(struct device *);
-  typedef void (*devclass_remove)(struct device *);
-
-See the kerneldoc for the struct class.
-
-A typical device class definition would look like::
-
-  struct device_class input_devclass = {
-        .name		= "input",
-        .add_device	= input_add_device,
-	.remove_device	= input_remove_device,
-  };
-
-Each device class structure should be exported in a header file so it
-can be used by drivers, extensions and interfaces.
-
-Device classes are registered and unregistered with the core using::
-
-  int devclass_register(struct device_class * cls);
-  void devclass_unregister(struct device_class * cls);
-
-
-Devices
-~~~~~~~
-As devices are bound to drivers, they are added to the device class
-that the driver belongs to. Before the driver model core, this would
-typically happen during the driver's probe() callback, once the device
-has been initialized. It now happens after the probe() callback
-finishes from the core.
-
-The device is enumerated in the class. Each time a device is added to
-the class, the class's devnum field is incremented and assigned to the
-device. The field is never decremented, so if the device is removed
-from the class and re-added, it will receive a different enumerated
-value.
-
-The class is allowed to create a class-specific structure for the
-device and store it in the device's class_data pointer.
-
-There is no list of devices in the device class. Each driver has a
-list of devices that it supports. The device class has a list of
-drivers of that particular class. To access all of the devices in the
-class, iterate over the device lists of each driver in the class.
-
-
-Device Drivers
-~~~~~~~~~~~~~~
-Device drivers are added to device classes when they are registered
-with the core. A driver specifies the class it belongs to by setting
-the struct device_driver::devclass field.
-
-
-sysfs directory structure
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-There is a top-level sysfs directory named 'class'.
-
-Each class gets a directory in the class directory, along with two
-default subdirectories::
-
-        class/
-        `-- input
-            |-- devices
-            `-- drivers
-
-
-Drivers registered with the class get a symlink in the drivers/ directory
-that points to the driver's directory (under its bus directory)::
-
-   class/
-   `-- input
-       |-- devices
-       `-- drivers
-           `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/
-
-
-Each device gets a symlink in the devices/ directory that points to the
-device's directory in the physical hierarchy::
-
-   class/
-   `-- input
-       |-- devices
-       |   `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
-       `-- drivers
-
-
-Exporting Attributes
-~~~~~~~~~~~~~~~~~~~~
-
-::
-
-  struct devclass_attribute {
-        struct attribute        attr;
-        ssize_t (*show)(struct device_class *, char * buf, size_t count, loff_t off);
-        ssize_t (*store)(struct device_class *, const char * buf, size_t count, loff_t off);
-  };
-
-Class drivers can export attributes using the DEVCLASS_ATTR macro that works
-similarly to the DEVICE_ATTR macro for devices. For example, a definition
-like this::
-
-  static DEVCLASS_ATTR(debug,0644,show_debug,store_debug);
-
-is equivalent to declaring::
-
-  static devclass_attribute devclass_attr_debug;
-
-The bus driver can add and remove the attribute from the class's
-sysfs directory using::
-
-  int devclass_create_file(struct device_class *, struct devclass_attribute *);
-  void devclass_remove_file(struct device_class *, struct devclass_attribute *);
-
-In the example above, the file will be named 'debug' in placed in the
-class's directory in sysfs.
-
-
-Interfaces
-~~~~~~~~~~
-There may exist multiple mechanisms for accessing the same device of a
-particular class type. Device interfaces describe these mechanisms.
-
-When a device is added to a device class, the core attempts to add it
-to every interface that is registered with the device class.
diff --git a/Documentation/driver-model/design-patterns.rst b/Documentation/driver-model/design-patterns.rst
deleted file mode 100644
index 41eb8f41f7dd..000000000000
--- a/Documentation/driver-model/design-patterns.rst
+++ /dev/null
@@ -1,116 +0,0 @@
-=============================
-Device Driver Design Patterns
-=============================
-
-This document describes a few common design patterns found in device drivers.
-It is likely that subsystem maintainers will ask driver developers to
-conform to these design patterns.
-
-1. State Container
-2. container_of()
-
-
-1. State Container
-~~~~~~~~~~~~~~~~~~
-
-While the kernel contains a few device drivers that assume that they will
-only be probed() once on a certain system (singletons), it is custom to assume
-that the device the driver binds to will appear in several instances. This
-means that the probe() function and all callbacks need to be reentrant.
-
-The most common way to achieve this is to use the state container design
-pattern. It usually has this form::
-
-  struct foo {
-      spinlock_t lock; /* Example member */
-      (...)
-  };
-
-  static int foo_probe(...)
-  {
-      struct foo *foo;
-
-      foo = devm_kzalloc(dev, sizeof(*foo), GFP_KERNEL);
-      if (!foo)
-          return -ENOMEM;
-      spin_lock_init(&foo->lock);
-      (...)
-  }
-
-This will create an instance of struct foo in memory every time probe() is
-called. This is our state container for this instance of the device driver.
-Of course it is then necessary to always pass this instance of the
-state around to all functions that need access to the state and its members.
-
-For example, if the driver is registering an interrupt handler, you would
-pass around a pointer to struct foo like this::
-
-  static irqreturn_t foo_handler(int irq, void *arg)
-  {
-      struct foo *foo = arg;
-      (...)
-  }
-
-  static int foo_probe(...)
-  {
-      struct foo *foo;
-
-      (...)
-      ret = request_irq(irq, foo_handler, 0, "foo", foo);
-  }
-
-This way you always get a pointer back to the correct instance of foo in
-your interrupt handler.
-
-
-2. container_of()
-~~~~~~~~~~~~~~~~~
-
-Continuing on the above example we add an offloaded work::
-
-  struct foo {
-      spinlock_t lock;
-      struct workqueue_struct *wq;
-      struct work_struct offload;
-      (...)
-  };
-
-  static void foo_work(struct work_struct *work)
-  {
-      struct foo *foo = container_of(work, struct foo, offload);
-
-      (...)
-  }
-
-  static irqreturn_t foo_handler(int irq, void *arg)
-  {
-      struct foo *foo = arg;
-
-      queue_work(foo->wq, &foo->offload);
-      (...)
-  }
-
-  static int foo_probe(...)
-  {
-      struct foo *foo;
-
-      foo->wq = create_singlethread_workqueue("foo-wq");
-      INIT_WORK(&foo->offload, foo_work);
-      (...)
-  }
-
-The design pattern is the same for an hrtimer or something similar that will
-return a single argument which is a pointer to a struct member in the
-callback.
-
-container_of() is a macro defined in <linux/kernel.h>
-
-What container_of() does is to obtain a pointer to the containing struct from
-a pointer to a member by a simple subtraction using the offsetof() macro from
-standard C, which allows something similar to object oriented behaviours.
-Notice that the contained member must not be a pointer, but an actual member
-for this to work.
-
-We can see here that we avoid having global pointers to our struct foo *
-instance this way, while still keeping the number of parameters passed to the
-work function to a single pointer.
diff --git a/Documentation/driver-model/device.rst b/Documentation/driver-model/device.rst
deleted file mode 100644
index 2b868d49d349..000000000000
--- a/Documentation/driver-model/device.rst
+++ /dev/null
@@ -1,109 +0,0 @@
-==========================
-The Basic Device Structure
-==========================
-
-See the kerneldoc for the struct device.
-
-
-Programming Interface
-~~~~~~~~~~~~~~~~~~~~~
-The bus driver that discovers the device uses this to register the
-device with the core::
-
-  int device_register(struct device * dev);
-
-The bus should initialize the following fields:
-
-    - parent
-    - name
-    - bus_id
-    - bus
-
-A device is removed from the core when its reference count goes to
-0. The reference count can be adjusted using::
-
-  struct device * get_device(struct device * dev);
-  void put_device(struct device * dev);
-
-get_device() will return a pointer to the struct device passed to it
-if the reference is not already 0 (if it's in the process of being
-removed already).
-
-A driver can access the lock in the device structure using::
-
-  void lock_device(struct device * dev);
-  void unlock_device(struct device * dev);
-
-
-Attributes
-~~~~~~~~~~
-
-::
-
-  struct device_attribute {
-	struct attribute	attr;
-	ssize_t (*show)(struct device *dev, struct device_attribute *attr,
-			char *buf);
-	ssize_t (*store)(struct device *dev, struct device_attribute *attr,
-			 const char *buf, size_t count);
-  };
-
-Attributes of devices can be exported by a device driver through sysfs.
-
-Please see Documentation/filesystems/sysfs.txt for more information
-on how sysfs works.
-
-As explained in Documentation/kobject.txt, device attributes must be
-created before the KOBJ_ADD uevent is generated. The only way to realize
-that is by defining an attribute group.
-
-Attributes are declared using a macro called DEVICE_ATTR::
-
-  #define DEVICE_ATTR(name,mode,show,store)
-
-Example:::
-
-  static DEVICE_ATTR(type, 0444, show_type, NULL);
-  static DEVICE_ATTR(power, 0644, show_power, store_power);
-
-This declares two structures of type struct device_attribute with respective
-names 'dev_attr_type' and 'dev_attr_power'. These two attributes can be
-organized as follows into a group::
-
-  static struct attribute *dev_attrs[] = {
-	&dev_attr_type.attr,
-	&dev_attr_power.attr,
-	NULL,
-  };
-
-  static struct attribute_group dev_attr_group = {
-	.attrs = dev_attrs,
-  };
-
-  static const struct attribute_group *dev_attr_groups[] = {
-	&dev_attr_group,
-	NULL,
-  };
-
-This array of groups can then be associated with a device by setting the
-group pointer in struct device before device_register() is invoked::
-
-        dev->groups = dev_attr_groups;
-        device_register(dev);
-
-The device_register() function will use the 'groups' pointer to create the
-device attributes and the device_unregister() function will use this pointer
-to remove the device attributes.
-
-Word of warning:  While the kernel allows device_create_file() and
-device_remove_file() to be called on a device at any time, userspace has
-strict expectations on when attributes get created.  When a new device is
-registered in the kernel, a uevent is generated to notify userspace (like
-udev) that a new device is available.  If attributes are added after the
-device is registered, then userspace won't get notified and userspace will
-not know about the new attributes.
-
-This is important for device driver that need to publish additional
-attributes for a device at driver probe time.  If the device driver simply
-calls device_create_file() on the device structure passed to it, then
-userspace will never be notified of the new attributes.
diff --git a/Documentation/driver-model/devres.rst b/Documentation/driver-model/devres.rst
deleted file mode 100644
index 4ac99122b5f1..000000000000
--- a/Documentation/driver-model/devres.rst
+++ /dev/null
@@ -1,414 +0,0 @@
-================================
-Devres - Managed Device Resource
-================================
-
-Tejun Heo	<teheo@suse.de>
-
-First draft	10 January 2007
-
-.. contents
-
-   1. Intro			: Huh? Devres?
-   2. Devres			: Devres in a nutshell
-   3. Devres Group		: Group devres'es and release them together
-   4. Details			: Life time rules, calling context, ...
-   5. Overhead			: How much do we have to pay for this?
-   6. List of managed interfaces: Currently implemented managed interfaces
-
-
-1. Intro
---------
-
-devres came up while trying to convert libata to use iomap.  Each
-iomapped address should be kept and unmapped on driver detach.  For
-example, a plain SFF ATA controller (that is, good old PCI IDE) in
-native mode makes use of 5 PCI BARs and all of them should be
-maintained.
-
-As with many other device drivers, libata low level drivers have
-sufficient bugs in ->remove and ->probe failure path.  Well, yes,
-that's probably because libata low level driver developers are lazy
-bunch, but aren't all low level driver developers?  After spending a
-day fiddling with braindamaged hardware with no document or
-braindamaged document, if it's finally working, well, it's working.
-
-For one reason or another, low level drivers don't receive as much
-attention or testing as core code, and bugs on driver detach or
-initialization failure don't happen often enough to be noticeable.
-Init failure path is worse because it's much less travelled while
-needs to handle multiple entry points.
-
-So, many low level drivers end up leaking resources on driver detach
-and having half broken failure path implementation in ->probe() which
-would leak resources or even cause oops when failure occurs.  iomap
-adds more to this mix.  So do msi and msix.
-
-
-2. Devres
----------
-
-devres is basically linked list of arbitrarily sized memory areas
-associated with a struct device.  Each devres entry is associated with
-a release function.  A devres can be released in several ways.  No
-matter what, all devres entries are released on driver detach.  On
-release, the associated release function is invoked and then the
-devres entry is freed.
-
-Managed interface is created for resources commonly used by device
-drivers using devres.  For example, coherent DMA memory is acquired
-using dma_alloc_coherent().  The managed version is called
-dmam_alloc_coherent().  It is identical to dma_alloc_coherent() except
-for the DMA memory allocated using it is managed and will be
-automatically released on driver detach.  Implementation looks like
-the following::
-
-  struct dma_devres {
-	size_t		size;
-	void		*vaddr;
-	dma_addr_t	dma_handle;
-  };
-
-  static void dmam_coherent_release(struct device *dev, void *res)
-  {
-	struct dma_devres *this = res;
-
-	dma_free_coherent(dev, this->size, this->vaddr, this->dma_handle);
-  }
-
-  dmam_alloc_coherent(dev, size, dma_handle, gfp)
-  {
-	struct dma_devres *dr;
-	void *vaddr;
-
-	dr = devres_alloc(dmam_coherent_release, sizeof(*dr), gfp);
-	...
-
-	/* alloc DMA memory as usual */
-	vaddr = dma_alloc_coherent(...);
-	...
-
-	/* record size, vaddr, dma_handle in dr */
-	dr->vaddr = vaddr;
-	...
-
-	devres_add(dev, dr);
-
-	return vaddr;
-  }
-
-If a driver uses dmam_alloc_coherent(), the area is guaranteed to be
-freed whether initialization fails half-way or the device gets
-detached.  If most resources are acquired using managed interface, a
-driver can have much simpler init and exit code.  Init path basically
-looks like the following::
-
-  my_init_one()
-  {
-	struct mydev *d;
-
-	d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
-	if (!d)
-		return -ENOMEM;
-
-	d->ring = dmam_alloc_coherent(...);
-	if (!d->ring)
-		return -ENOMEM;
-
-	if (check something)
-		return -EINVAL;
-	...
-
-	return register_to_upper_layer(d);
-  }
-
-And exit path::
-
-  my_remove_one()
-  {
-	unregister_from_upper_layer(d);
-	shutdown_my_hardware();
-  }
-
-As shown above, low level drivers can be simplified a lot by using
-devres.  Complexity is shifted from less maintained low level drivers
-to better maintained higher layer.  Also, as init failure path is
-shared with exit path, both can get more testing.
-
-Note though that when converting current calls or assignments to
-managed devm_* versions it is up to you to check if internal operations
-like allocating memory, have failed. Managed resources pertains to the
-freeing of these resources *only* - all other checks needed are still
-on you. In some cases this may mean introducing checks that were not
-necessary before moving to the managed devm_* calls.
-
-
-3. Devres group
----------------
-
-Devres entries can be grouped using devres group.  When a group is
-released, all contained normal devres entries and properly nested
-groups are released.  One usage is to rollback series of acquired
-resources on failure.  For example::
-
-  if (!devres_open_group(dev, NULL, GFP_KERNEL))
-	return -ENOMEM;
-
-  acquire A;
-  if (failed)
-	goto err;
-
-  acquire B;
-  if (failed)
-	goto err;
-  ...
-
-  devres_remove_group(dev, NULL);
-  return 0;
-
- err:
-  devres_release_group(dev, NULL);
-  return err_code;
-
-As resource acquisition failure usually means probe failure, constructs
-like above are usually useful in midlayer driver (e.g. libata core
-layer) where interface function shouldn't have side effect on failure.
-For LLDs, just returning error code suffices in most cases.
-
-Each group is identified by `void *id`.  It can either be explicitly
-specified by @id argument to devres_open_group() or automatically
-created by passing NULL as @id as in the above example.  In both
-cases, devres_open_group() returns the group's id.  The returned id
-can be passed to other devres functions to select the target group.
-If NULL is given to those functions, the latest open group is
-selected.
-
-For example, you can do something like the following::
-
-  int my_midlayer_create_something()
-  {
-	if (!devres_open_group(dev, my_midlayer_create_something, GFP_KERNEL))
-		return -ENOMEM;
-
-	...
-
-	devres_close_group(dev, my_midlayer_create_something);
-	return 0;
-  }
-
-  void my_midlayer_destroy_something()
-  {
-	devres_release_group(dev, my_midlayer_create_something);
-  }
-
-
-4. Details
-----------
-
-Lifetime of a devres entry begins on devres allocation and finishes
-when it is released or destroyed (removed and freed) - no reference
-counting.
-
-devres core guarantees atomicity to all basic devres operations and
-has support for single-instance devres types (atomic
-lookup-and-add-if-not-found).  Other than that, synchronizing
-concurrent accesses to allocated devres data is caller's
-responsibility.  This is usually non-issue because bus ops and
-resource allocations already do the job.
-
-For an example of single-instance devres type, read pcim_iomap_table()
-in lib/devres.c.
-
-All devres interface functions can be called without context if the
-right gfp mask is given.
-
-
-5. Overhead
------------
-
-Each devres bookkeeping info is allocated together with requested data
-area.  With debug option turned off, bookkeeping info occupies 16
-bytes on 32bit machines and 24 bytes on 64bit (three pointers rounded
-up to ull alignment).  If singly linked list is used, it can be
-reduced to two pointers (8 bytes on 32bit, 16 bytes on 64bit).
-
-Each devres group occupies 8 pointers.  It can be reduced to 6 if
-singly linked list is used.
-
-Memory space overhead on ahci controller with two ports is between 300
-and 400 bytes on 32bit machine after naive conversion (we can
-certainly invest a bit more effort into libata core layer).
-
-
-6. List of managed interfaces
------------------------------
-
-CLOCK
-  devm_clk_get()
-  devm_clk_get_optional()
-  devm_clk_put()
-  devm_clk_hw_register()
-  devm_of_clk_add_hw_provider()
-  devm_clk_hw_register_clkdev()
-
-DMA
-  dmaenginem_async_device_register()
-  dmam_alloc_coherent()
-  dmam_alloc_attrs()
-  dmam_free_coherent()
-  dmam_pool_create()
-  dmam_pool_destroy()
-
-DRM
-  devm_drm_dev_init()
-
-GPIO
-  devm_gpiod_get()
-  devm_gpiod_get_index()
-  devm_gpiod_get_index_optional()
-  devm_gpiod_get_optional()
-  devm_gpiod_put()
-  devm_gpiod_unhinge()
-  devm_gpiochip_add_data()
-  devm_gpio_request()
-  devm_gpio_request_one()
-  devm_gpio_free()
-
-I2C
-  devm_i2c_new_dummy_device()
-
-IIO
-  devm_iio_device_alloc()
-  devm_iio_device_free()
-  devm_iio_device_register()
-  devm_iio_device_unregister()
-  devm_iio_kfifo_allocate()
-  devm_iio_kfifo_free()
-  devm_iio_triggered_buffer_setup()
-  devm_iio_triggered_buffer_cleanup()
-  devm_iio_trigger_alloc()
-  devm_iio_trigger_free()
-  devm_iio_trigger_register()
-  devm_iio_trigger_unregister()
-  devm_iio_channel_get()
-  devm_iio_channel_release()
-  devm_iio_channel_get_all()
-  devm_iio_channel_release_all()
-
-INPUT
-  devm_input_allocate_device()
-
-IO region
-  devm_release_mem_region()
-  devm_release_region()
-  devm_release_resource()
-  devm_request_mem_region()
-  devm_request_region()
-  devm_request_resource()
-
-IOMAP
-  devm_ioport_map()
-  devm_ioport_unmap()
-  devm_ioremap()
-  devm_ioremap_nocache()
-  devm_ioremap_wc()
-  devm_ioremap_resource() : checks resource, requests memory region, ioremaps
-  devm_iounmap()
-  pcim_iomap()
-  pcim_iomap_regions()	: do request_region() and iomap() on multiple BARs
-  pcim_iomap_table()	: array of mapped addresses indexed by BAR
-  pcim_iounmap()
-
-IRQ
-  devm_free_irq()
-  devm_request_any_context_irq()
-  devm_request_irq()
-  devm_request_threaded_irq()
-  devm_irq_alloc_descs()
-  devm_irq_alloc_desc()
-  devm_irq_alloc_desc_at()
-  devm_irq_alloc_desc_from()
-  devm_irq_alloc_descs_from()
-  devm_irq_alloc_generic_chip()
-  devm_irq_setup_generic_chip()
-  devm_irq_sim_init()
-
-LED
-  devm_led_classdev_register()
-  devm_led_classdev_unregister()
-
-MDIO
-  devm_mdiobus_alloc()
-  devm_mdiobus_alloc_size()
-  devm_mdiobus_free()
-
-MEM
-  devm_free_pages()
-  devm_get_free_pages()
-  devm_kasprintf()
-  devm_kcalloc()
-  devm_kfree()
-  devm_kmalloc()
-  devm_kmalloc_array()
-  devm_kmemdup()
-  devm_kstrdup()
-  devm_kvasprintf()
-  devm_kzalloc()
-
-MFD
-  devm_mfd_add_devices()
-
-MUX
-  devm_mux_chip_alloc()
-  devm_mux_chip_register()
-  devm_mux_control_get()
-
-PER-CPU MEM
-  devm_alloc_percpu()
-  devm_free_percpu()
-
-PCI
-  devm_pci_alloc_host_bridge()  : managed PCI host bridge allocation
-  devm_pci_remap_cfgspace()	: ioremap PCI configuration space
-  devm_pci_remap_cfg_resource()	: ioremap PCI configuration space resource
-  pcim_enable_device()		: after success, all PCI ops become managed
-  pcim_pin_device()		: keep PCI device enabled after release
-
-PHY
-  devm_usb_get_phy()
-  devm_usb_put_phy()
-
-PINCTRL
-  devm_pinctrl_get()
-  devm_pinctrl_put()
-  devm_pinctrl_register()
-  devm_pinctrl_unregister()
-
-POWER
-  devm_reboot_mode_register()
-  devm_reboot_mode_unregister()
-
-PWM
-  devm_pwm_get()
-  devm_pwm_put()
-
-REGULATOR
-  devm_regulator_bulk_get()
-  devm_regulator_get()
-  devm_regulator_put()
-  devm_regulator_register()
-
-RESET
-  devm_reset_control_get()
-  devm_reset_controller_register()
-
-SERDEV
-  devm_serdev_device_open()
-
-SLAVE DMA ENGINE
-  devm_acpi_dma_controller_register()
-
-SPI
-  devm_spi_register_master()
-
-WATCHDOG
-  devm_watchdog_register_device()
diff --git a/Documentation/driver-model/driver.rst b/Documentation/driver-model/driver.rst
deleted file mode 100644
index 11d281506a04..000000000000
--- a/Documentation/driver-model/driver.rst
+++ /dev/null
@@ -1,223 +0,0 @@
-==============
-Device Drivers
-==============
-
-See the kerneldoc for the struct device_driver.
-
-
-Allocation
-~~~~~~~~~~
-
-Device drivers are statically allocated structures. Though there may
-be multiple devices in a system that a driver supports, struct
-device_driver represents the driver as a whole (not a particular
-device instance).
-
-Initialization
-~~~~~~~~~~~~~~
-
-The driver must initialize at least the name and bus fields. It should
-also initialize the devclass field (when it arrives), so it may obtain
-the proper linkage internally. It should also initialize as many of
-the callbacks as possible, though each is optional.
-
-Declaration
-~~~~~~~~~~~
-
-As stated above, struct device_driver objects are statically
-allocated. Below is an example declaration of the eepro100
-driver. This declaration is hypothetical only; it relies on the driver
-being converted completely to the new model::
-
-  static struct device_driver eepro100_driver = {
-         .name		= "eepro100",
-         .bus		= &pci_bus_type,
-
-         .probe		= eepro100_probe,
-         .remove		= eepro100_remove,
-         .suspend		= eepro100_suspend,
-         .resume		= eepro100_resume,
-  };
-
-Most drivers will not be able to be converted completely to the new
-model because the bus they belong to has a bus-specific structure with
-bus-specific fields that cannot be generalized.
-
-The most common example of this are device ID structures. A driver
-typically defines an array of device IDs that it supports. The format
-of these structures and the semantics for comparing device IDs are
-completely bus-specific. Defining them as bus-specific entities would
-sacrifice type-safety, so we keep bus-specific structures around.
-
-Bus-specific drivers should include a generic struct device_driver in
-the definition of the bus-specific driver. Like this::
-
-  struct pci_driver {
-         const struct pci_device_id *id_table;
-         struct device_driver	  driver;
-  };
-
-A definition that included bus-specific fields would look like
-(using the eepro100 driver again)::
-
-  static struct pci_driver eepro100_driver = {
-         .id_table       = eepro100_pci_tbl,
-         .driver	       = {
-		.name		= "eepro100",
-		.bus		= &pci_bus_type,
-		.probe		= eepro100_probe,
-		.remove		= eepro100_remove,
-		.suspend	= eepro100_suspend,
-		.resume		= eepro100_resume,
-         },
-  };
-
-Some may find the syntax of embedded struct initialization awkward or
-even a bit ugly. So far, it's the best way we've found to do what we want...
-
-Registration
-~~~~~~~~~~~~
-
-::
-
-  int driver_register(struct device_driver *drv);
-
-The driver registers the structure on startup. For drivers that have
-no bus-specific fields (i.e. don't have a bus-specific driver
-structure), they would use driver_register and pass a pointer to their
-struct device_driver object.
-
-Most drivers, however, will have a bus-specific structure and will
-need to register with the bus using something like pci_driver_register.
-
-It is important that drivers register their driver structure as early as
-possible. Registration with the core initializes several fields in the
-struct device_driver object, including the reference count and the
-lock. These fields are assumed to be valid at all times and may be
-used by the device model core or the bus driver.
-
-
-Transition Bus Drivers
-~~~~~~~~~~~~~~~~~~~~~~
-
-By defining wrapper functions, the transition to the new model can be
-made easier. Drivers can ignore the generic structure altogether and
-let the bus wrapper fill in the fields. For the callbacks, the bus can
-define generic callbacks that forward the call to the bus-specific
-callbacks of the drivers.
-
-This solution is intended to be only temporary. In order to get class
-information in the driver, the drivers must be modified anyway. Since
-converting drivers to the new model should reduce some infrastructural
-complexity and code size, it is recommended that they are converted as
-class information is added.
-
-Access
-~~~~~~
-
-Once the object has been registered, it may access the common fields of
-the object, like the lock and the list of devices::
-
-  int driver_for_each_dev(struct device_driver *drv, void *data,
-			  int (*callback)(struct device *dev, void *data));
-
-The devices field is a list of all the devices that have been bound to
-the driver. The LDM core provides a helper function to operate on all
-the devices a driver controls. This helper locks the driver on each
-node access, and does proper reference counting on each device as it
-accesses it.
-
-
-sysfs
-~~~~~
-
-When a driver is registered, a sysfs directory is created in its
-bus's directory. In this directory, the driver can export an interface
-to userspace to control operation of the driver on a global basis;
-e.g. toggling debugging output in the driver.
-
-A future feature of this directory will be a 'devices' directory. This
-directory will contain symlinks to the directories of devices it
-supports.
-
-
-
-Callbacks
-~~~~~~~~~
-
-::
-
-	int	(*probe)	(struct device *dev);
-
-The probe() entry is called in task context, with the bus's rwsem locked
-and the driver partially bound to the device.  Drivers commonly use
-container_of() to convert "dev" to a bus-specific type, both in probe()
-and other routines.  That type often provides device resource data, such
-as pci_dev.resource[] or platform_device.resources, which is used in
-addition to dev->platform_data to initialize the driver.
-
-This callback holds the driver-specific logic to bind the driver to a
-given device.  That includes verifying that the device is present, that
-it's a version the driver can handle, that driver data structures can
-be allocated and initialized, and that any hardware can be initialized.
-Drivers often store a pointer to their state with dev_set_drvdata().
-When the driver has successfully bound itself to that device, then probe()
-returns zero and the driver model code will finish its part of binding
-the driver to that device.
-
-A driver's probe() may return a negative errno value to indicate that
-the driver did not bind to this device, in which case it should have
-released all resources it allocated::
-
-	int 	(*remove)	(struct device *dev);
-
-remove is called to unbind a driver from a device. This may be
-called if a device is physically removed from the system, if the
-driver module is being unloaded, during a reboot sequence, or
-in other cases.
-
-It is up to the driver to determine if the device is present or
-not. It should free any resources allocated specifically for the
-device; i.e. anything in the device's driver_data field.
-
-If the device is still present, it should quiesce the device and place
-it into a supported low-power state::
-
-	int	(*suspend)	(struct device *dev, pm_message_t state);
-
-suspend is called to put the device in a low power state::
-
-	int	(*resume)	(struct device *dev);
-
-Resume is used to bring a device back from a low power state.
-
-
-Attributes
-~~~~~~~~~~
-
-::
-
-  struct driver_attribute {
-          struct attribute        attr;
-          ssize_t (*show)(struct device_driver *driver, char *buf);
-          ssize_t (*store)(struct device_driver *, const char *buf, size_t count);
-  };
-
-Device drivers can export attributes via their sysfs directories.
-Drivers can declare attributes using a DRIVER_ATTR_RW and DRIVER_ATTR_RO
-macro that works identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO
-macros.
-
-Example::
-
-	DRIVER_ATTR_RW(debug);
-
-This is equivalent to declaring::
-
-	struct driver_attribute driver_attr_debug;
-
-This can then be used to add and remove the attribute from the
-driver's directory using::
-
-  int driver_create_file(struct device_driver *, const struct driver_attribute *);
-  void driver_remove_file(struct device_driver *, const struct driver_attribute *);
diff --git a/Documentation/driver-model/index.rst b/Documentation/driver-model/index.rst
deleted file mode 100644
index 9f85d579ce56..000000000000
--- a/Documentation/driver-model/index.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-:orphan:
-
-============
-Driver Model
-============
-
-.. toctree::
-   :maxdepth: 1
-
-   binding
-   bus
-   class
-   design-patterns
-   device
-   devres
-   driver
-   overview
-   platform
-   porting
-
-.. only::  subproject and html
-
-   Indices
-   =======
-
-   * :ref:`genindex`
diff --git a/Documentation/driver-model/overview.rst b/Documentation/driver-model/overview.rst
deleted file mode 100644
index d4d1e9b40e0c..000000000000
--- a/Documentation/driver-model/overview.rst
+++ /dev/null
@@ -1,124 +0,0 @@
-=============================
-The Linux Kernel Device Model
-=============================
-
-Patrick Mochel	<mochel@digitalimplant.org>
-
-Drafted 26 August 2002
-Updated 31 January 2006
-
-
-Overview
-~~~~~~~~
-
-The Linux Kernel Driver Model is a unification of all the disparate driver
-models that were previously used in the kernel. It is intended to augment the
-bus-specific drivers for bridges and devices by consolidating a set of data
-and operations into globally accessible data structures.
-
-Traditional driver models implemented some sort of tree-like structure
-(sometimes just a list) for the devices they control. There wasn't any
-uniformity across the different bus types.
-
-The current driver model provides a common, uniform data model for describing
-a bus and the devices that can appear under the bus. The unified bus
-model includes a set of common attributes which all busses carry, and a set
-of common callbacks, such as device discovery during bus probing, bus
-shutdown, bus power management, etc.
-
-The common device and bridge interface reflects the goals of the modern
-computer: namely the ability to do seamless device "plug and play", power
-management, and hot plug. In particular, the model dictated by Intel and
-Microsoft (namely ACPI) ensures that almost every device on almost any bus
-on an x86-compatible system can work within this paradigm.  Of course,
-not every bus is able to support all such operations, although most
-buses support most of those operations.
-
-
-Downstream Access
-~~~~~~~~~~~~~~~~~
-
-Common data fields have been moved out of individual bus layers into a common
-data structure. These fields must still be accessed by the bus layers,
-and sometimes by the device-specific drivers.
-
-Other bus layers are encouraged to do what has been done for the PCI layer.
-struct pci_dev now looks like this::
-
-  struct pci_dev {
-	...
-
-	struct device dev;     /* Generic device interface */
-	...
-  };
-
-Note first that the struct device dev within the struct pci_dev is
-statically allocated. This means only one allocation on device discovery.
-
-Note also that that struct device dev is not necessarily defined at the
-front of the pci_dev structure.  This is to make people think about what
-they're doing when switching between the bus driver and the global driver,
-and to discourage meaningless and incorrect casts between the two.
-
-The PCI bus layer freely accesses the fields of struct device. It knows about
-the structure of struct pci_dev, and it should know the structure of struct
-device. Individual PCI device drivers that have been converted to the current
-driver model generally do not and should not touch the fields of struct device,
-unless there is a compelling reason to do so.
-
-The above abstraction prevents unnecessary pain during transitional phases.
-If it were not done this way, then when a field was renamed or removed, every
-downstream driver would break.  On the other hand, if only the bus layer
-(and not the device layer) accesses the struct device, it is only the bus
-layer that needs to change.
-
-
-User Interface
-~~~~~~~~~~~~~~
-
-By virtue of having a complete hierarchical view of all the devices in the
-system, exporting a complete hierarchical view to userspace becomes relatively
-easy. This has been accomplished by implementing a special purpose virtual
-file system named sysfs.
-
-Almost all mainstream Linux distros mount this filesystem automatically; you
-can see some variation of the following in the output of the "mount" command::
-
-  $ mount
-  ...
-  none on /sys type sysfs (rw,noexec,nosuid,nodev)
-  ...
-  $
-
-The auto-mounting of sysfs is typically accomplished by an entry similar to
-the following in the /etc/fstab file::
-
-  none     	/sys	sysfs    defaults	  	0 0
-
-or something similar in the /lib/init/fstab file on Debian-based systems::
-
-  none            /sys    sysfs    nodev,noexec,nosuid    0 0
-
-If sysfs is not automatically mounted, you can always do it manually with::
-
-	# mount -t sysfs sysfs /sys
-
-Whenever a device is inserted into the tree, a directory is created for it.
-This directory may be populated at each layer of discovery - the global layer,
-the bus layer, or the device layer.
-
-The global layer currently creates two files - 'name' and 'power'. The
-former only reports the name of the device. The latter reports the
-current power state of the device. It will also be used to set the current
-power state.
-
-The bus layer may also create files for the devices it finds while probing the
-bus. For example, the PCI layer currently creates 'irq' and 'resource' files
-for each PCI device.
-
-A device-specific driver may also export files in its directory to expose
-device-specific data or tunable interfaces.
-
-More information about the sysfs directory layout can be found in
-the other documents in this directory and in the file
-Documentation/filesystems/sysfs.txt.
diff --git a/Documentation/driver-model/platform.rst b/Documentation/driver-model/platform.rst
deleted file mode 100644
index 334dd4071ae4..000000000000
--- a/Documentation/driver-model/platform.rst
+++ /dev/null
@@ -1,246 +0,0 @@
-============================
-Platform Devices and Drivers
-============================
-
-See <linux/platform_device.h> for the driver model interface to the
-platform bus:  platform_device, and platform_driver.  This pseudo-bus
-is used to connect devices on busses with minimal infrastructure,
-like those used to integrate peripherals on many system-on-chip
-processors, or some "legacy" PC interconnects; as opposed to large
-formally specified ones like PCI or USB.
-
-
-Platform devices
-~~~~~~~~~~~~~~~~
-Platform devices are devices that typically appear as autonomous
-entities in the system. This includes legacy port-based devices and
-host bridges to peripheral buses, and most controllers integrated
-into system-on-chip platforms.  What they usually have in common
-is direct addressing from a CPU bus.  Rarely, a platform_device will
-be connected through a segment of some other kind of bus; but its
-registers will still be directly addressable.
-
-Platform devices are given a name, used in driver binding, and a
-list of resources such as addresses and IRQs::
-
-  struct platform_device {
-	const char	*name;
-	u32		id;
-	struct device	dev;
-	u32		num_resources;
-	struct resource	*resource;
-  };
-
-
-Platform drivers
-~~~~~~~~~~~~~~~~
-Platform drivers follow the standard driver model convention, where
-discovery/enumeration is handled outside the drivers, and drivers
-provide probe() and remove() methods.  They support power management
-and shutdown notifications using the standard conventions::
-
-  struct platform_driver {
-	int (*probe)(struct platform_device *);
-	int (*remove)(struct platform_device *);
-	void (*shutdown)(struct platform_device *);
-	int (*suspend)(struct platform_device *, pm_message_t state);
-	int (*suspend_late)(struct platform_device *, pm_message_t state);
-	int (*resume_early)(struct platform_device *);
-	int (*resume)(struct platform_device *);
-	struct device_driver driver;
-  };
-
-Note that probe() should in general verify that the specified device hardware
-actually exists; sometimes platform setup code can't be sure.  The probing
-can use device resources, including clocks, and device platform_data.
-
-Platform drivers register themselves the normal way::
-
-	int platform_driver_register(struct platform_driver *drv);
-
-Or, in common situations where the device is known not to be hot-pluggable,
-the probe() routine can live in an init section to reduce the driver's
-runtime memory footprint::
-
-	int platform_driver_probe(struct platform_driver *drv,
-			  int (*probe)(struct platform_device *))
-
-Kernel modules can be composed of several platform drivers. The platform core
-provides helpers to register and unregister an array of drivers::
-
-	int __platform_register_drivers(struct platform_driver * const *drivers,
-				      unsigned int count, struct module *owner);
-	void platform_unregister_drivers(struct platform_driver * const *drivers,
-					 unsigned int count);
-
-If one of the drivers fails to register, all drivers registered up to that
-point will be unregistered in reverse order. Note that there is a convenience
-macro that passes THIS_MODULE as owner parameter::
-
-	#define platform_register_drivers(drivers, count)
-
-
-Device Enumeration
-~~~~~~~~~~~~~~~~~~
-As a rule, platform specific (and often board-specific) setup code will
-register platform devices::
-
-	int platform_device_register(struct platform_device *pdev);
-
-	int platform_add_devices(struct platform_device **pdevs, int ndev);
-
-The general rule is to register only those devices that actually exist,
-but in some cases extra devices might be registered.  For example, a kernel
-might be configured to work with an external network adapter that might not
-be populated on all boards, or likewise to work with an integrated controller
-that some boards might not hook up to any peripherals.
-
-In some cases, boot firmware will export tables describing the devices
-that are populated on a given board.   Without such tables, often the
-only way for system setup code to set up the correct devices is to build
-a kernel for a specific target board.  Such board-specific kernels are
-common with embedded and custom systems development.
-
-In many cases, the memory and IRQ resources associated with the platform
-device are not enough to let the device's driver work.  Board setup code
-will often provide additional information using the device's platform_data
-field to hold additional information.
-
-Embedded systems frequently need one or more clocks for platform devices,
-which are normally kept off until they're actively needed (to save power).
-System setup also associates those clocks with the device, so that that
-calls to clk_get(&pdev->dev, clock_name) return them as needed.
-
-
-Legacy Drivers:  Device Probing
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Some drivers are not fully converted to the driver model, because they take
-on a non-driver role:  the driver registers its platform device, rather than
-leaving that for system infrastructure.  Such drivers can't be hotplugged
-or coldplugged, since those mechanisms require device creation to be in a
-different system component than the driver.
-
-The only "good" reason for this is to handle older system designs which, like
-original IBM PCs, rely on error-prone "probe-the-hardware" models for hardware
-configuration.  Newer systems have largely abandoned that model, in favor of
-bus-level support for dynamic configuration (PCI, USB), or device tables
-provided by the boot firmware (e.g. PNPACPI on x86).  There are too many
-conflicting options about what might be where, and even educated guesses by
-an operating system will be wrong often enough to make trouble.
-
-This style of driver is discouraged.  If you're updating such a driver,
-please try to move the device enumeration to a more appropriate location,
-outside the driver.  This will usually be cleanup, since such drivers
-tend to already have "normal" modes, such as ones using device nodes that
-were created by PNP or by platform device setup.
-
-None the less, there are some APIs to support such legacy drivers.  Avoid
-using these calls except with such hotplug-deficient drivers::
-
-	struct platform_device *platform_device_alloc(
-			const char *name, int id);
-
-You can use platform_device_alloc() to dynamically allocate a device, which
-you will then initialize with resources and platform_device_register().
-A better solution is usually::
-
-	struct platform_device *platform_device_register_simple(
-			const char *name, int id,
-			struct resource *res, unsigned int nres);
-
-You can use platform_device_register_simple() as a one-step call to allocate
-and register a device.
-
-
-Device Naming and Driver Binding
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The platform_device.dev.bus_id is the canonical name for the devices.
-It's built from two components:
-
-    * platform_device.name ... which is also used to for driver matching.
-
-    * platform_device.id ... the device instance number, or else "-1"
-      to indicate there's only one.
-
-These are concatenated, so name/id "serial"/0 indicates bus_id "serial.0", and
-"serial/3" indicates bus_id "serial.3"; both would use the platform_driver
-named "serial".  While "my_rtc"/-1 would be bus_id "my_rtc" (no instance id)
-and use the platform_driver called "my_rtc".
-
-Driver binding is performed automatically by the driver core, invoking
-driver probe() after finding a match between device and driver.  If the
-probe() succeeds, the driver and device are bound as usual.  There are
-three different ways to find such a match:
-
-    - Whenever a device is registered, the drivers for that bus are
-      checked for matches.  Platform devices should be registered very
-      early during system boot.
-
-    - When a driver is registered using platform_driver_register(), all
-      unbound devices on that bus are checked for matches.  Drivers
-      usually register later during booting, or by module loading.
-
-    - Registering a driver using platform_driver_probe() works just like
-      using platform_driver_register(), except that the driver won't
-      be probed later if another device registers.  (Which is OK, since
-      this interface is only for use with non-hotpluggable devices.)
-
-
-Early Platform Devices and Drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The early platform interfaces provide platform data to platform device
-drivers early on during the system boot. The code is built on top of the
-early_param() command line parsing and can be executed very early on.
-
-Example: "earlyprintk" class early serial console in 6 steps
-
-1. Registering early platform device data
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code registers platform device data using the function
-early_platform_add_devices(). In the case of early serial console this
-should be hardware configuration for the serial port. Devices registered
-at this point will later on be matched against early platform drivers.
-
-2. Parsing kernel command line
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code calls parse_early_param() to parse the kernel
-command line. This will execute all matching early_param() callbacks.
-User specified early platform devices will be registered at this point.
-For the early serial console case the user can specify port on the
-kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
-the class string, "serial" is the name of the platform driver and
-0 is the platform device id. If the id is -1 then the dot and the
-id can be omitted.
-
-3. Installing early platform drivers belonging to a certain class
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code may optionally force registration of all early
-platform drivers belonging to a certain class using the function
-early_platform_driver_register_all(). User specified devices from
-step 2 have priority over these. This step is omitted by the serial
-driver example since the early serial driver code should be disabled
-unless the user has specified port on the kernel command line.
-
-4. Early platform driver registration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Compiled-in platform drivers making use of early_platform_init() are
-automatically registered during step 2 or 3. The serial driver example
-should use early_platform_init("earlyprintk", &platform_driver).
-
-5. Probing of early platform drivers belonging to a certain class
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code calls early_platform_driver_probe() to match
-registered early platform devices associated with a certain class with
-registered early platform drivers. Matched devices will get probed().
-This step can be executed at any point during the early boot. As soon
-as possible may be good for the serial port case.
-
-6. Inside the early platform driver probe()
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The driver code needs to take special care during early boot, especially
-when it comes to memory allocation and interrupt registration. The code
-in the probe() function can use is_early_platform_device() to check if
-it is called at early platform device or at the regular platform device
-time. The early serial driver performs register_console() at this point.
-
-For further information, see <linux/platform_device.h>.
diff --git a/Documentation/driver-model/porting.rst b/Documentation/driver-model/porting.rst
deleted file mode 100644
index ae4bf843c1d6..000000000000
--- a/Documentation/driver-model/porting.rst
+++ /dev/null
@@ -1,448 +0,0 @@
-=======================================
-Porting Drivers to the New Driver Model
-=======================================
-
-Patrick Mochel
-
-7 January 2003
-
-
-Overview
-
-Please refer to `Documentation/driver-model/*.rst` for definitions of
-various driver types and concepts.
-
-Most of the work of porting devices drivers to the new model happens
-at the bus driver layer. This was intentional, to minimize the
-negative effect on kernel drivers, and to allow a gradual transition
-of bus drivers.
-
-In a nutshell, the driver model consists of a set of objects that can
-be embedded in larger, bus-specific objects. Fields in these generic
-objects can replace fields in the bus-specific objects.
-
-The generic objects must be registered with the driver model core. By
-doing so, they will exported via the sysfs filesystem. sysfs can be
-mounted by doing::
-
-	# mount -t sysfs sysfs /sys
-
-
-
-The Process
-
-Step 0: Read include/linux/device.h for object and function definitions.
-
-Step 1: Registering the bus driver.
-
-
-- Define a struct bus_type for the bus driver::
-
-    struct bus_type pci_bus_type = {
-          .name           = "pci",
-    };
-
-
-- Register the bus type.
-
-  This should be done in the initialization function for the bus type,
-  which is usually the module_init(), or equivalent, function::
-
-    static int __init pci_driver_init(void)
-    {
-            return bus_register(&pci_bus_type);
-    }
-
-    subsys_initcall(pci_driver_init);
-
-
-  The bus type may be unregistered (if the bus driver may be compiled
-  as a module) by doing::
-
-     bus_unregister(&pci_bus_type);
-
-
-- Export the bus type for others to use.
-
-  Other code may wish to reference the bus type, so declare it in a
-  shared header file and export the symbol.
-
-From include/linux/pci.h::
-
-  extern struct bus_type pci_bus_type;
-
-
-From file the above code appears in::
-
-  EXPORT_SYMBOL(pci_bus_type);
-
-
-
-- This will cause the bus to show up in /sys/bus/pci/ with two
-  subdirectories: 'devices' and 'drivers'::
-
-    # tree -d /sys/bus/pci/
-    /sys/bus/pci/
-    |-- devices
-    `-- drivers
-
-
-
-Step 2: Registering Devices.
-
-struct device represents a single device. It mainly contains metadata
-describing the relationship the device has to other entities.
-
-
-- Embed a struct device in the bus-specific device type::
-
-
-    struct pci_dev {
-           ...
-           struct  device  dev;            /* Generic device interface */
-           ...
-    };
-
-  It is recommended that the generic device not be the first item in
-  the struct to discourage programmers from doing mindless casts
-  between the object types. Instead macros, or inline functions,
-  should be created to convert from the generic object type::
-
-
-    #define to_pci_dev(n) container_of(n, struct pci_dev, dev)
-
-    or
-
-    static inline struct pci_dev * to_pci_dev(struct kobject * kobj)
-    {
-	return container_of(n, struct pci_dev, dev);
-    }
-
-  This allows the compiler to verify type-safety of the operations
-  that are performed (which is Good).
-
-
-- Initialize the device on registration.
-
-  When devices are discovered or registered with the bus type, the
-  bus driver should initialize the generic device. The most important
-  things to initialize are the bus_id, parent, and bus fields.
-
-  The bus_id is an ASCII string that contains the device's address on
-  the bus. The format of this string is bus-specific. This is
-  necessary for representing devices in sysfs.
-
-  parent is the physical parent of the device. It is important that
-  the bus driver sets this field correctly.
-
-  The driver model maintains an ordered list of devices that it uses
-  for power management. This list must be in order to guarantee that
-  devices are shutdown before their physical parents, and vice versa.
-  The order of this list is determined by the parent of registered
-  devices.
-
-  Also, the location of the device's sysfs directory depends on a
-  device's parent. sysfs exports a directory structure that mirrors
-  the device hierarchy. Accurately setting the parent guarantees that
-  sysfs will accurately represent the hierarchy.
-
-  The device's bus field is a pointer to the bus type the device
-  belongs to. This should be set to the bus_type that was declared
-  and initialized before.
-
-  Optionally, the bus driver may set the device's name and release
-  fields.
-
-  The name field is an ASCII string describing the device, like
-
-     "ATI Technologies Inc Radeon QD"
-
-  The release field is a callback that the driver model core calls
-  when the device has been removed, and all references to it have
-  been released. More on this in a moment.
-
-
-- Register the device.
-
-  Once the generic device has been initialized, it can be registered
-  with the driver model core by doing::
-
-       device_register(&dev->dev);
-
-  It can later be unregistered by doing::
-
-       device_unregister(&dev->dev);
-
-  This should happen on buses that support hotpluggable devices.
-  If a bus driver unregisters a device, it should not immediately free
-  it. It should instead wait for the driver model core to call the
-  device's release method, then free the bus-specific object.
-  (There may be other code that is currently referencing the device
-  structure, and it would be rude to free the device while that is
-  happening).
-
-
-  When the device is registered, a directory in sysfs is created.
-  The PCI tree in sysfs looks like::
-
-    /sys/devices/pci0/
-    |-- 00:00.0
-    |-- 00:01.0
-    |   `-- 01:00.0
-    |-- 00:02.0
-    |   `-- 02:1f.0
-    |       `-- 03:00.0
-    |-- 00:1e.0
-    |   `-- 04:04.0
-    |-- 00:1f.0
-    |-- 00:1f.1
-    |   |-- ide0
-    |   |   |-- 0.0
-    |   |   `-- 0.1
-    |   `-- ide1
-    |       `-- 1.0
-    |-- 00:1f.2
-    |-- 00:1f.3
-    `-- 00:1f.5
-
-  Also, symlinks are created in the bus's 'devices' directory
-  that point to the device's directory in the physical hierarchy::
-
-    /sys/bus/pci/devices/
-    |-- 00:00.0 -> ../../../devices/pci0/00:00.0
-    |-- 00:01.0 -> ../../../devices/pci0/00:01.0
-    |-- 00:02.0 -> ../../../devices/pci0/00:02.0
-    |-- 00:1e.0 -> ../../../devices/pci0/00:1e.0
-    |-- 00:1f.0 -> ../../../devices/pci0/00:1f.0
-    |-- 00:1f.1 -> ../../../devices/pci0/00:1f.1
-    |-- 00:1f.2 -> ../../../devices/pci0/00:1f.2
-    |-- 00:1f.3 -> ../../../devices/pci0/00:1f.3
-    |-- 00:1f.5 -> ../../../devices/pci0/00:1f.5
-    |-- 01:00.0 -> ../../../devices/pci0/00:01.0/01:00.0
-    |-- 02:1f.0 -> ../../../devices/pci0/00:02.0/02:1f.0
-    |-- 03:00.0 -> ../../../devices/pci0/00:02.0/02:1f.0/03:00.0
-    `-- 04:04.0 -> ../../../devices/pci0/00:1e.0/04:04.0
-
-
-
-Step 3: Registering Drivers.
-
-struct device_driver is a simple driver structure that contains a set
-of operations that the driver model core may call.
-
-
-- Embed a struct device_driver in the bus-specific driver.
-
-  Just like with devices, do something like::
-
-    struct pci_driver {
-           ...
-           struct device_driver    driver;
-    };
-
-
-- Initialize the generic driver structure.
-
-  When the driver registers with the bus (e.g. doing pci_register_driver()),
-  initialize the necessary fields of the driver: the name and bus
-  fields.
-
-
-- Register the driver.
-
-  After the generic driver has been initialized, call::
-
-	driver_register(&drv->driver);
-
-  to register the driver with the core.
-
-  When the driver is unregistered from the bus, unregister it from the
-  core by doing::
-
-        driver_unregister(&drv->driver);
-
-  Note that this will block until all references to the driver have
-  gone away. Normally, there will not be any.
-
-
-- Sysfs representation.
-
-  Drivers are exported via sysfs in their bus's 'driver's directory.
-  For example::
-
-    /sys/bus/pci/drivers/
-    |-- 3c59x
-    |-- Ensoniq AudioPCI
-    |-- agpgart-amdk7
-    |-- e100
-    `-- serial
-
-
-Step 4: Define Generic Methods for Drivers.
-
-struct device_driver defines a set of operations that the driver model
-core calls. Most of these operations are probably similar to
-operations the bus already defines for drivers, but taking different
-parameters.
-
-It would be difficult and tedious to force every driver on a bus to
-simultaneously convert their drivers to generic format. Instead, the
-bus driver should define single instances of the generic methods that
-forward call to the bus-specific drivers. For instance::
-
-
-  static int pci_device_remove(struct device * dev)
-  {
-          struct pci_dev * pci_dev = to_pci_dev(dev);
-          struct pci_driver * drv = pci_dev->driver;
-
-          if (drv) {
-                  if (drv->remove)
-                          drv->remove(pci_dev);
-                  pci_dev->driver = NULL;
-          }
-          return 0;
-  }
-
-
-The generic driver should be initialized with these methods before it
-is registered::
-
-        /* initialize common driver fields */
-        drv->driver.name = drv->name;
-        drv->driver.bus = &pci_bus_type;
-        drv->driver.probe = pci_device_probe;
-        drv->driver.resume = pci_device_resume;
-        drv->driver.suspend = pci_device_suspend;
-        drv->driver.remove = pci_device_remove;
-
-        /* register with core */
-        driver_register(&drv->driver);
-
-
-Ideally, the bus should only initialize the fields if they are not
-already set. This allows the drivers to implement their own generic
-methods.
-
-
-Step 5: Support generic driver binding.
-
-The model assumes that a device or driver can be dynamically
-registered with the bus at any time. When registration happens,
-devices must be bound to a driver, or drivers must be bound to all
-devices that it supports.
-
-A driver typically contains a list of device IDs that it supports. The
-bus driver compares these IDs to the IDs of devices registered with it.
-The format of the device IDs, and the semantics for comparing them are
-bus-specific, so the generic model does attempt to generalize them.
-
-Instead, a bus may supply a method in struct bus_type that does the
-comparison::
-
-  int (*match)(struct device * dev, struct device_driver * drv);
-
-match should return positive value if the driver supports the device,
-and zero otherwise. It may also return error code (for example
--EPROBE_DEFER) if determining that given driver supports the device is
-not possible.
-
-When a device is registered, the bus's list of drivers is iterated
-over. bus->match() is called for each one until a match is found.
-
-When a driver is registered, the bus's list of devices is iterated
-over. bus->match() is called for each device that is not already
-claimed by a driver.
-
-When a device is successfully bound to a driver, device->driver is
-set, the device is added to a per-driver list of devices, and a
-symlink is created in the driver's sysfs directory that points to the
-device's physical directory::
-
-  /sys/bus/pci/drivers/
-  |-- 3c59x
-  |   `-- 00:0b.0 -> ../../../../devices/pci0/00:0b.0
-  |-- Ensoniq AudioPCI
-  |-- agpgart-amdk7
-  |   `-- 00:00.0 -> ../../../../devices/pci0/00:00.0
-  |-- e100
-  |   `-- 00:0c.0 -> ../../../../devices/pci0/00:0c.0
-  `-- serial
-
-
-This driver binding should replace the existing driver binding
-mechanism the bus currently uses.
-
-
-Step 6: Supply a hotplug callback.
-
-Whenever a device is registered with the driver model core, the
-userspace program /sbin/hotplug is called to notify userspace.
-Users can define actions to perform when a device is inserted or
-removed.
-
-The driver model core passes several arguments to userspace via
-environment variables, including
-
-- ACTION: set to 'add' or 'remove'
-- DEVPATH: set to the device's physical path in sysfs.
-
-A bus driver may also supply additional parameters for userspace to
-consume. To do this, a bus must implement the 'hotplug' method in
-struct bus_type::
-
-     int (*hotplug) (struct device *dev, char **envp,
-                     int num_envp, char *buffer, int buffer_size);
-
-This is called immediately before /sbin/hotplug is executed.
-
-
-Step 7: Cleaning up the bus driver.
-
-The generic bus, device, and driver structures provide several fields
-that can replace those defined privately to the bus driver.
-
-- Device list.
-
-struct bus_type contains a list of all devices registered with the bus
-type. This includes all devices on all instances of that bus type.
-An internal list that the bus uses may be removed, in favor of using
-this one.
-
-The core provides an iterator to access these devices::
-
-  int bus_for_each_dev(struct bus_type * bus, struct device * start,
-                       void * data, int (*fn)(struct device *, void *));
-
-
-- Driver list.
-
-struct bus_type also contains a list of all drivers registered with
-it. An internal list of drivers that the bus driver maintains may
-be removed in favor of using the generic one.
-
-The drivers may be iterated over, like devices::
-
-  int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
-                       void * data, int (*fn)(struct device_driver *, void *));
-
-
-Please see drivers/base/bus.c for more information.
-
-
-- rwsem
-
-struct bus_type contains an rwsem that protects all core accesses to
-the device and driver lists. This can be used by the bus driver
-internally, and should be used when accessing the device or driver
-lists the bus maintains.
-
-
-- Device and driver fields.
-
-Some of the fields in struct device and struct device_driver duplicate
-fields in the bus-specific representations of these objects. Feel free
-to remove the bus-specific ones and favor the generic ones. Note
-though, that this will likely mean fixing up all the drivers that
-reference the bus-specific fields (though those should all be 1-line
-changes).
diff --git a/Documentation/eisa.txt b/Documentation/eisa.txt
index f388545a85a7..c07565ba57da 100644
--- a/Documentation/eisa.txt
+++ b/Documentation/eisa.txt
@@ -103,7 +103,7 @@ id_table	an array of NULL terminated EISA id strings,
 		(driver_data).
 
 driver		a generic driver, such as described in
-		Documentation/driver-model/driver.rst. Only .name,
+		Documentation/driver-api/driver-model/driver.rst. Only .name,
 		.probe and .remove members are mandatory.
 =============== ====================================================
 
@@ -152,7 +152,7 @@ state    set of flags indicating the state of the device. Current
 	 flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED.
 res	 set of four 256 bytes I/O regions allocated to this device
 dma_mask DMA mask set from the parent device.
-dev	 generic device (see Documentation/driver-model/device.rst)
+dev	 generic device (see Documentation/driver-api/driver-model/device.rst)
 ======== ============================================================
 
 You can get the 'struct eisa_device' from 'struct device' using the
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 5b5311f9358d..ddf15b1b0d5a 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -319,7 +319,7 @@ quick way to lookup the sysfs interface for a device from the result of
 a stat(2) operation.
 
 More information can driver-model specific features can be found in
-Documentation/driver-model/. 
+Documentation/driver-api/driver-model/.
 
 
 TODO: Finish this section.
diff --git a/Documentation/hwmon/submitting-patches.rst b/Documentation/hwmon/submitting-patches.rst
index d5b05d3e54ba..452fc28d8e0b 100644
--- a/Documentation/hwmon/submitting-patches.rst
+++ b/Documentation/hwmon/submitting-patches.rst
@@ -89,7 +89,7 @@ increase the chances of your change being accepted.
   console. Excessive logging can seriously affect system performance.
 
 * Use devres functions whenever possible to allocate resources. For rationale
-  and supported functions, please see Documentation/driver-model/devres.rst.
+  and supported functions, please see Documentation/driver-api/driver-model/devres.rst.
   If a function is not supported by devres, consider using devm_add_action().
 
 * If the driver has a detect function, make sure it is silent. Debug messages
diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt
index 452271dda141..ee1f37da5b23 100644
--- a/Documentation/translations/zh_CN/filesystems/sysfs.txt
+++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt
@@ -288,7 +288,7 @@ dev/ 包含两个子目录： char/ 和 block/。在这两个子目录中，有
 中相应的设备。/sys/dev 提供一个通过一个 stat(2) 操作结果，查找
 设备 sysfs 接口快捷的方法。
 
-更多有关 driver-model 的特性信息可以在 Documentation/driver-model/
+更多有关 driver-model 的特性信息可以在 Documentation/driver-api/driver-model/
 中找到。
 
 
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 713903290385..506a0175a5a7 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -5,7 +5,7 @@
  * Copyright (c) 2002-3 Patrick Mochel
  * Copyright (c) 2002-3 Open Source Development Labs
  *
- * Please see Documentation/driver-model/platform.rst for more
+ * Please see Documentation/driver-api/driver-model/platform.rst for more
  * information.
  */
 
diff --git a/drivers/gpio/gpio-cs5535.c b/drivers/gpio/gpio-cs5535.c
index 3611a0571667..53b24e3ae7de 100644
--- a/drivers/gpio/gpio-cs5535.c
+++ b/drivers/gpio/gpio-cs5535.c
@@ -41,7 +41,7 @@ MODULE_PARM_DESC(mask, "GPIO channel mask.");
 
 /*
  * FIXME: convert this singleton driver to use the state container
- * design pattern, see Documentation/driver-model/design-patterns.rst
+ * design pattern, see Documentation/driver-api/driver-model/design-patterns.rst
  */
 static struct cs5535_gpio_chip {
 	struct gpio_chip chip;
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 41c90f2ddb31..63db08d9bafa 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -2286,7 +2286,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
 	struct ice_hw *hw;
 	int err;
 
-	/* this driver uses devres, see Documentation/driver-model/devres.rst */
+	/* this driver uses devres, see Documentation/driver-api/driver-model/devres.rst */
 	err = pcim_enable_device(pdev);
 	if (err)
 		return err;
diff --git a/drivers/staging/unisys/Documentation/overview.txt b/drivers/staging/unisys/Documentation/overview.txt
index 9ab30af265a5..f8a4144b239c 100644
--- a/drivers/staging/unisys/Documentation/overview.txt
+++ b/drivers/staging/unisys/Documentation/overview.txt
@@ -15,7 +15,7 @@ normally be unsharable, specifically:
 * visorinput - keyboard and mouse
 
 These drivers conform to the standard Linux bus/device model described
-within Documentation/driver-model/, and utilize a driver named visorbus to
+within Documentation/driver-api/driver-model/, and utilize a driver named visorbus to
 present the virtual busses involved. Drivers in the 'visor*' driver set are
 commonly referred to as "guest drivers" or "client drivers".  All drivers
 except visorbus expose a device of a specific usable class to the Linux guest
@@ -141,7 +141,7 @@ called automatically by the visorbus driver at appropriate times:
 -----------------------------------
 
 Because visorbus is a standard Linux bus driver in the model described in
-Documentation/driver-model/, the hierarchy of s-Par virtual devices is
+Documentation/driver-api/driver-model/, the hierarchy of s-Par virtual devices is
 published in the sysfs tree beneath /bus/visorbus/, e.g.,
 /sys/bus/visorbus/devices/ might look like:
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 5eabfa0c4dee..c330b75c6c57 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -6,7 +6,7 @@
  * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
  * Copyright (c) 2008-2009 Novell Inc.
  *
- * See Documentation/driver-model/ for more information.
+ * See Documentation/driver-api/driver-model/ for more information.
  */
 
 #ifndef _DEVICE_H_
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index beb25f277889..9bc36b589827 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -4,7 +4,7 @@
  *
  * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
  *
- * See Documentation/driver-model/ for more information.
+ * See Documentation/driver-api/driver-model/ for more information.
  */
 
 #ifndef _PLATFORM_DEVICE_H_
diff --git a/scripts/coccinelle/free/devm_free.cocci b/scripts/coccinelle/free/devm_free.cocci
index fefd0331a2de..441799b5359b 100644
--- a/scripts/coccinelle/free/devm_free.cocci
+++ b/scripts/coccinelle/free/devm_free.cocci
@@ -3,7 +3,7 @@
 /// functions.  Values allocated using the devm_functions are freed when
 /// the device is detached, and thus the use of the standard freeing
 /// function would cause a double free.
-/// See Documentation/driver-model/devres.rst for more information.
+/// See Documentation/driver-api/driver-model/devres.rst for more information.
 ///
 /// A difficulty of detecting this problem is that the standard freeing
 /// function might be called from a different function than the one
-- 
cgit v1.2.3


From da82c92f1150f66afabf78d2c85ef9ac18dc6d38 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Thu, 27 Jun 2019 13:08:35 -0300
Subject: docs: cgroup-v1: add it to the admin-guide book

Those files belong to the admin guide, so add them.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 .../admin-guide/cgroup-v1/blkio-controller.rst     |  302 ++++++
 Documentation/admin-guide/cgroup-v1/cgroups.rst    |  695 ++++++++++++++
 Documentation/admin-guide/cgroup-v1/cpuacct.rst    |   50 +
 Documentation/admin-guide/cgroup-v1/cpusets.rst    |  866 +++++++++++++++++
 Documentation/admin-guide/cgroup-v1/devices.rst    |  132 +++
 .../admin-guide/cgroup-v1/freezer-subsystem.rst    |  127 +++
 Documentation/admin-guide/cgroup-v1/hugetlb.rst    |   50 +
 Documentation/admin-guide/cgroup-v1/index.rst      |   28 +
 Documentation/admin-guide/cgroup-v1/memcg_test.rst |  355 +++++++
 Documentation/admin-guide/cgroup-v1/memory.rst     | 1003 ++++++++++++++++++++
 Documentation/admin-guide/cgroup-v1/net_cls.rst    |   44 +
 Documentation/admin-guide/cgroup-v1/net_prio.rst   |   57 ++
 Documentation/admin-guide/cgroup-v1/pids.rst       |   92 ++
 Documentation/admin-guide/cgroup-v1/rdma.rst       |  117 +++
 Documentation/admin-guide/cgroup-v2.rst            |    2 +-
 Documentation/admin-guide/index.rst                |    1 +
 Documentation/admin-guide/kernel-parameters.txt    |    4 +-
 .../admin-guide/mm/numa_memory_policy.rst          |    2 +-
 Documentation/block/bfq-iosched.rst                |    2 +-
 Documentation/cgroup-v1/blkio-controller.rst       |  302 ------
 Documentation/cgroup-v1/cgroups.rst                |  695 --------------
 Documentation/cgroup-v1/cpuacct.rst                |   50 -
 Documentation/cgroup-v1/cpusets.rst                |  866 -----------------
 Documentation/cgroup-v1/devices.rst                |  132 ---
 Documentation/cgroup-v1/freezer-subsystem.rst      |  127 ---
 Documentation/cgroup-v1/hugetlb.rst                |   50 -
 Documentation/cgroup-v1/index.rst                  |   30 -
 Documentation/cgroup-v1/memcg_test.rst             |  355 -------
 Documentation/cgroup-v1/memory.rst                 | 1003 --------------------
 Documentation/cgroup-v1/net_cls.rst                |   44 -
 Documentation/cgroup-v1/net_prio.rst               |   57 --
 Documentation/cgroup-v1/pids.rst                   |   92 --
 Documentation/cgroup-v1/rdma.rst                   |  117 ---
 Documentation/filesystems/tmpfs.txt                |    2 +-
 Documentation/kernel-per-CPU-kthreads.txt          |    2 +-
 Documentation/scheduler/sched-deadline.rst         |    2 +-
 Documentation/scheduler/sched-design-CFS.rst       |    2 +-
 Documentation/scheduler/sched-rt-group.rst         |    2 +-
 Documentation/vm/numa.rst                          |    4 +-
 Documentation/vm/page_migration.rst                |    2 +-
 Documentation/vm/unevictable-lru.rst               |    2 +-
 Documentation/x86/x86_64/fake-numa-for-cpusets.rst |    4 +-
 MAINTAINERS                                        |    4 +-
 block/Kconfig                                      |    2 +-
 include/linux/cgroup-defs.h                        |    2 +-
 include/uapi/linux/bpf.h                           |    2 +-
 init/Kconfig                                       |    4 +-
 kernel/cgroup/cpuset.c                             |    2 +-
 security/device_cgroup.c                           |    2 +-
 tools/include/uapi/linux/bpf.h                     |    2 +-
 50 files changed, 3945 insertions(+), 3946 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/blkio-controller.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/cgroups.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/cpuacct.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/cpusets.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/devices.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/hugetlb.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/index.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/memcg_test.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/memory.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/net_cls.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/net_prio.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/pids.rst
 create mode 100644 Documentation/admin-guide/cgroup-v1/rdma.rst
 delete mode 100644 Documentation/cgroup-v1/blkio-controller.rst
 delete mode 100644 Documentation/cgroup-v1/cgroups.rst
 delete mode 100644 Documentation/cgroup-v1/cpuacct.rst
 delete mode 100644 Documentation/cgroup-v1/cpusets.rst
 delete mode 100644 Documentation/cgroup-v1/devices.rst
 delete mode 100644 Documentation/cgroup-v1/freezer-subsystem.rst
 delete mode 100644 Documentation/cgroup-v1/hugetlb.rst
 delete mode 100644 Documentation/cgroup-v1/index.rst
 delete mode 100644 Documentation/cgroup-v1/memcg_test.rst
 delete mode 100644 Documentation/cgroup-v1/memory.rst
 delete mode 100644 Documentation/cgroup-v1/net_cls.rst
 delete mode 100644 Documentation/cgroup-v1/net_prio.rst
 delete mode 100644 Documentation/cgroup-v1/pids.rst
 delete mode 100644 Documentation/cgroup-v1/rdma.rst

(limited to 'include')

diff --git a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst
new file mode 100644
index 000000000000..1d7d962933be
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst
@@ -0,0 +1,302 @@
+===================
+Block IO Controller
+===================
+
+Overview
+========
+cgroup subsys "blkio" implements the block io controller. There seems to be
+a need of various kinds of IO control policies (like proportional BW, max BW)
+both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
+Plan is to use the same cgroup based management interface for blkio controller
+and based on user options switch IO policies in the background.
+
+One IO control policy is throttling policy which can be used to
+specify upper IO rate limits on devices. This policy is implemented in
+generic block layer and can be used on leaf nodes as well as higher
+level logical devices like device mapper.
+
+HOWTO
+=====
+Throttling/Upper Limit policy
+-----------------------------
+- Enable Block IO controller::
+
+	CONFIG_BLK_CGROUP=y
+
+- Enable throttling in block layer::
+
+	CONFIG_BLK_DEV_THROTTLING=y
+
+- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)::
+
+        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
+
+- Specify a bandwidth rate on particular device for root group. The format
+  for policy is "<major>:<minor>  <bytes_per_second>"::
+
+        echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
+
+  Above will put a limit of 1MB/second on reads happening for root group
+  on device having major/minor number 8:16.
+
+- Run dd to read a file and see if rate is throttled to 1MB/s or not::
+
+        # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
+        1024+0 records in
+        1024+0 records out
+        4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
+
+ Limits for writes can be put using blkio.throttle.write_bps_device file.
+
+Hierarchical Cgroups
+====================
+
+Throttling implements hierarchy support; however,
+throttling's hierarchy support is enabled iff "sane_behavior" is
+enabled from cgroup side, which currently is a development option and
+not publicly available.
+
+If somebody created a hierarchy like as follows::
+
+			root
+			/  \
+		     test1 test2
+			|
+		     test3
+
+Throttling with "sane_behavior" will handle the
+hierarchy correctly. For throttling, all limits apply
+to the whole subtree while all statistics are local to the IOs
+directly generated by tasks in that cgroup.
+
+Throttling without "sane_behavior" enabled from cgroup side will
+practically treat all groups at same level as if it looks like the
+following::
+
+				pivot
+			     /  /   \  \
+			root  test1 test2  test3
+
+Various user visible config options
+===================================
+CONFIG_BLK_CGROUP
+	- Block IO controller.
+
+CONFIG_BFQ_CGROUP_DEBUG
+	- Debug help. Right now some additional stats file show up in cgroup
+	  if this option is enabled.
+
+CONFIG_BLK_DEV_THROTTLING
+	- Enable block device throttling support in block layer.
+
+Details of cgroup files
+=======================
+Proportional weight policy files
+--------------------------------
+- blkio.weight
+	- Specifies per cgroup weight. This is default weight of the group
+	  on all the devices until and unless overridden by per device rule.
+	  (See blkio.weight_device).
+	  Currently allowed range of weights is from 10 to 1000.
+
+- blkio.weight_device
+	- One can specify per cgroup per device rules using this interface.
+	  These rules override the default value of group weight as specified
+	  by blkio.weight.
+
+	  Following is the format::
+
+	    # echo dev_maj:dev_minor weight > blkio.weight_device
+
+	  Configure weight=300 on /dev/sdb (8:16) in this cgroup::
+
+	    # echo 8:16 300 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:16    300
+
+	  Configure weight=500 on /dev/sda (8:0) in this cgroup::
+
+	    # echo 8:0 500 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:0     500
+	    8:16    300
+
+	  Remove specific weight for /dev/sda in this cgroup::
+
+	    # echo 8:0 0 > blkio.weight_device
+	    # cat blkio.weight_device
+	    dev     weight
+	    8:16    300
+
+- blkio.leaf_weight[_device]
+	- Equivalents of blkio.weight[_device] for the purpose of
+          deciding how much weight tasks in the given cgroup has while
+          competing with the cgroup's child cgroups. For details,
+          please refer to Documentation/block/cfq-iosched.txt.
+
+- blkio.time
+	- disk time allocated to cgroup per device in milliseconds. First
+	  two fields specify the major and minor number of the device and
+	  third field specifies the disk time allocated to group in
+	  milliseconds.
+
+- blkio.sectors
+	- number of sectors transferred to/from disk by the group. First
+	  two fields specify the major and minor number of the device and
+	  third field specifies the number of sectors transferred by the
+	  group to/from the device.
+
+- blkio.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+- blkio.io_serviced
+	- Number of IOs (bio) issued to the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.io_service_time
+	- Total amount of time between request dispatch and request completion
+	  for the IOs done by this cgroup. This is in nanoseconds to make it
+	  meaningful for flash devices too. For devices with queue depth of 1,
+	  this time represents the actual service time. When queue_depth > 1,
+	  that is no longer true as requests may be served out of order. This
+	  may cause the service time for a given IO to include the service time
+	  of multiple IOs when served out of order which may result in total
+	  io_service_time > actual time elapsed. This time is further divided by
+	  the type of operation - read or write, sync or async. First two fields
+	  specify the major and minor number of the device, third field
+	  specifies the operation type and the fourth field specifies the
+	  io_service_time in ns.
+
+- blkio.io_wait_time
+	- Total amount of time the IOs for this cgroup spent waiting in the
+	  scheduler queues for service. This can be greater than the total time
+	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
+	  measure of total time the cgroup spent waiting but rather a measure of
+	  the wait_time for its individual IOs. For devices with queue_depth > 1
+	  this metric does not include the time spent waiting for service once
+	  the IO is dispatched to the device but till it actually gets serviced
+	  (there might be a time lag here due to re-ordering of requests by the
+	  device). This is in nanoseconds to make it meaningful for flash
+	  devices too. This time is further divided by the type of operation -
+	  read or write, sync or async. First two fields specify the major and
+	  minor number of the device, third field specifies the operation type
+	  and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+	- Total number of bios/requests merged into requests belonging to this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.io_queued
+	- Total number of requests queued up at any given instant for this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.avg_queue_size
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  The average queue size for this cgroup over the entire time of this
+	  cgroup's existence. Queue size samples are taken each time one of the
+	  queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  This is the amount of time the cgroup had to wait since it became busy
+	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+	  its queues. This is different from the io_wait_time which is the
+	  cumulative total of the amount of time spent by each IO in that cgroup
+	  waiting in the scheduler queue. This is in nanoseconds. If this is
+	  read when the cgroup is in a waiting (for timeslice) state, the stat
+	  will only report the group_wait_time accumulated till the last time it
+	  got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  This is the amount of time a cgroup spends without any pending
+	  requests when not being served, i.e., it does not include any time
+	  spent idling for one of the queues of the cgroup. This is in
+	  nanoseconds. If this is read when the cgroup is in an empty state,
+	  the stat will only report the empty_time accumulated till the last
+	  time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+	  This is the amount of time spent by the IO scheduler idling for a
+	  given cgroup in anticipation of a better request than the existing ones
+	  from other queues/cgroups. This is in nanoseconds. If this is read
+	  when the cgroup is in an idling state, the stat will only report the
+	  idle_time accumulated till the last idle period and will not include
+	  the current delta.
+
+- blkio.dequeue
+	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This
+	  gives the statistics about how many a times a group was dequeued
+	  from service tree of the device. First two fields specify the major
+	  and minor number of the device and third field specifies the number
+	  of times a group was dequeued from a particular device.
+
+- blkio.*_recursive
+	- Recursive version of various stats. These files show the
+          same information as their non-recursive counterparts but
+          include stats from all the descendant cgroups.
+
+Throttling/Upper limit policy files
+-----------------------------------
+- blkio.throttle.read_bps_device
+	- Specifies upper limit on READ rate from the device. IO rate is
+	  specified in bytes per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
+
+- blkio.throttle.write_bps_device
+	- Specifies upper limit on WRITE rate to the device. IO rate is
+	  specified in bytes per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
+
+- blkio.throttle.read_iops_device
+	- Specifies upper limit on READ rate from the device. IO rate is
+	  specified in IO per second. Rules are per device. Following is
+	  the format::
+
+	   echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
+
+- blkio.throttle.write_iops_device
+	- Specifies upper limit on WRITE rate to the device. IO rate is
+	  specified in io per second. Rules are per device. Following is
+	  the format::
+
+	    echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
+
+Note: If both BW and IOPS rules are specified for a device, then IO is
+      subjected to both the constraints.
+
+- blkio.throttle.io_serviced
+	- Number of IOs (bio) issued to the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.throttle.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+Common files among various policies
+-----------------------------------
+- blkio.reset_stats
+	- Writing an int to this file will result in resetting all the stats
+	  for that cgroup.
diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst
new file mode 100644
index 000000000000..b0688011ed06
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst
@@ -0,0 +1,695 @@
+==============
+Control Groups
+==============
+
+Written by Paul Menage <menage@google.com> based on
+Documentation/admin-guide/cgroup-v1/cpusets.rst
+
+Original copyright statements from cpusets.txt:
+
+Portions Copyright (C) 2004 BULL SA.
+
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+
+Modified by Paul Jackson <pj@sgi.com>
+
+Modified by Christoph Lameter <cl@linux.com>
+
+.. CONTENTS:
+
+	1. Control Groups
+	1.1 What are cgroups ?
+	1.2 Why are cgroups needed ?
+	1.3 How are cgroups implemented ?
+	1.4 What does notify_on_release do ?
+	1.5 What does clone_children do ?
+	1.6 How do I use cgroups ?
+	2. Usage Examples and Syntax
+	2.1 Basic Usage
+	2.2 Attaching processes
+	2.3 Mounting hierarchies by name
+	3. Kernel API
+	3.1 Overview
+	3.2 Synchronization
+	3.3 Subsystem API
+	4. Extended attributes usage
+	5. Questions
+
+1. Control Groups
+=================
+
+1.1 What are cgroups ?
+----------------------
+
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+
+Definitions:
+
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy.  Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+
+At any one time there may be multiple active hierarchies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+
+User-level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task PIDs assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/admin-guide/cgroup-v1/cpusets.rst) allow
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource-tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+up in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines::
+
+       CPU :          "Top cpuset"
+                       /       \
+               CPUSet1         CPUSet2
+                  |               |
+               (Professors)    (Students)
+
+               In addition (system tasks) are attached to topcpuset (so
+               that they can run anywhere) with a limit of 20%
+
+       Memory : Professors (50%), Students (30%), system (20%)
+
+       Disk : Professors (50%), Students (30%), system (20%)
+
+       Network : WWW browsing (20%), Network File System (60%), others (20%)
+                               / \
+               Professors (15%)  students (5%)
+
+Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
+into the NFS network class.
+
+At the same time Firefox/Lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies),
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can::
+
+    # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
+
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with
+appropriate network and other resource class.  This may lead to
+proliferation of such cgroups.
+
+Also let's say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :))  OR give one of the student's simulation
+apps enhanced CPU power.
+
+With ability to write PIDs directly to resource classes, it's just a
+matter of::
+
+       # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
+       (after some time)
+       # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
+
+Without this ability, the administrator would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extends the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+   css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+   cgroup_subsys_state objects, one for each cgroup subsystem
+   registered in the system. There is no direct link from a task to
+   the cgroup of which it's a member in each hierarchy, but this
+   can be determined by following pointers through the
+   cgroup_subsys_state objects. This is because accessing the
+   subsystem state is something that's expected to happen frequently
+   and in performance-critical code, whereas operations that require a
+   task's actual cgroup assignments (in particular, moving between
+   cgroups) are less common. A linked list runs through the cg_list
+   field of each task_struct using the css_set, anchored at
+   css_set->tasks.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+   manipulation from user space.
+
+ - You can list all the tasks (by PID) attached to any cgroup.
+
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance-critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+   css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition, a new file system of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel.  When mounting a cgroup hierarchy, you may specify a
+comma-separated list of subsystems to mount as the filesystem mount
+options.  By default, mounting the cgroup filesystem attempts to
+mount a hierarchy containing all registered subsystems.
+
+If an active hierarchy with exactly the same set of subsystems already
+exists, it will be reused for the new mount. If no existing hierarchy
+matches, and any of the requested subsystems are in use in an existing
+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
+is activated, associated with the requested subsystems.
+
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
+
+When a cgroup filesystem is unmounted, if there are any
+child cgroups created below the top-level cgroup, that hierarchy
+will remain active even though unmounted; if there are no
+child cgroups then the hierarchy will be deactivated.
+
+No new system calls are added for cgroups - all support for
+querying and modifying cgroups is via this cgroup file system.
+
+Each task under /proc has an added file named 'cgroup' displaying,
+for each active hierarchy, the subsystem names and the cgroup name
+as the path relative to the root of the cgroup file system.
+
+Each cgroup is represented by a directory in the cgroup file system
+containing the following files describing that cgroup:
+
+ - tasks: list of tasks (by PID) attached to that cgroup.  This list
+   is not guaranteed to be sorted.  Writing a thread ID into this file
+   moves the thread into this cgroup.
+ - cgroup.procs: list of thread group IDs in the cgroup.  This list is
+   not guaranteed to be sorted or free of duplicate TGIDs, and userspace
+   should sort/uniquify the list if this property is required.
+   Writing a thread group ID into this file moves all threads in that
+   group into this cgroup.
+ - notify_on_release flag: run the release agent on exit?
+ - release_agent: the path to use for release notifications (this file
+   exists in the top cgroup only)
+
+Other subsystems such as cpusets may add additional files in each
+cgroup dir.
+
+New cgroups are created using the mkdir system call or shell
+command.  The properties of a cgroup, such as its flags, are
+modified by writing to the appropriate file in that cgroups
+directory, as listed above.
+
+The named hierarchical structure of nested cgroups allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cgroup allows organizing the work load
+on a system into related sets of tasks.  A task may be re-attached to
+any other cgroup, if allowed by the permissions on the necessary
+cgroup file system directories.
+
+When a task is moved from one cgroup to another, it gets a new
+css_set pointer - if there's an already existing css_set with the
+desired collection of cgroups then that group is reused, otherwise a new
+css_set is allocated. The appropriate existing css_set is located by
+looking into a hash table.
+
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cgrp_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+
+The use of a Linux virtual file system (vfs) to represent the
+cgroup hierarchy provides for a familiar permission and name space
+for cgroups, with a minimum of additional kernel code.
+
+1.4 What does notify_on_release do ?
+------------------------------------
+
+If the notify_on_release flag is enabled (1) in a cgroup, then
+whenever the last task in the cgroup leaves (exits or attaches to
+some other cgroup) and the last child cgroup of that cgroup
+is removed, then the kernel runs the command specified by the contents
+of the "release_agent" file in that hierarchy's root directory,
+supplying the pathname (relative to the mount point of the cgroup
+file system) of the abandoned cgroup.  This enables automatic
+removal of abandoned cgroups.  The default value of
+notify_on_release in the root cgroup at system boot is disabled
+(0).  The default value of other cgroups at creation is the current
+value of their parents' notify_on_release settings. The default value of
+a cgroup hierarchy's release_agent path is empty.
+
+1.5 What does clone_children do ?
+---------------------------------
+
+This flag only affects the cpuset controller. If the clone_children
+flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
+configuration from the parent during initialization.
+
+1.6 How do I use cgroups ?
+--------------------------
+
+To start a new job that is to be contained within a cgroup, using
+the "cpuset" cgroup subsystem, the steps are something like::
+
+ 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
+ 2) mkdir /sys/fs/cgroup/cpuset
+ 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
+    the /sys/fs/cgroup/cpuset virtual file system.
+ 5) Start a task that will be the "founding father" of the new job.
+ 6) Attach that task to the new cgroup by writing its PID to the
+    /sys/fs/cgroup/cpuset tasks file for that cgroup.
+ 7) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will setup a cgroup
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cgroup::
+
+  mount -t tmpfs cgroup_root /sys/fs/cgroup
+  mkdir /sys/fs/cgroup/cpuset
+  mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
+  cd /sys/fs/cgroup/cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cgroup Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cgroup
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using cgroups can be done through the cgroup
+virtual filesystem.
+
+To mount a cgroup hierarchy with all available subsystems, type::
+
+  # mount -t cgroup xxx /sys/fs/cgroup
+
+The "xxx" is not interpreted by the cgroup code, but will appear in
+/proc/mounts so may be any useful identifying string that you like.
+
+Note: Some subsystems do not work without some user input first.  For instance,
+if cpusets are enabled the user will have to populate the cpus and mems files
+for each new cgroup created before that group can be used.
+
+As explained in section `1.2 Why are cgroups needed?` you should create
+different hierarchies of cgroups for each single resource or group of
+resources you want to control. Therefore, you should mount a tmpfs on
+/sys/fs/cgroup and create directories for each cgroup resource or resource
+group::
+
+  # mount -t tmpfs cgroup_root /sys/fs/cgroup
+  # mkdir /sys/fs/cgroup/rg1
+
+To mount a cgroup hierarchy with just the cpuset and memory
+subsystems, type::
+
+  # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
+
+While remounting cgroups is currently supported, it is not recommend
+to use it. Remounting allows changing bound subsystems and
+release_agent. Rebinding is hardly useful as it only works when the
+hierarchy is empty and release_agent itself should be replaced with
+conventional fsnotify. The support for remounting will be removed in
+the future.
+
+To Specify a hierarchy's release_agent::
+
+  # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
+    xxx /sys/fs/cgroup/rg1
+
+Note that specifying 'release_agent' more than once will return failure.
+
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
+
+Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
+tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
+is the cgroup that holds the whole system.
+
+If you want to change the value of release_agent::
+
+  # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
+
+It can also be changed via remount.
+
+If you want to create a new cgroup under /sys/fs/cgroup/rg1::
+
+  # cd /sys/fs/cgroup/rg1
+  # mkdir my_cgroup
+
+Now you want to do something with this cgroup:
+
+  # cd my_cgroup
+
+In this directory you can find several files::
+
+  # ls
+  cgroup.procs notify_on_release tasks
+  (plus whatever files added by the attached subsystems)
+
+Now attach your shell to this cgroup::
+
+  # /bin/echo $$ > tasks
+
+You can also create cgroups inside your cgroup by using mkdir in this
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir::
+
+  # rmdir my_sub_cs
+
+This will fail if the cgroup is in use (has cgroups inside, or
+has processes attached, or is held alive by other subsystem-specific
+reference).
+
+2.2 Attaching processes
+-----------------------
+
+::
+
+  # /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another::
+
+  # /bin/echo PID1 > tasks
+  # /bin/echo PID2 > tasks
+	  ...
+  # /bin/echo PIDn > tasks
+
+You can attach the current shell task by echoing 0::
+
+  # echo 0 > tasks
+
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the PID of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
+Note: Since every task is always a member of exactly one cgroup in each
+mounted hierarchy, to remove a task from its current cgroup you must
+move it into a new cgroup (possibly the root cgroup) by writing to the
+new cgroup's tasks file.
+
+Note: Due to some restrictions enforced by some cgroup subsystems, moving
+a process to another cgroup can fail.
+
+2.3 Mounting hierarchies by name
+--------------------------------
+
+Passing the name=<x> option when mounting a cgroups hierarchy
+associates the given name with the hierarchy.  This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems.  Each hierarchy is either
+nameless, or has a unique name.
+
+The name should match [\w.-]+
+
+When passing a name=<x> option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a subsystem a name.
+
+The name of the subsystem appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroups.
+
+
+3. Kernel API
+=============
+
+3.1 Overview
+------------
+
+Each kernel subsystem that wants to hook into the generic cgroup
+system needs to create a cgroup_subsys object. This contains
+various methods, which are callbacks from the cgroup system, along
+with a subsystem ID which will be assigned by the cgroup system.
+
+Other fields in the cgroup_subsys object include:
+
+- subsys_id: a unique array index for the subsystem, indicating which
+  entry in cgroup->subsys[] this subsystem should be managing.
+
+- name: should be initialized to a unique subsystem name. Should be
+  no longer than MAX_CGROUP_TYPE_NAMELEN.
+
+- early_init: indicate if the subsystem needs early initialization
+  at system boot.
+
+Each cgroup object created by the system has an array of pointers,
+indexed by subsystem ID; this pointer is entirely managed by the
+subsystem; the generic cgroup code will never touch this pointer.
+
+3.2 Synchronization
+-------------------
+
+There is a global mutex, cgroup_mutex, used by the cgroup
+system. This should be taken by anything that wants to modify a
+cgroup. It may also be taken to prevent cgroups from being
+modified, but more specific locks may be more appropriate in that
+situation.
+
+See kernel/cgroup.c for more details.
+
+Subsystems can take/release the cgroup_mutex via the functions
+cgroup_lock()/cgroup_unlock().
+
+Accessing a task's cgroup pointer may be done in the following ways:
+- while holding cgroup_mutex
+- while holding the task's alloc_lock (via task_lock())
+- inside an rcu_read_lock() section via rcu_dereference()
+
+3.3 Subsystem API
+-----------------
+
+Each subsystem should:
+
+- add an entry in linux/cgroup_subsys.h
+- define a cgroup_subsys object called <name>_cgrp_subsys
+
+Each subsystem may export the following methods. The only mandatory
+methods are css_alloc/free. Any others that are null are presumed to
+be successful no-ops.
+
+``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+Called to allocate a subsystem state object for a cgroup. The
+subsystem should allocate its subsystem state object for the passed
+cgroup, returning a pointer to the new object on success or a
+ERR_PTR() value. On success, the subsystem pointer should point to
+a structure of type cgroup_subsys_state (typically embedded in a
+larger subsystem-specific object), which will be initialized by the
+cgroup system. Note that this will be called at initialization to
+create the root subsystem state for this subsystem; this case can be
+identified by the passed cgroup object having a NULL parent (since
+it's the root of the hierarchy) and may be an appropriate place for
+initialization code.
+
+``int css_online(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+Called after @cgrp successfully completed all allocations and made
+visible to cgroup_for_each_child/descendant_*() iterators. The
+subsystem may choose to fail creation by returning -errno. This
+callback can be used to implement reliable state sharing and
+propagation along the hierarchy. See the comment on
+cgroup_for_each_descendant_pre() for details.
+
+``void css_offline(struct cgroup *cgrp);``
+(cgroup_mutex held by caller)
+
+This is the counterpart of css_online() and called iff css_online()
+has succeeded on @cgrp. This signifies the beginning of the end of
+@cgrp. @cgrp is being removed and the subsystem should start dropping
+all references it's holding on @cgrp. When all references are dropped,
+cgroup removal will proceed to the next step - css_free(). After this
+callback, @cgrp should be considered dead to the subsystem.
+
+``void css_free(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+The cgroup system is about to free @cgrp; the subsystem should free
+its subsystem state object. By the time this method is called, @cgrp
+is completely unused; @cgrp->parent is still valid. (Note - can also
+be called for a newly-created cgroup if an error occurs after this
+subsystem's create() method has been called for the new cgroup).
+
+``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called prior to moving one or more tasks into a cgroup; if the
+subsystem returns an error, this will abort the attach operation.
+@tset contains the tasks to be attached and is guaranteed to have at
+least one task in it.
+
+If there are multiple tasks in the taskset, then:
+  - it's guaranteed that all are from the same thread group
+  - @tset contains all tasks from the thread group whether or not
+    they're switching cgroups
+  - the first task is the leader
+
+Each @tset entry also contains the task's old cgroup and tasks which
+aren't switching cgroup can be skipped easily using the
+cgroup_taskset_for_each() iterator. Note that this isn't called on a
+fork. If this method returns 0 (success) then this should remain valid
+while the caller holds cgroup_mutex and it is ensured that either
+attach() or cancel_attach() will be called in future.
+
+``void css_reset(struct cgroup_subsys_state *css)``
+(cgroup_mutex held by caller)
+
+An optional operation which should restore @css's configuration to the
+initial state.  This is currently only used on the unified hierarchy
+when a subsystem is disabled on a cgroup through
+"cgroup.subtree_control" but should remain enabled because other
+subsystems depend on it.  cgroup core makes such a css invisible by
+removing the associated interface files and invokes this callback so
+that the hidden subsystem can return to the initial neutral state.
+This prevents unexpected resource control from a hidden css and
+ensures that the configuration is in the initial state when it is made
+visible again later.
+
+``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called when a task attach operation has failed after can_attach() has succeeded.
+A subsystem whose can_attach() has some side-effects should provide this
+function, so that the subsystem can implement a rollback. If not, not necessary.
+This will be called only about subsystems whose can_attach() operation have
+succeeded. The parameters are identical to can_attach().
+
+``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called after the task has been attached to the cgroup, to allow any
+post-attachment activity that requires memory allocations or blocking.
+The parameters are identical to can_attach().
+
+``void fork(struct task_struct *task)``
+
+Called when a task is forked into a cgroup.
+
+``void exit(struct task_struct *task)``
+
+Called during task exit.
+
+``void free(struct task_struct *task)``
+
+Called when the task_struct is freed.
+
+``void bind(struct cgroup *root)``
+(cgroup_mutex held by caller)
+
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
+
+4. Extended attribute usage
+===========================
+
+cgroup filesystem supports certain types of extended attributes in its
+directories and files.  The current supported types are:
+
+	- Trusted (XATTR_TRUSTED)
+	- Security (XATTR_SECURITY)
+
+Both require CAP_SYS_ADMIN capability to set.
+
+Like in tmpfs, the extended attributes in cgroup filesystem are stored
+using kernel memory and it's advised to keep the usage at minimum.  This
+is the reason why user defined extended attributes are not supported, since
+any user can do it and there's no limit in the value size.
+
+The current known users for this feature are SELinux to limit cgroup usage
+in containers and systemd for assorted meta data like main PID in a cgroup
+(systemd creates a cgroup per service).
+
+5. Questions
+============
+
+::
+
+  Q: what's up with this '/bin/echo' ?
+  A: bash's builtin 'echo' command does not check calls to write() against
+     errors. If you use it in the cgroup file system, you won't be
+     able to tell whether a command succeeded or failed.
+
+  Q: When I attach processes, only the first of the line gets really attached !
+  A: We can only return one error code per call to write(). So you should also
+     put only ONE PID.
diff --git a/Documentation/admin-guide/cgroup-v1/cpuacct.rst b/Documentation/admin-guide/cgroup-v1/cpuacct.rst
new file mode 100644
index 000000000000..d30ed81d2ad7
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cpuacct.rst
@@ -0,0 +1,50 @@
+=========================
+CPU Accounting Controller
+=========================
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem::
+
+  # mount -t cgroup -ocpuacct none /sys/fs/cgroup
+
+With the above step, the initial or the parent accounting group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
+by this group which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /sys/fs/cgroup::
+
+  # cd /sys/fs/cgroup
+  # mkdir g1
+  # echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage and the same is accumulated in
+/sys/fs/cgroup/cpuacct.usage also.
+
+cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+user: Time spent by tasks of the cgroup in user mode.
+system: Time spent by tasks of the cgroup in kernel mode.
+
+user and system are in USER_HZ unit.
+
+cpuacct controller uses percpu_counter interface to collect user and
+system times. This has two side effects:
+
+- It is theoretically possible to see wrong values for user and system times.
+  This is because percpu_counter_read() on 32bit systems isn't safe
+  against concurrent writes.
+- It is possible to see slightly outdated values for user and system times
+  due to the batch processing nature of percpu_counter.
diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst
new file mode 100644
index 000000000000..86a6ae995d54
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst
@@ -0,0 +1,866 @@
+=======
+CPUSETS
+=======
+
+Copyright (C) 2004 BULL SA.
+
+Written by Simon.Derr@bull.net
+
+- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+- Modified by Paul Jackson <pj@sgi.com>
+- Modified by Christoph Lameter <cl@linux.com>
+- Modified by Paul Menage <menage@google.com>
+- Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+
+.. CONTENTS:
+
+   1. Cpusets
+     1.1 What are cpusets ?
+     1.2 Why are cpusets needed ?
+     1.3 How are cpusets implemented ?
+     1.4 What are exclusive cpusets ?
+     1.5 What is memory_pressure ?
+     1.6 What is memory spread ?
+     1.7 What is sched_load_balance ?
+     1.8 What is sched_relax_domain_level ?
+     1.9 How do I use cpusets ?
+   2. Usage Examples and Syntax
+     2.1 Basic Usage
+     2.2 Adding/removing cpus
+     2.3 Setting flags
+     2.4 Attaching processes
+   3. Questions
+   4. Contact
+
+1. Cpusets
+==========
+
+1.1 What are cpusets ?
+----------------------
+
+Cpusets provide a mechanism for assigning a set of CPUs and Memory
+Nodes to a set of tasks.   In this document "Memory Node" refers to
+an on-line node that contains memory.
+
+Cpusets constrain the CPU and Memory placement of tasks to only
+the resources within a task's current cpuset.  They form a nested
+hierarchy visible in a virtual file system.  These are the essential
+hooks, beyond what is already present, required to manage dynamic
+job placement on large systems.
+
+Cpusets use the generic cgroup subsystem described in
+Documentation/admin-guide/cgroup-v1/cgroups.rst.
+
+Requests by a task, using the sched_setaffinity(2) system call to
+include CPUs in its CPU affinity mask, and using the mbind(2) and
+set_mempolicy(2) system calls to include Memory Nodes in its memory
+policy, are both filtered through that task's cpuset, filtering out any
+CPUs or Memory Nodes not in that cpuset.  The scheduler will not
+schedule a task on a CPU that is not allowed in its cpus_allowed
+vector, and the kernel page allocator will not allocate a page on a
+node that is not allowed in the requesting task's mems_allowed vector.
+
+User level code may create and destroy cpusets by name in the cgroup
+virtual file system, manage the attributes and permissions of these
+cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
+specify and query to which cpuset a task is assigned, and list the
+task pids assigned to a cpuset.
+
+
+1.2 Why are cpusets needed ?
+----------------------------
+
+The management of large computer systems, with many processors (CPUs),
+complex memory cache hierarchies and multiple Memory Nodes having
+non-uniform access times (NUMA) presents additional challenges for
+the efficient scheduling and memory placement of processes.
+
+Frequently more modest sized systems can be operated with adequate
+efficiency just by letting the operating system automatically share
+the available CPU and Memory resources amongst the requesting tasks.
+
+But larger systems, which benefit more from careful processor and
+memory placement to reduce memory access times and contention,
+and which typically represent a larger investment for the customer,
+can benefit from explicitly placing jobs on properly sized subsets of
+the system.
+
+This can be especially valuable on:
+
+    * Web Servers running multiple instances of the same web application,
+    * Servers running different applications (for instance, a web server
+      and a database), or
+    * NUMA systems running large HPC applications with demanding
+      performance characteristics.
+
+These subsets, or "soft partitions" must be able to be dynamically
+adjusted, as the job mix changes, without impacting other concurrently
+executing jobs. The location of the running jobs pages may also be moved
+when the memory locations are changed.
+
+The kernel cpuset patch provides the minimum essential kernel
+mechanisms required to efficiently implement such subsets.  It
+leverages existing CPU and Memory Placement facilities in the Linux
+kernel to avoid any additional impact on the critical scheduler or
+memory allocator code.
+
+
+1.3 How are cpusets implemented ?
+---------------------------------
+
+Cpusets provide a Linux kernel mechanism to constrain which CPUs and
+Memory Nodes are used by a process or set of processes.
+
+The Linux kernel already has a pair of mechanisms to specify on which
+CPUs a task may be scheduled (sched_setaffinity) and on which Memory
+Nodes it may obtain memory (mbind, set_mempolicy).
+
+Cpusets extends these two mechanisms as follows:
+
+ - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
+   kernel.
+ - Each task in the system is attached to a cpuset, via a pointer
+   in the task structure to a reference counted cgroup structure.
+ - Calls to sched_setaffinity are filtered to just those CPUs
+   allowed in that task's cpuset.
+ - Calls to mbind and set_mempolicy are filtered to just
+   those Memory Nodes allowed in that task's cpuset.
+ - The root cpuset contains all the systems CPUs and Memory
+   Nodes.
+ - For any cpuset, one can define child cpusets containing a subset
+   of the parents CPU and Memory Node resources.
+ - The hierarchy of cpusets can be mounted at /dev/cpuset, for
+   browsing and manipulation from user space.
+ - A cpuset may be marked exclusive, which ensures that no other
+   cpuset (except direct ancestors and descendants) may contain
+   any overlapping CPUs or Memory Nodes.
+ - You can list all the tasks (by pid) attached to any cpuset.
+
+The implementation of cpusets requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cpuset at system boot.
+ - in fork and exit, to attach and detach a task from its cpuset.
+ - in sched_setaffinity, to mask the requested CPUs by what's
+   allowed in that task's cpuset.
+ - in sched.c migrate_live_tasks(), to keep migrating tasks within
+   the CPUs allowed by their cpuset, if possible.
+ - in the mbind and set_mempolicy system calls, to mask the requested
+   Memory Nodes by what's allowed in that task's cpuset.
+ - in page_alloc.c, to restrict memory to allowed nodes.
+ - in vmscan.c, to restrict page recovery to the current cpuset.
+
+You should mount the "cgroup" filesystem type in order to enable
+browsing and modifying the cpusets presently known to the kernel.  No
+new system calls are added for cpusets - all support for querying and
+modifying cpusets is via this cpuset file system.
+
+The /proc/<pid>/status file for each task has four added lines,
+displaying the task's cpus_allowed (on which CPUs it may be scheduled)
+and mems_allowed (on which Memory Nodes it may obtain memory),
+in the two formats seen in the following example::
+
+  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
+  Cpus_allowed_list:      0-127
+  Mems_allowed:   ffffffff,ffffffff
+  Mems_allowed_list:      0-63
+
+Each cpuset is represented by a directory in the cgroup file system
+containing (on top of the standard cgroup files) the following
+files describing that cpuset:
+
+ - cpuset.cpus: list of CPUs in that cpuset
+ - cpuset.mems: list of Memory Nodes in that cpuset
+ - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
+ - cpuset.cpu_exclusive flag: is cpu placement exclusive?
+ - cpuset.mem_exclusive flag: is memory placement exclusive?
+ - cpuset.mem_hardwall flag:  is memory allocation hardwalled
+ - cpuset.memory_pressure: measure of how much paging pressure in cpuset
+ - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
+ - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
+ - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
+ - cpuset.sched_relax_domain_level: the searching range when migrating tasks
+
+In addition, only the root cpuset has the following file:
+
+ - cpuset.memory_pressure_enabled flag: compute memory_pressure?
+
+New cpusets are created using the mkdir system call or shell
+command.  The properties of a cpuset, such as its flags, allowed
+CPUs and Memory Nodes, and attached tasks, are modified by writing
+to the appropriate file in that cpusets directory, as listed above.
+
+The named hierarchical structure of nested cpusets allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cpuset allows organizing the work load
+on a system into related sets of tasks such that each set is constrained
+to using the CPUs and Memory Nodes of a particular cpuset.  A task
+may be re-attached to any other cpuset, if allowed by the permissions
+on the necessary cpuset file system directories.
+
+Such management of a system "in the large" integrates smoothly with
+the detailed placement done on individual tasks and memory regions
+using the sched_setaffinity, mbind and set_mempolicy system calls.
+
+The following rules apply to each cpuset:
+
+ - Its CPUs and Memory Nodes must be a subset of its parents.
+ - It can't be marked exclusive unless its parent is.
+ - If its cpu or memory is exclusive, they may not overlap any sibling.
+
+These rules, and the natural hierarchy of cpusets, enable efficient
+enforcement of the exclusive guarantee, without having to scan all
+cpusets every time any of them change to ensure nothing overlaps a
+exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
+to represent the cpuset hierarchy provides for a familiar permission
+and name space for cpusets, with a minimum of additional kernel code.
+
+The cpus and mems files in the root (top_cpuset) cpuset are
+read-only.  The cpus file automatically tracks the value of
+cpu_online_mask using a CPU hotplug notifier, and the mems file
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
+nodes with memory--using the cpuset_track_online_nodes() hook.
+
+
+1.4 What are exclusive cpusets ?
+--------------------------------
+
+If a cpuset is cpu or mem exclusive, no other cpuset, other than
+a direct ancestor or descendant, may share any of the same CPUs or
+Memory Nodes.
+
+A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users.  All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space.  This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset.  To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
+
+
+1.5 What is memory_pressure ?
+-----------------------------
+The memory_pressure of a cpuset provides a simple per-cpuset metric
+of the rate that the tasks in a cpuset are attempting to free up in
+use memory on the nodes of the cpuset to satisfy additional memory
+requests.
+
+This enables batch managers monitoring jobs running in dedicated
+cpusets to efficiently detect what level of memory pressure that job
+is causing.
+
+This is useful both on tightly managed systems running a wide mix of
+submitted jobs, which may choose to terminate or re-prioritize jobs that
+are trying to use more memory than allowed on the nodes assigned to them,
+and with tightly coupled, long running, massively parallel scientific
+computing jobs that will dramatically fail to meet required performance
+goals if they start to use more memory than allowed to them.
+
+This mechanism provides a very economical way for the batch manager
+to monitor a cpuset for signs of memory pressure.  It's up to the
+batch manager or other user code to decide what to do about it and
+take action.
+
+==>
+    Unless this feature is enabled by writing "1" to the special file
+    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
+    code of __alloc_pages() for this metric reduces to simply noticing
+    that the cpuset_memory_pressure_enabled flag is zero.  So only
+    systems that enable this feature will compute the metric.
+
+Why a per-cpuset, running average:
+
+    Because this meter is per-cpuset, rather than per-task or mm,
+    the system load imposed by a batch scheduler monitoring this
+    metric is sharply reduced on large systems, because a scan of
+    the tasklist can be avoided on each set of queries.
+
+    Because this meter is a running average, instead of an accumulating
+    counter, a batch scheduler can detect memory pressure with a
+    single read, instead of having to read and accumulate results
+    for a period of time.
+
+    Because this meter is per-cpuset rather than per-task or mm,
+    the batch scheduler can obtain the key information, memory
+    pressure in a cpuset, with a single read, rather than having to
+    query and accumulate results over all the (dynamically changing)
+    set of tasks in the cpuset.
+
+A per-cpuset simple digital filter (requires a spinlock and 3 words
+of data per-cpuset) is kept, and updated by any task attached to that
+cpuset, if it enters the synchronous (direct) page reclaim code.
+
+A per-cpuset file provides an integer number representing the recent
+(half-life of 10 seconds) rate of direct page reclaims caused by
+the tasks in the cpuset, in units of reclaims attempted per second,
+times 1000.
+
+
+1.6 What is memory spread ?
+---------------------------
+There are two boolean flag files per cpuset that control where the
+kernel allocates pages for the file system buffers and related in
+kernel data structures.  They are called 'cpuset.memory_spread_page' and
+'cpuset.memory_spread_slab'.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
+the kernel will spread the file system buffers (page cache) evenly
+over all the nodes that the faulting task is allowed to use, instead
+of preferring to put those pages on the node where the task is running.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
+then the kernel will spread some file system related slab caches,
+such as for inodes and dentries evenly over all the nodes that the
+faulting task is allowed to use, instead of preferring to put those
+pages on the node where the task is running.
+
+The setting of these flags does not affect anonymous data segment or
+stack segment pages of a task.
+
+By default, both kinds of memory spreading are off, and memory
+pages are allocated on the node local to where the task is running,
+except perhaps as modified by the task's NUMA mempolicy or cpuset
+configuration, so long as sufficient free memory pages are available.
+
+When new cpusets are created, they inherit the memory spread settings
+of their parent.
+
+Setting memory spreading causes allocations for the affected page
+or slab caches to ignore the task's NUMA mempolicy and be spread
+instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
+mempolicies will not notice any change in these calls as a result of
+their containing task's memory spread settings.  If memory spreading
+is turned off, then the currently specified NUMA mempolicy once again
+applies to memory page allocations.
+
+Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
+files.  By default they contain "0", meaning that the feature is off
+for that cpuset.  If a "1" is written to that file, then that turns
+the named feature on.
+
+The implementation is simple.
+
+Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
+PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
+joins that cpuset.  The page allocation calls for the page cache
+is modified to perform an inline check for this PFA_SPREAD_PAGE task
+flag, and if set, a call to a new routine cpuset_mem_spread_node()
+returns the node to prefer for the allocation.
+
+Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
+PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
+pages from the node returned by cpuset_mem_spread_node().
+
+The cpuset_mem_spread_node() routine is also simple.  It uses the
+value of a per-task rotor cpuset_mem_spread_rotor to select the next
+node in the current task's mems_allowed to prefer for the allocation.
+
+This memory placement policy is also known (in other contexts) as
+round-robin or interleave.
+
+This policy can provide substantial improvements for jobs that need
+to place thread local data on the corresponding node, but that need
+to access large file system data sets that need to be spread across
+the several nodes in the jobs cpuset in order to fit.  Without this
+policy, especially for jobs that might have one thread reading in the
+data set, the memory allocation across the nodes in the jobs cpuset
+can become very uneven.
+
+1.7 What is sched_load_balance ?
+--------------------------------
+
+The kernel scheduler (kernel/sched/core.c) automatically load balances
+tasks.  If one CPU is underutilized, kernel code running on that
+CPU will look for tasks on other more overloaded CPUs and move those
+tasks to itself, within the constraints of such placement mechanisms
+as cpusets and sched_setaffinity.
+
+The algorithmic cost of load balancing and its impact on key shared
+kernel data structures such as the task list increases more than
+linearly with the number of CPUs being balanced.  So the scheduler
+has support to partition the systems CPUs into a number of sched
+domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system;
+no two sched domains overlap; some CPUs might not be in any sched
+domain and hence won't be load balanced.
+
+Put simply, it costs less to balance between two smaller sched domains
+than one big one, but doing so means that overloads in one of the
+two domains won't be load balanced to the other one.
+
+By default, there is one sched domain covering all CPUs, including those
+marked isolated using the kernel boot time "isolcpus=" argument. However,
+the isolated CPUs will not participate in load balancing, and will not
+have tasks running on them unless explicitly assigned.
+
+This default load balancing across all CPUs is not well suited for
+the following two situations:
+
+ 1) On large systems, load balancing across many CPUs is expensive.
+    If the system is managed using cpusets to place independent jobs
+    on separate sets of CPUs, full load balancing is unnecessary.
+ 2) Systems supporting realtime on some CPUs need to minimize
+    system overhead on those CPUs, including avoiding task load
+    balancing if that is not needed.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
+be contained in a single sched domain, ensuring that load balancing
+can move a task (not otherwised pinned, as by sched_setaffinity)
+from any CPU in that cpuset to any other.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
+scheduler will avoid load balancing across the CPUs in that cpuset,
+--except-- in so far as is necessary because some overlapping cpuset
+has "sched_load_balance" enabled.
+
+So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
+enabled, then the scheduler will have one sched domain covering all
+CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
+cpusets won't matter, as we're already fully load balancing.
+
+Therefore in the above two situations, the top cpuset flag
+"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
+child cpusets have this flag enabled.
+
+When doing this, you don't usually want to leave any unpinned tasks in
+the top cpuset that might use non-trivial amounts of CPU, as such tasks
+may be artificially constrained to some subset of CPUs, depending on
+the particulars of this flag setting in descendant cpusets.  Even if
+such a task could use spare CPU cycles in some other CPUs, the kernel
+scheduler might not consider the possibility of load balancing that
+task to that underused CPU.
+
+Of course, tasks pinned to a particular CPU can be left in a cpuset
+that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
+else anyway.
+
+There is an impedance mismatch here, between cpusets and sched domains.
+Cpusets are hierarchical and nest.  Sched domains are flat; they don't
+overlap and each CPU is in at most one sched domain.
+
+It is necessary for sched domains to be flat because load balancing
+across partially overlapping sets of CPUs would risk unstable dynamics
+that would be beyond our understanding.  So if each of two partially
+overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
+form a single sched domain that is a superset of both.  We won't move
+a task to a CPU outside its cpuset, but the scheduler load balancing
+code might waste some compute cycles considering that possibility.
+
+This mismatch is why there is not a simple one-to-one relation
+between which cpusets have the flag "cpuset.sched_load_balance" enabled,
+and the sched domain configuration.  If a cpuset enables the flag, it
+will get balancing across all its CPUs, but if it disables the flag,
+it will only be assured of no load balancing if no other overlapping
+cpuset enables the flag.
+
+If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
+one of them has this flag enabled, then the other may find its
+tasks only partially load balanced, just on the overlapping CPUs.
+This is just the general case of the top_cpuset example given a few
+paragraphs above.  In the general case, as in the top cpuset case,
+don't leave tasks that might use non-trivial amounts of CPU in
+such partially load balanced cpusets, as they may be artificially
+constrained to some subset of the CPUs allowed to them, for lack of
+load balancing to the other CPUs.
+
+CPUs in "cpuset.isolcpus" were excluded from load balancing by the
+isolcpus= kernel boot option, and will never be load balanced regardless
+of the value of "cpuset.sched_load_balance" in any cpuset.
+
+1.7.1 sched_load_balance implementation details.
+------------------------------------------------
+
+The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
+to most cpuset flags.)  When enabled for a cpuset, the kernel will
+ensure that it can load balance across all the CPUs in that cpuset
+(makes sure that all the CPUs in the cpus_allowed of that cpuset are
+in the same sched domain.)
+
+If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
+then they will be (must be) both in the same sched domain.
+
+If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
+then by the above that means there is a single sched domain covering
+the whole system, regardless of any other cpuset settings.
+
+The kernel commits to user space that it will avoid load balancing
+where it can.  It will pick as fine a granularity partition of sched
+domains as it can while still providing load balancing for any set
+of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
+
+The internal kernel cpuset to scheduler interface passes from the
+cpuset code to the scheduler code a partition of the load balanced
+CPUs in the system. This partition is a set of subsets (represented
+as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
+all the CPUs that must be load balanced.
+
+The cpuset code builds a new such partition and passes it to the
+scheduler sched domain setup code, to have the sched domains rebuilt
+as necessary, whenever:
+
+ - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
+ - or CPUs come or go from a cpuset with this flag enabled,
+ - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
+   and with this flag enabled changes,
+ - or a cpuset with non-empty CPUs and with this flag enabled is removed,
+ - or a cpu is offlined/onlined.
+
+This partition exactly defines what sched domains the scheduler should
+setup - one sched domain for each element (struct cpumask) in the
+partition.
+
+The scheduler remembers the currently active sched domain partitions.
+When the scheduler routine partition_sched_domains() is invoked from
+the cpuset code to update these sched domains, it compares the new
+partition requested with the current, and updates its sched domains,
+removing the old and adding the new, for each change.
+
+
+1.8 What is sched_relax_domain_level ?
+--------------------------------------
+
+In sched domain, the scheduler migrates tasks in 2 ways; periodic load
+balance on tick, and at time of some schedule events.
+
+When a task is woken up, scheduler try to move the task on idle CPU.
+For example, if a task A running on CPU X activates another task B
+on the same CPU X, and if CPU Y is X's sibling and performing idle,
+then scheduler migrate task B to CPU Y so that task B can start on
+CPU Y without waiting task A on CPU X.
+
+And if a CPU run out of tasks in its runqueue, the CPU try to pull
+extra tasks from other busy CPUs to help them before it is going to
+be idle.
+
+Of course it takes some searching cost to find movable tasks and/or
+idle CPUs, the scheduler might not search all CPUs in the domain
+every time.  In fact, in some architectures, the searching ranges on
+events are limited in the same socket or node where the CPU locates,
+while the load balance on tick searches all.
+
+For example, assume CPU Z is relatively far from CPU X.  Even if CPU Z
+is idle while CPU X and the siblings are busy, scheduler can't migrate
+woken task B from X to Z since it is out of its searching range.
+As the result, task B on CPU X need to wait task A or wait load balance
+on the next tick.  For some applications in special situation, waiting
+1 tick may be too long.
+
+The 'cpuset.sched_relax_domain_level' file allows you to request changing
+this searching range as you like.  This file takes int value which
+indicates size of searching range in levels ideally as follows,
+otherwise initial value -1 that indicates the cpuset has no request.
+
+====== ===========================================================
+  -1   no request. use system default or follow request of others.
+   0   no search.
+   1   search siblings (hyperthreads in a core).
+   2   search cores in a package.
+   3   search cpus in a node [= system wide on non-NUMA system]
+   4   search nodes in a chunk of node [on NUMA system]
+   5   search system wide [on NUMA system]
+====== ===========================================================
+
+The system default is architecture dependent.  The system default
+can be changed using the relax_domain_level= boot parameter.
+
+This file is per-cpuset and affect the sched domain where the cpuset
+belongs to.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
+is disabled, then 'cpuset.sched_relax_domain_level' have no effect since
+there is no sched domain belonging the cpuset.
+
+If multiple cpusets are overlapping and hence they form a single sched
+domain, the largest value among those is used.  Be careful, if one
+requests 0 and others are -1 then 0 is used.
+
+Note that modifying this file will have both good and bad effects,
+and whether it is acceptable or not depends on your situation.
+Don't modify this file if you are not sure.
+
+If your situation is:
+
+ - The migration costs between each cpu can be assumed considerably
+   small(for you) due to your special application's behavior or
+   special hardware support for CPU cache etc.
+ - The searching cost doesn't have impact(for you) or you can make
+   the searching cost enough small by managing cpuset to compact etc.
+ - The latency is required even it sacrifices cache hit rate etc.
+   then increasing 'sched_relax_domain_level' would benefit you.
+
+
+1.9 How do I use cpusets ?
+--------------------------
+
+In order to minimize the impact of cpusets on critical kernel
+code, such as the scheduler, and due to the fact that the kernel
+does not support one task updating the memory placement of another
+task directly, the impact on a task of changing its cpuset CPU
+or Memory Node placement, or of changing to which cpuset a task
+is attached, is subtle.
+
+If a cpuset has its Memory Nodes modified, then for each task attached
+to that cpuset, the next time that the kernel attempts to allocate
+a page of memory for that task, the kernel will notice the change
+in the task's cpuset, and update its per-task memory placement to
+remain within the new cpusets memory placement.  If the task was using
+mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
+its new cpuset, then the task will continue to use whatever subset
+of MPOL_BIND nodes are still allowed in the new cpuset.  If the task
+was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
+in the new cpuset, then the task will be essentially treated as if it
+was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
+as queried by get_mempolicy(), doesn't change).  If a task is moved
+from one cpuset to another, then the kernel will adjust the task's
+memory placement, as above, the next time that the kernel attempts
+to allocate a page of memory for that task.
+
+If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
+will have its allowed CPU placement changed immediately.  Similarly,
+if a task's pid is written to another cpuset's 'tasks' file, then its
+allowed CPU placement is changed immediately.  If such a task had been
+bound to some subset of its cpuset using the sched_setaffinity() call,
+the task will be allowed to run on any CPU allowed in its new cpuset,
+negating the effect of the prior sched_setaffinity() call.
+
+In summary, the memory placement of a task whose cpuset is changed is
+updated by the kernel, on the next allocation of a page for that task,
+and the processor placement is updated immediately.
+
+Normally, once a page is allocated (given a physical page
+of main memory) then that page stays on whatever node it
+was allocated, so long as it remains allocated, even if the
+cpusets memory placement policy 'cpuset.mems' subsequently changes.
+If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
+tasks are attached to that cpuset, any pages that task had
+allocated to it on nodes in its previous cpuset are migrated
+to the task's new cpuset. The relative placement of the page within
+the cpuset is preserved during these migration operations if possible.
+For example if the page was on the second valid node of the prior cpuset
+then the page will be placed on the second valid node of the new cpuset.
+
+Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
+'cpuset.mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'cpuset.mems',
+will be moved to nodes in the new setting of 'mems.'
+Pages that were not in the task's prior cpuset, or in the cpuset's
+prior 'cpuset.mems' setting, will not be moved.
+
+There is an exception to the above.  If hotplug functionality is used
+to remove all the CPUs that are currently assigned to a cpuset,
+then all the tasks in that cpuset will be moved to the nearest ancestor
+with non-empty cpus.  But the moving of some (or all) tasks might fail if
+cpuset is bound with another cgroup subsystem which has some restrictions
+on task attaching.  In this failing case, those tasks will stay
+in the original cpuset, and the kernel will automatically update
+their cpus_allowed to allow all online CPUs.  When memory hotplug
+functionality for removing Memory Nodes is available, a similar exception
+is expected to apply there as well.  In general, the kernel prefers to
+violate cpuset placement, over starving a task that has had all
+its allowed CPUs or Memory Nodes taken offline.
+
+There is a second exception to the above.  GFP_ATOMIC requests are
+kernel internal allocations that must be satisfied, immediately.
+The kernel may drop some request, in rare cases even panic, if a
+GFP_ATOMIC alloc fails.  If the request cannot be satisfied within
+the current task's cpuset, then we relax the cpuset, and look for
+memory anywhere we can find it.  It's better to violate the cpuset
+than stress the kernel.
+
+To start a new job that is to be contained within a cpuset, the steps are:
+
+ 1) mkdir /sys/fs/cgroup/cpuset
+ 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
+    the /sys/fs/cgroup/cpuset virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cpuset by writing its pid to the
+    /sys/fs/cgroup/cpuset tasks file for that cpuset.
+ 6) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will setup a cpuset
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cpuset::
+
+  mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+  cd /sys/fs/cgroup/cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cpuset Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cpuset
+
+There are ways to query or modify cpusets:
+
+ - via the cpuset file system directly, using the various cd, mkdir, echo,
+   cat, rmdir commands from the shell, or their equivalent from C.
+ - via the C library libcpuset.
+ - via the C library libcgroup.
+   (http://sourceforge.net/projects/libcg/)
+ - via the python application cset.
+   (http://code.google.com/p/cpuset/)
+
+The sched_setaffinity calls can also be done at the shell prompt using
+SGI's runon or Robert Love's taskset.  The mbind and set_mempolicy
+calls can be done at the shell prompt using the numactl command
+(part of Andi Kleen's numa package).
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using the cpusets can be done through the cpuset
+virtual filesystem.
+
+To mount it, type:
+# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
+
+Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
+tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
+is the cpuset that holds the whole system.
+
+If you want to create a new cpuset under /sys/fs/cgroup/cpuset::
+
+  # cd /sys/fs/cgroup/cpuset
+  # mkdir my_cpuset
+
+Now you want to do something with this cpuset::
+
+  # cd my_cpuset
+
+In this directory you can find several files::
+
+  # ls
+  cgroup.clone_children  cpuset.memory_pressure
+  cgroup.event_control   cpuset.memory_spread_page
+  cgroup.procs           cpuset.memory_spread_slab
+  cpuset.cpu_exclusive   cpuset.mems
+  cpuset.cpus            cpuset.sched_load_balance
+  cpuset.mem_exclusive   cpuset.sched_relax_domain_level
+  cpuset.mem_hardwall    notify_on_release
+  cpuset.memory_migrate  tasks
+
+Reading them will give you information about the state of this cpuset:
+the CPUs and Memory Nodes it can use, the processes that are using
+it, its properties.  By writing to these files you can manipulate
+the cpuset.
+
+Set some flags::
+
+  # /bin/echo 1 > cpuset.cpu_exclusive
+
+Add some cpus::
+
+  # /bin/echo 0-7 > cpuset.cpus
+
+Add some mems::
+
+  # /bin/echo 0-7 > cpuset.mems
+
+Now attach your shell to this cpuset::
+
+  # /bin/echo $$ > tasks
+
+You can also create cpusets inside your cpuset by using mkdir in this
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cpuset, just use rmdir::
+
+  # rmdir my_sub_cs
+
+This will fail if the cpuset is in use (has cpusets inside, or has
+processes attached).
+
+Note that for legacy reasons, the "cpuset" filesystem exists as a
+wrapper around the cgroup filesystem.
+
+The command::
+
+  mount -t cpuset X /sys/fs/cgroup/cpuset
+
+is equivalent to::
+
+  mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
+  echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
+
+2.2 Adding/removing cpus
+------------------------
+
+This is the syntax to use when writing in the cpus or mems files
+in cpuset directories::
+
+  # /bin/echo 1-4 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4
+  # /bin/echo 1,2,3,4 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4
+
+To add a CPU to a cpuset, write the new list of CPUs including the
+CPU to be added. To add 6 to the above cpuset::
+
+  # /bin/echo 1-4,6 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4,6
+
+Similarly to remove a CPU from a cpuset, write the new list of CPUs
+without the CPU to be removed.
+
+To remove all the CPUs::
+
+  # /bin/echo "" > cpuset.cpus		-> clear cpus list
+
+2.3 Setting flags
+-----------------
+
+The syntax is very simple::
+
+  # /bin/echo 1 > cpuset.cpu_exclusive 	-> set flag 'cpuset.cpu_exclusive'
+  # /bin/echo 0 > cpuset.cpu_exclusive 	-> unset flag 'cpuset.cpu_exclusive'
+
+2.4 Attaching processes
+-----------------------
+
+::
+
+  # /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another::
+
+  # /bin/echo PID1 > tasks
+  # /bin/echo PID2 > tasks
+	...
+  # /bin/echo PIDn > tasks
+
+
+3. Questions
+============
+
+Q:
+   what's up with this '/bin/echo' ?
+
+A:
+   bash's builtin 'echo' command does not check calls to write() against
+   errors. If you use it in the cpuset file system, you won't be
+   able to tell whether a command succeeded or failed.
+
+Q:
+   When I attach processes, only the first of the line gets really attached !
+
+A:
+   We can only return one error code per call to write(). So you should also
+   put only ONE pid.
+
+4. Contact
+==========
+
+Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/admin-guide/cgroup-v1/devices.rst b/Documentation/admin-guide/cgroup-v1/devices.rst
new file mode 100644
index 000000000000..e1886783961e
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/devices.rst
@@ -0,0 +1,132 @@
+===========================
+Device Whitelist Controller
+===========================
+
+1. Description
+==============
+
+Implement a cgroup to track and enforce open and mknod restrictions
+on device files.  A device cgroup associates a device access
+whitelist with each cgroup.  A whitelist entry has 4 fields.
+'type' is a (all), c (char), or b (block).  'all' means it applies
+to all types and all major and minor numbers.  Major and minor are
+either an integer or * for all.  Access is a composition of r
+(read), w (write), and m (mknod).
+
+The root device cgroup starts with rwm to 'all'.  A child device
+cgroup gets a copy of the parent.  Administrators can then remove
+devices from the whitelist or add new entries.  A child cgroup can
+never receive a device access which is denied by its parent.
+
+2. User Interface
+=================
+
+An entry is added using devices.allow, and removed using
+devices.deny.  For instance::
+
+	echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
+
+allows cgroup 1 to read and mknod the device usually known as
+/dev/null.  Doing::
+
+	echo a > /sys/fs/cgroup/1/devices.deny
+
+will remove the default 'a *:* rwm' entry. Doing::
+
+	echo a > /sys/fs/cgroup/1/devices.allow
+
+will add the 'a *:* rwm' entry to the whitelist.
+
+3. Security
+===========
+
+Any task can move itself between cgroups.  This clearly won't
+suffice, but we can decide the best way to adequately restrict
+movement as people get some experience with this.  We may just want
+to require CAP_SYS_ADMIN, which at least is a separate bit from
+CAP_MKNOD.  We may want to just refuse moving to a cgroup which
+isn't a descendant of the current one.  Or we may want to use
+CAP_MAC_ADMIN, since we really are trying to lock down root.
+
+CAP_SYS_ADMIN is needed to modify the whitelist or move another
+task to a new cgroup.  (Again we'll probably want to change that).
+
+A cgroup may not be granted more permissions than the cgroup's
+parent has.
+
+4. Hierarchy
+============
+
+device cgroups maintain hierarchy by making sure a cgroup never has more
+access permissions than its parent.  Every time an entry is written to
+a cgroup's devices.deny file, all its children will have that entry removed
+from their whitelist and all the locally set whitelist entries will be
+re-evaluated.  In case one of the locally set whitelist entries would provide
+more access than the cgroup's parent, it'll be removed from the whitelist.
+
+Example::
+
+      A
+     / \
+        B
+
+    group        behavior	exceptions
+    A            allow		"b 8:* rwm", "c 116:1 rw"
+    B            deny		"c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
+
+If a device is denied in group A::
+
+	# echo "c 116:* r" > A/devices.deny
+
+it'll propagate down and after revalidating B's entries, the whitelist entry
+"c 116:2 rwm" will be removed::
+
+    group        whitelist entries                        denied devices
+    A            all                                      "b 8:* rwm", "c 116:* rw"
+    B            "c 1:3 rwm", "b 3:* rwm"                 all the rest
+
+In case parent's exceptions change and local exceptions are not allowed
+anymore, they'll be deleted.
+
+Notice that new whitelist entries will not be propagated::
+
+      A
+     / \
+        B
+
+    group        whitelist entries                        denied devices
+    A            "c 1:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+when adding ``c *:3 rwm``::
+
+	# echo "c *:3 rwm" >A/devices.allow
+
+the result::
+
+    group        whitelist entries                        denied devices
+    A            "c *:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+but now it'll be possible to add new entries to B::
+
+	# echo "c 2:3 rwm" >B/devices.allow
+	# echo "c 50:3 r" >B/devices.allow
+
+or even::
+
+	# echo "c *:3 rwm" >B/devices.allow
+
+Allowing or denying all by writing 'a' to devices.allow or devices.deny will
+not be possible once the device cgroups has children.
+
+4.1 Hierarchy (internal implementation)
+---------------------------------------
+
+device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
+list of exceptions.  The internal state is controlled using the same user
+interface to preserve compatibility with the previous whitelist-only
+implementation.  Removal or addition of exceptions that will reduce the access
+to devices will be propagated down the hierarchy.
+For every propagated exception, the effective rules will be re-evaluated based
+on current parent's access rules.
diff --git a/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst
new file mode 100644
index 000000000000..582d3427de3f
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst
@@ -0,0 +1,127 @@
+==============
+Cgroup Freezer
+==============
+
+The cgroup freezer is useful to batch job management system which start
+and stop sets of tasks in order to schedule the resources of a machine
+according to the desires of a system administrator. This sort of program
+is often used on HPC clusters to schedule access to the cluster as a
+whole. The cgroup freezer uses cgroups to describe the set of tasks to
+be started/stopped by the batch job management system. It also provides
+a means to start and stop the tasks composing the job.
+
+The cgroup freezer will also be useful for checkpointing running groups
+of tasks. The freezer allows the checkpoint code to obtain a consistent
+image of the tasks by attempting to force the tasks in a cgroup into a
+quiescent state. Once the tasks are quiescent another task can
+walk /proc or invoke a kernel interface to gather information about the
+quiesced tasks. Checkpointed tasks can be restarted later should a
+recoverable error occur. This also allows the checkpointed tasks to be
+migrated between nodes in a cluster by copying the gathered information
+to another node and restarting the tasks there.
+
+Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
+and resuming tasks in userspace. Both of these signals are observable
+from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
+blocked, or ignored it can be seen by waiting or ptracing parent tasks.
+SIGCONT is especially unsuitable since it can be caught by the task. Any
+programs designed to watch for SIGSTOP and SIGCONT could be broken by
+attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
+demonstrate this problem using nested bash shells::
+
+	$ echo $$
+	16644
+	$ bash
+	$ echo $$
+	16690
+
+	From a second, unrelated bash shell:
+	$ kill -SIGSTOP 16690
+	$ kill -SIGCONT 16690
+
+	<at this point 16690 exits and causes 16644 to exit too>
+
+This happens because bash can observe both signals and choose how it
+responds to them.
+
+Another example of a program which catches and responds to these
+signals is gdb. In fact any program designed to use ptrace is likely to
+have a problem with this method of stopping and resuming tasks.
+
+In contrast, the cgroup freezer uses the kernel freezer code to
+prevent the freeze/unfreeze cycle from becoming visible to the tasks
+being frozen. This allows the bash example above and gdb to run as
+expected.
+
+The cgroup freezer is hierarchical. Freezing a cgroup freezes all
+tasks belonging to the cgroup and all its descendant cgroups. Each
+cgroup has its own state (self-state) and the state inherited from the
+parent (parent-state). Iff both states are THAWED, the cgroup is
+THAWED.
+
+The following cgroupfs files are created by cgroup freezer.
+
+* freezer.state: Read-write.
+
+  When read, returns the effective state of the cgroup - "THAWED",
+  "FREEZING" or "FROZEN". This is the combined self and parent-states.
+  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
+
+  FREEZING cgroup transitions into FROZEN state when all tasks
+  belonging to the cgroup and its descendants become frozen. Note that
+  a cgroup reverts to FREEZING from FROZEN after a new task is added
+  to the cgroup or one of its descendant cgroups until the new task is
+  frozen.
+
+  When written, sets the self-state of the cgroup. Two values are
+  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
+  if not already freezing, enters FREEZING state along with all its
+  descendant cgroups.
+
+  If THAWED is written, the self-state of the cgroup is changed to
+  THAWED.  Note that the effective state may not change to THAWED if
+  the parent-state is still freezing. If a cgroup's effective state
+  becomes THAWED, all its descendants which are freezing because of
+  the cgroup also leave the freezing state.
+
+* freezer.self_freezing: Read only.
+
+  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
+  This value is 1 iff the last write to freezer.state was "FROZEN".
+
+* freezer.parent_freezing: Read only.
+
+  Shows the parent-state.  0 if none of the cgroup's ancestors is
+  frozen; otherwise, 1.
+
+The root cgroup is non-freezable and the above interface files don't
+exist.
+
+* Examples of usage::
+
+   # mkdir /sys/fs/cgroup/freezer
+   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
+   # mkdir /sys/fs/cgroup/freezer/0
+   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
+
+to get status of the freezer subsystem::
+
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+to freeze all tasks in the container::
+
+   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FREEZING
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FROZEN
+
+to unfreeze all tasks in the container::
+
+   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+This is the basic mechanism which should do the right thing for user space task
+in a simple scenario.
diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
new file mode 100644
index 000000000000..a3902aa253a9
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
@@ -0,0 +1,50 @@
+==================
+HugeTLB Controller
+==================
+
+The HugeTLB controller allows to limit the HugeTLB usage per control group and
+enforces the controller limit during page fault. Since HugeTLB doesn't
+support page reclaim, enforcing the limit at page fault time implies that,
+the application will get SIGBUS signal if it tries to access HugeTLB pages
+beyond its limit. This requires the application to know beforehand how much
+HugeTLB pages it would require for its use.
+
+HugeTLB controller can be created by first mounting the cgroup filesystem.
+
+# mount -t cgroup -o hugetlb none /sys/fs/cgroup
+
+With the above step, the initial or the parent HugeTLB group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+
+New groups can be created under the parent group /sys/fs/cgroup::
+
+  # cd /sys/fs/cgroup
+  # mkdir g1
+  # echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it.
+
+Brief summary of control files::
+
+ hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
+ hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb  usage recorded
+ hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
+ hugetlb.<hugepagesize>.failcnt		   # show the number of allocation failure due to HugeTLB limit
+
+For a system supporting three hugepage sizes (64k, 32M and 1G), the control
+files include::
+
+  hugetlb.1GB.limit_in_bytes
+  hugetlb.1GB.max_usage_in_bytes
+  hugetlb.1GB.usage_in_bytes
+  hugetlb.1GB.failcnt
+  hugetlb.64KB.limit_in_bytes
+  hugetlb.64KB.max_usage_in_bytes
+  hugetlb.64KB.usage_in_bytes
+  hugetlb.64KB.failcnt
+  hugetlb.32MB.limit_in_bytes
+  hugetlb.32MB.max_usage_in_bytes
+  hugetlb.32MB.usage_in_bytes
+  hugetlb.32MB.failcnt
diff --git a/Documentation/admin-guide/cgroup-v1/index.rst b/Documentation/admin-guide/cgroup-v1/index.rst
new file mode 100644
index 000000000000..10bf48bae0b0
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/index.rst
@@ -0,0 +1,28 @@
+========================
+Control Groups version 1
+========================
+
+.. toctree::
+    :maxdepth: 1
+
+    cgroups
+
+    blkio-controller
+    cpuacct
+    cpusets
+    devices
+    freezer-subsystem
+    hugetlb
+    memcg_test
+    memory
+    net_cls
+    net_prio
+    pids
+    rdma
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
new file mode 100644
index 000000000000..3f7115e07b5d
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
@@ -0,0 +1,355 @@
+=====================================================
+Memory Resource Controller(Memcg) Implementation Memo
+=====================================================
+
+Last Updated: 2010/2
+
+Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
+
+Because VM is getting complex (one of reasons is memcg...), memcg's behavior
+is complex. This is a document for memcg's internal behavior.
+Please note that implementation details can be changed.
+
+(*) Topics on API should be in Documentation/admin-guide/cgroup-v1/memory.rst)
+
+0. How to record usage ?
+========================
+
+   2 objects are used.
+
+   page_cgroup ....an object per page.
+
+	Allocated at boot or memory hotplug. Freed at memory hot removal.
+
+   swap_cgroup ... an entry per swp_entry.
+
+	Allocated at swapon(). Freed at swapoff().
+
+   The page_cgroup has USED bit and double count against a page_cgroup never
+   occurs. swap_cgroup is used only when a charged page is swapped-out.
+
+1. Charge
+=========
+
+   a page/swp_entry may be charged (usage += PAGE_SIZE) at
+
+	mem_cgroup_try_charge()
+
+2. Uncharge
+===========
+
+  a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
+
+	mem_cgroup_uncharge()
+	  Called when a page's refcount goes down to 0.
+
+	mem_cgroup_uncharge_swap()
+	  Called when swp_entry's refcnt goes down to 0. A charge against swap
+	  disappears.
+
+3. charge-commit-cancel
+=======================
+
+	Memcg pages are charged in two steps:
+
+		- mem_cgroup_try_charge()
+		- mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
+
+	At try_charge(), there are no flags to say "this page is charged".
+	at this point, usage += PAGE_SIZE.
+
+	At commit(), the page is associated with the memcg.
+
+	At cancel(), simply usage -= PAGE_SIZE.
+
+Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
+
+4. Anonymous
+============
+
+	Anonymous page is newly allocated at
+		  - page fault into MAP_ANONYMOUS mapping.
+		  - Copy-On-Write.
+
+	4.1 Swap-in.
+	At swap-in, the page is taken from swap-cache. There are 2 cases.
+
+	(a) If the SwapCache is newly allocated and read, it has no charges.
+	(b) If the SwapCache has been mapped by processes, it has been
+	    charged already.
+
+	4.2 Swap-out.
+	At swap-out, typical state transition is below.
+
+	(a) add to swap cache. (marked as SwapCache)
+	    swp_entry's refcnt += 1.
+	(b) fully unmapped.
+	    swp_entry's refcnt += # of ptes.
+	(c) write back to swap.
+	(d) delete from swap cache. (remove from SwapCache)
+	    swp_entry's refcnt -= 1.
+
+
+	Finally, at task exit,
+	(e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
+
+5. Page Cache
+=============
+
+	Page Cache is charged at
+	- add_to_page_cache_locked().
+
+	The logic is very clear. (About migration, see below)
+
+	Note:
+	  __remove_from_page_cache() is called by remove_from_page_cache()
+	  and __remove_mapping().
+
+6. Shmem(tmpfs) Page Cache
+===========================
+
+	The best way to understand shmem's page state transition is to read
+	mm/shmem.c.
+
+	But brief explanation of the behavior of memcg around shmem will be
+	helpful to understand the logic.
+
+	Shmem's page (just leaf page, not direct/indirect block) can be on
+
+		- radix-tree of shmem's inode.
+		- SwapCache.
+		- Both on radix-tree and SwapCache. This happens at swap-in
+		  and swap-out,
+
+	It's charged when...
+
+	- A new page is added to shmem's radix-tree.
+	- A swp page is read. (move a charge from swap_cgroup to page_cgroup)
+
+7. Page Migration
+=================
+
+	mem_cgroup_migrate()
+
+8. LRU
+======
+        Each memcg has its own private LRU. Now, its handling is under global
+	VM's control (means that it's handled under global pgdat->lru_lock).
+	Almost all routines around memcg's LRU is called by global LRU's
+	list management functions under pgdat->lru_lock.
+
+	A special function is mem_cgroup_isolate_pages(). This scans
+	memcg's private LRU and call __isolate_lru_page() to extract a page
+	from LRU.
+
+	(By __isolate_lru_page(), the page is removed from both of global and
+	private LRU.)
+
+
+9. Typical Tests.
+=================
+
+ Tests for racy cases.
+
+9.1 Small limit to memcg.
+-------------------------
+
+	When you do test to do racy case, it's good test to set memcg's limit
+	to be very small rather than GB. Many races found in the test under
+	xKB or xxMB limits.
+
+	(Memory behavior under GB and Memory behavior under MB shows very
+	different situation.)
+
+9.2 Shmem
+---------
+
+	Historically, memcg's shmem handling was poor and we saw some amount
+	of troubles here. This is because shmem is page-cache but can be
+	SwapCache. Test with shmem/tmpfs is always good test.
+
+9.3 Migration
+-------------
+
+	For NUMA, migration is an another special case. To do easy test, cpuset
+	is useful. Following is a sample script to do migration::
+
+		mount -t cgroup -o cpuset none /opt/cpuset
+
+		mkdir /opt/cpuset/01
+		echo 1 > /opt/cpuset/01/cpuset.cpus
+		echo 0 > /opt/cpuset/01/cpuset.mems
+		echo 1 > /opt/cpuset/01/cpuset.memory_migrate
+		mkdir /opt/cpuset/02
+		echo 1 > /opt/cpuset/02/cpuset.cpus
+		echo 1 > /opt/cpuset/02/cpuset.mems
+		echo 1 > /opt/cpuset/02/cpuset.memory_migrate
+
+	In above set, when you moves a task from 01 to 02, page migration to
+	node 0 to node 1 will occur. Following is a script to migrate all
+	under cpuset.::
+
+		--
+		move_task()
+		{
+		for pid in $1
+		do
+			/bin/echo $pid >$2/tasks 2>/dev/null
+			echo -n $pid
+			echo -n " "
+		done
+		echo END
+		}
+
+		G1_TASK=`cat ${G1}/tasks`
+		G2_TASK=`cat ${G2}/tasks`
+		move_task "${G1_TASK}" ${G2} &
+		--
+
+9.4 Memory hotplug
+------------------
+
+	memory hotplug test is one of good test.
+
+	to offline memory, do following::
+
+		# echo offline > /sys/devices/system/memory/memoryXXX/state
+
+	(XXX is the place of memory)
+
+	This is an easy way to test page migration, too.
+
+9.5 mkdir/rmdir
+---------------
+
+	When using hierarchy, mkdir/rmdir test should be done.
+	Use tests like the following::
+
+		echo 1 >/opt/cgroup/01/memory/use_hierarchy
+		mkdir /opt/cgroup/01/child_a
+		mkdir /opt/cgroup/01/child_b
+
+		set limit to 01.
+		add limit to 01/child_b
+		run jobs under child_a and child_b
+
+	create/delete following groups at random while jobs are running::
+
+		/opt/cgroup/01/child_a/child_aa
+		/opt/cgroup/01/child_b/child_bb
+		/opt/cgroup/01/child_c
+
+	running new jobs in new group is also good.
+
+9.6 Mount with other subsystems
+-------------------------------
+
+	Mounting with other subsystems is a good test because there is a
+	race and lock dependency with other cgroup subsystems.
+
+	example::
+
+		# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
+
+	and do task move, mkdir, rmdir etc...under this.
+
+9.7 swapoff
+-----------
+
+	Besides management of swap is one of complicated parts of memcg,
+	call path of swap-in at swapoff is not same as usual swap-in path..
+	It's worth to be tested explicitly.
+
+	For example, test like following is good:
+
+	(Shell-A)::
+
+		# mount -t cgroup none /cgroup -o memory
+		# mkdir /cgroup/test
+		# echo 40M > /cgroup/test/memory.limit_in_bytes
+		# echo 0 > /cgroup/test/tasks
+
+	Run malloc(100M) program under this. You'll see 60M of swaps.
+
+	(Shell-B)::
+
+		# move all tasks in /cgroup/test to /cgroup
+		# /sbin/swapoff -a
+		# rmdir /cgroup/test
+		# kill malloc task.
+
+	Of course, tmpfs v.s. swapoff test should be tested, too.
+
+9.8 OOM-Killer
+--------------
+
+	Out-of-memory caused by memcg's limit will kill tasks under
+	the memcg. When hierarchy is used, a task under hierarchy
+	will be killed by the kernel.
+
+	In this case, panic_on_oom shouldn't be invoked and tasks
+	in other groups shouldn't be killed.
+
+	It's not difficult to cause OOM under memcg as following.
+
+	Case A) when you can swapoff::
+
+		#swapoff -a
+		#echo 50M > /memory.limit_in_bytes
+
+	run 51M of malloc
+
+	Case B) when you use mem+swap limitation::
+
+		#echo 50M > memory.limit_in_bytes
+		#echo 50M > memory.memsw.limit_in_bytes
+
+	run 51M of malloc
+
+9.9 Move charges at task migration
+----------------------------------
+
+	Charges associated with a task can be moved along with task migration.
+
+	(Shell-A)::
+
+		#mkdir /cgroup/A
+		#echo $$ >/cgroup/A/tasks
+
+	run some programs which uses some amount of memory in /cgroup/A.
+
+	(Shell-B)::
+
+		#mkdir /cgroup/B
+		#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
+		#echo "pid of the program running in group A" >/cgroup/B/tasks
+
+	You can see charges have been moved by reading ``*.usage_in_bytes`` or
+	memory.stat of both A and B.
+
+	See 8.2 of Documentation/admin-guide/cgroup-v1/memory.rst to see what value should
+	be written to move_charge_at_immigrate.
+
+9.10 Memory thresholds
+----------------------
+
+	Memory controller implements memory thresholds using cgroups notification
+	API. You can use tools/cgroup/cgroup_event_listener.c to test it.
+
+	(Shell-A) Create cgroup and run event listener::
+
+		# mkdir /cgroup/A
+		# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
+
+	(Shell-B) Add task to cgroup and try to allocate and free memory::
+
+		# echo $$ >/cgroup/A/tasks
+		# a="$(dd if=/dev/zero bs=1M count=10)"
+		# a=
+
+	You will see message from cgroup_event_listener every time you cross
+	the thresholds.
+
+	Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
+
+	It's good idea to test root cgroup as well.
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
new file mode 100644
index 000000000000..41bdc038dad9
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -0,0 +1,1003 @@
+==========================
+Memory Resource Controller
+==========================
+
+NOTE:
+      This document is hopelessly outdated and it asks for a complete
+      rewrite. It still contains a useful information so we are keeping it
+      here but make sure to check the current code if you need a deeper
+      understanding.
+
+NOTE:
+      The Memory Resource Controller has generically been referred to as the
+      memory controller in this document. Do not confuse memory controller
+      used here with the memory controller that is used in hardware.
+
+(For editors) In this document:
+      When we mention a cgroup (cgroupfs's directory) with memory controller,
+      we call it "memory cgroup". When you see git-log and source code, you'll
+      see patch's title and function names tend to use "memcg".
+      In this document, we avoid using it.
+
+Benefits and Purpose of the memory controller
+=============================================
+
+The memory controller isolates the memory behaviour of a group of tasks
+from the rest of the system. The article on LWN [12] mentions some probable
+uses of the memory controller. The memory controller can be used to
+
+a. Isolate an application or a group of applications
+   Memory-hungry applications can be isolated and limited to a smaller
+   amount of memory.
+b. Create a cgroup with a limited amount of memory; this can be used
+   as a good alternative to booting with mem=XXXX.
+c. Virtualization solutions can control the amount of memory they want
+   to assign to a virtual machine instance.
+d. A CD/DVD burner could control the amount of memory used by the
+   rest of the system to ensure that burning does not fail due to lack
+   of available memory.
+e. There are several other use cases; find one or use the controller just
+   for fun (to learn and hack on the VM subsystem).
+
+Current Status: linux-2.6.34-mmotm(development version of 2010/April)
+
+Features:
+
+ - accounting anonymous pages, file caches, swap caches usage and limiting them.
+ - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
+ - optionally, memory+swap usage can be accounted and limited.
+ - hierarchical accounting
+ - soft limit
+ - moving (recharging) account at moving a task is selectable.
+ - usage threshold notifier
+ - memory pressure notifier
+ - oom-killer disable knob and oom-notifier
+ - Root cgroup has no limit controls.
+
+ Kernel memory support is a work in progress, and the current version provides
+ basically functionality. (See Section 2.7)
+
+Brief summary of control files.
+
+==================================== ==========================================
+ tasks				     attach a task(thread) and show list of
+				     threads
+ cgroup.procs			     show list of processes
+ cgroup.event_control		     an interface for event_fd()
+ memory.usage_in_bytes		     show current usage for memory
+				     (See 5.5 for details)
+ memory.memsw.usage_in_bytes	     show current usage for memory+Swap
+				     (See 5.5 for details)
+ memory.limit_in_bytes		     set/show limit of memory usage
+ memory.memsw.limit_in_bytes	     set/show limit of memory+Swap usage
+ memory.failcnt			     show the number of memory usage hits limits
+ memory.memsw.failcnt		     show the number of memory+Swap hits limits
+ memory.max_usage_in_bytes	     show max memory usage recorded
+ memory.memsw.max_usage_in_bytes     show max memory+Swap usage recorded
+ memory.soft_limit_in_bytes	     set/show soft limit of memory usage
+ memory.stat			     show various statistics
+ memory.use_hierarchy		     set/show hierarchical account enabled
+ memory.force_empty		     trigger forced page reclaim
+ memory.pressure_level		     set memory pressure notifications
+ memory.swappiness		     set/show swappiness parameter of vmscan
+				     (See sysctl's vm.swappiness)
+ memory.move_charge_at_immigrate     set/show controls of moving charges
+ memory.oom_control		     set/show oom controls.
+ memory.numa_stat		     show the number of memory usage per numa
+				     node
+
+ memory.kmem.limit_in_bytes          set/show hard limit for kernel memory
+ memory.kmem.usage_in_bytes          show current kernel memory allocation
+ memory.kmem.failcnt                 show the number of kernel memory usage
+				     hits limits
+ memory.kmem.max_usage_in_bytes      show max kernel memory usage recorded
+
+ memory.kmem.tcp.limit_in_bytes      set/show hard limit for tcp buf memory
+ memory.kmem.tcp.usage_in_bytes      show current tcp buf memory allocation
+ memory.kmem.tcp.failcnt             show the number of tcp buf memory usage
+				     hits limits
+ memory.kmem.tcp.max_usage_in_bytes  show max tcp buf memory usage recorded
+==================================== ==========================================
+
+1. History
+==========
+
+The memory controller has a long history. A request for comments for the memory
+controller was posted by Balbir Singh [1]. At the time the RFC was posted
+there were several implementations for memory control. The goal of the
+RFC was to build consensus and agreement for the minimal features required
+for memory control. The first RSS controller was posted by Balbir Singh[2]
+in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
+RSS controller. At OLS, at the resource management BoF, everyone suggested
+that we handle both page cache and RSS together. Another request was raised
+to allow user space handling of OOM. The current memory controller is
+at version 6; it combines both mapped (RSS) and unmapped Page
+Cache Control [11].
+
+2. Memory Control
+=================
+
+Memory is a unique resource in the sense that it is present in a limited
+amount. If a task requires a lot of CPU processing, the task can spread
+its processing over a period of hours, days, months or years, but with
+memory, the same physical memory needs to be reused to accomplish the task.
+
+The memory controller implementation has been divided into phases. These
+are:
+
+1. Memory controller
+2. mlock(2) controller
+3. Kernel user memory accounting and slab control
+4. user mappings length controller
+
+The memory controller is the first controller developed.
+
+2.1. Design
+-----------
+
+The core of the design is a counter called the page_counter. The
+page_counter tracks the current memory usage and limit of the group of
+processes associated with the controller. Each cgroup has a memory controller
+specific data structure (mem_cgroup) associated with it.
+
+2.2. Accounting
+---------------
+
+::
+
+		+--------------------+
+		|  mem_cgroup        |
+		|  (page_counter)    |
+		+--------------------+
+		 /            ^      \
+		/             |       \
+           +---------------+  |        +---------------+
+           | mm_struct     |  |....    | mm_struct     |
+           |               |  |        |               |
+           +---------------+  |        +---------------+
+                              |
+                              + --------------+
+                                              |
+           +---------------+           +------+--------+
+           | page          +---------->  page_cgroup|
+           |               |           |               |
+           +---------------+           +---------------+
+
+             (Figure 1: Hierarchy of Accounting)
+
+
+Figure 1 shows the important aspects of the controller
+
+1. Accounting happens per cgroup
+2. Each mm_struct knows about which cgroup it belongs to
+3. Each page has a pointer to the page_cgroup, which in turn knows the
+   cgroup it belongs to
+
+The accounting is done as follows: mem_cgroup_charge_common() is invoked to
+set up the necessary data structures and check if the cgroup that is being
+charged is over its limit. If it is, then reclaim is invoked on the cgroup.
+More details can be found in the reclaim section of this document.
+If everything goes well, a page meta-data-structure called page_cgroup is
+updated. page_cgroup has its own LRU on cgroup.
+(*) page_cgroup structure is allocated at boot/memory-hotplug time.
+
+2.2.1 Accounting details
+------------------------
+
+All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
+Some pages which are never reclaimable and will not be on the LRU
+are not accounted. We just account pages under usual VM management.
+
+RSS pages are accounted at page_fault unless they've already been accounted
+for earlier. A file page will be accounted for as Page Cache when it's
+inserted into inode (radix-tree). While it's mapped into the page tables of
+processes, duplicate accounting is carefully avoided.
+
+An RSS page is unaccounted when it's fully unmapped. A PageCache page is
+unaccounted when it's removed from radix-tree. Even if RSS pages are fully
+unmapped (by kswapd), they may exist as SwapCache in the system until they
+are really freed. Such SwapCaches are also accounted.
+A swapped-in page is not accounted until it's mapped.
+
+Note: The kernel does swapin-readahead and reads multiple swaps at once.
+This means swapped-in pages may contain pages for other tasks than a task
+causing page fault. So, we avoid accounting at swap-in I/O.
+
+At page migration, accounting information is kept.
+
+Note: we just account pages-on-LRU because our purpose is to control amount
+of used pages; not-on-LRU pages tend to be out-of-control from VM view.
+
+2.3 Shared Page Accounting
+--------------------------
+
+Shared pages are accounted on the basis of the first touch approach. The
+cgroup that first touches a page is accounted for the page. The principle
+behind this approach is that a cgroup that aggressively uses a shared
+page will eventually get charged for it (once it is uncharged from
+the cgroup that brought it in -- this will happen on memory pressure).
+
+But see section 8.2: when moving a task to another cgroup, its pages may
+be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
+
+Exception: If CONFIG_MEMCG_SWAP is not used.
+When you do swapoff and make swapped-out pages of shmem(tmpfs) to
+be backed into memory in force, charges for pages are accounted against the
+caller of swapoff rather than the users of shmem.
+
+2.4 Swap Extension (CONFIG_MEMCG_SWAP)
+--------------------------------------
+
+Swap Extension allows you to record charge for swap. A swapped-in page is
+charged back to original page allocator if possible.
+
+When swap is accounted, following files are added.
+
+ - memory.memsw.usage_in_bytes.
+ - memory.memsw.limit_in_bytes.
+
+memsw means memory+swap. Usage of memory+swap is limited by
+memsw.limit_in_bytes.
+
+Example: Assume a system with 4G of swap. A task which allocates 6G of memory
+(by mistake) under 2G memory limitation will use all swap.
+In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
+By using the memsw limit, you can avoid system OOM which can be caused by swap
+shortage.
+
+**why 'memory+swap' rather than swap**
+
+The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
+to move account from memory to swap...there is no change in usage of
+memory+swap. In other words, when we want to limit the usage of swap without
+affecting global LRU, memory+swap limit is better than just limiting swap from
+an OS point of view.
+
+**What happens when a cgroup hits memory.memsw.limit_in_bytes**
+
+When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
+in this cgroup. Then, swap-out will not be done by cgroup routine and file
+caches are dropped. But as mentioned above, global LRU can do swapout memory
+from it for sanity of the system's memory management state. You can't forbid
+it by cgroup.
+
+2.5 Reclaim
+-----------
+
+Each cgroup maintains a per cgroup LRU which has the same structure as
+global VM. When a cgroup goes over its limit, we first try
+to reclaim memory from the cgroup so as to make space for the new
+pages that the cgroup has touched. If the reclaim is unsuccessful,
+an OOM routine is invoked to select and kill the bulkiest task in the
+cgroup. (See 10. OOM Control below.)
+
+The reclaim algorithm has not been modified for cgroups, except that
+pages that are selected for reclaiming come from the per-cgroup LRU
+list.
+
+NOTE:
+  Reclaim does not work for the root cgroup, since we cannot set any
+  limits on the root cgroup.
+
+Note2:
+  When panic_on_oom is set to "2", the whole system will panic.
+
+When oom event notifier is registered, event will be delivered.
+(See oom_control section)
+
+2.6 Locking
+-----------
+
+   lock_page_cgroup()/unlock_page_cgroup() should not be called under
+   the i_pages lock.
+
+   Other lock order is following:
+
+   PG_locked.
+     mm->page_table_lock
+         pgdat->lru_lock
+	   lock_page_cgroup.
+
+  In many cases, just lock_page_cgroup() is called.
+
+  per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
+  pgdat->lru_lock, it has no lock of its own.
+
+2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
+-----------------------------------------------
+
+With the Kernel memory extension, the Memory Controller is able to limit
+the amount of kernel memory used by the system. Kernel memory is fundamentally
+different than user memory, since it can't be swapped out, which makes it
+possible to DoS the system by consuming too much of this precious resource.
+
+Kernel memory accounting is enabled for all memory cgroups by default. But
+it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel
+at boot time. In this case, kernel memory will not be accounted at all.
+
+Kernel memory limits are not imposed for the root cgroup. Usage for the root
+cgroup may or may not be accounted. The memory used is accumulated into
+memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
+(currently only for tcp).
+
+The main "kmem" counter is fed into the main counter, so kmem charges will
+also be visible from the user counter.
+
+Currently no soft limit is implemented for kernel memory. It is future work
+to trigger slab reclaim when those limits are reached.
+
+2.7.1 Current Kernel Memory resources accounted
+-----------------------------------------------
+
+stack pages:
+  every process consumes some stack pages. By accounting into
+  kernel memory, we prevent new processes from being created when the kernel
+  memory usage is too high.
+
+slab pages:
+  pages allocated by the SLAB or SLUB allocator are tracked. A copy
+  of each kmem_cache is created every time the cache is touched by the first time
+  from inside the memcg. The creation is done lazily, so some objects can still be
+  skipped while the cache is being created. All objects in a slab page should
+  belong to the same memcg. This only fails to hold when a task is migrated to a
+  different memcg during the page allocation by the cache.
+
+sockets memory pressure:
+  some sockets protocols have memory pressure
+  thresholds. The Memory Controller allows them to be controlled individually
+  per cgroup, instead of globally.
+
+tcp memory pressure:
+  sockets memory pressure for the tcp protocol.
+
+2.7.2 Common use cases
+----------------------
+
+Because the "kmem" counter is fed to the main user counter, kernel memory can
+never be limited completely independently of user memory. Say "U" is the user
+limit, and "K" the kernel limit. There are three possible ways limits can be
+set:
+
+U != 0, K = unlimited:
+    This is the standard memcg limitation mechanism already present before kmem
+    accounting. Kernel memory is completely ignored.
+
+U != 0, K < U:
+    Kernel memory is a subset of the user memory. This setup is useful in
+    deployments where the total amount of memory per-cgroup is overcommited.
+    Overcommiting kernel memory limits is definitely not recommended, since the
+    box can still run out of non-reclaimable memory.
+    In this case, the admin could set up K so that the sum of all groups is
+    never greater than the total memory, and freely set U at the cost of his
+    QoS.
+
+WARNING:
+    In the current implementation, memory reclaim will NOT be
+    triggered for a cgroup when it hits K while staying below U, which makes
+    this setup impractical.
+
+U != 0, K >= U:
+    Since kmem charges will also be fed to the user counter and reclaim will be
+    triggered for the cgroup for both kinds of memory. This setup gives the
+    admin a unified view of memory, and it is also useful for people who just
+    want to track kernel memory usage.
+
+3. User Interface
+=================
+
+3.0. Configuration
+------------------
+
+a. Enable CONFIG_CGROUPS
+b. Enable CONFIG_MEMCG
+c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
+d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
+
+3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
+-------------------------------------------------------------------
+
+::
+
+	# mount -t tmpfs none /sys/fs/cgroup
+	# mkdir /sys/fs/cgroup/memory
+	# mount -t cgroup none /sys/fs/cgroup/memory -o memory
+
+3.2. Make the new group and move bash into it::
+
+	# mkdir /sys/fs/cgroup/memory/0
+	# echo $$ > /sys/fs/cgroup/memory/0/tasks
+
+Since now we're in the 0 cgroup, we can alter the memory limit::
+
+	# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+
+NOTE:
+  We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
+  mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
+  Gibibytes.)
+
+NOTE:
+  We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``.
+
+NOTE:
+  We cannot set limits on the root cgroup any more.
+
+::
+
+  # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+  4194304
+
+We can check the usage::
+
+  # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
+  1216512
+
+A successful write to this file does not guarantee a successful setting of
+this limit to the value written into the file. This can be due to a
+number of factors, such as rounding up to page boundaries or the total
+availability of memory on the system. The user is required to re-read
+this file after a write to guarantee the value committed by the kernel::
+
+  # echo 1 > memory.limit_in_bytes
+  # cat memory.limit_in_bytes
+  4096
+
+The memory.failcnt field gives the number of times that the cgroup limit was
+exceeded.
+
+The memory.stat file gives accounting information. Now, the number of
+caches, RSS and Active pages/Inactive pages are shown.
+
+4. Testing
+==========
+
+For testing features and implementation, see memcg_test.txt.
+
+Performance test is also important. To see pure memory controller's overhead,
+testing on tmpfs will give you good numbers of small overheads.
+Example: do kernel make on tmpfs.
+
+Page-fault scalability is also important. At measuring parallel
+page fault test, multi-process test may be better than multi-thread
+test because it has noise of shared objects/status.
+
+But the above two are testing extreme situations.
+Trying usual test under memory controller is always helpful.
+
+4.1 Troubleshooting
+-------------------
+
+Sometimes a user might find that the application under a cgroup is
+terminated by the OOM killer. There are several causes for this:
+
+1. The cgroup limit is too low (just too low to do anything useful)
+2. The user is using anonymous memory and swap is turned off or too low
+
+A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
+some of the pages cached in the cgroup (page cache pages).
+
+To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
+seeing what happens will be helpful.
+
+4.2 Task migration
+------------------
+
+When a task migrates from one cgroup to another, its charge is not
+carried forward by default. The pages allocated from the original cgroup still
+remain charged to it, the charge is dropped when the page is freed or
+reclaimed.
+
+You can move charges of a task along with task migration.
+See 8. "Move charges at task migration"
+
+4.3 Removing a cgroup
+---------------------
+
+A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
+cgroup might have some charge associated with it, even though all
+tasks have migrated away from it. (because we charge against pages, not
+against tasks.)
+
+We move the stats to root (if use_hierarchy==0) or parent (if
+use_hierarchy==1), and no change on the charge except uncharging
+from the child.
+
+Charges recorded in swap information is not updated at removal of cgroup.
+Recorded information is discarded and a cgroup which uses swap (swapcache)
+will be charged as a new owner of it.
+
+About use_hierarchy, see Section 6.
+
+5. Misc. interfaces
+===================
+
+5.1 force_empty
+---------------
+  memory.force_empty interface is provided to make cgroup's memory usage empty.
+  When writing anything to this::
+
+    # echo 0 > memory.force_empty
+
+  the cgroup will be reclaimed and as many pages reclaimed as possible.
+
+  The typical use case for this interface is before calling rmdir().
+  Though rmdir() offlines memcg, but the memcg may still stay there due to
+  charged file caches. Some out-of-use page caches may keep charged until
+  memory pressure happens. If you want to avoid that, force_empty will be useful.
+
+  Also, note that when memory.kmem.limit_in_bytes is set the charges due to
+  kernel pages will still be seen. This is not considered a failure and the
+  write will still return success. In this case, it is expected that
+  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
+
+  About use_hierarchy, see Section 6.
+
+5.2 stat file
+-------------
+
+memory.stat file includes following statistics
+
+per-memory cgroup local status
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+=============== ===============================================================
+cache		# of bytes of page cache memory.
+rss		# of bytes of anonymous and swap cache memory (includes
+		transparent hugepages).
+rss_huge	# of bytes of anonymous transparent hugepages.
+mapped_file	# of bytes of mapped file (includes tmpfs/shmem)
+pgpgin		# of charging events to the memory cgroup. The charging
+		event happens each time a page is accounted as either mapped
+		anon page(RSS) or cache page(Page Cache) to the cgroup.
+pgpgout		# of uncharging events to the memory cgroup. The uncharging
+		event happens each time a page is unaccounted from the cgroup.
+swap		# of bytes of swap usage
+dirty		# of bytes that are waiting to get written back to the disk.
+writeback	# of bytes of file/anon cache that are queued for syncing to
+		disk.
+inactive_anon	# of bytes of anonymous and swap cache memory on inactive
+		LRU list.
+active_anon	# of bytes of anonymous and swap cache memory on active
+		LRU list.
+inactive_file	# of bytes of file-backed memory on inactive LRU list.
+active_file	# of bytes of file-backed memory on active LRU list.
+unevictable	# of bytes of memory that cannot be reclaimed (mlocked etc).
+=============== ===============================================================
+
+status considering hierarchy (see memory.use_hierarchy settings)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========================= ===================================================
+hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy
+			  under which the memory cgroup is
+hierarchical_memsw_limit  # of bytes of memory+swap limit with regard to
+			  hierarchy under which memory cgroup is.
+
+total_<counter>		  # hierarchical version of <counter>, which in
+			  addition to the cgroup's own value includes the
+			  sum of all hierarchical children's values of
+			  <counter>, i.e. total_cache
+========================= ===================================================
+
+The following additional stats are dependent on CONFIG_DEBUG_VM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========================= ========================================
+recent_rotated_anon	  VM internal parameter. (see mm/vmscan.c)
+recent_rotated_file	  VM internal parameter. (see mm/vmscan.c)
+recent_scanned_anon	  VM internal parameter. (see mm/vmscan.c)
+recent_scanned_file	  VM internal parameter. (see mm/vmscan.c)
+========================= ========================================
+
+Memo:
+	recent_rotated means recent frequency of LRU rotation.
+	recent_scanned means recent # of scans to LRU.
+	showing for better debug please see the code for meanings.
+
+Note:
+	Only anonymous and swap cache memory is listed as part of 'rss' stat.
+	This should not be confused with the true 'resident set size' or the
+	amount of physical memory used by the cgroup.
+
+	'rss + mapped_file" will give you resident set size of cgroup.
+
+	(Note: file and shmem may be shared among other cgroups. In that case,
+	mapped_file is accounted only when the memory cgroup is owner of page
+	cache.)
+
+5.3 swappiness
+--------------
+
+Overrides /proc/sys/vm/swappiness for the particular group. The tunable
+in the root cgroup corresponds to the global swappiness setting.
+
+Please note that unlike during the global reclaim, limit reclaim
+enforces that 0 swappiness really prevents from any swapping even if
+there is a swap storage available. This might lead to memcg OOM killer
+if there are no file pages to reclaim.
+
+5.4 failcnt
+-----------
+
+A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
+This failcnt(== failure count) shows the number of times that a usage counter
+hit its limit. When a memory cgroup hits a limit, failcnt increases and
+memory under it will be reclaimed.
+
+You can reset failcnt by writing 0 to failcnt file::
+
+	# echo 0 > .../memory.failcnt
+
+5.5 usage_in_bytes
+------------------
+
+For efficiency, as other kernel components, memory cgroup uses some optimization
+to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
+method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz
+value for efficient access. (Of course, when necessary, it's synchronized.)
+If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
+value in memory.stat(see 5.2).
+
+5.6 numa_stat
+-------------
+
+This is similar to numa_maps but operates on a per-memcg basis.  This is
+useful for providing visibility into the numa locality information within
+an memcg since the pages are allowed to be allocated from any physical
+node.  One of the use cases is evaluating application performance by
+combining this information with the application's CPU allocation.
+
+Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
+per-node page counts including "hierarchical_<counter>" which sums up all
+hierarchical children's values in addition to the memcg's own value.
+
+The output format of memory.numa_stat is::
+
+  total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+  hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
+
+The "total" count is sum of file + anon + unevictable.
+
+6. Hierarchy support
+====================
+
+The memory controller supports a deep hierarchy and hierarchical accounting.
+The hierarchy is created by creating the appropriate cgroups in the
+cgroup filesystem. Consider for example, the following cgroup filesystem
+hierarchy::
+
+	       root
+	     /  |   \
+            /	|    \
+	   a	b     c
+		      | \
+		      |  \
+		      d   e
+
+In the diagram above, with hierarchical accounting enabled, all memory
+usage of e, is accounted to its ancestors up until the root (i.e, c and root),
+that has memory.use_hierarchy enabled. If one of the ancestors goes over its
+limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
+children of the ancestor.
+
+6.1 Enabling hierarchical accounting and reclaim
+------------------------------------------------
+
+A memory cgroup by default disables the hierarchy feature. Support
+can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup::
+
+	# echo 1 > memory.use_hierarchy
+
+The feature can be disabled by::
+
+	# echo 0 > memory.use_hierarchy
+
+NOTE1:
+       Enabling/disabling will fail if either the cgroup already has other
+       cgroups created below it, or if the parent cgroup has use_hierarchy
+       enabled.
+
+NOTE2:
+       When panic_on_oom is set to "2", the whole system will panic in
+       case of an OOM event in any cgroup.
+
+7. Soft limits
+==============
+
+Soft limits allow for greater sharing of memory. The idea behind soft limits
+is to allow control groups to use as much of the memory as needed, provided
+
+a. There is no memory contention
+b. They do not exceed their hard limit
+
+When the system detects memory contention or low memory, control groups
+are pushed back to their soft limits. If the soft limit of each control
+group is very high, they are pushed back as much as possible to make
+sure that one control group does not starve the others of memory.
+
+Please note that soft limits is a best-effort feature; it comes with
+no guarantees, but it does its best to make sure that when memory is
+heavily contended for, memory is allocated based on the soft limit
+hints/setup. Currently soft limit based reclaim is set up such that
+it gets invoked from balance_pgdat (kswapd).
+
+7.1 Interface
+-------------
+
+Soft limits can be setup by using the following commands (in this example we
+assume a soft limit of 256 MiB)::
+
+	# echo 256M > memory.soft_limit_in_bytes
+
+If we want to change this to 1G, we can at any time use::
+
+	# echo 1G > memory.soft_limit_in_bytes
+
+NOTE1:
+       Soft limits take effect over a long period of time, since they involve
+       reclaiming memory for balancing between memory cgroups
+NOTE2:
+       It is recommended to set the soft limit always below the hard limit,
+       otherwise the hard limit will take precedence.
+
+8. Move charges at task migration
+=================================
+
+Users can move charges associated with a task along with task migration, that
+is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
+This feature is not supported in !CONFIG_MMU environments because of lack of
+page tables.
+
+8.1 Interface
+-------------
+
+This feature is disabled by default. It can be enabled (and disabled again) by
+writing to memory.move_charge_at_immigrate of the destination cgroup.
+
+If you want to enable it::
+
+	# echo (some positive value) > memory.move_charge_at_immigrate
+
+Note:
+      Each bits of move_charge_at_immigrate has its own meaning about what type
+      of charges should be moved. See 8.2 for details.
+Note:
+      Charges are moved only when you move mm->owner, in other words,
+      a leader of a thread group.
+Note:
+      If we cannot find enough space for the task in the destination cgroup, we
+      try to make space by reclaiming memory. Task migration may fail if we
+      cannot make enough space.
+Note:
+      It can take several seconds if you move charges much.
+
+And if you want disable it again::
+
+	# echo 0 > memory.move_charge_at_immigrate
+
+8.2 Type of charges which can be moved
+--------------------------------------
+
+Each bit in move_charge_at_immigrate has its own meaning about what type of
+charges should be moved. But in any case, it must be noted that an account of
+a page or a swap can be moved only when it is charged to the task's current
+(old) memory cgroup.
+
++---+--------------------------------------------------------------------------+
+|bit| what type of charges would be moved ?                                    |
++===+==========================================================================+
+| 0 | A charge of an anonymous page (or swap of it) used by the target task.   |
+|   | You must enable Swap Extension (see 2.4) to enable move of swap charges. |
++---+--------------------------------------------------------------------------+
+| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) |
+|   | and swaps of tmpfs file) mmapped by the target task. Unlike the case of  |
+|   | anonymous pages, file pages (and swaps) in the range mmapped by the task |
+|   | will be moved even if the task hasn't done page fault, i.e. they might   |
+|   | not be the task's "RSS", but other task's "RSS" that maps the same file. |
+|   | And mapcount of the page is ignored (the page can be moved even if       |
+|   | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to    |
+|   | enable move of swap charges.                                             |
++---+--------------------------------------------------------------------------+
+
+8.3 TODO
+--------
+
+- All of moving charge operations are done under cgroup_mutex. It's not good
+  behavior to hold the mutex too long, so we may need some trick.
+
+9. Memory thresholds
+====================
+
+Memory cgroup implements memory thresholds using the cgroups notification
+API (see cgroups.txt). It allows to register multiple memory and memsw
+thresholds and gets notifications when it crosses.
+
+To register a threshold, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
+- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
+  cgroup.event_control.
+
+Application will be notified through eventfd when memory usage crosses
+threshold in any direction.
+
+It's applicable for root and non-root cgroup.
+
+10. OOM Control
+===============
+
+memory.oom_control file is for OOM notification and other controls.
+
+Memory cgroup implements OOM notifier using the cgroup notification
+API (See cgroups.txt). It allows to register multiple OOM notification
+delivery and gets notification when OOM happens.
+
+To register a notifier, an application must:
+
+ - create an eventfd using eventfd(2)
+ - open memory.oom_control file
+ - write string like "<event_fd> <fd of memory.oom_control>" to
+   cgroup.event_control
+
+The application will be notified through eventfd when OOM happens.
+OOM notification doesn't work for the root cgroup.
+
+You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
+
+	#echo 1 > memory.oom_control
+
+If OOM-killer is disabled, tasks under cgroup will hang/sleep
+in memory cgroup's OOM-waitqueue when they request accountable memory.
+
+For running them, you have to relax the memory cgroup's OOM status by
+
+	* enlarge limit or reduce usage.
+
+To reduce usage,
+
+	* kill some tasks.
+	* move some tasks to other group with account migration.
+	* remove some files (on tmpfs?)
+
+Then, stopped tasks will work again.
+
+At reading, current status of OOM is shown.
+
+	- oom_kill_disable 0 or 1
+	  (if 1, oom-killer is disabled)
+	- under_oom	   0 or 1
+	  (if 1, the memory cgroup is under OOM, tasks may be stopped.)
+
+11. Memory Pressure
+===================
+
+The pressure level notifications can be used to monitor the memory
+allocation cost; based on the pressure, applications can implement
+different strategies of managing their memory resources. The pressure
+levels are defined as following:
+
+The "low" level means that the system is reclaiming memory for new
+allocations. Monitoring this reclaiming activity might be useful for
+maintaining cache level. Upon notification, the program (typically
+"Activity Manager") might analyze vmstat and act in advance (i.e.
+prematurely shutdown unimportant services).
+
+The "medium" level means that the system is experiencing medium memory
+pressure, the system might be making swap, paging out active file caches,
+etc. Upon this event applications may decide to further analyze
+vmstat/zoneinfo/memcg or internal memory usage statistics and free any
+resources that can be easily reconstructed or re-read from a disk.
+
+The "critical" level means that the system is actively thrashing, it is
+about to out of memory (OOM) or even the in-kernel OOM killer is on its
+way to trigger. Applications should do whatever they can to help the
+system. It might be too late to consult with vmstat or any other
+statistics, so it's advisable to take an immediate action.
+
+By default, events are propagated upward until the event is handled, i.e. the
+events are not pass-through. For example, you have three cgroups: A->B->C. Now
+you set up an event listener on cgroups A, B and C, and suppose group C
+experiences some pressure. In this situation, only group C will receive the
+notification, i.e. groups A and B will not receive it. This is done to avoid
+excessive "broadcasting" of messages, which disturbs the system and which is
+especially bad if we are low on memory or thrashing. Group B, will receive
+notification only if there are no event listers for group C.
+
+There are three optional modes that specify different propagation behavior:
+
+ - "default": this is the default behavior specified above. This mode is the
+   same as omitting the optional mode parameter, preserved by backwards
+   compatibility.
+
+ - "hierarchy": events always propagate up to the root, similar to the default
+   behavior, except that propagation continues regardless of whether there are
+   event listeners at each level, with the "hierarchy" mode. In the above
+   example, groups A, B, and C will receive notification of memory pressure.
+
+ - "local": events are pass-through, i.e. they only receive notifications when
+   memory pressure is experienced in the memcg for which the notification is
+   registered. In the above example, group C will receive notification if
+   registered for "local" notification and the group experiences memory
+   pressure. However, group B will never receive notification, regardless if
+   there is an event listener for group C or not, if group B is registered for
+   local notification.
+
+The level and event notification mode ("hierarchy" or "local", if necessary) are
+specified by a comma-delimited string, i.e. "low,hierarchy" specifies
+hierarchical, pass-through, notification for all ancestor memcgs. Notification
+that is the default, non pass-through behavior, does not specify a mode.
+"medium,local" specifies pass-through notification for the medium level.
+
+The file memory.pressure_level is only used to setup an eventfd. To
+register a notification, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.pressure_level;
+- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
+  to cgroup.event_control.
+
+Application will be notified through eventfd when memory pressure is at
+the specific level (or higher). Read/write operations to
+memory.pressure_level are no implemented.
+
+Test:
+
+   Here is a small script example that makes a new cgroup, sets up a
+   memory limit, sets up a notification in the cgroup and then makes child
+   cgroup experience a critical pressure::
+
+	# cd /sys/fs/cgroup/memory/
+	# mkdir foo
+	# cd foo
+	# cgroup_event_listener memory.pressure_level low,hierarchy &
+	# echo 8000000 > memory.limit_in_bytes
+	# echo 8000000 > memory.memsw.limit_in_bytes
+	# echo $$ > tasks
+	# dd if=/dev/zero | read x
+
+   (Expect a bunch of notifications, and eventually, the oom-killer will
+   trigger.)
+
+12. TODO
+========
+
+1. Make per-cgroup scanner reclaim not-shared pages first
+2. Teach controller to account for shared-pages
+3. Start reclamation in the background when the limit is
+   not yet hit but the usage is getting closer
+
+Summary
+=======
+
+Overall, the memory controller has been a stable controller and has been
+commented and discussed quite extensively in the community.
+
+References
+==========
+
+1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
+2. Singh, Balbir. Memory Controller (RSS Control),
+   http://lwn.net/Articles/222762/
+3. Emelianov, Pavel. Resource controllers based on process cgroups
+   http://lkml.org/lkml/2007/3/6/198
+4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
+   http://lkml.org/lkml/2007/4/9/78
+5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
+   http://lkml.org/lkml/2007/5/30/244
+6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
+7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
+   subsystem (v3), http://lwn.net/Articles/235534/
+8. Singh, Balbir. RSS controller v2 test results (lmbench),
+   http://lkml.org/lkml/2007/5/17/232
+9. Singh, Balbir. RSS controller v2 AIM9 results
+   http://lkml.org/lkml/2007/5/18/1
+10. Singh, Balbir. Memory controller v6 test results,
+    http://lkml.org/lkml/2007/8/19/36
+11. Singh, Balbir. Memory controller introduction (v6),
+    http://lkml.org/lkml/2007/8/17/69
+12. Corbet, Jonathan, Controlling memory use in cgroups,
+    http://lwn.net/Articles/243795/
diff --git a/Documentation/admin-guide/cgroup-v1/net_cls.rst b/Documentation/admin-guide/cgroup-v1/net_cls.rst
new file mode 100644
index 000000000000..a2cf272af7a0
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/net_cls.rst
@@ -0,0 +1,44 @@
+=========================
+Network classifier cgroup
+=========================
+
+The Network classifier cgroup provides an interface to
+tag network packets with a class identifier (classid).
+
+The Traffic Controller (tc) can be used to assign
+different priorities to packets from different cgroups.
+Also, Netfilter (iptables) can use this tag to perform
+actions on such packets.
+
+Creating a net_cls cgroups instance creates a net_cls.classid file.
+This net_cls.classid value is initialized to 0.
+
+You can write hexadecimal values to net_cls.classid; the format for these
+values is 0xAAAABBBB; AAAA is the major handle number and BBBB
+is the minor handle number.
+Reading net_cls.classid yields a decimal result.
+
+Example::
+
+	mkdir /sys/fs/cgroup/net_cls
+	mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
+	mkdir /sys/fs/cgroup/net_cls/0
+	echo 0x100001 >  /sys/fs/cgroup/net_cls/0/net_cls.classid
+
+- setting a 10:1 handle::
+
+	cat /sys/fs/cgroup/net_cls/0/net_cls.classid
+	1048577
+
+- configuring tc::
+
+	tc qdisc add dev eth0 root handle 10: htb
+	tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
+
+- creating traffic class 10:1::
+
+	tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
+
+configuring iptables, basic example::
+
+	iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/admin-guide/cgroup-v1/net_prio.rst b/Documentation/admin-guide/cgroup-v1/net_prio.rst
new file mode 100644
index 000000000000..b40905871c64
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/net_prio.rst
@@ -0,0 +1,57 @@
+=======================
+Network priority cgroup
+=======================
+
+The Network priority cgroup provides an interface to allow an administrator to
+dynamically set the priority of network traffic generated by various
+applications
+
+Nominally, an application would set the priority of its traffic via the
+SO_PRIORITY socket option.  This however, is not always possible because:
+
+1) The application may not have been coded to set this value
+2) The priority of application traffic is often a site-specific administrative
+   decision rather than an application defined one.
+
+This cgroup allows an administrator to assign a process to a group which defines
+the priority of egress traffic on a given interface. Network priority groups can
+be created by first mounting the cgroup filesystem::
+
+	# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
+
+With the above step, the initial group acting as the parent accounting group
+becomes visible at '/sys/fs/cgroup/net_prio'.  This group includes all tasks in
+the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
+
+Each net_prio cgroup contains two files that are subsystem specific
+
+net_prio.prioidx
+  This file is read-only, and is simply informative.  It contains a unique
+  integer value that the kernel uses as an internal representation of this
+  cgroup.
+
+net_prio.ifpriomap
+  This file contains a map of the priorities assigned to traffic originating
+  from processes in this group and egressing the system on various interfaces.
+  It contains a list of tuples in the form <ifname priority>.  Contents of this
+  file can be modified by echoing a string into the file using the same tuple
+  format. For example::
+
+	echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap
+
+This command would force any traffic originating from processes belonging to the
+iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
+said traffic set to the value 5. The parent accounting group also has a
+writeable 'net_prio.ifpriomap' file that can be used to set a system default
+priority.
+
+Priorities are set immediately prior to queueing a frame to the device
+queueing discipline (qdisc) so priorities will be assigned prior to the hardware
+queue selection being made.
+
+One usage for the net_prio cgroup is with mqprio qdisc allowing application
+traffic to be steered to hardware/driver based traffic classes. These mappings
+can then be managed by administrators or other networking protocols such as
+DCBX.
+
+A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst
new file mode 100644
index 000000000000..6acebd9e72c8
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/pids.rst
@@ -0,0 +1,92 @@
+=========================
+Process Number Controller
+=========================
+
+Abstract
+--------
+
+The process number controller is used to allow a cgroup hierarchy to stop any
+new tasks from being fork()'d or clone()'d after a certain limit is reached.
+
+Since it is trivial to hit the task limit without hitting any kmemcg limits in
+place, PIDs are a fundamental resource. As such, PID exhaustion must be
+preventable in the scope of a cgroup hierarchy by allowing resource limiting of
+the number of tasks in a cgroup.
+
+Usage
+-----
+
+In order to use the `pids` controller, set the maximum number of tasks in
+pids.max (this is not available in the root cgroup for obvious reasons). The
+number of processes currently in the cgroup is given by pids.current.
+
+Organisational operations are not blocked by cgroup policies, so it is possible
+to have pids.current > pids.max. This can be done by either setting the limit to
+be smaller than pids.current, or attaching enough processes to the cgroup such
+that pids.current > pids.max. However, it is not possible to violate a cgroup
+policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
+creation of a new process would cause a cgroup policy to be violated.
+
+To set a cgroup to have no limit, set pids.max to "max". This is the default for
+all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
+limit in the hierarchy is followed).
+
+pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
+superset of parent/child/pids.current.
+
+The pids.events file contains event counters:
+
+  - max: Number of times fork failed because limit was hit.
+
+Example
+-------
+
+First, we mount the pids controller::
+
+	# mkdir -p /sys/fs/cgroup/pids
+	# mount -t cgroup -o pids none /sys/fs/cgroup/pids
+
+Then we create a hierarchy, set limits and attach processes to it::
+
+	# mkdir -p /sys/fs/cgroup/pids/parent/child
+	# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
+	# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	#
+
+It should be noted that attempts to overcome the set limit (2 in this case) will
+fail::
+
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	# ( /bin/echo "Here's some processes for you." | cat )
+	sh: fork: Resource temporary unavailable
+	#
+
+Even if we migrate to a child cgroup (which doesn't have a set limit), we will
+not be able to overcome the most stringent limit in the hierarchy (in this case,
+parent's)::
+
+	# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
+	# cat /sys/fs/cgroup/pids/parent/pids.current
+	2
+	# cat /sys/fs/cgroup/pids/parent/child/pids.current
+	2
+	# cat /sys/fs/cgroup/pids/parent/child/pids.max
+	max
+	# ( /bin/echo "Here's some processes for you." | cat )
+	sh: fork: Resource temporary unavailable
+	#
+
+We can set a limit that is smaller than pids.current, which will stop any new
+processes from being forked at all (note that the shell itself counts towards
+pids.current)::
+
+	# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
+	# /bin/echo "We can't even spawn a single process now."
+	sh: fork: Resource temporary unavailable
+	# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
+	# /bin/echo "We can't even spawn a single process now."
+	sh: fork: Resource temporary unavailable
+	#
diff --git a/Documentation/admin-guide/cgroup-v1/rdma.rst b/Documentation/admin-guide/cgroup-v1/rdma.rst
new file mode 100644
index 000000000000..2fcb0a9bf790
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/rdma.rst
@@ -0,0 +1,117 @@
+===============
+RDMA Controller
+===============
+
+.. Contents
+
+   1. Overview
+     1-1. What is RDMA controller?
+     1-2. Why RDMA controller needed?
+     1-3. How is RDMA controller implemented?
+   2. Usage Examples
+
+1. Overview
+===========
+
+1-1. What is RDMA controller?
+-----------------------------
+
+RDMA controller allows user to limit RDMA/IB specific resources that a given
+set of processes can use. These processes are grouped using RDMA controller.
+
+RDMA controller defines two resources which can be limited for processes of a
+cgroup.
+
+1-2. Why RDMA controller needed?
+--------------------------------
+
+Currently user space applications can easily take away all the rdma verb
+specific resources such as AH, CQ, QP, MR etc. Due to which other applications
+in other cgroup or kernel space ULPs may not even get chance to allocate any
+rdma resources. This can lead to service unavailability.
+
+Therefore RDMA controller is needed through which resource consumption
+of processes can be limited. Through this controller different rdma
+resources can be accounted.
+
+1-3. How is RDMA controller implemented?
+----------------------------------------
+
+RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
+resource accounting per cgroup, per device using resource pool structure.
+Each such resource pool is limited up to 64 resources in given resource pool
+by rdma cgroup, which can be extended later if required.
+
+This resource pool object is linked to the cgroup css. Typically there
+are 0 to 4 resource pool instances per cgroup, per device in most use cases.
+But nothing limits to have it more. At present hundreds of RDMA devices per
+single cgroup may not be handled optimally, however there is no
+known use case or requirement for such configuration either.
+
+Since RDMA resources can be allocated from any process and can be freed by any
+of the child processes which shares the address space, rdma resources are
+always owned by the creator cgroup css. This allows process migration from one
+to other cgroup without major complexity of transferring resource ownership;
+because such ownership is not really present due to shared nature of
+rdma resources. Linking resources around css also ensures that cgroups can be
+deleted after processes migrated. This allow progress migration as well with
+active resources, even though that is not a primary use case.
+
+Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
+the caller. Same rdma cgroup should be passed while uncharging the resource.
+This also allows process migrated with active RDMA resource to charge
+to new owner cgroup for new resource. It also allows to uncharge resource of
+a process from previously charged cgroup which is migrated to new cgroup,
+even though that is not a primary use case.
+
+Resource pool object is created in following situations.
+(a) User sets the limit and no previous resource pool exist for the device
+of interest for the cgroup.
+(b) No resource limits were configured, but IB/RDMA stack tries to
+charge the resource. So that it correctly uncharge them when applications are
+running without limits and later on when limits are enforced during uncharging,
+otherwise usage count will drop to negative.
+
+Resource pool is destroyed if all the resource limits are set to max and
+it is the last resource getting deallocated.
+
+User should set all the limit to max value if it intents to remove/unconfigure
+the resource pool for a particular device.
+
+IB stack honors limits enforced by the rdma controller. When application
+query about maximum resource limits of IB device, it returns minimum of
+what is configured by user for a given cgroup and what is supported by
+IB device.
+
+Following resources can be accounted by rdma controller.
+
+  ==========    =============================
+  hca_handle	Maximum number of HCA Handles
+  hca_object 	Maximum number of HCA Objects
+  ==========    =============================
+
+2. Usage Examples
+=================
+
+(a) Configure resource limit::
+
+	echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
+	echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
+
+(b) Query resource limit::
+
+	cat /sys/fs/cgroup/rdma/2/rdma.max
+	#Output:
+	mlx4_0 hca_handle=2 hca_object=2000
+	ocrdma1 hca_handle=3 hca_object=max
+
+(c) Query current usage::
+
+	cat /sys/fs/cgroup/rdma/2/rdma.current
+	#Output:
+	mlx4_0 hca_handle=1 hca_object=20
+	ocrdma1 hca_handle=1 hca_object=23
+
+(d) Delete resource limit::
+
+	echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 080b18ce2a5d..ed4c5977d6e1 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -9,7 +9,7 @@ This is the authoritative documentation on the design, interface and
 conventions of cgroup v2.  It describes all userland-visible aspects
 of cgroup including core and specific controller behaviors.  All
 future changes must be reflected in this document.  Documentation for
-v1 is available under Documentation/cgroup-v1/.
+v1 is available under Documentation/admin-guide/cgroup-v1/.
 
 .. CONTENTS
 
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 1f0d9b939311..a5fdb1a846ce 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -59,6 +59,7 @@ configure specific aspects of kernel behavior to your liking.
 
    initrd
    cgroup-v2
+   cgroup-v1/index
    serial-console
    braille-console
    parport
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 78576aa45cce..a571a67e0c85 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4089,7 +4089,7 @@
 
 	relax_domain_level=
 			[KNL, SMP] Set scheduler's default relax_domain_level.
-			See Documentation/cgroup-v1/cpusets.rst.
+			See Documentation/admin-guide/cgroup-v1/cpusets.rst.
 
 	reserve=	[KNL,BUGS] Force kernel to ignore I/O ports or memory
 			Format: <base1>,<size1>[,<base2>,<size2>,...]
@@ -4599,7 +4599,7 @@
 	swapaccount=[0|1]
 			[KNL] Enable accounting of swap in memory resource
 			controller if no parameter or 1 is given or disable
-			it if 0 is given (See Documentation/cgroup-v1/memory.rst)
+			it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
 
 	swiotlb=	[ARM,IA-64,PPC,MIPS,X86]
 			Format: { <int> | force | noforce }
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index 546f174e5d6a..8463f5538fda 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -15,7 +15,7 @@ document attempts to describe the concepts and APIs of the 2.6 memory policy
 support.
 
 Memory policies should not be confused with cpusets
-(``Documentation/cgroup-v1/cpusets.rst``)
+(``Documentation/admin-guide/cgroup-v1/cpusets.rst``)
 which is an administrative mechanism for restricting the nodes from which
 memory may be allocated by a set of processes. Memory policies are a
 programming interface that a NUMA-aware application can take advantage of.  When
diff --git a/Documentation/block/bfq-iosched.rst b/Documentation/block/bfq-iosched.rst
index 2c13b2fc1888..0d237d402860 100644
--- a/Documentation/block/bfq-iosched.rst
+++ b/Documentation/block/bfq-iosched.rst
@@ -547,7 +547,7 @@ As for cgroups-v1 (blkio controller), the exact set of stat files
 created, and kept up-to-date by bfq, depends on whether
 CONFIG_BFQ_CGROUP_DEBUG is set. If it is set, then bfq creates all
 the stat files documented in
-Documentation/cgroup-v1/blkio-controller.rst. If, instead,
+Documentation/admin-guide/cgroup-v1/blkio-controller.rst. If, instead,
 CONFIG_BFQ_CGROUP_DEBUG is not set, then bfq creates only the files::
 
   blkio.bfq.io_service_bytes
diff --git a/Documentation/cgroup-v1/blkio-controller.rst b/Documentation/cgroup-v1/blkio-controller.rst
deleted file mode 100644
index 1d7d962933be..000000000000
--- a/Documentation/cgroup-v1/blkio-controller.rst
+++ /dev/null
@@ -1,302 +0,0 @@
-===================
-Block IO Controller
-===================
-
-Overview
-========
-cgroup subsys "blkio" implements the block io controller. There seems to be
-a need of various kinds of IO control policies (like proportional BW, max BW)
-both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
-Plan is to use the same cgroup based management interface for blkio controller
-and based on user options switch IO policies in the background.
-
-One IO control policy is throttling policy which can be used to
-specify upper IO rate limits on devices. This policy is implemented in
-generic block layer and can be used on leaf nodes as well as higher
-level logical devices like device mapper.
-
-HOWTO
-=====
-Throttling/Upper Limit policy
------------------------------
-- Enable Block IO controller::
-
-	CONFIG_BLK_CGROUP=y
-
-- Enable throttling in block layer::
-
-	CONFIG_BLK_DEV_THROTTLING=y
-
-- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)::
-
-        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
-
-- Specify a bandwidth rate on particular device for root group. The format
-  for policy is "<major>:<minor>  <bytes_per_second>"::
-
-        echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
-
-  Above will put a limit of 1MB/second on reads happening for root group
-  on device having major/minor number 8:16.
-
-- Run dd to read a file and see if rate is throttled to 1MB/s or not::
-
-        # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
-        1024+0 records in
-        1024+0 records out
-        4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
-
- Limits for writes can be put using blkio.throttle.write_bps_device file.
-
-Hierarchical Cgroups
-====================
-
-Throttling implements hierarchy support; however,
-throttling's hierarchy support is enabled iff "sane_behavior" is
-enabled from cgroup side, which currently is a development option and
-not publicly available.
-
-If somebody created a hierarchy like as follows::
-
-			root
-			/  \
-		     test1 test2
-			|
-		     test3
-
-Throttling with "sane_behavior" will handle the
-hierarchy correctly. For throttling, all limits apply
-to the whole subtree while all statistics are local to the IOs
-directly generated by tasks in that cgroup.
-
-Throttling without "sane_behavior" enabled from cgroup side will
-practically treat all groups at same level as if it looks like the
-following::
-
-				pivot
-			     /  /   \  \
-			root  test1 test2  test3
-
-Various user visible config options
-===================================
-CONFIG_BLK_CGROUP
-	- Block IO controller.
-
-CONFIG_BFQ_CGROUP_DEBUG
-	- Debug help. Right now some additional stats file show up in cgroup
-	  if this option is enabled.
-
-CONFIG_BLK_DEV_THROTTLING
-	- Enable block device throttling support in block layer.
-
-Details of cgroup files
-=======================
-Proportional weight policy files
---------------------------------
-- blkio.weight
-	- Specifies per cgroup weight. This is default weight of the group
-	  on all the devices until and unless overridden by per device rule.
-	  (See blkio.weight_device).
-	  Currently allowed range of weights is from 10 to 1000.
-
-- blkio.weight_device
-	- One can specify per cgroup per device rules using this interface.
-	  These rules override the default value of group weight as specified
-	  by blkio.weight.
-
-	  Following is the format::
-
-	    # echo dev_maj:dev_minor weight > blkio.weight_device
-
-	  Configure weight=300 on /dev/sdb (8:16) in this cgroup::
-
-	    # echo 8:16 300 > blkio.weight_device
-	    # cat blkio.weight_device
-	    dev     weight
-	    8:16    300
-
-	  Configure weight=500 on /dev/sda (8:0) in this cgroup::
-
-	    # echo 8:0 500 > blkio.weight_device
-	    # cat blkio.weight_device
-	    dev     weight
-	    8:0     500
-	    8:16    300
-
-	  Remove specific weight for /dev/sda in this cgroup::
-
-	    # echo 8:0 0 > blkio.weight_device
-	    # cat blkio.weight_device
-	    dev     weight
-	    8:16    300
-
-- blkio.leaf_weight[_device]
-	- Equivalents of blkio.weight[_device] for the purpose of
-          deciding how much weight tasks in the given cgroup has while
-          competing with the cgroup's child cgroups. For details,
-          please refer to Documentation/block/cfq-iosched.txt.
-
-- blkio.time
-	- disk time allocated to cgroup per device in milliseconds. First
-	  two fields specify the major and minor number of the device and
-	  third field specifies the disk time allocated to group in
-	  milliseconds.
-
-- blkio.sectors
-	- number of sectors transferred to/from disk by the group. First
-	  two fields specify the major and minor number of the device and
-	  third field specifies the number of sectors transferred by the
-	  group to/from the device.
-
-- blkio.io_service_bytes
-	- Number of bytes transferred to/from the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of bytes.
-
-- blkio.io_serviced
-	- Number of IOs (bio) issued to the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of IOs.
-
-- blkio.io_service_time
-	- Total amount of time between request dispatch and request completion
-	  for the IOs done by this cgroup. This is in nanoseconds to make it
-	  meaningful for flash devices too. For devices with queue depth of 1,
-	  this time represents the actual service time. When queue_depth > 1,
-	  that is no longer true as requests may be served out of order. This
-	  may cause the service time for a given IO to include the service time
-	  of multiple IOs when served out of order which may result in total
-	  io_service_time > actual time elapsed. This time is further divided by
-	  the type of operation - read or write, sync or async. First two fields
-	  specify the major and minor number of the device, third field
-	  specifies the operation type and the fourth field specifies the
-	  io_service_time in ns.
-
-- blkio.io_wait_time
-	- Total amount of time the IOs for this cgroup spent waiting in the
-	  scheduler queues for service. This can be greater than the total time
-	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
-	  measure of total time the cgroup spent waiting but rather a measure of
-	  the wait_time for its individual IOs. For devices with queue_depth > 1
-	  this metric does not include the time spent waiting for service once
-	  the IO is dispatched to the device but till it actually gets serviced
-	  (there might be a time lag here due to re-ordering of requests by the
-	  device). This is in nanoseconds to make it meaningful for flash
-	  devices too. This time is further divided by the type of operation -
-	  read or write, sync or async. First two fields specify the major and
-	  minor number of the device, third field specifies the operation type
-	  and the fourth field specifies the io_wait_time in ns.
-
-- blkio.io_merged
-	- Total number of bios/requests merged into requests belonging to this
-	  cgroup. This is further divided by the type of operation - read or
-	  write, sync or async.
-
-- blkio.io_queued
-	- Total number of requests queued up at any given instant for this
-	  cgroup. This is further divided by the type of operation - read or
-	  write, sync or async.
-
-- blkio.avg_queue_size
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
-	  The average queue size for this cgroup over the entire time of this
-	  cgroup's existence. Queue size samples are taken each time one of the
-	  queues of this cgroup gets a timeslice.
-
-- blkio.group_wait_time
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
-	  This is the amount of time the cgroup had to wait since it became busy
-	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
-	  its queues. This is different from the io_wait_time which is the
-	  cumulative total of the amount of time spent by each IO in that cgroup
-	  waiting in the scheduler queue. This is in nanoseconds. If this is
-	  read when the cgroup is in a waiting (for timeslice) state, the stat
-	  will only report the group_wait_time accumulated till the last time it
-	  got a timeslice and will not include the current delta.
-
-- blkio.empty_time
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
-	  This is the amount of time a cgroup spends without any pending
-	  requests when not being served, i.e., it does not include any time
-	  spent idling for one of the queues of the cgroup. This is in
-	  nanoseconds. If this is read when the cgroup is in an empty state,
-	  the stat will only report the empty_time accumulated till the last
-	  time it had a pending request and will not include the current delta.
-
-- blkio.idle_time
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
-	  This is the amount of time spent by the IO scheduler idling for a
-	  given cgroup in anticipation of a better request than the existing ones
-	  from other queues/cgroups. This is in nanoseconds. If this is read
-	  when the cgroup is in an idling state, the stat will only report the
-	  idle_time accumulated till the last idle period and will not include
-	  the current delta.
-
-- blkio.dequeue
-	- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This
-	  gives the statistics about how many a times a group was dequeued
-	  from service tree of the device. First two fields specify the major
-	  and minor number of the device and third field specifies the number
-	  of times a group was dequeued from a particular device.
-
-- blkio.*_recursive
-	- Recursive version of various stats. These files show the
-          same information as their non-recursive counterparts but
-          include stats from all the descendant cgroups.
-
-Throttling/Upper limit policy files
------------------------------------
-- blkio.throttle.read_bps_device
-	- Specifies upper limit on READ rate from the device. IO rate is
-	  specified in bytes per second. Rules are per device. Following is
-	  the format::
-
-	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
-
-- blkio.throttle.write_bps_device
-	- Specifies upper limit on WRITE rate to the device. IO rate is
-	  specified in bytes per second. Rules are per device. Following is
-	  the format::
-
-	    echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
-
-- blkio.throttle.read_iops_device
-	- Specifies upper limit on READ rate from the device. IO rate is
-	  specified in IO per second. Rules are per device. Following is
-	  the format::
-
-	   echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
-
-- blkio.throttle.write_iops_device
-	- Specifies upper limit on WRITE rate to the device. IO rate is
-	  specified in io per second. Rules are per device. Following is
-	  the format::
-
-	    echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
-
-Note: If both BW and IOPS rules are specified for a device, then IO is
-      subjected to both the constraints.
-
-- blkio.throttle.io_serviced
-	- Number of IOs (bio) issued to the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of IOs.
-
-- blkio.throttle.io_service_bytes
-	- Number of bytes transferred to/from the disk by the group. These
-	  are further divided by the type of operation - read or write, sync
-	  or async. First two fields specify the major and minor number of the
-	  device, third field specifies the operation type and the fourth field
-	  specifies the number of bytes.
-
-Common files among various policies
------------------------------------
-- blkio.reset_stats
-	- Writing an int to this file will result in resetting all the stats
-	  for that cgroup.
diff --git a/Documentation/cgroup-v1/cgroups.rst b/Documentation/cgroup-v1/cgroups.rst
deleted file mode 100644
index 46bbe7e022d4..000000000000
--- a/Documentation/cgroup-v1/cgroups.rst
+++ /dev/null
@@ -1,695 +0,0 @@
-==============
-Control Groups
-==============
-
-Written by Paul Menage <menage@google.com> based on
-Documentation/cgroup-v1/cpusets.rst
-
-Original copyright statements from cpusets.txt:
-
-Portions Copyright (C) 2004 BULL SA.
-
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-
-Modified by Paul Jackson <pj@sgi.com>
-
-Modified by Christoph Lameter <cl@linux.com>
-
-.. CONTENTS:
-
-	1. Control Groups
-	1.1 What are cgroups ?
-	1.2 Why are cgroups needed ?
-	1.3 How are cgroups implemented ?
-	1.4 What does notify_on_release do ?
-	1.5 What does clone_children do ?
-	1.6 How do I use cgroups ?
-	2. Usage Examples and Syntax
-	2.1 Basic Usage
-	2.2 Attaching processes
-	2.3 Mounting hierarchies by name
-	3. Kernel API
-	3.1 Overview
-	3.2 Synchronization
-	3.3 Subsystem API
-	4. Extended attributes usage
-	5. Questions
-
-1. Control Groups
-=================
-
-1.1 What are cgroups ?
-----------------------
-
-Control Groups provide a mechanism for aggregating/partitioning sets of
-tasks, and all their future children, into hierarchical groups with
-specialized behaviour.
-
-Definitions:
-
-A *cgroup* associates a set of tasks with a set of parameters for one
-or more subsystems.
-
-A *subsystem* is a module that makes use of the task grouping
-facilities provided by cgroups to treat groups of tasks in
-particular ways. A subsystem is typically a "resource controller" that
-schedules a resource or applies per-cgroup limits, but it may be
-anything that wants to act on a group of processes, e.g. a
-virtualization subsystem.
-
-A *hierarchy* is a set of cgroups arranged in a tree, such that
-every task in the system is in exactly one of the cgroups in the
-hierarchy, and a set of subsystems; each subsystem has system-specific
-state attached to each cgroup in the hierarchy.  Each hierarchy has
-an instance of the cgroup virtual filesystem associated with it.
-
-At any one time there may be multiple active hierarchies of task
-cgroups. Each hierarchy is a partition of all tasks in the system.
-
-User-level code may create and destroy cgroups by name in an
-instance of the cgroup virtual file system, specify and query to
-which cgroup a task is assigned, and list the task PIDs assigned to
-a cgroup. Those creations and assignments only affect the hierarchy
-associated with that instance of the cgroup file system.
-
-On their own, the only use for cgroups is for simple job
-tracking. The intention is that other subsystems hook into the generic
-cgroup support to provide new attributes for cgroups, such as
-accounting/limiting the resources which processes in a cgroup can
-access. For example, cpusets (see Documentation/cgroup-v1/cpusets.rst) allow
-you to associate a set of CPUs and a set of memory nodes with the
-tasks in each cgroup.
-
-1.2 Why are cgroups needed ?
-----------------------------
-
-There are multiple efforts to provide process aggregations in the
-Linux kernel, mainly for resource-tracking purposes. Such efforts
-include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
-namespaces. These all require the basic notion of a
-grouping/partitioning of processes, with newly forked processes ending
-up in the same group (cgroup) as their parent process.
-
-The kernel cgroup patch provides the minimum essential kernel
-mechanisms required to efficiently implement such groups. It has
-minimal impact on the system fast paths, and provides hooks for
-specific subsystems such as cpusets to provide additional behaviour as
-desired.
-
-Multiple hierarchy support is provided to allow for situations where
-the division of tasks into cgroups is distinctly different for
-different subsystems - having parallel hierarchies allows each
-hierarchy to be a natural division of tasks, without having to handle
-complex combinations of tasks that would be present if several
-unrelated subsystems needed to be forced into the same tree of
-cgroups.
-
-At one extreme, each resource controller or subsystem could be in a
-separate hierarchy; at the other extreme, all subsystems
-would be attached to the same hierarchy.
-
-As an example of a scenario (originally proposed by vatsa@in.ibm.com)
-that can benefit from multiple hierarchies, consider a large
-university server with various users - students, professors, system
-tasks etc. The resource planning for this server could be along the
-following lines::
-
-       CPU :          "Top cpuset"
-                       /       \
-               CPUSet1         CPUSet2
-                  |               |
-               (Professors)    (Students)
-
-               In addition (system tasks) are attached to topcpuset (so
-               that they can run anywhere) with a limit of 20%
-
-       Memory : Professors (50%), Students (30%), system (20%)
-
-       Disk : Professors (50%), Students (30%), system (20%)
-
-       Network : WWW browsing (20%), Network File System (60%), others (20%)
-                               / \
-               Professors (15%)  students (5%)
-
-Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
-into the NFS network class.
-
-At the same time Firefox/Lynx will share an appropriate CPU/Memory class
-depending on who launched it (prof/student).
-
-With the ability to classify tasks differently for different resources
-(by putting those resource subsystems in different hierarchies),
-the admin can easily set up a script which receives exec notifications
-and depending on who is launching the browser he can::
-
-    # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
-
-With only a single hierarchy, he now would potentially have to create
-a separate cgroup for every browser launched and associate it with
-appropriate network and other resource class.  This may lead to
-proliferation of such cgroups.
-
-Also let's say that the administrator would like to give enhanced network
-access temporarily to a student's browser (since it is night and the user
-wants to do online gaming :))  OR give one of the student's simulation
-apps enhanced CPU power.
-
-With ability to write PIDs directly to resource classes, it's just a
-matter of::
-
-       # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
-       (after some time)
-       # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
-
-Without this ability, the administrator would have to split the cgroup into
-multiple separate ones and then associate the new cgroups with the
-new resource classes.
-
-
-
-1.3 How are cgroups implemented ?
----------------------------------
-
-Control Groups extends the kernel as follows:
-
- - Each task in the system has a reference-counted pointer to a
-   css_set.
-
- - A css_set contains a set of reference-counted pointers to
-   cgroup_subsys_state objects, one for each cgroup subsystem
-   registered in the system. There is no direct link from a task to
-   the cgroup of which it's a member in each hierarchy, but this
-   can be determined by following pointers through the
-   cgroup_subsys_state objects. This is because accessing the
-   subsystem state is something that's expected to happen frequently
-   and in performance-critical code, whereas operations that require a
-   task's actual cgroup assignments (in particular, moving between
-   cgroups) are less common. A linked list runs through the cg_list
-   field of each task_struct using the css_set, anchored at
-   css_set->tasks.
-
- - A cgroup hierarchy filesystem can be mounted for browsing and
-   manipulation from user space.
-
- - You can list all the tasks (by PID) attached to any cgroup.
-
-The implementation of cgroups requires a few, simple hooks
-into the rest of the kernel, none in performance-critical paths:
-
- - in init/main.c, to initialize the root cgroups and initial
-   css_set at system boot.
-
- - in fork and exit, to attach and detach a task from its css_set.
-
-In addition, a new file system of type "cgroup" may be mounted, to
-enable browsing and modifying the cgroups presently known to the
-kernel.  When mounting a cgroup hierarchy, you may specify a
-comma-separated list of subsystems to mount as the filesystem mount
-options.  By default, mounting the cgroup filesystem attempts to
-mount a hierarchy containing all registered subsystems.
-
-If an active hierarchy with exactly the same set of subsystems already
-exists, it will be reused for the new mount. If no existing hierarchy
-matches, and any of the requested subsystems are in use in an existing
-hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
-is activated, associated with the requested subsystems.
-
-It's not currently possible to bind a new subsystem to an active
-cgroup hierarchy, or to unbind a subsystem from an active cgroup
-hierarchy. This may be possible in future, but is fraught with nasty
-error-recovery issues.
-
-When a cgroup filesystem is unmounted, if there are any
-child cgroups created below the top-level cgroup, that hierarchy
-will remain active even though unmounted; if there are no
-child cgroups then the hierarchy will be deactivated.
-
-No new system calls are added for cgroups - all support for
-querying and modifying cgroups is via this cgroup file system.
-
-Each task under /proc has an added file named 'cgroup' displaying,
-for each active hierarchy, the subsystem names and the cgroup name
-as the path relative to the root of the cgroup file system.
-
-Each cgroup is represented by a directory in the cgroup file system
-containing the following files describing that cgroup:
-
- - tasks: list of tasks (by PID) attached to that cgroup.  This list
-   is not guaranteed to be sorted.  Writing a thread ID into this file
-   moves the thread into this cgroup.
- - cgroup.procs: list of thread group IDs in the cgroup.  This list is
-   not guaranteed to be sorted or free of duplicate TGIDs, and userspace
-   should sort/uniquify the list if this property is required.
-   Writing a thread group ID into this file moves all threads in that
-   group into this cgroup.
- - notify_on_release flag: run the release agent on exit?
- - release_agent: the path to use for release notifications (this file
-   exists in the top cgroup only)
-
-Other subsystems such as cpusets may add additional files in each
-cgroup dir.
-
-New cgroups are created using the mkdir system call or shell
-command.  The properties of a cgroup, such as its flags, are
-modified by writing to the appropriate file in that cgroups
-directory, as listed above.
-
-The named hierarchical structure of nested cgroups allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cgroup allows organizing the work load
-on a system into related sets of tasks.  A task may be re-attached to
-any other cgroup, if allowed by the permissions on the necessary
-cgroup file system directories.
-
-When a task is moved from one cgroup to another, it gets a new
-css_set pointer - if there's an already existing css_set with the
-desired collection of cgroups then that group is reused, otherwise a new
-css_set is allocated. The appropriate existing css_set is located by
-looking into a hash table.
-
-To allow access from a cgroup to the css_sets (and hence tasks)
-that comprise it, a set of cg_cgroup_link objects form a lattice;
-each cg_cgroup_link is linked into a list of cg_cgroup_links for
-a single cgroup on its cgrp_link_list field, and a list of
-cg_cgroup_links for a single css_set on its cg_link_list.
-
-Thus the set of tasks in a cgroup can be listed by iterating over
-each css_set that references the cgroup, and sub-iterating over
-each css_set's task set.
-
-The use of a Linux virtual file system (vfs) to represent the
-cgroup hierarchy provides for a familiar permission and name space
-for cgroups, with a minimum of additional kernel code.
-
-1.4 What does notify_on_release do ?
-------------------------------------
-
-If the notify_on_release flag is enabled (1) in a cgroup, then
-whenever the last task in the cgroup leaves (exits or attaches to
-some other cgroup) and the last child cgroup of that cgroup
-is removed, then the kernel runs the command specified by the contents
-of the "release_agent" file in that hierarchy's root directory,
-supplying the pathname (relative to the mount point of the cgroup
-file system) of the abandoned cgroup.  This enables automatic
-removal of abandoned cgroups.  The default value of
-notify_on_release in the root cgroup at system boot is disabled
-(0).  The default value of other cgroups at creation is the current
-value of their parents' notify_on_release settings. The default value of
-a cgroup hierarchy's release_agent path is empty.
-
-1.5 What does clone_children do ?
----------------------------------
-
-This flag only affects the cpuset controller. If the clone_children
-flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
-configuration from the parent during initialization.
-
-1.6 How do I use cgroups ?
---------------------------
-
-To start a new job that is to be contained within a cgroup, using
-the "cpuset" cgroup subsystem, the steps are something like::
-
- 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
- 2) mkdir /sys/fs/cgroup/cpuset
- 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
-    the /sys/fs/cgroup/cpuset virtual file system.
- 5) Start a task that will be the "founding father" of the new job.
- 6) Attach that task to the new cgroup by writing its PID to the
-    /sys/fs/cgroup/cpuset tasks file for that cgroup.
- 7) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will setup a cgroup
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cgroup::
-
-  mount -t tmpfs cgroup_root /sys/fs/cgroup
-  mkdir /sys/fs/cgroup/cpuset
-  mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
-  cd /sys/fs/cgroup/cpuset
-  mkdir Charlie
-  cd Charlie
-  /bin/echo 2-3 > cpuset.cpus
-  /bin/echo 1 > cpuset.mems
-  /bin/echo $$ > tasks
-  sh
-  # The subshell 'sh' is now running in cgroup Charlie
-  # The next line should display '/Charlie'
-  cat /proc/self/cgroup
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using cgroups can be done through the cgroup
-virtual filesystem.
-
-To mount a cgroup hierarchy with all available subsystems, type::
-
-  # mount -t cgroup xxx /sys/fs/cgroup
-
-The "xxx" is not interpreted by the cgroup code, but will appear in
-/proc/mounts so may be any useful identifying string that you like.
-
-Note: Some subsystems do not work without some user input first.  For instance,
-if cpusets are enabled the user will have to populate the cpus and mems files
-for each new cgroup created before that group can be used.
-
-As explained in section `1.2 Why are cgroups needed?` you should create
-different hierarchies of cgroups for each single resource or group of
-resources you want to control. Therefore, you should mount a tmpfs on
-/sys/fs/cgroup and create directories for each cgroup resource or resource
-group::
-
-  # mount -t tmpfs cgroup_root /sys/fs/cgroup
-  # mkdir /sys/fs/cgroup/rg1
-
-To mount a cgroup hierarchy with just the cpuset and memory
-subsystems, type::
-
-  # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
-
-While remounting cgroups is currently supported, it is not recommend
-to use it. Remounting allows changing bound subsystems and
-release_agent. Rebinding is hardly useful as it only works when the
-hierarchy is empty and release_agent itself should be replaced with
-conventional fsnotify. The support for remounting will be removed in
-the future.
-
-To Specify a hierarchy's release_agent::
-
-  # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
-    xxx /sys/fs/cgroup/rg1
-
-Note that specifying 'release_agent' more than once will return failure.
-
-Note that changing the set of subsystems is currently only supported
-when the hierarchy consists of a single (root) cgroup. Supporting
-the ability to arbitrarily bind/unbind subsystems from an existing
-cgroup hierarchy is intended to be implemented in the future.
-
-Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
-tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
-is the cgroup that holds the whole system.
-
-If you want to change the value of release_agent::
-
-  # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
-
-It can also be changed via remount.
-
-If you want to create a new cgroup under /sys/fs/cgroup/rg1::
-
-  # cd /sys/fs/cgroup/rg1
-  # mkdir my_cgroup
-
-Now you want to do something with this cgroup:
-
-  # cd my_cgroup
-
-In this directory you can find several files::
-
-  # ls
-  cgroup.procs notify_on_release tasks
-  (plus whatever files added by the attached subsystems)
-
-Now attach your shell to this cgroup::
-
-  # /bin/echo $$ > tasks
-
-You can also create cgroups inside your cgroup by using mkdir in this
-directory::
-
-  # mkdir my_sub_cs
-
-To remove a cgroup, just use rmdir::
-
-  # rmdir my_sub_cs
-
-This will fail if the cgroup is in use (has cgroups inside, or
-has processes attached, or is held alive by other subsystem-specific
-reference).
-
-2.2 Attaching processes
------------------------
-
-::
-
-  # /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another::
-
-  # /bin/echo PID1 > tasks
-  # /bin/echo PID2 > tasks
-	  ...
-  # /bin/echo PIDn > tasks
-
-You can attach the current shell task by echoing 0::
-
-  # echo 0 > tasks
-
-You can use the cgroup.procs file instead of the tasks file to move all
-threads in a threadgroup at once. Echoing the PID of any task in a
-threadgroup to cgroup.procs causes all tasks in that threadgroup to be
-attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
-in the writing task's threadgroup.
-
-Note: Since every task is always a member of exactly one cgroup in each
-mounted hierarchy, to remove a task from its current cgroup you must
-move it into a new cgroup (possibly the root cgroup) by writing to the
-new cgroup's tasks file.
-
-Note: Due to some restrictions enforced by some cgroup subsystems, moving
-a process to another cgroup can fail.
-
-2.3 Mounting hierarchies by name
---------------------------------
-
-Passing the name=<x> option when mounting a cgroups hierarchy
-associates the given name with the hierarchy.  This can be used when
-mounting a pre-existing hierarchy, in order to refer to it by name
-rather than by its set of active subsystems.  Each hierarchy is either
-nameless, or has a unique name.
-
-The name should match [\w.-]+
-
-When passing a name=<x> option for a new hierarchy, you need to
-specify subsystems manually; the legacy behaviour of mounting all
-subsystems when none are explicitly specified is not supported when
-you give a subsystem a name.
-
-The name of the subsystem appears as part of the hierarchy description
-in /proc/mounts and /proc/<pid>/cgroups.
-
-
-3. Kernel API
-=============
-
-3.1 Overview
-------------
-
-Each kernel subsystem that wants to hook into the generic cgroup
-system needs to create a cgroup_subsys object. This contains
-various methods, which are callbacks from the cgroup system, along
-with a subsystem ID which will be assigned by the cgroup system.
-
-Other fields in the cgroup_subsys object include:
-
-- subsys_id: a unique array index for the subsystem, indicating which
-  entry in cgroup->subsys[] this subsystem should be managing.
-
-- name: should be initialized to a unique subsystem name. Should be
-  no longer than MAX_CGROUP_TYPE_NAMELEN.
-
-- early_init: indicate if the subsystem needs early initialization
-  at system boot.
-
-Each cgroup object created by the system has an array of pointers,
-indexed by subsystem ID; this pointer is entirely managed by the
-subsystem; the generic cgroup code will never touch this pointer.
-
-3.2 Synchronization
--------------------
-
-There is a global mutex, cgroup_mutex, used by the cgroup
-system. This should be taken by anything that wants to modify a
-cgroup. It may also be taken to prevent cgroups from being
-modified, but more specific locks may be more appropriate in that
-situation.
-
-See kernel/cgroup.c for more details.
-
-Subsystems can take/release the cgroup_mutex via the functions
-cgroup_lock()/cgroup_unlock().
-
-Accessing a task's cgroup pointer may be done in the following ways:
-- while holding cgroup_mutex
-- while holding the task's alloc_lock (via task_lock())
-- inside an rcu_read_lock() section via rcu_dereference()
-
-3.3 Subsystem API
------------------
-
-Each subsystem should:
-
-- add an entry in linux/cgroup_subsys.h
-- define a cgroup_subsys object called <name>_cgrp_subsys
-
-Each subsystem may export the following methods. The only mandatory
-methods are css_alloc/free. Any others that are null are presumed to
-be successful no-ops.
-
-``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)``
-(cgroup_mutex held by caller)
-
-Called to allocate a subsystem state object for a cgroup. The
-subsystem should allocate its subsystem state object for the passed
-cgroup, returning a pointer to the new object on success or a
-ERR_PTR() value. On success, the subsystem pointer should point to
-a structure of type cgroup_subsys_state (typically embedded in a
-larger subsystem-specific object), which will be initialized by the
-cgroup system. Note that this will be called at initialization to
-create the root subsystem state for this subsystem; this case can be
-identified by the passed cgroup object having a NULL parent (since
-it's the root of the hierarchy) and may be an appropriate place for
-initialization code.
-
-``int css_online(struct cgroup *cgrp)``
-(cgroup_mutex held by caller)
-
-Called after @cgrp successfully completed all allocations and made
-visible to cgroup_for_each_child/descendant_*() iterators. The
-subsystem may choose to fail creation by returning -errno. This
-callback can be used to implement reliable state sharing and
-propagation along the hierarchy. See the comment on
-cgroup_for_each_descendant_pre() for details.
-
-``void css_offline(struct cgroup *cgrp);``
-(cgroup_mutex held by caller)
-
-This is the counterpart of css_online() and called iff css_online()
-has succeeded on @cgrp. This signifies the beginning of the end of
-@cgrp. @cgrp is being removed and the subsystem should start dropping
-all references it's holding on @cgrp. When all references are dropped,
-cgroup removal will proceed to the next step - css_free(). After this
-callback, @cgrp should be considered dead to the subsystem.
-
-``void css_free(struct cgroup *cgrp)``
-(cgroup_mutex held by caller)
-
-The cgroup system is about to free @cgrp; the subsystem should free
-its subsystem state object. By the time this method is called, @cgrp
-is completely unused; @cgrp->parent is still valid. (Note - can also
-be called for a newly-created cgroup if an error occurs after this
-subsystem's create() method has been called for the new cgroup).
-
-``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
-(cgroup_mutex held by caller)
-
-Called prior to moving one or more tasks into a cgroup; if the
-subsystem returns an error, this will abort the attach operation.
-@tset contains the tasks to be attached and is guaranteed to have at
-least one task in it.
-
-If there are multiple tasks in the taskset, then:
-  - it's guaranteed that all are from the same thread group
-  - @tset contains all tasks from the thread group whether or not
-    they're switching cgroups
-  - the first task is the leader
-
-Each @tset entry also contains the task's old cgroup and tasks which
-aren't switching cgroup can be skipped easily using the
-cgroup_taskset_for_each() iterator. Note that this isn't called on a
-fork. If this method returns 0 (success) then this should remain valid
-while the caller holds cgroup_mutex and it is ensured that either
-attach() or cancel_attach() will be called in future.
-
-``void css_reset(struct cgroup_subsys_state *css)``
-(cgroup_mutex held by caller)
-
-An optional operation which should restore @css's configuration to the
-initial state.  This is currently only used on the unified hierarchy
-when a subsystem is disabled on a cgroup through
-"cgroup.subtree_control" but should remain enabled because other
-subsystems depend on it.  cgroup core makes such a css invisible by
-removing the associated interface files and invokes this callback so
-that the hidden subsystem can return to the initial neutral state.
-This prevents unexpected resource control from a hidden css and
-ensures that the configuration is in the initial state when it is made
-visible again later.
-
-``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
-(cgroup_mutex held by caller)
-
-Called when a task attach operation has failed after can_attach() has succeeded.
-A subsystem whose can_attach() has some side-effects should provide this
-function, so that the subsystem can implement a rollback. If not, not necessary.
-This will be called only about subsystems whose can_attach() operation have
-succeeded. The parameters are identical to can_attach().
-
-``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
-(cgroup_mutex held by caller)
-
-Called after the task has been attached to the cgroup, to allow any
-post-attachment activity that requires memory allocations or blocking.
-The parameters are identical to can_attach().
-
-``void fork(struct task_struct *task)``
-
-Called when a task is forked into a cgroup.
-
-``void exit(struct task_struct *task)``
-
-Called during task exit.
-
-``void free(struct task_struct *task)``
-
-Called when the task_struct is freed.
-
-``void bind(struct cgroup *root)``
-(cgroup_mutex held by caller)
-
-Called when a cgroup subsystem is rebound to a different hierarchy
-and root cgroup. Currently this will only involve movement between
-the default hierarchy (which never has sub-cgroups) and a hierarchy
-that is being created/destroyed (and hence has no sub-cgroups).
-
-4. Extended attribute usage
-===========================
-
-cgroup filesystem supports certain types of extended attributes in its
-directories and files.  The current supported types are:
-
-	- Trusted (XATTR_TRUSTED)
-	- Security (XATTR_SECURITY)
-
-Both require CAP_SYS_ADMIN capability to set.
-
-Like in tmpfs, the extended attributes in cgroup filesystem are stored
-using kernel memory and it's advised to keep the usage at minimum.  This
-is the reason why user defined extended attributes are not supported, since
-any user can do it and there's no limit in the value size.
-
-The current known users for this feature are SELinux to limit cgroup usage
-in containers and systemd for assorted meta data like main PID in a cgroup
-(systemd creates a cgroup per service).
-
-5. Questions
-============
-
-::
-
-  Q: what's up with this '/bin/echo' ?
-  A: bash's builtin 'echo' command does not check calls to write() against
-     errors. If you use it in the cgroup file system, you won't be
-     able to tell whether a command succeeded or failed.
-
-  Q: When I attach processes, only the first of the line gets really attached !
-  A: We can only return one error code per call to write(). So you should also
-     put only ONE PID.
diff --git a/Documentation/cgroup-v1/cpuacct.rst b/Documentation/cgroup-v1/cpuacct.rst
deleted file mode 100644
index d30ed81d2ad7..000000000000
--- a/Documentation/cgroup-v1/cpuacct.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-=========================
-CPU Accounting Controller
-=========================
-
-The CPU accounting controller is used to group tasks using cgroups and
-account the CPU usage of these groups of tasks.
-
-The CPU accounting controller supports multi-hierarchy groups. An accounting
-group accumulates the CPU usage of all of its child groups and the tasks
-directly present in its group.
-
-Accounting groups can be created by first mounting the cgroup filesystem::
-
-  # mount -t cgroup -ocpuacct none /sys/fs/cgroup
-
-With the above step, the initial or the parent accounting group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
-by this group which is essentially the CPU time obtained by all the tasks
-in the system.
-
-New accounting groups can be created under the parent group /sys/fs/cgroup::
-
-  # cd /sys/fs/cgroup
-  # mkdir g1
-  # echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it. CPU time consumed by this bash and its children
-can be obtained from g1/cpuacct.usage and the same is accumulated in
-/sys/fs/cgroup/cpuacct.usage also.
-
-cpuacct.stat file lists a few statistics which further divide the
-CPU time obtained by the cgroup into user and system times. Currently
-the following statistics are supported:
-
-user: Time spent by tasks of the cgroup in user mode.
-system: Time spent by tasks of the cgroup in kernel mode.
-
-user and system are in USER_HZ unit.
-
-cpuacct controller uses percpu_counter interface to collect user and
-system times. This has two side effects:
-
-- It is theoretically possible to see wrong values for user and system times.
-  This is because percpu_counter_read() on 32bit systems isn't safe
-  against concurrent writes.
-- It is possible to see slightly outdated values for user and system times
-  due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroup-v1/cpusets.rst b/Documentation/cgroup-v1/cpusets.rst
deleted file mode 100644
index b6a42cdea72b..000000000000
--- a/Documentation/cgroup-v1/cpusets.rst
+++ /dev/null
@@ -1,866 +0,0 @@
-=======
-CPUSETS
-=======
-
-Copyright (C) 2004 BULL SA.
-
-Written by Simon.Derr@bull.net
-
-- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-- Modified by Paul Jackson <pj@sgi.com>
-- Modified by Christoph Lameter <cl@linux.com>
-- Modified by Paul Menage <menage@google.com>
-- Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
-
-.. CONTENTS:
-
-   1. Cpusets
-     1.1 What are cpusets ?
-     1.2 Why are cpusets needed ?
-     1.3 How are cpusets implemented ?
-     1.4 What are exclusive cpusets ?
-     1.5 What is memory_pressure ?
-     1.6 What is memory spread ?
-     1.7 What is sched_load_balance ?
-     1.8 What is sched_relax_domain_level ?
-     1.9 How do I use cpusets ?
-   2. Usage Examples and Syntax
-     2.1 Basic Usage
-     2.2 Adding/removing cpus
-     2.3 Setting flags
-     2.4 Attaching processes
-   3. Questions
-   4. Contact
-
-1. Cpusets
-==========
-
-1.1 What are cpusets ?
-----------------------
-
-Cpusets provide a mechanism for assigning a set of CPUs and Memory
-Nodes to a set of tasks.   In this document "Memory Node" refers to
-an on-line node that contains memory.
-
-Cpusets constrain the CPU and Memory placement of tasks to only
-the resources within a task's current cpuset.  They form a nested
-hierarchy visible in a virtual file system.  These are the essential
-hooks, beyond what is already present, required to manage dynamic
-job placement on large systems.
-
-Cpusets use the generic cgroup subsystem described in
-Documentation/cgroup-v1/cgroups.rst.
-
-Requests by a task, using the sched_setaffinity(2) system call to
-include CPUs in its CPU affinity mask, and using the mbind(2) and
-set_mempolicy(2) system calls to include Memory Nodes in its memory
-policy, are both filtered through that task's cpuset, filtering out any
-CPUs or Memory Nodes not in that cpuset.  The scheduler will not
-schedule a task on a CPU that is not allowed in its cpus_allowed
-vector, and the kernel page allocator will not allocate a page on a
-node that is not allowed in the requesting task's mems_allowed vector.
-
-User level code may create and destroy cpusets by name in the cgroup
-virtual file system, manage the attributes and permissions of these
-cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
-specify and query to which cpuset a task is assigned, and list the
-task pids assigned to a cpuset.
-
-
-1.2 Why are cpusets needed ?
-----------------------------
-
-The management of large computer systems, with many processors (CPUs),
-complex memory cache hierarchies and multiple Memory Nodes having
-non-uniform access times (NUMA) presents additional challenges for
-the efficient scheduling and memory placement of processes.
-
-Frequently more modest sized systems can be operated with adequate
-efficiency just by letting the operating system automatically share
-the available CPU and Memory resources amongst the requesting tasks.
-
-But larger systems, which benefit more from careful processor and
-memory placement to reduce memory access times and contention,
-and which typically represent a larger investment for the customer,
-can benefit from explicitly placing jobs on properly sized subsets of
-the system.
-
-This can be especially valuable on:
-
-    * Web Servers running multiple instances of the same web application,
-    * Servers running different applications (for instance, a web server
-      and a database), or
-    * NUMA systems running large HPC applications with demanding
-      performance characteristics.
-
-These subsets, or "soft partitions" must be able to be dynamically
-adjusted, as the job mix changes, without impacting other concurrently
-executing jobs. The location of the running jobs pages may also be moved
-when the memory locations are changed.
-
-The kernel cpuset patch provides the minimum essential kernel
-mechanisms required to efficiently implement such subsets.  It
-leverages existing CPU and Memory Placement facilities in the Linux
-kernel to avoid any additional impact on the critical scheduler or
-memory allocator code.
-
-
-1.3 How are cpusets implemented ?
----------------------------------
-
-Cpusets provide a Linux kernel mechanism to constrain which CPUs and
-Memory Nodes are used by a process or set of processes.
-
-The Linux kernel already has a pair of mechanisms to specify on which
-CPUs a task may be scheduled (sched_setaffinity) and on which Memory
-Nodes it may obtain memory (mbind, set_mempolicy).
-
-Cpusets extends these two mechanisms as follows:
-
- - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
-   kernel.
- - Each task in the system is attached to a cpuset, via a pointer
-   in the task structure to a reference counted cgroup structure.
- - Calls to sched_setaffinity are filtered to just those CPUs
-   allowed in that task's cpuset.
- - Calls to mbind and set_mempolicy are filtered to just
-   those Memory Nodes allowed in that task's cpuset.
- - The root cpuset contains all the systems CPUs and Memory
-   Nodes.
- - For any cpuset, one can define child cpusets containing a subset
-   of the parents CPU and Memory Node resources.
- - The hierarchy of cpusets can be mounted at /dev/cpuset, for
-   browsing and manipulation from user space.
- - A cpuset may be marked exclusive, which ensures that no other
-   cpuset (except direct ancestors and descendants) may contain
-   any overlapping CPUs or Memory Nodes.
- - You can list all the tasks (by pid) attached to any cpuset.
-
-The implementation of cpusets requires a few, simple hooks
-into the rest of the kernel, none in performance critical paths:
-
- - in init/main.c, to initialize the root cpuset at system boot.
- - in fork and exit, to attach and detach a task from its cpuset.
- - in sched_setaffinity, to mask the requested CPUs by what's
-   allowed in that task's cpuset.
- - in sched.c migrate_live_tasks(), to keep migrating tasks within
-   the CPUs allowed by their cpuset, if possible.
- - in the mbind and set_mempolicy system calls, to mask the requested
-   Memory Nodes by what's allowed in that task's cpuset.
- - in page_alloc.c, to restrict memory to allowed nodes.
- - in vmscan.c, to restrict page recovery to the current cpuset.
-
-You should mount the "cgroup" filesystem type in order to enable
-browsing and modifying the cpusets presently known to the kernel.  No
-new system calls are added for cpusets - all support for querying and
-modifying cpusets is via this cpuset file system.
-
-The /proc/<pid>/status file for each task has four added lines,
-displaying the task's cpus_allowed (on which CPUs it may be scheduled)
-and mems_allowed (on which Memory Nodes it may obtain memory),
-in the two formats seen in the following example::
-
-  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
-  Cpus_allowed_list:      0-127
-  Mems_allowed:   ffffffff,ffffffff
-  Mems_allowed_list:      0-63
-
-Each cpuset is represented by a directory in the cgroup file system
-containing (on top of the standard cgroup files) the following
-files describing that cpuset:
-
- - cpuset.cpus: list of CPUs in that cpuset
- - cpuset.mems: list of Memory Nodes in that cpuset
- - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
- - cpuset.cpu_exclusive flag: is cpu placement exclusive?
- - cpuset.mem_exclusive flag: is memory placement exclusive?
- - cpuset.mem_hardwall flag:  is memory allocation hardwalled
- - cpuset.memory_pressure: measure of how much paging pressure in cpuset
- - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
- - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
- - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
- - cpuset.sched_relax_domain_level: the searching range when migrating tasks
-
-In addition, only the root cpuset has the following file:
-
- - cpuset.memory_pressure_enabled flag: compute memory_pressure?
-
-New cpusets are created using the mkdir system call or shell
-command.  The properties of a cpuset, such as its flags, allowed
-CPUs and Memory Nodes, and attached tasks, are modified by writing
-to the appropriate file in that cpusets directory, as listed above.
-
-The named hierarchical structure of nested cpusets allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cpuset allows organizing the work load
-on a system into related sets of tasks such that each set is constrained
-to using the CPUs and Memory Nodes of a particular cpuset.  A task
-may be re-attached to any other cpuset, if allowed by the permissions
-on the necessary cpuset file system directories.
-
-Such management of a system "in the large" integrates smoothly with
-the detailed placement done on individual tasks and memory regions
-using the sched_setaffinity, mbind and set_mempolicy system calls.
-
-The following rules apply to each cpuset:
-
- - Its CPUs and Memory Nodes must be a subset of its parents.
- - It can't be marked exclusive unless its parent is.
- - If its cpu or memory is exclusive, they may not overlap any sibling.
-
-These rules, and the natural hierarchy of cpusets, enable efficient
-enforcement of the exclusive guarantee, without having to scan all
-cpusets every time any of them change to ensure nothing overlaps a
-exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
-to represent the cpuset hierarchy provides for a familiar permission
-and name space for cpusets, with a minimum of additional kernel code.
-
-The cpus and mems files in the root (top_cpuset) cpuset are
-read-only.  The cpus file automatically tracks the value of
-cpu_online_mask using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_states[N_MEMORY]--i.e.,
-nodes with memory--using the cpuset_track_online_nodes() hook.
-
-
-1.4 What are exclusive cpusets ?
---------------------------------
-
-If a cpuset is cpu or mem exclusive, no other cpuset, other than
-a direct ancestor or descendant, may share any of the same CPUs or
-Memory Nodes.
-
-A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
-i.e. it restricts kernel allocations for page, buffer and other data
-commonly shared by the kernel across multiple users.  All cpusets,
-whether hardwalled or not, restrict allocations of memory for user
-space.  This enables configuring a system so that several independent
-jobs can share common kernel data, such as file system pages, while
-isolating each job's user allocation in its own cpuset.  To do this,
-construct a large mem_exclusive cpuset to hold all the jobs, and
-construct child, non-mem_exclusive cpusets for each individual job.
-Only a small amount of typical kernel memory, such as requests from
-interrupt handlers, is allowed to be taken outside even a
-mem_exclusive cpuset.
-
-
-1.5 What is memory_pressure ?
------------------------------
-The memory_pressure of a cpuset provides a simple per-cpuset metric
-of the rate that the tasks in a cpuset are attempting to free up in
-use memory on the nodes of the cpuset to satisfy additional memory
-requests.
-
-This enables batch managers monitoring jobs running in dedicated
-cpusets to efficiently detect what level of memory pressure that job
-is causing.
-
-This is useful both on tightly managed systems running a wide mix of
-submitted jobs, which may choose to terminate or re-prioritize jobs that
-are trying to use more memory than allowed on the nodes assigned to them,
-and with tightly coupled, long running, massively parallel scientific
-computing jobs that will dramatically fail to meet required performance
-goals if they start to use more memory than allowed to them.
-
-This mechanism provides a very economical way for the batch manager
-to monitor a cpuset for signs of memory pressure.  It's up to the
-batch manager or other user code to decide what to do about it and
-take action.
-
-==>
-    Unless this feature is enabled by writing "1" to the special file
-    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
-    code of __alloc_pages() for this metric reduces to simply noticing
-    that the cpuset_memory_pressure_enabled flag is zero.  So only
-    systems that enable this feature will compute the metric.
-
-Why a per-cpuset, running average:
-
-    Because this meter is per-cpuset, rather than per-task or mm,
-    the system load imposed by a batch scheduler monitoring this
-    metric is sharply reduced on large systems, because a scan of
-    the tasklist can be avoided on each set of queries.
-
-    Because this meter is a running average, instead of an accumulating
-    counter, a batch scheduler can detect memory pressure with a
-    single read, instead of having to read and accumulate results
-    for a period of time.
-
-    Because this meter is per-cpuset rather than per-task or mm,
-    the batch scheduler can obtain the key information, memory
-    pressure in a cpuset, with a single read, rather than having to
-    query and accumulate results over all the (dynamically changing)
-    set of tasks in the cpuset.
-
-A per-cpuset simple digital filter (requires a spinlock and 3 words
-of data per-cpuset) is kept, and updated by any task attached to that
-cpuset, if it enters the synchronous (direct) page reclaim code.
-
-A per-cpuset file provides an integer number representing the recent
-(half-life of 10 seconds) rate of direct page reclaims caused by
-the tasks in the cpuset, in units of reclaims attempted per second,
-times 1000.
-
-
-1.6 What is memory spread ?
----------------------------
-There are two boolean flag files per cpuset that control where the
-kernel allocates pages for the file system buffers and related in
-kernel data structures.  They are called 'cpuset.memory_spread_page' and
-'cpuset.memory_spread_slab'.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
-the kernel will spread the file system buffers (page cache) evenly
-over all the nodes that the faulting task is allowed to use, instead
-of preferring to put those pages on the node where the task is running.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
-then the kernel will spread some file system related slab caches,
-such as for inodes and dentries evenly over all the nodes that the
-faulting task is allowed to use, instead of preferring to put those
-pages on the node where the task is running.
-
-The setting of these flags does not affect anonymous data segment or
-stack segment pages of a task.
-
-By default, both kinds of memory spreading are off, and memory
-pages are allocated on the node local to where the task is running,
-except perhaps as modified by the task's NUMA mempolicy or cpuset
-configuration, so long as sufficient free memory pages are available.
-
-When new cpusets are created, they inherit the memory spread settings
-of their parent.
-
-Setting memory spreading causes allocations for the affected page
-or slab caches to ignore the task's NUMA mempolicy and be spread
-instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
-mempolicies will not notice any change in these calls as a result of
-their containing task's memory spread settings.  If memory spreading
-is turned off, then the currently specified NUMA mempolicy once again
-applies to memory page allocations.
-
-Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
-files.  By default they contain "0", meaning that the feature is off
-for that cpuset.  If a "1" is written to that file, then that turns
-the named feature on.
-
-The implementation is simple.
-
-Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
-PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
-joins that cpuset.  The page allocation calls for the page cache
-is modified to perform an inline check for this PFA_SPREAD_PAGE task
-flag, and if set, a call to a new routine cpuset_mem_spread_node()
-returns the node to prefer for the allocation.
-
-Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
-PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
-pages from the node returned by cpuset_mem_spread_node().
-
-The cpuset_mem_spread_node() routine is also simple.  It uses the
-value of a per-task rotor cpuset_mem_spread_rotor to select the next
-node in the current task's mems_allowed to prefer for the allocation.
-
-This memory placement policy is also known (in other contexts) as
-round-robin or interleave.
-
-This policy can provide substantial improvements for jobs that need
-to place thread local data on the corresponding node, but that need
-to access large file system data sets that need to be spread across
-the several nodes in the jobs cpuset in order to fit.  Without this
-policy, especially for jobs that might have one thread reading in the
-data set, the memory allocation across the nodes in the jobs cpuset
-can become very uneven.
-
-1.7 What is sched_load_balance ?
---------------------------------
-
-The kernel scheduler (kernel/sched/core.c) automatically load balances
-tasks.  If one CPU is underutilized, kernel code running on that
-CPU will look for tasks on other more overloaded CPUs and move those
-tasks to itself, within the constraints of such placement mechanisms
-as cpusets and sched_setaffinity.
-
-The algorithmic cost of load balancing and its impact on key shared
-kernel data structures such as the task list increases more than
-linearly with the number of CPUs being balanced.  So the scheduler
-has support to partition the systems CPUs into a number of sched
-domains such that it only load balances within each sched domain.
-Each sched domain covers some subset of the CPUs in the system;
-no two sched domains overlap; some CPUs might not be in any sched
-domain and hence won't be load balanced.
-
-Put simply, it costs less to balance between two smaller sched domains
-than one big one, but doing so means that overloads in one of the
-two domains won't be load balanced to the other one.
-
-By default, there is one sched domain covering all CPUs, including those
-marked isolated using the kernel boot time "isolcpus=" argument. However,
-the isolated CPUs will not participate in load balancing, and will not
-have tasks running on them unless explicitly assigned.
-
-This default load balancing across all CPUs is not well suited for
-the following two situations:
-
- 1) On large systems, load balancing across many CPUs is expensive.
-    If the system is managed using cpusets to place independent jobs
-    on separate sets of CPUs, full load balancing is unnecessary.
- 2) Systems supporting realtime on some CPUs need to minimize
-    system overhead on those CPUs, including avoiding task load
-    balancing if that is not needed.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
-setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
-be contained in a single sched domain, ensuring that load balancing
-can move a task (not otherwised pinned, as by sched_setaffinity)
-from any CPU in that cpuset to any other.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
-scheduler will avoid load balancing across the CPUs in that cpuset,
---except-- in so far as is necessary because some overlapping cpuset
-has "sched_load_balance" enabled.
-
-So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
-enabled, then the scheduler will have one sched domain covering all
-CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
-cpusets won't matter, as we're already fully load balancing.
-
-Therefore in the above two situations, the top cpuset flag
-"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
-child cpusets have this flag enabled.
-
-When doing this, you don't usually want to leave any unpinned tasks in
-the top cpuset that might use non-trivial amounts of CPU, as such tasks
-may be artificially constrained to some subset of CPUs, depending on
-the particulars of this flag setting in descendant cpusets.  Even if
-such a task could use spare CPU cycles in some other CPUs, the kernel
-scheduler might not consider the possibility of load balancing that
-task to that underused CPU.
-
-Of course, tasks pinned to a particular CPU can be left in a cpuset
-that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
-else anyway.
-
-There is an impedance mismatch here, between cpusets and sched domains.
-Cpusets are hierarchical and nest.  Sched domains are flat; they don't
-overlap and each CPU is in at most one sched domain.
-
-It is necessary for sched domains to be flat because load balancing
-across partially overlapping sets of CPUs would risk unstable dynamics
-that would be beyond our understanding.  So if each of two partially
-overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
-form a single sched domain that is a superset of both.  We won't move
-a task to a CPU outside its cpuset, but the scheduler load balancing
-code might waste some compute cycles considering that possibility.
-
-This mismatch is why there is not a simple one-to-one relation
-between which cpusets have the flag "cpuset.sched_load_balance" enabled,
-and the sched domain configuration.  If a cpuset enables the flag, it
-will get balancing across all its CPUs, but if it disables the flag,
-it will only be assured of no load balancing if no other overlapping
-cpuset enables the flag.
-
-If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
-one of them has this flag enabled, then the other may find its
-tasks only partially load balanced, just on the overlapping CPUs.
-This is just the general case of the top_cpuset example given a few
-paragraphs above.  In the general case, as in the top cpuset case,
-don't leave tasks that might use non-trivial amounts of CPU in
-such partially load balanced cpusets, as they may be artificially
-constrained to some subset of the CPUs allowed to them, for lack of
-load balancing to the other CPUs.
-
-CPUs in "cpuset.isolcpus" were excluded from load balancing by the
-isolcpus= kernel boot option, and will never be load balanced regardless
-of the value of "cpuset.sched_load_balance" in any cpuset.
-
-1.7.1 sched_load_balance implementation details.
-------------------------------------------------
-
-The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
-to most cpuset flags.)  When enabled for a cpuset, the kernel will
-ensure that it can load balance across all the CPUs in that cpuset
-(makes sure that all the CPUs in the cpus_allowed of that cpuset are
-in the same sched domain.)
-
-If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
-then they will be (must be) both in the same sched domain.
-
-If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
-then by the above that means there is a single sched domain covering
-the whole system, regardless of any other cpuset settings.
-
-The kernel commits to user space that it will avoid load balancing
-where it can.  It will pick as fine a granularity partition of sched
-domains as it can while still providing load balancing for any set
-of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
-
-The internal kernel cpuset to scheduler interface passes from the
-cpuset code to the scheduler code a partition of the load balanced
-CPUs in the system. This partition is a set of subsets (represented
-as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
-all the CPUs that must be load balanced.
-
-The cpuset code builds a new such partition and passes it to the
-scheduler sched domain setup code, to have the sched domains rebuilt
-as necessary, whenever:
-
- - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
- - or CPUs come or go from a cpuset with this flag enabled,
- - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
-   and with this flag enabled changes,
- - or a cpuset with non-empty CPUs and with this flag enabled is removed,
- - or a cpu is offlined/onlined.
-
-This partition exactly defines what sched domains the scheduler should
-setup - one sched domain for each element (struct cpumask) in the
-partition.
-
-The scheduler remembers the currently active sched domain partitions.
-When the scheduler routine partition_sched_domains() is invoked from
-the cpuset code to update these sched domains, it compares the new
-partition requested with the current, and updates its sched domains,
-removing the old and adding the new, for each change.
-
-
-1.8 What is sched_relax_domain_level ?
---------------------------------------
-
-In sched domain, the scheduler migrates tasks in 2 ways; periodic load
-balance on tick, and at time of some schedule events.
-
-When a task is woken up, scheduler try to move the task on idle CPU.
-For example, if a task A running on CPU X activates another task B
-on the same CPU X, and if CPU Y is X's sibling and performing idle,
-then scheduler migrate task B to CPU Y so that task B can start on
-CPU Y without waiting task A on CPU X.
-
-And if a CPU run out of tasks in its runqueue, the CPU try to pull
-extra tasks from other busy CPUs to help them before it is going to
-be idle.
-
-Of course it takes some searching cost to find movable tasks and/or
-idle CPUs, the scheduler might not search all CPUs in the domain
-every time.  In fact, in some architectures, the searching ranges on
-events are limited in the same socket or node where the CPU locates,
-while the load balance on tick searches all.
-
-For example, assume CPU Z is relatively far from CPU X.  Even if CPU Z
-is idle while CPU X and the siblings are busy, scheduler can't migrate
-woken task B from X to Z since it is out of its searching range.
-As the result, task B on CPU X need to wait task A or wait load balance
-on the next tick.  For some applications in special situation, waiting
-1 tick may be too long.
-
-The 'cpuset.sched_relax_domain_level' file allows you to request changing
-this searching range as you like.  This file takes int value which
-indicates size of searching range in levels ideally as follows,
-otherwise initial value -1 that indicates the cpuset has no request.
-
-====== ===========================================================
-  -1   no request. use system default or follow request of others.
-   0   no search.
-   1   search siblings (hyperthreads in a core).
-   2   search cores in a package.
-   3   search cpus in a node [= system wide on non-NUMA system]
-   4   search nodes in a chunk of node [on NUMA system]
-   5   search system wide [on NUMA system]
-====== ===========================================================
-
-The system default is architecture dependent.  The system default
-can be changed using the relax_domain_level= boot parameter.
-
-This file is per-cpuset and affect the sched domain where the cpuset
-belongs to.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
-is disabled, then 'cpuset.sched_relax_domain_level' have no effect since
-there is no sched domain belonging the cpuset.
-
-If multiple cpusets are overlapping and hence they form a single sched
-domain, the largest value among those is used.  Be careful, if one
-requests 0 and others are -1 then 0 is used.
-
-Note that modifying this file will have both good and bad effects,
-and whether it is acceptable or not depends on your situation.
-Don't modify this file if you are not sure.
-
-If your situation is:
-
- - The migration costs between each cpu can be assumed considerably
-   small(for you) due to your special application's behavior or
-   special hardware support for CPU cache etc.
- - The searching cost doesn't have impact(for you) or you can make
-   the searching cost enough small by managing cpuset to compact etc.
- - The latency is required even it sacrifices cache hit rate etc.
-   then increasing 'sched_relax_domain_level' would benefit you.
-
-
-1.9 How do I use cpusets ?
---------------------------
-
-In order to minimize the impact of cpusets on critical kernel
-code, such as the scheduler, and due to the fact that the kernel
-does not support one task updating the memory placement of another
-task directly, the impact on a task of changing its cpuset CPU
-or Memory Node placement, or of changing to which cpuset a task
-is attached, is subtle.
-
-If a cpuset has its Memory Nodes modified, then for each task attached
-to that cpuset, the next time that the kernel attempts to allocate
-a page of memory for that task, the kernel will notice the change
-in the task's cpuset, and update its per-task memory placement to
-remain within the new cpusets memory placement.  If the task was using
-mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
-its new cpuset, then the task will continue to use whatever subset
-of MPOL_BIND nodes are still allowed in the new cpuset.  If the task
-was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
-in the new cpuset, then the task will be essentially treated as if it
-was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
-as queried by get_mempolicy(), doesn't change).  If a task is moved
-from one cpuset to another, then the kernel will adjust the task's
-memory placement, as above, the next time that the kernel attempts
-to allocate a page of memory for that task.
-
-If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
-will have its allowed CPU placement changed immediately.  Similarly,
-if a task's pid is written to another cpuset's 'tasks' file, then its
-allowed CPU placement is changed immediately.  If such a task had been
-bound to some subset of its cpuset using the sched_setaffinity() call,
-the task will be allowed to run on any CPU allowed in its new cpuset,
-negating the effect of the prior sched_setaffinity() call.
-
-In summary, the memory placement of a task whose cpuset is changed is
-updated by the kernel, on the next allocation of a page for that task,
-and the processor placement is updated immediately.
-
-Normally, once a page is allocated (given a physical page
-of main memory) then that page stays on whatever node it
-was allocated, so long as it remains allocated, even if the
-cpusets memory placement policy 'cpuset.mems' subsequently changes.
-If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
-tasks are attached to that cpuset, any pages that task had
-allocated to it on nodes in its previous cpuset are migrated
-to the task's new cpuset. The relative placement of the page within
-the cpuset is preserved during these migration operations if possible.
-For example if the page was on the second valid node of the prior cpuset
-then the page will be placed on the second valid node of the new cpuset.
-
-Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
-'cpuset.mems' file is modified, pages allocated to tasks in that
-cpuset, that were on nodes in the previous setting of 'cpuset.mems',
-will be moved to nodes in the new setting of 'mems.'
-Pages that were not in the task's prior cpuset, or in the cpuset's
-prior 'cpuset.mems' setting, will not be moved.
-
-There is an exception to the above.  If hotplug functionality is used
-to remove all the CPUs that are currently assigned to a cpuset,
-then all the tasks in that cpuset will be moved to the nearest ancestor
-with non-empty cpus.  But the moving of some (or all) tasks might fail if
-cpuset is bound with another cgroup subsystem which has some restrictions
-on task attaching.  In this failing case, those tasks will stay
-in the original cpuset, and the kernel will automatically update
-their cpus_allowed to allow all online CPUs.  When memory hotplug
-functionality for removing Memory Nodes is available, a similar exception
-is expected to apply there as well.  In general, the kernel prefers to
-violate cpuset placement, over starving a task that has had all
-its allowed CPUs or Memory Nodes taken offline.
-
-There is a second exception to the above.  GFP_ATOMIC requests are
-kernel internal allocations that must be satisfied, immediately.
-The kernel may drop some request, in rare cases even panic, if a
-GFP_ATOMIC alloc fails.  If the request cannot be satisfied within
-the current task's cpuset, then we relax the cpuset, and look for
-memory anywhere we can find it.  It's better to violate the cpuset
-than stress the kernel.
-
-To start a new job that is to be contained within a cpuset, the steps are:
-
- 1) mkdir /sys/fs/cgroup/cpuset
- 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
-    the /sys/fs/cgroup/cpuset virtual file system.
- 4) Start a task that will be the "founding father" of the new job.
- 5) Attach that task to the new cpuset by writing its pid to the
-    /sys/fs/cgroup/cpuset tasks file for that cpuset.
- 6) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will setup a cpuset
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cpuset::
-
-  mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
-  cd /sys/fs/cgroup/cpuset
-  mkdir Charlie
-  cd Charlie
-  /bin/echo 2-3 > cpuset.cpus
-  /bin/echo 1 > cpuset.mems
-  /bin/echo $$ > tasks
-  sh
-  # The subshell 'sh' is now running in cpuset Charlie
-  # The next line should display '/Charlie'
-  cat /proc/self/cpuset
-
-There are ways to query or modify cpusets:
-
- - via the cpuset file system directly, using the various cd, mkdir, echo,
-   cat, rmdir commands from the shell, or their equivalent from C.
- - via the C library libcpuset.
- - via the C library libcgroup.
-   (http://sourceforge.net/projects/libcg/)
- - via the python application cset.
-   (http://code.google.com/p/cpuset/)
-
-The sched_setaffinity calls can also be done at the shell prompt using
-SGI's runon or Robert Love's taskset.  The mbind and set_mempolicy
-calls can be done at the shell prompt using the numactl command
-(part of Andi Kleen's numa package).
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using the cpusets can be done through the cpuset
-virtual filesystem.
-
-To mount it, type:
-# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
-
-Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
-tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
-is the cpuset that holds the whole system.
-
-If you want to create a new cpuset under /sys/fs/cgroup/cpuset::
-
-  # cd /sys/fs/cgroup/cpuset
-  # mkdir my_cpuset
-
-Now you want to do something with this cpuset::
-
-  # cd my_cpuset
-
-In this directory you can find several files::
-
-  # ls
-  cgroup.clone_children  cpuset.memory_pressure
-  cgroup.event_control   cpuset.memory_spread_page
-  cgroup.procs           cpuset.memory_spread_slab
-  cpuset.cpu_exclusive   cpuset.mems
-  cpuset.cpus            cpuset.sched_load_balance
-  cpuset.mem_exclusive   cpuset.sched_relax_domain_level
-  cpuset.mem_hardwall    notify_on_release
-  cpuset.memory_migrate  tasks
-
-Reading them will give you information about the state of this cpuset:
-the CPUs and Memory Nodes it can use, the processes that are using
-it, its properties.  By writing to these files you can manipulate
-the cpuset.
-
-Set some flags::
-
-  # /bin/echo 1 > cpuset.cpu_exclusive
-
-Add some cpus::
-
-  # /bin/echo 0-7 > cpuset.cpus
-
-Add some mems::
-
-  # /bin/echo 0-7 > cpuset.mems
-
-Now attach your shell to this cpuset::
-
-  # /bin/echo $$ > tasks
-
-You can also create cpusets inside your cpuset by using mkdir in this
-directory::
-
-  # mkdir my_sub_cs
-
-To remove a cpuset, just use rmdir::
-
-  # rmdir my_sub_cs
-
-This will fail if the cpuset is in use (has cpusets inside, or has
-processes attached).
-
-Note that for legacy reasons, the "cpuset" filesystem exists as a
-wrapper around the cgroup filesystem.
-
-The command::
-
-  mount -t cpuset X /sys/fs/cgroup/cpuset
-
-is equivalent to::
-
-  mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
-  echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
-
-2.2 Adding/removing cpus
-------------------------
-
-This is the syntax to use when writing in the cpus or mems files
-in cpuset directories::
-
-  # /bin/echo 1-4 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4
-  # /bin/echo 1,2,3,4 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4
-
-To add a CPU to a cpuset, write the new list of CPUs including the
-CPU to be added. To add 6 to the above cpuset::
-
-  # /bin/echo 1-4,6 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4,6
-
-Similarly to remove a CPU from a cpuset, write the new list of CPUs
-without the CPU to be removed.
-
-To remove all the CPUs::
-
-  # /bin/echo "" > cpuset.cpus		-> clear cpus list
-
-2.3 Setting flags
------------------
-
-The syntax is very simple::
-
-  # /bin/echo 1 > cpuset.cpu_exclusive 	-> set flag 'cpuset.cpu_exclusive'
-  # /bin/echo 0 > cpuset.cpu_exclusive 	-> unset flag 'cpuset.cpu_exclusive'
-
-2.4 Attaching processes
------------------------
-
-::
-
-  # /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another::
-
-  # /bin/echo PID1 > tasks
-  # /bin/echo PID2 > tasks
-	...
-  # /bin/echo PIDn > tasks
-
-
-3. Questions
-============
-
-Q:
-   what's up with this '/bin/echo' ?
-
-A:
-   bash's builtin 'echo' command does not check calls to write() against
-   errors. If you use it in the cpuset file system, you won't be
-   able to tell whether a command succeeded or failed.
-
-Q:
-   When I attach processes, only the first of the line gets really attached !
-
-A:
-   We can only return one error code per call to write(). So you should also
-   put only ONE pid.
-
-4. Contact
-==========
-
-Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/cgroup-v1/devices.rst b/Documentation/cgroup-v1/devices.rst
deleted file mode 100644
index e1886783961e..000000000000
--- a/Documentation/cgroup-v1/devices.rst
+++ /dev/null
@@ -1,132 +0,0 @@
-===========================
-Device Whitelist Controller
-===========================
-
-1. Description
-==============
-
-Implement a cgroup to track and enforce open and mknod restrictions
-on device files.  A device cgroup associates a device access
-whitelist with each cgroup.  A whitelist entry has 4 fields.
-'type' is a (all), c (char), or b (block).  'all' means it applies
-to all types and all major and minor numbers.  Major and minor are
-either an integer or * for all.  Access is a composition of r
-(read), w (write), and m (mknod).
-
-The root device cgroup starts with rwm to 'all'.  A child device
-cgroup gets a copy of the parent.  Administrators can then remove
-devices from the whitelist or add new entries.  A child cgroup can
-never receive a device access which is denied by its parent.
-
-2. User Interface
-=================
-
-An entry is added using devices.allow, and removed using
-devices.deny.  For instance::
-
-	echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
-
-allows cgroup 1 to read and mknod the device usually known as
-/dev/null.  Doing::
-
-	echo a > /sys/fs/cgroup/1/devices.deny
-
-will remove the default 'a *:* rwm' entry. Doing::
-
-	echo a > /sys/fs/cgroup/1/devices.allow
-
-will add the 'a *:* rwm' entry to the whitelist.
-
-3. Security
-===========
-
-Any task can move itself between cgroups.  This clearly won't
-suffice, but we can decide the best way to adequately restrict
-movement as people get some experience with this.  We may just want
-to require CAP_SYS_ADMIN, which at least is a separate bit from
-CAP_MKNOD.  We may want to just refuse moving to a cgroup which
-isn't a descendant of the current one.  Or we may want to use
-CAP_MAC_ADMIN, since we really are trying to lock down root.
-
-CAP_SYS_ADMIN is needed to modify the whitelist or move another
-task to a new cgroup.  (Again we'll probably want to change that).
-
-A cgroup may not be granted more permissions than the cgroup's
-parent has.
-
-4. Hierarchy
-============
-
-device cgroups maintain hierarchy by making sure a cgroup never has more
-access permissions than its parent.  Every time an entry is written to
-a cgroup's devices.deny file, all its children will have that entry removed
-from their whitelist and all the locally set whitelist entries will be
-re-evaluated.  In case one of the locally set whitelist entries would provide
-more access than the cgroup's parent, it'll be removed from the whitelist.
-
-Example::
-
-      A
-     / \
-        B
-
-    group        behavior	exceptions
-    A            allow		"b 8:* rwm", "c 116:1 rw"
-    B            deny		"c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
-
-If a device is denied in group A::
-
-	# echo "c 116:* r" > A/devices.deny
-
-it'll propagate down and after revalidating B's entries, the whitelist entry
-"c 116:2 rwm" will be removed::
-
-    group        whitelist entries                        denied devices
-    A            all                                      "b 8:* rwm", "c 116:* rw"
-    B            "c 1:3 rwm", "b 3:* rwm"                 all the rest
-
-In case parent's exceptions change and local exceptions are not allowed
-anymore, they'll be deleted.
-
-Notice that new whitelist entries will not be propagated::
-
-      A
-     / \
-        B
-
-    group        whitelist entries                        denied devices
-    A            "c 1:3 rwm", "c 1:5 r"                   all the rest
-    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
-
-when adding ``c *:3 rwm``::
-
-	# echo "c *:3 rwm" >A/devices.allow
-
-the result::
-
-    group        whitelist entries                        denied devices
-    A            "c *:3 rwm", "c 1:5 r"                   all the rest
-    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
-
-but now it'll be possible to add new entries to B::
-
-	# echo "c 2:3 rwm" >B/devices.allow
-	# echo "c 50:3 r" >B/devices.allow
-
-or even::
-
-	# echo "c *:3 rwm" >B/devices.allow
-
-Allowing or denying all by writing 'a' to devices.allow or devices.deny will
-not be possible once the device cgroups has children.
-
-4.1 Hierarchy (internal implementation)
----------------------------------------
-
-device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
-list of exceptions.  The internal state is controlled using the same user
-interface to preserve compatibility with the previous whitelist-only
-implementation.  Removal or addition of exceptions that will reduce the access
-to devices will be propagated down the hierarchy.
-For every propagated exception, the effective rules will be re-evaluated based
-on current parent's access rules.
diff --git a/Documentation/cgroup-v1/freezer-subsystem.rst b/Documentation/cgroup-v1/freezer-subsystem.rst
deleted file mode 100644
index 582d3427de3f..000000000000
--- a/Documentation/cgroup-v1/freezer-subsystem.rst
+++ /dev/null
@@ -1,127 +0,0 @@
-==============
-Cgroup Freezer
-==============
-
-The cgroup freezer is useful to batch job management system which start
-and stop sets of tasks in order to schedule the resources of a machine
-according to the desires of a system administrator. This sort of program
-is often used on HPC clusters to schedule access to the cluster as a
-whole. The cgroup freezer uses cgroups to describe the set of tasks to
-be started/stopped by the batch job management system. It also provides
-a means to start and stop the tasks composing the job.
-
-The cgroup freezer will also be useful for checkpointing running groups
-of tasks. The freezer allows the checkpoint code to obtain a consistent
-image of the tasks by attempting to force the tasks in a cgroup into a
-quiescent state. Once the tasks are quiescent another task can
-walk /proc or invoke a kernel interface to gather information about the
-quiesced tasks. Checkpointed tasks can be restarted later should a
-recoverable error occur. This also allows the checkpointed tasks to be
-migrated between nodes in a cluster by copying the gathered information
-to another node and restarting the tasks there.
-
-Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
-and resuming tasks in userspace. Both of these signals are observable
-from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
-blocked, or ignored it can be seen by waiting or ptracing parent tasks.
-SIGCONT is especially unsuitable since it can be caught by the task. Any
-programs designed to watch for SIGSTOP and SIGCONT could be broken by
-attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
-demonstrate this problem using nested bash shells::
-
-	$ echo $$
-	16644
-	$ bash
-	$ echo $$
-	16690
-
-	From a second, unrelated bash shell:
-	$ kill -SIGSTOP 16690
-	$ kill -SIGCONT 16690
-
-	<at this point 16690 exits and causes 16644 to exit too>
-
-This happens because bash can observe both signals and choose how it
-responds to them.
-
-Another example of a program which catches and responds to these
-signals is gdb. In fact any program designed to use ptrace is likely to
-have a problem with this method of stopping and resuming tasks.
-
-In contrast, the cgroup freezer uses the kernel freezer code to
-prevent the freeze/unfreeze cycle from becoming visible to the tasks
-being frozen. This allows the bash example above and gdb to run as
-expected.
-
-The cgroup freezer is hierarchical. Freezing a cgroup freezes all
-tasks belonging to the cgroup and all its descendant cgroups. Each
-cgroup has its own state (self-state) and the state inherited from the
-parent (parent-state). Iff both states are THAWED, the cgroup is
-THAWED.
-
-The following cgroupfs files are created by cgroup freezer.
-
-* freezer.state: Read-write.
-
-  When read, returns the effective state of the cgroup - "THAWED",
-  "FREEZING" or "FROZEN". This is the combined self and parent-states.
-  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
-
-  FREEZING cgroup transitions into FROZEN state when all tasks
-  belonging to the cgroup and its descendants become frozen. Note that
-  a cgroup reverts to FREEZING from FROZEN after a new task is added
-  to the cgroup or one of its descendant cgroups until the new task is
-  frozen.
-
-  When written, sets the self-state of the cgroup. Two values are
-  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
-  if not already freezing, enters FREEZING state along with all its
-  descendant cgroups.
-
-  If THAWED is written, the self-state of the cgroup is changed to
-  THAWED.  Note that the effective state may not change to THAWED if
-  the parent-state is still freezing. If a cgroup's effective state
-  becomes THAWED, all its descendants which are freezing because of
-  the cgroup also leave the freezing state.
-
-* freezer.self_freezing: Read only.
-
-  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
-  This value is 1 iff the last write to freezer.state was "FROZEN".
-
-* freezer.parent_freezing: Read only.
-
-  Shows the parent-state.  0 if none of the cgroup's ancestors is
-  frozen; otherwise, 1.
-
-The root cgroup is non-freezable and the above interface files don't
-exist.
-
-* Examples of usage::
-
-   # mkdir /sys/fs/cgroup/freezer
-   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
-   # mkdir /sys/fs/cgroup/freezer/0
-   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
-
-to get status of the freezer subsystem::
-
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-to freeze all tasks in the container::
-
-   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FREEZING
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FROZEN
-
-to unfreeze all tasks in the container::
-
-   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-This is the basic mechanism which should do the right thing for user space task
-in a simple scenario.
diff --git a/Documentation/cgroup-v1/hugetlb.rst b/Documentation/cgroup-v1/hugetlb.rst
deleted file mode 100644
index a3902aa253a9..000000000000
--- a/Documentation/cgroup-v1/hugetlb.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-==================
-HugeTLB Controller
-==================
-
-The HugeTLB controller allows to limit the HugeTLB usage per control group and
-enforces the controller limit during page fault. Since HugeTLB doesn't
-support page reclaim, enforcing the limit at page fault time implies that,
-the application will get SIGBUS signal if it tries to access HugeTLB pages
-beyond its limit. This requires the application to know beforehand how much
-HugeTLB pages it would require for its use.
-
-HugeTLB controller can be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -o hugetlb none /sys/fs/cgroup
-
-With the above step, the initial or the parent HugeTLB group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-
-New groups can be created under the parent group /sys/fs/cgroup::
-
-  # cd /sys/fs/cgroup
-  # mkdir g1
-  # echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it.
-
-Brief summary of control files::
-
- hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
- hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb  usage recorded
- hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
- hugetlb.<hugepagesize>.failcnt		   # show the number of allocation failure due to HugeTLB limit
-
-For a system supporting three hugepage sizes (64k, 32M and 1G), the control
-files include::
-
-  hugetlb.1GB.limit_in_bytes
-  hugetlb.1GB.max_usage_in_bytes
-  hugetlb.1GB.usage_in_bytes
-  hugetlb.1GB.failcnt
-  hugetlb.64KB.limit_in_bytes
-  hugetlb.64KB.max_usage_in_bytes
-  hugetlb.64KB.usage_in_bytes
-  hugetlb.64KB.failcnt
-  hugetlb.32MB.limit_in_bytes
-  hugetlb.32MB.max_usage_in_bytes
-  hugetlb.32MB.usage_in_bytes
-  hugetlb.32MB.failcnt
diff --git a/Documentation/cgroup-v1/index.rst b/Documentation/cgroup-v1/index.rst
deleted file mode 100644
index fe76d42edc11..000000000000
--- a/Documentation/cgroup-v1/index.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-:orphan:
-
-========================
-Control Groups version 1
-========================
-
-.. toctree::
-    :maxdepth: 1
-
-    cgroups
-
-    blkio-controller
-    cpuacct
-    cpusets
-    devices
-    freezer-subsystem
-    hugetlb
-    memcg_test
-    memory
-    net_cls
-    net_prio
-    pids
-    rdma
-
-.. only::  subproject and html
-
-   Indices
-   =======
-
-   * :ref:`genindex`
diff --git a/Documentation/cgroup-v1/memcg_test.rst b/Documentation/cgroup-v1/memcg_test.rst
deleted file mode 100644
index 91bd18c6a514..000000000000
--- a/Documentation/cgroup-v1/memcg_test.rst
+++ /dev/null
@@ -1,355 +0,0 @@
-=====================================================
-Memory Resource Controller(Memcg) Implementation Memo
-=====================================================
-
-Last Updated: 2010/2
-
-Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
-
-Because VM is getting complex (one of reasons is memcg...), memcg's behavior
-is complex. This is a document for memcg's internal behavior.
-Please note that implementation details can be changed.
-
-(*) Topics on API should be in Documentation/cgroup-v1/memory.rst)
-
-0. How to record usage ?
-========================
-
-   2 objects are used.
-
-   page_cgroup ....an object per page.
-
-	Allocated at boot or memory hotplug. Freed at memory hot removal.
-
-   swap_cgroup ... an entry per swp_entry.
-
-	Allocated at swapon(). Freed at swapoff().
-
-   The page_cgroup has USED bit and double count against a page_cgroup never
-   occurs. swap_cgroup is used only when a charged page is swapped-out.
-
-1. Charge
-=========
-
-   a page/swp_entry may be charged (usage += PAGE_SIZE) at
-
-	mem_cgroup_try_charge()
-
-2. Uncharge
-===========
-
-  a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
-
-	mem_cgroup_uncharge()
-	  Called when a page's refcount goes down to 0.
-
-	mem_cgroup_uncharge_swap()
-	  Called when swp_entry's refcnt goes down to 0. A charge against swap
-	  disappears.
-
-3. charge-commit-cancel
-=======================
-
-	Memcg pages are charged in two steps:
-
-		- mem_cgroup_try_charge()
-		- mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
-
-	At try_charge(), there are no flags to say "this page is charged".
-	at this point, usage += PAGE_SIZE.
-
-	At commit(), the page is associated with the memcg.
-
-	At cancel(), simply usage -= PAGE_SIZE.
-
-Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
-
-4. Anonymous
-============
-
-	Anonymous page is newly allocated at
-		  - page fault into MAP_ANONYMOUS mapping.
-		  - Copy-On-Write.
-
-	4.1 Swap-in.
-	At swap-in, the page is taken from swap-cache. There are 2 cases.
-
-	(a) If the SwapCache is newly allocated and read, it has no charges.
-	(b) If the SwapCache has been mapped by processes, it has been
-	    charged already.
-
-	4.2 Swap-out.
-	At swap-out, typical state transition is below.
-
-	(a) add to swap cache. (marked as SwapCache)
-	    swp_entry's refcnt += 1.
-	(b) fully unmapped.
-	    swp_entry's refcnt += # of ptes.
-	(c) write back to swap.
-	(d) delete from swap cache. (remove from SwapCache)
-	    swp_entry's refcnt -= 1.
-
-
-	Finally, at task exit,
-	(e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
-
-5. Page Cache
-=============
-
-	Page Cache is charged at
-	- add_to_page_cache_locked().
-
-	The logic is very clear. (About migration, see below)
-
-	Note:
-	  __remove_from_page_cache() is called by remove_from_page_cache()
-	  and __remove_mapping().
-
-6. Shmem(tmpfs) Page Cache
-===========================
-
-	The best way to understand shmem's page state transition is to read
-	mm/shmem.c.
-
-	But brief explanation of the behavior of memcg around shmem will be
-	helpful to understand the logic.
-
-	Shmem's page (just leaf page, not direct/indirect block) can be on
-
-		- radix-tree of shmem's inode.
-		- SwapCache.
-		- Both on radix-tree and SwapCache. This happens at swap-in
-		  and swap-out,
-
-	It's charged when...
-
-	- A new page is added to shmem's radix-tree.
-	- A swp page is read. (move a charge from swap_cgroup to page_cgroup)
-
-7. Page Migration
-=================
-
-	mem_cgroup_migrate()
-
-8. LRU
-======
-        Each memcg has its own private LRU. Now, its handling is under global
-	VM's control (means that it's handled under global pgdat->lru_lock).
-	Almost all routines around memcg's LRU is called by global LRU's
-	list management functions under pgdat->lru_lock.
-
-	A special function is mem_cgroup_isolate_pages(). This scans
-	memcg's private LRU and call __isolate_lru_page() to extract a page
-	from LRU.
-
-	(By __isolate_lru_page(), the page is removed from both of global and
-	private LRU.)
-
-
-9. Typical Tests.
-=================
-
- Tests for racy cases.
-
-9.1 Small limit to memcg.
--------------------------
-
-	When you do test to do racy case, it's good test to set memcg's limit
-	to be very small rather than GB. Many races found in the test under
-	xKB or xxMB limits.
-
-	(Memory behavior under GB and Memory behavior under MB shows very
-	different situation.)
-
-9.2 Shmem
----------
-
-	Historically, memcg's shmem handling was poor and we saw some amount
-	of troubles here. This is because shmem is page-cache but can be
-	SwapCache. Test with shmem/tmpfs is always good test.
-
-9.3 Migration
--------------
-
-	For NUMA, migration is an another special case. To do easy test, cpuset
-	is useful. Following is a sample script to do migration::
-
-		mount -t cgroup -o cpuset none /opt/cpuset
-
-		mkdir /opt/cpuset/01
-		echo 1 > /opt/cpuset/01/cpuset.cpus
-		echo 0 > /opt/cpuset/01/cpuset.mems
-		echo 1 > /opt/cpuset/01/cpuset.memory_migrate
-		mkdir /opt/cpuset/02
-		echo 1 > /opt/cpuset/02/cpuset.cpus
-		echo 1 > /opt/cpuset/02/cpuset.mems
-		echo 1 > /opt/cpuset/02/cpuset.memory_migrate
-
-	In above set, when you moves a task from 01 to 02, page migration to
-	node 0 to node 1 will occur. Following is a script to migrate all
-	under cpuset.::
-
-		--
-		move_task()
-		{
-		for pid in $1
-		do
-			/bin/echo $pid >$2/tasks 2>/dev/null
-			echo -n $pid
-			echo -n " "
-		done
-		echo END
-		}
-
-		G1_TASK=`cat ${G1}/tasks`
-		G2_TASK=`cat ${G2}/tasks`
-		move_task "${G1_TASK}" ${G2} &
-		--
-
-9.4 Memory hotplug
-------------------
-
-	memory hotplug test is one of good test.
-
-	to offline memory, do following::
-
-		# echo offline > /sys/devices/system/memory/memoryXXX/state
-
-	(XXX is the place of memory)
-
-	This is an easy way to test page migration, too.
-
-9.5 mkdir/rmdir
----------------
-
-	When using hierarchy, mkdir/rmdir test should be done.
-	Use tests like the following::
-
-		echo 1 >/opt/cgroup/01/memory/use_hierarchy
-		mkdir /opt/cgroup/01/child_a
-		mkdir /opt/cgroup/01/child_b
-
-		set limit to 01.
-		add limit to 01/child_b
-		run jobs under child_a and child_b
-
-	create/delete following groups at random while jobs are running::
-
-		/opt/cgroup/01/child_a/child_aa
-		/opt/cgroup/01/child_b/child_bb
-		/opt/cgroup/01/child_c
-
-	running new jobs in new group is also good.
-
-9.6 Mount with other subsystems
--------------------------------
-
-	Mounting with other subsystems is a good test because there is a
-	race and lock dependency with other cgroup subsystems.
-
-	example::
-
-		# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
-
-	and do task move, mkdir, rmdir etc...under this.
-
-9.7 swapoff
------------
-
-	Besides management of swap is one of complicated parts of memcg,
-	call path of swap-in at swapoff is not same as usual swap-in path..
-	It's worth to be tested explicitly.
-
-	For example, test like following is good:
-
-	(Shell-A)::
-
-		# mount -t cgroup none /cgroup -o memory
-		# mkdir /cgroup/test
-		# echo 40M > /cgroup/test/memory.limit_in_bytes
-		# echo 0 > /cgroup/test/tasks
-
-	Run malloc(100M) program under this. You'll see 60M of swaps.
-
-	(Shell-B)::
-
-		# move all tasks in /cgroup/test to /cgroup
-		# /sbin/swapoff -a
-		# rmdir /cgroup/test
-		# kill malloc task.
-
-	Of course, tmpfs v.s. swapoff test should be tested, too.
-
-9.8 OOM-Killer
---------------
-
-	Out-of-memory caused by memcg's limit will kill tasks under
-	the memcg. When hierarchy is used, a task under hierarchy
-	will be killed by the kernel.
-
-	In this case, panic_on_oom shouldn't be invoked and tasks
-	in other groups shouldn't be killed.
-
-	It's not difficult to cause OOM under memcg as following.
-
-	Case A) when you can swapoff::
-
-		#swapoff -a
-		#echo 50M > /memory.limit_in_bytes
-
-	run 51M of malloc
-
-	Case B) when you use mem+swap limitation::
-
-		#echo 50M > memory.limit_in_bytes
-		#echo 50M > memory.memsw.limit_in_bytes
-
-	run 51M of malloc
-
-9.9 Move charges at task migration
-----------------------------------
-
-	Charges associated with a task can be moved along with task migration.
-
-	(Shell-A)::
-
-		#mkdir /cgroup/A
-		#echo $$ >/cgroup/A/tasks
-
-	run some programs which uses some amount of memory in /cgroup/A.
-
-	(Shell-B)::
-
-		#mkdir /cgroup/B
-		#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
-		#echo "pid of the program running in group A" >/cgroup/B/tasks
-
-	You can see charges have been moved by reading ``*.usage_in_bytes`` or
-	memory.stat of both A and B.
-
-	See 8.2 of Documentation/cgroup-v1/memory.rst to see what value should
-	be written to move_charge_at_immigrate.
-
-9.10 Memory thresholds
-----------------------
-
-	Memory controller implements memory thresholds using cgroups notification
-	API. You can use tools/cgroup/cgroup_event_listener.c to test it.
-
-	(Shell-A) Create cgroup and run event listener::
-
-		# mkdir /cgroup/A
-		# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
-
-	(Shell-B) Add task to cgroup and try to allocate and free memory::
-
-		# echo $$ >/cgroup/A/tasks
-		# a="$(dd if=/dev/zero bs=1M count=10)"
-		# a=
-
-	You will see message from cgroup_event_listener every time you cross
-	the thresholds.
-
-	Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
-
-	It's good idea to test root cgroup as well.
diff --git a/Documentation/cgroup-v1/memory.rst b/Documentation/cgroup-v1/memory.rst
deleted file mode 100644
index 41bdc038dad9..000000000000
--- a/Documentation/cgroup-v1/memory.rst
+++ /dev/null
@@ -1,1003 +0,0 @@
-==========================
-Memory Resource Controller
-==========================
-
-NOTE:
-      This document is hopelessly outdated and it asks for a complete
-      rewrite. It still contains a useful information so we are keeping it
-      here but make sure to check the current code if you need a deeper
-      understanding.
-
-NOTE:
-      The Memory Resource Controller has generically been referred to as the
-      memory controller in this document. Do not confuse memory controller
-      used here with the memory controller that is used in hardware.
-
-(For editors) In this document:
-      When we mention a cgroup (cgroupfs's directory) with memory controller,
-      we call it "memory cgroup". When you see git-log and source code, you'll
-      see patch's title and function names tend to use "memcg".
-      In this document, we avoid using it.
-
-Benefits and Purpose of the memory controller
-=============================================
-
-The memory controller isolates the memory behaviour of a group of tasks
-from the rest of the system. The article on LWN [12] mentions some probable
-uses of the memory controller. The memory controller can be used to
-
-a. Isolate an application or a group of applications
-   Memory-hungry applications can be isolated and limited to a smaller
-   amount of memory.
-b. Create a cgroup with a limited amount of memory; this can be used
-   as a good alternative to booting with mem=XXXX.
-c. Virtualization solutions can control the amount of memory they want
-   to assign to a virtual machine instance.
-d. A CD/DVD burner could control the amount of memory used by the
-   rest of the system to ensure that burning does not fail due to lack
-   of available memory.
-e. There are several other use cases; find one or use the controller just
-   for fun (to learn and hack on the VM subsystem).
-
-Current Status: linux-2.6.34-mmotm(development version of 2010/April)
-
-Features:
-
- - accounting anonymous pages, file caches, swap caches usage and limiting them.
- - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
- - optionally, memory+swap usage can be accounted and limited.
- - hierarchical accounting
- - soft limit
- - moving (recharging) account at moving a task is selectable.
- - usage threshold notifier
- - memory pressure notifier
- - oom-killer disable knob and oom-notifier
- - Root cgroup has no limit controls.
-
- Kernel memory support is a work in progress, and the current version provides
- basically functionality. (See Section 2.7)
-
-Brief summary of control files.
-
-==================================== ==========================================
- tasks				     attach a task(thread) and show list of
-				     threads
- cgroup.procs			     show list of processes
- cgroup.event_control		     an interface for event_fd()
- memory.usage_in_bytes		     show current usage for memory
-				     (See 5.5 for details)
- memory.memsw.usage_in_bytes	     show current usage for memory+Swap
-				     (See 5.5 for details)
- memory.limit_in_bytes		     set/show limit of memory usage
- memory.memsw.limit_in_bytes	     set/show limit of memory+Swap usage
- memory.failcnt			     show the number of memory usage hits limits
- memory.memsw.failcnt		     show the number of memory+Swap hits limits
- memory.max_usage_in_bytes	     show max memory usage recorded
- memory.memsw.max_usage_in_bytes     show max memory+Swap usage recorded
- memory.soft_limit_in_bytes	     set/show soft limit of memory usage
- memory.stat			     show various statistics
- memory.use_hierarchy		     set/show hierarchical account enabled
- memory.force_empty		     trigger forced page reclaim
- memory.pressure_level		     set memory pressure notifications
- memory.swappiness		     set/show swappiness parameter of vmscan
-				     (See sysctl's vm.swappiness)
- memory.move_charge_at_immigrate     set/show controls of moving charges
- memory.oom_control		     set/show oom controls.
- memory.numa_stat		     show the number of memory usage per numa
-				     node
-
- memory.kmem.limit_in_bytes          set/show hard limit for kernel memory
- memory.kmem.usage_in_bytes          show current kernel memory allocation
- memory.kmem.failcnt                 show the number of kernel memory usage
-				     hits limits
- memory.kmem.max_usage_in_bytes      show max kernel memory usage recorded
-
- memory.kmem.tcp.limit_in_bytes      set/show hard limit for tcp buf memory
- memory.kmem.tcp.usage_in_bytes      show current tcp buf memory allocation
- memory.kmem.tcp.failcnt             show the number of tcp buf memory usage
-				     hits limits
- memory.kmem.tcp.max_usage_in_bytes  show max tcp buf memory usage recorded
-==================================== ==========================================
-
-1. History
-==========
-
-The memory controller has a long history. A request for comments for the memory
-controller was posted by Balbir Singh [1]. At the time the RFC was posted
-there were several implementations for memory control. The goal of the
-RFC was to build consensus and agreement for the minimal features required
-for memory control. The first RSS controller was posted by Balbir Singh[2]
-in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
-RSS controller. At OLS, at the resource management BoF, everyone suggested
-that we handle both page cache and RSS together. Another request was raised
-to allow user space handling of OOM. The current memory controller is
-at version 6; it combines both mapped (RSS) and unmapped Page
-Cache Control [11].
-
-2. Memory Control
-=================
-
-Memory is a unique resource in the sense that it is present in a limited
-amount. If a task requires a lot of CPU processing, the task can spread
-its processing over a period of hours, days, months or years, but with
-memory, the same physical memory needs to be reused to accomplish the task.
-
-The memory controller implementation has been divided into phases. These
-are:
-
-1. Memory controller
-2. mlock(2) controller
-3. Kernel user memory accounting and slab control
-4. user mappings length controller
-
-The memory controller is the first controller developed.
-
-2.1. Design
------------
-
-The core of the design is a counter called the page_counter. The
-page_counter tracks the current memory usage and limit of the group of
-processes associated with the controller. Each cgroup has a memory controller
-specific data structure (mem_cgroup) associated with it.
-
-2.2. Accounting
----------------
-
-::
-
-		+--------------------+
-		|  mem_cgroup        |
-		|  (page_counter)    |
-		+--------------------+
-		 /            ^      \
-		/             |       \
-           +---------------+  |        +---------------+
-           | mm_struct     |  |....    | mm_struct     |
-           |               |  |        |               |
-           +---------------+  |        +---------------+
-                              |
-                              + --------------+
-                                              |
-           +---------------+           +------+--------+
-           | page          +---------->  page_cgroup|
-           |               |           |               |
-           +---------------+           +---------------+
-
-             (Figure 1: Hierarchy of Accounting)
-
-
-Figure 1 shows the important aspects of the controller
-
-1. Accounting happens per cgroup
-2. Each mm_struct knows about which cgroup it belongs to
-3. Each page has a pointer to the page_cgroup, which in turn knows the
-   cgroup it belongs to
-
-The accounting is done as follows: mem_cgroup_charge_common() is invoked to
-set up the necessary data structures and check if the cgroup that is being
-charged is over its limit. If it is, then reclaim is invoked on the cgroup.
-More details can be found in the reclaim section of this document.
-If everything goes well, a page meta-data-structure called page_cgroup is
-updated. page_cgroup has its own LRU on cgroup.
-(*) page_cgroup structure is allocated at boot/memory-hotplug time.
-
-2.2.1 Accounting details
-------------------------
-
-All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
-Some pages which are never reclaimable and will not be on the LRU
-are not accounted. We just account pages under usual VM management.
-
-RSS pages are accounted at page_fault unless they've already been accounted
-for earlier. A file page will be accounted for as Page Cache when it's
-inserted into inode (radix-tree). While it's mapped into the page tables of
-processes, duplicate accounting is carefully avoided.
-
-An RSS page is unaccounted when it's fully unmapped. A PageCache page is
-unaccounted when it's removed from radix-tree. Even if RSS pages are fully
-unmapped (by kswapd), they may exist as SwapCache in the system until they
-are really freed. Such SwapCaches are also accounted.
-A swapped-in page is not accounted until it's mapped.
-
-Note: The kernel does swapin-readahead and reads multiple swaps at once.
-This means swapped-in pages may contain pages for other tasks than a task
-causing page fault. So, we avoid accounting at swap-in I/O.
-
-At page migration, accounting information is kept.
-
-Note: we just account pages-on-LRU because our purpose is to control amount
-of used pages; not-on-LRU pages tend to be out-of-control from VM view.
-
-2.3 Shared Page Accounting
---------------------------
-
-Shared pages are accounted on the basis of the first touch approach. The
-cgroup that first touches a page is accounted for the page. The principle
-behind this approach is that a cgroup that aggressively uses a shared
-page will eventually get charged for it (once it is uncharged from
-the cgroup that brought it in -- this will happen on memory pressure).
-
-But see section 8.2: when moving a task to another cgroup, its pages may
-be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
-
-Exception: If CONFIG_MEMCG_SWAP is not used.
-When you do swapoff and make swapped-out pages of shmem(tmpfs) to
-be backed into memory in force, charges for pages are accounted against the
-caller of swapoff rather than the users of shmem.
-
-2.4 Swap Extension (CONFIG_MEMCG_SWAP)
---------------------------------------
-
-Swap Extension allows you to record charge for swap. A swapped-in page is
-charged back to original page allocator if possible.
-
-When swap is accounted, following files are added.
-
- - memory.memsw.usage_in_bytes.
- - memory.memsw.limit_in_bytes.
-
-memsw means memory+swap. Usage of memory+swap is limited by
-memsw.limit_in_bytes.
-
-Example: Assume a system with 4G of swap. A task which allocates 6G of memory
-(by mistake) under 2G memory limitation will use all swap.
-In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
-By using the memsw limit, you can avoid system OOM which can be caused by swap
-shortage.
-
-**why 'memory+swap' rather than swap**
-
-The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
-to move account from memory to swap...there is no change in usage of
-memory+swap. In other words, when we want to limit the usage of swap without
-affecting global LRU, memory+swap limit is better than just limiting swap from
-an OS point of view.
-
-**What happens when a cgroup hits memory.memsw.limit_in_bytes**
-
-When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
-in this cgroup. Then, swap-out will not be done by cgroup routine and file
-caches are dropped. But as mentioned above, global LRU can do swapout memory
-from it for sanity of the system's memory management state. You can't forbid
-it by cgroup.
-
-2.5 Reclaim
------------
-
-Each cgroup maintains a per cgroup LRU which has the same structure as
-global VM. When a cgroup goes over its limit, we first try
-to reclaim memory from the cgroup so as to make space for the new
-pages that the cgroup has touched. If the reclaim is unsuccessful,
-an OOM routine is invoked to select and kill the bulkiest task in the
-cgroup. (See 10. OOM Control below.)
-
-The reclaim algorithm has not been modified for cgroups, except that
-pages that are selected for reclaiming come from the per-cgroup LRU
-list.
-
-NOTE:
-  Reclaim does not work for the root cgroup, since we cannot set any
-  limits on the root cgroup.
-
-Note2:
-  When panic_on_oom is set to "2", the whole system will panic.
-
-When oom event notifier is registered, event will be delivered.
-(See oom_control section)
-
-2.6 Locking
------------
-
-   lock_page_cgroup()/unlock_page_cgroup() should not be called under
-   the i_pages lock.
-
-   Other lock order is following:
-
-   PG_locked.
-     mm->page_table_lock
-         pgdat->lru_lock
-	   lock_page_cgroup.
-
-  In many cases, just lock_page_cgroup() is called.
-
-  per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
-  pgdat->lru_lock, it has no lock of its own.
-
-2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
------------------------------------------------
-
-With the Kernel memory extension, the Memory Controller is able to limit
-the amount of kernel memory used by the system. Kernel memory is fundamentally
-different than user memory, since it can't be swapped out, which makes it
-possible to DoS the system by consuming too much of this precious resource.
-
-Kernel memory accounting is enabled for all memory cgroups by default. But
-it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel
-at boot time. In this case, kernel memory will not be accounted at all.
-
-Kernel memory limits are not imposed for the root cgroup. Usage for the root
-cgroup may or may not be accounted. The memory used is accumulated into
-memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
-(currently only for tcp).
-
-The main "kmem" counter is fed into the main counter, so kmem charges will
-also be visible from the user counter.
-
-Currently no soft limit is implemented for kernel memory. It is future work
-to trigger slab reclaim when those limits are reached.
-
-2.7.1 Current Kernel Memory resources accounted
------------------------------------------------
-
-stack pages:
-  every process consumes some stack pages. By accounting into
-  kernel memory, we prevent new processes from being created when the kernel
-  memory usage is too high.
-
-slab pages:
-  pages allocated by the SLAB or SLUB allocator are tracked. A copy
-  of each kmem_cache is created every time the cache is touched by the first time
-  from inside the memcg. The creation is done lazily, so some objects can still be
-  skipped while the cache is being created. All objects in a slab page should
-  belong to the same memcg. This only fails to hold when a task is migrated to a
-  different memcg during the page allocation by the cache.
-
-sockets memory pressure:
-  some sockets protocols have memory pressure
-  thresholds. The Memory Controller allows them to be controlled individually
-  per cgroup, instead of globally.
-
-tcp memory pressure:
-  sockets memory pressure for the tcp protocol.
-
-2.7.2 Common use cases
-----------------------
-
-Because the "kmem" counter is fed to the main user counter, kernel memory can
-never be limited completely independently of user memory. Say "U" is the user
-limit, and "K" the kernel limit. There are three possible ways limits can be
-set:
-
-U != 0, K = unlimited:
-    This is the standard memcg limitation mechanism already present before kmem
-    accounting. Kernel memory is completely ignored.
-
-U != 0, K < U:
-    Kernel memory is a subset of the user memory. This setup is useful in
-    deployments where the total amount of memory per-cgroup is overcommited.
-    Overcommiting kernel memory limits is definitely not recommended, since the
-    box can still run out of non-reclaimable memory.
-    In this case, the admin could set up K so that the sum of all groups is
-    never greater than the total memory, and freely set U at the cost of his
-    QoS.
-
-WARNING:
-    In the current implementation, memory reclaim will NOT be
-    triggered for a cgroup when it hits K while staying below U, which makes
-    this setup impractical.
-
-U != 0, K >= U:
-    Since kmem charges will also be fed to the user counter and reclaim will be
-    triggered for the cgroup for both kinds of memory. This setup gives the
-    admin a unified view of memory, and it is also useful for people who just
-    want to track kernel memory usage.
-
-3. User Interface
-=================
-
-3.0. Configuration
-------------------
-
-a. Enable CONFIG_CGROUPS
-b. Enable CONFIG_MEMCG
-c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
-d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
-
-3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
--------------------------------------------------------------------
-
-::
-
-	# mount -t tmpfs none /sys/fs/cgroup
-	# mkdir /sys/fs/cgroup/memory
-	# mount -t cgroup none /sys/fs/cgroup/memory -o memory
-
-3.2. Make the new group and move bash into it::
-
-	# mkdir /sys/fs/cgroup/memory/0
-	# echo $$ > /sys/fs/cgroup/memory/0/tasks
-
-Since now we're in the 0 cgroup, we can alter the memory limit::
-
-	# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-
-NOTE:
-  We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
-  mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
-  Gibibytes.)
-
-NOTE:
-  We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``.
-
-NOTE:
-  We cannot set limits on the root cgroup any more.
-
-::
-
-  # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-  4194304
-
-We can check the usage::
-
-  # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
-  1216512
-
-A successful write to this file does not guarantee a successful setting of
-this limit to the value written into the file. This can be due to a
-number of factors, such as rounding up to page boundaries or the total
-availability of memory on the system. The user is required to re-read
-this file after a write to guarantee the value committed by the kernel::
-
-  # echo 1 > memory.limit_in_bytes
-  # cat memory.limit_in_bytes
-  4096
-
-The memory.failcnt field gives the number of times that the cgroup limit was
-exceeded.
-
-The memory.stat file gives accounting information. Now, the number of
-caches, RSS and Active pages/Inactive pages are shown.
-
-4. Testing
-==========
-
-For testing features and implementation, see memcg_test.txt.
-
-Performance test is also important. To see pure memory controller's overhead,
-testing on tmpfs will give you good numbers of small overheads.
-Example: do kernel make on tmpfs.
-
-Page-fault scalability is also important. At measuring parallel
-page fault test, multi-process test may be better than multi-thread
-test because it has noise of shared objects/status.
-
-But the above two are testing extreme situations.
-Trying usual test under memory controller is always helpful.
-
-4.1 Troubleshooting
--------------------
-
-Sometimes a user might find that the application under a cgroup is
-terminated by the OOM killer. There are several causes for this:
-
-1. The cgroup limit is too low (just too low to do anything useful)
-2. The user is using anonymous memory and swap is turned off or too low
-
-A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
-some of the pages cached in the cgroup (page cache pages).
-
-To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
-seeing what happens will be helpful.
-
-4.2 Task migration
-------------------
-
-When a task migrates from one cgroup to another, its charge is not
-carried forward by default. The pages allocated from the original cgroup still
-remain charged to it, the charge is dropped when the page is freed or
-reclaimed.
-
-You can move charges of a task along with task migration.
-See 8. "Move charges at task migration"
-
-4.3 Removing a cgroup
----------------------
-
-A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
-cgroup might have some charge associated with it, even though all
-tasks have migrated away from it. (because we charge against pages, not
-against tasks.)
-
-We move the stats to root (if use_hierarchy==0) or parent (if
-use_hierarchy==1), and no change on the charge except uncharging
-from the child.
-
-Charges recorded in swap information is not updated at removal of cgroup.
-Recorded information is discarded and a cgroup which uses swap (swapcache)
-will be charged as a new owner of it.
-
-About use_hierarchy, see Section 6.
-
-5. Misc. interfaces
-===================
-
-5.1 force_empty
----------------
-  memory.force_empty interface is provided to make cgroup's memory usage empty.
-  When writing anything to this::
-
-    # echo 0 > memory.force_empty
-
-  the cgroup will be reclaimed and as many pages reclaimed as possible.
-
-  The typical use case for this interface is before calling rmdir().
-  Though rmdir() offlines memcg, but the memcg may still stay there due to
-  charged file caches. Some out-of-use page caches may keep charged until
-  memory pressure happens. If you want to avoid that, force_empty will be useful.
-
-  Also, note that when memory.kmem.limit_in_bytes is set the charges due to
-  kernel pages will still be seen. This is not considered a failure and the
-  write will still return success. In this case, it is expected that
-  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
-
-  About use_hierarchy, see Section 6.
-
-5.2 stat file
--------------
-
-memory.stat file includes following statistics
-
-per-memory cgroup local status
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-=============== ===============================================================
-cache		# of bytes of page cache memory.
-rss		# of bytes of anonymous and swap cache memory (includes
-		transparent hugepages).
-rss_huge	# of bytes of anonymous transparent hugepages.
-mapped_file	# of bytes of mapped file (includes tmpfs/shmem)
-pgpgin		# of charging events to the memory cgroup. The charging
-		event happens each time a page is accounted as either mapped
-		anon page(RSS) or cache page(Page Cache) to the cgroup.
-pgpgout		# of uncharging events to the memory cgroup. The uncharging
-		event happens each time a page is unaccounted from the cgroup.
-swap		# of bytes of swap usage
-dirty		# of bytes that are waiting to get written back to the disk.
-writeback	# of bytes of file/anon cache that are queued for syncing to
-		disk.
-inactive_anon	# of bytes of anonymous and swap cache memory on inactive
-		LRU list.
-active_anon	# of bytes of anonymous and swap cache memory on active
-		LRU list.
-inactive_file	# of bytes of file-backed memory on inactive LRU list.
-active_file	# of bytes of file-backed memory on active LRU list.
-unevictable	# of bytes of memory that cannot be reclaimed (mlocked etc).
-=============== ===============================================================
-
-status considering hierarchy (see memory.use_hierarchy settings)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-========================= ===================================================
-hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy
-			  under which the memory cgroup is
-hierarchical_memsw_limit  # of bytes of memory+swap limit with regard to
-			  hierarchy under which memory cgroup is.
-
-total_<counter>		  # hierarchical version of <counter>, which in
-			  addition to the cgroup's own value includes the
-			  sum of all hierarchical children's values of
-			  <counter>, i.e. total_cache
-========================= ===================================================
-
-The following additional stats are dependent on CONFIG_DEBUG_VM
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-========================= ========================================
-recent_rotated_anon	  VM internal parameter. (see mm/vmscan.c)
-recent_rotated_file	  VM internal parameter. (see mm/vmscan.c)
-recent_scanned_anon	  VM internal parameter. (see mm/vmscan.c)
-recent_scanned_file	  VM internal parameter. (see mm/vmscan.c)
-========================= ========================================
-
-Memo:
-	recent_rotated means recent frequency of LRU rotation.
-	recent_scanned means recent # of scans to LRU.
-	showing for better debug please see the code for meanings.
-
-Note:
-	Only anonymous and swap cache memory is listed as part of 'rss' stat.
-	This should not be confused with the true 'resident set size' or the
-	amount of physical memory used by the cgroup.
-
-	'rss + mapped_file" will give you resident set size of cgroup.
-
-	(Note: file and shmem may be shared among other cgroups. In that case,
-	mapped_file is accounted only when the memory cgroup is owner of page
-	cache.)
-
-5.3 swappiness
---------------
-
-Overrides /proc/sys/vm/swappiness for the particular group. The tunable
-in the root cgroup corresponds to the global swappiness setting.
-
-Please note that unlike during the global reclaim, limit reclaim
-enforces that 0 swappiness really prevents from any swapping even if
-there is a swap storage available. This might lead to memcg OOM killer
-if there are no file pages to reclaim.
-
-5.4 failcnt
------------
-
-A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
-This failcnt(== failure count) shows the number of times that a usage counter
-hit its limit. When a memory cgroup hits a limit, failcnt increases and
-memory under it will be reclaimed.
-
-You can reset failcnt by writing 0 to failcnt file::
-
-	# echo 0 > .../memory.failcnt
-
-5.5 usage_in_bytes
-------------------
-
-For efficiency, as other kernel components, memory cgroup uses some optimization
-to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
-method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz
-value for efficient access. (Of course, when necessary, it's synchronized.)
-If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
-value in memory.stat(see 5.2).
-
-5.6 numa_stat
--------------
-
-This is similar to numa_maps but operates on a per-memcg basis.  This is
-useful for providing visibility into the numa locality information within
-an memcg since the pages are allowed to be allocated from any physical
-node.  One of the use cases is evaluating application performance by
-combining this information with the application's CPU allocation.
-
-Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
-per-node page counts including "hierarchical_<counter>" which sums up all
-hierarchical children's values in addition to the memcg's own value.
-
-The output format of memory.numa_stat is::
-
-  total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
-  file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
-  anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
-  unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
-  hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
-
-The "total" count is sum of file + anon + unevictable.
-
-6. Hierarchy support
-====================
-
-The memory controller supports a deep hierarchy and hierarchical accounting.
-The hierarchy is created by creating the appropriate cgroups in the
-cgroup filesystem. Consider for example, the following cgroup filesystem
-hierarchy::
-
-	       root
-	     /  |   \
-            /	|    \
-	   a	b     c
-		      | \
-		      |  \
-		      d   e
-
-In the diagram above, with hierarchical accounting enabled, all memory
-usage of e, is accounted to its ancestors up until the root (i.e, c and root),
-that has memory.use_hierarchy enabled. If one of the ancestors goes over its
-limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
-children of the ancestor.
-
-6.1 Enabling hierarchical accounting and reclaim
-------------------------------------------------
-
-A memory cgroup by default disables the hierarchy feature. Support
-can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup::
-
-	# echo 1 > memory.use_hierarchy
-
-The feature can be disabled by::
-
-	# echo 0 > memory.use_hierarchy
-
-NOTE1:
-       Enabling/disabling will fail if either the cgroup already has other
-       cgroups created below it, or if the parent cgroup has use_hierarchy
-       enabled.
-
-NOTE2:
-       When panic_on_oom is set to "2", the whole system will panic in
-       case of an OOM event in any cgroup.
-
-7. Soft limits
-==============
-
-Soft limits allow for greater sharing of memory. The idea behind soft limits
-is to allow control groups to use as much of the memory as needed, provided
-
-a. There is no memory contention
-b. They do not exceed their hard limit
-
-When the system detects memory contention or low memory, control groups
-are pushed back to their soft limits. If the soft limit of each control
-group is very high, they are pushed back as much as possible to make
-sure that one control group does not starve the others of memory.
-
-Please note that soft limits is a best-effort feature; it comes with
-no guarantees, but it does its best to make sure that when memory is
-heavily contended for, memory is allocated based on the soft limit
-hints/setup. Currently soft limit based reclaim is set up such that
-it gets invoked from balance_pgdat (kswapd).
-
-7.1 Interface
--------------
-
-Soft limits can be setup by using the following commands (in this example we
-assume a soft limit of 256 MiB)::
-
-	# echo 256M > memory.soft_limit_in_bytes
-
-If we want to change this to 1G, we can at any time use::
-
-	# echo 1G > memory.soft_limit_in_bytes
-
-NOTE1:
-       Soft limits take effect over a long period of time, since they involve
-       reclaiming memory for balancing between memory cgroups
-NOTE2:
-       It is recommended to set the soft limit always below the hard limit,
-       otherwise the hard limit will take precedence.
-
-8. Move charges at task migration
-=================================
-
-Users can move charges associated with a task along with task migration, that
-is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
-This feature is not supported in !CONFIG_MMU environments because of lack of
-page tables.
-
-8.1 Interface
--------------
-
-This feature is disabled by default. It can be enabled (and disabled again) by
-writing to memory.move_charge_at_immigrate of the destination cgroup.
-
-If you want to enable it::
-
-	# echo (some positive value) > memory.move_charge_at_immigrate
-
-Note:
-      Each bits of move_charge_at_immigrate has its own meaning about what type
-      of charges should be moved. See 8.2 for details.
-Note:
-      Charges are moved only when you move mm->owner, in other words,
-      a leader of a thread group.
-Note:
-      If we cannot find enough space for the task in the destination cgroup, we
-      try to make space by reclaiming memory. Task migration may fail if we
-      cannot make enough space.
-Note:
-      It can take several seconds if you move charges much.
-
-And if you want disable it again::
-
-	# echo 0 > memory.move_charge_at_immigrate
-
-8.2 Type of charges which can be moved
---------------------------------------
-
-Each bit in move_charge_at_immigrate has its own meaning about what type of
-charges should be moved. But in any case, it must be noted that an account of
-a page or a swap can be moved only when it is charged to the task's current
-(old) memory cgroup.
-
-+---+--------------------------------------------------------------------------+
-|bit| what type of charges would be moved ?                                    |
-+===+==========================================================================+
-| 0 | A charge of an anonymous page (or swap of it) used by the target task.   |
-|   | You must enable Swap Extension (see 2.4) to enable move of swap charges. |
-+---+--------------------------------------------------------------------------+
-| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) |
-|   | and swaps of tmpfs file) mmapped by the target task. Unlike the case of  |
-|   | anonymous pages, file pages (and swaps) in the range mmapped by the task |
-|   | will be moved even if the task hasn't done page fault, i.e. they might   |
-|   | not be the task's "RSS", but other task's "RSS" that maps the same file. |
-|   | And mapcount of the page is ignored (the page can be moved even if       |
-|   | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to    |
-|   | enable move of swap charges.                                             |
-+---+--------------------------------------------------------------------------+
-
-8.3 TODO
---------
-
-- All of moving charge operations are done under cgroup_mutex. It's not good
-  behavior to hold the mutex too long, so we may need some trick.
-
-9. Memory thresholds
-====================
-
-Memory cgroup implements memory thresholds using the cgroups notification
-API (see cgroups.txt). It allows to register multiple memory and memsw
-thresholds and gets notifications when it crosses.
-
-To register a threshold, an application must:
-
-- create an eventfd using eventfd(2);
-- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
-- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
-  cgroup.event_control.
-
-Application will be notified through eventfd when memory usage crosses
-threshold in any direction.
-
-It's applicable for root and non-root cgroup.
-
-10. OOM Control
-===============
-
-memory.oom_control file is for OOM notification and other controls.
-
-Memory cgroup implements OOM notifier using the cgroup notification
-API (See cgroups.txt). It allows to register multiple OOM notification
-delivery and gets notification when OOM happens.
-
-To register a notifier, an application must:
-
- - create an eventfd using eventfd(2)
- - open memory.oom_control file
- - write string like "<event_fd> <fd of memory.oom_control>" to
-   cgroup.event_control
-
-The application will be notified through eventfd when OOM happens.
-OOM notification doesn't work for the root cgroup.
-
-You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
-
-	#echo 1 > memory.oom_control
-
-If OOM-killer is disabled, tasks under cgroup will hang/sleep
-in memory cgroup's OOM-waitqueue when they request accountable memory.
-
-For running them, you have to relax the memory cgroup's OOM status by
-
-	* enlarge limit or reduce usage.
-
-To reduce usage,
-
-	* kill some tasks.
-	* move some tasks to other group with account migration.
-	* remove some files (on tmpfs?)
-
-Then, stopped tasks will work again.
-
-At reading, current status of OOM is shown.
-
-	- oom_kill_disable 0 or 1
-	  (if 1, oom-killer is disabled)
-	- under_oom	   0 or 1
-	  (if 1, the memory cgroup is under OOM, tasks may be stopped.)
-
-11. Memory Pressure
-===================
-
-The pressure level notifications can be used to monitor the memory
-allocation cost; based on the pressure, applications can implement
-different strategies of managing their memory resources. The pressure
-levels are defined as following:
-
-The "low" level means that the system is reclaiming memory for new
-allocations. Monitoring this reclaiming activity might be useful for
-maintaining cache level. Upon notification, the program (typically
-"Activity Manager") might analyze vmstat and act in advance (i.e.
-prematurely shutdown unimportant services).
-
-The "medium" level means that the system is experiencing medium memory
-pressure, the system might be making swap, paging out active file caches,
-etc. Upon this event applications may decide to further analyze
-vmstat/zoneinfo/memcg or internal memory usage statistics and free any
-resources that can be easily reconstructed or re-read from a disk.
-
-The "critical" level means that the system is actively thrashing, it is
-about to out of memory (OOM) or even the in-kernel OOM killer is on its
-way to trigger. Applications should do whatever they can to help the
-system. It might be too late to consult with vmstat or any other
-statistics, so it's advisable to take an immediate action.
-
-By default, events are propagated upward until the event is handled, i.e. the
-events are not pass-through. For example, you have three cgroups: A->B->C. Now
-you set up an event listener on cgroups A, B and C, and suppose group C
-experiences some pressure. In this situation, only group C will receive the
-notification, i.e. groups A and B will not receive it. This is done to avoid
-excessive "broadcasting" of messages, which disturbs the system and which is
-especially bad if we are low on memory or thrashing. Group B, will receive
-notification only if there are no event listers for group C.
-
-There are three optional modes that specify different propagation behavior:
-
- - "default": this is the default behavior specified above. This mode is the
-   same as omitting the optional mode parameter, preserved by backwards
-   compatibility.
-
- - "hierarchy": events always propagate up to the root, similar to the default
-   behavior, except that propagation continues regardless of whether there are
-   event listeners at each level, with the "hierarchy" mode. In the above
-   example, groups A, B, and C will receive notification of memory pressure.
-
- - "local": events are pass-through, i.e. they only receive notifications when
-   memory pressure is experienced in the memcg for which the notification is
-   registered. In the above example, group C will receive notification if
-   registered for "local" notification and the group experiences memory
-   pressure. However, group B will never receive notification, regardless if
-   there is an event listener for group C or not, if group B is registered for
-   local notification.
-
-The level and event notification mode ("hierarchy" or "local", if necessary) are
-specified by a comma-delimited string, i.e. "low,hierarchy" specifies
-hierarchical, pass-through, notification for all ancestor memcgs. Notification
-that is the default, non pass-through behavior, does not specify a mode.
-"medium,local" specifies pass-through notification for the medium level.
-
-The file memory.pressure_level is only used to setup an eventfd. To
-register a notification, an application must:
-
-- create an eventfd using eventfd(2);
-- open memory.pressure_level;
-- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
-  to cgroup.event_control.
-
-Application will be notified through eventfd when memory pressure is at
-the specific level (or higher). Read/write operations to
-memory.pressure_level are no implemented.
-
-Test:
-
-   Here is a small script example that makes a new cgroup, sets up a
-   memory limit, sets up a notification in the cgroup and then makes child
-   cgroup experience a critical pressure::
-
-	# cd /sys/fs/cgroup/memory/
-	# mkdir foo
-	# cd foo
-	# cgroup_event_listener memory.pressure_level low,hierarchy &
-	# echo 8000000 > memory.limit_in_bytes
-	# echo 8000000 > memory.memsw.limit_in_bytes
-	# echo $$ > tasks
-	# dd if=/dev/zero | read x
-
-   (Expect a bunch of notifications, and eventually, the oom-killer will
-   trigger.)
-
-12. TODO
-========
-
-1. Make per-cgroup scanner reclaim not-shared pages first
-2. Teach controller to account for shared-pages
-3. Start reclamation in the background when the limit is
-   not yet hit but the usage is getting closer
-
-Summary
-=======
-
-Overall, the memory controller has been a stable controller and has been
-commented and discussed quite extensively in the community.
-
-References
-==========
-
-1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
-2. Singh, Balbir. Memory Controller (RSS Control),
-   http://lwn.net/Articles/222762/
-3. Emelianov, Pavel. Resource controllers based on process cgroups
-   http://lkml.org/lkml/2007/3/6/198
-4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
-   http://lkml.org/lkml/2007/4/9/78
-5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
-   http://lkml.org/lkml/2007/5/30/244
-6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
-7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
-   subsystem (v3), http://lwn.net/Articles/235534/
-8. Singh, Balbir. RSS controller v2 test results (lmbench),
-   http://lkml.org/lkml/2007/5/17/232
-9. Singh, Balbir. RSS controller v2 AIM9 results
-   http://lkml.org/lkml/2007/5/18/1
-10. Singh, Balbir. Memory controller v6 test results,
-    http://lkml.org/lkml/2007/8/19/36
-11. Singh, Balbir. Memory controller introduction (v6),
-    http://lkml.org/lkml/2007/8/17/69
-12. Corbet, Jonathan, Controlling memory use in cgroups,
-    http://lwn.net/Articles/243795/
diff --git a/Documentation/cgroup-v1/net_cls.rst b/Documentation/cgroup-v1/net_cls.rst
deleted file mode 100644
index a2cf272af7a0..000000000000
--- a/Documentation/cgroup-v1/net_cls.rst
+++ /dev/null
@@ -1,44 +0,0 @@
-=========================
-Network classifier cgroup
-=========================
-
-The Network classifier cgroup provides an interface to
-tag network packets with a class identifier (classid).
-
-The Traffic Controller (tc) can be used to assign
-different priorities to packets from different cgroups.
-Also, Netfilter (iptables) can use this tag to perform
-actions on such packets.
-
-Creating a net_cls cgroups instance creates a net_cls.classid file.
-This net_cls.classid value is initialized to 0.
-
-You can write hexadecimal values to net_cls.classid; the format for these
-values is 0xAAAABBBB; AAAA is the major handle number and BBBB
-is the minor handle number.
-Reading net_cls.classid yields a decimal result.
-
-Example::
-
-	mkdir /sys/fs/cgroup/net_cls
-	mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
-	mkdir /sys/fs/cgroup/net_cls/0
-	echo 0x100001 >  /sys/fs/cgroup/net_cls/0/net_cls.classid
-
-- setting a 10:1 handle::
-
-	cat /sys/fs/cgroup/net_cls/0/net_cls.classid
-	1048577
-
-- configuring tc::
-
-	tc qdisc add dev eth0 root handle 10: htb
-	tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
-
-- creating traffic class 10:1::
-
-	tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
-
-configuring iptables, basic example::
-
-	iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/cgroup-v1/net_prio.rst b/Documentation/cgroup-v1/net_prio.rst
deleted file mode 100644
index b40905871c64..000000000000
--- a/Documentation/cgroup-v1/net_prio.rst
+++ /dev/null
@@ -1,57 +0,0 @@
-=======================
-Network priority cgroup
-=======================
-
-The Network priority cgroup provides an interface to allow an administrator to
-dynamically set the priority of network traffic generated by various
-applications
-
-Nominally, an application would set the priority of its traffic via the
-SO_PRIORITY socket option.  This however, is not always possible because:
-
-1) The application may not have been coded to set this value
-2) The priority of application traffic is often a site-specific administrative
-   decision rather than an application defined one.
-
-This cgroup allows an administrator to assign a process to a group which defines
-the priority of egress traffic on a given interface. Network priority groups can
-be created by first mounting the cgroup filesystem::
-
-	# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
-
-With the above step, the initial group acting as the parent accounting group
-becomes visible at '/sys/fs/cgroup/net_prio'.  This group includes all tasks in
-the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
-
-Each net_prio cgroup contains two files that are subsystem specific
-
-net_prio.prioidx
-  This file is read-only, and is simply informative.  It contains a unique
-  integer value that the kernel uses as an internal representation of this
-  cgroup.
-
-net_prio.ifpriomap
-  This file contains a map of the priorities assigned to traffic originating
-  from processes in this group and egressing the system on various interfaces.
-  It contains a list of tuples in the form <ifname priority>.  Contents of this
-  file can be modified by echoing a string into the file using the same tuple
-  format. For example::
-
-	echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap
-
-This command would force any traffic originating from processes belonging to the
-iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
-said traffic set to the value 5. The parent accounting group also has a
-writeable 'net_prio.ifpriomap' file that can be used to set a system default
-priority.
-
-Priorities are set immediately prior to queueing a frame to the device
-queueing discipline (qdisc) so priorities will be assigned prior to the hardware
-queue selection being made.
-
-One usage for the net_prio cgroup is with mqprio qdisc allowing application
-traffic to be steered to hardware/driver based traffic classes. These mappings
-can then be managed by administrators or other networking protocols such as
-DCBX.
-
-A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroup-v1/pids.rst b/Documentation/cgroup-v1/pids.rst
deleted file mode 100644
index 6acebd9e72c8..000000000000
--- a/Documentation/cgroup-v1/pids.rst
+++ /dev/null
@@ -1,92 +0,0 @@
-=========================
-Process Number Controller
-=========================
-
-Abstract
---------
-
-The process number controller is used to allow a cgroup hierarchy to stop any
-new tasks from being fork()'d or clone()'d after a certain limit is reached.
-
-Since it is trivial to hit the task limit without hitting any kmemcg limits in
-place, PIDs are a fundamental resource. As such, PID exhaustion must be
-preventable in the scope of a cgroup hierarchy by allowing resource limiting of
-the number of tasks in a cgroup.
-
-Usage
------
-
-In order to use the `pids` controller, set the maximum number of tasks in
-pids.max (this is not available in the root cgroup for obvious reasons). The
-number of processes currently in the cgroup is given by pids.current.
-
-Organisational operations are not blocked by cgroup policies, so it is possible
-to have pids.current > pids.max. This can be done by either setting the limit to
-be smaller than pids.current, or attaching enough processes to the cgroup such
-that pids.current > pids.max. However, it is not possible to violate a cgroup
-policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
-creation of a new process would cause a cgroup policy to be violated.
-
-To set a cgroup to have no limit, set pids.max to "max". This is the default for
-all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
-limit in the hierarchy is followed).
-
-pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
-superset of parent/child/pids.current.
-
-The pids.events file contains event counters:
-
-  - max: Number of times fork failed because limit was hit.
-
-Example
--------
-
-First, we mount the pids controller::
-
-	# mkdir -p /sys/fs/cgroup/pids
-	# mount -t cgroup -o pids none /sys/fs/cgroup/pids
-
-Then we create a hierarchy, set limits and attach processes to it::
-
-	# mkdir -p /sys/fs/cgroup/pids/parent/child
-	# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
-	# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
-	# cat /sys/fs/cgroup/pids/parent/pids.current
-	2
-	#
-
-It should be noted that attempts to overcome the set limit (2 in this case) will
-fail::
-
-	# cat /sys/fs/cgroup/pids/parent/pids.current
-	2
-	# ( /bin/echo "Here's some processes for you." | cat )
-	sh: fork: Resource temporary unavailable
-	#
-
-Even if we migrate to a child cgroup (which doesn't have a set limit), we will
-not be able to overcome the most stringent limit in the hierarchy (in this case,
-parent's)::
-
-	# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
-	# cat /sys/fs/cgroup/pids/parent/pids.current
-	2
-	# cat /sys/fs/cgroup/pids/parent/child/pids.current
-	2
-	# cat /sys/fs/cgroup/pids/parent/child/pids.max
-	max
-	# ( /bin/echo "Here's some processes for you." | cat )
-	sh: fork: Resource temporary unavailable
-	#
-
-We can set a limit that is smaller than pids.current, which will stop any new
-processes from being forked at all (note that the shell itself counts towards
-pids.current)::
-
-	# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
-	# /bin/echo "We can't even spawn a single process now."
-	sh: fork: Resource temporary unavailable
-	# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
-	# /bin/echo "We can't even spawn a single process now."
-	sh: fork: Resource temporary unavailable
-	#
diff --git a/Documentation/cgroup-v1/rdma.rst b/Documentation/cgroup-v1/rdma.rst
deleted file mode 100644
index 2fcb0a9bf790..000000000000
--- a/Documentation/cgroup-v1/rdma.rst
+++ /dev/null
@@ -1,117 +0,0 @@
-===============
-RDMA Controller
-===============
-
-.. Contents
-
-   1. Overview
-     1-1. What is RDMA controller?
-     1-2. Why RDMA controller needed?
-     1-3. How is RDMA controller implemented?
-   2. Usage Examples
-
-1. Overview
-===========
-
-1-1. What is RDMA controller?
------------------------------
-
-RDMA controller allows user to limit RDMA/IB specific resources that a given
-set of processes can use. These processes are grouped using RDMA controller.
-
-RDMA controller defines two resources which can be limited for processes of a
-cgroup.
-
-1-2. Why RDMA controller needed?
---------------------------------
-
-Currently user space applications can easily take away all the rdma verb
-specific resources such as AH, CQ, QP, MR etc. Due to which other applications
-in other cgroup or kernel space ULPs may not even get chance to allocate any
-rdma resources. This can lead to service unavailability.
-
-Therefore RDMA controller is needed through which resource consumption
-of processes can be limited. Through this controller different rdma
-resources can be accounted.
-
-1-3. How is RDMA controller implemented?
-----------------------------------------
-
-RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
-resource accounting per cgroup, per device using resource pool structure.
-Each such resource pool is limited up to 64 resources in given resource pool
-by rdma cgroup, which can be extended later if required.
-
-This resource pool object is linked to the cgroup css. Typically there
-are 0 to 4 resource pool instances per cgroup, per device in most use cases.
-But nothing limits to have it more. At present hundreds of RDMA devices per
-single cgroup may not be handled optimally, however there is no
-known use case or requirement for such configuration either.
-
-Since RDMA resources can be allocated from any process and can be freed by any
-of the child processes which shares the address space, rdma resources are
-always owned by the creator cgroup css. This allows process migration from one
-to other cgroup without major complexity of transferring resource ownership;
-because such ownership is not really present due to shared nature of
-rdma resources. Linking resources around css also ensures that cgroups can be
-deleted after processes migrated. This allow progress migration as well with
-active resources, even though that is not a primary use case.
-
-Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
-the caller. Same rdma cgroup should be passed while uncharging the resource.
-This also allows process migrated with active RDMA resource to charge
-to new owner cgroup for new resource. It also allows to uncharge resource of
-a process from previously charged cgroup which is migrated to new cgroup,
-even though that is not a primary use case.
-
-Resource pool object is created in following situations.
-(a) User sets the limit and no previous resource pool exist for the device
-of interest for the cgroup.
-(b) No resource limits were configured, but IB/RDMA stack tries to
-charge the resource. So that it correctly uncharge them when applications are
-running without limits and later on when limits are enforced during uncharging,
-otherwise usage count will drop to negative.
-
-Resource pool is destroyed if all the resource limits are set to max and
-it is the last resource getting deallocated.
-
-User should set all the limit to max value if it intents to remove/unconfigure
-the resource pool for a particular device.
-
-IB stack honors limits enforced by the rdma controller. When application
-query about maximum resource limits of IB device, it returns minimum of
-what is configured by user for a given cgroup and what is supported by
-IB device.
-
-Following resources can be accounted by rdma controller.
-
-  ==========    =============================
-  hca_handle	Maximum number of HCA Handles
-  hca_object 	Maximum number of HCA Objects
-  ==========    =============================
-
-2. Usage Examples
-=================
-
-(a) Configure resource limit::
-
-	echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
-	echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
-
-(b) Query resource limit::
-
-	cat /sys/fs/cgroup/rdma/2/rdma.max
-	#Output:
-	mlx4_0 hca_handle=2 hca_object=2000
-	ocrdma1 hca_handle=3 hca_object=max
-
-(c) Query current usage::
-
-	cat /sys/fs/cgroup/rdma/2/rdma.current
-	#Output:
-	mlx4_0 hca_handle=1 hca_object=20
-	ocrdma1 hca_handle=1 hca_object=23
-
-(d) Delete resource limit::
-
-	echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index cad797a8a39e..5ecbc03e6b2f 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for
 use at file creation time.  When a task allocates a file in the file
 system, the mount option memory policy will be applied with a NodeList,
 if any, modified by the calling task's cpuset constraints
-[See Documentation/cgroup-v1/cpusets.rst] and any optional flags, listed
+[See Documentation/admin-guide/cgroup-v1/cpusets.rst] and any optional flags, listed
 below.  If the resulting NodeLists is the empty set, the effective memory
 policy for the file will revert to "default" policy.
 
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index 5623b9916411..4f18456dd3b1 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -12,7 +12,7 @@ References
 
 -	Documentation/IRQ-affinity.txt:  Binding interrupts to sets of CPUs.
 
--	Documentation/cgroup-v1:  Using cgroups to bind tasks to sets of CPUs.
+-	Documentation/admin-guide/cgroup-v1:  Using cgroups to bind tasks to sets of CPUs.
 
 -	man taskset:  Using the taskset command to bind tasks to sets
 	of CPUs.
diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst
index 3391e86d810c..14a2f7bf63fe 100644
--- a/Documentation/scheduler/sched-deadline.rst
+++ b/Documentation/scheduler/sched-deadline.rst
@@ -669,7 +669,7 @@ Deadline Task Scheduling
 
  -deadline tasks cannot have an affinity mask smaller that the entire
  root_domain they are created on. However, affinities can be specified
- through the cpuset facility (Documentation/cgroup-v1/cpusets.rst).
+ through the cpuset facility (Documentation/admin-guide/cgroup-v1/cpusets.rst).
 
 5.1 SCHED_DEADLINE and cpusets HOWTO
 ------------------------------------
diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
index 53b30d1967cf..a96c72651877 100644
--- a/Documentation/scheduler/sched-design-CFS.rst
+++ b/Documentation/scheduler/sched-design-CFS.rst
@@ -222,7 +222,7 @@ SCHED_BATCH) tasks.
 
    These options need CONFIG_CGROUPS to be defined, and let the administrator
    create arbitrary groups of tasks, using the "cgroup" pseudo filesystem.  See
-   Documentation/cgroup-v1/cgroups.rst for more information about this filesystem.
+   Documentation/admin-guide/cgroup-v1/cgroups.rst for more information about this filesystem.
 
 When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
 group created using the pseudo filesystem.  See example steps below to create
diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst
index d27d3f3712fd..655a096ec8fb 100644
--- a/Documentation/scheduler/sched-rt-group.rst
+++ b/Documentation/scheduler/sched-rt-group.rst
@@ -133,7 +133,7 @@ This uses the cgroup virtual file system and "<cgroup>/cpu.rt_runtime_us"
 to control the CPU time reserved for each control group.
 
 For more information on working with control groups, you should read
-Documentation/cgroup-v1/cgroups.rst as well.
+Documentation/admin-guide/cgroup-v1/cgroups.rst as well.
 
 Group settings are checked against the following limits in order to keep the
 configuration schedulable:
diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst
index 130f3cfa1c19..99fdeca917ca 100644
--- a/Documentation/vm/numa.rst
+++ b/Documentation/vm/numa.rst
@@ -67,7 +67,7 @@ nodes.  Each emulated node will manage a fraction of the underlying cells'
 physical memory.  NUMA emluation is useful for testing NUMA kernel and
 application features on non-NUMA platforms, and as a sort of memory resource
 management mechanism when used together with cpusets.
-[see Documentation/cgroup-v1/cpusets.rst]
+[see Documentation/admin-guide/cgroup-v1/cpusets.rst]
 
 For each node with memory, Linux constructs an independent memory management
 subsystem, complete with its own free page lists, in-use page lists, usage
@@ -114,7 +114,7 @@ allocation behavior using Linux NUMA memory policy. [see
 
 System administrators can restrict the CPUs and nodes' memories that a non-
 privileged user can specify in the scheduling or NUMA commands and functions
-using control groups and CPUsets.  [see Documentation/cgroup-v1/cpusets.rst]
+using control groups and CPUsets.  [see Documentation/admin-guide/cgroup-v1/cpusets.rst]
 
 On architectures that do not hide memoryless nodes, Linux will include only
 zones [nodes] with memory in the zonelists.  This means that for a memoryless
diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst
index 35bba27d5fff..1d6cd7db4e43 100644
--- a/Documentation/vm/page_migration.rst
+++ b/Documentation/vm/page_migration.rst
@@ -41,7 +41,7 @@ locations.
 Larger installations usually partition the system using cpusets into
 sections of nodes. Paul Jackson has equipped cpusets with the ability to
 move pages when a task is moved to another cpuset (See
-Documentation/cgroup-v1/cpusets.rst).
+Documentation/admin-guide/cgroup-v1/cpusets.rst).
 Cpusets allows the automation of process locality. If a task is moved to
 a new cpuset then also all its pages are moved with it so that the
 performance of the process does not sink dramatically. Also the pages
diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst
index 109052215bce..17d0861b0f1d 100644
--- a/Documentation/vm/unevictable-lru.rst
+++ b/Documentation/vm/unevictable-lru.rst
@@ -98,7 +98,7 @@ Memory Control Group Interaction
 --------------------------------
 
 The unevictable LRU facility interacts with the memory control group [aka
-memory controller; see Documentation/cgroup-v1/memory.rst] by extending the
+memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by extending the
 lru_list enum.
 
 The memory controller data structure automatically gets a per-zone unevictable
diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
index 30108684ae87..ff9bcfd2cc14 100644
--- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
+++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
@@ -15,7 +15,7 @@ assign them to cpusets and their attached tasks.  This is a way of limiting the
 amount of system memory that are available to a certain class of tasks.
 
 For more information on the features of cpusets, see
-Documentation/cgroup-v1/cpusets.rst.
+Documentation/admin-guide/cgroup-v1/cpusets.rst.
 There are a number of different configurations you can use for your needs.  For
 more information on the numa=fake command line option and its various ways of
 configuring fake nodes, see Documentation/x86/x86_64/boot-options.rst.
@@ -40,7 +40,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg::
 	On node 3 totalpages: 131072
 
 Now following the instructions for mounting the cpusets filesystem from
-Documentation/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory
+Documentation/admin-guide/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory
 address spaces) to individual cpusets::
 
 	[root@xroads /]# mkdir exampleset
diff --git a/MAINTAINERS b/MAINTAINERS
index 0c603ea73034..c1593a668f80 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4158,7 +4158,7 @@ L:	cgroups@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 S:	Maintained
 F:	Documentation/admin-guide/cgroup-v2.rst
-F:	Documentation/cgroup-v1/
+F:	Documentation/admin-guide/cgroup-v1/
 F:	include/linux/cgroup*
 F:	kernel/cgroup/
 
@@ -4169,7 +4169,7 @@ W:	http://www.bullopensource.org/cpuset/
 W:	http://oss.sgi.com/projects/cpusets/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 S:	Maintained
-F:	Documentation/cgroup-v1/cpusets.rst
+F:	Documentation/admin-guide/cgroup-v1/cpusets.rst
 F:	include/linux/cpuset.h
 F:	kernel/cgroup/cpuset.c
 
diff --git a/block/Kconfig b/block/Kconfig
index b16b3e075d31..8b5f8e560eb4 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -89,7 +89,7 @@ config BLK_DEV_THROTTLING
 	one needs to mount and use blkio cgroup controller for creating
 	cgroups and specifying per device IO rate policies.
 
-	See Documentation/cgroup-v1/blkio-controller.rst for more information.
+	See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
 
 config BLK_DEV_THROTTLING_LOW
 	bool "Block throttling .low limit interface support (EXPERIMENTAL)"
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c5311935239d..430e219e3aba 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -624,7 +624,7 @@ struct cftype {
 
 /*
  * Control Group subsystem type.
- * See Documentation/cgroup-v1/cgroups.rst for details
+ * See Documentation/admin-guide/cgroup-v1/cgroups.rst for details
  */
 struct cgroup_subsys {
 	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6f68438aa4ed..82699845ef79 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -806,7 +806,7 @@ union bpf_attr {
  * 		based on a user-provided identifier for all traffic coming from
  * 		the tasks belonging to the related cgroup. See also the related
  * 		kernel documentation, available from the Linux sources in file
- * 		*Documentation/cgroup-v1/net_cls.rst*.
+ * 		*Documentation/admin-guide/cgroup-v1/net_cls.rst*.
  *
  * 		The Linux kernel has two versions for cgroups: there are
  * 		cgroups v1 and cgroups v2. Both are available to users, who can
diff --git a/init/Kconfig b/init/Kconfig
index 9eb92ee52d40..381cdfee6e0e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -821,7 +821,7 @@ menuconfig CGROUPS
 	  controls or device isolation.
 	  See
 		- Documentation/scheduler/sched-design-CFS.rst	(CFS)
-		- Documentation/cgroup-v1/ (features for grouping, isolation
+		- Documentation/admin-guide/cgroup-v1/ (features for grouping, isolation
 					  and resource control)
 
 	  Say N if unsure.
@@ -883,7 +883,7 @@ config BLK_CGROUP
 	CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
 	CONFIG_BLK_DEV_THROTTLING=y.
 
-	See Documentation/cgroup-v1/blkio-controller.rst for more information.
+	See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
 
 config CGROUP_WRITEBACK
 	bool
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b3b02b9c4405..863e434a6020 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -729,7 +729,7 @@ static inline int nr_cpusets(void)
  * load balancing domains (sched domains) as specified by that partial
  * partition.
  *
- * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst
+ * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
  * for a background explanation of this.
  *
  * Does not return errors, on the theory that the callers of this
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index c07196502577..725674f3276d 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -509,7 +509,7 @@ static inline int may_allow_all(struct dev_cgroup *parent)
  * This is one of the three key functions for hierarchy implementation.
  * This function is responsible for re-evaluating all the cgroup's active
  * exceptions due to a parent's exception change.
- * Refer to Documentation/cgroup-v1/devices.rst for more details.
+ * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details.
  */
 static void revalidate_active_exceptions(struct dev_cgroup *devcg)
 {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f506c68b2612..17e2b1713702 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -806,7 +806,7 @@ union bpf_attr {
  * 		based on a user-provided identifier for all traffic coming from
  * 		the tasks belonging to the related cgroup. See also the related
  * 		kernel documentation, available from the Linux sources in file
- * 		*Documentation/cgroup-v1/net_cls.rst*.
+ * 		*Documentation/admin-guide/cgroup-v1/net_cls.rst*.
  *
  * 		The Linux kernel has two versions for cgroups: there are
  * 		cgroups v1 and cgroups v2. Both are available to users, who can
-- 
cgit v1.2.3


From 4f4cfa6c560c93ba180c30675cf845e1597de44c Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Thu, 27 Jun 2019 14:56:51 -0300
Subject: docs: admin-guide: add a series of orphaned documents

There are lots of documents that belong to the admin-guide but
are on random places (most under Documentation root dir).

Move them to the admin guide.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Acked-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
---
 Documentation/ABI/stable/sysfs-devices-node        |   2 +-
 Documentation/ABI/testing/procfs-diskstats         |   2 +-
 Documentation/ABI/testing/sysfs-block              |   2 +-
 Documentation/ABI/testing/sysfs-devices-system-cpu |   4 +-
 Documentation/admin-guide/btmrvl.rst               | 124 +++++++
 Documentation/admin-guide/clearing-warn-once.rst   |   9 +
 Documentation/admin-guide/cpu-load.rst             | 114 +++++++
 Documentation/admin-guide/cputopology.rst          | 177 ++++++++++
 .../admin-guide/device-mapper/statistics.rst       |   4 +-
 Documentation/admin-guide/efi-stub.rst             | 100 ++++++
 Documentation/admin-guide/highuid.rst              |  80 +++++
 Documentation/admin-guide/hw-vuln/l1tf.rst         |   2 +-
 Documentation/admin-guide/hw_random.rst            | 105 ++++++
 Documentation/admin-guide/index.rst                |  17 +
 Documentation/admin-guide/iostats.rst              | 197 ++++++++++++
 Documentation/admin-guide/kernel-parameters.txt    |   2 +-
 .../admin-guide/kernel-per-CPU-kthreads.rst        | 356 +++++++++++++++++++++
 Documentation/admin-guide/lcd-panel-cgram.rst      |  27 ++
 Documentation/admin-guide/ldm.rst                  | 121 +++++++
 Documentation/admin-guide/lockup-watchdogs.rst     |  83 +++++
 Documentation/admin-guide/mm/cma_debugfs.rst       |  25 ++
 Documentation/admin-guide/mm/index.rst             |   1 +
 Documentation/admin-guide/numastat.rst             |  30 ++
 Documentation/admin-guide/pnp.rst                  | 292 +++++++++++++++++
 Documentation/admin-guide/rtc.rst                  | 140 ++++++++
 Documentation/admin-guide/svga.rst                 | 249 ++++++++++++++
 Documentation/admin-guide/sysctl/kernel.rst        |   2 +-
 Documentation/admin-guide/video-output.rst         |  34 ++
 Documentation/auxdisplay/lcd-panel-cgram.rst       |  29 --
 Documentation/btmrvl.txt                           | 124 -------
 Documentation/clearing-warn-once.txt               |   9 -
 Documentation/cma/debugfs.rst                      |  27 --
 Documentation/cpu-load.txt                         | 114 -------
 Documentation/cputopology.txt                      | 177 ----------
 Documentation/efi-stub.txt                         | 100 ------
 Documentation/fb/vesafb.rst                        |   2 +-
 Documentation/highuid.txt                          |  80 -----
 Documentation/hw_random.txt                        | 105 ------
 Documentation/iostats.txt                          | 197 ------------
 Documentation/kernel-per-CPU-kthreads.txt          | 356 ---------------------
 Documentation/ldm.txt                              | 121 -------
 Documentation/lockup-watchdogs.txt                 |  83 -----
 Documentation/numastat.txt                         |  30 --
 Documentation/pnp.txt                              | 292 -----------------
 Documentation/rtc.txt                              | 140 --------
 Documentation/svga.txt                             | 249 --------------
 Documentation/video-output.txt                     |  34 --
 Documentation/x86/topology.rst                     |   2 +-
 MAINTAINERS                                        |  12 +-
 arch/arm/Kconfig                                   |   2 +-
 arch/parisc/Kconfig                                |   2 +-
 arch/sh/Kconfig                                    |   2 +-
 arch/sparc/Kconfig                                 |   2 +-
 arch/x86/Kconfig                                   |   4 +-
 block/partitions/Kconfig                           |   2 +-
 drivers/char/Kconfig                               |   4 +-
 drivers/char/hw_random/core.c                      |   2 +-
 include/linux/hw_random.h                          |   2 +-
 58 files changed, 2310 insertions(+), 2296 deletions(-)
 create mode 100644 Documentation/admin-guide/btmrvl.rst
 create mode 100644 Documentation/admin-guide/clearing-warn-once.rst
 create mode 100644 Documentation/admin-guide/cpu-load.rst
 create mode 100644 Documentation/admin-guide/cputopology.rst
 create mode 100644 Documentation/admin-guide/efi-stub.rst
 create mode 100644 Documentation/admin-guide/highuid.rst
 create mode 100644 Documentation/admin-guide/hw_random.rst
 create mode 100644 Documentation/admin-guide/iostats.rst
 create mode 100644 Documentation/admin-guide/kernel-per-CPU-kthreads.rst
 create mode 100644 Documentation/admin-guide/lcd-panel-cgram.rst
 create mode 100644 Documentation/admin-guide/ldm.rst
 create mode 100644 Documentation/admin-guide/lockup-watchdogs.rst
 create mode 100644 Documentation/admin-guide/mm/cma_debugfs.rst
 create mode 100644 Documentation/admin-guide/numastat.rst
 create mode 100644 Documentation/admin-guide/pnp.rst
 create mode 100644 Documentation/admin-guide/rtc.rst
 create mode 100644 Documentation/admin-guide/svga.rst
 create mode 100644 Documentation/admin-guide/video-output.rst
 delete mode 100644 Documentation/auxdisplay/lcd-panel-cgram.rst
 delete mode 100644 Documentation/btmrvl.txt
 delete mode 100644 Documentation/clearing-warn-once.txt
 delete mode 100644 Documentation/cma/debugfs.rst
 delete mode 100644 Documentation/cpu-load.txt
 delete mode 100644 Documentation/cputopology.txt
 delete mode 100644 Documentation/efi-stub.txt
 delete mode 100644 Documentation/highuid.txt
 delete mode 100644 Documentation/hw_random.txt
 delete mode 100644 Documentation/iostats.txt
 delete mode 100644 Documentation/kernel-per-CPU-kthreads.txt
 delete mode 100644 Documentation/ldm.txt
 delete mode 100644 Documentation/lockup-watchdogs.txt
 delete mode 100644 Documentation/numastat.txt
 delete mode 100644 Documentation/pnp.txt
 delete mode 100644 Documentation/rtc.txt
 delete mode 100644 Documentation/svga.txt
 delete mode 100644 Documentation/video-output.txt

(limited to 'include')

diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index f7ce68fbd4b9..df8413cf1468 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -61,7 +61,7 @@ Date:		October 2002
 Contact:	Linux Memory Management list <linux-mm@kvack.org>
 Description:
 		The node's hit/miss statistics, in units of pages.
-		See Documentation/numastat.txt
+		See Documentation/admin-guide/numastat.rst
 
 What:		/sys/devices/system/node/nodeX/distance
 Date:		October 2002
diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats
index abac31d216de..2c44b4f1b060 100644
--- a/Documentation/ABI/testing/procfs-diskstats
+++ b/Documentation/ABI/testing/procfs-diskstats
@@ -29,4 +29,4 @@ Description:
 		17 - sectors discarded
 		18 - time spent discarding
 
-		For more details refer to Documentation/iostats.txt
+		For more details refer to Documentation/admin-guide/iostats.rst
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index dfad7427817c..f8c7c7126bb1 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -15,7 +15,7 @@ Description:
 		 9 - I/Os currently in progress
 		10 - time spent doing I/Os (ms)
 		11 - weighted time spent doing I/Os (ms)
-		For more details refer Documentation/iostats.txt
+		For more details refer Documentation/admin-guide/iostats.rst
 
 
 What:		/sys/block/<disk>/<part>/stat
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index d404603c6b52..5f7d7b14fa44 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -34,7 +34,7 @@ Description:	CPU topology files that describe kernel limits related to
 		present: cpus that have been identified as being present in
 		the system.
 
-		See Documentation/cputopology.txt for more information.
+		See Documentation/admin-guide/cputopology.rst for more information.
 
 
 What:		/sys/devices/system/cpu/probe
@@ -103,7 +103,7 @@ Description:	CPU topology files that describe a logical CPU's relationship
 		thread_siblings_list: human-readable list of cpu#'s hardware
 		threads within the same core as cpu#
 
-		See Documentation/cputopology.txt for more information.
+		See Documentation/admin-guide/cputopology.rst for more information.
 
 
 What:		/sys/devices/system/cpu/cpuidle/current_driver
diff --git a/Documentation/admin-guide/btmrvl.rst b/Documentation/admin-guide/btmrvl.rst
new file mode 100644
index 000000000000..ec57740ead0c
--- /dev/null
+++ b/Documentation/admin-guide/btmrvl.rst
@@ -0,0 +1,124 @@
+=============
+btmrvl driver
+=============
+
+All commands are used via debugfs interface.
+
+Set/get driver configurations
+=============================
+
+Path:	/debug/btmrvl/config/
+
+gpiogap=[n], hscfgcmd
+	These commands are used to configure the host sleep parameters::
+	bit 8:0  -- Gap
+	bit 16:8 -- GPIO
+
+	where GPIO is the pin number of GPIO used to wake up the host.
+	It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface
+	wakeup will be used instead).
+
+	where Gap is the gap in milli seconds between wakeup signal and
+	wakeup event, or 0xff for special host sleep setting.
+
+	Usage::
+
+		# Use SDIO interface to wake up the host and set GAP to 0x80:
+		echo 0xff80 > /debug/btmrvl/config/gpiogap
+		echo 1 > /debug/btmrvl/config/hscfgcmd
+
+		# Use GPIO pin #3 to wake up the host and set GAP to 0xff:
+		echo 0x03ff >  /debug/btmrvl/config/gpiogap
+		echo 1 > /debug/btmrvl/config/hscfgcmd
+
+psmode=[n], pscmd
+	These commands are used to enable/disable auto sleep mode
+
+	where the option is::
+
+			1 	-- Enable auto sleep mode
+			0 	-- Disable auto sleep mode
+
+	Usage::
+
+		# Enable auto sleep mode
+		echo 1 > /debug/btmrvl/config/psmode
+		echo 1 > /debug/btmrvl/config/pscmd
+
+		# Disable auto sleep mode
+		echo 0 > /debug/btmrvl/config/psmode
+		echo 1 > /debug/btmrvl/config/pscmd
+
+
+hsmode=[n], hscmd
+	These commands are used to enable host sleep or wake up firmware
+
+	where the option is::
+
+			1	-- Enable host sleep
+			0	-- Wake up firmware
+
+	Usage::
+
+		# Enable host sleep
+		echo 1 > /debug/btmrvl/config/hsmode
+		echo 1 > /debug/btmrvl/config/hscmd
+
+		# Wake up firmware
+		echo 0 > /debug/btmrvl/config/hsmode
+		echo 1 > /debug/btmrvl/config/hscmd
+
+
+Get driver status
+=================
+
+Path:	/debug/btmrvl/status/
+
+Usage::
+
+	cat /debug/btmrvl/status/<args>
+
+where the args are:
+
+curpsmode
+	This command displays current auto sleep status.
+
+psstate
+	This command display the power save state.
+
+hsstate
+	This command display the host sleep state.
+
+txdnldrdy
+	This command displays the value of Tx download ready flag.
+
+Issuing a raw hci command
+=========================
+
+Use hcitool to issue raw hci command, refer to hcitool manual
+
+Usage::
+
+	Hcitool cmd <ogf> <ocf> [Parameters]
+
+Interface Control Command::
+
+	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00    --Enable All interface
+	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01    --Enable Wlan interface
+	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02    --Enable BT interface
+	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00    --Disable All interface
+	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01    --Disable Wlan interface
+	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02    --Disable BT interface
+
+SD8688 firmware
+===============
+
+Images:
+
+- /lib/firmware/sd8688_helper.bin
+- /lib/firmware/sd8688.bin
+
+
+The images can be downloaded from:
+
+git.infradead.org/users/dwmw2/linux-firmware.git/libertas/
diff --git a/Documentation/admin-guide/clearing-warn-once.rst b/Documentation/admin-guide/clearing-warn-once.rst
new file mode 100644
index 000000000000..211fd926cf00
--- /dev/null
+++ b/Documentation/admin-guide/clearing-warn-once.rst
@@ -0,0 +1,9 @@
+Clearing WARN_ONCE
+------------------
+
+WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once.
+
+echo 1 > /sys/kernel/debug/clear_warn_once
+
+clears the state and allows the warnings to print once again.
+This can be useful after test suite runs to reproduce problems.
diff --git a/Documentation/admin-guide/cpu-load.rst b/Documentation/admin-guide/cpu-load.rst
new file mode 100644
index 000000000000..2d01ce43d2a2
--- /dev/null
+++ b/Documentation/admin-guide/cpu-load.rst
@@ -0,0 +1,114 @@
+========
+CPU load
+========
+
+Linux exports various bits of information via ``/proc/stat`` and
+``/proc/uptime`` that userland tools, such as top(1), use to calculate
+the average time system spent in a particular state, for example::
+
+    $ iostat
+    Linux 2.6.18.3-exp (linmac)     02/20/2007
+
+    avg-cpu:  %user   %nice %system %iowait  %steal   %idle
+              10.01    0.00    2.92    5.44    0.00   81.63
+
+    ...
+
+Here the system thinks that over the default sampling period the
+system spent 10.01% of the time doing work in user space, 2.92% in the
+kernel, and was overall 81.63% of the time idle.
+
+In most cases the ``/proc/stat``	 information reflects the reality quite
+closely, however due to the nature of how/when the kernel collects
+this data sometimes it can not be trusted at all.
+
+So how is this information collected?  Whenever timer interrupt is
+signalled the kernel looks what kind of task was running at this
+moment and increments the counter that corresponds to this tasks
+kind/state.  The problem with this is that the system could have
+switched between various states multiple times between two timer
+interrupts yet the counter is incremented only for the last state.
+
+
+Example
+-------
+
+If we imagine the system with one task that periodically burns cycles
+in the following manner::
+
+     time line between two timer interrupts
+    |--------------------------------------|
+     ^                                    ^
+     |_ something begins working          |
+                                          |_ something goes to sleep
+                                         (only to be awaken quite soon)
+
+In the above situation the system will be 0% loaded according to the
+``/proc/stat`` (since the timer interrupt will always happen when the
+system is executing the idle handler), but in reality the load is
+closer to 99%.
+
+One can imagine many more situations where this behavior of the kernel
+will lead to quite erratic information inside ``/proc/stat``::
+
+
+	/* gcc -o hog smallhog.c */
+	#include <time.h>
+	#include <limits.h>
+	#include <signal.h>
+	#include <sys/time.h>
+	#define HIST 10
+
+	static volatile sig_atomic_t stop;
+
+	static void sighandler (int signr)
+	{
+	(void) signr;
+	stop = 1;
+	}
+	static unsigned long hog (unsigned long niters)
+	{
+	stop = 0;
+	while (!stop && --niters);
+	return niters;
+	}
+	int main (void)
+	{
+	int i;
+	struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
+				.it_value = { .tv_sec = 0, .tv_usec = 1 } };
+	sigset_t set;
+	unsigned long v[HIST];
+	double tmp = 0.0;
+	unsigned long n;
+	signal (SIGALRM, &sighandler);
+	setitimer (ITIMER_REAL, &it, NULL);
+
+	hog (ULONG_MAX);
+	for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
+	for (i = 0; i < HIST; ++i) tmp += v[i];
+	tmp /= HIST;
+	n = tmp - (tmp / 3.0);
+
+	sigemptyset (&set);
+	sigaddset (&set, SIGALRM);
+
+	for (;;) {
+		hog (n);
+		sigwait (&set, &i);
+	}
+	return 0;
+	}
+
+
+References
+----------
+
+- http://lkml.org/lkml/2007/2/12/6
+- Documentation/filesystems/proc.txt (1.8)
+
+
+Thanks
+------
+
+Con Kolivas, Pavel Machek
diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst
new file mode 100644
index 000000000000..b90dafcc8237
--- /dev/null
+++ b/Documentation/admin-guide/cputopology.rst
@@ -0,0 +1,177 @@
+===========================================
+How CPU topology info is exported via sysfs
+===========================================
+
+Export CPU topology info via sysfs. Items (attributes) are similar
+to /proc/cpuinfo output of some architectures.  They reside in
+/sys/devices/system/cpu/cpuX/topology/:
+
+physical_package_id:
+
+	physical package id of cpuX. Typically corresponds to a physical
+	socket number, but the actual value is architecture and platform
+	dependent.
+
+die_id:
+
+	the CPU die ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).  The actual value is
+	architecture and platform dependent.
+
+core_id:
+
+	the CPU core ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).  The actual value is
+	architecture and platform dependent.
+
+book_id:
+
+	the book ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).	The actual value is
+	architecture and platform dependent.
+
+drawer_id:
+
+	the drawer ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's).	The actual value is
+	architecture and platform dependent.
+
+core_cpus:
+
+	internal kernel map of CPUs within the same core.
+	(deprecated name: "thread_siblings")
+
+core_cpus_list:
+
+	human-readable list of CPUs within the same core.
+	(deprecated name: "thread_siblings_list");
+
+package_cpus:
+
+	internal kernel map of the CPUs sharing the same physical_package_id.
+	(deprecated name: "core_siblings")
+
+package_cpus_list:
+
+	human-readable list of CPUs sharing the same physical_package_id.
+	(deprecated name: "core_siblings_list")
+
+die_cpus:
+
+	internal kernel map of CPUs within the same die.
+
+die_cpus_list:
+
+	human-readable list of CPUs within the same die.
+
+book_siblings:
+
+	internal kernel map of cpuX's hardware threads within the same
+	book_id.
+
+book_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	book_id.
+
+drawer_siblings:
+
+	internal kernel map of cpuX's hardware threads within the same
+	drawer_id.
+
+drawer_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	drawer_id.
+
+Architecture-neutral, drivers/base/topology.c, exports these attributes.
+However, the book and drawer related sysfs files will only be created if
+CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
+
+CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
+where they reflect the cpu and cache hierarchy.
+
+For an architecture to support this feature, it must define some of
+these macros in include/asm-XXX/topology.h::
+
+	#define topology_physical_package_id(cpu)
+	#define topology_die_id(cpu)
+	#define topology_core_id(cpu)
+	#define topology_book_id(cpu)
+	#define topology_drawer_id(cpu)
+	#define topology_sibling_cpumask(cpu)
+	#define topology_core_cpumask(cpu)
+	#define topology_die_cpumask(cpu)
+	#define topology_book_cpumask(cpu)
+	#define topology_drawer_cpumask(cpu)
+
+The type of ``**_id macros`` is int.
+The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter
+correspond with appropriate ``**_siblings`` sysfs attributes (except for
+topology_sibling_cpumask() which corresponds with thread_siblings).
+
+To be consistent on all architectures, include/linux/topology.h
+provides default definitions for any of the above macros that are
+not defined by include/asm-XXX/topology.h:
+
+1) topology_physical_package_id: -1
+2) topology_die_id: -1
+3) topology_core_id: 0
+4) topology_sibling_cpumask: just the given CPU
+5) topology_core_cpumask: just the given CPU
+6) topology_die_cpumask: just the given CPU
+
+For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
+default definitions for topology_book_id() and topology_book_cpumask().
+For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are
+no default definitions for topology_drawer_id() and topology_drawer_cpumask().
+
+Additionally, CPU topology information is provided under
+/sys/devices/system/cpu and includes these files.  The internal
+source for the output is in brackets ("[]").
+
+    =========== ==========================================================
+    kernel_max: the maximum CPU index allowed by the kernel configuration.
+		[NR_CPUS-1]
+
+    offline:	CPUs that are not online because they have been
+		HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit
+		of CPUs allowed by the kernel configuration (kernel_max
+		above). [~cpu_online_mask + cpus >= NR_CPUS]
+
+    online:	CPUs that are online and being scheduled [cpu_online_mask]
+
+    possible:	CPUs that have been allocated resources and can be
+		brought online if they are present. [cpu_possible_mask]
+
+    present:	CPUs that have been identified as being present in the
+		system. [cpu_present_mask]
+    =========== ==========================================================
+
+The format for the above output is compatible with cpulist_parse()
+[see <linux/cpumask.h>].  Some examples follow.
+
+In this example, there are 64 CPUs in the system but cpus 32-63 exceed
+the kernel max which is limited to 0..31 by the NR_CPUS config option
+being 32.  Note also that CPUs 2 and 4-31 are not online but could be
+brought online as they are both present and possible::
+
+     kernel_max: 31
+        offline: 2,4-31,32-63
+         online: 0-1,3
+       possible: 0-31
+        present: 0-31
+
+In this example, the NR_CPUS config option is 128, but the kernel was
+started with possible_cpus=144.  There are 4 CPUs in the system and cpu2
+was manually taken offline (and is the only CPU that can be brought
+online.)::
+
+     kernel_max: 127
+        offline: 2,4-127,128-143
+         online: 0-1,3
+       possible: 0-127
+        present: 0-3
+
+See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter
+as well as more information on the various cpumasks.
diff --git a/Documentation/admin-guide/device-mapper/statistics.rst b/Documentation/admin-guide/device-mapper/statistics.rst
index 3d80a9f850cc..41ded0bc5933 100644
--- a/Documentation/admin-guide/device-mapper/statistics.rst
+++ b/Documentation/admin-guide/device-mapper/statistics.rst
@@ -13,7 +13,7 @@ the range specified.
 
 The I/O statistics counters for each step-sized area of a region are
 in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see:
-Documentation/iostats.txt).  But two extra counters (12 and 13) are
+Documentation/admin-guide/iostats.rst).  But two extra counters (12 and 13) are
 provided: total time spent reading and writing.  When the histogram
 argument is used, the 14th parameter is reported that represents the
 histogram of latencies.  All these counters may be accessed by sending
@@ -151,7 +151,7 @@ Messages
 	  The first 11 counters have the same meaning as
 	  `/sys/block/*/stat or /proc/diskstats`.
 
-	  Please refer to Documentation/iostats.txt for details.
+	  Please refer to Documentation/admin-guide/iostats.rst for details.
 
 	  1. the number of reads completed
 	  2. the number of reads merged
diff --git a/Documentation/admin-guide/efi-stub.rst b/Documentation/admin-guide/efi-stub.rst
new file mode 100644
index 000000000000..833edb0d0bc4
--- /dev/null
+++ b/Documentation/admin-guide/efi-stub.rst
@@ -0,0 +1,100 @@
+=================
+The EFI Boot Stub
+=================
+
+On the x86 and ARM platforms, a kernel zImage/bzImage can masquerade
+as a PE/COFF image, thereby convincing EFI firmware loaders to load
+it as an EFI executable. The code that modifies the bzImage header,
+along with the EFI-specific entry point that the firmware loader
+jumps to are collectively known as the "EFI boot stub", and live in
+arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c,
+respectively. For ARM the EFI stub is implemented in
+arch/arm/boot/compressed/efi-header.S and
+arch/arm/boot/compressed/efi-stub.c. EFI stub code that is shared
+between architectures is in drivers/firmware/efi/libstub.
+
+For arm64, there is no compressed kernel support, so the Image itself
+masquerades as a PE/COFF image and the EFI stub is linked into the
+kernel. The arm64 EFI stub lives in arch/arm64/kernel/efi-entry.S
+and drivers/firmware/efi/libstub/arm64-stub.c.
+
+By using the EFI boot stub it's possible to boot a Linux kernel
+without the use of a conventional EFI boot loader, such as grub or
+elilo. Since the EFI boot stub performs the jobs of a boot loader, in
+a certain sense it *IS* the boot loader.
+
+The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option.
+
+
+How to install bzImage.efi
+--------------------------
+
+The bzImage located in arch/x86/boot/bzImage must be copied to the EFI
+System Partition (ESP) and renamed with the extension ".efi". Without
+the extension the EFI firmware loader will refuse to execute it. It's
+not possible to execute bzImage.efi from the usual Linux file systems
+because EFI firmware doesn't have support for them. For ARM the
+arch/arm/boot/zImage should be copied to the system partition, and it
+may not need to be renamed. Similarly for arm64, arch/arm64/boot/Image
+should be copied but not necessarily renamed.
+
+
+Passing kernel parameters from the EFI shell
+--------------------------------------------
+
+Arguments to the kernel can be passed after bzImage.efi, e.g.::
+
+	fs0:> bzImage.efi console=ttyS0 root=/dev/sda4
+
+
+The "initrd=" option
+--------------------
+
+Like most boot loaders, the EFI stub allows the user to specify
+multiple initrd files using the "initrd=" option. This is the only EFI
+stub-specific command line parameter, everything else is passed to the
+kernel when it boots.
+
+The path to the initrd file must be an absolute path from the
+beginning of the ESP, relative path names do not work. Also, the path
+is an EFI-style path and directory elements must be separated with
+backslashes (\). For example, given the following directory layout::
+
+  fs0:>
+	Kernels\
+			bzImage.efi
+			initrd-large.img
+
+	Ramdisks\
+			initrd-small.img
+			initrd-medium.img
+
+to boot with the initrd-large.img file if the current working
+directory is fs0:\Kernels, the following command must be used::
+
+	fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img
+
+Notice how bzImage.efi can be specified with a relative path. That's
+because the image we're executing is interpreted by the EFI shell,
+which understands relative paths, whereas the rest of the command line
+is passed to bzImage.efi.
+
+
+The "dtb=" option
+-----------------
+
+For the ARM and arm64 architectures, a device tree must be provided to
+the kernel. Normally firmware shall supply the device tree via the
+EFI CONFIGURATION TABLE. However, the "dtb=" command line option can
+be used to override the firmware supplied device tree, or to supply
+one when firmware is unable to.
+
+Please note: Firmware adds runtime configuration information to the
+device tree before booting the kernel. If dtb= is used to override
+the device tree, then any runtime data provided by firmware will be
+lost. The dtb= option should only be used either as a debug tool, or
+as a last resort when a device tree is not provided in the EFI
+CONFIGURATION TABLE.
+
+"dtb=" is processed in the same manner as the "initrd=" option that is
+described above.
diff --git a/Documentation/admin-guide/highuid.rst b/Documentation/admin-guide/highuid.rst
new file mode 100644
index 000000000000..6ee70465c0ea
--- /dev/null
+++ b/Documentation/admin-guide/highuid.rst
@@ -0,0 +1,80 @@
+===================================================
+Notes on the change from 16-bit UIDs to 32-bit UIDs
+===================================================
+
+:Author: Chris Wing <wingc@umich.edu>
+:Last updated: January 11, 2000
+
+- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t
+  when communicating between user and kernel space in an ioctl or data
+  structure.
+
+- kernel code should use uid_t and gid_t in kernel-private structures and
+  code.
+
+What's left to be done for 32-bit UIDs on all Linux architectures:
+
+- Disk quotas have an interesting limitation that is not related to the
+  maximum UID/GID. They are limited by the maximum file size on the
+  underlying filesystem, because quota records are written at offsets
+  corresponding to the UID in question.
+  Further investigation is needed to see if the quota system can cope
+  properly with huge UIDs. If it can deal with 64-bit file offsets on all 
+  architectures, this should not be a problem.
+
+- Decide whether or not to keep backwards compatibility with the system
+  accounting file, or if we should break it as the comments suggest
+  (currently, the old 16-bit UID and GID are still written to disk, and
+  part of the former pad space is used to store separate 32-bit UID and
+  GID)
+
+- Need to validate that OS emulation calls the 16-bit UID
+  compatibility syscalls, if the OS being emulated used 16-bit UIDs, or
+  uses the 32-bit UID system calls properly otherwise.
+
+  This affects at least:
+
+	- iBCS on Intel
+
+	- sparc32 emulation on sparc64
+	  (need to support whatever new 32-bit UID system calls are added to
+	  sparc32)
+
+- Validate that all filesystems behave properly.
+
+  At present, 32-bit UIDs _should_ work for:
+
+	- ext2
+	- ufs
+	- isofs
+	- nfs
+	- coda
+	- udf
+
+  Ioctl() fixups have been made for:
+
+	- ncpfs
+	- smbfs
+
+  Filesystems with simple fixups to prevent 16-bit UID wraparound:
+
+	- minix
+	- sysv
+	- qnx4
+
+  Other filesystems have not been checked yet.
+
+- The ncpfs and smpfs filesystems cannot presently use 32-bit UIDs in
+  all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but
+  more are needed. (as well as new user<->kernel data structures)
+
+- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k,
+  sh, and sparc32. Fixing this is probably not that important, but would
+  require adding a new ELF section.
+
+- The ioctl()s used to control the in-kernel NFS server only support
+  16-bit UIDs on arm, i386, m68k, sh, and sparc32.
+
+- make sure that the UID mapping feature of AX25 networking works properly
+  (it should be safe because it's always used a 32-bit integer to
+  communicate between user and kernel)
diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst
index 656aee262e23..f83212fae4d5 100644
--- a/Documentation/admin-guide/hw-vuln/l1tf.rst
+++ b/Documentation/admin-guide/hw-vuln/l1tf.rst
@@ -241,7 +241,7 @@ Guest mitigation mechanisms
    For further information about confining guests to a single or to a group
    of cores consult the cpusets documentation:
 
-   https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.rst
+   https://www.kernel.org/doc/Documentation/admin-guide/cgroup-v1/cpusets.rst
 
 .. _interrupt_isolation:
 
diff --git a/Documentation/admin-guide/hw_random.rst b/Documentation/admin-guide/hw_random.rst
new file mode 100644
index 000000000000..121de96e395e
--- /dev/null
+++ b/Documentation/admin-guide/hw_random.rst
@@ -0,0 +1,105 @@
+==========================================================
+Linux support for random number generator in i8xx chipsets
+==========================================================
+
+Introduction
+============
+
+The hw_random framework is software that makes use of a
+special hardware feature on your CPU or motherboard,
+a Random Number Generator (RNG).  The software has two parts:
+a core providing the /dev/hwrng character device and its
+sysfs support, plus a hardware-specific driver that plugs
+into that core.
+
+To make the most effective use of these mechanisms, you
+should download the support software as well.  Download the
+latest version of the "rng-tools" package from the
+hw_random driver's official Web site:
+
+	http://sourceforge.net/projects/gkernel/
+
+Those tools use /dev/hwrng to fill the kernel entropy pool,
+which is used internally and exported by the /dev/urandom and
+/dev/random special files.
+
+Theory of operation
+===================
+
+CHARACTER DEVICE.  Using the standard open()
+and read() system calls, you can read random data from
+the hardware RNG device.  This data is NOT CHECKED by any
+fitness tests, and could potentially be bogus (if the
+hardware is faulty or has been tampered with).  Data is only
+output if the hardware "has-data" flag is set, but nevertheless
+a security-conscious person would run fitness tests on the
+data before assuming it is truly random.
+
+The rng-tools package uses such tests in "rngd", and lets you
+run them by hand with a "rngtest" utility.
+
+/dev/hwrng is char device major 10, minor 183.
+
+CLASS DEVICE.  There is a /sys/class/misc/hw_random node with
+two unique attributes, "rng_available" and "rng_current".  The
+"rng_available" attribute lists the hardware-specific drivers
+available, while "rng_current" lists the one which is currently
+connected to /dev/hwrng.  If your system has more than one
+RNG available, you may change the one used by writing a name from
+the list in "rng_available" into "rng_current".
+
+==========================================================================
+
+
+Hardware driver for Intel/AMD/VIA Random Number Generators (RNG)
+	- Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com>
+	- Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com>
+
+
+About the Intel RNG hardware, from the firmware hub datasheet
+=============================================================
+
+The Firmware Hub integrates a Random Number Generator (RNG)
+using thermal noise generated from inherently random quantum
+mechanical properties of silicon. When not generating new random
+bits the RNG circuitry will enter a low power state. Intel will
+provide a binary software driver to give third party software
+access to our RNG for use as a security feature. At this time,
+the RNG is only to be used with a system in an OS-present state.
+
+Intel RNG Driver notes
+======================
+
+FIXME: support poll(2)
+
+.. note::
+
+	request_mem_region was removed, for three reasons:
+
+	1) Only one RNG is supported by this driver;
+	2) The location used by the RNG is a fixed location in
+	   MMIO-addressable memory;
+	3) users with properly working BIOS e820 handling will always
+	   have the region in which the RNG is located reserved, so
+	   request_mem_region calls always fail for proper setups.
+	   However, for people who use mem=XX, BIOS e820 information is
+	   **not** in /proc/iomem, and request_mem_region(RNG_ADDR) can
+	   succeed.
+
+Driver details
+==============
+
+Based on:
+	Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet
+	May 1999 Order Number: 290658-002 R
+
+Intel 82802 Firmware Hub:
+	Random Number Generator
+	Programmer's Reference Manual
+	December 1999 Order Number: 298029-001 R
+
+Intel 82802 Firmware HUB Random Number Generator Driver
+	Copyright (c) 2000 Matt Sottek <msottek@quiknet.com>
+
+Special thanks to Matt Sottek.  I did the "guts", he
+did the "brains" and all the testing.
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index a5fdb1a846ce..4e98f5596da0 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -85,8 +85,25 @@ configure specific aspects of kernel behavior to your liking.
    perf-security
    acpi/index
    aoe/index
+   btmrvl
+   clearing-warn-once
+   cpu-load
+   cputopology
    device-mapper/index
+   efi-stub
+   highuid
+   hw_random
+   iostats
+   kernel-per-CPU-kthreads
    laptops/index
+   lcd-panel-cgram
+   ldm
+   lockup-watchdogs
+   numastat
+   pnp
+   rtc
+   svga
+   video-output
 
 .. only::  subproject and html
 
diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst
new file mode 100644
index 000000000000..5d63b18bd6d1
--- /dev/null
+++ b/Documentation/admin-guide/iostats.rst
@@ -0,0 +1,197 @@
+=====================
+I/O statistics fields
+=====================
+
+Since 2.4.20 (and some versions before, with patches), and 2.5.45,
+more extensive disk statistics have been introduced to help measure disk
+activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do
+the work for you, but in case you are interested in creating your own
+tools, the fields are explained here.
+
+In 2.4 now, the information is found as additional fields in
+``/proc/partitions``.  In 2.6 and upper, the same information is found in two
+places: one is in the file ``/proc/diskstats``, and the other is within
+the sysfs file system, which must be mounted in order to obtain
+the information. Throughout this document we'll assume that sysfs
+is mounted on ``/sys``, although of course it may be mounted anywhere.
+Both ``/proc/diskstats`` and sysfs use the same source for the information
+and so should not differ.
+
+Here are examples of these different formats::
+
+   2.4:
+      3     0   39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+      3     1    9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030
+
+   2.6+ sysfs:
+      446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+      35486    38030    38030    38030
+
+   2.6+ diskstats:
+      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+      3    1   hda1 35486 38030 38030 38030
+
+   4.18+ diskstats:
+      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
+
+On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
+a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
+
+The advantage of one over the other is that the sysfs choice works well
+if you are watching a known, small set of disks.  ``/proc/diskstats`` may
+be a better choice if you are watching a large number of disks because
+you'll avoid the overhead of 50, 100, or 500 or more opens/closes with
+each snapshot of your disk statistics.
+
+In 2.4, the statistics fields are those after the device name. In
+the above example, the first field of statistics would be 446216.
+By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll
+find just the eleven fields, beginning with 446216.  If you look at
+``/proc/diskstats``, the eleven fields will be preceded by the major and
+minor device numbers, and device name.  Each of these formats provides
+eleven fields of statistics, each meaning exactly the same things.
+All fields except field 9 are cumulative since boot.  Field 9 should
+go to zero as I/Os complete; all others only increase (unless they
+overflow and wrap).  Yes, these are (32-bit or 64-bit) unsigned long
+(native word size) numbers, and on a very busy or long-lived system they
+may wrap. Applications should be prepared to deal with that; unless
+your observations are measured in large numbers of minutes or hours,
+they should not wrap twice before you notice them.
+
+Each set of stats only applies to the indicated device; if you want
+system-wide stats you'll have to find all the devices and sum them all up.
+
+Field  1 -- # of reads completed
+    This is the total number of reads completed successfully.
+
+Field  2 -- # of reads merged, field 6 -- # of writes merged
+    Reads and writes which are adjacent to each other may be merged for
+    efficiency.  Thus two 4K reads may become one 8K read before it is
+    ultimately handed to the disk, and so it will be counted (and queued)
+    as only one I/O.  This field lets you know how often this was done.
+
+Field  3 -- # of sectors read
+    This is the total number of sectors read successfully.
+
+Field  4 -- # of milliseconds spent reading
+    This is the total number of milliseconds spent by all reads (as
+    measured from __make_request() to end_that_request_last()).
+
+Field  5 -- # of writes completed
+    This is the total number of writes completed successfully.
+
+Field  6 -- # of writes merged
+    See the description of field 2.
+
+Field  7 -- # of sectors written
+    This is the total number of sectors written successfully.
+
+Field  8 -- # of milliseconds spent writing
+    This is the total number of milliseconds spent by all writes (as
+    measured from __make_request() to end_that_request_last()).
+
+Field  9 -- # of I/Os currently in progress
+    The only field that should go to zero. Incremented as requests are
+    given to appropriate struct request_queue and decremented as they finish.
+
+Field 10 -- # of milliseconds spent doing I/Os
+    This field increases so long as field 9 is nonzero.
+
+    Since 5.0 this field counts jiffies when at least one request was
+    started or completed. If request runs more than 2 jiffies then some
+    I/O time will not be accounted unless there are other requests.
+
+Field 11 -- weighted # of milliseconds spent doing I/Os
+    This field is incremented at each I/O start, I/O completion, I/O
+    merge, or read of these stats by the number of I/Os in progress
+    (field 9) times the number of milliseconds spent doing I/O since the
+    last update of this field.  This can provide an easy measure of both
+    I/O completion time and the backlog that may be accumulating.
+
+Field 12 -- # of discards completed
+    This is the total number of discards completed successfully.
+
+Field 13 -- # of discards merged
+    See the description of field 2
+
+Field 14 -- # of sectors discarded
+    This is the total number of sectors discarded successfully.
+
+Field 15 -- # of milliseconds spent discarding
+    This is the total number of milliseconds spent by all discards (as
+    measured from __make_request() to end_that_request_last()).
+
+To avoid introducing performance bottlenecks, no locks are held while
+modifying these counters.  This implies that minor inaccuracies may be
+introduced when changes collide, so (for instance) adding up all the
+read I/Os issued per partition should equal those made to the disks ...
+but due to the lack of locking it may only be very close.
+
+In 2.6+, there are counters for each CPU, which make the lack of locking
+almost a non-issue.  When the statistics are read, the per-CPU counters
+are summed (possibly overflowing the unsigned long variable they are
+summed to) and the result given to the user.  There is no convenient
+user interface for accessing the per-CPU counters themselves.
+
+Disks vs Partitions
+-------------------
+
+There were significant changes between 2.4 and 2.6+ in the I/O subsystem.
+As a result, some statistic information disappeared. The translation from
+a disk address relative to a partition to the disk address relative to
+the host disk happens much earlier.  All merges and timings now happen
+at the disk level rather than at both the disk and partition level as
+in 2.4.  Consequently, you'll see a different statistics output on 2.6+ for
+partitions from that for disks.  There are only *four* fields available
+for partitions on 2.6+ machines.  This is reflected in the examples above.
+
+Field  1 -- # of reads issued
+    This is the total number of reads issued to this partition.
+
+Field  2 -- # of sectors read
+    This is the total number of sectors requested to be read from this
+    partition.
+
+Field  3 -- # of writes issued
+    This is the total number of writes issued to this partition.
+
+Field  4 -- # of sectors written
+    This is the total number of sectors requested to be written to
+    this partition.
+
+Note that since the address is translated to a disk-relative one, and no
+record of the partition-relative address is kept, the subsequent success
+or failure of the read cannot be attributed to the partition.  In other
+words, the number of reads for partitions is counted slightly before time
+of queuing for partitions, and at completion for whole disks.  This is
+a subtle distinction that is probably uninteresting for most cases.
+
+More significant is the error induced by counting the numbers of
+reads/writes before merges for partitions and after for disks. Since a
+typical workload usually contains a lot of successive and adjacent requests,
+the number of reads/writes issued can be several times higher than the
+number of reads/writes completed.
+
+In 2.6.25, the full statistic set is again available for partitions and
+disk and partition statistics are consistent again. Since we still don't
+keep record of the partition-relative address, an operation is attributed to
+the partition which contains the first sector of the request after the
+eventual merges. As requests can be merged across partition, this could lead
+to some (probably insignificant) inaccuracy.
+
+Additional notes
+----------------
+
+In 2.6+, sysfs is not mounted by default.  If your distribution of
+Linux hasn't added it already, here's the line you'll want to add to
+your ``/etc/fstab``::
+
+	none /sys sysfs defaults 0 0
+
+
+In 2.6+, all disk statistics were removed from ``/proc/stat``.  In 2.4, they
+appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in
+``/proc/stat`` take a very different format from those in ``/proc/partitions``
+(see proc(5), if your system has it.)
+
+-- ricklind@us.ibm.com
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a571a67e0c85..19b1e3bef56c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5066,7 +5066,7 @@
 
 	vga=		[BOOT,X86-32] Select a particular video mode
 			See Documentation/x86/boot.rst and
-			Documentation/svga.txt.
+			Documentation/admin-guide/svga.rst.
 			Use vga=ask for menu.
 			This is actually a boot loader parameter; the value is
 			passed to the kernel using a special protocol.
diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
new file mode 100644
index 000000000000..4f18456dd3b1
--- /dev/null
+++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
@@ -0,0 +1,356 @@
+==========================================
+Reducing OS jitter due to per-cpu kthreads
+==========================================
+
+This document lists per-CPU kthreads in the Linux kernel and presents
+options to control their OS jitter.  Note that non-per-CPU kthreads are
+not listed here.  To reduce OS jitter from non-per-CPU kthreads, bind
+them to a "housekeeping" CPU dedicated to such work.
+
+References
+==========
+
+-	Documentation/IRQ-affinity.txt:  Binding interrupts to sets of CPUs.
+
+-	Documentation/admin-guide/cgroup-v1:  Using cgroups to bind tasks to sets of CPUs.
+
+-	man taskset:  Using the taskset command to bind tasks to sets
+	of CPUs.
+
+-	man sched_setaffinity:  Using the sched_setaffinity() system
+	call to bind tasks to sets of CPUs.
+
+-	/sys/devices/system/cpu/cpuN/online:  Control CPU N's hotplug state,
+	writing "0" to offline and "1" to online.
+
+-	In order to locate kernel-generated OS jitter on CPU N:
+
+		cd /sys/kernel/debug/tracing
+		echo 1 > max_graph_depth # Increase the "1" for more detail
+		echo function_graph > current_tracer
+		# run workload
+		cat per_cpu/cpuN/trace
+
+kthreads
+========
+
+Name:
+  ehca_comp/%u
+
+Purpose:
+  Periodically process Infiniband-related work.
+
+To reduce its OS jitter, do any of the following:
+
+1.	Don't use eHCA Infiniband hardware, instead choosing hardware
+	that does not require per-CPU kthreads.  This will prevent these
+	kthreads from being created in the first place.  (This will
+	work for most people, as this hardware, though important, is
+	relatively old and is produced in relatively low unit volumes.)
+2.	Do all eHCA-Infiniband-related work on other CPUs, including
+	interrupts.
+3.	Rework the eHCA driver so that its per-CPU kthreads are
+	provisioned only on selected CPUs.
+
+
+Name:
+  irq/%d-%s
+
+Purpose:
+  Handle threaded interrupts.
+
+To reduce its OS jitter, do the following:
+
+1.	Use irq affinity to force the irq threads to execute on
+	some other CPU.
+
+Name:
+  kcmtpd_ctr_%d
+
+Purpose:
+  Handle Bluetooth work.
+
+To reduce its OS jitter, do one of the following:
+
+1.	Don't use Bluetooth, in which case these kthreads won't be
+	created in the first place.
+2.	Use irq affinity to force Bluetooth-related interrupts to
+	occur on some other CPU and furthermore initiate all
+	Bluetooth activity on some other CPU.
+
+Name:
+  ksoftirqd/%u
+
+Purpose:
+  Execute softirq handlers when threaded or when under heavy load.
+
+To reduce its OS jitter, each softirq vector must be handled
+separately as follows:
+
+TIMER_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1.	To the extent possible, keep the CPU out of the kernel when it
+	is non-idle, for example, by avoiding system calls and by forcing
+	both kernel threads and interrupts to execute elsewhere.
+2.	Build with CONFIG_HOTPLUG_CPU=y.  After boot completes, force
+	the CPU offline, then bring it back online.  This forces
+	recurring timers to migrate elsewhere.	If you are concerned
+	with multiple CPUs, force them all offline before bringing the
+	first one back online.  Once you have onlined the CPUs in question,
+	do not offline any other CPUs, because doing so could force the
+	timer back onto one of the CPUs in question.
+
+NET_TX_SOFTIRQ and NET_RX_SOFTIRQ
+---------------------------------
+
+Do all of the following:
+
+1.	Force networking interrupts onto other CPUs.
+2.	Initiate any network I/O on other CPUs.
+3.	Once your application has started, prevent CPU-hotplug operations
+	from being initiated from tasks that might run on the CPU to
+	be de-jittered.  (It is OK to force this CPU offline and then
+	bring it back online before you start your application.)
+
+BLOCK_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1.	Force block-device interrupts onto some other CPU.
+2.	Initiate any block I/O on other CPUs.
+3.	Once your application has started, prevent CPU-hotplug operations
+	from being initiated from tasks that might run on the CPU to
+	be de-jittered.  (It is OK to force this CPU offline and then
+	bring it back online before you start your application.)
+
+IRQ_POLL_SOFTIRQ
+----------------
+
+Do all of the following:
+
+1.	Force block-device interrupts onto some other CPU.
+2.	Initiate any block I/O and block-I/O polling on other CPUs.
+3.	Once your application has started, prevent CPU-hotplug operations
+	from being initiated from tasks that might run on the CPU to
+	be de-jittered.  (It is OK to force this CPU offline and then
+	bring it back online before you start your application.)
+
+TASKLET_SOFTIRQ
+---------------
+
+Do one or more of the following:
+
+1.	Avoid use of drivers that use tasklets.  (Such drivers will contain
+	calls to things like tasklet_schedule().)
+2.	Convert all drivers that you must use from tasklets to workqueues.
+3.	Force interrupts for drivers using tasklets onto other CPUs,
+	and also do I/O involving these drivers on other CPUs.
+
+SCHED_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1.	Avoid sending scheduler IPIs to the CPU to be de-jittered,
+	for example, ensure that at most one runnable kthread is present
+	on that CPU.  If a thread that expects to run on the de-jittered
+	CPU awakens, the scheduler will send an IPI that can result in
+	a subsequent SCHED_SOFTIRQ.
+2.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered
+	is marked as an adaptive-ticks CPU using the "nohz_full="
+	boot parameter.  This reduces the number of scheduler-clock
+	interrupts that the de-jittered CPU receives, minimizing its
+	chances of being selected to do the load balancing work that
+	runs in SCHED_SOFTIRQ context.
+3.	To the extent possible, keep the CPU out of the kernel when it
+	is non-idle, for example, by avoiding system calls and by
+	forcing both kernel threads and interrupts to execute elsewhere.
+	This further reduces the number of scheduler-clock interrupts
+	received by the de-jittered CPU.
+
+HRTIMER_SOFTIRQ
+---------------
+
+Do all of the following:
+
+1.	To the extent possible, keep the CPU out of the kernel when it
+	is non-idle.  For example, avoid system calls and force both
+	kernel threads and interrupts to execute elsewhere.
+2.	Build with CONFIG_HOTPLUG_CPU=y.  Once boot completes, force the
+	CPU offline, then bring it back online.  This forces recurring
+	timers to migrate elsewhere.  If you are concerned with multiple
+	CPUs, force them all offline before bringing the first one
+	back online.  Once you have onlined the CPUs in question, do not
+	offline any other CPUs, because doing so could force the timer
+	back onto one of the CPUs in question.
+
+RCU_SOFTIRQ
+-----------
+
+Do at least one of the following:
+
+1.	Offload callbacks and keep the CPU in either dyntick-idle or
+	adaptive-ticks state by doing all of the following:
+
+	a.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
+		de-jittered is marked as an adaptive-ticks CPU using the
+		"nohz_full=" boot parameter.  Bind the rcuo kthreads to
+		housekeeping CPUs, which can tolerate OS jitter.
+	b.	To the extent possible, keep the CPU out of the kernel
+		when it is non-idle, for example, by avoiding system
+		calls and by forcing both kernel threads and interrupts
+		to execute elsewhere.
+
+2.	Enable RCU to do its processing remotely via dyntick-idle by
+	doing all of the following:
+
+	a.	Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
+	b.	Ensure that the CPU goes idle frequently, allowing other
+		CPUs to detect that it has passed through an RCU quiescent
+		state.	If the kernel is built with CONFIG_NO_HZ_FULL=y,
+		userspace execution also allows other CPUs to detect that
+		the CPU in question has passed through a quiescent state.
+	c.	To the extent possible, keep the CPU out of the kernel
+		when it is non-idle, for example, by avoiding system
+		calls and by forcing both kernel threads and interrupts
+		to execute elsewhere.
+
+Name:
+  kworker/%u:%d%s (cpu, id, priority)
+
+Purpose:
+  Execute workqueue requests
+
+To reduce its OS jitter, do any of the following:
+
+1.	Run your workload at a real-time priority, which will allow
+	preempting the kworker daemons.
+2.	A given workqueue can be made visible in the sysfs filesystem
+	by passing the WQ_SYSFS to that workqueue's alloc_workqueue().
+	Such a workqueue can be confined to a given subset of the
+	CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
+	files.	The set of WQ_SYSFS workqueues can be displayed using
+	"ls sys/devices/virtual/workqueue".  That said, the workqueues
+	maintainer would like to caution people against indiscriminately
+	sprinkling WQ_SYSFS across all the workqueues.	The reason for
+	caution is that it is easy to add WQ_SYSFS, but because sysfs is
+	part of the formal user/kernel API, it can be nearly impossible
+	to remove it, even if its addition was a mistake.
+3.	Do any of the following needed to avoid jitter that your
+	application cannot tolerate:
+
+	a.	Build your kernel with CONFIG_SLUB=y rather than
+		CONFIG_SLAB=y, thus avoiding the slab allocator's periodic
+		use of each CPU's workqueues to run its cache_reap()
+		function.
+	b.	Avoid using oprofile, thus avoiding OS jitter from
+		wq_sync_buffer().
+	c.	Limit your CPU frequency so that a CPU-frequency
+		governor is not required, possibly enlisting the aid of
+		special heatsinks or other cooling technologies.  If done
+		correctly, and if you CPU architecture permits, you should
+		be able to build your kernel with CONFIG_CPU_FREQ=n to
+		avoid the CPU-frequency governor periodically running
+		on each CPU, including cs_dbs_timer() and od_dbs_timer().
+
+		WARNING:  Please check your CPU specifications to
+		make sure that this is safe on your particular system.
+	d.	As of v3.18, Christoph Lameter's on-demand vmstat workers
+		commit prevents OS jitter due to vmstat_update() on
+		CONFIG_SMP=y systems.  Before v3.18, is not possible
+		to entirely get rid of the OS jitter, but you can
+		decrease its frequency by writing a large value to
+		/proc/sys/vm/stat_interval.  The default value is HZ,
+		for an interval of one second.	Of course, larger values
+		will make your virtual-memory statistics update more
+		slowly.  Of course, you can also run your workload at
+		a real-time priority, thus preempting vmstat_update(),
+		but if your workload is CPU-bound, this is a bad idea.
+		However, there is an RFC patch from Christoph Lameter
+		(based on an earlier one from Gilad Ben-Yossef) that
+		reduces or even eliminates vmstat overhead for some
+		workloads at https://lkml.org/lkml/2013/9/4/379.
+	e.	Boot with "elevator=noop" to avoid workqueue use by
+		the block layer.
+	f.	If running on high-end powerpc servers, build with
+		CONFIG_PPC_RTAS_DAEMON=n.  This prevents the RTAS
+		daemon from running on each CPU every second or so.
+		(This will require editing Kconfig files and will defeat
+		this platform's RAS functionality.)  This avoids jitter
+		due to the rtas_event_scan() function.
+		WARNING:  Please check your CPU specifications to
+		make sure that this is safe on your particular system.
+	g.	If running on Cell Processor, build your kernel with
+		CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
+		spu_gov_work().
+		WARNING:  Please check your CPU specifications to
+		make sure that this is safe on your particular system.
+	h.	If running on PowerMAC, build your kernel with
+		CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
+		avoiding OS jitter from rackmeter_do_timer().
+
+Name:
+  rcuc/%u
+
+Purpose:
+  Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
+
+To reduce its OS jitter, do at least one of the following:
+
+1.	Build the kernel with CONFIG_PREEMPT=n.  This prevents these
+	kthreads from being created in the first place, and also obviates
+	the need for RCU priority boosting.  This approach is feasible
+	for workloads that do not require high degrees of responsiveness.
+2.	Build the kernel with CONFIG_RCU_BOOST=n.  This prevents these
+	kthreads from being created in the first place.  This approach
+	is feasible only if your workload never requires RCU priority
+	boosting, for example, if you ensure frequent idle time on all
+	CPUs that might execute within the kernel.
+3.	Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs=
+	boot parameter offloading RCU callbacks from all CPUs susceptible
+	to OS jitter.  This approach prevents the rcuc/%u kthreads from
+	having any work to do, so that they are never awakened.
+4.	Ensure that the CPU never enters the kernel, and, in particular,
+	avoid initiating any CPU hotplug operations on this CPU.  This is
+	another way of preventing any callbacks from being queued on the
+	CPU, again preventing the rcuc/%u kthreads from having any work
+	to do.
+
+Name:
+  rcuop/%d and rcuos/%d
+
+Purpose:
+  Offload RCU callbacks from the corresponding CPU.
+
+To reduce its OS jitter, do at least one of the following:
+
+1.	Use affinity, cgroups, or other mechanism to force these kthreads
+	to execute on some other CPU.
+2.	Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these
+	kthreads from being created in the first place.  However, please
+	note that this will not eliminate OS jitter, but will instead
+	shift it to RCU_SOFTIRQ.
+
+Name:
+  watchdog/%u
+
+Purpose:
+  Detect software lockups on each CPU.
+
+To reduce its OS jitter, do at least one of the following:
+
+1.	Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
+	kthreads from being created in the first place.
+2.	Boot with "nosoftlockup=0", which will also prevent these kthreads
+	from being created.  Other related watchdog and softlockup boot
+	parameters may be found in Documentation/admin-guide/kernel-parameters.rst
+	and Documentation/watchdog/watchdog-parameters.rst.
+3.	Echo a zero to /proc/sys/kernel/watchdog to disable the
+	watchdog timer.
+4.	Echo a large number of /proc/sys/kernel/watchdog_thresh in
+	order to reduce the frequency of OS jitter due to the watchdog
+	timer down to a level that is acceptable for your workload.
diff --git a/Documentation/admin-guide/lcd-panel-cgram.rst b/Documentation/admin-guide/lcd-panel-cgram.rst
new file mode 100644
index 000000000000..a3eb00c62f53
--- /dev/null
+++ b/Documentation/admin-guide/lcd-panel-cgram.rst
@@ -0,0 +1,27 @@
+======================================
+Parallel port LCD/Keypad Panel support
+======================================
+
+Some LCDs allow you to define up to 8 characters, mapped to ASCII
+characters 0 to 7. The escape code to define a new character is
+'\e[LG' followed by one digit from 0 to 7, representing the character
+number, and up to 8 couples of hex digits terminated by a semi-colon
+(';'). Each couple of digits represents a line, with 1-bits for each
+illuminated pixel with LSB on the right. Lines are numbered from the
+top of the character to the bottom. On a 5x7 matrix, only the 5 lower
+bits of the 7 first bytes are used for each character. If the string
+is incomplete, only complete lines will be redefined. Here are some
+examples::
+
+  printf "\e[LG0010101050D1F0C04;"  => 0 = [enter]
+  printf "\e[LG1040E1F0000000000;"  => 1 = [up]
+  printf "\e[LG2000000001F0E0400;"  => 2 = [down]
+  printf "\e[LG3040E1F001F0E0400;"  => 3 = [up-down]
+  printf "\e[LG40002060E1E0E0602;"  => 4 = [left]
+  printf "\e[LG500080C0E0F0E0C08;"  => 5 = [right]
+  printf "\e[LG60016051516141400;"  => 6 = "IP"
+
+  printf "\e[LG00103071F1F070301;"  => big speaker
+  printf "\e[LG00002061E1E060200;"  => small speaker
+
+Willy
diff --git a/Documentation/admin-guide/ldm.rst b/Documentation/admin-guide/ldm.rst
new file mode 100644
index 000000000000..12c571368e73
--- /dev/null
+++ b/Documentation/admin-guide/ldm.rst
@@ -0,0 +1,121 @@
+==========================================
+LDM - Logical Disk Manager (Dynamic Disks)
+==========================================
+
+:Author: Originally Written by FlatCap - Richard Russon <ldm@flatcap.org>.
+:Last Updated: Anton Altaparmakov on 30 March 2007 for Windows Vista.
+
+Overview
+--------
+
+Windows 2000, XP, and Vista use a new partitioning scheme.  It is a complete
+replacement for the MSDOS style partitions.  It stores its information in a
+1MiB journalled database at the end of the physical disk.  The size of
+partitions is limited only by disk space.  The maximum number of partitions is
+nearly 2000.
+
+Any partitions created under the LDM are called "Dynamic Disks".  There are no
+longer any primary or extended partitions.  Normal MSDOS style partitions are
+now known as Basic Disks.
+
+If you wish to use Spanned, Striped, Mirrored or RAID 5 Volumes, you must use
+Dynamic Disks.  The journalling allows Windows to make changes to these
+partitions and filesystems without the need to reboot.
+
+Once the LDM driver has divided up the disk, you can use the MD driver to
+assemble any multi-partition volumes, e.g.  Stripes, RAID5.
+
+To prevent legacy applications from repartitioning the disk, the LDM creates a
+dummy MSDOS partition containing one disk-sized partition.  This is what is
+supported with the Linux LDM driver.
+
+A newer approach that has been implemented with Vista is to put LDM on top of a
+GPT label disk.  This is not supported by the Linux LDM driver yet.
+
+
+Example
+-------
+
+Below we have a 50MiB disk, divided into seven partitions.
+
+.. note::
+
+   The missing 1MiB at the end of the disk is where the LDM database is
+   stored.
+
++-------++--------------+---------+-----++--------------+---------+----+
+|Device || Offset Bytes | Sectors | MiB || Size   Bytes | Sectors | MiB|
++=======++==============+=========+=====++==============+=========+====+
+|hda    ||            0 |       0 |   0 ||     52428800 |  102400 |  50|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda1   ||     51380224 |  100352 |  49 ||      1048576 |    2048 |   1|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda2   ||        16384 |      32 |   0 ||      6979584 |   13632 |   6|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda3   ||      6995968 |   13664 |   6 ||     10485760 |   20480 |  10|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda4   ||     17481728 |   34144 |  16 ||      4194304 |    8192 |   4|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda5   ||     21676032 |   42336 |  20 ||      5242880 |   10240 |   5|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda6   ||     26918912 |   52576 |  25 ||     10485760 |   20480 |  10|
++-------++--------------+---------+-----++--------------+---------+----+
+|hda7   ||     37404672 |   73056 |  35 ||     13959168 |   27264 |  13|
++-------++--------------+---------+-----++--------------+---------+----+
+
+The LDM Database may not store the partitions in the order that they appear on
+disk, but the driver will sort them.
+
+When Linux boots, you will see something like::
+
+  hda: 102400 sectors w/32KiB Cache, CHS=50/64/32
+  hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7
+
+
+Compiling LDM Support
+---------------------
+
+To enable LDM, choose the following two options: 
+
+  - "Advanced partition selection" CONFIG_PARTITION_ADVANCED
+  - "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION
+
+If you believe the driver isn't working as it should, you can enable the extra
+debugging code.  This will produce a LOT of output.  The option is:
+
+  - "Windows LDM extra logging" CONFIG_LDM_DEBUG
+
+N.B. The partition code cannot be compiled as a module.
+
+As with all the partition code, if the driver doesn't see signs of its type of
+partition, it will pass control to another driver, so there is no harm in
+enabling it.
+
+If you have Dynamic Disks but don't enable the driver, then all you will see
+is a dummy MSDOS partition filling the whole disk.  You won't be able to mount
+any of the volumes on the disk.
+
+
+Booting
+-------
+
+If you enable LDM support, then lilo is capable of booting from any of the
+discovered partitions.  However, grub does not understand the LDM partitioning
+and cannot boot from a Dynamic Disk.
+
+
+More Documentation
+------------------
+
+There is an Overview of the LDM together with complete Technical Documentation.
+It is available for download.
+
+  http://www.linux-ntfs.org/
+
+If you have any LDM questions that aren't answered in the documentation, email
+me.
+
+Cheers,
+    FlatCap - Richard Russon
+    ldm@flatcap.org
+
diff --git a/Documentation/admin-guide/lockup-watchdogs.rst b/Documentation/admin-guide/lockup-watchdogs.rst
new file mode 100644
index 000000000000..290840c160af
--- /dev/null
+++ b/Documentation/admin-guide/lockup-watchdogs.rst
@@ -0,0 +1,83 @@
+===============================================================
+Softlockup detector and hardlockup detector (aka nmi_watchdog)
+===============================================================
+
+The Linux kernel can act as a watchdog to detect both soft and hard
+lockups.
+
+A 'softlockup' is defined as a bug that causes the kernel to loop in
+kernel mode for more than 20 seconds (see "Implementation" below for
+details), without giving other tasks a chance to run. The current
+stack trace is displayed upon detection and, by default, the system
+will stay locked up. Alternatively, the kernel can be configured to
+panic; a sysctl, "kernel.softlockup_panic", a kernel parameter,
+"softlockup_panic" (see "Documentation/admin-guide/kernel-parameters.rst" for
+details), and a compile option, "BOOTPARAM_SOFTLOCKUP_PANIC", are
+provided for this.
+
+A 'hardlockup' is defined as a bug that causes the CPU to loop in
+kernel mode for more than 10 seconds (see "Implementation" below for
+details), without letting other interrupts have a chance to run.
+Similarly to the softlockup case, the current stack trace is displayed
+upon detection and the system will stay locked up unless the default
+behavior is changed, which can be done through a sysctl,
+'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC",
+and a kernel parameter, "nmi_watchdog"
+(see "Documentation/admin-guide/kernel-parameters.rst" for details).
+
+The panic option can be used in combination with panic_timeout (this
+timeout is set through the confusingly named "kernel.panic" sysctl),
+to cause the system to reboot automatically after a specified amount
+of time.
+
+Implementation
+==============
+
+The soft and hard lockup detectors are built on top of the hrtimer and
+perf subsystems, respectively. A direct consequence of this is that,
+in principle, they should work in any architecture where these
+subsystems are present.
+
+A periodic hrtimer runs to generate interrupts and kick the watchdog
+task. An NMI perf event is generated every "watchdog_thresh"
+(compile-time initialized to 10 and configurable through sysctl of the
+same name) seconds to check for hardlockups. If any CPU in the system
+does not receive any hrtimer interrupt during that time the
+'hardlockup detector' (the handler for the NMI perf event) will
+generate a kernel warning or call panic, depending on the
+configuration.
+
+The watchdog task is a high priority kernel thread that updates a
+timestamp every time it is scheduled. If that timestamp is not updated
+for 2*watchdog_thresh seconds (the softlockup threshold) the
+'softlockup detector' (coded inside the hrtimer callback function)
+will dump useful debug information to the system log, after which it
+will call panic if it was instructed to do so or resume execution of
+other kernel code.
+
+The period of the hrtimer is 2*watchdog_thresh/5, which means it has
+two or three chances to generate an interrupt before the hardlockup
+detector kicks in.
+
+As explained above, a kernel knob is provided that allows
+administrators to configure the period of the hrtimer and the perf
+event. The right value for a particular environment is a trade-off
+between fast response to lockups and detection overhead.
+
+By default, the watchdog runs on all online cores.  However, on a
+kernel configured with NO_HZ_FULL, by default the watchdog runs only
+on the housekeeping cores, not the cores specified in the "nohz_full"
+boot argument.  If we allowed the watchdog to run by default on
+the "nohz_full" cores, we would have to run timer ticks to activate
+the scheduler, which would prevent the "nohz_full" functionality
+from protecting the user code on those cores from the kernel.
+Of course, disabling it by default on the nohz_full cores means that
+when those cores do enter the kernel, by default we will not be
+able to detect if they lock up.  However, allowing the watchdog
+to continue to run on the housekeeping (non-tickless) cores means
+that we will continue to detect lockups properly on those cores.
+
+In either case, the set of cores excluded from running the watchdog
+may be adjusted via the kernel.watchdog_cpumask sysctl.  For
+nohz_full cores, this may be useful for debugging a case where the
+kernel seems to be hanging on the nohz_full cores.
diff --git a/Documentation/admin-guide/mm/cma_debugfs.rst b/Documentation/admin-guide/mm/cma_debugfs.rst
new file mode 100644
index 000000000000..4e06ffabd78a
--- /dev/null
+++ b/Documentation/admin-guide/mm/cma_debugfs.rst
@@ -0,0 +1,25 @@
+=====================
+CMA Debugfs Interface
+=====================
+
+The CMA debugfs interface is useful to retrieve basic information out of the
+different CMA areas and to test allocation/release in each of the areas.
+
+Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
+kernel's CMA index. So the first CMA zone would be:
+
+	<debugfs>/cma/cma-0
+
+The structure of the files created under that directory is as follows:
+
+ - [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
+ - [RO] count: Amount of memory in the CMA area.
+ - [RO] order_per_bit: Order of pages represented by one bit.
+ - [RO] bitmap: The bitmap of page states in the zone.
+ - [WO] alloc: Allocate N pages from that CMA area. For example::
+
+	echo 5 > <debugfs>/cma/cma-2/alloc
+
+would try to allocate 5 pages from the cma-2 area.
+
+ - [WO] free: Free N pages from that CMA area, similar to the above.
diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
index 5f61a6c429e0..11db46448354 100644
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -26,6 +26,7 @@ the Linux memory management.
    :maxdepth: 1
 
    concepts
+   cma_debugfs
    hugetlbpage
    idle_page_tracking
    ksm
diff --git a/Documentation/admin-guide/numastat.rst b/Documentation/admin-guide/numastat.rst
new file mode 100644
index 000000000000..aaf1667489f8
--- /dev/null
+++ b/Documentation/admin-guide/numastat.rst
@@ -0,0 +1,30 @@
+===============================
+Numa policy hit/miss statistics
+===============================
+
+/sys/devices/system/node/node*/numastat
+
+All units are pages. Hugepages have separate counters.
+
+=============== ============================================================
+numa_hit	A process wanted to allocate memory from this node,
+		and succeeded.
+
+numa_miss	A process wanted to allocate memory from another node,
+		but ended up with memory from this node.
+
+numa_foreign	A process wanted to allocate on this node,
+		but ended up with memory from another one.
+
+local_node	A process ran on this node and got memory from it.
+
+other_node	A process ran on this node and got memory from another node.
+
+interleave_hit 	Interleaving wanted to allocate from this node
+		and succeeded.
+=============== ============================================================
+
+For easier reading you can use the numastat utility from the numactl package
+(http://oss.sgi.com/projects/libnuma/). Note that it only works
+well right now on machines with a small number of CPUs.
+
diff --git a/Documentation/admin-guide/pnp.rst b/Documentation/admin-guide/pnp.rst
new file mode 100644
index 000000000000..bab2d10631f0
--- /dev/null
+++ b/Documentation/admin-guide/pnp.rst
@@ -0,0 +1,292 @@
+=================================
+Linux Plug and Play Documentation
+=================================
+
+:Author: Adam Belay <ambx1@neo.rr.com>
+:Last updated: Oct. 16, 2002
+
+
+Overview
+--------
+
+Plug and Play provides a means of detecting and setting resources for legacy or
+otherwise unconfigurable devices.  The Linux Plug and Play Layer provides these 
+services to compatible drivers.
+
+
+The User Interface
+------------------
+
+The Linux Plug and Play user interface provides a means to activate PnP devices
+for legacy and user level drivers that do not support Linux Plug and Play.  The 
+user interface is integrated into sysfs.
+
+In addition to the standard sysfs file the following are created in each
+device's directory:
+- id - displays a list of support EISA IDs
+- options - displays possible resource configurations
+- resources - displays currently allocated resources and allows resource changes
+
+activating a device
+^^^^^^^^^^^^^^^^^^^
+
+::
+
+	# echo "auto" > resources
+
+this will invoke the automatic resource config system to activate the device
+
+manually activating a device
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+	# echo "manual <depnum> <mode>" > resources
+
+	<depnum> - the configuration number
+	<mode> - static or dynamic
+		 static = for next boot
+		 dynamic = now
+
+disabling a device
+^^^^^^^^^^^^^^^^^^
+
+::
+
+	# echo "disable" > resources
+
+
+EXAMPLE:
+
+Suppose you need to activate the floppy disk controller.
+
+1. change to the proper directory, in my case it is
+   /driver/bus/pnp/devices/00:0f::
+
+	# cd /driver/bus/pnp/devices/00:0f
+	# cat name
+	PC standard floppy disk controller
+
+2. check if the device is already active::
+
+	# cat resources
+	DISABLED
+
+  - Notice the string "DISABLED".  This means the device is not active.
+
+3. check the device's possible configurations (optional)::
+
+	# cat options
+	Dependent: 01 - Priority acceptable
+	    port 0x3f0-0x3f0, align 0x7, size 0x6, 16-bit address decoding
+	    port 0x3f7-0x3f7, align 0x0, size 0x1, 16-bit address decoding
+	    irq 6
+	    dma 2 8-bit compatible
+	Dependent: 02 - Priority acceptable
+	    port 0x370-0x370, align 0x7, size 0x6, 16-bit address decoding
+	    port 0x377-0x377, align 0x0, size 0x1, 16-bit address decoding
+	    irq 6
+	    dma 2 8-bit compatible
+
+4. now activate the device::
+
+	# echo "auto" > resources
+
+5. finally check if the device is active::
+
+	# cat resources
+	io 0x3f0-0x3f5
+	io 0x3f7-0x3f7
+	irq 6
+	dma 2
+
+also there are a series of kernel parameters::
+
+	pnp_reserve_irq=irq1[,irq2] ....
+	pnp_reserve_dma=dma1[,dma2] ....
+	pnp_reserve_io=io1,size1[,io2,size2] ....
+	pnp_reserve_mem=mem1,size1[,mem2,size2] ....
+
+
+
+The Unified Plug and Play Layer
+-------------------------------
+
+All Plug and Play drivers, protocols, and services meet at a central location
+called the Plug and Play Layer.  This layer is responsible for the exchange of 
+information between PnP drivers and PnP protocols.  Thus it automatically 
+forwards commands to the proper protocol.  This makes writing PnP drivers 
+significantly easier.
+
+The following functions are available from the Plug and Play Layer:
+
+pnp_get_protocol
+  increments the number of uses by one
+
+pnp_put_protocol
+  deincrements the number of uses by one
+
+pnp_register_protocol
+  use this to register a new PnP protocol
+
+pnp_unregister_protocol
+  use this function to remove a PnP protocol from the Plug and Play Layer
+
+pnp_register_driver
+  adds a PnP driver to the Plug and Play Layer
+
+  this includes driver model integration
+  returns zero for success or a negative error number for failure; count
+  calls to the .add() method if you need to know how many devices bind to
+  the driver
+
+pnp_unregister_driver
+  removes a PnP driver from the Plug and Play Layer
+
+
+
+Plug and Play Protocols
+-----------------------
+
+This section contains information for PnP protocol developers.
+
+The following Protocols are currently available in the computing world:
+
+- PNPBIOS:
+    used for system devices such as serial and parallel ports.
+- ISAPNP:
+    provides PnP support for the ISA bus
+- ACPI:
+    among its many uses, ACPI provides information about system level
+    devices.
+
+It is meant to replace the PNPBIOS.  It is not currently supported by Linux
+Plug and Play but it is planned to be in the near future.
+
+
+Requirements for a Linux PnP protocol:
+1. the protocol must use EISA IDs
+2. the protocol must inform the PnP Layer of a device's current configuration
+
+- the ability to set resources is optional but preferred.
+
+The following are PnP protocol related functions:
+
+pnp_add_device
+  use this function to add a PnP device to the PnP layer
+
+  only call this function when all wanted values are set in the pnp_dev
+  structure
+
+pnp_init_device
+  call this to initialize the PnP structure
+
+pnp_remove_device
+  call this to remove a device from the Plug and Play Layer.
+  it will fail if the device is still in use.
+  automatically will free mem used by the device and related structures
+
+pnp_add_id
+  adds an EISA ID to the list of supported IDs for the specified device
+
+For more information consult the source of a protocol such as
+/drivers/pnp/pnpbios/core.c.
+
+
+
+Linux Plug and Play Drivers
+---------------------------
+
+This section contains information for Linux PnP driver developers.
+
+The New Way
+^^^^^^^^^^^
+
+1. first make a list of supported EISA IDS
+
+   ex::
+
+	static const struct pnp_id pnp_dev_table[] = {
+		/* Standard LPT Printer Port */
+		{.id = "PNP0400", .driver_data = 0},
+		/* ECP Printer Port */
+		{.id = "PNP0401", .driver_data = 0},
+		{.id = ""}
+	};
+
+   Please note that the character 'X' can be used as a wild card in the function
+   portion (last four characters).
+
+   ex::
+
+	/* Unknown PnP modems */
+	{	"PNPCXXX",		UNKNOWN_DEV	},
+
+   Supported PnP card IDs can optionally be defined.
+   ex::
+
+	static const struct pnp_id pnp_card_table[] = {
+		{	"ANYDEVS",		0	},
+		{	"",			0	}
+	};
+
+2. Optionally define probe and remove functions.  It may make sense not to
+   define these functions if the driver already has a reliable method of detecting
+   the resources, such as the parport_pc driver.
+
+   ex::
+
+	static int
+	serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const
+			struct pnp_id *dev_id)
+	{
+	. . .
+
+   ex::
+
+	static void serial_pnp_remove(struct pnp_dev * dev)
+	{
+	. . .
+
+   consult /drivers/serial/8250_pnp.c for more information.
+
+3. create a driver structure
+
+   ex::
+
+	static struct pnp_driver serial_pnp_driver = {
+		.name		= "serial",
+		.card_id_table	= pnp_card_table,
+		.id_table	= pnp_dev_table,
+		.probe		= serial_pnp_probe,
+		.remove		= serial_pnp_remove,
+	};
+
+   * name and id_table cannot be NULL.
+
+4. register the driver
+
+   ex::
+
+	static int __init serial8250_pnp_init(void)
+	{
+		return pnp_register_driver(&serial_pnp_driver);
+	}
+
+The Old Way
+^^^^^^^^^^^
+
+A series of compatibility functions have been created to make it easy to convert
+ISAPNP drivers.  They should serve as a temporary solution only.
+
+They are as follows::
+
+	struct pnp_card *pnp_find_card(unsigned short vendor,
+				       unsigned short device,
+				       struct pnp_card *from)
+
+	struct pnp_dev *pnp_find_dev(struct pnp_card *card,
+				     unsigned short vendor,
+				     unsigned short function,
+				     struct pnp_dev *from)
+
diff --git a/Documentation/admin-guide/rtc.rst b/Documentation/admin-guide/rtc.rst
new file mode 100644
index 000000000000..688c95b11919
--- /dev/null
+++ b/Documentation/admin-guide/rtc.rst
@@ -0,0 +1,140 @@
+=======================================
+Real Time Clock (RTC) Drivers for Linux
+=======================================
+
+When Linux developers talk about a "Real Time Clock", they usually mean
+something that tracks wall clock time and is battery backed so that it
+works even with system power off.  Such clocks will normally not track
+the local time zone or daylight savings time -- unless they dual boot
+with MS-Windows -- but will instead be set to Coordinated Universal Time
+(UTC, formerly "Greenwich Mean Time").
+
+The newest non-PC hardware tends to just count seconds, like the time(2)
+system call reports, but RTCs also very commonly represent time using
+the Gregorian calendar and 24 hour time, as reported by gmtime(3).
+
+Linux has two largely-compatible userspace RTC API families you may
+need to know about:
+
+    *	/dev/rtc ... is the RTC provided by PC compatible systems,
+	so it's not very portable to non-x86 systems.
+
+    *	/dev/rtc0, /dev/rtc1 ... are part of a framework that's
+	supported by a wide variety of RTC chips on all systems.
+
+Programmers need to understand that the PC/AT functionality is not
+always available, and some systems can do much more.  That is, the
+RTCs use the same API to make requests in both RTC frameworks (using
+different filenames of course), but the hardware may not offer the
+same functionality.  For example, not every RTC is hooked up to an
+IRQ, so they can't all issue alarms; and where standard PC RTCs can
+only issue an alarm up to 24 hours in the future, other hardware may
+be able to schedule one any time in the upcoming century.
+
+
+Old PC/AT-Compatible driver:  /dev/rtc
+--------------------------------------
+
+All PCs (even Alpha machines) have a Real Time Clock built into them.
+Usually they are built into the chipset of the computer, but some may
+actually have a Motorola MC146818 (or clone) on the board. This is the
+clock that keeps the date and time while your computer is turned off.
+
+ACPI has standardized that MC146818 functionality, and extended it in
+a few ways (enabling longer alarm periods, and wake-from-hibernate).
+That functionality is NOT exposed in the old driver.
+
+However it can also be used to generate signals from a slow 2Hz to a
+relatively fast 8192Hz, in increments of powers of two. These signals
+are reported by interrupt number 8. (Oh! So *that* is what IRQ 8 is
+for...) It can also function as a 24hr alarm, raising IRQ 8 when the
+alarm goes off. The alarm can also be programmed to only check any
+subset of the three programmable values, meaning that it could be set to
+ring on the 30th second of the 30th minute of every hour, for example.
+The clock can also be set to generate an interrupt upon every clock
+update, thus generating a 1Hz signal.
+
+The interrupts are reported via /dev/rtc (major 10, minor 135, read only
+character device) in the form of an unsigned long. The low byte contains
+the type of interrupt (update-done, alarm-rang, or periodic) that was
+raised, and the remaining bytes contain the number of interrupts since
+the last read.  Status information is reported through the pseudo-file
+/proc/driver/rtc if the /proc filesystem was enabled.  The driver has
+built in locking so that only one process is allowed to have the /dev/rtc
+interface open at a time.
+
+A user process can monitor these interrupts by doing a read(2) or a
+select(2) on /dev/rtc -- either will block/stop the user process until
+the next interrupt is received. This is useful for things like
+reasonably high frequency data acquisition where one doesn't want to
+burn up 100% CPU by polling gettimeofday etc. etc.
+
+At high frequencies, or under high loads, the user process should check
+the number of interrupts received since the last read to determine if
+there has been any interrupt "pileup" so to speak. Just for reference, a
+typical 486-33 running a tight read loop on /dev/rtc will start to suffer
+occasional interrupt pileup (i.e. > 1 IRQ event since last read) for
+frequencies above 1024Hz. So you really should check the high bytes
+of the value you read, especially at frequencies above that of the
+normal timer interrupt, which is 100Hz.
+
+Programming and/or enabling interrupt frequencies greater than 64Hz is
+only allowed by root. This is perhaps a bit conservative, but we don't want
+an evil user generating lots of IRQs on a slow 386sx-16, where it might have
+a negative impact on performance. This 64Hz limit can be changed by writing
+a different value to /proc/sys/dev/rtc/max-user-freq. Note that the
+interrupt handler is only a few lines of code to minimize any possibility
+of this effect.
+
+Also, if the kernel time is synchronized with an external source, the 
+kernel will write the time back to the CMOS clock every 11 minutes. In 
+the process of doing this, the kernel briefly turns off RTC periodic 
+interrupts, so be aware of this if you are doing serious work. If you
+don't synchronize the kernel time with an external source (via ntp or
+whatever) then the kernel will keep its hands off the RTC, allowing you
+exclusive access to the device for your applications.
+
+The alarm and/or interrupt frequency are programmed into the RTC via
+various ioctl(2) calls as listed in ./include/linux/rtc.h
+Rather than write 50 pages describing the ioctl() and so on, it is
+perhaps more useful to include a small test program that demonstrates
+how to use them, and demonstrates the features of the driver. This is
+probably a lot more useful to people interested in writing applications
+that will be using this driver.  See the code at the end of this document.
+
+(The original /dev/rtc driver was written by Paul Gortmaker.)
+
+
+New portable "RTC Class" drivers:  /dev/rtcN
+--------------------------------------------
+
+Because Linux supports many non-ACPI and non-PC platforms, some of which
+have more than one RTC style clock, it needed a more portable solution
+than expecting a single battery-backed MC146818 clone on every system.
+Accordingly, a new "RTC Class" framework has been defined.  It offers
+three different userspace interfaces:
+
+    *	/dev/rtcN ... much the same as the older /dev/rtc interface
+
+    *	/sys/class/rtc/rtcN ... sysfs attributes support readonly
+	access to some RTC attributes.
+
+    *	/proc/driver/rtc ... the system clock RTC may expose itself
+	using a procfs interface. If there is no RTC for the system clock,
+	rtc0 is used by default. More information is (currently) shown
+	here than through sysfs.
+
+The RTC Class framework supports a wide variety of RTCs, ranging from those
+integrated into embeddable system-on-chip (SOC) processors to discrete chips
+using I2C, SPI, or some other bus to communicate with the host CPU.  There's
+even support for PC-style RTCs ... including the features exposed on newer PCs
+through ACPI.
+
+The new framework also removes the "one RTC per system" restriction.  For
+example, maybe the low-power battery-backed RTC is a discrete I2C chip, but
+a high functionality RTC is integrated into the SOC.  That system might read
+the system clock from the discrete RTC, but use the integrated one for all
+other tasks, because of its greater functionality.
+
+Check out tools/testing/selftests/rtc/rtctest.c for an example usage of the
+ioctl interface.
diff --git a/Documentation/admin-guide/svga.rst b/Documentation/admin-guide/svga.rst
new file mode 100644
index 000000000000..b6c2f9acca92
--- /dev/null
+++ b/Documentation/admin-guide/svga.rst
@@ -0,0 +1,249 @@
+.. include:: <isonum.txt>
+
+=================================
+Video Mode Selection Support 2.13
+=================================
+
+:Copyright: |copy| 1995--1999 Martin Mares, <mj@ucw.cz>
+
+Intro
+~~~~~
+
+This small document describes the "Video Mode Selection" feature which
+allows the use of various special video modes supported by the video BIOS. Due
+to usage of the BIOS, the selection is limited to boot time (before the
+kernel decompression starts) and works only on 80X86 machines.
+
+.. note::
+
+   Short intro for the impatient: Just use vga=ask for the first time,
+   enter ``scan`` on the video mode prompt, pick the mode you want to use,
+   remember its mode ID (the four-digit hexadecimal number) and then
+   set the vga parameter to this number (converted to decimal first).
+
+The video mode to be used is selected by a kernel parameter which can be
+specified in the kernel Makefile (the SVGA_MODE=... line) or by the "vga=..."
+option of LILO (or some other boot loader you use) or by the "vidmode" utility
+(present in standard Linux utility packages). You can use the following values
+of this parameter::
+
+   NORMAL_VGA - Standard 80x25 mode available on all display adapters.
+
+   EXTENDED_VGA	- Standard 8-pixel font mode: 80x43 on EGA, 80x50 on VGA.
+
+   ASK_VGA - Display a video mode menu upon startup (see below).
+
+   0..35 - Menu item number (when you have used the menu to view the list of
+      modes available on your adapter, you can specify the menu item you want
+      to use). 0..9 correspond to "0".."9", 10..35 to "a".."z". Warning: the
+      mode list displayed may vary as the kernel version changes, because the
+      modes are listed in a "first detected -- first displayed" manner. It's
+      better to use absolute mode numbers instead.
+
+   0x.... - Hexadecimal video mode ID (also displayed on the menu, see below
+      for exact meaning of the ID). Warning: rdev and LILO don't support
+      hexadecimal numbers -- you have to convert it to decimal manually.
+
+Menu
+~~~~
+
+The ASK_VGA mode causes the kernel to offer a video mode menu upon
+bootup. It displays a "Press <RETURN> to see video modes available, <SPACE>
+to continue or wait 30 secs" message. If you press <RETURN>, you enter the
+menu, if you press <SPACE> or wait 30 seconds, the kernel will boot up in
+the standard 80x25 mode.
+
+The menu looks like::
+
+	Video adapter: <name-of-detected-video-adapter>
+	Mode:    COLSxROWS:
+	0  0F00  80x25
+	1  0F01  80x50
+	2  0F02  80x43
+	3  0F03  80x26
+	....
+	Enter mode number or ``scan``: <flashing-cursor-here>
+
+<name-of-detected-video-adapter> tells what video adapter did Linux detect
+-- it's either a generic adapter name (MDA, CGA, HGC, EGA, VGA, VESA VGA [a VGA
+with VESA-compliant BIOS]) or a chipset name (e.g., Trident). Direct detection
+of chipsets is turned off by default as it's inherently unreliable due to
+absolutely insane PC design.
+
+"0  0F00  80x25" means that the first menu item (the menu items are numbered
+from "0" to "9" and from "a" to "z") is a 80x25 mode with ID=0x0f00 (see the
+next section for a description of mode IDs).
+
+<flashing-cursor-here> encourages you to enter the item number or mode ID
+you wish to set and press <RETURN>. If the computer complains something about
+"Unknown mode ID", it is trying to tell you that it isn't possible to set such
+a mode. It's also possible to press only <RETURN> which leaves the current mode.
+
+The mode list usually contains a few basic modes and some VESA modes.  In
+case your chipset has been detected, some chipset-specific modes are shown as
+well (some of these might be missing or unusable on your machine as different
+BIOSes are often shipped with the same card and the mode numbers depend purely
+on the VGA BIOS).
+
+The modes displayed on the menu are partially sorted: The list starts with
+the standard modes (80x25 and 80x50) followed by "special" modes (80x28 and
+80x43), local modes (if the local modes feature is enabled), VESA modes and
+finally SVGA modes for the auto-detected adapter.
+
+If you are not happy with the mode list offered (e.g., if you think your card
+is able to do more), you can enter "scan" instead of item number / mode ID.  The
+program will try to ask the BIOS for all possible video mode numbers and test
+what happens then. The screen will be probably flashing wildly for some time and
+strange noises will be heard from inside the monitor and so on and then, really
+all consistent video modes supported by your BIOS will appear (plus maybe some
+``ghost modes``). If you are afraid this could damage your monitor, don't use
+this function.
+
+After scanning, the mode ordering is a bit different: the auto-detected SVGA
+modes are not listed at all and the modes revealed by ``scan`` are shown before
+all VESA modes.
+
+Mode IDs
+~~~~~~~~
+
+Because of the complexity of all the video stuff, the video mode IDs
+used here are also a bit complex. A video mode ID is a 16-bit number usually
+expressed in a hexadecimal notation (starting with "0x"). You can set a mode
+by entering its mode directly if you know it even if it isn't shown on the menu.
+
+The ID numbers can be divided to those regions::
+
+   0x0000 to 0x00ff - menu item references. 0x0000 is the first item. Don't use
+	outside the menu as this can change from boot to boot (especially if you
+	have used the ``scan`` feature).
+
+   0x0100 to 0x017f - standard BIOS modes. The ID is a BIOS video mode number
+	(as presented to INT 10, function 00) increased by 0x0100.
+
+   0x0200 to 0x08ff - VESA BIOS modes. The ID is a VESA mode ID increased by
+	0x0100. All VESA modes should be autodetected and shown on the menu.
+
+   0x0900 to 0x09ff - Video7 special modes. Set by calling INT 0x10, AX=0x6f05.
+	(Usually 940=80x43, 941=132x25, 942=132x44, 943=80x60, 944=100x60,
+	945=132x28 for the standard Video7 BIOS)
+
+   0x0f00 to 0x0fff - special modes (they are set by various tricks -- usually
+	by modifying one of the standard modes). Currently available:
+	0x0f00	standard 80x25, don't reset mode if already set (=FFFF)
+	0x0f01	standard with 8-point font: 80x43 on EGA, 80x50 on VGA
+	0x0f02	VGA 80x43 (VGA switched to 350 scanlines with a 8-point font)
+	0x0f03	VGA 80x28 (standard VGA scans, but 14-point font)
+	0x0f04	leave current video mode
+	0x0f05	VGA 80x30 (480 scans, 16-point font)
+	0x0f06	VGA 80x34 (480 scans, 14-point font)
+	0x0f07	VGA 80x60 (480 scans, 8-point font)
+	0x0f08	Graphics hack (see the VIDEO_GFX_HACK paragraph below)
+
+   0x1000 to 0x7fff - modes specified by resolution. The code has a "0xRRCC"
+	form where RR is a number of rows and CC is a number of columns.
+	E.g., 0x1950 corresponds to a 80x25 mode, 0x2b84 to 132x43 etc.
+	This is the only fully portable way to refer to a non-standard mode,
+	but it relies on the mode being found and displayed on the menu
+	(remember that mode scanning is not done automatically).
+
+   0xff00 to 0xffff - aliases for backward compatibility:
+	0xffff	equivalent to 0x0f00 (standard 80x25)
+	0xfffe	equivalent to 0x0f01 (EGA 80x43 or VGA 80x50)
+
+If you add 0x8000 to the mode ID, the program will try to recalculate
+vertical display timing according to mode parameters, which can be used to
+eliminate some annoying bugs of certain VGA BIOSes (usually those used for
+cards with S3 chipsets and old Cirrus Logic BIOSes) -- mainly extra lines at the
+end of the display.
+
+Options
+~~~~~~~
+
+Build options for arch/x86/boot/* are selected by the kernel kconfig
+utility and the kernel .config file.
+
+VIDEO_GFX_HACK - includes special hack for setting of graphics modes
+to be used later by special drivers.
+Allows to set _any_ BIOS mode including graphic ones and forcing specific
+text screen resolution instead of peeking it from BIOS variables. Don't use
+unless you think you know what you're doing. To activate this setup, use
+mode number 0x0f08 (see the Mode IDs section above).
+
+Still doesn't work?
+~~~~~~~~~~~~~~~~~~~
+
+When the mode detection doesn't work (e.g., the mode list is incorrect or
+the machine hangs instead of displaying the menu), try to switch off some of
+the configuration options listed under "Options". If it fails, you can still use
+your kernel with the video mode set directly via the kernel parameter.
+
+In either case, please send me a bug report containing what _exactly_
+happens and how do the configuration switches affect the behaviour of the bug.
+
+If you start Linux from M$-DOS, you might also use some DOS tools for
+video mode setting. In this case, you must specify the 0x0f04 mode ("leave
+current settings") to Linux, because if you don't and you use any non-standard
+mode, Linux will switch to 80x25 automatically.
+
+If you set some extended mode and there's one or more extra lines on the
+bottom of the display containing already scrolled-out text, your VGA BIOS
+contains the most common video BIOS bug called "incorrect vertical display
+end setting". Adding 0x8000 to the mode ID might fix the problem. Unfortunately,
+this must be done manually -- no autodetection mechanisms are available.
+
+History
+~~~~~~~
+
+=============== ================================================================
+1.0 (??-Nov-95)	First version supporting all adapters supported by the old
+		setup.S + Cirrus Logic 54XX. Present in some 1.3.4? kernels
+		and then removed due to instability on some machines.
+2.0 (28-Jan-96)	Rewritten from scratch. Cirrus Logic 64XX support added, almost
+		everything is configurable, the VESA support should be much more
+		stable, explicit mode numbering allowed, "scan" implemented etc.
+2.1 (30-Jan-96) VESA modes moved to 0x200-0x3ff. Mode selection by resolution
+		supported. Few bugs fixed. VESA modes are listed prior to
+		modes supplied by SVGA autodetection as they are more reliable.
+		CLGD autodetect works better. Doesn't depend on 80x25 being
+		active when started. Scanning fixed. 80x43 (any VGA) added.
+		Code cleaned up.
+2.2 (01-Feb-96)	EGA 80x43 fixed. VESA extended to 0x200-0x4ff (non-standard 02XX
+		VESA modes work now). Display end bug workaround supported.
+		Special modes renumbered to allow adding of the "recalculate"
+		flag, 0xffff and 0xfffe became aliases instead of real IDs.
+		Screen contents retained during mode changes.
+2.3 (15-Mar-96)	Changed to work with 1.3.74 kernel.
+2.4 (18-Mar-96)	Added patches by Hans Lermen fixing a memory overwrite problem
+		with some boot loaders. Memory management rewritten to reflect
+		these changes. Unfortunately, screen contents retaining works
+		only with some loaders now.
+		Added a Tseng 132x60 mode.
+2.5 (19-Mar-96)	Fixed a VESA mode scanning bug introduced in 2.4.
+2.6 (25-Mar-96)	Some VESA BIOS errors not reported -- it fixes error reports on
+		several cards with broken VESA code (e.g., ATI VGA).
+2.7 (09-Apr-96)	- Accepted all VESA modes in range 0x100 to 0x7ff, because some
+		  cards use very strange mode numbers.
+		- Added Realtek VGA modes (thanks to Gonzalo Tornaria).
+		- Hardware testing order slightly changed, tests based on ROM
+		  contents done as first.
+		- Added support for special Video7 mode switching functions
+		  (thanks to Tom Vander Aa).
+		- Added 480-scanline modes (especially useful for notebooks,
+		  original version written by hhanemaa@cs.ruu.nl, patched by
+		  Jeff Chua, rewritten by me).
+		- Screen store/restore fixed.
+2.8 (14-Apr-96) - Previous release was not compilable without CONFIG_VIDEO_SVGA.
+		- Better recognition of text modes during mode scan.
+2.9 (12-May-96)	- Ignored VESA modes 0x80 - 0xff (more VESA BIOS bugs!)
+2.10(11-Nov-96) - The whole thing made optional.
+		- Added the CONFIG_VIDEO_400_HACK switch.
+		- Added the CONFIG_VIDEO_GFX_HACK switch.
+		- Code cleanup.
+2.11(03-May-97) - Yet another cleanup, now including also the documentation.
+		- Direct testing of SVGA adapters turned off by default, ``scan``
+		  offered explicitly on the prompt line.
+		- Removed the doc section describing adding of new probing
+		  functions as I try to get rid of _all_ hardware probing here.
+2.12(25-May-98) Added support for VESA frame buffer graphics.
+2.13(14-May-99) Minor documentation fixes.
+=============== ================================================================
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index a0c1d4ce403a..032c7cd3cede 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -327,7 +327,7 @@ when a hard lockup is detected.
    0 - don't panic on hard lockup
    1 - panic on hard lockup
 
-See Documentation/lockup-watchdogs.txt for more information.  This can
+See Documentation/admin-guide/lockup-watchdogs.rst for more information.  This can
 also be set using the nmi_watchdog kernel parameter.
 
 
diff --git a/Documentation/admin-guide/video-output.rst b/Documentation/admin-guide/video-output.rst
new file mode 100644
index 000000000000..56d6fa2e2368
--- /dev/null
+++ b/Documentation/admin-guide/video-output.rst
@@ -0,0 +1,34 @@
+Video Output Switcher Control
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+2006 luming.yu@intel.com
+
+The output sysfs class driver provides an abstract video output layer that
+can be used to hook platform specific methods to enable/disable video output
+device through common sysfs interface. For example, on my IBM ThinkPad T42
+laptop, The ACPI video driver registered its output devices and read/write
+method for 'state' with output sysfs class. The user interface under sysfs is::
+
+  linux:/sys/class/video_output # tree .
+  .
+  |-- CRT0
+  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+  |   |-- state
+  |   |-- subsystem -> ../../../class/video_output
+  |   `-- uevent
+  |-- DVI0
+  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+  |   |-- state
+  |   |-- subsystem -> ../../../class/video_output
+  |   `-- uevent
+  |-- LCD0
+  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+  |   |-- state
+  |   |-- subsystem -> ../../../class/video_output
+  |   `-- uevent
+  `-- TV0
+     |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+     |-- state
+     |-- subsystem -> ../../../class/video_output
+     `-- uevent
+
diff --git a/Documentation/auxdisplay/lcd-panel-cgram.rst b/Documentation/auxdisplay/lcd-panel-cgram.rst
deleted file mode 100644
index dfef50286018..000000000000
--- a/Documentation/auxdisplay/lcd-panel-cgram.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-:orphan:
-
-======================================
-Parallel port LCD/Keypad Panel support
-======================================
-
-Some LCDs allow you to define up to 8 characters, mapped to ASCII
-characters 0 to 7. The escape code to define a new character is
-'\e[LG' followed by one digit from 0 to 7, representing the character
-number, and up to 8 couples of hex digits terminated by a semi-colon
-(';'). Each couple of digits represents a line, with 1-bits for each
-illuminated pixel with LSB on the right. Lines are numbered from the
-top of the character to the bottom. On a 5x7 matrix, only the 5 lower
-bits of the 7 first bytes are used for each character. If the string
-is incomplete, only complete lines will be redefined. Here are some
-examples::
-
-  printf "\e[LG0010101050D1F0C04;"  => 0 = [enter]
-  printf "\e[LG1040E1F0000000000;"  => 1 = [up]
-  printf "\e[LG2000000001F0E0400;"  => 2 = [down]
-  printf "\e[LG3040E1F001F0E0400;"  => 3 = [up-down]
-  printf "\e[LG40002060E1E0E0602;"  => 4 = [left]
-  printf "\e[LG500080C0E0F0E0C08;"  => 5 = [right]
-  printf "\e[LG60016051516141400;"  => 6 = "IP"
-
-  printf "\e[LG00103071F1F070301;"  => big speaker
-  printf "\e[LG00002061E1E060200;"  => small speaker
-
-Willy
diff --git a/Documentation/btmrvl.txt b/Documentation/btmrvl.txt
deleted file mode 100644
index ec57740ead0c..000000000000
--- a/Documentation/btmrvl.txt
+++ /dev/null
@@ -1,124 +0,0 @@
-=============
-btmrvl driver
-=============
-
-All commands are used via debugfs interface.
-
-Set/get driver configurations
-=============================
-
-Path:	/debug/btmrvl/config/
-
-gpiogap=[n], hscfgcmd
-	These commands are used to configure the host sleep parameters::
-	bit 8:0  -- Gap
-	bit 16:8 -- GPIO
-
-	where GPIO is the pin number of GPIO used to wake up the host.
-	It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface
-	wakeup will be used instead).
-
-	where Gap is the gap in milli seconds between wakeup signal and
-	wakeup event, or 0xff for special host sleep setting.
-
-	Usage::
-
-		# Use SDIO interface to wake up the host and set GAP to 0x80:
-		echo 0xff80 > /debug/btmrvl/config/gpiogap
-		echo 1 > /debug/btmrvl/config/hscfgcmd
-
-		# Use GPIO pin #3 to wake up the host and set GAP to 0xff:
-		echo 0x03ff >  /debug/btmrvl/config/gpiogap
-		echo 1 > /debug/btmrvl/config/hscfgcmd
-
-psmode=[n], pscmd
-	These commands are used to enable/disable auto sleep mode
-
-	where the option is::
-
-			1 	-- Enable auto sleep mode
-			0 	-- Disable auto sleep mode
-
-	Usage::
-
-		# Enable auto sleep mode
-		echo 1 > /debug/btmrvl/config/psmode
-		echo 1 > /debug/btmrvl/config/pscmd
-
-		# Disable auto sleep mode
-		echo 0 > /debug/btmrvl/config/psmode
-		echo 1 > /debug/btmrvl/config/pscmd
-
-
-hsmode=[n], hscmd
-	These commands are used to enable host sleep or wake up firmware
-
-	where the option is::
-
-			1	-- Enable host sleep
-			0	-- Wake up firmware
-
-	Usage::
-
-		# Enable host sleep
-		echo 1 > /debug/btmrvl/config/hsmode
-		echo 1 > /debug/btmrvl/config/hscmd
-
-		# Wake up firmware
-		echo 0 > /debug/btmrvl/config/hsmode
-		echo 1 > /debug/btmrvl/config/hscmd
-
-
-Get driver status
-=================
-
-Path:	/debug/btmrvl/status/
-
-Usage::
-
-	cat /debug/btmrvl/status/<args>
-
-where the args are:
-
-curpsmode
-	This command displays current auto sleep status.
-
-psstate
-	This command display the power save state.
-
-hsstate
-	This command display the host sleep state.
-
-txdnldrdy
-	This command displays the value of Tx download ready flag.
-
-Issuing a raw hci command
-=========================
-
-Use hcitool to issue raw hci command, refer to hcitool manual
-
-Usage::
-
-	Hcitool cmd <ogf> <ocf> [Parameters]
-
-Interface Control Command::
-
-	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00    --Enable All interface
-	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01    --Enable Wlan interface
-	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02    --Enable BT interface
-	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00    --Disable All interface
-	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01    --Disable Wlan interface
-	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02    --Disable BT interface
-
-SD8688 firmware
-===============
-
-Images:
-
-- /lib/firmware/sd8688_helper.bin
-- /lib/firmware/sd8688.bin
-
-
-The images can be downloaded from:
-
-git.infradead.org/users/dwmw2/linux-firmware.git/libertas/
diff --git a/Documentation/clearing-warn-once.txt b/Documentation/clearing-warn-once.txt
deleted file mode 100644
index 211fd926cf00..000000000000
--- a/Documentation/clearing-warn-once.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-Clearing WARN_ONCE
-------------------
-
-WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once.
-
-echo 1 > /sys/kernel/debug/clear_warn_once
-
-clears the state and allows the warnings to print once again.
-This can be useful after test suite runs to reproduce problems.
diff --git a/Documentation/cma/debugfs.rst b/Documentation/cma/debugfs.rst
deleted file mode 100644
index 518fe401b5ee..000000000000
--- a/Documentation/cma/debugfs.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-:orphan:
-
-=====================
-CMA Debugfs Interface
-=====================
-
-The CMA debugfs interface is useful to retrieve basic information out of the
-different CMA areas and to test allocation/release in each of the areas.
-
-Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
-kernel's CMA index. So the first CMA zone would be:
-
-	<debugfs>/cma/cma-0
-
-The structure of the files created under that directory is as follows:
-
- - [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
- - [RO] count: Amount of memory in the CMA area.
- - [RO] order_per_bit: Order of pages represented by one bit.
- - [RO] bitmap: The bitmap of page states in the zone.
- - [WO] alloc: Allocate N pages from that CMA area. For example::
-
-	echo 5 > <debugfs>/cma/cma-2/alloc
-
-would try to allocate 5 pages from the cma-2 area.
-
- - [WO] free: Free N pages from that CMA area, similar to the above.
diff --git a/Documentation/cpu-load.txt b/Documentation/cpu-load.txt
deleted file mode 100644
index 2d01ce43d2a2..000000000000
--- a/Documentation/cpu-load.txt
+++ /dev/null
@@ -1,114 +0,0 @@
-========
-CPU load
-========
-
-Linux exports various bits of information via ``/proc/stat`` and
-``/proc/uptime`` that userland tools, such as top(1), use to calculate
-the average time system spent in a particular state, for example::
-
-    $ iostat
-    Linux 2.6.18.3-exp (linmac)     02/20/2007
-
-    avg-cpu:  %user   %nice %system %iowait  %steal   %idle
-              10.01    0.00    2.92    5.44    0.00   81.63
-
-    ...
-
-Here the system thinks that over the default sampling period the
-system spent 10.01% of the time doing work in user space, 2.92% in the
-kernel, and was overall 81.63% of the time idle.
-
-In most cases the ``/proc/stat``	 information reflects the reality quite
-closely, however due to the nature of how/when the kernel collects
-this data sometimes it can not be trusted at all.
-
-So how is this information collected?  Whenever timer interrupt is
-signalled the kernel looks what kind of task was running at this
-moment and increments the counter that corresponds to this tasks
-kind/state.  The problem with this is that the system could have
-switched between various states multiple times between two timer
-interrupts yet the counter is incremented only for the last state.
-
-
-Example
--------
-
-If we imagine the system with one task that periodically burns cycles
-in the following manner::
-
-     time line between two timer interrupts
-    |--------------------------------------|
-     ^                                    ^
-     |_ something begins working          |
-                                          |_ something goes to sleep
-                                         (only to be awaken quite soon)
-
-In the above situation the system will be 0% loaded according to the
-``/proc/stat`` (since the timer interrupt will always happen when the
-system is executing the idle handler), but in reality the load is
-closer to 99%.
-
-One can imagine many more situations where this behavior of the kernel
-will lead to quite erratic information inside ``/proc/stat``::
-
-
-	/* gcc -o hog smallhog.c */
-	#include <time.h>
-	#include <limits.h>
-	#include <signal.h>
-	#include <sys/time.h>
-	#define HIST 10
-
-	static volatile sig_atomic_t stop;
-
-	static void sighandler (int signr)
-	{
-	(void) signr;
-	stop = 1;
-	}
-	static unsigned long hog (unsigned long niters)
-	{
-	stop = 0;
-	while (!stop && --niters);
-	return niters;
-	}
-	int main (void)
-	{
-	int i;
-	struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
-				.it_value = { .tv_sec = 0, .tv_usec = 1 } };
-	sigset_t set;
-	unsigned long v[HIST];
-	double tmp = 0.0;
-	unsigned long n;
-	signal (SIGALRM, &sighandler);
-	setitimer (ITIMER_REAL, &it, NULL);
-
-	hog (ULONG_MAX);
-	for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
-	for (i = 0; i < HIST; ++i) tmp += v[i];
-	tmp /= HIST;
-	n = tmp - (tmp / 3.0);
-
-	sigemptyset (&set);
-	sigaddset (&set, SIGALRM);
-
-	for (;;) {
-		hog (n);
-		sigwait (&set, &i);
-	}
-	return 0;
-	}
-
-
-References
-----------
-
-- http://lkml.org/lkml/2007/2/12/6
-- Documentation/filesystems/proc.txt (1.8)
-
-
-Thanks
-------
-
-Con Kolivas, Pavel Machek
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
deleted file mode 100644
index b90dafcc8237..000000000000
--- a/Documentation/cputopology.txt
+++ /dev/null
@@ -1,177 +0,0 @@
-===========================================
-How CPU topology info is exported via sysfs
-===========================================
-
-Export CPU topology info via sysfs. Items (attributes) are similar
-to /proc/cpuinfo output of some architectures.  They reside in
-/sys/devices/system/cpu/cpuX/topology/:
-
-physical_package_id:
-
-	physical package id of cpuX. Typically corresponds to a physical
-	socket number, but the actual value is architecture and platform
-	dependent.
-
-die_id:
-
-	the CPU die ID of cpuX. Typically it is the hardware platform's
-	identifier (rather than the kernel's).  The actual value is
-	architecture and platform dependent.
-
-core_id:
-
-	the CPU core ID of cpuX. Typically it is the hardware platform's
-	identifier (rather than the kernel's).  The actual value is
-	architecture and platform dependent.
-
-book_id:
-
-	the book ID of cpuX. Typically it is the hardware platform's
-	identifier (rather than the kernel's).	The actual value is
-	architecture and platform dependent.
-
-drawer_id:
-
-	the drawer ID of cpuX. Typically it is the hardware platform's
-	identifier (rather than the kernel's).	The actual value is
-	architecture and platform dependent.
-
-core_cpus:
-
-	internal kernel map of CPUs within the same core.
-	(deprecated name: "thread_siblings")
-
-core_cpus_list:
-
-	human-readable list of CPUs within the same core.
-	(deprecated name: "thread_siblings_list");
-
-package_cpus:
-
-	internal kernel map of the CPUs sharing the same physical_package_id.
-	(deprecated name: "core_siblings")
-
-package_cpus_list:
-
-	human-readable list of CPUs sharing the same physical_package_id.
-	(deprecated name: "core_siblings_list")
-
-die_cpus:
-
-	internal kernel map of CPUs within the same die.
-
-die_cpus_list:
-
-	human-readable list of CPUs within the same die.
-
-book_siblings:
-
-	internal kernel map of cpuX's hardware threads within the same
-	book_id.
-
-book_siblings_list:
-
-	human-readable list of cpuX's hardware threads within the same
-	book_id.
-
-drawer_siblings:
-
-	internal kernel map of cpuX's hardware threads within the same
-	drawer_id.
-
-drawer_siblings_list:
-
-	human-readable list of cpuX's hardware threads within the same
-	drawer_id.
-
-Architecture-neutral, drivers/base/topology.c, exports these attributes.
-However, the book and drawer related sysfs files will only be created if
-CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
-
-CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
-where they reflect the cpu and cache hierarchy.
-
-For an architecture to support this feature, it must define some of
-these macros in include/asm-XXX/topology.h::
-
-	#define topology_physical_package_id(cpu)
-	#define topology_die_id(cpu)
-	#define topology_core_id(cpu)
-	#define topology_book_id(cpu)
-	#define topology_drawer_id(cpu)
-	#define topology_sibling_cpumask(cpu)
-	#define topology_core_cpumask(cpu)
-	#define topology_die_cpumask(cpu)
-	#define topology_book_cpumask(cpu)
-	#define topology_drawer_cpumask(cpu)
-
-The type of ``**_id macros`` is int.
-The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter
-correspond with appropriate ``**_siblings`` sysfs attributes (except for
-topology_sibling_cpumask() which corresponds with thread_siblings).
-
-To be consistent on all architectures, include/linux/topology.h
-provides default definitions for any of the above macros that are
-not defined by include/asm-XXX/topology.h:
-
-1) topology_physical_package_id: -1
-2) topology_die_id: -1
-3) topology_core_id: 0
-4) topology_sibling_cpumask: just the given CPU
-5) topology_core_cpumask: just the given CPU
-6) topology_die_cpumask: just the given CPU
-
-For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
-default definitions for topology_book_id() and topology_book_cpumask().
-For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are
-no default definitions for topology_drawer_id() and topology_drawer_cpumask().
-
-Additionally, CPU topology information is provided under
-/sys/devices/system/cpu and includes these files.  The internal
-source for the output is in brackets ("[]").
-
-    =========== ==========================================================
-    kernel_max: the maximum CPU index allowed by the kernel configuration.
-		[NR_CPUS-1]
-
-    offline:	CPUs that are not online because they have been
-		HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit
-		of CPUs allowed by the kernel configuration (kernel_max
-		above). [~cpu_online_mask + cpus >= NR_CPUS]
-
-    online:	CPUs that are online and being scheduled [cpu_online_mask]
-
-    possible:	CPUs that have been allocated resources and can be
-		brought online if they are present. [cpu_possible_mask]
-
-    present:	CPUs that have been identified as being present in the
-		system. [cpu_present_mask]
-    =========== ==========================================================
-
-The format for the above output is compatible with cpulist_parse()
-[see <linux/cpumask.h>].  Some examples follow.
-
-In this example, there are 64 CPUs in the system but cpus 32-63 exceed
-the kernel max which is limited to 0..31 by the NR_CPUS config option
-being 32.  Note also that CPUs 2 and 4-31 are not online but could be
-brought online as they are both present and possible::
-
-     kernel_max: 31
-        offline: 2,4-31,32-63
-         online: 0-1,3
-       possible: 0-31
-        present: 0-31
-
-In this example, the NR_CPUS config option is 128, but the kernel was
-started with possible_cpus=144.  There are 4 CPUs in the system and cpu2
-was manually taken offline (and is the only CPU that can be brought
-online.)::
-
-     kernel_max: 127
-        offline: 2,4-127,128-143
-         online: 0-1,3
-       possible: 0-127
-        present: 0-3
-
-See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter
-as well as more information on the various cpumasks.
diff --git a/Documentation/efi-stub.txt b/Documentation/efi-stub.txt
deleted file mode 100644
index 833edb0d0bc4..000000000000
--- a/Documentation/efi-stub.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-=================
-The EFI Boot Stub
-=================
-
-On the x86 and ARM platforms, a kernel zImage/bzImage can masquerade
-as a PE/COFF image, thereby convincing EFI firmware loaders to load
-it as an EFI executable. The code that modifies the bzImage header,
-along with the EFI-specific entry point that the firmware loader
-jumps to are collectively known as the "EFI boot stub", and live in
-arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c,
-respectively. For ARM the EFI stub is implemented in
-arch/arm/boot/compressed/efi-header.S and
-arch/arm/boot/compressed/efi-stub.c. EFI stub code that is shared
-between architectures is in drivers/firmware/efi/libstub.
-
-For arm64, there is no compressed kernel support, so the Image itself
-masquerades as a PE/COFF image and the EFI stub is linked into the
-kernel. The arm64 EFI stub lives in arch/arm64/kernel/efi-entry.S
-and drivers/firmware/efi/libstub/arm64-stub.c.
-
-By using the EFI boot stub it's possible to boot a Linux kernel
-without the use of a conventional EFI boot loader, such as grub or
-elilo. Since the EFI boot stub performs the jobs of a boot loader, in
-a certain sense it *IS* the boot loader.
-
-The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option.
-
-
-How to install bzImage.efi
---------------------------
-
-The bzImage located in arch/x86/boot/bzImage must be copied to the EFI
-System Partition (ESP) and renamed with the extension ".efi". Without
-the extension the EFI firmware loader will refuse to execute it. It's
-not possible to execute bzImage.efi from the usual Linux file systems
-because EFI firmware doesn't have support for them. For ARM the
-arch/arm/boot/zImage should be copied to the system partition, and it
-may not need to be renamed. Similarly for arm64, arch/arm64/boot/Image
-should be copied but not necessarily renamed.
-
-
-Passing kernel parameters from the EFI shell
---------------------------------------------
-
-Arguments to the kernel can be passed after bzImage.efi, e.g.::
-
-	fs0:> bzImage.efi console=ttyS0 root=/dev/sda4
-
-
-The "initrd=" option
---------------------
-
-Like most boot loaders, the EFI stub allows the user to specify
-multiple initrd files using the "initrd=" option. This is the only EFI
-stub-specific command line parameter, everything else is passed to the
-kernel when it boots.
-
-The path to the initrd file must be an absolute path from the
-beginning of the ESP, relative path names do not work. Also, the path
-is an EFI-style path and directory elements must be separated with
-backslashes (\). For example, given the following directory layout::
-
-  fs0:>
-	Kernels\
-			bzImage.efi
-			initrd-large.img
-
-	Ramdisks\
-			initrd-small.img
-			initrd-medium.img
-
-to boot with the initrd-large.img file if the current working
-directory is fs0:\Kernels, the following command must be used::
-
-	fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img
-
-Notice how bzImage.efi can be specified with a relative path. That's
-because the image we're executing is interpreted by the EFI shell,
-which understands relative paths, whereas the rest of the command line
-is passed to bzImage.efi.
-
-
-The "dtb=" option
------------------
-
-For the ARM and arm64 architectures, a device tree must be provided to
-the kernel. Normally firmware shall supply the device tree via the
-EFI CONFIGURATION TABLE. However, the "dtb=" command line option can
-be used to override the firmware supplied device tree, or to supply
-one when firmware is unable to.
-
-Please note: Firmware adds runtime configuration information to the
-device tree before booting the kernel. If dtb= is used to override
-the device tree, then any runtime data provided by firmware will be
-lost. The dtb= option should only be used either as a debug tool, or
-as a last resort when a device tree is not provided in the EFI
-CONFIGURATION TABLE.
-
-"dtb=" is processed in the same manner as the "initrd=" option that is
-described above.
diff --git a/Documentation/fb/vesafb.rst b/Documentation/fb/vesafb.rst
index 2ed0dfb661cf..6821c87b7893 100644
--- a/Documentation/fb/vesafb.rst
+++ b/Documentation/fb/vesafb.rst
@@ -30,7 +30,7 @@ How to use it?
 ==============
 
 Switching modes is done using the vga=... boot parameter.  Read
-Documentation/svga.txt for details.
+Documentation/admin-guide/svga.rst for details.
 
 You should compile in both vgacon (for text mode) and vesafb (for
 graphics mode). Which of them takes over the console depends on
diff --git a/Documentation/highuid.txt b/Documentation/highuid.txt
deleted file mode 100644
index 6ee70465c0ea..000000000000
--- a/Documentation/highuid.txt
+++ /dev/null
@@ -1,80 +0,0 @@
-===================================================
-Notes on the change from 16-bit UIDs to 32-bit UIDs
-===================================================
-
-:Author: Chris Wing <wingc@umich.edu>
-:Last updated: January 11, 2000
-
-- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t
-  when communicating between user and kernel space in an ioctl or data
-  structure.
-
-- kernel code should use uid_t and gid_t in kernel-private structures and
-  code.
-
-What's left to be done for 32-bit UIDs on all Linux architectures:
-
-- Disk quotas have an interesting limitation that is not related to the
-  maximum UID/GID. They are limited by the maximum file size on the
-  underlying filesystem, because quota records are written at offsets
-  corresponding to the UID in question.
-  Further investigation is needed to see if the quota system can cope
-  properly with huge UIDs. If it can deal with 64-bit file offsets on all 
-  architectures, this should not be a problem.
-
-- Decide whether or not to keep backwards compatibility with the system
-  accounting file, or if we should break it as the comments suggest
-  (currently, the old 16-bit UID and GID are still written to disk, and
-  part of the former pad space is used to store separate 32-bit UID and
-  GID)
-
-- Need to validate that OS emulation calls the 16-bit UID
-  compatibility syscalls, if the OS being emulated used 16-bit UIDs, or
-  uses the 32-bit UID system calls properly otherwise.
-
-  This affects at least:
-
-	- iBCS on Intel
-
-	- sparc32 emulation on sparc64
-	  (need to support whatever new 32-bit UID system calls are added to
-	  sparc32)
-
-- Validate that all filesystems behave properly.
-
-  At present, 32-bit UIDs _should_ work for:
-
-	- ext2
-	- ufs
-	- isofs
-	- nfs
-	- coda
-	- udf
-
-  Ioctl() fixups have been made for:
-
-	- ncpfs
-	- smbfs
-
-  Filesystems with simple fixups to prevent 16-bit UID wraparound:
-
-	- minix
-	- sysv
-	- qnx4
-
-  Other filesystems have not been checked yet.
-
-- The ncpfs and smpfs filesystems cannot presently use 32-bit UIDs in
-  all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but
-  more are needed. (as well as new user<->kernel data structures)
-
-- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k,
-  sh, and sparc32. Fixing this is probably not that important, but would
-  require adding a new ELF section.
-
-- The ioctl()s used to control the in-kernel NFS server only support
-  16-bit UIDs on arm, i386, m68k, sh, and sparc32.
-
-- make sure that the UID mapping feature of AX25 networking works properly
-  (it should be safe because it's always used a 32-bit integer to
-  communicate between user and kernel)
diff --git a/Documentation/hw_random.txt b/Documentation/hw_random.txt
deleted file mode 100644
index 121de96e395e..000000000000
--- a/Documentation/hw_random.txt
+++ /dev/null
@@ -1,105 +0,0 @@
-==========================================================
-Linux support for random number generator in i8xx chipsets
-==========================================================
-
-Introduction
-============
-
-The hw_random framework is software that makes use of a
-special hardware feature on your CPU or motherboard,
-a Random Number Generator (RNG).  The software has two parts:
-a core providing the /dev/hwrng character device and its
-sysfs support, plus a hardware-specific driver that plugs
-into that core.
-
-To make the most effective use of these mechanisms, you
-should download the support software as well.  Download the
-latest version of the "rng-tools" package from the
-hw_random driver's official Web site:
-
-	http://sourceforge.net/projects/gkernel/
-
-Those tools use /dev/hwrng to fill the kernel entropy pool,
-which is used internally and exported by the /dev/urandom and
-/dev/random special files.
-
-Theory of operation
-===================
-
-CHARACTER DEVICE.  Using the standard open()
-and read() system calls, you can read random data from
-the hardware RNG device.  This data is NOT CHECKED by any
-fitness tests, and could potentially be bogus (if the
-hardware is faulty or has been tampered with).  Data is only
-output if the hardware "has-data" flag is set, but nevertheless
-a security-conscious person would run fitness tests on the
-data before assuming it is truly random.
-
-The rng-tools package uses such tests in "rngd", and lets you
-run them by hand with a "rngtest" utility.
-
-/dev/hwrng is char device major 10, minor 183.
-
-CLASS DEVICE.  There is a /sys/class/misc/hw_random node with
-two unique attributes, "rng_available" and "rng_current".  The
-"rng_available" attribute lists the hardware-specific drivers
-available, while "rng_current" lists the one which is currently
-connected to /dev/hwrng.  If your system has more than one
-RNG available, you may change the one used by writing a name from
-the list in "rng_available" into "rng_current".
-
-==========================================================================
-
-
-Hardware driver for Intel/AMD/VIA Random Number Generators (RNG)
-	- Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com>
-	- Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com>
-
-
-About the Intel RNG hardware, from the firmware hub datasheet
-=============================================================
-
-The Firmware Hub integrates a Random Number Generator (RNG)
-using thermal noise generated from inherently random quantum
-mechanical properties of silicon. When not generating new random
-bits the RNG circuitry will enter a low power state. Intel will
-provide a binary software driver to give third party software
-access to our RNG for use as a security feature. At this time,
-the RNG is only to be used with a system in an OS-present state.
-
-Intel RNG Driver notes
-======================
-
-FIXME: support poll(2)
-
-.. note::
-
-	request_mem_region was removed, for three reasons:
-
-	1) Only one RNG is supported by this driver;
-	2) The location used by the RNG is a fixed location in
-	   MMIO-addressable memory;
-	3) users with properly working BIOS e820 handling will always
-	   have the region in which the RNG is located reserved, so
-	   request_mem_region calls always fail for proper setups.
-	   However, for people who use mem=XX, BIOS e820 information is
-	   **not** in /proc/iomem, and request_mem_region(RNG_ADDR) can
-	   succeed.
-
-Driver details
-==============
-
-Based on:
-	Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet
-	May 1999 Order Number: 290658-002 R
-
-Intel 82802 Firmware Hub:
-	Random Number Generator
-	Programmer's Reference Manual
-	December 1999 Order Number: 298029-001 R
-
-Intel 82802 Firmware HUB Random Number Generator Driver
-	Copyright (c) 2000 Matt Sottek <msottek@quiknet.com>
-
-Special thanks to Matt Sottek.  I did the "guts", he
-did the "brains" and all the testing.
diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt
deleted file mode 100644
index 5d63b18bd6d1..000000000000
--- a/Documentation/iostats.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-=====================
-I/O statistics fields
-=====================
-
-Since 2.4.20 (and some versions before, with patches), and 2.5.45,
-more extensive disk statistics have been introduced to help measure disk
-activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do
-the work for you, but in case you are interested in creating your own
-tools, the fields are explained here.
-
-In 2.4 now, the information is found as additional fields in
-``/proc/partitions``.  In 2.6 and upper, the same information is found in two
-places: one is in the file ``/proc/diskstats``, and the other is within
-the sysfs file system, which must be mounted in order to obtain
-the information. Throughout this document we'll assume that sysfs
-is mounted on ``/sys``, although of course it may be mounted anywhere.
-Both ``/proc/diskstats`` and sysfs use the same source for the information
-and so should not differ.
-
-Here are examples of these different formats::
-
-   2.4:
-      3     0   39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
-      3     1    9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030
-
-   2.6+ sysfs:
-      446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
-      35486    38030    38030    38030
-
-   2.6+ diskstats:
-      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
-      3    1   hda1 35486 38030 38030 38030
-
-   4.18+ diskstats:
-      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
-
-On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
-a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
-
-The advantage of one over the other is that the sysfs choice works well
-if you are watching a known, small set of disks.  ``/proc/diskstats`` may
-be a better choice if you are watching a large number of disks because
-you'll avoid the overhead of 50, 100, or 500 or more opens/closes with
-each snapshot of your disk statistics.
-
-In 2.4, the statistics fields are those after the device name. In
-the above example, the first field of statistics would be 446216.
-By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll
-find just the eleven fields, beginning with 446216.  If you look at
-``/proc/diskstats``, the eleven fields will be preceded by the major and
-minor device numbers, and device name.  Each of these formats provides
-eleven fields of statistics, each meaning exactly the same things.
-All fields except field 9 are cumulative since boot.  Field 9 should
-go to zero as I/Os complete; all others only increase (unless they
-overflow and wrap).  Yes, these are (32-bit or 64-bit) unsigned long
-(native word size) numbers, and on a very busy or long-lived system they
-may wrap. Applications should be prepared to deal with that; unless
-your observations are measured in large numbers of minutes or hours,
-they should not wrap twice before you notice them.
-
-Each set of stats only applies to the indicated device; if you want
-system-wide stats you'll have to find all the devices and sum them all up.
-
-Field  1 -- # of reads completed
-    This is the total number of reads completed successfully.
-
-Field  2 -- # of reads merged, field 6 -- # of writes merged
-    Reads and writes which are adjacent to each other may be merged for
-    efficiency.  Thus two 4K reads may become one 8K read before it is
-    ultimately handed to the disk, and so it will be counted (and queued)
-    as only one I/O.  This field lets you know how often this was done.
-
-Field  3 -- # of sectors read
-    This is the total number of sectors read successfully.
-
-Field  4 -- # of milliseconds spent reading
-    This is the total number of milliseconds spent by all reads (as
-    measured from __make_request() to end_that_request_last()).
-
-Field  5 -- # of writes completed
-    This is the total number of writes completed successfully.
-
-Field  6 -- # of writes merged
-    See the description of field 2.
-
-Field  7 -- # of sectors written
-    This is the total number of sectors written successfully.
-
-Field  8 -- # of milliseconds spent writing
-    This is the total number of milliseconds spent by all writes (as
-    measured from __make_request() to end_that_request_last()).
-
-Field  9 -- # of I/Os currently in progress
-    The only field that should go to zero. Incremented as requests are
-    given to appropriate struct request_queue and decremented as they finish.
-
-Field 10 -- # of milliseconds spent doing I/Os
-    This field increases so long as field 9 is nonzero.
-
-    Since 5.0 this field counts jiffies when at least one request was
-    started or completed. If request runs more than 2 jiffies then some
-    I/O time will not be accounted unless there are other requests.
-
-Field 11 -- weighted # of milliseconds spent doing I/Os
-    This field is incremented at each I/O start, I/O completion, I/O
-    merge, or read of these stats by the number of I/Os in progress
-    (field 9) times the number of milliseconds spent doing I/O since the
-    last update of this field.  This can provide an easy measure of both
-    I/O completion time and the backlog that may be accumulating.
-
-Field 12 -- # of discards completed
-    This is the total number of discards completed successfully.
-
-Field 13 -- # of discards merged
-    See the description of field 2
-
-Field 14 -- # of sectors discarded
-    This is the total number of sectors discarded successfully.
-
-Field 15 -- # of milliseconds spent discarding
-    This is the total number of milliseconds spent by all discards (as
-    measured from __make_request() to end_that_request_last()).
-
-To avoid introducing performance bottlenecks, no locks are held while
-modifying these counters.  This implies that minor inaccuracies may be
-introduced when changes collide, so (for instance) adding up all the
-read I/Os issued per partition should equal those made to the disks ...
-but due to the lack of locking it may only be very close.
-
-In 2.6+, there are counters for each CPU, which make the lack of locking
-almost a non-issue.  When the statistics are read, the per-CPU counters
-are summed (possibly overflowing the unsigned long variable they are
-summed to) and the result given to the user.  There is no convenient
-user interface for accessing the per-CPU counters themselves.
-
-Disks vs Partitions
--------------------
-
-There were significant changes between 2.4 and 2.6+ in the I/O subsystem.
-As a result, some statistic information disappeared. The translation from
-a disk address relative to a partition to the disk address relative to
-the host disk happens much earlier.  All merges and timings now happen
-at the disk level rather than at both the disk and partition level as
-in 2.4.  Consequently, you'll see a different statistics output on 2.6+ for
-partitions from that for disks.  There are only *four* fields available
-for partitions on 2.6+ machines.  This is reflected in the examples above.
-
-Field  1 -- # of reads issued
-    This is the total number of reads issued to this partition.
-
-Field  2 -- # of sectors read
-    This is the total number of sectors requested to be read from this
-    partition.
-
-Field  3 -- # of writes issued
-    This is the total number of writes issued to this partition.
-
-Field  4 -- # of sectors written
-    This is the total number of sectors requested to be written to
-    this partition.
-
-Note that since the address is translated to a disk-relative one, and no
-record of the partition-relative address is kept, the subsequent success
-or failure of the read cannot be attributed to the partition.  In other
-words, the number of reads for partitions is counted slightly before time
-of queuing for partitions, and at completion for whole disks.  This is
-a subtle distinction that is probably uninteresting for most cases.
-
-More significant is the error induced by counting the numbers of
-reads/writes before merges for partitions and after for disks. Since a
-typical workload usually contains a lot of successive and adjacent requests,
-the number of reads/writes issued can be several times higher than the
-number of reads/writes completed.
-
-In 2.6.25, the full statistic set is again available for partitions and
-disk and partition statistics are consistent again. Since we still don't
-keep record of the partition-relative address, an operation is attributed to
-the partition which contains the first sector of the request after the
-eventual merges. As requests can be merged across partition, this could lead
-to some (probably insignificant) inaccuracy.
-
-Additional notes
-----------------
-
-In 2.6+, sysfs is not mounted by default.  If your distribution of
-Linux hasn't added it already, here's the line you'll want to add to
-your ``/etc/fstab``::
-
-	none /sys sysfs defaults 0 0
-
-
-In 2.6+, all disk statistics were removed from ``/proc/stat``.  In 2.4, they
-appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in
-``/proc/stat`` take a very different format from those in ``/proc/partitions``
-(see proc(5), if your system has it.)
-
--- ricklind@us.ibm.com
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
deleted file mode 100644
index 4f18456dd3b1..000000000000
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ /dev/null
@@ -1,356 +0,0 @@
-==========================================
-Reducing OS jitter due to per-cpu kthreads
-==========================================
-
-This document lists per-CPU kthreads in the Linux kernel and presents
-options to control their OS jitter.  Note that non-per-CPU kthreads are
-not listed here.  To reduce OS jitter from non-per-CPU kthreads, bind
-them to a "housekeeping" CPU dedicated to such work.
-
-References
-==========
-
--	Documentation/IRQ-affinity.txt:  Binding interrupts to sets of CPUs.
-
--	Documentation/admin-guide/cgroup-v1:  Using cgroups to bind tasks to sets of CPUs.
-
--	man taskset:  Using the taskset command to bind tasks to sets
-	of CPUs.
-
--	man sched_setaffinity:  Using the sched_setaffinity() system
-	call to bind tasks to sets of CPUs.
-
--	/sys/devices/system/cpu/cpuN/online:  Control CPU N's hotplug state,
-	writing "0" to offline and "1" to online.
-
--	In order to locate kernel-generated OS jitter on CPU N:
-
-		cd /sys/kernel/debug/tracing
-		echo 1 > max_graph_depth # Increase the "1" for more detail
-		echo function_graph > current_tracer
-		# run workload
-		cat per_cpu/cpuN/trace
-
-kthreads
-========
-
-Name:
-  ehca_comp/%u
-
-Purpose:
-  Periodically process Infiniband-related work.
-
-To reduce its OS jitter, do any of the following:
-
-1.	Don't use eHCA Infiniband hardware, instead choosing hardware
-	that does not require per-CPU kthreads.  This will prevent these
-	kthreads from being created in the first place.  (This will
-	work for most people, as this hardware, though important, is
-	relatively old and is produced in relatively low unit volumes.)
-2.	Do all eHCA-Infiniband-related work on other CPUs, including
-	interrupts.
-3.	Rework the eHCA driver so that its per-CPU kthreads are
-	provisioned only on selected CPUs.
-
-
-Name:
-  irq/%d-%s
-
-Purpose:
-  Handle threaded interrupts.
-
-To reduce its OS jitter, do the following:
-
-1.	Use irq affinity to force the irq threads to execute on
-	some other CPU.
-
-Name:
-  kcmtpd_ctr_%d
-
-Purpose:
-  Handle Bluetooth work.
-
-To reduce its OS jitter, do one of the following:
-
-1.	Don't use Bluetooth, in which case these kthreads won't be
-	created in the first place.
-2.	Use irq affinity to force Bluetooth-related interrupts to
-	occur on some other CPU and furthermore initiate all
-	Bluetooth activity on some other CPU.
-
-Name:
-  ksoftirqd/%u
-
-Purpose:
-  Execute softirq handlers when threaded or when under heavy load.
-
-To reduce its OS jitter, each softirq vector must be handled
-separately as follows:
-
-TIMER_SOFTIRQ
--------------
-
-Do all of the following:
-
-1.	To the extent possible, keep the CPU out of the kernel when it
-	is non-idle, for example, by avoiding system calls and by forcing
-	both kernel threads and interrupts to execute elsewhere.
-2.	Build with CONFIG_HOTPLUG_CPU=y.  After boot completes, force
-	the CPU offline, then bring it back online.  This forces
-	recurring timers to migrate elsewhere.	If you are concerned
-	with multiple CPUs, force them all offline before bringing the
-	first one back online.  Once you have onlined the CPUs in question,
-	do not offline any other CPUs, because doing so could force the
-	timer back onto one of the CPUs in question.
-
-NET_TX_SOFTIRQ and NET_RX_SOFTIRQ
----------------------------------
-
-Do all of the following:
-
-1.	Force networking interrupts onto other CPUs.
-2.	Initiate any network I/O on other CPUs.
-3.	Once your application has started, prevent CPU-hotplug operations
-	from being initiated from tasks that might run on the CPU to
-	be de-jittered.  (It is OK to force this CPU offline and then
-	bring it back online before you start your application.)
-
-BLOCK_SOFTIRQ
--------------
-
-Do all of the following:
-
-1.	Force block-device interrupts onto some other CPU.
-2.	Initiate any block I/O on other CPUs.
-3.	Once your application has started, prevent CPU-hotplug operations
-	from being initiated from tasks that might run on the CPU to
-	be de-jittered.  (It is OK to force this CPU offline and then
-	bring it back online before you start your application.)
-
-IRQ_POLL_SOFTIRQ
-----------------
-
-Do all of the following:
-
-1.	Force block-device interrupts onto some other CPU.
-2.	Initiate any block I/O and block-I/O polling on other CPUs.
-3.	Once your application has started, prevent CPU-hotplug operations
-	from being initiated from tasks that might run on the CPU to
-	be de-jittered.  (It is OK to force this CPU offline and then
-	bring it back online before you start your application.)
-
-TASKLET_SOFTIRQ
----------------
-
-Do one or more of the following:
-
-1.	Avoid use of drivers that use tasklets.  (Such drivers will contain
-	calls to things like tasklet_schedule().)
-2.	Convert all drivers that you must use from tasklets to workqueues.
-3.	Force interrupts for drivers using tasklets onto other CPUs,
-	and also do I/O involving these drivers on other CPUs.
-
-SCHED_SOFTIRQ
--------------
-
-Do all of the following:
-
-1.	Avoid sending scheduler IPIs to the CPU to be de-jittered,
-	for example, ensure that at most one runnable kthread is present
-	on that CPU.  If a thread that expects to run on the de-jittered
-	CPU awakens, the scheduler will send an IPI that can result in
-	a subsequent SCHED_SOFTIRQ.
-2.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered
-	is marked as an adaptive-ticks CPU using the "nohz_full="
-	boot parameter.  This reduces the number of scheduler-clock
-	interrupts that the de-jittered CPU receives, minimizing its
-	chances of being selected to do the load balancing work that
-	runs in SCHED_SOFTIRQ context.
-3.	To the extent possible, keep the CPU out of the kernel when it
-	is non-idle, for example, by avoiding system calls and by
-	forcing both kernel threads and interrupts to execute elsewhere.
-	This further reduces the number of scheduler-clock interrupts
-	received by the de-jittered CPU.
-
-HRTIMER_SOFTIRQ
----------------
-
-Do all of the following:
-
-1.	To the extent possible, keep the CPU out of the kernel when it
-	is non-idle.  For example, avoid system calls and force both
-	kernel threads and interrupts to execute elsewhere.
-2.	Build with CONFIG_HOTPLUG_CPU=y.  Once boot completes, force the
-	CPU offline, then bring it back online.  This forces recurring
-	timers to migrate elsewhere.  If you are concerned with multiple
-	CPUs, force them all offline before bringing the first one
-	back online.  Once you have onlined the CPUs in question, do not
-	offline any other CPUs, because doing so could force the timer
-	back onto one of the CPUs in question.
-
-RCU_SOFTIRQ
------------
-
-Do at least one of the following:
-
-1.	Offload callbacks and keep the CPU in either dyntick-idle or
-	adaptive-ticks state by doing all of the following:
-
-	a.	CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
-		de-jittered is marked as an adaptive-ticks CPU using the
-		"nohz_full=" boot parameter.  Bind the rcuo kthreads to
-		housekeeping CPUs, which can tolerate OS jitter.
-	b.	To the extent possible, keep the CPU out of the kernel
-		when it is non-idle, for example, by avoiding system
-		calls and by forcing both kernel threads and interrupts
-		to execute elsewhere.
-
-2.	Enable RCU to do its processing remotely via dyntick-idle by
-	doing all of the following:
-
-	a.	Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
-	b.	Ensure that the CPU goes idle frequently, allowing other
-		CPUs to detect that it has passed through an RCU quiescent
-		state.	If the kernel is built with CONFIG_NO_HZ_FULL=y,
-		userspace execution also allows other CPUs to detect that
-		the CPU in question has passed through a quiescent state.
-	c.	To the extent possible, keep the CPU out of the kernel
-		when it is non-idle, for example, by avoiding system
-		calls and by forcing both kernel threads and interrupts
-		to execute elsewhere.
-
-Name:
-  kworker/%u:%d%s (cpu, id, priority)
-
-Purpose:
-  Execute workqueue requests
-
-To reduce its OS jitter, do any of the following:
-
-1.	Run your workload at a real-time priority, which will allow
-	preempting the kworker daemons.
-2.	A given workqueue can be made visible in the sysfs filesystem
-	by passing the WQ_SYSFS to that workqueue's alloc_workqueue().
-	Such a workqueue can be confined to a given subset of the
-	CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
-	files.	The set of WQ_SYSFS workqueues can be displayed using
-	"ls sys/devices/virtual/workqueue".  That said, the workqueues
-	maintainer would like to caution people against indiscriminately
-	sprinkling WQ_SYSFS across all the workqueues.	The reason for
-	caution is that it is easy to add WQ_SYSFS, but because sysfs is
-	part of the formal user/kernel API, it can be nearly impossible
-	to remove it, even if its addition was a mistake.
-3.	Do any of the following needed to avoid jitter that your
-	application cannot tolerate:
-
-	a.	Build your kernel with CONFIG_SLUB=y rather than
-		CONFIG_SLAB=y, thus avoiding the slab allocator's periodic
-		use of each CPU's workqueues to run its cache_reap()
-		function.
-	b.	Avoid using oprofile, thus avoiding OS jitter from
-		wq_sync_buffer().
-	c.	Limit your CPU frequency so that a CPU-frequency
-		governor is not required, possibly enlisting the aid of
-		special heatsinks or other cooling technologies.  If done
-		correctly, and if you CPU architecture permits, you should
-		be able to build your kernel with CONFIG_CPU_FREQ=n to
-		avoid the CPU-frequency governor periodically running
-		on each CPU, including cs_dbs_timer() and od_dbs_timer().
-
-		WARNING:  Please check your CPU specifications to
-		make sure that this is safe on your particular system.
-	d.	As of v3.18, Christoph Lameter's on-demand vmstat workers
-		commit prevents OS jitter due to vmstat_update() on
-		CONFIG_SMP=y systems.  Before v3.18, is not possible
-		to entirely get rid of the OS jitter, but you can
-		decrease its frequency by writing a large value to
-		/proc/sys/vm/stat_interval.  The default value is HZ,
-		for an interval of one second.	Of course, larger values
-		will make your virtual-memory statistics update more
-		slowly.  Of course, you can also run your workload at
-		a real-time priority, thus preempting vmstat_update(),
-		but if your workload is CPU-bound, this is a bad idea.
-		However, there is an RFC patch from Christoph Lameter
-		(based on an earlier one from Gilad Ben-Yossef) that
-		reduces or even eliminates vmstat overhead for some
-		workloads at https://lkml.org/lkml/2013/9/4/379.
-	e.	Boot with "elevator=noop" to avoid workqueue use by
-		the block layer.
-	f.	If running on high-end powerpc servers, build with
-		CONFIG_PPC_RTAS_DAEMON=n.  This prevents the RTAS
-		daemon from running on each CPU every second or so.
-		(This will require editing Kconfig files and will defeat
-		this platform's RAS functionality.)  This avoids jitter
-		due to the rtas_event_scan() function.
-		WARNING:  Please check your CPU specifications to
-		make sure that this is safe on your particular system.
-	g.	If running on Cell Processor, build your kernel with
-		CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
-		spu_gov_work().
-		WARNING:  Please check your CPU specifications to
-		make sure that this is safe on your particular system.
-	h.	If running on PowerMAC, build your kernel with
-		CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
-		avoiding OS jitter from rackmeter_do_timer().
-
-Name:
-  rcuc/%u
-
-Purpose:
-  Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
-
-To reduce its OS jitter, do at least one of the following:
-
-1.	Build the kernel with CONFIG_PREEMPT=n.  This prevents these
-	kthreads from being created in the first place, and also obviates
-	the need for RCU priority boosting.  This approach is feasible
-	for workloads that do not require high degrees of responsiveness.
-2.	Build the kernel with CONFIG_RCU_BOOST=n.  This prevents these
-	kthreads from being created in the first place.  This approach
-	is feasible only if your workload never requires RCU priority
-	boosting, for example, if you ensure frequent idle time on all
-	CPUs that might execute within the kernel.
-3.	Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs=
-	boot parameter offloading RCU callbacks from all CPUs susceptible
-	to OS jitter.  This approach prevents the rcuc/%u kthreads from
-	having any work to do, so that they are never awakened.
-4.	Ensure that the CPU never enters the kernel, and, in particular,
-	avoid initiating any CPU hotplug operations on this CPU.  This is
-	another way of preventing any callbacks from being queued on the
-	CPU, again preventing the rcuc/%u kthreads from having any work
-	to do.
-
-Name:
-  rcuop/%d and rcuos/%d
-
-Purpose:
-  Offload RCU callbacks from the corresponding CPU.
-
-To reduce its OS jitter, do at least one of the following:
-
-1.	Use affinity, cgroups, or other mechanism to force these kthreads
-	to execute on some other CPU.
-2.	Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these
-	kthreads from being created in the first place.  However, please
-	note that this will not eliminate OS jitter, but will instead
-	shift it to RCU_SOFTIRQ.
-
-Name:
-  watchdog/%u
-
-Purpose:
-  Detect software lockups on each CPU.
-
-To reduce its OS jitter, do at least one of the following:
-
-1.	Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
-	kthreads from being created in the first place.
-2.	Boot with "nosoftlockup=0", which will also prevent these kthreads
-	from being created.  Other related watchdog and softlockup boot
-	parameters may be found in Documentation/admin-guide/kernel-parameters.rst
-	and Documentation/watchdog/watchdog-parameters.rst.
-3.	Echo a zero to /proc/sys/kernel/watchdog to disable the
-	watchdog timer.
-4.	Echo a large number of /proc/sys/kernel/watchdog_thresh in
-	order to reduce the frequency of OS jitter due to the watchdog
-	timer down to a level that is acceptable for your workload.
diff --git a/Documentation/ldm.txt b/Documentation/ldm.txt
deleted file mode 100644
index 12c571368e73..000000000000
--- a/Documentation/ldm.txt
+++ /dev/null
@@ -1,121 +0,0 @@
-==========================================
-LDM - Logical Disk Manager (Dynamic Disks)
-==========================================
-
-:Author: Originally Written by FlatCap - Richard Russon <ldm@flatcap.org>.
-:Last Updated: Anton Altaparmakov on 30 March 2007 for Windows Vista.
-
-Overview
---------
-
-Windows 2000, XP, and Vista use a new partitioning scheme.  It is a complete
-replacement for the MSDOS style partitions.  It stores its information in a
-1MiB journalled database at the end of the physical disk.  The size of
-partitions is limited only by disk space.  The maximum number of partitions is
-nearly 2000.
-
-Any partitions created under the LDM are called "Dynamic Disks".  There are no
-longer any primary or extended partitions.  Normal MSDOS style partitions are
-now known as Basic Disks.
-
-If you wish to use Spanned, Striped, Mirrored or RAID 5 Volumes, you must use
-Dynamic Disks.  The journalling allows Windows to make changes to these
-partitions and filesystems without the need to reboot.
-
-Once the LDM driver has divided up the disk, you can use the MD driver to
-assemble any multi-partition volumes, e.g.  Stripes, RAID5.
-
-To prevent legacy applications from repartitioning the disk, the LDM creates a
-dummy MSDOS partition containing one disk-sized partition.  This is what is
-supported with the Linux LDM driver.
-
-A newer approach that has been implemented with Vista is to put LDM on top of a
-GPT label disk.  This is not supported by the Linux LDM driver yet.
-
-
-Example
--------
-
-Below we have a 50MiB disk, divided into seven partitions.
-
-.. note::
-
-   The missing 1MiB at the end of the disk is where the LDM database is
-   stored.
-
-+-------++--------------+---------+-----++--------------+---------+----+
-|Device || Offset Bytes | Sectors | MiB || Size   Bytes | Sectors | MiB|
-+=======++==============+=========+=====++==============+=========+====+
-|hda    ||            0 |       0 |   0 ||     52428800 |  102400 |  50|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda1   ||     51380224 |  100352 |  49 ||      1048576 |    2048 |   1|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda2   ||        16384 |      32 |   0 ||      6979584 |   13632 |   6|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda3   ||      6995968 |   13664 |   6 ||     10485760 |   20480 |  10|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda4   ||     17481728 |   34144 |  16 ||      4194304 |    8192 |   4|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda5   ||     21676032 |   42336 |  20 ||      5242880 |   10240 |   5|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda6   ||     26918912 |   52576 |  25 ||     10485760 |   20480 |  10|
-+-------++--------------+---------+-----++--------------+---------+----+
-|hda7   ||     37404672 |   73056 |  35 ||     13959168 |   27264 |  13|
-+-------++--------------+---------+-----++--------------+---------+----+
-
-The LDM Database may not store the partitions in the order that they appear on
-disk, but the driver will sort them.
-
-When Linux boots, you will see something like::
-
-  hda: 102400 sectors w/32KiB Cache, CHS=50/64/32
-  hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7
-
-
-Compiling LDM Support
----------------------
-
-To enable LDM, choose the following two options: 
-
-  - "Advanced partition selection" CONFIG_PARTITION_ADVANCED
-  - "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION
-
-If you believe the driver isn't working as it should, you can enable the extra
-debugging code.  This will produce a LOT of output.  The option is:
-
-  - "Windows LDM extra logging" CONFIG_LDM_DEBUG
-
-N.B. The partition code cannot be compiled as a module.
-
-As with all the partition code, if the driver doesn't see signs of its type of
-partition, it will pass control to another driver, so there is no harm in
-enabling it.
-
-If you have Dynamic Disks but don't enable the driver, then all you will see
-is a dummy MSDOS partition filling the whole disk.  You won't be able to mount
-any of the volumes on the disk.
-
-
-Booting
--------
-
-If you enable LDM support, then lilo is capable of booting from any of the
-discovered partitions.  However, grub does not understand the LDM partitioning
-and cannot boot from a Dynamic Disk.
-
-
-More Documentation
-------------------
-
-There is an Overview of the LDM together with complete Technical Documentation.
-It is available for download.
-
-  http://www.linux-ntfs.org/
-
-If you have any LDM questions that aren't answered in the documentation, email
-me.
-
-Cheers,
-    FlatCap - Richard Russon
-    ldm@flatcap.org
-
diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt
deleted file mode 100644
index 290840c160af..000000000000
--- a/Documentation/lockup-watchdogs.txt
+++ /dev/null
@@ -1,83 +0,0 @@
-===============================================================
-Softlockup detector and hardlockup detector (aka nmi_watchdog)
-===============================================================
-
-The Linux kernel can act as a watchdog to detect both soft and hard
-lockups.
-
-A 'softlockup' is defined as a bug that causes the kernel to loop in
-kernel mode for more than 20 seconds (see "Implementation" below for
-details), without giving other tasks a chance to run. The current
-stack trace is displayed upon detection and, by default, the system
-will stay locked up. Alternatively, the kernel can be configured to
-panic; a sysctl, "kernel.softlockup_panic", a kernel parameter,
-"softlockup_panic" (see "Documentation/admin-guide/kernel-parameters.rst" for
-details), and a compile option, "BOOTPARAM_SOFTLOCKUP_PANIC", are
-provided for this.
-
-A 'hardlockup' is defined as a bug that causes the CPU to loop in
-kernel mode for more than 10 seconds (see "Implementation" below for
-details), without letting other interrupts have a chance to run.
-Similarly to the softlockup case, the current stack trace is displayed
-upon detection and the system will stay locked up unless the default
-behavior is changed, which can be done through a sysctl,
-'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC",
-and a kernel parameter, "nmi_watchdog"
-(see "Documentation/admin-guide/kernel-parameters.rst" for details).
-
-The panic option can be used in combination with panic_timeout (this
-timeout is set through the confusingly named "kernel.panic" sysctl),
-to cause the system to reboot automatically after a specified amount
-of time.
-
-Implementation
-==============
-
-The soft and hard lockup detectors are built on top of the hrtimer and
-perf subsystems, respectively. A direct consequence of this is that,
-in principle, they should work in any architecture where these
-subsystems are present.
-
-A periodic hrtimer runs to generate interrupts and kick the watchdog
-task. An NMI perf event is generated every "watchdog_thresh"
-(compile-time initialized to 10 and configurable through sysctl of the
-same name) seconds to check for hardlockups. If any CPU in the system
-does not receive any hrtimer interrupt during that time the
-'hardlockup detector' (the handler for the NMI perf event) will
-generate a kernel warning or call panic, depending on the
-configuration.
-
-The watchdog task is a high priority kernel thread that updates a
-timestamp every time it is scheduled. If that timestamp is not updated
-for 2*watchdog_thresh seconds (the softlockup threshold) the
-'softlockup detector' (coded inside the hrtimer callback function)
-will dump useful debug information to the system log, after which it
-will call panic if it was instructed to do so or resume execution of
-other kernel code.
-
-The period of the hrtimer is 2*watchdog_thresh/5, which means it has
-two or three chances to generate an interrupt before the hardlockup
-detector kicks in.
-
-As explained above, a kernel knob is provided that allows
-administrators to configure the period of the hrtimer and the perf
-event. The right value for a particular environment is a trade-off
-between fast response to lockups and detection overhead.
-
-By default, the watchdog runs on all online cores.  However, on a
-kernel configured with NO_HZ_FULL, by default the watchdog runs only
-on the housekeeping cores, not the cores specified in the "nohz_full"
-boot argument.  If we allowed the watchdog to run by default on
-the "nohz_full" cores, we would have to run timer ticks to activate
-the scheduler, which would prevent the "nohz_full" functionality
-from protecting the user code on those cores from the kernel.
-Of course, disabling it by default on the nohz_full cores means that
-when those cores do enter the kernel, by default we will not be
-able to detect if they lock up.  However, allowing the watchdog
-to continue to run on the housekeeping (non-tickless) cores means
-that we will continue to detect lockups properly on those cores.
-
-In either case, the set of cores excluded from running the watchdog
-may be adjusted via the kernel.watchdog_cpumask sysctl.  For
-nohz_full cores, this may be useful for debugging a case where the
-kernel seems to be hanging on the nohz_full cores.
diff --git a/Documentation/numastat.txt b/Documentation/numastat.txt
deleted file mode 100644
index aaf1667489f8..000000000000
--- a/Documentation/numastat.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-===============================
-Numa policy hit/miss statistics
-===============================
-
-/sys/devices/system/node/node*/numastat
-
-All units are pages. Hugepages have separate counters.
-
-=============== ============================================================
-numa_hit	A process wanted to allocate memory from this node,
-		and succeeded.
-
-numa_miss	A process wanted to allocate memory from another node,
-		but ended up with memory from this node.
-
-numa_foreign	A process wanted to allocate on this node,
-		but ended up with memory from another one.
-
-local_node	A process ran on this node and got memory from it.
-
-other_node	A process ran on this node and got memory from another node.
-
-interleave_hit 	Interleaving wanted to allocate from this node
-		and succeeded.
-=============== ============================================================
-
-For easier reading you can use the numastat utility from the numactl package
-(http://oss.sgi.com/projects/libnuma/). Note that it only works
-well right now on machines with a small number of CPUs.
-
diff --git a/Documentation/pnp.txt b/Documentation/pnp.txt
deleted file mode 100644
index bab2d10631f0..000000000000
--- a/Documentation/pnp.txt
+++ /dev/null
@@ -1,292 +0,0 @@
-=================================
-Linux Plug and Play Documentation
-=================================
-
-:Author: Adam Belay <ambx1@neo.rr.com>
-:Last updated: Oct. 16, 2002
-
-
-Overview
---------
-
-Plug and Play provides a means of detecting and setting resources for legacy or
-otherwise unconfigurable devices.  The Linux Plug and Play Layer provides these 
-services to compatible drivers.
-
-
-The User Interface
-------------------
-
-The Linux Plug and Play user interface provides a means to activate PnP devices
-for legacy and user level drivers that do not support Linux Plug and Play.  The 
-user interface is integrated into sysfs.
-
-In addition to the standard sysfs file the following are created in each
-device's directory:
-- id - displays a list of support EISA IDs
-- options - displays possible resource configurations
-- resources - displays currently allocated resources and allows resource changes
-
-activating a device
-^^^^^^^^^^^^^^^^^^^
-
-::
-
-	# echo "auto" > resources
-
-this will invoke the automatic resource config system to activate the device
-
-manually activating a device
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-::
-
-	# echo "manual <depnum> <mode>" > resources
-
-	<depnum> - the configuration number
-	<mode> - static or dynamic
-		 static = for next boot
-		 dynamic = now
-
-disabling a device
-^^^^^^^^^^^^^^^^^^
-
-::
-
-	# echo "disable" > resources
-
-
-EXAMPLE:
-
-Suppose you need to activate the floppy disk controller.
-
-1. change to the proper directory, in my case it is
-   /driver/bus/pnp/devices/00:0f::
-
-	# cd /driver/bus/pnp/devices/00:0f
-	# cat name
-	PC standard floppy disk controller
-
-2. check if the device is already active::
-
-	# cat resources
-	DISABLED
-
-  - Notice the string "DISABLED".  This means the device is not active.
-
-3. check the device's possible configurations (optional)::
-
-	# cat options
-	Dependent: 01 - Priority acceptable
-	    port 0x3f0-0x3f0, align 0x7, size 0x6, 16-bit address decoding
-	    port 0x3f7-0x3f7, align 0x0, size 0x1, 16-bit address decoding
-	    irq 6
-	    dma 2 8-bit compatible
-	Dependent: 02 - Priority acceptable
-	    port 0x370-0x370, align 0x7, size 0x6, 16-bit address decoding
-	    port 0x377-0x377, align 0x0, size 0x1, 16-bit address decoding
-	    irq 6
-	    dma 2 8-bit compatible
-
-4. now activate the device::
-
-	# echo "auto" > resources
-
-5. finally check if the device is active::
-
-	# cat resources
-	io 0x3f0-0x3f5
-	io 0x3f7-0x3f7
-	irq 6
-	dma 2
-
-also there are a series of kernel parameters::
-
-	pnp_reserve_irq=irq1[,irq2] ....
-	pnp_reserve_dma=dma1[,dma2] ....
-	pnp_reserve_io=io1,size1[,io2,size2] ....
-	pnp_reserve_mem=mem1,size1[,mem2,size2] ....
-
-
-
-The Unified Plug and Play Layer
--------------------------------
-
-All Plug and Play drivers, protocols, and services meet at a central location
-called the Plug and Play Layer.  This layer is responsible for the exchange of 
-information between PnP drivers and PnP protocols.  Thus it automatically 
-forwards commands to the proper protocol.  This makes writing PnP drivers 
-significantly easier.
-
-The following functions are available from the Plug and Play Layer:
-
-pnp_get_protocol
-  increments the number of uses by one
-
-pnp_put_protocol
-  deincrements the number of uses by one
-
-pnp_register_protocol
-  use this to register a new PnP protocol
-
-pnp_unregister_protocol
-  use this function to remove a PnP protocol from the Plug and Play Layer
-
-pnp_register_driver
-  adds a PnP driver to the Plug and Play Layer
-
-  this includes driver model integration
-  returns zero for success or a negative error number for failure; count
-  calls to the .add() method if you need to know how many devices bind to
-  the driver
-
-pnp_unregister_driver
-  removes a PnP driver from the Plug and Play Layer
-
-
-
-Plug and Play Protocols
------------------------
-
-This section contains information for PnP protocol developers.
-
-The following Protocols are currently available in the computing world:
-
-- PNPBIOS:
-    used for system devices such as serial and parallel ports.
-- ISAPNP:
-    provides PnP support for the ISA bus
-- ACPI:
-    among its many uses, ACPI provides information about system level
-    devices.
-
-It is meant to replace the PNPBIOS.  It is not currently supported by Linux
-Plug and Play but it is planned to be in the near future.
-
-
-Requirements for a Linux PnP protocol:
-1. the protocol must use EISA IDs
-2. the protocol must inform the PnP Layer of a device's current configuration
-
-- the ability to set resources is optional but preferred.
-
-The following are PnP protocol related functions:
-
-pnp_add_device
-  use this function to add a PnP device to the PnP layer
-
-  only call this function when all wanted values are set in the pnp_dev
-  structure
-
-pnp_init_device
-  call this to initialize the PnP structure
-
-pnp_remove_device
-  call this to remove a device from the Plug and Play Layer.
-  it will fail if the device is still in use.
-  automatically will free mem used by the device and related structures
-
-pnp_add_id
-  adds an EISA ID to the list of supported IDs for the specified device
-
-For more information consult the source of a protocol such as
-/drivers/pnp/pnpbios/core.c.
-
-
-
-Linux Plug and Play Drivers
----------------------------
-
-This section contains information for Linux PnP driver developers.
-
-The New Way
-^^^^^^^^^^^
-
-1. first make a list of supported EISA IDS
-
-   ex::
-
-	static const struct pnp_id pnp_dev_table[] = {
-		/* Standard LPT Printer Port */
-		{.id = "PNP0400", .driver_data = 0},
-		/* ECP Printer Port */
-		{.id = "PNP0401", .driver_data = 0},
-		{.id = ""}
-	};
-
-   Please note that the character 'X' can be used as a wild card in the function
-   portion (last four characters).
-
-   ex::
-
-	/* Unknown PnP modems */
-	{	"PNPCXXX",		UNKNOWN_DEV	},
-
-   Supported PnP card IDs can optionally be defined.
-   ex::
-
-	static const struct pnp_id pnp_card_table[] = {
-		{	"ANYDEVS",		0	},
-		{	"",			0	}
-	};
-
-2. Optionally define probe and remove functions.  It may make sense not to
-   define these functions if the driver already has a reliable method of detecting
-   the resources, such as the parport_pc driver.
-
-   ex::
-
-	static int
-	serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const
-			struct pnp_id *dev_id)
-	{
-	. . .
-
-   ex::
-
-	static void serial_pnp_remove(struct pnp_dev * dev)
-	{
-	. . .
-
-   consult /drivers/serial/8250_pnp.c for more information.
-
-3. create a driver structure
-
-   ex::
-
-	static struct pnp_driver serial_pnp_driver = {
-		.name		= "serial",
-		.card_id_table	= pnp_card_table,
-		.id_table	= pnp_dev_table,
-		.probe		= serial_pnp_probe,
-		.remove		= serial_pnp_remove,
-	};
-
-   * name and id_table cannot be NULL.
-
-4. register the driver
-
-   ex::
-
-	static int __init serial8250_pnp_init(void)
-	{
-		return pnp_register_driver(&serial_pnp_driver);
-	}
-
-The Old Way
-^^^^^^^^^^^
-
-A series of compatibility functions have been created to make it easy to convert
-ISAPNP drivers.  They should serve as a temporary solution only.
-
-They are as follows::
-
-	struct pnp_card *pnp_find_card(unsigned short vendor,
-				       unsigned short device,
-				       struct pnp_card *from)
-
-	struct pnp_dev *pnp_find_dev(struct pnp_card *card,
-				     unsigned short vendor,
-				     unsigned short function,
-				     struct pnp_dev *from)
-
diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt
deleted file mode 100644
index 688c95b11919..000000000000
--- a/Documentation/rtc.txt
+++ /dev/null
@@ -1,140 +0,0 @@
-=======================================
-Real Time Clock (RTC) Drivers for Linux
-=======================================
-
-When Linux developers talk about a "Real Time Clock", they usually mean
-something that tracks wall clock time and is battery backed so that it
-works even with system power off.  Such clocks will normally not track
-the local time zone or daylight savings time -- unless they dual boot
-with MS-Windows -- but will instead be set to Coordinated Universal Time
-(UTC, formerly "Greenwich Mean Time").
-
-The newest non-PC hardware tends to just count seconds, like the time(2)
-system call reports, but RTCs also very commonly represent time using
-the Gregorian calendar and 24 hour time, as reported by gmtime(3).
-
-Linux has two largely-compatible userspace RTC API families you may
-need to know about:
-
-    *	/dev/rtc ... is the RTC provided by PC compatible systems,
-	so it's not very portable to non-x86 systems.
-
-    *	/dev/rtc0, /dev/rtc1 ... are part of a framework that's
-	supported by a wide variety of RTC chips on all systems.
-
-Programmers need to understand that the PC/AT functionality is not
-always available, and some systems can do much more.  That is, the
-RTCs use the same API to make requests in both RTC frameworks (using
-different filenames of course), but the hardware may not offer the
-same functionality.  For example, not every RTC is hooked up to an
-IRQ, so they can't all issue alarms; and where standard PC RTCs can
-only issue an alarm up to 24 hours in the future, other hardware may
-be able to schedule one any time in the upcoming century.
-
-
-Old PC/AT-Compatible driver:  /dev/rtc
---------------------------------------
-
-All PCs (even Alpha machines) have a Real Time Clock built into them.
-Usually they are built into the chipset of the computer, but some may
-actually have a Motorola MC146818 (or clone) on the board. This is the
-clock that keeps the date and time while your computer is turned off.
-
-ACPI has standardized that MC146818 functionality, and extended it in
-a few ways (enabling longer alarm periods, and wake-from-hibernate).
-That functionality is NOT exposed in the old driver.
-
-However it can also be used to generate signals from a slow 2Hz to a
-relatively fast 8192Hz, in increments of powers of two. These signals
-are reported by interrupt number 8. (Oh! So *that* is what IRQ 8 is
-for...) It can also function as a 24hr alarm, raising IRQ 8 when the
-alarm goes off. The alarm can also be programmed to only check any
-subset of the three programmable values, meaning that it could be set to
-ring on the 30th second of the 30th minute of every hour, for example.
-The clock can also be set to generate an interrupt upon every clock
-update, thus generating a 1Hz signal.
-
-The interrupts are reported via /dev/rtc (major 10, minor 135, read only
-character device) in the form of an unsigned long. The low byte contains
-the type of interrupt (update-done, alarm-rang, or periodic) that was
-raised, and the remaining bytes contain the number of interrupts since
-the last read.  Status information is reported through the pseudo-file
-/proc/driver/rtc if the /proc filesystem was enabled.  The driver has
-built in locking so that only one process is allowed to have the /dev/rtc
-interface open at a time.
-
-A user process can monitor these interrupts by doing a read(2) or a
-select(2) on /dev/rtc -- either will block/stop the user process until
-the next interrupt is received. This is useful for things like
-reasonably high frequency data acquisition where one doesn't want to
-burn up 100% CPU by polling gettimeofday etc. etc.
-
-At high frequencies, or under high loads, the user process should check
-the number of interrupts received since the last read to determine if
-there has been any interrupt "pileup" so to speak. Just for reference, a
-typical 486-33 running a tight read loop on /dev/rtc will start to suffer
-occasional interrupt pileup (i.e. > 1 IRQ event since last read) for
-frequencies above 1024Hz. So you really should check the high bytes
-of the value you read, especially at frequencies above that of the
-normal timer interrupt, which is 100Hz.
-
-Programming and/or enabling interrupt frequencies greater than 64Hz is
-only allowed by root. This is perhaps a bit conservative, but we don't want
-an evil user generating lots of IRQs on a slow 386sx-16, where it might have
-a negative impact on performance. This 64Hz limit can be changed by writing
-a different value to /proc/sys/dev/rtc/max-user-freq. Note that the
-interrupt handler is only a few lines of code to minimize any possibility
-of this effect.
-
-Also, if the kernel time is synchronized with an external source, the 
-kernel will write the time back to the CMOS clock every 11 minutes. In 
-the process of doing this, the kernel briefly turns off RTC periodic 
-interrupts, so be aware of this if you are doing serious work. If you
-don't synchronize the kernel time with an external source (via ntp or
-whatever) then the kernel will keep its hands off the RTC, allowing you
-exclusive access to the device for your applications.
-
-The alarm and/or interrupt frequency are programmed into the RTC via
-various ioctl(2) calls as listed in ./include/linux/rtc.h
-Rather than write 50 pages describing the ioctl() and so on, it is
-perhaps more useful to include a small test program that demonstrates
-how to use them, and demonstrates the features of the driver. This is
-probably a lot more useful to people interested in writing applications
-that will be using this driver.  See the code at the end of this document.
-
-(The original /dev/rtc driver was written by Paul Gortmaker.)
-
-
-New portable "RTC Class" drivers:  /dev/rtcN
---------------------------------------------
-
-Because Linux supports many non-ACPI and non-PC platforms, some of which
-have more than one RTC style clock, it needed a more portable solution
-than expecting a single battery-backed MC146818 clone on every system.
-Accordingly, a new "RTC Class" framework has been defined.  It offers
-three different userspace interfaces:
-
-    *	/dev/rtcN ... much the same as the older /dev/rtc interface
-
-    *	/sys/class/rtc/rtcN ... sysfs attributes support readonly
-	access to some RTC attributes.
-
-    *	/proc/driver/rtc ... the system clock RTC may expose itself
-	using a procfs interface. If there is no RTC for the system clock,
-	rtc0 is used by default. More information is (currently) shown
-	here than through sysfs.
-
-The RTC Class framework supports a wide variety of RTCs, ranging from those
-integrated into embeddable system-on-chip (SOC) processors to discrete chips
-using I2C, SPI, or some other bus to communicate with the host CPU.  There's
-even support for PC-style RTCs ... including the features exposed on newer PCs
-through ACPI.
-
-The new framework also removes the "one RTC per system" restriction.  For
-example, maybe the low-power battery-backed RTC is a discrete I2C chip, but
-a high functionality RTC is integrated into the SOC.  That system might read
-the system clock from the discrete RTC, but use the integrated one for all
-other tasks, because of its greater functionality.
-
-Check out tools/testing/selftests/rtc/rtctest.c for an example usage of the
-ioctl interface.
diff --git a/Documentation/svga.txt b/Documentation/svga.txt
deleted file mode 100644
index b6c2f9acca92..000000000000
--- a/Documentation/svga.txt
+++ /dev/null
@@ -1,249 +0,0 @@
-.. include:: <isonum.txt>
-
-=================================
-Video Mode Selection Support 2.13
-=================================
-
-:Copyright: |copy| 1995--1999 Martin Mares, <mj@ucw.cz>
-
-Intro
-~~~~~
-
-This small document describes the "Video Mode Selection" feature which
-allows the use of various special video modes supported by the video BIOS. Due
-to usage of the BIOS, the selection is limited to boot time (before the
-kernel decompression starts) and works only on 80X86 machines.
-
-.. note::
-
-   Short intro for the impatient: Just use vga=ask for the first time,
-   enter ``scan`` on the video mode prompt, pick the mode you want to use,
-   remember its mode ID (the four-digit hexadecimal number) and then
-   set the vga parameter to this number (converted to decimal first).
-
-The video mode to be used is selected by a kernel parameter which can be
-specified in the kernel Makefile (the SVGA_MODE=... line) or by the "vga=..."
-option of LILO (or some other boot loader you use) or by the "vidmode" utility
-(present in standard Linux utility packages). You can use the following values
-of this parameter::
-
-   NORMAL_VGA - Standard 80x25 mode available on all display adapters.
-
-   EXTENDED_VGA	- Standard 8-pixel font mode: 80x43 on EGA, 80x50 on VGA.
-
-   ASK_VGA - Display a video mode menu upon startup (see below).
-
-   0..35 - Menu item number (when you have used the menu to view the list of
-      modes available on your adapter, you can specify the menu item you want
-      to use). 0..9 correspond to "0".."9", 10..35 to "a".."z". Warning: the
-      mode list displayed may vary as the kernel version changes, because the
-      modes are listed in a "first detected -- first displayed" manner. It's
-      better to use absolute mode numbers instead.
-
-   0x.... - Hexadecimal video mode ID (also displayed on the menu, see below
-      for exact meaning of the ID). Warning: rdev and LILO don't support
-      hexadecimal numbers -- you have to convert it to decimal manually.
-
-Menu
-~~~~
-
-The ASK_VGA mode causes the kernel to offer a video mode menu upon
-bootup. It displays a "Press <RETURN> to see video modes available, <SPACE>
-to continue or wait 30 secs" message. If you press <RETURN>, you enter the
-menu, if you press <SPACE> or wait 30 seconds, the kernel will boot up in
-the standard 80x25 mode.
-
-The menu looks like::
-
-	Video adapter: <name-of-detected-video-adapter>
-	Mode:    COLSxROWS:
-	0  0F00  80x25
-	1  0F01  80x50
-	2  0F02  80x43
-	3  0F03  80x26
-	....
-	Enter mode number or ``scan``: <flashing-cursor-here>
-
-<name-of-detected-video-adapter> tells what video adapter did Linux detect
--- it's either a generic adapter name (MDA, CGA, HGC, EGA, VGA, VESA VGA [a VGA
-with VESA-compliant BIOS]) or a chipset name (e.g., Trident). Direct detection
-of chipsets is turned off by default as it's inherently unreliable due to
-absolutely insane PC design.
-
-"0  0F00  80x25" means that the first menu item (the menu items are numbered
-from "0" to "9" and from "a" to "z") is a 80x25 mode with ID=0x0f00 (see the
-next section for a description of mode IDs).
-
-<flashing-cursor-here> encourages you to enter the item number or mode ID
-you wish to set and press <RETURN>. If the computer complains something about
-"Unknown mode ID", it is trying to tell you that it isn't possible to set such
-a mode. It's also possible to press only <RETURN> which leaves the current mode.
-
-The mode list usually contains a few basic modes and some VESA modes.  In
-case your chipset has been detected, some chipset-specific modes are shown as
-well (some of these might be missing or unusable on your machine as different
-BIOSes are often shipped with the same card and the mode numbers depend purely
-on the VGA BIOS).
-
-The modes displayed on the menu are partially sorted: The list starts with
-the standard modes (80x25 and 80x50) followed by "special" modes (80x28 and
-80x43), local modes (if the local modes feature is enabled), VESA modes and
-finally SVGA modes for the auto-detected adapter.
-
-If you are not happy with the mode list offered (e.g., if you think your card
-is able to do more), you can enter "scan" instead of item number / mode ID.  The
-program will try to ask the BIOS for all possible video mode numbers and test
-what happens then. The screen will be probably flashing wildly for some time and
-strange noises will be heard from inside the monitor and so on and then, really
-all consistent video modes supported by your BIOS will appear (plus maybe some
-``ghost modes``). If you are afraid this could damage your monitor, don't use
-this function.
-
-After scanning, the mode ordering is a bit different: the auto-detected SVGA
-modes are not listed at all and the modes revealed by ``scan`` are shown before
-all VESA modes.
-
-Mode IDs
-~~~~~~~~
-
-Because of the complexity of all the video stuff, the video mode IDs
-used here are also a bit complex. A video mode ID is a 16-bit number usually
-expressed in a hexadecimal notation (starting with "0x"). You can set a mode
-by entering its mode directly if you know it even if it isn't shown on the menu.
-
-The ID numbers can be divided to those regions::
-
-   0x0000 to 0x00ff - menu item references. 0x0000 is the first item. Don't use
-	outside the menu as this can change from boot to boot (especially if you
-	have used the ``scan`` feature).
-
-   0x0100 to 0x017f - standard BIOS modes. The ID is a BIOS video mode number
-	(as presented to INT 10, function 00) increased by 0x0100.
-
-   0x0200 to 0x08ff - VESA BIOS modes. The ID is a VESA mode ID increased by
-	0x0100. All VESA modes should be autodetected and shown on the menu.
-
-   0x0900 to 0x09ff - Video7 special modes. Set by calling INT 0x10, AX=0x6f05.
-	(Usually 940=80x43, 941=132x25, 942=132x44, 943=80x60, 944=100x60,
-	945=132x28 for the standard Video7 BIOS)
-
-   0x0f00 to 0x0fff - special modes (they are set by various tricks -- usually
-	by modifying one of the standard modes). Currently available:
-	0x0f00	standard 80x25, don't reset mode if already set (=FFFF)
-	0x0f01	standard with 8-point font: 80x43 on EGA, 80x50 on VGA
-	0x0f02	VGA 80x43 (VGA switched to 350 scanlines with a 8-point font)
-	0x0f03	VGA 80x28 (standard VGA scans, but 14-point font)
-	0x0f04	leave current video mode
-	0x0f05	VGA 80x30 (480 scans, 16-point font)
-	0x0f06	VGA 80x34 (480 scans, 14-point font)
-	0x0f07	VGA 80x60 (480 scans, 8-point font)
-	0x0f08	Graphics hack (see the VIDEO_GFX_HACK paragraph below)
-
-   0x1000 to 0x7fff - modes specified by resolution. The code has a "0xRRCC"
-	form where RR is a number of rows and CC is a number of columns.
-	E.g., 0x1950 corresponds to a 80x25 mode, 0x2b84 to 132x43 etc.
-	This is the only fully portable way to refer to a non-standard mode,
-	but it relies on the mode being found and displayed on the menu
-	(remember that mode scanning is not done automatically).
-
-   0xff00 to 0xffff - aliases for backward compatibility:
-	0xffff	equivalent to 0x0f00 (standard 80x25)
-	0xfffe	equivalent to 0x0f01 (EGA 80x43 or VGA 80x50)
-
-If you add 0x8000 to the mode ID, the program will try to recalculate
-vertical display timing according to mode parameters, which can be used to
-eliminate some annoying bugs of certain VGA BIOSes (usually those used for
-cards with S3 chipsets and old Cirrus Logic BIOSes) -- mainly extra lines at the
-end of the display.
-
-Options
-~~~~~~~
-
-Build options for arch/x86/boot/* are selected by the kernel kconfig
-utility and the kernel .config file.
-
-VIDEO_GFX_HACK - includes special hack for setting of graphics modes
-to be used later by special drivers.
-Allows to set _any_ BIOS mode including graphic ones and forcing specific
-text screen resolution instead of peeking it from BIOS variables. Don't use
-unless you think you know what you're doing. To activate this setup, use
-mode number 0x0f08 (see the Mode IDs section above).
-
-Still doesn't work?
-~~~~~~~~~~~~~~~~~~~
-
-When the mode detection doesn't work (e.g., the mode list is incorrect or
-the machine hangs instead of displaying the menu), try to switch off some of
-the configuration options listed under "Options". If it fails, you can still use
-your kernel with the video mode set directly via the kernel parameter.
-
-In either case, please send me a bug report containing what _exactly_
-happens and how do the configuration switches affect the behaviour of the bug.
-
-If you start Linux from M$-DOS, you might also use some DOS tools for
-video mode setting. In this case, you must specify the 0x0f04 mode ("leave
-current settings") to Linux, because if you don't and you use any non-standard
-mode, Linux will switch to 80x25 automatically.
-
-If you set some extended mode and there's one or more extra lines on the
-bottom of the display containing already scrolled-out text, your VGA BIOS
-contains the most common video BIOS bug called "incorrect vertical display
-end setting". Adding 0x8000 to the mode ID might fix the problem. Unfortunately,
-this must be done manually -- no autodetection mechanisms are available.
-
-History
-~~~~~~~
-
-=============== ================================================================
-1.0 (??-Nov-95)	First version supporting all adapters supported by the old
-		setup.S + Cirrus Logic 54XX. Present in some 1.3.4? kernels
-		and then removed due to instability on some machines.
-2.0 (28-Jan-96)	Rewritten from scratch. Cirrus Logic 64XX support added, almost
-		everything is configurable, the VESA support should be much more
-		stable, explicit mode numbering allowed, "scan" implemented etc.
-2.1 (30-Jan-96) VESA modes moved to 0x200-0x3ff. Mode selection by resolution
-		supported. Few bugs fixed. VESA modes are listed prior to
-		modes supplied by SVGA autodetection as they are more reliable.
-		CLGD autodetect works better. Doesn't depend on 80x25 being
-		active when started. Scanning fixed. 80x43 (any VGA) added.
-		Code cleaned up.
-2.2 (01-Feb-96)	EGA 80x43 fixed. VESA extended to 0x200-0x4ff (non-standard 02XX
-		VESA modes work now). Display end bug workaround supported.
-		Special modes renumbered to allow adding of the "recalculate"
-		flag, 0xffff and 0xfffe became aliases instead of real IDs.
-		Screen contents retained during mode changes.
-2.3 (15-Mar-96)	Changed to work with 1.3.74 kernel.
-2.4 (18-Mar-96)	Added patches by Hans Lermen fixing a memory overwrite problem
-		with some boot loaders. Memory management rewritten to reflect
-		these changes. Unfortunately, screen contents retaining works
-		only with some loaders now.
-		Added a Tseng 132x60 mode.
-2.5 (19-Mar-96)	Fixed a VESA mode scanning bug introduced in 2.4.
-2.6 (25-Mar-96)	Some VESA BIOS errors not reported -- it fixes error reports on
-		several cards with broken VESA code (e.g., ATI VGA).
-2.7 (09-Apr-96)	- Accepted all VESA modes in range 0x100 to 0x7ff, because some
-		  cards use very strange mode numbers.
-		- Added Realtek VGA modes (thanks to Gonzalo Tornaria).
-		- Hardware testing order slightly changed, tests based on ROM
-		  contents done as first.
-		- Added support for special Video7 mode switching functions
-		  (thanks to Tom Vander Aa).
-		- Added 480-scanline modes (especially useful for notebooks,
-		  original version written by hhanemaa@cs.ruu.nl, patched by
-		  Jeff Chua, rewritten by me).
-		- Screen store/restore fixed.
-2.8 (14-Apr-96) - Previous release was not compilable without CONFIG_VIDEO_SVGA.
-		- Better recognition of text modes during mode scan.
-2.9 (12-May-96)	- Ignored VESA modes 0x80 - 0xff (more VESA BIOS bugs!)
-2.10(11-Nov-96) - The whole thing made optional.
-		- Added the CONFIG_VIDEO_400_HACK switch.
-		- Added the CONFIG_VIDEO_GFX_HACK switch.
-		- Code cleanup.
-2.11(03-May-97) - Yet another cleanup, now including also the documentation.
-		- Direct testing of SVGA adapters turned off by default, ``scan``
-		  offered explicitly on the prompt line.
-		- Removed the doc section describing adding of new probing
-		  functions as I try to get rid of _all_ hardware probing here.
-2.12(25-May-98) Added support for VESA frame buffer graphics.
-2.13(14-May-99) Minor documentation fixes.
-=============== ================================================================
diff --git a/Documentation/video-output.txt b/Documentation/video-output.txt
deleted file mode 100644
index 56d6fa2e2368..000000000000
--- a/Documentation/video-output.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-Video Output Switcher Control
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-2006 luming.yu@intel.com
-
-The output sysfs class driver provides an abstract video output layer that
-can be used to hook platform specific methods to enable/disable video output
-device through common sysfs interface. For example, on my IBM ThinkPad T42
-laptop, The ACPI video driver registered its output devices and read/write
-method for 'state' with output sysfs class. The user interface under sysfs is::
-
-  linux:/sys/class/video_output # tree .
-  .
-  |-- CRT0
-  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
-  |   |-- state
-  |   |-- subsystem -> ../../../class/video_output
-  |   `-- uevent
-  |-- DVI0
-  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
-  |   |-- state
-  |   |-- subsystem -> ../../../class/video_output
-  |   `-- uevent
-  |-- LCD0
-  |   |-- device -> ../../../devices/pci0000:00/0000:00:01.0
-  |   |-- state
-  |   |-- subsystem -> ../../../class/video_output
-  |   `-- uevent
-  `-- TV0
-     |-- device -> ../../../devices/pci0000:00/0000:00:01.0
-     |-- state
-     |-- subsystem -> ../../../class/video_output
-     `-- uevent
-
diff --git a/Documentation/x86/topology.rst b/Documentation/x86/topology.rst
index 8e9704f61017..e29739904e37 100644
--- a/Documentation/x86/topology.rst
+++ b/Documentation/x86/topology.rst
@@ -9,7 +9,7 @@ representation in the kernel. Update/change when doing changes to the
 respective code.
 
 The architecture-agnostic topology definitions are in
-Documentation/cputopology.txt. This file holds x86-specific
+Documentation/admin-guide/cputopology.rst. This file holds x86-specific
 differences/specialities which must not necessarily apply to the generic
 definitions. Thus, the way to read up on Linux topology on x86 is to start
 with the generic one and look at this one in parallel for the x86 specifics.
diff --git a/MAINTAINERS b/MAINTAINERS
index c1593a668f80..570572627fd1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6080,7 +6080,7 @@ M:	Ard Biesheuvel <ard.biesheuvel@linaro.org>
 L:	linux-efi@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git
 S:	Maintained
-F:	Documentation/efi-stub.txt
+F:	Documentation/admin-guide/efi-stub.rst
 F:	arch/*/kernel/efi.c
 F:	arch/x86/boot/compressed/eboot.[ch]
 F:	arch/*/include/asm/efi.h
@@ -7088,7 +7088,7 @@ M:	Herbert Xu <herbert@gondor.apana.org.au>
 L:	linux-crypto@vger.kernel.org
 S:	Odd fixes
 F:	Documentation/devicetree/bindings/rng/
-F:	Documentation/hw_random.txt
+F:	Documentation/admin-guide/hw_random.rst
 F:	drivers/char/hw_random/
 F:	include/linux/hw_random.h
 
@@ -9398,7 +9398,7 @@ M:	"Richard Russon (FlatCap)" <ldm@flatcap.org>
 L:	linux-ntfs-dev@lists.sourceforge.net
 W:	http://www.linux-ntfs.org/content/view/19/37/
 S:	Maintained
-F:	Documentation/ldm.txt
+F:	Documentation/admin-guide/ldm.rst
 F:	block/partitions/ldm.*
 
 LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI)
@@ -12058,7 +12058,7 @@ PARALLEL LCD/KEYPAD PANEL DRIVER
 M:	Willy Tarreau <willy@haproxy.com>
 M:	Ksenija Stanojevic <ksenija.stanojevic@gmail.com>
 S:	Odd Fixes
-F:	Documentation/auxdisplay/lcd-panel-cgram.rst
+F:	Documentation/admin-guide/lcd-panel-cgram.rst
 F:	drivers/auxdisplay/panel.c
 
 PARALLEL PORT SUBSYSTEM
@@ -13476,7 +13476,7 @@ Q:	http://patchwork.ozlabs.org/project/rtc-linux/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git
 S:	Maintained
 F:	Documentation/devicetree/bindings/rtc/
-F:	Documentation/rtc.txt
+F:	Documentation/admin-guide/rtc.rst
 F:	drivers/rtc/
 F:	include/linux/rtc.h
 F:	include/uapi/linux/rtc.h
@@ -15306,7 +15306,7 @@ SVGA HANDLING
 M:	Martin Mares <mj@ucw.cz>
 L:	linux-video@atrey.karlin.mff.cuni.cz
 S:	Maintained
-F:	Documentation/svga.txt
+F:	Documentation/admin-guide/svga.rst
 F:	arch/x86/boot/video*
 
 SWIOTLB SUBSYSTEM
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 20afd6077465..600c5ba1af41 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1297,7 +1297,7 @@ config SMP
 	  will run faster if you say N here.
 
 	  See also <file:Documentation/x86/i386/IO-APIC.rst>,
-	  <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at
+	  <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO available at
 	  <http://tldp.org/HOWTO/SMP-HOWTO.html>.
 
 	  If you don't know what to do here, say N.
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 42875ff15671..6d732e451071 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -277,7 +277,7 @@ config SMP
 	  machines, but will use only one CPU of a multiprocessor machine.
 	  On a uniprocessor machine, the kernel will run faster if you say N.
 
-	  See also <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO
+	  See also <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO
 	  available at <http://www.tldp.org/docs.html#howto>.
 
 	  If you don't know what to do here, say N.
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index c2858ac6a46a..6b1b5941b618 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -679,7 +679,7 @@ config SMP
 	  People using multiprocessor machines who say Y here should also say
 	  Y to "Enhanced Real Time Clock Support", below.
 
-	  See also <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO
+	  See also <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO
 	  available at <http://www.tldp.org/docs.html#howto>.
 
 	  If you don't know what to do here, say N.
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e9f5d62e9817..7926a2e11bdc 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -180,7 +180,7 @@ config SMP
 	  Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
 	  Management" code will be disabled if you say Y here.
 
-	  See also <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO
+	  See also <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO
 	  available at <http://www.tldp.org/docs.html#howto>.
 
 	  If you don't know what to do here, say N.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9505066b7ba3..9e95af666b33 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -402,7 +402,7 @@ config SMP
 	  Management" code will be disabled if you say Y here.
 
 	  See also <file:Documentation/x86/i386/IO-APIC.rst>,
-	  <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at
+	  <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO available at
 	  <http://www.tldp.org/docs.html#howto>.
 
 	  If you don't know what to do here, say N.
@@ -1959,7 +1959,7 @@ config EFI_STUB
           This kernel feature allows a bzImage to be loaded directly
 	  by EFI firmware without the use of a bootloader.
 
-	  See Documentation/efi-stub.txt for more information.
+	  See Documentation/admin-guide/efi-stub.rst for more information.
 
 config EFI_MIXED
 	bool "EFI mixed-mode support"
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
index 37b9710cc80a..702689a628f0 100644
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -194,7 +194,7 @@ config LDM_PARTITION
 	  Normal partitions are now called Basic Disks under Windows 2000, XP,
 	  and Vista.
 
-	  For a fuller description read <file:Documentation/ldm.txt>.
+	  For a fuller description read <file:Documentation/admin-guide/ldm.rst>.
 
 	  If unsure, say N.
 
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 442403abd73a..3e866885a405 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -291,7 +291,7 @@ config RTC
 	  and set the RTC in an SMP compatible fashion.
 
 	  If you think you have a use for such a device (such as periodic data
-	  sampling), then say Y here, and read <file:Documentation/rtc.txt>
+	  sampling), then say Y here, and read <file:Documentation/admin-guide/rtc.rst>
 	  for details.
 
 	  To compile this driver as a module, choose M here: the
@@ -313,7 +313,7 @@ config JS_RTC
 	  /dev/rtc.
 
 	  If you think you have a use for such a device (such as periodic data
-	  sampling), then say Y here, and read <file:Documentation/rtc.txt>
+	  sampling), then say Y here, and read <file:Documentation/admin-guide/rtc.rst>
 	  for details.
 
 	  To compile this driver as a module, choose M here: the
diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 95be7228f327..9044d31ab1a1 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -4,7 +4,7 @@
  * Copyright 2006 Michael Buesch <m@bues.ch>
  * Copyright 2005 (c) MontaVista Software, Inc.
  *
- * Please read Documentation/hw_random.txt for details on use.
+ * Please read Documentation/admin-guide/hw_random.rst for details on use.
  *
  * This software may be used and distributed according to the terms
  * of the GNU General Public License, incorporated herein by reference.
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index c0b93e0ff0c0..8e6dd908da21 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -1,7 +1,7 @@
 /*
 	Hardware Random Number Generator
 
-	Please read Documentation/hw_random.txt for details on use.
+	Please read Documentation/admin-guide/hw_random.rst for details on use.
 
 	----------------------------------------------------------
 	This software may be used and distributed according to the terms
-- 
cgit v1.2.3


From 65388dad1bbb51a4eb6cc91b9fa865b57646fb67 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Thu, 27 Jun 2019 16:31:35 -0300
Subject: docs: serial: move it to the driver-api

The contents of this directory is mostly driver-api stuff.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/driver-api/index.rst                 |   1 +
 Documentation/driver-api/serial/cyclades_z.rst     |  11 +
 Documentation/driver-api/serial/driver.rst         | 549 ++++++++++++++++++
 Documentation/driver-api/serial/index.rst          |  32 ++
 Documentation/driver-api/serial/moxa-smartio.rst   | 615 +++++++++++++++++++++
 Documentation/driver-api/serial/n_gsm.rst          | 103 ++++
 Documentation/driver-api/serial/rocket.rst         | 185 +++++++
 Documentation/driver-api/serial/serial-iso7816.rst |  90 +++
 Documentation/driver-api/serial/serial-rs485.rst   | 103 ++++
 Documentation/driver-api/serial/tty.rst            | 328 +++++++++++
 Documentation/serial/cyclades_z.rst                |  11 -
 Documentation/serial/driver.rst                    | 549 ------------------
 Documentation/serial/index.rst                     |  32 --
 Documentation/serial/moxa-smartio.rst              | 615 ---------------------
 Documentation/serial/n_gsm.rst                     | 103 ----
 Documentation/serial/rocket.rst                    | 185 -------
 Documentation/serial/serial-iso7816.rst            |  90 ---
 Documentation/serial/serial-rs485.rst              | 103 ----
 Documentation/serial/tty.rst                       | 328 -----------
 MAINTAINERS                                        |   6 +-
 drivers/tty/Kconfig                                |   4 +-
 drivers/tty/serial/ucc_uart.c                      |   2 +-
 include/linux/serial_core.h                        |   2 +-
 23 files changed, 2024 insertions(+), 2023 deletions(-)
 create mode 100644 Documentation/driver-api/serial/cyclades_z.rst
 create mode 100644 Documentation/driver-api/serial/driver.rst
 create mode 100644 Documentation/driver-api/serial/index.rst
 create mode 100644 Documentation/driver-api/serial/moxa-smartio.rst
 create mode 100644 Documentation/driver-api/serial/n_gsm.rst
 create mode 100644 Documentation/driver-api/serial/rocket.rst
 create mode 100644 Documentation/driver-api/serial/serial-iso7816.rst
 create mode 100644 Documentation/driver-api/serial/serial-rs485.rst
 create mode 100644 Documentation/driver-api/serial/tty.rst
 delete mode 100644 Documentation/serial/cyclades_z.rst
 delete mode 100644 Documentation/serial/driver.rst
 delete mode 100644 Documentation/serial/index.rst
 delete mode 100644 Documentation/serial/moxa-smartio.rst
 delete mode 100644 Documentation/serial/n_gsm.rst
 delete mode 100644 Documentation/serial/rocket.rst
 delete mode 100644 Documentation/serial/serial-iso7816.rst
 delete mode 100644 Documentation/serial/serial-rs485.rst
 delete mode 100644 Documentation/serial/tty.rst

(limited to 'include')

diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 1dde9692075c..cf39b8f9d0f9 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -88,6 +88,7 @@ available subsections can be seen below.
    pti_intel_mid
    pwm
    rfkill
+   serial/index
    sgi-ioc4
    sm501
    smsc_ece1099
diff --git a/Documentation/driver-api/serial/cyclades_z.rst b/Documentation/driver-api/serial/cyclades_z.rst
new file mode 100644
index 000000000000..532ff67e2f1c
--- /dev/null
+++ b/Documentation/driver-api/serial/cyclades_z.rst
@@ -0,0 +1,11 @@
+================
+Cyclades-Z notes
+================
+
+The Cyclades-Z must have firmware loaded onto the card before it will
+operate.  This operation should be performed during system startup,
+
+The firmware, loader program and the latest device driver code are
+available from Cyclades at
+
+    ftp://ftp.cyclades.com/pub/cyclades/cyclades-z/linux/
diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst
new file mode 100644
index 000000000000..31bd4e16fb1f
--- /dev/null
+++ b/Documentation/driver-api/serial/driver.rst
@@ -0,0 +1,549 @@
+====================
+Low Level Serial API
+====================
+
+
+This document is meant as a brief overview of some aspects of the new serial
+driver.  It is not complete, any questions you have should be directed to
+<rmk@arm.linux.org.uk>
+
+The reference implementation is contained within amba-pl011.c.
+
+
+
+Low Level Serial Hardware Driver
+--------------------------------
+
+The low level serial hardware driver is responsible for supplying port
+information (defined by uart_port) and a set of control methods (defined
+by uart_ops) to the core serial driver.  The low level driver is also
+responsible for handling interrupts for the port, and providing any
+console support.
+
+
+Console Support
+---------------
+
+The serial core provides a few helper functions.  This includes identifing
+the correct port structure (via uart_get_console) and decoding command line
+arguments (uart_parse_options).
+
+There is also a helper function (uart_console_write) which performs a
+character by character write, translating newlines to CRLF sequences.
+Driver writers are recommended to use this function rather than implementing
+their own version.
+
+
+Locking
+-------
+
+It is the responsibility of the low level hardware driver to perform the
+necessary locking using port->lock.  There are some exceptions (which
+are described in the uart_ops listing below.)
+
+There are two locks.  A per-port spinlock, and an overall semaphore.
+
+From the core driver perspective, the port->lock locks the following
+data::
+
+	port->mctrl
+	port->icount
+	port->state->xmit.head (circ_buf->head)
+	port->state->xmit.tail (circ_buf->tail)
+
+The low level driver is free to use this lock to provide any additional
+locking.
+
+The port_sem semaphore is used to protect against ports being added/
+removed or reconfigured at inappropriate times. Since v2.6.27, this
+semaphore has been the 'mutex' member of the tty_port struct, and
+commonly referred to as the port mutex.
+
+
+uart_ops
+--------
+
+The uart_ops structure is the main interface between serial_core and the
+hardware specific driver.  It contains all the methods to control the
+hardware.
+
+  tx_empty(port)
+	This function tests whether the transmitter fifo and shifter
+	for the port described by 'port' is empty.  If it is empty,
+	this function should return TIOCSER_TEMT, otherwise return 0.
+	If the port does not support this operation, then it should
+	return TIOCSER_TEMT.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+  set_mctrl(port, mctrl)
+	This function sets the modem control lines for port described
+	by 'port' to the state described by mctrl.  The relevant bits
+	of mctrl are:
+
+		- TIOCM_RTS	RTS signal.
+		- TIOCM_DTR	DTR signal.
+		- TIOCM_OUT1	OUT1 signal.
+		- TIOCM_OUT2	OUT2 signal.
+		- TIOCM_LOOP	Set the port into loopback mode.
+
+	If the appropriate bit is set, the signal should be driven
+	active.  If the bit is clear, the signal should be driven
+	inactive.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  get_mctrl(port)
+	Returns the current state of modem control inputs.  The state
+	of the outputs should not be returned, since the core keeps
+	track of their state.  The state information should include:
+
+		- TIOCM_CAR	state of DCD signal
+		- TIOCM_CTS	state of CTS signal
+		- TIOCM_DSR	state of DSR signal
+		- TIOCM_RI	state of RI signal
+
+	The bit is set if the signal is currently driven active.  If
+	the port does not support CTS, DCD or DSR, the driver should
+	indicate that the signal is permanently active.  If RI is
+	not available, the signal should not be indicated as active.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  stop_tx(port)
+	Stop transmitting characters.  This might be due to the CTS
+	line becoming inactive or the tty layer indicating we want
+	to stop transmission due to an XOFF character.
+
+	The driver should stop transmitting characters as soon as
+	possible.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  start_tx(port)
+	Start transmitting characters.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  throttle(port)
+	Notify the serial driver that input buffers for the line discipline are
+	close to full, and it should somehow signal that no more characters
+	should be sent to the serial port.
+	This will be called only if hardware assisted flow control is enabled.
+
+	Locking: serialized with .unthrottle() and termios modification by the
+	tty layer.
+
+  unthrottle(port)
+	Notify the serial driver that characters can now be sent to the serial
+	port without fear of overrunning the input buffers of the line
+	disciplines.
+
+	This will be called only if hardware assisted flow control is enabled.
+
+	Locking: serialized with .throttle() and termios modification by the
+	tty layer.
+
+  send_xchar(port,ch)
+	Transmit a high priority character, even if the port is stopped.
+	This is used to implement XON/XOFF flow control and tcflow().  If
+	the serial driver does not implement this function, the tty core
+	will append the character to the circular buffer and then call
+	start_tx() / stop_tx() to flush the data out.
+
+	Do not transmit if ch == '\0' (__DISABLED_CHAR).
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  stop_rx(port)
+	Stop receiving characters; the port is in the process of
+	being closed.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  enable_ms(port)
+	Enable the modem status interrupts.
+
+	This method may be called multiple times.  Modem status
+	interrupts should be disabled when the shutdown method is
+	called.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  break_ctl(port,ctl)
+	Control the transmission of a break signal.  If ctl is
+	nonzero, the break signal should be transmitted.  The signal
+	should be terminated when another call is made with a zero
+	ctl.
+
+	Locking: caller holds tty_port->mutex
+
+  startup(port)
+	Grab any interrupt resources and initialise any low level driver
+	state.  Enable the port for reception.  It should not activate
+	RTS nor DTR; this will be done via a separate call to set_mctrl.
+
+	This method will only be called when the port is initially opened.
+
+	Locking: port_sem taken.
+
+	Interrupts: globally disabled.
+
+  shutdown(port)
+	Disable the port, disable any break condition that may be in
+	effect, and free any interrupt resources.  It should not disable
+	RTS nor DTR; this will have already been done via a separate
+	call to set_mctrl.
+
+	Drivers must not access port->state once this call has completed.
+
+	This method will only be called when there are no more users of
+	this port.
+
+	Locking: port_sem taken.
+
+	Interrupts: caller dependent.
+
+  flush_buffer(port)
+	Flush any write buffers, reset any DMA state and stop any
+	ongoing DMA transfers.
+
+	This will be called whenever the port->state->xmit circular
+	buffer is cleared.
+
+	Locking: port->lock taken.
+
+	Interrupts: locally disabled.
+
+	This call must not sleep
+
+  set_termios(port,termios,oldtermios)
+	Change the port parameters, including word length, parity, stop
+	bits.  Update read_status_mask and ignore_status_mask to indicate
+	the types of events we are interested in receiving.  Relevant
+	termios->c_cflag bits are:
+
+		CSIZE
+			- word size
+		CSTOPB
+			- 2 stop bits
+		PARENB
+			- parity enable
+		PARODD
+			- odd parity (when PARENB is in force)
+		CREAD
+			- enable reception of characters (if not set,
+			  still receive characters from the port, but
+			  throw them away.
+		CRTSCTS
+			- if set, enable CTS status change reporting
+		CLOCAL
+			- if not set, enable modem status change
+			  reporting.
+
+	Relevant termios->c_iflag bits are:
+
+		INPCK
+			- enable frame and parity error events to be
+			  passed to the TTY layer.
+		BRKINT / PARMRK
+			- both of these enable break events to be
+			  passed to the TTY layer.
+
+		IGNPAR
+			- ignore parity and framing errors
+		IGNBRK
+			- ignore break errors,  If IGNPAR is also
+			  set, ignore overrun errors as well.
+
+	The interaction of the iflag bits is as follows (parity error
+	given as an example):
+
+	=============== ======= ======  =============================
+	Parity error	INPCK	IGNPAR
+	=============== ======= ======  =============================
+	n/a		0	n/a	character received, marked as
+					TTY_NORMAL
+	None		1	n/a	character received, marked as
+					TTY_NORMAL
+	Yes		1	0	character received, marked as
+					TTY_PARITY
+	Yes		1	1	character discarded
+	=============== ======= ======  =============================
+
+	Other flags may be used (eg, xon/xoff characters) if your
+	hardware supports hardware "soft" flow control.
+
+	Locking: caller holds tty_port->mutex
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+  set_ldisc(port,termios)
+	Notifier for discipline change. See Documentation/driver-api/serial/tty.rst.
+
+	Locking: caller holds tty_port->mutex
+
+  pm(port,state,oldstate)
+	Perform any power management related activities on the specified
+	port.  State indicates the new state (defined by
+	enum uart_pm_state), oldstate indicates the previous state.
+
+	This function should not be used to grab any resources.
+
+	This will be called when the port is initially opened and finally
+	closed, except when the port is also the system console.  This
+	will occur even if CONFIG_PM is not set.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  type(port)
+	Return a pointer to a string constant describing the specified
+	port, or return NULL, in which case the string 'unknown' is
+	substituted.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  release_port(port)
+	Release any memory and IO region resources currently in use by
+	the port.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  request_port(port)
+	Request any memory and IO region resources required by the port.
+	If any fail, no resources should be registered when this function
+	returns, and it should return -EBUSY on failure.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  config_port(port,type)
+	Perform any autoconfiguration steps required for the port.  `type`
+	contains a bit mask of the required configuration.  UART_CONFIG_TYPE
+	indicates that the port requires detection and identification.
+	port->type should be set to the type found, or PORT_UNKNOWN if
+	no port was detected.
+
+	UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
+	which should be probed using standard kernel autoprobing techniques.
+	This is not necessary on platforms where ports have interrupts
+	internally hard wired (eg, system on a chip implementations).
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  verify_port(port,serinfo)
+	Verify the new serial port information contained within serinfo is
+	suitable for this port type.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  ioctl(port,cmd,arg)
+	Perform any port specific IOCTLs.  IOCTL commands must be defined
+	using the standard numbering system found in <asm/ioctl.h>
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+  poll_init(port)
+	Called by kgdb to perform the minimal hardware initialization needed
+	to support poll_put_char() and poll_get_char().  Unlike ->startup()
+	this should not request interrupts.
+
+	Locking: tty_mutex and tty_port->mutex taken.
+
+	Interrupts: n/a.
+
+  poll_put_char(port,ch)
+	Called by kgdb to write a single character directly to the serial
+	port.  It can and should block until there is space in the TX FIFO.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+  poll_get_char(port)
+	Called by kgdb to read a single character directly from the serial
+	port.  If data is available, it should be returned; otherwise
+	the function should return NO_POLL_CHAR immediately.
+
+	Locking: none.
+
+	Interrupts: caller dependent.
+
+	This call must not sleep
+
+Other functions
+---------------
+
+uart_update_timeout(port,cflag,baud)
+	Update the FIFO drain timeout, port->timeout, according to the
+	number of bits, parity, stop bits and baud rate.
+
+	Locking: caller is expected to take port->lock
+
+	Interrupts: n/a
+
+uart_get_baud_rate(port,termios,old,min,max)
+	Return the numeric baud rate for the specified termios, taking
+	account of the special 38400 baud "kludge".  The B0 baud rate
+	is mapped to 9600 baud.
+
+	If the baud rate is not within min..max, then if old is non-NULL,
+	the original baud rate will be tried.  If that exceeds the
+	min..max constraint, 9600 baud will be returned.  termios will
+	be updated to the baud rate in use.
+
+	Note: min..max must always allow 9600 baud to be selected.
+
+	Locking: caller dependent.
+
+	Interrupts: n/a
+
+uart_get_divisor(port,baud)
+	Return the divisor (baud_base / baud) for the specified baud
+	rate, appropriately rounded.
+
+	If 38400 baud and custom divisor is selected, return the
+	custom divisor instead.
+
+	Locking: caller dependent.
+
+	Interrupts: n/a
+
+uart_match_port(port1,port2)
+	This utility function can be used to determine whether two
+	uart_port structures describe the same port.
+
+	Locking: n/a
+
+	Interrupts: n/a
+
+uart_write_wakeup(port)
+	A driver is expected to call this function when the number of
+	characters in the transmit buffer have dropped below a threshold.
+
+	Locking: port->lock should be held.
+
+	Interrupts: n/a
+
+uart_register_driver(drv)
+	Register a uart driver with the core driver.  We in turn register
+	with the tty layer, and initialise the core driver per-port state.
+
+	drv->port should be NULL, and the per-port structures should be
+	registered using uart_add_one_port after this call has succeeded.
+
+	Locking: none
+
+	Interrupts: enabled
+
+uart_unregister_driver()
+	Remove all references to a driver from the core driver.  The low
+	level driver must have removed all its ports via the
+	uart_remove_one_port() if it registered them with uart_add_one_port().
+
+	Locking: none
+
+	Interrupts: enabled
+
+**uart_suspend_port()**
+
+**uart_resume_port()**
+
+**uart_add_one_port()**
+
+**uart_remove_one_port()**
+
+Other notes
+-----------
+
+It is intended some day to drop the 'unused' entries from uart_port, and
+allow low level drivers to register their own individual uart_port's with
+the core.  This will allow drivers to use uart_port as a pointer to a
+structure containing both the uart_port entry with their own extensions,
+thus::
+
+	struct my_port {
+		struct uart_port	port;
+		int			my_stuff;
+	};
+
+Modem control lines via GPIO
+----------------------------
+
+Some helpers are provided in order to set/get modem control lines via GPIO.
+
+mctrl_gpio_init(port, idx):
+	This will get the {cts,rts,...}-gpios from device tree if they are
+	present and request them, set direction etc, and return an
+	allocated structure. `devm_*` functions are used, so there's no need
+	to call mctrl_gpio_free().
+	As this sets up the irq handling make sure to not handle changes to the
+	gpio input lines in your driver, too.
+
+mctrl_gpio_free(dev, gpios):
+	This will free the requested gpios in mctrl_gpio_init().
+	As `devm_*` functions are used, there's generally no need to call
+	this function.
+
+mctrl_gpio_to_gpiod(gpios, gidx)
+	This returns the gpio_desc structure associated to the modem line
+	index.
+
+mctrl_gpio_set(gpios, mctrl):
+	This will sets the gpios according to the mctrl state.
+
+mctrl_gpio_get(gpios, mctrl):
+	This will update mctrl with the gpios values.
+
+mctrl_gpio_enable_ms(gpios):
+	Enables irqs and handling of changes to the ms lines.
+
+mctrl_gpio_disable_ms(gpios):
+	Disables irqs and handling of changes to the ms lines.
diff --git a/Documentation/driver-api/serial/index.rst b/Documentation/driver-api/serial/index.rst
new file mode 100644
index 000000000000..33ad10d05b26
--- /dev/null
+++ b/Documentation/driver-api/serial/index.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+Support for Serial devices
+==========================
+
+.. toctree::
+    :maxdepth: 1
+
+
+    driver
+    tty
+
+Serial drivers
+==============
+
+.. toctree::
+    :maxdepth: 1
+
+    cyclades_z
+    moxa-smartio
+    n_gsm
+    rocket
+    serial-iso7816
+    serial-rs485
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/driver-api/serial/moxa-smartio.rst b/Documentation/driver-api/serial/moxa-smartio.rst
new file mode 100644
index 000000000000..156100f17c3f
--- /dev/null
+++ b/Documentation/driver-api/serial/moxa-smartio.rst
@@ -0,0 +1,615 @@
+=============================================================
+MOXA Smartio/Industio Family Device Driver Installation Guide
+=============================================================
+
+.. note::
+
+   This file is outdated. It needs some care in order to make it
+   updated to Kernel 5.0 and upper
+
+Copyright (C) 2008, Moxa Inc.
+
+Date: 01/21/2008
+
+.. Content
+
+   1. Introduction
+   2. System Requirement
+   3. Installation
+      3.1 Hardware installation
+      3.2 Driver files
+      3.3 Device naming convention
+      3.4 Module driver configuration
+      3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x.
+      3.6 Custom configuration
+      3.7 Verify driver installation
+   4. Utilities
+   5. Setserial
+   6. Troubleshooting
+
+1. Introduction
+^^^^^^^^^^^^^^^
+
+   The Smartio/Industio/UPCI family Linux driver supports following multiport
+   boards.
+
+    - 2 ports multiport board
+	CP-102U, CP-102UL, CP-102UF
+	CP-132U-I, CP-132UL,
+	CP-132, CP-132I, CP132S, CP-132IS,
+	CI-132, CI-132I, CI-132IS,
+	(C102H, C102HI, C102HIS, C102P, CP-102, CP-102S)
+
+    - 4 ports multiport board
+	CP-104EL,
+	CP-104UL, CP-104JU,
+	CP-134U, CP-134U-I,
+	C104H/PCI, C104HS/PCI,
+	CP-114, CP-114I, CP-114S, CP-114IS, CP-114UL,
+	C104H, C104HS,
+	CI-104J, CI-104JS,
+	CI-134, CI-134I, CI-134IS,
+	(C114HI, CT-114I, C104P),
+	POS-104UL,
+	CB-114,
+	CB-134I
+
+    - 8 ports multiport board
+	CP-118EL, CP-168EL,
+	CP-118U, CP-168U,
+	C168H/PCI,
+	C168H, C168HS,
+	(C168P),
+	CB-108
+
+   This driver and installation procedure have been developed upon Linux Kernel
+   2.4.x and 2.6.x. This driver supports Intel x86 hardware platform. In order
+   to maintain compatibility, this version has also been properly tested with
+   RedHat, Mandrake, Fedora and S.u.S.E Linux. However, if compatibility problem
+   occurs, please contact Moxa at support@moxa.com.tw.
+
+   In addition to device driver, useful utilities are also provided in this
+   version. They are:
+
+    - msdiag
+		 Diagnostic program for displaying installed Moxa
+                 Smartio/Industio boards.
+    - msmon
+		 Monitor program to observe data count and line status signals.
+    - msterm     A simple terminal program which is useful in testing serial
+	         ports.
+    - io-irq.exe
+		 Configuration program to setup ISA boards. Please note that
+                 this program can only be executed under DOS.
+
+   All the drivers and utilities are published in form of source code under
+   GNU General Public License in this version. Please refer to GNU General
+   Public License announcement in each source code file for more detail.
+
+   In Moxa's Web sites, you may always find latest driver at http://www.moxa.com/.
+
+   This version of driver can be installed as Loadable Module (Module driver)
+   or built-in into kernel (Static driver). You may refer to following
+   installation procedure for suitable one. Before you install the driver,
+   please refer to hardware installation procedure in the User's Manual.
+
+   We assume the user should be familiar with following documents.
+
+   - Serial-HOWTO
+   - Kernel-HOWTO
+
+2. System Requirement
+^^^^^^^^^^^^^^^^^^^^^
+
+   - Hardware platform: Intel x86 machine
+   - Kernel version: 2.4.x or 2.6.x
+   - gcc version 2.72 or later
+   - Maximum 4 boards can be installed in combination
+
+3. Installation
+^^^^^^^^^^^^^^^
+
+3.1 Hardware installation
+=========================
+
+   There are two types of buses, ISA and PCI, for Smartio/Industio
+   family multiport board.
+
+ISA board
+---------
+
+   You'll have to configure CAP address, I/O address, Interrupt Vector
+   as well as IRQ before installing this driver. Please refer to hardware
+   installation procedure in User's Manual before proceed any further.
+   Please make sure the JP1 is open after the ISA board is set properly.
+
+PCI/UPCI board
+--------------
+
+   You may need to adjust IRQ usage in BIOS to avoid from IRQ conflict
+   with other ISA devices. Please refer to hardware installation
+   procedure in User's Manual in advance.
+
+PCI IRQ Sharing
+---------------
+
+   Each port within the same multiport board shares the same IRQ. Up to
+   4 Moxa Smartio/Industio PCI Family multiport boards can be installed
+   together on one system and they can share the same IRQ.
+
+
+3.2 Driver files
+================
+
+   The driver file may be obtained from ftp, CD-ROM or floppy disk. The
+   first step, anyway, is to copy driver file "mxser.tgz" into specified
+   directory. e.g. /moxa. The execute commands as below::
+
+       # cd /
+       # mkdir moxa
+       # cd /moxa
+       # tar xvf /dev/fd0
+
+or::
+
+       # cd /
+       # mkdir moxa
+       # cd /moxa
+       # cp /mnt/cdrom/<driver directory>/mxser.tgz .
+       # tar xvfz mxser.tgz
+
+
+3.3 Device naming convention
+============================
+
+   You may find all the driver and utilities files in /moxa/mxser.
+   Following installation procedure depends on the model you'd like to
+   run the driver. If you prefer module driver, please refer to 3.4.
+   If static driver is required, please refer to 3.5.
+
+Dialin and callout port
+-----------------------
+
+   This driver remains traditional serial device properties. There are
+   two special file name for each serial port. One is dial-in port
+   which is named "ttyMxx". For callout port, the naming convention
+   is "cumxx".
+
+Device naming when more than 2 boards installed
+-----------------------------------------------
+
+   Naming convention for each Smartio/Industio multiport board is
+   pre-defined as below.
+
+   ============ ===============       ==============
+   Board Num.	 Dial-in Port	      Callout port
+   1st board	ttyM0  - ttyM7	      cum0  - cum7
+   2nd board	ttyM8  - ttyM15       cum8  - cum15
+   3rd board	ttyM16 - ttyM23       cum16 - cum23
+   4th board	ttyM24 - ttym31       cum24 - cum31
+   ============ ===============       ==============
+
+.. note::
+
+   Under Kernel 2.6 and upper, the cum Device is Obsolete. So use ttyM*
+   device instead.
+
+Board sequence
+--------------
+
+   This driver will activate ISA boards according to the parameter set
+   in the driver. After all specified ISA board activated, PCI board
+   will be installed in the system automatically driven.
+   Therefore the board number is sorted by the CAP address of ISA boards.
+   For PCI boards, their sequence will be after ISA boards and C168H/PCI
+   has higher priority than C104H/PCI boards.
+
+3.4 Module driver configuration
+===============================
+
+   Module driver is easiest way to install. If you prefer static driver
+   installation, please skip this paragraph.
+
+
+   ------------- Prepare to use the MOXA driver --------------------
+
+3.4.1 Create tty device with correct major number
+-------------------------------------------------
+
+   Before using MOXA driver, your system must have the tty devices
+   which are created with driver's major number. We offer one shell
+   script "msmknod" to simplify the procedure.
+   This step is only needed to be executed once. But you still
+   need to do this procedure when:
+
+   a. You change the driver's major number. Please refer the "3.7"
+      section.
+   b. Your total installed MOXA boards number is changed. Maybe you
+      add/delete one MOXA board.
+   c. You want to change the tty name. This needs to modify the
+      shell script "msmknod"
+
+   The procedure is::
+
+	 # cd /moxa/mxser/driver
+	 # ./msmknod
+
+   This shell script will require the major number for dial-in
+   device and callout device to create tty device. You also need
+   to specify the total installed MOXA board number. Default major
+   numbers for dial-in device and callout device are 30, 35. If
+   you need to change to other number, please refer section "3.7"
+   for more detailed procedure.
+   Msmknod will delete any special files occupying the same device
+   naming.
+
+3.4.2 Build the MOXA driver and utilities
+-----------------------------------------
+
+   Before using the MOXA driver and utilities, you need compile the
+   all the source code. This step is only need to be executed once.
+   But you still re-compile the source code if you modify the source
+   code. For example, if you change the driver's major number (see
+   "3.7" section), then you need to do this step again.
+
+   Find "Makefile" in /moxa/mxser, then run
+
+	 # make clean; make install
+
+   ..note::
+
+	 For Red Hat 9, Red Hat Enterprise Linux AS3/ES3/WS3 & Fedora Core1:
+	 # make clean; make installsp1
+
+	 For Red Hat Enterprise Linux AS4/ES4/WS4:
+	 # make clean; make installsp2
+
+   The driver files "mxser.o" and utilities will be properly compiled
+   and copied to system directories respectively.
+
+------------- Load MOXA driver--------------------
+
+3.4.3 Load the MOXA driver
+--------------------------
+
+   ::
+
+	 # modprobe mxser <argument>
+
+   will activate the module driver. You may run "lsmod" to check
+   if "mxser" is activated. If the MOXA board is ISA board, the
+   <argument> is needed. Please refer to section "3.4.5" for more
+   information.
+
+------------- Load MOXA driver on boot --------------------
+
+3.4.4 Load the mxser driver
+---------------------------
+
+
+   For the above description, you may manually execute
+   "modprobe mxser" to activate this driver and run
+   "rmmod mxser" to remove it.
+
+   However, it's better to have a boot time configuration to
+   eliminate manual operation. Boot time configuration can be
+   achieved by rc file. We offer one "rc.mxser" file to simplify
+   the procedure under "moxa/mxser/driver".
+
+   But if you use ISA board, please modify the "modprobe ..." command
+   to add the argument (see "3.4.5" section). After modifying the
+   rc.mxser, please try to execute "/moxa/mxser/driver/rc.mxser"
+   manually to make sure the modification is ok. If any error
+   encountered, please try to modify again. If the modification is
+   completed, follow the below step.
+
+   Run following command for setting rc files::
+
+	 # cd /moxa/mxser/driver
+	 # cp ./rc.mxser /etc/rc.d
+	 # cd /etc/rc.d
+
+   Check "rc.serial" is existed or not. If "rc.serial" doesn't exist,
+   create it by vi, run "chmod 755 rc.serial" to change the permission.
+
+   Add "/etc/rc.d/rc.mxser" in last line.
+
+   Reboot and check if moxa.o activated by "lsmod" command.
+
+3.4.5. specify CAP address
+--------------------------
+
+   If you'd like to drive Smartio/Industio ISA boards in the system,
+   you'll have to add parameter to specify CAP address of given
+   board while activating "mxser.o". The format for parameters are
+   as follows.::
+
+	   modprobe mxser ioaddr=0x???,0x???,0x???,0x???
+				  |  |  |    |
+				  |  |  |    +- 4th ISA board
+				  |  |  +------ 3rd ISA board
+				  |  +------------ 2nd ISA board
+				  +-------------------1st ISA board
+
+3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x
+================================================================
+
+    Note:
+          To use static driver, you must install the linux kernel
+          source package.
+
+3.5.1 Backup the built-in driver in the kernel
+----------------------------------------------
+
+    ::
+
+       # cd /usr/src/linux/drivers/char
+       # mv mxser.c mxser.c.old
+
+       For Red Hat 7.x user, you need to create link:
+       # cd /usr/src
+       # ln -s linux-2.4 linux
+
+3.5.2 Create link
+-----------------
+    ::
+
+	  # cd /usr/src/linux/drivers/char
+	  # ln -s /moxa/mxser/driver/mxser.c mxser.c
+
+3.5.3 Add CAP address list for ISA boards.
+------------------------------------------
+
+    For PCI boards user, please skip this step.
+
+    In module mode, the CAP address for ISA board is given by
+    parameter. In static driver configuration, you'll have to
+    assign it within driver's source code. If you will not
+    install any ISA boards, you may skip to next portion.
+    The instructions to modify driver source code are as
+    below.
+
+    a. run::
+
+	# cd /moxa/mxser/driver
+	# vi mxser.c
+
+    b. Find the array mxserBoardCAP[] as below::
+
+	  static int mxserBoardCAP[] = {0x00, 0x00, 0x00, 0x00};
+
+    c. Change the address within this array using vi. For
+       example, to driver 2 ISA boards with CAP address
+       0x280 and 0x180 as 1st and 2nd board. Just to change
+       the source code as follows::
+
+	  static int mxserBoardCAP[] = {0x280, 0x180, 0x00, 0x00};
+
+3.5.4 Setup kernel configuration
+--------------------------------
+
+    Configure the kernel::
+
+      # cd /usr/src/linux
+      # make menuconfig
+
+    You will go into a menu-driven system. Please select [Character
+    devices][Non-standard serial port support], enable the [Moxa
+    SmartIO support] driver with "[*]" for built-in (not "[M]"), then
+    select [Exit] to exit this program.
+
+3.5.5 Rebuild kernel
+--------------------
+
+    The following are for Linux kernel rebuilding, for your
+    reference only.
+
+    For appropriate details, please refer to the Linux document:
+
+        a. Run the following commands::
+
+	     cd /usr/src/linux
+	     make clean		     # take a few minutes
+	     make dep		     # take a few minutes
+	     make bzImage	     # take probably 10-20 minutes
+	     make install	     # copy boot image to correct position
+
+	f. Please make sure the boot kernel (vmlinuz) is in the
+	   correct position.
+	g. If you use 'lilo' utility, you should check /etc/lilo.conf
+	   'image' item specified the path which is the 'vmlinuz' path,
+	   or you will load wrong (or old) boot kernel image (vmlinuz).
+	   After checking /etc/lilo.conf, please run "lilo".
+
+	  Note that if the result of "make bzImage" is ERROR, then you have to
+	  go back to Linux configuration Setup. Type "make menuconfig" in
+          directory /usr/src/linux.
+
+
+3.5.6 Make tty device and special file
+--------------------------------------
+
+    ::
+       # cd /moxa/mxser/driver
+       # ./msmknod
+
+3.5.7 Make utility
+------------------
+
+    ::
+
+	  # cd /moxa/mxser/utility
+	  # make clean; make install
+
+3.5.8 Reboot
+------------
+
+
+
+3.6 Custom configuration
+========================
+
+    Although this driver already provides you default configuration, you
+    still can change the device name and major number. The instruction to
+    change these parameters are shown as below.
+
+a. Change Device name
+
+    If you'd like to use other device names instead of default naming
+    convention, all you have to do is to modify the internal code
+    within the shell script "msmknod". First, you have to open "msmknod"
+    by vi. Locate each line contains "ttyM" and "cum" and change them
+    to the device name you desired. "msmknod" creates the device names
+    you need next time executed.
+
+b. Change Major number
+
+    If major number 30 and 35 had been occupied, you may have to select
+    2 free major numbers for this driver. There are 3 steps to change
+    major numbers.
+
+3.6.1 Find free major numbers
+-----------------------------
+
+    In /proc/devices, you may find all the major numbers occupied
+    in the system. Please select 2 major numbers that are available.
+    e.g. 40, 45.
+
+3.6.2 Create special files
+--------------------------
+
+   Run /moxa/mxser/driver/msmknod to create special files with
+   specified major numbers.
+
+3.6.3 Modify driver with new major number
+-----------------------------------------
+
+   Run vi to open /moxa/mxser/driver/mxser.c. Locate the line
+   contains "MXSERMAJOR". Change the content as below::
+
+	  #define	  MXSERMAJOR		  40
+	  #define	  MXSERCUMAJOR		  45
+
+    3.6.4 Run "make clean; make install" in /moxa/mxser/driver.
+
+3.7 Verify driver installation
+==============================
+
+    You may refer to /var/log/messages to check the latest status
+    log reported by this driver whenever it's activated.
+
+4. Utilities
+^^^^^^^^^^^^
+
+   There are 3 utilities contained in this driver. They are msdiag, msmon and
+   msterm. These 3 utilities are released in form of source code. They should
+   be compiled into executable file and copied into /usr/bin.
+
+   Before using these utilities, please load driver (refer 3.4 & 3.5) and
+   make sure you had run the "msmknod" utility.
+
+msdiag - Diagnostic
+===================
+
+   This utility provides the function to display what Moxa Smartio/Industio
+   board found by driver in the system.
+
+msmon - Port Monitoring
+=======================
+
+   This utility gives the user a quick view about all the MOXA ports'
+   activities. One can easily learn each port's total received/transmitted
+   (Rx/Tx) character count since the time when the monitoring is started.
+
+   Rx/Tx throughputs per second are also reported in interval basis (e.g.
+   the last 5 seconds) and in average basis (since the time the monitoring
+   is started). You can reset all ports' count by <HOME> key. <+> <->
+   (plus/minus) keys to change the displaying time interval. Press <ENTER>
+   on the port, that cursor stay, to view the port's communication
+   parameters, signal status, and input/output queue.
+
+msterm - Terminal Emulation
+===========================
+
+   This utility provides data sending and receiving ability of all tty ports,
+   especially for MOXA ports. It is quite useful for testing simple
+   application, for example, sending AT command to a modem connected to the
+   port or used as a terminal for login purpose. Note that this is only a
+   dumb terminal emulation without handling full screen operation.
+
+5. Setserial
+^^^^^^^^^^^^
+
+   Supported Setserial parameters are listed as below.
+
+   ============== =========================================================
+   uart		  set UART type(16450-->disable FIFO, 16550A-->enable FIFO)
+   close_delay	  set the amount of time(in 1/100 of a second) that DTR
+		  should be kept low while being closed.
+   closing_wait   set the amount of time(in 1/100 of a second) that the
+		  serial port should wait for data to be drained while
+		  being closed, before the receiver is disable.
+   spd_hi	  Use  57.6kb  when  the application requests 38.4kb.
+   spd_vhi	  Use  115.2kb	when  the application requests 38.4kb.
+   spd_shi	  Use  230.4kb	when  the application requests 38.4kb.
+   spd_warp	  Use  460.8kb	when  the application requests 38.4kb.
+   spd_normal	  Use  38.4kb  when  the application requests 38.4kb.
+   spd_cust	  Use  the custom divisor to set the speed when  the
+		  application requests 38.4kb.
+   divisor	  This option set the custom division.
+   baud_base	  This option set the base baud rate.
+   ============== =========================================================
+
+6. Troubleshooting
+^^^^^^^^^^^^^^^^^^
+
+   The boot time error messages and solutions are stated as clearly as
+   possible. If all the possible solutions fail, please contact our technical
+   support team to get more help.
+
+
+   Error msg:
+	      More than 4 Moxa Smartio/Industio family boards found. Fifth board
+              and after are ignored.
+
+   Solution:
+   To avoid this problem, please unplug fifth and after board, because Moxa
+   driver supports up to 4 boards.
+
+   Error msg:
+	      Request_irq fail, IRQ(?) may be conflict with another device.
+
+   Solution:
+   Other PCI or ISA devices occupy the assigned IRQ. If you are not sure
+   which device causes the situation, please check /proc/interrupts to find
+   free IRQ and simply change another free IRQ for Moxa board.
+
+   Error msg:
+	      Board #: C1xx Series(CAP=xxx) interrupt number invalid.
+
+   Solution:
+   Each port within the same multiport board shares the same IRQ. Please set
+   one IRQ (IRQ doesn't equal to zero) for one Moxa board.
+
+   Error msg:
+	      No interrupt vector be set for Moxa ISA board(CAP=xxx).
+
+   Solution:
+   Moxa ISA board needs an interrupt vector.Please refer to user's manual
+   "Hardware Installation" chapter to set interrupt vector.
+
+   Error msg:
+              Couldn't install MOXA Smartio/Industio family driver!
+
+   Solution:
+   Load Moxa driver fail, the major number may conflict with other devices.
+   Please refer to previous section 3.7 to change a free major number for
+   Moxa driver.
+
+   Error msg:
+              Couldn't install MOXA Smartio/Industio family callout driver!
+
+   Solution:
+   Load Moxa callout driver fail, the callout device major number may
+   conflict with other devices. Please refer to previous section 3.7 to
+   change a free callout device major number for Moxa driver.
diff --git a/Documentation/driver-api/serial/n_gsm.rst b/Documentation/driver-api/serial/n_gsm.rst
new file mode 100644
index 000000000000..f3ad9fd26408
--- /dev/null
+++ b/Documentation/driver-api/serial/n_gsm.rst
@@ -0,0 +1,103 @@
+==============================
+GSM 0710 tty multiplexor HOWTO
+==============================
+
+This line discipline implements the GSM 07.10 multiplexing protocol
+detailed in the following 3GPP document:
+
+	http://www.3gpp.org/ftp/Specs/archive/07_series/07.10/0710-720.zip
+
+This document give some hints on how to use this driver with GPRS and 3G
+modems connected to a physical serial port.
+
+How to use it
+-------------
+1. initialize the modem in 0710 mux mode (usually AT+CMUX= command) through
+   its serial port. Depending on the modem used, you can pass more or less
+   parameters to this command,
+2. switch the serial line to using the n_gsm line discipline by using
+   TIOCSETD ioctl,
+3. configure the mux using GSMIOC_GETCONF / GSMIOC_SETCONF ioctl,
+
+Major parts of the initialization program :
+(a good starting point is util-linux-ng/sys-utils/ldattach.c)::
+
+  #include <linux/gsmmux.h>
+  #define N_GSM0710	21	/* GSM 0710 Mux */
+  #define DEFAULT_SPEED	B115200
+  #define SERIAL_PORT	/dev/ttyS0
+
+	int ldisc = N_GSM0710;
+	struct gsm_config c;
+	struct termios configuration;
+
+	/* open the serial port connected to the modem */
+	fd = open(SERIAL_PORT, O_RDWR | O_NOCTTY | O_NDELAY);
+
+	/* configure the serial port : speed, flow control ... */
+
+	/* send the AT commands to switch the modem to CMUX mode
+	   and check that it's successful (should return OK) */
+	write(fd, "AT+CMUX=0\r", 10);
+
+	/* experience showed that some modems need some time before
+	   being able to answer to the first MUX packet so a delay
+	   may be needed here in some case */
+	sleep(3);
+
+	/* use n_gsm line discipline */
+	ioctl(fd, TIOCSETD, &ldisc);
+
+	/* get n_gsm configuration */
+	ioctl(fd, GSMIOC_GETCONF, &c);
+	/* we are initiator and need encoding 0 (basic) */
+	c.initiator = 1;
+	c.encapsulation = 0;
+	/* our modem defaults to a maximum size of 127 bytes */
+	c.mru = 127;
+	c.mtu = 127;
+	/* set the new configuration */
+	ioctl(fd, GSMIOC_SETCONF, &c);
+
+	/* and wait for ever to keep the line discipline enabled */
+	daemon(0,0);
+	pause();
+
+4. create the devices corresponding to the "virtual" serial ports (take care,
+   each modem has its configuration and some DLC have dedicated functions,
+   for example GPS), starting with minor 1 (DLC0 is reserved for the management
+   of the mux)::
+
+     MAJOR=`cat /proc/devices |grep gsmtty | awk '{print $1}`
+     for i in `seq 1 4`; do
+	mknod /dev/ttygsm$i c $MAJOR $i
+     done
+
+5. use these devices as plain serial ports.
+
+   for example, it's possible:
+
+   - and to use gnokii to send / receive SMS on ttygsm1
+   - to use ppp to establish a datalink on ttygsm2
+
+6. first close all virtual ports before closing the physical port.
+
+   Note that after closing the physical port the modem is still in multiplexing
+   mode. This may prevent a successful re-opening of the port later. To avoid
+   this situation either reset the modem if your hardware allows that or send
+   a disconnect command frame manually before initializing the multiplexing mode
+   for the second time. The byte sequence for the disconnect command frame is::
+
+      0xf9, 0x03, 0xef, 0x03, 0xc3, 0x16, 0xf9.
+
+Additional Documentation
+------------------------
+More practical details on the protocol and how it's supported by industrial
+modems can be found in the following documents :
+
+- http://www.telit.com/module/infopool/download.php?id=616
+- http://www.u-blox.com/images/downloads/Product_Docs/LEON-G100-G200-MuxImplementation_ApplicationNote_%28GSM%20G1-CS-10002%29.pdf
+- http://www.sierrawireless.com/Support/Downloads/AirPrime/WMP_Series/~/media/Support_Downloads/AirPrime/Application_notes/CMUX_Feature_Application_Note-Rev004.ashx
+- http://wm.sim.com/sim/News/photo/2010721161442.pdf
+
+11-03-08 - Eric Bénard - <eric@eukrea.com>
diff --git a/Documentation/driver-api/serial/rocket.rst b/Documentation/driver-api/serial/rocket.rst
new file mode 100644
index 000000000000..23761eae4282
--- /dev/null
+++ b/Documentation/driver-api/serial/rocket.rst
@@ -0,0 +1,185 @@
+================================================
+Comtrol(tm) RocketPort(R)/RocketModem(TM) Series
+================================================
+
+Device Driver for the Linux Operating System
+============================================
+
+Product overview
+----------------
+
+This driver provides a loadable kernel driver for the Comtrol RocketPort
+and RocketModem PCI boards. These boards provide, 2, 4, 8, 16, or 32
+high-speed serial ports or modems.  This driver supports up to a combination
+of four RocketPort or RocketModems boards in one machine simultaneously.
+This file assumes that you are using the RocketPort driver which is
+integrated into the kernel sources.
+
+The driver can also be installed as an external module using the usual
+"make;make install" routine.  This external module driver, obtainable
+from the Comtrol website listed below, is useful for updating the driver
+or installing it into kernels which do not have the driver configured
+into them.  Installations instructions for the external module
+are in the included README and HW_INSTALL files.
+
+RocketPort ISA and RocketModem II PCI boards currently are only supported by
+this driver in module form.
+
+The RocketPort ISA board requires I/O ports to be configured by the DIP
+switches on the board.  See the section "ISA Rocketport Boards" below for
+information on how to set the DIP switches.
+
+You pass the I/O port to the driver using the following module parameters:
+
+board1:
+	I/O port for the first ISA board
+board2:
+	I/O port for the second ISA board
+board3:
+	I/O port for the third ISA board
+board4:
+	I/O port for the fourth ISA board
+
+There is a set of utilities and scripts provided with the external driver
+(downloadable from http://www.comtrol.com) that ease the configuration and
+setup of the ISA cards.
+
+The RocketModem II PCI boards require firmware to be loaded into the card
+before it will function.  The driver has only been tested as a module for this
+board.
+
+Installation Procedures
+-----------------------
+
+RocketPort/RocketModem PCI cards require no driver configuration, they are
+automatically detected and configured.
+
+The RocketPort driver can be installed as a module (recommended) or built
+into the kernel. This is selected, as for other drivers, through the `make config`
+command from the root of the Linux source tree during the kernel build process.
+
+The RocketPort/RocketModem serial ports installed by this driver are assigned
+device major number 46, and will be named /dev/ttyRx, where x is the port number
+starting at zero (ex. /dev/ttyR0, /devttyR1, ...).  If you have multiple cards
+installed in the system, the mapping of port names to serial ports is displayed
+in the system log at /var/log/messages.
+
+If installed as a module, the module must be loaded.  This can be done
+manually by entering "modprobe rocket".  To have the module loaded automatically
+upon system boot, edit a `/etc/modprobe.d/*.conf` file and add the line
+"alias char-major-46 rocket".
+
+In order to use the ports, their device names (nodes) must be created with mknod.
+This is only required once, the system will retain the names once created.  To
+create the RocketPort/RocketModem device names, use the command
+"mknod /dev/ttyRx c 46 x" where x is the port number starting at zero.
+
+For example::
+
+	> mknod /dev/ttyR0 c 46 0
+	> mknod /dev/ttyR1 c 46 1
+	> mknod /dev/ttyR2 c 46 2
+
+The Linux script MAKEDEV will create the first 16 ttyRx device names (nodes)
+for you::
+
+	>/dev/MAKEDEV ttyR
+
+ISA Rocketport Boards
+---------------------
+
+You must assign and configure the I/O addresses used by the ISA Rocketport
+card before installing and using it.  This is done by setting a set of DIP
+switches on the Rocketport board.
+
+
+Setting the I/O address
+-----------------------
+
+Before installing RocketPort(R) or RocketPort RA boards, you must find
+a range of I/O addresses for it to use. The first RocketPort card
+requires a 68-byte contiguous block of I/O addresses, starting at one
+of the following: 0x100h, 0x140h, 0x180h, 0x200h, 0x240h, 0x280h,
+0x300h, 0x340h, 0x380h.  This I/O address must be reflected in the DIP
+switches of *all* of the Rocketport cards.
+
+The second, third, and fourth RocketPort cards require a 64-byte
+contiguous block of I/O addresses, starting at one of the following
+I/O addresses: 0x100h, 0x140h, 0x180h, 0x1C0h, 0x200h, 0x240h, 0x280h,
+0x2C0h, 0x300h, 0x340h, 0x380h, 0x3C0h.  The I/O address used by the
+second, third, and fourth Rocketport cards (if present) are set via
+software control.  The DIP switch settings for the I/O address must be
+set to the value of the first Rocketport cards.
+
+In order to distinguish each of the card from the others, each card
+must have a unique board ID set on the dip switches.  The first
+Rocketport board must be set with the DIP switches corresponding to
+the first board, the second board must be set with the DIP switches
+corresponding to the second board, etc.  IMPORTANT: The board ID is
+the only place where the DIP switch settings should differ between the
+various Rocketport boards in a system.
+
+The I/O address range used by any of the RocketPort cards must not
+conflict with any other cards in the system, including other
+RocketPort cards.  Below, you will find a list of commonly used I/O
+address ranges which may be in use by other devices in your system.
+On a Linux system, "cat /proc/ioports" will also be helpful in
+identifying what I/O addresses are being used by devices on your
+system.
+
+Remember, the FIRST RocketPort uses 68 I/O addresses.  So, if you set it
+for 0x100, it will occupy 0x100 to 0x143.  This would mean that you
+CAN NOT set the second, third or fourth board for address 0x140 since
+the first 4 bytes of that range are used by the first board.  You would
+need to set the second, third, or fourth board to one of the next available
+blocks such as 0x180.
+
+RocketPort and RocketPort RA SW1 Settings::
+
+            +-------------------------------+
+            | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 |
+            +-------+-------+---------------+
+            | Unused| Card  | I/O Port Block|
+            +-------------------------------+
+
+  DIP Switches                             DIP Switches
+  7    8                                   6    5
+  ===================                      ===================
+  On   On   UNUSED, MUST BE ON.            On   On   First Card    <==== Default
+                                           On   Off  Second Card
+                                           Off  On   Third Card
+                                           Off  Off  Fourth Card
+
+  DIP Switches         I/O Address Range
+  4    3    2    1     Used by the First Card
+  =====================================
+  On   Off  On   Off   100-143
+  On   Off  Off  On    140-183
+  On   Off  Off  Off   180-1C3       <==== Default
+  Off  On   On   Off   200-243
+  Off  On   Off  On    240-283
+  Off  On   Off  Off   280-2C3
+  Off  Off  On   Off   300-343
+  Off  Off  Off  On    340-383
+  Off  Off  Off  Off   380-3C3
+
+Reporting Bugs
+--------------
+
+For technical support, please provide the following
+information: Driver version, kernel release, distribution of
+kernel, and type of board you are using. Error messages and log
+printouts port configuration details are especially helpful.
+
+USA:
+    :Phone: (612) 494-4100
+    :FAX: (612) 494-4199
+    :email: support@comtrol.com
+
+Comtrol Europe:
+    :Phone: +44 (0) 1 869 323-220
+    :FAX: +44 (0) 1 869 323-211
+    :email: support@comtrol.co.uk
+
+Web:	http://www.comtrol.com
+FTP:	ftp.comtrol.com
diff --git a/Documentation/driver-api/serial/serial-iso7816.rst b/Documentation/driver-api/serial/serial-iso7816.rst
new file mode 100644
index 000000000000..d990143de0c6
--- /dev/null
+++ b/Documentation/driver-api/serial/serial-iso7816.rst
@@ -0,0 +1,90 @@
+=============================
+ISO7816 Serial Communications
+=============================
+
+1. Introduction
+===============
+
+  ISO/IEC7816 is a series of standards specifying integrated circuit cards (ICC)
+  also known as smart cards.
+
+2. Hardware-related considerations
+==================================
+
+  Some CPUs/UARTs (e.g., Microchip AT91) contain a built-in mode capable of
+  handling communication with a smart card.
+
+  For these microcontrollers, the Linux driver should be made capable of
+  working in both modes, and proper ioctls (see later) should be made
+  available at user-level to allow switching from one mode to the other, and
+  vice versa.
+
+3. Data Structures Already Available in the Kernel
+==================================================
+
+  The Linux kernel provides the serial_iso7816 structure (see [1]) to handle
+  ISO7816 communications. This data structure is used to set and configure
+  ISO7816 parameters in ioctls.
+
+  Any driver for devices capable of working both as RS232 and ISO7816 should
+  implement the iso7816_config callback in the uart_port structure. The
+  serial_core calls iso7816_config to do the device specific part in response
+  to TIOCGISO7816 and TIOCSISO7816 ioctls (see below). The iso7816_config
+  callback receives a pointer to struct serial_iso7816.
+
+4. Usage from user-level
+========================
+
+  From user-level, ISO7816 configuration can be get/set using the previous
+  ioctls. For instance, to set ISO7816 you can use the following code::
+
+	#include <linux/serial.h>
+
+	/* Include definition for ISO7816 ioctls: TIOCSISO7816 and TIOCGISO7816 */
+	#include <sys/ioctl.h>
+
+	/* Open your specific device (e.g., /dev/mydevice): */
+	int fd = open ("/dev/mydevice", O_RDWR);
+	if (fd < 0) {
+		/* Error handling. See errno. */
+	}
+
+	struct serial_iso7816 iso7816conf;
+
+	/* Reserved fields as to be zeroed */
+	memset(&iso7816conf, 0, sizeof(iso7816conf));
+
+	/* Enable ISO7816 mode: */
+	iso7816conf.flags |= SER_ISO7816_ENABLED;
+
+	/* Select the protocol: */
+	/* T=0 */
+	iso7816conf.flags |= SER_ISO7816_T(0);
+	/* or T=1 */
+	iso7816conf.flags |= SER_ISO7816_T(1);
+
+	/* Set the guard time: */
+	iso7816conf.tg = 2;
+
+	/* Set the clock frequency*/
+	iso7816conf.clk = 3571200;
+
+	/* Set transmission factors: */
+	iso7816conf.sc_fi = 372;
+	iso7816conf.sc_di = 1;
+
+	if (ioctl(fd_usart, TIOCSISO7816, &iso7816conf) < 0) {
+		/* Error handling. See errno. */
+	}
+
+	/* Use read() and write() syscalls here... */
+
+	/* Close the device when finished: */
+	if (close (fd) < 0) {
+		/* Error handling. See errno. */
+	}
+
+5. References
+=============
+
+ [1]    include/uapi/linux/serial.h
diff --git a/Documentation/driver-api/serial/serial-rs485.rst b/Documentation/driver-api/serial/serial-rs485.rst
new file mode 100644
index 000000000000..6bc824f948f9
--- /dev/null
+++ b/Documentation/driver-api/serial/serial-rs485.rst
@@ -0,0 +1,103 @@
+===========================
+RS485 Serial Communications
+===========================
+
+1. Introduction
+===============
+
+   EIA-485, also known as TIA/EIA-485 or RS-485, is a standard defining the
+   electrical characteristics of drivers and receivers for use in balanced
+   digital multipoint systems.
+   This standard is widely used for communications in industrial automation
+   because it can be used effectively over long distances and in electrically
+   noisy environments.
+
+2. Hardware-related Considerations
+==================================
+
+   Some CPUs/UARTs (e.g., Atmel AT91 or 16C950 UART) contain a built-in
+   half-duplex mode capable of automatically controlling line direction by
+   toggling RTS or DTR signals. That can be used to control external
+   half-duplex hardware like an RS485 transceiver or any RS232-connected
+   half-duplex devices like some modems.
+
+   For these microcontrollers, the Linux driver should be made capable of
+   working in both modes, and proper ioctls (see later) should be made
+   available at user-level to allow switching from one mode to the other, and
+   vice versa.
+
+3. Data Structures Already Available in the Kernel
+==================================================
+
+   The Linux kernel provides the serial_rs485 structure (see [1]) to handle
+   RS485 communications. This data structure is used to set and configure RS485
+   parameters in the platform data and in ioctls.
+
+   The device tree can also provide RS485 boot time parameters (see [2]
+   for bindings). The driver is in charge of filling this data structure from
+   the values given by the device tree.
+
+   Any driver for devices capable of working both as RS232 and RS485 should
+   implement the rs485_config callback in the uart_port structure. The
+   serial_core calls rs485_config to do the device specific part in response
+   to TIOCSRS485 and TIOCGRS485 ioctls (see below). The rs485_config callback
+   receives a pointer to struct serial_rs485.
+
+4. Usage from user-level
+========================
+
+   From user-level, RS485 configuration can be get/set using the previous
+   ioctls. For instance, to set RS485 you can use the following code::
+
+	#include <linux/serial.h>
+
+	/* Include definition for RS485 ioctls: TIOCGRS485 and TIOCSRS485 */
+	#include <sys/ioctl.h>
+
+	/* Open your specific device (e.g., /dev/mydevice): */
+	int fd = open ("/dev/mydevice", O_RDWR);
+	if (fd < 0) {
+		/* Error handling. See errno. */
+	}
+
+	struct serial_rs485 rs485conf;
+
+	/* Enable RS485 mode: */
+	rs485conf.flags |= SER_RS485_ENABLED;
+
+	/* Set logical level for RTS pin equal to 1 when sending: */
+	rs485conf.flags |= SER_RS485_RTS_ON_SEND;
+	/* or, set logical level for RTS pin equal to 0 when sending: */
+	rs485conf.flags &= ~(SER_RS485_RTS_ON_SEND);
+
+	/* Set logical level for RTS pin equal to 1 after sending: */
+	rs485conf.flags |= SER_RS485_RTS_AFTER_SEND;
+	/* or, set logical level for RTS pin equal to 0 after sending: */
+	rs485conf.flags &= ~(SER_RS485_RTS_AFTER_SEND);
+
+	/* Set rts delay before send, if needed: */
+	rs485conf.delay_rts_before_send = ...;
+
+	/* Set rts delay after send, if needed: */
+	rs485conf.delay_rts_after_send = ...;
+
+	/* Set this flag if you want to receive data even while sending data */
+	rs485conf.flags |= SER_RS485_RX_DURING_TX;
+
+	if (ioctl (fd, TIOCSRS485, &rs485conf) < 0) {
+		/* Error handling. See errno. */
+	}
+
+	/* Use read() and write() syscalls here... */
+
+	/* Close the device when finished: */
+	if (close (fd) < 0) {
+		/* Error handling. See errno. */
+	}
+
+5. References
+=============
+
+ [1]	include/uapi/linux/serial.h
+
+ [2]	Documentation/devicetree/bindings/serial/rs485.txt
diff --git a/Documentation/driver-api/serial/tty.rst b/Documentation/driver-api/serial/tty.rst
new file mode 100644
index 000000000000..dd972caacf3e
--- /dev/null
+++ b/Documentation/driver-api/serial/tty.rst
@@ -0,0 +1,328 @@
+=================
+The Lockronomicon
+=================
+
+Your guide to the ancient and twisted locking policies of the tty layer and
+the warped logic behind them. Beware all ye who read on.
+
+
+Line Discipline
+---------------
+
+Line disciplines are registered with tty_register_ldisc() passing the
+discipline number and the ldisc structure. At the point of registration the
+discipline must be ready to use and it is possible it will get used before
+the call returns success. If the call returns an error then it won't get
+called. Do not re-use ldisc numbers as they are part of the userspace ABI
+and writing over an existing ldisc will cause demons to eat your computer.
+After the return the ldisc data has been copied so you may free your own
+copy of the structure. You must not re-register over the top of the line
+discipline even with the same data or your computer again will be eaten by
+demons.
+
+In order to remove a line discipline call tty_unregister_ldisc().
+In ancient times this always worked. In modern times the function will
+return -EBUSY if the ldisc is currently in use. Since the ldisc referencing
+code manages the module counts this should not usually be a concern.
+
+Heed this warning: the reference count field of the registered copies of the
+tty_ldisc structure in the ldisc table counts the number of lines using this
+discipline. The reference count of the tty_ldisc structure within a tty
+counts the number of active users of the ldisc at this instant. In effect it
+counts the number of threads of execution within an ldisc method (plus those
+about to enter and exit although this detail matters not).
+
+Line Discipline Methods
+-----------------------
+
+TTY side interfaces
+^^^^^^^^^^^^^^^^^^^
+
+======================= =======================================================
+open()			Called when the line discipline is attached to
+			the terminal. No other call into the line
+			discipline for this tty will occur until it
+			completes successfully. Should initialize any
+			state needed by the ldisc, and set receive_room
+			in the tty_struct to the maximum amount of data
+			the line discipline is willing to accept from the
+			driver with a single call to receive_buf().
+			Returning an error will prevent the ldisc from
+			being attached. Can sleep.
+
+close()			This is called on a terminal when the line
+			discipline is being unplugged. At the point of
+			execution no further users will enter the
+			ldisc code for this tty. Can sleep.
+
+hangup()		Called when the tty line is hung up.
+			The line discipline should cease I/O to the tty.
+			No further calls into the ldisc code will occur.
+			The return value is ignored. Can sleep.
+
+read()			(optional) A process requests reading data from
+			the line. Multiple read calls may occur in parallel
+			and the ldisc must deal with serialization issues.
+			If not defined, the process will receive an EIO
+			error. May sleep.
+
+write()			(optional) A process requests writing data to the
+			line. Multiple write calls are serialized by the
+			tty layer for the ldisc. If not defined, the
+			process will receive an EIO error. May sleep.
+
+flush_buffer()		(optional) May be called at any point between
+			open and close, and instructs the line discipline
+			to empty its input buffer.
+
+set_termios()		(optional) Called on termios structure changes.
+			The caller passes the old termios data and the
+			current data is in the tty. Called under the
+			termios semaphore so allowed to sleep. Serialized
+			against itself only.
+
+poll()			(optional) Check the status for the poll/select
+			calls. Multiple poll calls may occur in parallel.
+			May sleep.
+
+ioctl()			(optional) Called when an ioctl is handed to the
+			tty layer that might be for the ldisc. Multiple
+			ioctl calls may occur in parallel. May sleep.
+
+compat_ioctl()		(optional) Called when a 32 bit ioctl is handed
+			to the tty layer that might be for the ldisc.
+			Multiple ioctl calls may occur in parallel.
+			May sleep.
+======================= =======================================================
+
+Driver Side Interfaces
+^^^^^^^^^^^^^^^^^^^^^^
+
+======================= =======================================================
+receive_buf()		(optional) Called by the low-level driver to hand
+			a buffer of received bytes to the ldisc for
+			processing. The number of bytes is guaranteed not
+			to exceed the current value of tty->receive_room.
+			All bytes must be processed.
+
+receive_buf2()		(optional) Called by the low-level driver to hand
+			a buffer of received bytes to the ldisc for
+			processing. Returns the number of bytes processed.
+
+			If both receive_buf() and receive_buf2() are
+			defined, receive_buf2() should be preferred.
+
+write_wakeup()		May be called at any point between open and close.
+			The TTY_DO_WRITE_WAKEUP flag indicates if a call
+			is needed but always races versus calls. Thus the
+			ldisc must be careful about setting order and to
+			handle unexpected calls. Must not sleep.
+
+			The driver is forbidden from calling this directly
+			from the ->write call from the ldisc as the ldisc
+			is permitted to call the driver write method from
+			this function. In such a situation defer it.
+
+dcd_change()		Report to the tty line the current DCD pin status
+			changes and the relative timestamp. The timestamp
+			cannot be NULL.
+======================= =======================================================
+
+
+Driver Access
+^^^^^^^^^^^^^
+
+Line discipline methods can call the following methods of the underlying
+hardware driver through the function pointers within the tty->driver
+structure:
+
+======================= =======================================================
+write()			Write a block of characters to the tty device.
+			Returns the number of characters accepted. The
+			character buffer passed to this method is already
+			in kernel space.
+
+put_char()		Queues a character for writing to the tty device.
+			If there is no room in the queue, the character is
+			ignored.
+
+flush_chars()		(Optional) If defined, must be called after
+			queueing characters with put_char() in order to
+			start transmission.
+
+write_room()		Returns the numbers of characters the tty driver
+			will accept for queueing to be written.
+
+ioctl()			Invoke device specific ioctl.
+			Expects data pointers to refer to userspace.
+			Returns ENOIOCTLCMD for unrecognized ioctl numbers.
+
+set_termios()		Notify the tty driver that the device's termios
+			settings have changed. New settings are in
+			tty->termios. Previous settings should be passed in
+			the "old" argument.
+
+			The API is defined such that the driver should return
+			the actual modes selected. This means that the
+			driver function is responsible for modifying any
+			bits in the request it cannot fulfill to indicate
+			the actual modes being used. A device with no
+			hardware capability for change (e.g. a USB dongle or
+			virtual port) can provide NULL for this method.
+
+throttle()		Notify the tty driver that input buffers for the
+			line discipline are close to full, and it should
+			somehow signal that no more characters should be
+			sent to the tty.
+
+unthrottle()		Notify the tty driver that characters can now be
+			sent to the tty without fear of overrunning the
+			input buffers of the line disciplines.
+
+stop()			Ask the tty driver to stop outputting characters
+			to the tty device.
+
+start()			Ask the tty driver to resume sending characters
+			to the tty device.
+
+hangup()		Ask the tty driver to hang up the tty device.
+
+break_ctl()		(Optional) Ask the tty driver to turn on or off
+			BREAK status on the RS-232 port.  If state is -1,
+			then the BREAK status should be turned on; if
+			state is 0, then BREAK should be turned off.
+			If this routine is not implemented, use ioctls
+			TIOCSBRK / TIOCCBRK instead.
+
+wait_until_sent()	Waits until the device has written out all of the
+			characters in its transmitter FIFO.
+
+send_xchar()		Send a high-priority XON/XOFF character to the device.
+======================= =======================================================
+
+
+Flags
+^^^^^
+
+Line discipline methods have access to tty->flags field containing the
+following interesting flags:
+
+======================= =======================================================
+TTY_THROTTLED		Driver input is throttled. The ldisc should call
+			tty->driver->unthrottle() in order to resume
+			reception when it is ready to process more data.
+
+TTY_DO_WRITE_WAKEUP	If set, causes the driver to call the ldisc's
+			write_wakeup() method in order to resume
+			transmission when it can accept more data
+			to transmit.
+
+TTY_IO_ERROR		If set, causes all subsequent userspace read/write
+			calls on the tty to fail, returning -EIO.
+
+TTY_OTHER_CLOSED	Device is a pty and the other side has closed.
+
+TTY_NO_WRITE_SPLIT	Prevent driver from splitting up writes into
+			smaller chunks.
+======================= =======================================================
+
+
+Locking
+^^^^^^^
+
+Callers to the line discipline functions from the tty layer are required to
+take line discipline locks. The same is true of calls from the driver side
+but not yet enforced.
+
+Three calls are now provided::
+
+	ldisc = tty_ldisc_ref(tty);
+
+takes a handle to the line discipline in the tty and returns it. If no ldisc
+is currently attached or the ldisc is being closed and re-opened at this
+point then NULL is returned. While this handle is held the ldisc will not
+change or go away::
+
+	tty_ldisc_deref(ldisc)
+
+Returns the ldisc reference and allows the ldisc to be closed. Returning the
+reference takes away your right to call the ldisc functions until you take
+a new reference::
+
+	ldisc = tty_ldisc_ref_wait(tty);
+
+Performs the same function as tty_ldisc_ref except that it will wait for an
+ldisc change to complete and then return a reference to the new ldisc.
+
+While these functions are slightly slower than the old code they should have
+minimal impact as most receive logic uses the flip buffers and they only
+need to take a reference when they push bits up through the driver.
+
+A caution: The ldisc->open(), ldisc->close() and driver->set_ldisc
+functions are called with the ldisc unavailable. Thus tty_ldisc_ref will
+fail in this situation if used within these functions. Ldisc and driver
+code calling its own functions must be careful in this case.
+
+
+Driver Interface
+----------------
+
+======================= =======================================================
+open()			Called when a device is opened. May sleep
+
+close()			Called when a device is closed. At the point of
+			return from this call the driver must make no
+			further ldisc calls of any kind. May sleep
+
+write()			Called to write bytes to the device. May not
+			sleep. May occur in parallel in special cases.
+			Because this includes panic paths drivers generally
+			shouldn't try and do clever locking here.
+
+put_char()		Stuff a single character onto the queue. The
+			driver is guaranteed following up calls to
+			flush_chars.
+
+flush_chars()		Ask the kernel to write put_char queue
+
+write_room()		Return the number of characters that can be stuffed
+			into the port buffers without overflow (or less).
+			The ldisc is responsible for being intelligent
+			about multi-threading of write_room/write calls
+
+ioctl()			Called when an ioctl may be for the driver
+
+set_termios()		Called on termios change, serialized against
+			itself by a semaphore. May sleep.
+
+set_ldisc()		Notifier for discipline change. At the point this
+			is done the discipline is not yet usable. Can now
+			sleep (I think)
+
+throttle()		Called by the ldisc to ask the driver to do flow
+			control.  Serialization including with unthrottle
+			is the job of the ldisc layer.
+
+unthrottle()		Called by the ldisc to ask the driver to stop flow
+			control.
+
+stop()			Ldisc notifier to the driver to stop output. As with
+			throttle the serializations with start() are down
+			to the ldisc layer.
+
+start()			Ldisc notifier to the driver to start output.
+
+hangup()		Ask the tty driver to cause a hangup initiated
+			from the host side. [Can sleep ??]
+
+break_ctl()		Send RS232 break. Can sleep. Can get called in
+			parallel, driver must serialize (for now), and
+			with write calls.
+
+wait_until_sent()	Wait for characters to exit the hardware queue
+			of the driver. Can sleep
+
+send_xchar()	  	Send XON/XOFF and if possible jump the queue with
+			it in order to get fast flow control responses.
+			Cannot sleep ??
+======================= =======================================================
diff --git a/Documentation/serial/cyclades_z.rst b/Documentation/serial/cyclades_z.rst
deleted file mode 100644
index 532ff67e2f1c..000000000000
--- a/Documentation/serial/cyclades_z.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-================
-Cyclades-Z notes
-================
-
-The Cyclades-Z must have firmware loaded onto the card before it will
-operate.  This operation should be performed during system startup,
-
-The firmware, loader program and the latest device driver code are
-available from Cyclades at
-
-    ftp://ftp.cyclades.com/pub/cyclades/cyclades-z/linux/
diff --git a/Documentation/serial/driver.rst b/Documentation/serial/driver.rst
deleted file mode 100644
index 4537119bf624..000000000000
--- a/Documentation/serial/driver.rst
+++ /dev/null
@@ -1,549 +0,0 @@
-====================
-Low Level Serial API
-====================
-
-
-This document is meant as a brief overview of some aspects of the new serial
-driver.  It is not complete, any questions you have should be directed to
-<rmk@arm.linux.org.uk>
-
-The reference implementation is contained within amba-pl011.c.
-
-
-
-Low Level Serial Hardware Driver
---------------------------------
-
-The low level serial hardware driver is responsible for supplying port
-information (defined by uart_port) and a set of control methods (defined
-by uart_ops) to the core serial driver.  The low level driver is also
-responsible for handling interrupts for the port, and providing any
-console support.
-
-
-Console Support
----------------
-
-The serial core provides a few helper functions.  This includes identifing
-the correct port structure (via uart_get_console) and decoding command line
-arguments (uart_parse_options).
-
-There is also a helper function (uart_console_write) which performs a
-character by character write, translating newlines to CRLF sequences.
-Driver writers are recommended to use this function rather than implementing
-their own version.
-
-
-Locking
--------
-
-It is the responsibility of the low level hardware driver to perform the
-necessary locking using port->lock.  There are some exceptions (which
-are described in the uart_ops listing below.)
-
-There are two locks.  A per-port spinlock, and an overall semaphore.
-
-From the core driver perspective, the port->lock locks the following
-data::
-
-	port->mctrl
-	port->icount
-	port->state->xmit.head (circ_buf->head)
-	port->state->xmit.tail (circ_buf->tail)
-
-The low level driver is free to use this lock to provide any additional
-locking.
-
-The port_sem semaphore is used to protect against ports being added/
-removed or reconfigured at inappropriate times. Since v2.6.27, this
-semaphore has been the 'mutex' member of the tty_port struct, and
-commonly referred to as the port mutex.
-
-
-uart_ops
---------
-
-The uart_ops structure is the main interface between serial_core and the
-hardware specific driver.  It contains all the methods to control the
-hardware.
-
-  tx_empty(port)
-	This function tests whether the transmitter fifo and shifter
-	for the port described by 'port' is empty.  If it is empty,
-	this function should return TIOCSER_TEMT, otherwise return 0.
-	If the port does not support this operation, then it should
-	return TIOCSER_TEMT.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  set_mctrl(port, mctrl)
-	This function sets the modem control lines for port described
-	by 'port' to the state described by mctrl.  The relevant bits
-	of mctrl are:
-
-		- TIOCM_RTS	RTS signal.
-		- TIOCM_DTR	DTR signal.
-		- TIOCM_OUT1	OUT1 signal.
-		- TIOCM_OUT2	OUT2 signal.
-		- TIOCM_LOOP	Set the port into loopback mode.
-
-	If the appropriate bit is set, the signal should be driven
-	active.  If the bit is clear, the signal should be driven
-	inactive.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  get_mctrl(port)
-	Returns the current state of modem control inputs.  The state
-	of the outputs should not be returned, since the core keeps
-	track of their state.  The state information should include:
-
-		- TIOCM_CAR	state of DCD signal
-		- TIOCM_CTS	state of CTS signal
-		- TIOCM_DSR	state of DSR signal
-		- TIOCM_RI	state of RI signal
-
-	The bit is set if the signal is currently driven active.  If
-	the port does not support CTS, DCD or DSR, the driver should
-	indicate that the signal is permanently active.  If RI is
-	not available, the signal should not be indicated as active.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  stop_tx(port)
-	Stop transmitting characters.  This might be due to the CTS
-	line becoming inactive or the tty layer indicating we want
-	to stop transmission due to an XOFF character.
-
-	The driver should stop transmitting characters as soon as
-	possible.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  start_tx(port)
-	Start transmitting characters.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  throttle(port)
-	Notify the serial driver that input buffers for the line discipline are
-	close to full, and it should somehow signal that no more characters
-	should be sent to the serial port.
-	This will be called only if hardware assisted flow control is enabled.
-
-	Locking: serialized with .unthrottle() and termios modification by the
-	tty layer.
-
-  unthrottle(port)
-	Notify the serial driver that characters can now be sent to the serial
-	port without fear of overrunning the input buffers of the line
-	disciplines.
-
-	This will be called only if hardware assisted flow control is enabled.
-
-	Locking: serialized with .throttle() and termios modification by the
-	tty layer.
-
-  send_xchar(port,ch)
-	Transmit a high priority character, even if the port is stopped.
-	This is used to implement XON/XOFF flow control and tcflow().  If
-	the serial driver does not implement this function, the tty core
-	will append the character to the circular buffer and then call
-	start_tx() / stop_tx() to flush the data out.
-
-	Do not transmit if ch == '\0' (__DISABLED_CHAR).
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  stop_rx(port)
-	Stop receiving characters; the port is in the process of
-	being closed.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  enable_ms(port)
-	Enable the modem status interrupts.
-
-	This method may be called multiple times.  Modem status
-	interrupts should be disabled when the shutdown method is
-	called.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  break_ctl(port,ctl)
-	Control the transmission of a break signal.  If ctl is
-	nonzero, the break signal should be transmitted.  The signal
-	should be terminated when another call is made with a zero
-	ctl.
-
-	Locking: caller holds tty_port->mutex
-
-  startup(port)
-	Grab any interrupt resources and initialise any low level driver
-	state.  Enable the port for reception.  It should not activate
-	RTS nor DTR; this will be done via a separate call to set_mctrl.
-
-	This method will only be called when the port is initially opened.
-
-	Locking: port_sem taken.
-
-	Interrupts: globally disabled.
-
-  shutdown(port)
-	Disable the port, disable any break condition that may be in
-	effect, and free any interrupt resources.  It should not disable
-	RTS nor DTR; this will have already been done via a separate
-	call to set_mctrl.
-
-	Drivers must not access port->state once this call has completed.
-
-	This method will only be called when there are no more users of
-	this port.
-
-	Locking: port_sem taken.
-
-	Interrupts: caller dependent.
-
-  flush_buffer(port)
-	Flush any write buffers, reset any DMA state and stop any
-	ongoing DMA transfers.
-
-	This will be called whenever the port->state->xmit circular
-	buffer is cleared.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  set_termios(port,termios,oldtermios)
-	Change the port parameters, including word length, parity, stop
-	bits.  Update read_status_mask and ignore_status_mask to indicate
-	the types of events we are interested in receiving.  Relevant
-	termios->c_cflag bits are:
-
-		CSIZE
-			- word size
-		CSTOPB
-			- 2 stop bits
-		PARENB
-			- parity enable
-		PARODD
-			- odd parity (when PARENB is in force)
-		CREAD
-			- enable reception of characters (if not set,
-			  still receive characters from the port, but
-			  throw them away.
-		CRTSCTS
-			- if set, enable CTS status change reporting
-		CLOCAL
-			- if not set, enable modem status change
-			  reporting.
-
-	Relevant termios->c_iflag bits are:
-
-		INPCK
-			- enable frame and parity error events to be
-			  passed to the TTY layer.
-		BRKINT / PARMRK
-			- both of these enable break events to be
-			  passed to the TTY layer.
-
-		IGNPAR
-			- ignore parity and framing errors
-		IGNBRK
-			- ignore break errors,  If IGNPAR is also
-			  set, ignore overrun errors as well.
-
-	The interaction of the iflag bits is as follows (parity error
-	given as an example):
-
-	=============== ======= ======  =============================
-	Parity error	INPCK	IGNPAR
-	=============== ======= ======  =============================
-	n/a		0	n/a	character received, marked as
-					TTY_NORMAL
-	None		1	n/a	character received, marked as
-					TTY_NORMAL
-	Yes		1	0	character received, marked as
-					TTY_PARITY
-	Yes		1	1	character discarded
-	=============== ======= ======  =============================
-
-	Other flags may be used (eg, xon/xoff characters) if your
-	hardware supports hardware "soft" flow control.
-
-	Locking: caller holds tty_port->mutex
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  set_ldisc(port,termios)
-	Notifier for discipline change. See Documentation/serial/tty.rst.
-
-	Locking: caller holds tty_port->mutex
-
-  pm(port,state,oldstate)
-	Perform any power management related activities on the specified
-	port.  State indicates the new state (defined by
-	enum uart_pm_state), oldstate indicates the previous state.
-
-	This function should not be used to grab any resources.
-
-	This will be called when the port is initially opened and finally
-	closed, except when the port is also the system console.  This
-	will occur even if CONFIG_PM is not set.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  type(port)
-	Return a pointer to a string constant describing the specified
-	port, or return NULL, in which case the string 'unknown' is
-	substituted.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  release_port(port)
-	Release any memory and IO region resources currently in use by
-	the port.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  request_port(port)
-	Request any memory and IO region resources required by the port.
-	If any fail, no resources should be registered when this function
-	returns, and it should return -EBUSY on failure.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  config_port(port,type)
-	Perform any autoconfiguration steps required for the port.  `type`
-	contains a bit mask of the required configuration.  UART_CONFIG_TYPE
-	indicates that the port requires detection and identification.
-	port->type should be set to the type found, or PORT_UNKNOWN if
-	no port was detected.
-
-	UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
-	which should be probed using standard kernel autoprobing techniques.
-	This is not necessary on platforms where ports have interrupts
-	internally hard wired (eg, system on a chip implementations).
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  verify_port(port,serinfo)
-	Verify the new serial port information contained within serinfo is
-	suitable for this port type.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  ioctl(port,cmd,arg)
-	Perform any port specific IOCTLs.  IOCTL commands must be defined
-	using the standard numbering system found in <asm/ioctl.h>
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  poll_init(port)
-	Called by kgdb to perform the minimal hardware initialization needed
-	to support poll_put_char() and poll_get_char().  Unlike ->startup()
-	this should not request interrupts.
-
-	Locking: tty_mutex and tty_port->mutex taken.
-
-	Interrupts: n/a.
-
-  poll_put_char(port,ch)
-	Called by kgdb to write a single character directly to the serial
-	port.  It can and should block until there is space in the TX FIFO.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  poll_get_char(port)
-	Called by kgdb to read a single character directly from the serial
-	port.  If data is available, it should be returned; otherwise
-	the function should return NO_POLL_CHAR immediately.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-Other functions
----------------
-
-uart_update_timeout(port,cflag,baud)
-	Update the FIFO drain timeout, port->timeout, according to the
-	number of bits, parity, stop bits and baud rate.
-
-	Locking: caller is expected to take port->lock
-
-	Interrupts: n/a
-
-uart_get_baud_rate(port,termios,old,min,max)
-	Return the numeric baud rate for the specified termios, taking
-	account of the special 38400 baud "kludge".  The B0 baud rate
-	is mapped to 9600 baud.
-
-	If the baud rate is not within min..max, then if old is non-NULL,
-	the original baud rate will be tried.  If that exceeds the
-	min..max constraint, 9600 baud will be returned.  termios will
-	be updated to the baud rate in use.
-
-	Note: min..max must always allow 9600 baud to be selected.
-
-	Locking: caller dependent.
-
-	Interrupts: n/a
-
-uart_get_divisor(port,baud)
-	Return the divisor (baud_base / baud) for the specified baud
-	rate, appropriately rounded.
-
-	If 38400 baud and custom divisor is selected, return the
-	custom divisor instead.
-
-	Locking: caller dependent.
-
-	Interrupts: n/a
-
-uart_match_port(port1,port2)
-	This utility function can be used to determine whether two
-	uart_port structures describe the same port.
-
-	Locking: n/a
-
-	Interrupts: n/a
-
-uart_write_wakeup(port)
-	A driver is expected to call this function when the number of
-	characters in the transmit buffer have dropped below a threshold.
-
-	Locking: port->lock should be held.
-
-	Interrupts: n/a
-
-uart_register_driver(drv)
-	Register a uart driver with the core driver.  We in turn register
-	with the tty layer, and initialise the core driver per-port state.
-
-	drv->port should be NULL, and the per-port structures should be
-	registered using uart_add_one_port after this call has succeeded.
-
-	Locking: none
-
-	Interrupts: enabled
-
-uart_unregister_driver()
-	Remove all references to a driver from the core driver.  The low
-	level driver must have removed all its ports via the
-	uart_remove_one_port() if it registered them with uart_add_one_port().
-
-	Locking: none
-
-	Interrupts: enabled
-
-**uart_suspend_port()**
-
-**uart_resume_port()**
-
-**uart_add_one_port()**
-
-**uart_remove_one_port()**
-
-Other notes
------------
-
-It is intended some day to drop the 'unused' entries from uart_port, and
-allow low level drivers to register their own individual uart_port's with
-the core.  This will allow drivers to use uart_port as a pointer to a
-structure containing both the uart_port entry with their own extensions,
-thus::
-
-	struct my_port {
-		struct uart_port	port;
-		int			my_stuff;
-	};
-
-Modem control lines via GPIO
-----------------------------
-
-Some helpers are provided in order to set/get modem control lines via GPIO.
-
-mctrl_gpio_init(port, idx):
-	This will get the {cts,rts,...}-gpios from device tree if they are
-	present and request them, set direction etc, and return an
-	allocated structure. `devm_*` functions are used, so there's no need
-	to call mctrl_gpio_free().
-	As this sets up the irq handling make sure to not handle changes to the
-	gpio input lines in your driver, too.
-
-mctrl_gpio_free(dev, gpios):
-	This will free the requested gpios in mctrl_gpio_init().
-	As `devm_*` functions are used, there's generally no need to call
-	this function.
-
-mctrl_gpio_to_gpiod(gpios, gidx)
-	This returns the gpio_desc structure associated to the modem line
-	index.
-
-mctrl_gpio_set(gpios, mctrl):
-	This will sets the gpios according to the mctrl state.
-
-mctrl_gpio_get(gpios, mctrl):
-	This will update mctrl with the gpios values.
-
-mctrl_gpio_enable_ms(gpios):
-	Enables irqs and handling of changes to the ms lines.
-
-mctrl_gpio_disable_ms(gpios):
-	Disables irqs and handling of changes to the ms lines.
diff --git a/Documentation/serial/index.rst b/Documentation/serial/index.rst
deleted file mode 100644
index d0ba22ea23bf..000000000000
--- a/Documentation/serial/index.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-:orphan:
-
-==========================
-Support for Serial devices
-==========================
-
-.. toctree::
-    :maxdepth: 1
-
-
-    driver
-    tty
-
-Serial drivers
-==============
-
-.. toctree::
-    :maxdepth: 1
-
-    cyclades_z
-    moxa-smartio
-    n_gsm
-    rocket
-    serial-iso7816
-    serial-rs485
-
-.. only::  subproject and html
-
-   Indices
-   =======
-
-   * :ref:`genindex`
diff --git a/Documentation/serial/moxa-smartio.rst b/Documentation/serial/moxa-smartio.rst
deleted file mode 100644
index 156100f17c3f..000000000000
--- a/Documentation/serial/moxa-smartio.rst
+++ /dev/null
@@ -1,615 +0,0 @@
-=============================================================
-MOXA Smartio/Industio Family Device Driver Installation Guide
-=============================================================
-
-.. note::
-
-   This file is outdated. It needs some care in order to make it
-   updated to Kernel 5.0 and upper
-
-Copyright (C) 2008, Moxa Inc.
-
-Date: 01/21/2008
-
-.. Content
-
-   1. Introduction
-   2. System Requirement
-   3. Installation
-      3.1 Hardware installation
-      3.2 Driver files
-      3.3 Device naming convention
-      3.4 Module driver configuration
-      3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x.
-      3.6 Custom configuration
-      3.7 Verify driver installation
-   4. Utilities
-   5. Setserial
-   6. Troubleshooting
-
-1. Introduction
-^^^^^^^^^^^^^^^
-
-   The Smartio/Industio/UPCI family Linux driver supports following multiport
-   boards.
-
-    - 2 ports multiport board
-	CP-102U, CP-102UL, CP-102UF
-	CP-132U-I, CP-132UL,
-	CP-132, CP-132I, CP132S, CP-132IS,
-	CI-132, CI-132I, CI-132IS,
-	(C102H, C102HI, C102HIS, C102P, CP-102, CP-102S)
-
-    - 4 ports multiport board
-	CP-104EL,
-	CP-104UL, CP-104JU,
-	CP-134U, CP-134U-I,
-	C104H/PCI, C104HS/PCI,
-	CP-114, CP-114I, CP-114S, CP-114IS, CP-114UL,
-	C104H, C104HS,
-	CI-104J, CI-104JS,
-	CI-134, CI-134I, CI-134IS,
-	(C114HI, CT-114I, C104P),
-	POS-104UL,
-	CB-114,
-	CB-134I
-
-    - 8 ports multiport board
-	CP-118EL, CP-168EL,
-	CP-118U, CP-168U,
-	C168H/PCI,
-	C168H, C168HS,
-	(C168P),
-	CB-108
-
-   This driver and installation procedure have been developed upon Linux Kernel
-   2.4.x and 2.6.x. This driver supports Intel x86 hardware platform. In order
-   to maintain compatibility, this version has also been properly tested with
-   RedHat, Mandrake, Fedora and S.u.S.E Linux. However, if compatibility problem
-   occurs, please contact Moxa at support@moxa.com.tw.
-
-   In addition to device driver, useful utilities are also provided in this
-   version. They are:
-
-    - msdiag
-		 Diagnostic program for displaying installed Moxa
-                 Smartio/Industio boards.
-    - msmon
-		 Monitor program to observe data count and line status signals.
-    - msterm     A simple terminal program which is useful in testing serial
-	         ports.
-    - io-irq.exe
-		 Configuration program to setup ISA boards. Please note that
-                 this program can only be executed under DOS.
-
-   All the drivers and utilities are published in form of source code under
-   GNU General Public License in this version. Please refer to GNU General
-   Public License announcement in each source code file for more detail.
-
-   In Moxa's Web sites, you may always find latest driver at http://www.moxa.com/.
-
-   This version of driver can be installed as Loadable Module (Module driver)
-   or built-in into kernel (Static driver). You may refer to following
-   installation procedure for suitable one. Before you install the driver,
-   please refer to hardware installation procedure in the User's Manual.
-
-   We assume the user should be familiar with following documents.
-
-   - Serial-HOWTO
-   - Kernel-HOWTO
-
-2. System Requirement
-^^^^^^^^^^^^^^^^^^^^^
-
-   - Hardware platform: Intel x86 machine
-   - Kernel version: 2.4.x or 2.6.x
-   - gcc version 2.72 or later
-   - Maximum 4 boards can be installed in combination
-
-3. Installation
-^^^^^^^^^^^^^^^
-
-3.1 Hardware installation
-=========================
-
-   There are two types of buses, ISA and PCI, for Smartio/Industio
-   family multiport board.
-
-ISA board
----------
-
-   You'll have to configure CAP address, I/O address, Interrupt Vector
-   as well as IRQ before installing this driver. Please refer to hardware
-   installation procedure in User's Manual before proceed any further.
-   Please make sure the JP1 is open after the ISA board is set properly.
-
-PCI/UPCI board
---------------
-
-   You may need to adjust IRQ usage in BIOS to avoid from IRQ conflict
-   with other ISA devices. Please refer to hardware installation
-   procedure in User's Manual in advance.
-
-PCI IRQ Sharing
----------------
-
-   Each port within the same multiport board shares the same IRQ. Up to
-   4 Moxa Smartio/Industio PCI Family multiport boards can be installed
-   together on one system and they can share the same IRQ.
-
-
-3.2 Driver files
-================
-
-   The driver file may be obtained from ftp, CD-ROM or floppy disk. The
-   first step, anyway, is to copy driver file "mxser.tgz" into specified
-   directory. e.g. /moxa. The execute commands as below::
-
-       # cd /
-       # mkdir moxa
-       # cd /moxa
-       # tar xvf /dev/fd0
-
-or::
-
-       # cd /
-       # mkdir moxa
-       # cd /moxa
-       # cp /mnt/cdrom/<driver directory>/mxser.tgz .
-       # tar xvfz mxser.tgz
-
-
-3.3 Device naming convention
-============================
-
-   You may find all the driver and utilities files in /moxa/mxser.
-   Following installation procedure depends on the model you'd like to
-   run the driver. If you prefer module driver, please refer to 3.4.
-   If static driver is required, please refer to 3.5.
-
-Dialin and callout port
------------------------
-
-   This driver remains traditional serial device properties. There are
-   two special file name for each serial port. One is dial-in port
-   which is named "ttyMxx". For callout port, the naming convention
-   is "cumxx".
-
-Device naming when more than 2 boards installed
------------------------------------------------
-
-   Naming convention for each Smartio/Industio multiport board is
-   pre-defined as below.
-
-   ============ ===============       ==============
-   Board Num.	 Dial-in Port	      Callout port
-   1st board	ttyM0  - ttyM7	      cum0  - cum7
-   2nd board	ttyM8  - ttyM15       cum8  - cum15
-   3rd board	ttyM16 - ttyM23       cum16 - cum23
-   4th board	ttyM24 - ttym31       cum24 - cum31
-   ============ ===============       ==============
-
-.. note::
-
-   Under Kernel 2.6 and upper, the cum Device is Obsolete. So use ttyM*
-   device instead.
-
-Board sequence
---------------
-
-   This driver will activate ISA boards according to the parameter set
-   in the driver. After all specified ISA board activated, PCI board
-   will be installed in the system automatically driven.
-   Therefore the board number is sorted by the CAP address of ISA boards.
-   For PCI boards, their sequence will be after ISA boards and C168H/PCI
-   has higher priority than C104H/PCI boards.
-
-3.4 Module driver configuration
-===============================
-
-   Module driver is easiest way to install. If you prefer static driver
-   installation, please skip this paragraph.
-
-
-   ------------- Prepare to use the MOXA driver --------------------
-
-3.4.1 Create tty device with correct major number
--------------------------------------------------
-
-   Before using MOXA driver, your system must have the tty devices
-   which are created with driver's major number. We offer one shell
-   script "msmknod" to simplify the procedure.
-   This step is only needed to be executed once. But you still
-   need to do this procedure when:
-
-   a. You change the driver's major number. Please refer the "3.7"
-      section.
-   b. Your total installed MOXA boards number is changed. Maybe you
-      add/delete one MOXA board.
-   c. You want to change the tty name. This needs to modify the
-      shell script "msmknod"
-
-   The procedure is::
-
-	 # cd /moxa/mxser/driver
-	 # ./msmknod
-
-   This shell script will require the major number for dial-in
-   device and callout device to create tty device. You also need
-   to specify the total installed MOXA board number. Default major
-   numbers for dial-in device and callout device are 30, 35. If
-   you need to change to other number, please refer section "3.7"
-   for more detailed procedure.
-   Msmknod will delete any special files occupying the same device
-   naming.
-
-3.4.2 Build the MOXA driver and utilities
------------------------------------------
-
-   Before using the MOXA driver and utilities, you need compile the
-   all the source code. This step is only need to be executed once.
-   But you still re-compile the source code if you modify the source
-   code. For example, if you change the driver's major number (see
-   "3.7" section), then you need to do this step again.
-
-   Find "Makefile" in /moxa/mxser, then run
-
-	 # make clean; make install
-
-   ..note::
-
-	 For Red Hat 9, Red Hat Enterprise Linux AS3/ES3/WS3 & Fedora Core1:
-	 # make clean; make installsp1
-
-	 For Red Hat Enterprise Linux AS4/ES4/WS4:
-	 # make clean; make installsp2
-
-   The driver files "mxser.o" and utilities will be properly compiled
-   and copied to system directories respectively.
-
-------------- Load MOXA driver--------------------
-
-3.4.3 Load the MOXA driver
---------------------------
-
-   ::
-
-	 # modprobe mxser <argument>
-
-   will activate the module driver. You may run "lsmod" to check
-   if "mxser" is activated. If the MOXA board is ISA board, the
-   <argument> is needed. Please refer to section "3.4.5" for more
-   information.
-
-------------- Load MOXA driver on boot --------------------
-
-3.4.4 Load the mxser driver
----------------------------
-
-
-   For the above description, you may manually execute
-   "modprobe mxser" to activate this driver and run
-   "rmmod mxser" to remove it.
-
-   However, it's better to have a boot time configuration to
-   eliminate manual operation. Boot time configuration can be
-   achieved by rc file. We offer one "rc.mxser" file to simplify
-   the procedure under "moxa/mxser/driver".
-
-   But if you use ISA board, please modify the "modprobe ..." command
-   to add the argument (see "3.4.5" section). After modifying the
-   rc.mxser, please try to execute "/moxa/mxser/driver/rc.mxser"
-   manually to make sure the modification is ok. If any error
-   encountered, please try to modify again. If the modification is
-   completed, follow the below step.
-
-   Run following command for setting rc files::
-
-	 # cd /moxa/mxser/driver
-	 # cp ./rc.mxser /etc/rc.d
-	 # cd /etc/rc.d
-
-   Check "rc.serial" is existed or not. If "rc.serial" doesn't exist,
-   create it by vi, run "chmod 755 rc.serial" to change the permission.
-
-   Add "/etc/rc.d/rc.mxser" in last line.
-
-   Reboot and check if moxa.o activated by "lsmod" command.
-
-3.4.5. specify CAP address
---------------------------
-
-   If you'd like to drive Smartio/Industio ISA boards in the system,
-   you'll have to add parameter to specify CAP address of given
-   board while activating "mxser.o". The format for parameters are
-   as follows.::
-
-	   modprobe mxser ioaddr=0x???,0x???,0x???,0x???
-				  |  |  |    |
-				  |  |  |    +- 4th ISA board
-				  |  |  +------ 3rd ISA board
-				  |  +------------ 2nd ISA board
-				  +-------------------1st ISA board
-
-3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x
-================================================================
-
-    Note:
-          To use static driver, you must install the linux kernel
-          source package.
-
-3.5.1 Backup the built-in driver in the kernel
-----------------------------------------------
-
-    ::
-
-       # cd /usr/src/linux/drivers/char
-       # mv mxser.c mxser.c.old
-
-       For Red Hat 7.x user, you need to create link:
-       # cd /usr/src
-       # ln -s linux-2.4 linux
-
-3.5.2 Create link
------------------
-    ::
-
-	  # cd /usr/src/linux/drivers/char
-	  # ln -s /moxa/mxser/driver/mxser.c mxser.c
-
-3.5.3 Add CAP address list for ISA boards.
-------------------------------------------
-
-    For PCI boards user, please skip this step.
-
-    In module mode, the CAP address for ISA board is given by
-    parameter. In static driver configuration, you'll have to
-    assign it within driver's source code. If you will not
-    install any ISA boards, you may skip to next portion.
-    The instructions to modify driver source code are as
-    below.
-
-    a. run::
-
-	# cd /moxa/mxser/driver
-	# vi mxser.c
-
-    b. Find the array mxserBoardCAP[] as below::
-
-	  static int mxserBoardCAP[] = {0x00, 0x00, 0x00, 0x00};
-
-    c. Change the address within this array using vi. For
-       example, to driver 2 ISA boards with CAP address
-       0x280 and 0x180 as 1st and 2nd board. Just to change
-       the source code as follows::
-
-	  static int mxserBoardCAP[] = {0x280, 0x180, 0x00, 0x00};
-
-3.5.4 Setup kernel configuration
---------------------------------
-
-    Configure the kernel::
-
-      # cd /usr/src/linux
-      # make menuconfig
-
-    You will go into a menu-driven system. Please select [Character
-    devices][Non-standard serial port support], enable the [Moxa
-    SmartIO support] driver with "[*]" for built-in (not "[M]"), then
-    select [Exit] to exit this program.
-
-3.5.5 Rebuild kernel
---------------------
-
-    The following are for Linux kernel rebuilding, for your
-    reference only.
-
-    For appropriate details, please refer to the Linux document:
-
-        a. Run the following commands::
-
-	     cd /usr/src/linux
-	     make clean		     # take a few minutes
-	     make dep		     # take a few minutes
-	     make bzImage	     # take probably 10-20 minutes
-	     make install	     # copy boot image to correct position
-
-	f. Please make sure the boot kernel (vmlinuz) is in the
-	   correct position.
-	g. If you use 'lilo' utility, you should check /etc/lilo.conf
-	   'image' item specified the path which is the 'vmlinuz' path,
-	   or you will load wrong (or old) boot kernel image (vmlinuz).
-	   After checking /etc/lilo.conf, please run "lilo".
-
-	  Note that if the result of "make bzImage" is ERROR, then you have to
-	  go back to Linux configuration Setup. Type "make menuconfig" in
-          directory /usr/src/linux.
-
-
-3.5.6 Make tty device and special file
---------------------------------------
-
-    ::
-       # cd /moxa/mxser/driver
-       # ./msmknod
-
-3.5.7 Make utility
-------------------
-
-    ::
-
-	  # cd /moxa/mxser/utility
-	  # make clean; make install
-
-3.5.8 Reboot
-------------
-
-
-
-3.6 Custom configuration
-========================
-
-    Although this driver already provides you default configuration, you
-    still can change the device name and major number. The instruction to
-    change these parameters are shown as below.
-
-a. Change Device name
-
-    If you'd like to use other device names instead of default naming
-    convention, all you have to do is to modify the internal code
-    within the shell script "msmknod". First, you have to open "msmknod"
-    by vi. Locate each line contains "ttyM" and "cum" and change them
-    to the device name you desired. "msmknod" creates the device names
-    you need next time executed.
-
-b. Change Major number
-
-    If major number 30 and 35 had been occupied, you may have to select
-    2 free major numbers for this driver. There are 3 steps to change
-    major numbers.
-
-3.6.1 Find free major numbers
------------------------------
-
-    In /proc/devices, you may find all the major numbers occupied
-    in the system. Please select 2 major numbers that are available.
-    e.g. 40, 45.
-
-3.6.2 Create special files
---------------------------
-
-   Run /moxa/mxser/driver/msmknod to create special files with
-   specified major numbers.
-
-3.6.3 Modify driver with new major number
------------------------------------------
-
-   Run vi to open /moxa/mxser/driver/mxser.c. Locate the line
-   contains "MXSERMAJOR". Change the content as below::
-
-	  #define	  MXSERMAJOR		  40
-	  #define	  MXSERCUMAJOR		  45
-
-    3.6.4 Run "make clean; make install" in /moxa/mxser/driver.
-
-3.7 Verify driver installation
-==============================
-
-    You may refer to /var/log/messages to check the latest status
-    log reported by this driver whenever it's activated.
-
-4. Utilities
-^^^^^^^^^^^^
-
-   There are 3 utilities contained in this driver. They are msdiag, msmon and
-   msterm. These 3 utilities are released in form of source code. They should
-   be compiled into executable file and copied into /usr/bin.
-
-   Before using these utilities, please load driver (refer 3.4 & 3.5) and
-   make sure you had run the "msmknod" utility.
-
-msdiag - Diagnostic
-===================
-
-   This utility provides the function to display what Moxa Smartio/Industio
-   board found by driver in the system.
-
-msmon - Port Monitoring
-=======================
-
-   This utility gives the user a quick view about all the MOXA ports'
-   activities. One can easily learn each port's total received/transmitted
-   (Rx/Tx) character count since the time when the monitoring is started.
-
-   Rx/Tx throughputs per second are also reported in interval basis (e.g.
-   the last 5 seconds) and in average basis (since the time the monitoring
-   is started). You can reset all ports' count by <HOME> key. <+> <->
-   (plus/minus) keys to change the displaying time interval. Press <ENTER>
-   on the port, that cursor stay, to view the port's communication
-   parameters, signal status, and input/output queue.
-
-msterm - Terminal Emulation
-===========================
-
-   This utility provides data sending and receiving ability of all tty ports,
-   especially for MOXA ports. It is quite useful for testing simple
-   application, for example, sending AT command to a modem connected to the
-   port or used as a terminal for login purpose. Note that this is only a
-   dumb terminal emulation without handling full screen operation.
-
-5. Setserial
-^^^^^^^^^^^^
-
-   Supported Setserial parameters are listed as below.
-
-   ============== =========================================================
-   uart		  set UART type(16450-->disable FIFO, 16550A-->enable FIFO)
-   close_delay	  set the amount of time(in 1/100 of a second) that DTR
-		  should be kept low while being closed.
-   closing_wait   set the amount of time(in 1/100 of a second) that the
-		  serial port should wait for data to be drained while
-		  being closed, before the receiver is disable.
-   spd_hi	  Use  57.6kb  when  the application requests 38.4kb.
-   spd_vhi	  Use  115.2kb	when  the application requests 38.4kb.
-   spd_shi	  Use  230.4kb	when  the application requests 38.4kb.
-   spd_warp	  Use  460.8kb	when  the application requests 38.4kb.
-   spd_normal	  Use  38.4kb  when  the application requests 38.4kb.
-   spd_cust	  Use  the custom divisor to set the speed when  the
-		  application requests 38.4kb.
-   divisor	  This option set the custom division.
-   baud_base	  This option set the base baud rate.
-   ============== =========================================================
-
-6. Troubleshooting
-^^^^^^^^^^^^^^^^^^
-
-   The boot time error messages and solutions are stated as clearly as
-   possible. If all the possible solutions fail, please contact our technical
-   support team to get more help.
-
-
-   Error msg:
-	      More than 4 Moxa Smartio/Industio family boards found. Fifth board
-              and after are ignored.
-
-   Solution:
-   To avoid this problem, please unplug fifth and after board, because Moxa
-   driver supports up to 4 boards.
-
-   Error msg:
-	      Request_irq fail, IRQ(?) may be conflict with another device.
-
-   Solution:
-   Other PCI or ISA devices occupy the assigned IRQ. If you are not sure
-   which device causes the situation, please check /proc/interrupts to find
-   free IRQ and simply change another free IRQ for Moxa board.
-
-   Error msg:
-	      Board #: C1xx Series(CAP=xxx) interrupt number invalid.
-
-   Solution:
-   Each port within the same multiport board shares the same IRQ. Please set
-   one IRQ (IRQ doesn't equal to zero) for one Moxa board.
-
-   Error msg:
-	      No interrupt vector be set for Moxa ISA board(CAP=xxx).
-
-   Solution:
-   Moxa ISA board needs an interrupt vector.Please refer to user's manual
-   "Hardware Installation" chapter to set interrupt vector.
-
-   Error msg:
-              Couldn't install MOXA Smartio/Industio family driver!
-
-   Solution:
-   Load Moxa driver fail, the major number may conflict with other devices.
-   Please refer to previous section 3.7 to change a free major number for
-   Moxa driver.
-
-   Error msg:
-              Couldn't install MOXA Smartio/Industio family callout driver!
-
-   Solution:
-   Load Moxa callout driver fail, the callout device major number may
-   conflict with other devices. Please refer to previous section 3.7 to
-   change a free callout device major number for Moxa driver.
diff --git a/Documentation/serial/n_gsm.rst b/Documentation/serial/n_gsm.rst
deleted file mode 100644
index f3ad9fd26408..000000000000
--- a/Documentation/serial/n_gsm.rst
+++ /dev/null
@@ -1,103 +0,0 @@
-==============================
-GSM 0710 tty multiplexor HOWTO
-==============================
-
-This line discipline implements the GSM 07.10 multiplexing protocol
-detailed in the following 3GPP document:
-
-	http://www.3gpp.org/ftp/Specs/archive/07_series/07.10/0710-720.zip
-
-This document give some hints on how to use this driver with GPRS and 3G
-modems connected to a physical serial port.
-
-How to use it
--------------
-1. initialize the modem in 0710 mux mode (usually AT+CMUX= command) through
-   its serial port. Depending on the modem used, you can pass more or less
-   parameters to this command,
-2. switch the serial line to using the n_gsm line discipline by using
-   TIOCSETD ioctl,
-3. configure the mux using GSMIOC_GETCONF / GSMIOC_SETCONF ioctl,
-
-Major parts of the initialization program :
-(a good starting point is util-linux-ng/sys-utils/ldattach.c)::
-
-  #include <linux/gsmmux.h>
-  #define N_GSM0710	21	/* GSM 0710 Mux */
-  #define DEFAULT_SPEED	B115200
-  #define SERIAL_PORT	/dev/ttyS0
-
-	int ldisc = N_GSM0710;
-	struct gsm_config c;
-	struct termios configuration;
-
-	/* open the serial port connected to the modem */
-	fd = open(SERIAL_PORT, O_RDWR | O_NOCTTY | O_NDELAY);
-
-	/* configure the serial port : speed, flow control ... */
-
-	/* send the AT commands to switch the modem to CMUX mode
-	   and check that it's successful (should return OK) */
-	write(fd, "AT+CMUX=0\r", 10);
-
-	/* experience showed that some modems need some time before
-	   being able to answer to the first MUX packet so a delay
-	   may be needed here in some case */
-	sleep(3);
-
-	/* use n_gsm line discipline */
-	ioctl(fd, TIOCSETD, &ldisc);
-
-	/* get n_gsm configuration */
-	ioctl(fd, GSMIOC_GETCONF, &c);
-	/* we are initiator and need encoding 0 (basic) */
-	c.initiator = 1;
-	c.encapsulation = 0;
-	/* our modem defaults to a maximum size of 127 bytes */
-	c.mru = 127;
-	c.mtu = 127;
-	/* set the new configuration */
-	ioctl(fd, GSMIOC_SETCONF, &c);
-
-	/* and wait for ever to keep the line discipline enabled */
-	daemon(0,0);
-	pause();
-
-4. create the devices corresponding to the "virtual" serial ports (take care,
-   each modem has its configuration and some DLC have dedicated functions,
-   for example GPS), starting with minor 1 (DLC0 is reserved for the management
-   of the mux)::
-
-     MAJOR=`cat /proc/devices |grep gsmtty | awk '{print $1}`
-     for i in `seq 1 4`; do
-	mknod /dev/ttygsm$i c $MAJOR $i
-     done
-
-5. use these devices as plain serial ports.
-
-   for example, it's possible:
-
-   - and to use gnokii to send / receive SMS on ttygsm1
-   - to use ppp to establish a datalink on ttygsm2
-
-6. first close all virtual ports before closing the physical port.
-
-   Note that after closing the physical port the modem is still in multiplexing
-   mode. This may prevent a successful re-opening of the port later. To avoid
-   this situation either reset the modem if your hardware allows that or send
-   a disconnect command frame manually before initializing the multiplexing mode
-   for the second time. The byte sequence for the disconnect command frame is::
-
-      0xf9, 0x03, 0xef, 0x03, 0xc3, 0x16, 0xf9.
-
-Additional Documentation
-------------------------
-More practical details on the protocol and how it's supported by industrial
-modems can be found in the following documents :
-
-- http://www.telit.com/module/infopool/download.php?id=616
-- http://www.u-blox.com/images/downloads/Product_Docs/LEON-G100-G200-MuxImplementation_ApplicationNote_%28GSM%20G1-CS-10002%29.pdf
-- http://www.sierrawireless.com/Support/Downloads/AirPrime/WMP_Series/~/media/Support_Downloads/AirPrime/Application_notes/CMUX_Feature_Application_Note-Rev004.ashx
-- http://wm.sim.com/sim/News/photo/2010721161442.pdf
-
-11-03-08 - Eric Bénard - <eric@eukrea.com>
diff --git a/Documentation/serial/rocket.rst b/Documentation/serial/rocket.rst
deleted file mode 100644
index 23761eae4282..000000000000
--- a/Documentation/serial/rocket.rst
+++ /dev/null
@@ -1,185 +0,0 @@
-================================================
-Comtrol(tm) RocketPort(R)/RocketModem(TM) Series
-================================================
-
-Device Driver for the Linux Operating System
-============================================
-
-Product overview
-----------------
-
-This driver provides a loadable kernel driver for the Comtrol RocketPort
-and RocketModem PCI boards. These boards provide, 2, 4, 8, 16, or 32
-high-speed serial ports or modems.  This driver supports up to a combination
-of four RocketPort or RocketModems boards in one machine simultaneously.
-This file assumes that you are using the RocketPort driver which is
-integrated into the kernel sources.
-
-The driver can also be installed as an external module using the usual
-"make;make install" routine.  This external module driver, obtainable
-from the Comtrol website listed below, is useful for updating the driver
-or installing it into kernels which do not have the driver configured
-into them.  Installations instructions for the external module
-are in the included README and HW_INSTALL files.
-
-RocketPort ISA and RocketModem II PCI boards currently are only supported by
-this driver in module form.
-
-The RocketPort ISA board requires I/O ports to be configured by the DIP
-switches on the board.  See the section "ISA Rocketport Boards" below for
-information on how to set the DIP switches.
-
-You pass the I/O port to the driver using the following module parameters:
-
-board1:
-	I/O port for the first ISA board
-board2:
-	I/O port for the second ISA board
-board3:
-	I/O port for the third ISA board
-board4:
-	I/O port for the fourth ISA board
-
-There is a set of utilities and scripts provided with the external driver
-(downloadable from http://www.comtrol.com) that ease the configuration and
-setup of the ISA cards.
-
-The RocketModem II PCI boards require firmware to be loaded into the card
-before it will function.  The driver has only been tested as a module for this
-board.
-
-Installation Procedures
------------------------
-
-RocketPort/RocketModem PCI cards require no driver configuration, they are
-automatically detected and configured.
-
-The RocketPort driver can be installed as a module (recommended) or built
-into the kernel. This is selected, as for other drivers, through the `make config`
-command from the root of the Linux source tree during the kernel build process.
-
-The RocketPort/RocketModem serial ports installed by this driver are assigned
-device major number 46, and will be named /dev/ttyRx, where x is the port number
-starting at zero (ex. /dev/ttyR0, /devttyR1, ...).  If you have multiple cards
-installed in the system, the mapping of port names to serial ports is displayed
-in the system log at /var/log/messages.
-
-If installed as a module, the module must be loaded.  This can be done
-manually by entering "modprobe rocket".  To have the module loaded automatically
-upon system boot, edit a `/etc/modprobe.d/*.conf` file and add the line
-"alias char-major-46 rocket".
-
-In order to use the ports, their device names (nodes) must be created with mknod.
-This is only required once, the system will retain the names once created.  To
-create the RocketPort/RocketModem device names, use the command
-"mknod /dev/ttyRx c 46 x" where x is the port number starting at zero.
-
-For example::
-
-	> mknod /dev/ttyR0 c 46 0
-	> mknod /dev/ttyR1 c 46 1
-	> mknod /dev/ttyR2 c 46 2
-
-The Linux script MAKEDEV will create the first 16 ttyRx device names (nodes)
-for you::
-
-	>/dev/MAKEDEV ttyR
-
-ISA Rocketport Boards
----------------------
-
-You must assign and configure the I/O addresses used by the ISA Rocketport
-card before installing and using it.  This is done by setting a set of DIP
-switches on the Rocketport board.
-
-
-Setting the I/O address
------------------------
-
-Before installing RocketPort(R) or RocketPort RA boards, you must find
-a range of I/O addresses for it to use. The first RocketPort card
-requires a 68-byte contiguous block of I/O addresses, starting at one
-of the following: 0x100h, 0x140h, 0x180h, 0x200h, 0x240h, 0x280h,
-0x300h, 0x340h, 0x380h.  This I/O address must be reflected in the DIP
-switches of *all* of the Rocketport cards.
-
-The second, third, and fourth RocketPort cards require a 64-byte
-contiguous block of I/O addresses, starting at one of the following
-I/O addresses: 0x100h, 0x140h, 0x180h, 0x1C0h, 0x200h, 0x240h, 0x280h,
-0x2C0h, 0x300h, 0x340h, 0x380h, 0x3C0h.  The I/O address used by the
-second, third, and fourth Rocketport cards (if present) are set via
-software control.  The DIP switch settings for the I/O address must be
-set to the value of the first Rocketport cards.
-
-In order to distinguish each of the card from the others, each card
-must have a unique board ID set on the dip switches.  The first
-Rocketport board must be set with the DIP switches corresponding to
-the first board, the second board must be set with the DIP switches
-corresponding to the second board, etc.  IMPORTANT: The board ID is
-the only place where the DIP switch settings should differ between the
-various Rocketport boards in a system.
-
-The I/O address range used by any of the RocketPort cards must not
-conflict with any other cards in the system, including other
-RocketPort cards.  Below, you will find a list of commonly used I/O
-address ranges which may be in use by other devices in your system.
-On a Linux system, "cat /proc/ioports" will also be helpful in
-identifying what I/O addresses are being used by devices on your
-system.
-
-Remember, the FIRST RocketPort uses 68 I/O addresses.  So, if you set it
-for 0x100, it will occupy 0x100 to 0x143.  This would mean that you
-CAN NOT set the second, third or fourth board for address 0x140 since
-the first 4 bytes of that range are used by the first board.  You would
-need to set the second, third, or fourth board to one of the next available
-blocks such as 0x180.
-
-RocketPort and RocketPort RA SW1 Settings::
-
-            +-------------------------------+
-            | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 |
-            +-------+-------+---------------+
-            | Unused| Card  | I/O Port Block|
-            +-------------------------------+
-
-  DIP Switches                             DIP Switches
-  7    8                                   6    5
-  ===================                      ===================
-  On   On   UNUSED, MUST BE ON.            On   On   First Card    <==== Default
-                                           On   Off  Second Card
-                                           Off  On   Third Card
-                                           Off  Off  Fourth Card
-
-  DIP Switches         I/O Address Range
-  4    3    2    1     Used by the First Card
-  =====================================
-  On   Off  On   Off   100-143
-  On   Off  Off  On    140-183
-  On   Off  Off  Off   180-1C3       <==== Default
-  Off  On   On   Off   200-243
-  Off  On   Off  On    240-283
-  Off  On   Off  Off   280-2C3
-  Off  Off  On   Off   300-343
-  Off  Off  Off  On    340-383
-  Off  Off  Off  Off   380-3C3
-
-Reporting Bugs
---------------
-
-For technical support, please provide the following
-information: Driver version, kernel release, distribution of
-kernel, and type of board you are using. Error messages and log
-printouts port configuration details are especially helpful.
-
-USA:
-    :Phone: (612) 494-4100
-    :FAX: (612) 494-4199
-    :email: support@comtrol.com
-
-Comtrol Europe:
-    :Phone: +44 (0) 1 869 323-220
-    :FAX: +44 (0) 1 869 323-211
-    :email: support@comtrol.co.uk
-
-Web:	http://www.comtrol.com
-FTP:	ftp.comtrol.com
diff --git a/Documentation/serial/serial-iso7816.rst b/Documentation/serial/serial-iso7816.rst
deleted file mode 100644
index d990143de0c6..000000000000
--- a/Documentation/serial/serial-iso7816.rst
+++ /dev/null
@@ -1,90 +0,0 @@
-=============================
-ISO7816 Serial Communications
-=============================
-
-1. Introduction
-===============
-
-  ISO/IEC7816 is a series of standards specifying integrated circuit cards (ICC)
-  also known as smart cards.
-
-2. Hardware-related considerations
-==================================
-
-  Some CPUs/UARTs (e.g., Microchip AT91) contain a built-in mode capable of
-  handling communication with a smart card.
-
-  For these microcontrollers, the Linux driver should be made capable of
-  working in both modes, and proper ioctls (see later) should be made
-  available at user-level to allow switching from one mode to the other, and
-  vice versa.
-
-3. Data Structures Already Available in the Kernel
-==================================================
-
-  The Linux kernel provides the serial_iso7816 structure (see [1]) to handle
-  ISO7816 communications. This data structure is used to set and configure
-  ISO7816 parameters in ioctls.
-
-  Any driver for devices capable of working both as RS232 and ISO7816 should
-  implement the iso7816_config callback in the uart_port structure. The
-  serial_core calls iso7816_config to do the device specific part in response
-  to TIOCGISO7816 and TIOCSISO7816 ioctls (see below). The iso7816_config
-  callback receives a pointer to struct serial_iso7816.
-
-4. Usage from user-level
-========================
-
-  From user-level, ISO7816 configuration can be get/set using the previous
-  ioctls. For instance, to set ISO7816 you can use the following code::
-
-	#include <linux/serial.h>
-
-	/* Include definition for ISO7816 ioctls: TIOCSISO7816 and TIOCGISO7816 */
-	#include <sys/ioctl.h>
-
-	/* Open your specific device (e.g., /dev/mydevice): */
-	int fd = open ("/dev/mydevice", O_RDWR);
-	if (fd < 0) {
-		/* Error handling. See errno. */
-	}
-
-	struct serial_iso7816 iso7816conf;
-
-	/* Reserved fields as to be zeroed */
-	memset(&iso7816conf, 0, sizeof(iso7816conf));
-
-	/* Enable ISO7816 mode: */
-	iso7816conf.flags |= SER_ISO7816_ENABLED;
-
-	/* Select the protocol: */
-	/* T=0 */
-	iso7816conf.flags |= SER_ISO7816_T(0);
-	/* or T=1 */
-	iso7816conf.flags |= SER_ISO7816_T(1);
-
-	/* Set the guard time: */
-	iso7816conf.tg = 2;
-
-	/* Set the clock frequency*/
-	iso7816conf.clk = 3571200;
-
-	/* Set transmission factors: */
-	iso7816conf.sc_fi = 372;
-	iso7816conf.sc_di = 1;
-
-	if (ioctl(fd_usart, TIOCSISO7816, &iso7816conf) < 0) {
-		/* Error handling. See errno. */
-	}
-
-	/* Use read() and write() syscalls here... */
-
-	/* Close the device when finished: */
-	if (close (fd) < 0) {
-		/* Error handling. See errno. */
-	}
-
-5. References
-=============
-
- [1]    include/uapi/linux/serial.h
diff --git a/Documentation/serial/serial-rs485.rst b/Documentation/serial/serial-rs485.rst
deleted file mode 100644
index 6bc824f948f9..000000000000
--- a/Documentation/serial/serial-rs485.rst
+++ /dev/null
@@ -1,103 +0,0 @@
-===========================
-RS485 Serial Communications
-===========================
-
-1. Introduction
-===============
-
-   EIA-485, also known as TIA/EIA-485 or RS-485, is a standard defining the
-   electrical characteristics of drivers and receivers for use in balanced
-   digital multipoint systems.
-   This standard is widely used for communications in industrial automation
-   because it can be used effectively over long distances and in electrically
-   noisy environments.
-
-2. Hardware-related Considerations
-==================================
-
-   Some CPUs/UARTs (e.g., Atmel AT91 or 16C950 UART) contain a built-in
-   half-duplex mode capable of automatically controlling line direction by
-   toggling RTS or DTR signals. That can be used to control external
-   half-duplex hardware like an RS485 transceiver or any RS232-connected
-   half-duplex devices like some modems.
-
-   For these microcontrollers, the Linux driver should be made capable of
-   working in both modes, and proper ioctls (see later) should be made
-   available at user-level to allow switching from one mode to the other, and
-   vice versa.
-
-3. Data Structures Already Available in the Kernel
-==================================================
-
-   The Linux kernel provides the serial_rs485 structure (see [1]) to handle
-   RS485 communications. This data structure is used to set and configure RS485
-   parameters in the platform data and in ioctls.
-
-   The device tree can also provide RS485 boot time parameters (see [2]
-   for bindings). The driver is in charge of filling this data structure from
-   the values given by the device tree.
-
-   Any driver for devices capable of working both as RS232 and RS485 should
-   implement the rs485_config callback in the uart_port structure. The
-   serial_core calls rs485_config to do the device specific part in response
-   to TIOCSRS485 and TIOCGRS485 ioctls (see below). The rs485_config callback
-   receives a pointer to struct serial_rs485.
-
-4. Usage from user-level
-========================
-
-   From user-level, RS485 configuration can be get/set using the previous
-   ioctls. For instance, to set RS485 you can use the following code::
-
-	#include <linux/serial.h>
-
-	/* Include definition for RS485 ioctls: TIOCGRS485 and TIOCSRS485 */
-	#include <sys/ioctl.h>
-
-	/* Open your specific device (e.g., /dev/mydevice): */
-	int fd = open ("/dev/mydevice", O_RDWR);
-	if (fd < 0) {
-		/* Error handling. See errno. */
-	}
-
-	struct serial_rs485 rs485conf;
-
-	/* Enable RS485 mode: */
-	rs485conf.flags |= SER_RS485_ENABLED;
-
-	/* Set logical level for RTS pin equal to 1 when sending: */
-	rs485conf.flags |= SER_RS485_RTS_ON_SEND;
-	/* or, set logical level for RTS pin equal to 0 when sending: */
-	rs485conf.flags &= ~(SER_RS485_RTS_ON_SEND);
-
-	/* Set logical level for RTS pin equal to 1 after sending: */
-	rs485conf.flags |= SER_RS485_RTS_AFTER_SEND;
-	/* or, set logical level for RTS pin equal to 0 after sending: */
-	rs485conf.flags &= ~(SER_RS485_RTS_AFTER_SEND);
-
-	/* Set rts delay before send, if needed: */
-	rs485conf.delay_rts_before_send = ...;
-
-	/* Set rts delay after send, if needed: */
-	rs485conf.delay_rts_after_send = ...;
-
-	/* Set this flag if you want to receive data even while sending data */
-	rs485conf.flags |= SER_RS485_RX_DURING_TX;
-
-	if (ioctl (fd, TIOCSRS485, &rs485conf) < 0) {
-		/* Error handling. See errno. */
-	}
-
-	/* Use read() and write() syscalls here... */
-
-	/* Close the device when finished: */
-	if (close (fd) < 0) {
-		/* Error handling. See errno. */
-	}
-
-5. References
-=============
-
- [1]	include/uapi/linux/serial.h
-
- [2]	Documentation/devicetree/bindings/serial/rs485.txt
diff --git a/Documentation/serial/tty.rst b/Documentation/serial/tty.rst
deleted file mode 100644
index dd972caacf3e..000000000000
--- a/Documentation/serial/tty.rst
+++ /dev/null
@@ -1,328 +0,0 @@
-=================
-The Lockronomicon
-=================
-
-Your guide to the ancient and twisted locking policies of the tty layer and
-the warped logic behind them. Beware all ye who read on.
-
-
-Line Discipline
----------------
-
-Line disciplines are registered with tty_register_ldisc() passing the
-discipline number and the ldisc structure. At the point of registration the
-discipline must be ready to use and it is possible it will get used before
-the call returns success. If the call returns an error then it won't get
-called. Do not re-use ldisc numbers as they are part of the userspace ABI
-and writing over an existing ldisc will cause demons to eat your computer.
-After the return the ldisc data has been copied so you may free your own
-copy of the structure. You must not re-register over the top of the line
-discipline even with the same data or your computer again will be eaten by
-demons.
-
-In order to remove a line discipline call tty_unregister_ldisc().
-In ancient times this always worked. In modern times the function will
-return -EBUSY if the ldisc is currently in use. Since the ldisc referencing
-code manages the module counts this should not usually be a concern.
-
-Heed this warning: the reference count field of the registered copies of the
-tty_ldisc structure in the ldisc table counts the number of lines using this
-discipline. The reference count of the tty_ldisc structure within a tty
-counts the number of active users of the ldisc at this instant. In effect it
-counts the number of threads of execution within an ldisc method (plus those
-about to enter and exit although this detail matters not).
-
-Line Discipline Methods
------------------------
-
-TTY side interfaces
-^^^^^^^^^^^^^^^^^^^
-
-======================= =======================================================
-open()			Called when the line discipline is attached to
-			the terminal. No other call into the line
-			discipline for this tty will occur until it
-			completes successfully. Should initialize any
-			state needed by the ldisc, and set receive_room
-			in the tty_struct to the maximum amount of data
-			the line discipline is willing to accept from the
-			driver with a single call to receive_buf().
-			Returning an error will prevent the ldisc from
-			being attached. Can sleep.
-
-close()			This is called on a terminal when the line
-			discipline is being unplugged. At the point of
-			execution no further users will enter the
-			ldisc code for this tty. Can sleep.
-
-hangup()		Called when the tty line is hung up.
-			The line discipline should cease I/O to the tty.
-			No further calls into the ldisc code will occur.
-			The return value is ignored. Can sleep.
-
-read()			(optional) A process requests reading data from
-			the line. Multiple read calls may occur in parallel
-			and the ldisc must deal with serialization issues.
-			If not defined, the process will receive an EIO
-			error. May sleep.
-
-write()			(optional) A process requests writing data to the
-			line. Multiple write calls are serialized by the
-			tty layer for the ldisc. If not defined, the
-			process will receive an EIO error. May sleep.
-
-flush_buffer()		(optional) May be called at any point between
-			open and close, and instructs the line discipline
-			to empty its input buffer.
-
-set_termios()		(optional) Called on termios structure changes.
-			The caller passes the old termios data and the
-			current data is in the tty. Called under the
-			termios semaphore so allowed to sleep. Serialized
-			against itself only.
-
-poll()			(optional) Check the status for the poll/select
-			calls. Multiple poll calls may occur in parallel.
-			May sleep.
-
-ioctl()			(optional) Called when an ioctl is handed to the
-			tty layer that might be for the ldisc. Multiple
-			ioctl calls may occur in parallel. May sleep.
-
-compat_ioctl()		(optional) Called when a 32 bit ioctl is handed
-			to the tty layer that might be for the ldisc.
-			Multiple ioctl calls may occur in parallel.
-			May sleep.
-======================= =======================================================
-
-Driver Side Interfaces
-^^^^^^^^^^^^^^^^^^^^^^
-
-======================= =======================================================
-receive_buf()		(optional) Called by the low-level driver to hand
-			a buffer of received bytes to the ldisc for
-			processing. The number of bytes is guaranteed not
-			to exceed the current value of tty->receive_room.
-			All bytes must be processed.
-
-receive_buf2()		(optional) Called by the low-level driver to hand
-			a buffer of received bytes to the ldisc for
-			processing. Returns the number of bytes processed.
-
-			If both receive_buf() and receive_buf2() are
-			defined, receive_buf2() should be preferred.
-
-write_wakeup()		May be called at any point between open and close.
-			The TTY_DO_WRITE_WAKEUP flag indicates if a call
-			is needed but always races versus calls. Thus the
-			ldisc must be careful about setting order and to
-			handle unexpected calls. Must not sleep.
-
-			The driver is forbidden from calling this directly
-			from the ->write call from the ldisc as the ldisc
-			is permitted to call the driver write method from
-			this function. In such a situation defer it.
-
-dcd_change()		Report to the tty line the current DCD pin status
-			changes and the relative timestamp. The timestamp
-			cannot be NULL.
-======================= =======================================================
-
-
-Driver Access
-^^^^^^^^^^^^^
-
-Line discipline methods can call the following methods of the underlying
-hardware driver through the function pointers within the tty->driver
-structure:
-
-======================= =======================================================
-write()			Write a block of characters to the tty device.
-			Returns the number of characters accepted. The
-			character buffer passed to this method is already
-			in kernel space.
-
-put_char()		Queues a character for writing to the tty device.
-			If there is no room in the queue, the character is
-			ignored.
-
-flush_chars()		(Optional) If defined, must be called after
-			queueing characters with put_char() in order to
-			start transmission.
-
-write_room()		Returns the numbers of characters the tty driver
-			will accept for queueing to be written.
-
-ioctl()			Invoke device specific ioctl.
-			Expects data pointers to refer to userspace.
-			Returns ENOIOCTLCMD for unrecognized ioctl numbers.
-
-set_termios()		Notify the tty driver that the device's termios
-			settings have changed. New settings are in
-			tty->termios. Previous settings should be passed in
-			the "old" argument.
-
-			The API is defined such that the driver should return
-			the actual modes selected. This means that the
-			driver function is responsible for modifying any
-			bits in the request it cannot fulfill to indicate
-			the actual modes being used. A device with no
-			hardware capability for change (e.g. a USB dongle or
-			virtual port) can provide NULL for this method.
-
-throttle()		Notify the tty driver that input buffers for the
-			line discipline are close to full, and it should
-			somehow signal that no more characters should be
-			sent to the tty.
-
-unthrottle()		Notify the tty driver that characters can now be
-			sent to the tty without fear of overrunning the
-			input buffers of the line disciplines.
-
-stop()			Ask the tty driver to stop outputting characters
-			to the tty device.
-
-start()			Ask the tty driver to resume sending characters
-			to the tty device.
-
-hangup()		Ask the tty driver to hang up the tty device.
-
-break_ctl()		(Optional) Ask the tty driver to turn on or off
-			BREAK status on the RS-232 port.  If state is -1,
-			then the BREAK status should be turned on; if
-			state is 0, then BREAK should be turned off.
-			If this routine is not implemented, use ioctls
-			TIOCSBRK / TIOCCBRK instead.
-
-wait_until_sent()	Waits until the device has written out all of the
-			characters in its transmitter FIFO.
-
-send_xchar()		Send a high-priority XON/XOFF character to the device.
-======================= =======================================================
-
-
-Flags
-^^^^^
-
-Line discipline methods have access to tty->flags field containing the
-following interesting flags:
-
-======================= =======================================================
-TTY_THROTTLED		Driver input is throttled. The ldisc should call
-			tty->driver->unthrottle() in order to resume
-			reception when it is ready to process more data.
-
-TTY_DO_WRITE_WAKEUP	If set, causes the driver to call the ldisc's
-			write_wakeup() method in order to resume
-			transmission when it can accept more data
-			to transmit.
-
-TTY_IO_ERROR		If set, causes all subsequent userspace read/write
-			calls on the tty to fail, returning -EIO.
-
-TTY_OTHER_CLOSED	Device is a pty and the other side has closed.
-
-TTY_NO_WRITE_SPLIT	Prevent driver from splitting up writes into
-			smaller chunks.
-======================= =======================================================
-
-
-Locking
-^^^^^^^
-
-Callers to the line discipline functions from the tty layer are required to
-take line discipline locks. The same is true of calls from the driver side
-but not yet enforced.
-
-Three calls are now provided::
-
-	ldisc = tty_ldisc_ref(tty);
-
-takes a handle to the line discipline in the tty and returns it. If no ldisc
-is currently attached or the ldisc is being closed and re-opened at this
-point then NULL is returned. While this handle is held the ldisc will not
-change or go away::
-
-	tty_ldisc_deref(ldisc)
-
-Returns the ldisc reference and allows the ldisc to be closed. Returning the
-reference takes away your right to call the ldisc functions until you take
-a new reference::
-
-	ldisc = tty_ldisc_ref_wait(tty);
-
-Performs the same function as tty_ldisc_ref except that it will wait for an
-ldisc change to complete and then return a reference to the new ldisc.
-
-While these functions are slightly slower than the old code they should have
-minimal impact as most receive logic uses the flip buffers and they only
-need to take a reference when they push bits up through the driver.
-
-A caution: The ldisc->open(), ldisc->close() and driver->set_ldisc
-functions are called with the ldisc unavailable. Thus tty_ldisc_ref will
-fail in this situation if used within these functions. Ldisc and driver
-code calling its own functions must be careful in this case.
-
-
-Driver Interface
-----------------
-
-======================= =======================================================
-open()			Called when a device is opened. May sleep
-
-close()			Called when a device is closed. At the point of
-			return from this call the driver must make no
-			further ldisc calls of any kind. May sleep
-
-write()			Called to write bytes to the device. May not
-			sleep. May occur in parallel in special cases.
-			Because this includes panic paths drivers generally
-			shouldn't try and do clever locking here.
-
-put_char()		Stuff a single character onto the queue. The
-			driver is guaranteed following up calls to
-			flush_chars.
-
-flush_chars()		Ask the kernel to write put_char queue
-
-write_room()		Return the number of characters that can be stuffed
-			into the port buffers without overflow (or less).
-			The ldisc is responsible for being intelligent
-			about multi-threading of write_room/write calls
-
-ioctl()			Called when an ioctl may be for the driver
-
-set_termios()		Called on termios change, serialized against
-			itself by a semaphore. May sleep.
-
-set_ldisc()		Notifier for discipline change. At the point this
-			is done the discipline is not yet usable. Can now
-			sleep (I think)
-
-throttle()		Called by the ldisc to ask the driver to do flow
-			control.  Serialization including with unthrottle
-			is the job of the ldisc layer.
-
-unthrottle()		Called by the ldisc to ask the driver to stop flow
-			control.
-
-stop()			Ldisc notifier to the driver to stop output. As with
-			throttle the serializations with start() are down
-			to the ldisc layer.
-
-start()			Ldisc notifier to the driver to start output.
-
-hangup()		Ask the tty driver to cause a hangup initiated
-			from the host side. [Can sleep ??]
-
-break_ctl()		Send RS232 break. Can sleep. Can get called in
-			parallel, driver must serialize (for now), and
-			with write calls.
-
-wait_until_sent()	Wait for characters to exit the hardware queue
-			of the driver. Can sleep
-
-send_xchar()	  	Send XON/XOFF and if possible jump the queue with
-			it in order to get fast flow control responses.
-			Cannot sleep ??
-======================= =======================================================
diff --git a/MAINTAINERS b/MAINTAINERS
index d1a0a817dd92..4f88bca37c55 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10767,7 +10767,7 @@ F:	include/uapi/linux/meye.h
 MOXA SMARTIO/INDUSTIO/INTELLIO SERIAL CARD
 M:	Jiri Slaby <jirislaby@gmail.com>
 S:	Maintained
-F:	Documentation/serial/moxa-smartio.rst
+F:	Documentation/driver-api/serial/moxa-smartio.rst
 F:	drivers/tty/mxser.*
 
 MR800 AVERMEDIA USB FM RADIO DRIVER
@@ -13689,7 +13689,7 @@ ROCKETPORT DRIVER
 P:	Comtrol Corp.
 W:	http://www.comtrol.com
 S:	Maintained
-F:	Documentation/serial/rocket.rst
+F:	Documentation/driver-api/serial/rocket.rst
 F:	drivers/tty/rocket*
 
 ROCKETPORT EXPRESS/INFINITY DRIVER
@@ -16228,7 +16228,7 @@ M:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 M:	Jiri Slaby <jslaby@suse.com>
 S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git
-F:	Documentation/serial/
+F:	Documentation/driver-api/serial/
 F:	drivers/tty/
 F:	drivers/tty/serial/serial_core.c
 F:	include/linux/serial_core.h
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index ee51b9514225..c7623f99ac0f 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -175,7 +175,7 @@ config ROCKETPORT
 	  This driver supports Comtrol RocketPort and RocketModem PCI boards.   
           These boards provide 2, 4, 8, 16, or 32 high-speed serial ports or
           modems.  For information about the RocketPort/RocketModem  boards
-          and this driver read <file:Documentation/serial/rocket.rst>.
+          and this driver read <file:Documentation/driver-api/serial/rocket.rst>.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called rocket.
@@ -193,7 +193,7 @@ config CYCLADES
 	  your Linux box, for instance in order to become a dial-in server.
 
 	  For information about the Cyclades-Z card, read
-	  <file:Documentation/serial/cyclades_z.rst>.
+	  <file:Documentation/driver-api/serial/cyclades_z.rst>.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called cyclades.
diff --git a/drivers/tty/serial/ucc_uart.c b/drivers/tty/serial/ucc_uart.c
index 6e3c66ab0e62..a0555ae2b1ef 100644
--- a/drivers/tty/serial/ucc_uart.c
+++ b/drivers/tty/serial/ucc_uart.c
@@ -1081,7 +1081,7 @@ static int qe_uart_verify_port(struct uart_port *port,
 }
 /* UART operations
  *
- * Details on these functions can be found in Documentation/serial/driver.rst
+ * Details on these functions can be found in Documentation/driver-api/serial/driver.rst
  */
 static const struct uart_ops qe_uart_pops = {
 	.tx_empty       = qe_uart_tx_empty,
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 05b179015d6c..2b78cc734719 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -32,7 +32,7 @@ struct device;
 
 /*
  * This structure describes all the operations that can be done on the
- * physical hardware.  See Documentation/serial/driver.rst for details.
+ * physical hardware.  See Documentation/driver-api/serial/driver.rst for details.
  */
 struct uart_ops {
 	unsigned int	(*tx_empty)(struct uart_port *);
-- 
cgit v1.2.3


From 7e4b4dfc98d54bc79f7ca29c8bc6307ed2948014 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 16 Jul 2019 04:06:29 +1000
Subject: Revert "mm: adjust apply_to_pfn_range interface for dropped token."

This reverts commit 6dfc43d3a19174faead54575c204aee106225f43.

Going to revert the whole vmwwgfx pull.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 include/linux/mm.h    | 2 +-
 mm/as_dirty_helpers.c | 6 ++++--
 mm/memory.c           | 6 +++---
 3 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c45f936bd81c..798cdda9560e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2687,7 +2687,7 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
 
 struct pfn_range_apply;
-typedef int (*pter_fn_t)(pte_t *pte, unsigned long addr,
+typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			 struct pfn_range_apply *closure);
 struct pfn_range_apply {
 	struct mm_struct *mm;
diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c
index 6352a3729408..f600e31534fb 100644
--- a/mm/as_dirty_helpers.c
+++ b/mm/as_dirty_helpers.c
@@ -26,6 +26,7 @@ struct apply_as {
 /**
  * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
  * @pte: Pointer to the pte
+ * @token: Page table token, see apply_to_pfn_range()
  * @addr: The virtual page address
  * @closure: Pointer to a struct pfn_range_apply embedded in a
  * struct apply_as
@@ -35,7 +36,7 @@ struct apply_as {
  *
  * Return: Always zero.
  */
-static int apply_pt_wrprotect(pte_t *pte,
+static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
 			      unsigned long addr,
 			      struct pfn_range_apply *closure)
 {
@@ -77,6 +78,7 @@ struct apply_as_clean {
 /**
  * apply_pt_clean - Leaf pte callback to clean a pte
  * @pte: Pointer to the pte
+ * @token: Page table token, see apply_to_pfn_range()
  * @addr: The virtual page address
  * @closure: Pointer to a struct pfn_range_apply embedded in a
  * struct apply_as_clean
@@ -89,7 +91,7 @@ struct apply_as_clean {
  *
  * Return: Always zero.
  */
-static int apply_pt_clean(pte_t *pte,
+static int apply_pt_clean(pte_t *pte, pgtable_t token,
 			  unsigned long addr,
 			  struct pfn_range_apply *closure)
 {
diff --git a/mm/memory.c b/mm/memory.c
index b8218e962231..462aa47f8878 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2053,7 +2053,7 @@ static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
 	token = pmd_pgtable(*pmd);
 
 	do {
-		err = closure->ptefn(pte++, addr, closure);
+		err = closure->ptefn(pte++, token, addr, closure);
 		if (err)
 			break;
 	} while (addr += PAGE_SIZE, addr != end);
@@ -2194,14 +2194,14 @@ struct page_range_apply {
  * Callback wrapper to enable use of apply_to_pfn_range for
  * the apply_to_page_range interface
  */
-static int apply_to_page_range_wrapper(pte_t *pte,
+static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
 				       unsigned long addr,
 				       struct pfn_range_apply *pter)
 {
 	struct page_range_apply *pra =
 		container_of(pter, typeof(*pra), pter);
 
-	return pra->fn(pte, NULL, addr, pra->data);
+	return pra->fn(pte, token, addr, pra->data);
 }
 
 /*
-- 
cgit v1.2.3


From 3729fe2bc2a01f4cc1aa88be8f64af06084c87d6 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 16 Jul 2019 04:07:13 +1000
Subject: Revert "Merge branch 'vmwgfx-next' of
 git://people.freedesktop.org/~thomash/linux into drm-next"

This reverts commit 031e610a6a21448a63dff7a0416e5e206724caac, reversing
changes made to 52d2d44eee8091e740d0d275df1311fb8373c9a9.

The mm changes in there we premature and not fully ack or reviewed by core mm folks,
I dropped the ball by merging them via this tree, so lets take em all back out.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 MAINTAINERS                                        |   1 -
 drivers/gpu/drm/ttm/ttm_bo.c                       |   1 -
 drivers/gpu/drm/ttm/ttm_bo_vm.c                    | 169 +++-----
 drivers/gpu/drm/vmwgfx/Kconfig                     |   1 -
 drivers/gpu/drm/vmwgfx/Makefile                    |   2 +-
 .../drm/vmwgfx/device_include/svga3d_surfacedefs.h | 233 +---------
 drivers/gpu/drm/vmwgfx/ttm_lock.c                  | 100 +++++
 drivers/gpu/drm/vmwgfx/ttm_lock.h                  |  30 ++
 drivers/gpu/drm/vmwgfx/vmwgfx_bo.c                 |  12 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_context.c            |   4 -
 drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c            |  13 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c                | 167 +++++++-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.h                | 139 ++----
 drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c            |   1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_kms.c                |  23 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c         | 472 ---------------------
 drivers/gpu/drm/vmwgfx/vmwgfx_resource.c           | 245 ++---------
 drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h      |  15 -
 drivers/gpu/drm/vmwgfx/vmwgfx_shader.c             |   8 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_surface.c            | 405 +-----------------
 drivers/gpu/drm/vmwgfx/vmwgfx_validation.c         |  74 +---
 drivers/gpu/drm/vmwgfx/vmwgfx_validation.h         |  16 +-
 include/drm/ttm/ttm_bo_api.h                       |  10 -
 include/drm/ttm/ttm_bo_driver.h                    |   6 -
 include/linux/mm.h                                 |  19 +-
 include/uapi/drm/vmwgfx_drm.h                      |   4 +-
 mm/Kconfig                                         |   3 -
 mm/Makefile                                        |   1 -
 mm/as_dirty_helpers.c                              | 300 -------------
 mm/memory.c                                        | 145 ++-----
 30 files changed, 483 insertions(+), 2136 deletions(-)
 delete mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
 delete mode 100644 mm/as_dirty_helpers.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index d6600715a662..2abf6d28db64 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5191,7 +5191,6 @@ T:	git git://people.freedesktop.org/~thomash/linux
 S:	Supported
 F:	drivers/gpu/drm/vmwgfx/
 F:	include/uapi/drm/vmwgfx_drm.h
-F:	mm/as_dirty_helpers.c
 
 DRM DRIVERS
 M:	David Airlie <airlied@linux.ie>
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index a7fd5a4955c9..58c403eda04e 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1739,7 +1739,6 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
 	mutex_lock(&ttm_global_mutex);
 	list_add_tail(&bdev->device_list, &glob->device_list);
 	mutex_unlock(&ttm_global_mutex);
-	bdev->vm_ops = &ttm_bo_vm_ops;
 
 	return 0;
 out_no_sys:
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 0c4576cbafcf..6dacff49c1cc 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -42,6 +42,8 @@
 #include <linux/uaccess.h>
 #include <linux/mem_encrypt.h>
 
+#define TTM_BO_VM_NUM_PREFAULT 16
+
 static vm_fault_t ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
 				struct vm_fault *vmf)
 {
@@ -104,30 +106,25 @@ static unsigned long ttm_bo_io_mem_pfn(struct ttm_buffer_object *bo,
 		+ page_offset;
 }
 
-/**
- * ttm_bo_vm_reserve - Reserve a buffer object in a retryable vm callback
- * @bo: The buffer object
- * @vmf: The fault structure handed to the callback
- *
- * vm callbacks like fault() and *_mkwrite() allow for the mm_sem to be dropped
- * during long waits, and after the wait the callback will be restarted. This
- * is to allow other threads using the same virtual memory space concurrent
- * access to map(), unmap() completely unrelated buffer objects. TTM buffer
- * object reservations sometimes wait for GPU and should therefore be
- * considered long waits. This function reserves the buffer object interruptibly
- * taking this into account. Starvation is avoided by the vm system not
- * allowing too many repeated restarts.
- * This function is intended to be used in customized fault() and _mkwrite()
- * handlers.
- *
- * Return:
- *    0 on success and the bo was reserved.
- *    VM_FAULT_RETRY if blocking wait.
- *    VM_FAULT_NOPAGE if blocking wait and retrying was not allowed.
- */
-vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
-			     struct vm_fault *vmf)
+static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
+	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
+	    vma->vm_private_data;
+	struct ttm_bo_device *bdev = bo->bdev;
+	unsigned long page_offset;
+	unsigned long page_last;
+	unsigned long pfn;
+	struct ttm_tt *ttm = NULL;
+	struct page *page;
+	int err;
+	int i;
+	vm_fault_t ret = VM_FAULT_NOPAGE;
+	unsigned long address = vmf->address;
+	struct ttm_mem_type_manager *man =
+		&bdev->man[bo->mem.mem_type];
+	struct vm_area_struct cvma;
+
 	/*
 	 * Work around locking order reversal in fault / nopfn
 	 * between mmap_sem and bo_reserve: Perform a trylock operation
@@ -154,55 +151,14 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
 		return VM_FAULT_NOPAGE;
 	}
 
-	return 0;
-}
-EXPORT_SYMBOL(ttm_bo_vm_reserve);
-
-/**
- * ttm_bo_vm_fault_reserved - TTM fault helper
- * @vmf: The struct vm_fault given as argument to the fault callback
- * @prot: The page protection to be used for this memory area.
- * @num_prefault: Maximum number of prefault pages. The caller may want to
- * specify this based on madvice settings and the size of the GPU object
- * backed by the memory.
- *
- * This function inserts one or more page table entries pointing to the
- * memory backing the buffer object, and then returns a return code
- * instructing the caller to retry the page access.
- *
- * Return:
- *   VM_FAULT_NOPAGE on success or pending signal
- *   VM_FAULT_SIGBUS on unspecified error
- *   VM_FAULT_OOM on out-of-memory
- *   VM_FAULT_RETRY if retryable wait
- */
-vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
-				    pgprot_t prot,
-				    pgoff_t num_prefault)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	struct vm_area_struct cvma = *vma;
-	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
-	    vma->vm_private_data;
-	struct ttm_bo_device *bdev = bo->bdev;
-	unsigned long page_offset;
-	unsigned long page_last;
-	unsigned long pfn;
-	struct ttm_tt *ttm = NULL;
-	struct page *page;
-	int err;
-	pgoff_t i;
-	vm_fault_t ret = VM_FAULT_NOPAGE;
-	unsigned long address = vmf->address;
-	struct ttm_mem_type_manager *man =
-		&bdev->man[bo->mem.mem_type];
-
 	/*
 	 * Refuse to fault imported pages. This should be handled
 	 * (if at all) by redirecting mmap to the exporter.
 	 */
-	if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG))
-		return VM_FAULT_SIGBUS;
+	if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG)) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_unlock;
+	}
 
 	if (bdev->driver->fault_reserve_notify) {
 		struct dma_fence *moving = dma_fence_get(bo->moving);
@@ -213,9 +169,11 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 			break;
 		case -EBUSY:
 		case -ERESTARTSYS:
-			return VM_FAULT_NOPAGE;
+			ret = VM_FAULT_NOPAGE;
+			goto out_unlock;
 		default:
-			return VM_FAULT_SIGBUS;
+			ret = VM_FAULT_SIGBUS;
+			goto out_unlock;
 		}
 
 		if (bo->moving != moving) {
@@ -231,12 +189,21 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 	 * move.
 	 */
 	ret = ttm_bo_vm_fault_idle(bo, vmf);
-	if (unlikely(ret != 0))
-		return ret;
+	if (unlikely(ret != 0)) {
+		if (ret == VM_FAULT_RETRY &&
+		    !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			/* The BO has already been unreserved. */
+			return ret;
+		}
+
+		goto out_unlock;
+	}
 
 	err = ttm_mem_io_lock(man, true);
-	if (unlikely(err != 0))
-		return VM_FAULT_NOPAGE;
+	if (unlikely(err != 0)) {
+		ret = VM_FAULT_NOPAGE;
+		goto out_unlock;
+	}
 	err = ttm_mem_io_reserve_vm(bo);
 	if (unlikely(err != 0)) {
 		ret = VM_FAULT_SIGBUS;
@@ -253,8 +220,18 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 		goto out_io_unlock;
 	}
 
-	cvma.vm_page_prot = ttm_io_prot(bo->mem.placement, prot);
-	if (!bo->mem.bus.is_iomem) {
+	/*
+	 * Make a local vma copy to modify the page_prot member
+	 * and vm_flags if necessary. The vma parameter is protected
+	 * by mmap_sem in write mode.
+	 */
+	cvma = *vma;
+	cvma.vm_page_prot = vm_get_page_prot(cvma.vm_flags);
+
+	if (bo->mem.bus.is_iomem) {
+		cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
+						cvma.vm_page_prot);
+	} else {
 		struct ttm_operation_ctx ctx = {
 			.interruptible = false,
 			.no_wait_gpu = false,
@@ -263,21 +240,24 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 		};
 
 		ttm = bo->ttm;
-		if (ttm_tt_populate(bo->ttm, &ctx)) {
+		cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
+						cvma.vm_page_prot);
+
+		/* Allocate all page at once, most common usage */
+		if (ttm_tt_populate(ttm, &ctx)) {
 			ret = VM_FAULT_OOM;
 			goto out_io_unlock;
 		}
-	} else {
-		/* Iomem should not be marked encrypted */
-		cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
 	}
 
 	/*
 	 * Speculatively prefault a number of pages. Only error on
 	 * first page.
 	 */
-	for (i = 0; i < num_prefault; ++i) {
+	for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
 		if (bo->mem.bus.is_iomem) {
+			/* Iomem should not be marked encrypted */
+			cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
 			pfn = ttm_bo_io_mem_pfn(bo, page_offset);
 		} else {
 			page = ttm->pages[page_offset];
@@ -315,26 +295,7 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 	ret = VM_FAULT_NOPAGE;
 out_io_unlock:
 	ttm_mem_io_unlock(man);
-	return ret;
-}
-EXPORT_SYMBOL(ttm_bo_vm_fault_reserved);
-
-static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	pgprot_t prot;
-	struct ttm_buffer_object *bo = vma->vm_private_data;
-	vm_fault_t ret;
-
-	ret = ttm_bo_vm_reserve(bo, vmf);
-	if (ret)
-		return ret;
-
-	prot = vm_get_page_prot(vma->vm_flags);
-	ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
-	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
-		return ret;
-
+out_unlock:
 	reservation_object_unlock(bo->resv);
 	return ret;
 }
@@ -434,7 +395,7 @@ static int ttm_bo_vm_access(struct vm_area_struct *vma, unsigned long addr,
 	return ret;
 }
 
-const struct vm_operations_struct ttm_bo_vm_ops = {
+static const struct vm_operations_struct ttm_bo_vm_ops = {
 	.fault = ttm_bo_vm_fault,
 	.open = ttm_bo_vm_open,
 	.close = ttm_bo_vm_close,
@@ -487,7 +448,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
 	if (unlikely(ret != 0))
 		goto out_unref;
 
-	vma->vm_ops = bdev->vm_ops;
+	vma->vm_ops = &ttm_bo_vm_ops;
 
 	/*
 	 * Note: We're transferring the bo reference to
@@ -519,7 +480,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
 
 	ttm_bo_get(bo);
 
-	vma->vm_ops = bo->bdev->vm_ops;
+	vma->vm_ops = &ttm_bo_vm_ops;
 	vma->vm_private_data = bo;
 	vma->vm_flags |= VM_MIXEDMAP;
 	vma->vm_flags |= VM_IO | VM_DONTEXPAND;
diff --git a/drivers/gpu/drm/vmwgfx/Kconfig b/drivers/gpu/drm/vmwgfx/Kconfig
index d5fd81a521f6..6b28a326f8bb 100644
--- a/drivers/gpu/drm/vmwgfx/Kconfig
+++ b/drivers/gpu/drm/vmwgfx/Kconfig
@@ -8,7 +8,6 @@ config DRM_VMWGFX
 	select FB_CFB_IMAGEBLIT
 	select DRM_TTM
 	select FB
-	select AS_DIRTY_HELPERS
 	# Only needed for the transitional use of drm_crtc_init - can be removed
 	# again once vmwgfx sets up the primary plane itself.
 	select DRM_KMS_HELPER
diff --git a/drivers/gpu/drm/vmwgfx/Makefile b/drivers/gpu/drm/vmwgfx/Makefile
index c877a21a0739..8841bd30e1e5 100644
--- a/drivers/gpu/drm/vmwgfx/Makefile
+++ b/drivers/gpu/drm/vmwgfx/Makefile
@@ -8,7 +8,7 @@ vmwgfx-y := vmwgfx_execbuf.o vmwgfx_gmr.o vmwgfx_kms.o vmwgfx_drv.o \
 	    vmwgfx_cmdbuf_res.o vmwgfx_cmdbuf.o vmwgfx_stdu.o \
 	    vmwgfx_cotable.o vmwgfx_so.o vmwgfx_binding.o vmwgfx_msg.o \
 	    vmwgfx_simple_resource.o vmwgfx_va.o vmwgfx_blit.o \
-	    vmwgfx_validation.o vmwgfx_page_dirty.o \
+	    vmwgfx_validation.o \
 	    ttm_object.o ttm_lock.o
 
 obj-$(CONFIG_DRM_VMWGFX) := vmwgfx.o
diff --git a/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h b/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
index 61414f105c67..f2bfd3d80598 100644
--- a/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
+++ b/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
@@ -1280,6 +1280,7 @@ svga3dsurface_get_pixel_offset(SVGA3dSurfaceFormat format,
 	return offset;
 }
 
+
 static inline u32
 svga3dsurface_get_image_offset(SVGA3dSurfaceFormat format,
 			       surf_size_struct baseLevelSize,
@@ -1374,236 +1375,4 @@ svga3dsurface_is_screen_target_format(SVGA3dSurfaceFormat format)
 	return svga3dsurface_is_dx_screen_target_format(format);
 }
 
-/**
- * struct svga3dsurface_mip - Mimpmap level information
- * @bytes: Bytes required in the backing store of this mipmap level.
- * @img_stride: Byte stride per image.
- * @row_stride: Byte stride per block row.
- * @size: The size of the mipmap.
- */
-struct svga3dsurface_mip {
-	size_t bytes;
-	size_t img_stride;
-	size_t row_stride;
-	struct drm_vmw_size size;
-
-};
-
-/**
- * struct svga3dsurface_cache - Cached surface information
- * @desc: Pointer to the surface descriptor
- * @mip: Array of mipmap level information. Valid size is @num_mip_levels.
- * @mip_chain_bytes: Bytes required in the backing store for the whole chain
- * of mip levels.
- * @sheet_bytes: Bytes required in the backing store for a sheet
- * representing a single sample.
- * @num_mip_levels: Valid size of the @mip array. Number of mipmap levels in
- * a chain.
- * @num_layers: Number of slices in an array texture or number of faces in
- * a cubemap texture.
- */
-struct svga3dsurface_cache {
-	const struct svga3d_surface_desc *desc;
-	struct svga3dsurface_mip mip[DRM_VMW_MAX_MIP_LEVELS];
-	size_t mip_chain_bytes;
-	size_t sheet_bytes;
-	u32 num_mip_levels;
-	u32 num_layers;
-};
-
-/**
- * struct svga3dsurface_loc - Surface location
- * @sub_resource: Surface subresource. Defined as layer * num_mip_levels +
- * mip_level.
- * @x: X coordinate.
- * @y: Y coordinate.
- * @z: Z coordinate.
- */
-struct svga3dsurface_loc {
-	u32 sub_resource;
-	u32 x, y, z;
-};
-
-/**
- * svga3dsurface_subres - Compute the subresource from layer and mipmap.
- * @cache: Surface layout data.
- * @mip_level: The mipmap level.
- * @layer: The surface layer (face or array slice).
- *
- * Return: The subresource.
- */
-static inline u32 svga3dsurface_subres(const struct svga3dsurface_cache *cache,
-				       u32 mip_level, u32 layer)
-{
-	return cache->num_mip_levels * layer + mip_level;
-}
-
-/**
- * svga3dsurface_setup_cache - Build a surface cache entry
- * @size: The surface base level dimensions.
- * @format: The surface format.
- * @num_mip_levels: Number of mipmap levels.
- * @num_layers: Number of layers.
- * @cache: Pointer to a struct svga3dsurface_cach object to be filled in.
- *
- * Return: Zero on success, -EINVAL on invalid surface layout.
- */
-static inline int svga3dsurface_setup_cache(const struct drm_vmw_size *size,
-					    SVGA3dSurfaceFormat format,
-					    u32 num_mip_levels,
-					    u32 num_layers,
-					    u32 num_samples,
-					    struct svga3dsurface_cache *cache)
-{
-	const struct svga3d_surface_desc *desc;
-	u32 i;
-
-	memset(cache, 0, sizeof(*cache));
-	cache->desc = desc = svga3dsurface_get_desc(format);
-	cache->num_mip_levels = num_mip_levels;
-	cache->num_layers = num_layers;
-	for (i = 0; i < cache->num_mip_levels; i++) {
-		struct svga3dsurface_mip *mip = &cache->mip[i];
-
-		mip->size = svga3dsurface_get_mip_size(*size, i);
-		mip->bytes = svga3dsurface_get_image_buffer_size
-			(desc, &mip->size, 0);
-		mip->row_stride =
-			__KERNEL_DIV_ROUND_UP(mip->size.width,
-					      desc->block_size.width) *
-			desc->bytes_per_block * num_samples;
-		if (!mip->row_stride)
-			goto invalid_dim;
-
-		mip->img_stride =
-			__KERNEL_DIV_ROUND_UP(mip->size.height,
-					      desc->block_size.height) *
-			mip->row_stride;
-		if (!mip->img_stride)
-			goto invalid_dim;
-
-		cache->mip_chain_bytes += mip->bytes;
-	}
-	cache->sheet_bytes = cache->mip_chain_bytes * num_layers;
-	if (!cache->sheet_bytes)
-		goto invalid_dim;
-
-	return 0;
-
-invalid_dim:
-	VMW_DEBUG_USER("Invalid surface layout for dirty tracking.\n");
-	return -EINVAL;
-}
-
-/**
- * svga3dsurface_get_loc - Get a surface location from an offset into the
- * backing store
- * @cache: Surface layout data.
- * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
- * @offset: Offset into the surface backing store.
- */
-static inline void
-svga3dsurface_get_loc(const struct svga3dsurface_cache *cache,
-		      struct svga3dsurface_loc *loc,
-		      size_t offset)
-{
-	const struct svga3dsurface_mip *mip = &cache->mip[0];
-	const struct svga3d_surface_desc *desc = cache->desc;
-	u32 layer;
-	int i;
-
-	if (offset >= cache->sheet_bytes)
-		offset %= cache->sheet_bytes;
-
-	layer = offset / cache->mip_chain_bytes;
-	offset -= layer * cache->mip_chain_bytes;
-	for (i = 0; i < cache->num_mip_levels; ++i, ++mip) {
-		if (mip->bytes > offset)
-			break;
-		offset -= mip->bytes;
-	}
-
-	loc->sub_resource = svga3dsurface_subres(cache, i, layer);
-	loc->z = offset / mip->img_stride;
-	offset -= loc->z * mip->img_stride;
-	loc->z *= desc->block_size.depth;
-	loc->y = offset / mip->row_stride;
-	offset -= loc->y * mip->row_stride;
-	loc->y *= desc->block_size.height;
-	loc->x = offset / desc->bytes_per_block;
-	loc->x *= desc->block_size.width;
-}
-
-/**
- * svga3dsurface_inc_loc - Clamp increment a surface location with one block
- * size
- * in each dimension.
- * @loc: Pointer to a struct svga3dsurface_loc to be incremented.
- *
- * When computing the size of a range as size = end - start, the range does not
- * include the end element. However a location representing the last byte
- * of a touched region in the backing store *is* included in the range.
- * This function modifies such a location to match the end definition
- * given as start + size which is the one used in a SVGA3dBox.
- */
-static inline void
-svga3dsurface_inc_loc(const struct svga3dsurface_cache *cache,
-		      struct svga3dsurface_loc *loc)
-{
-	const struct svga3d_surface_desc *desc = cache->desc;
-	u32 mip = loc->sub_resource % cache->num_mip_levels;
-	const struct drm_vmw_size *size = &cache->mip[mip].size;
-
-	loc->sub_resource++;
-	loc->x += desc->block_size.width;
-	if (loc->x > size->width)
-		loc->x = size->width;
-	loc->y += desc->block_size.height;
-	if (loc->y > size->height)
-		loc->y = size->height;
-	loc->z += desc->block_size.depth;
-	if (loc->z > size->depth)
-		loc->z = size->depth;
-}
-
-/**
- * svga3dsurface_min_loc - The start location in a subresource
- * @cache: Surface layout data.
- * @sub_resource: The subresource.
- * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
- */
-static inline void
-svga3dsurface_min_loc(const struct svga3dsurface_cache *cache,
-		      u32 sub_resource,
-		      struct svga3dsurface_loc *loc)
-{
-	loc->sub_resource = sub_resource;
-	loc->x = loc->y = loc->z = 0;
-}
-
-/**
- * svga3dsurface_min_loc - The end location in a subresource
- * @cache: Surface layout data.
- * @sub_resource: The subresource.
- * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
- *
- * Following the end definition given in svga3dsurface_inc_loc(),
- * Compute the end location of a surface subresource.
- */
-static inline void
-svga3dsurface_max_loc(const struct svga3dsurface_cache *cache,
-		      u32 sub_resource,
-		      struct svga3dsurface_loc *loc)
-{
-	const struct drm_vmw_size *size;
-	u32 mip;
-
-	loc->sub_resource = sub_resource + 1;
-	mip = sub_resource % cache->num_mip_levels;
-	size = &cache->mip[mip].size;
-	loc->x = size->width;
-	loc->y = size->height;
-	loc->z = size->depth;
-}
-
 #endif /* _SVGA3D_SURFACEDEFS_H_ */
diff --git a/drivers/gpu/drm/vmwgfx/ttm_lock.c b/drivers/gpu/drm/vmwgfx/ttm_lock.c
index 5971c72e6d10..16b2083cb9d4 100644
--- a/drivers/gpu/drm/vmwgfx/ttm_lock.c
+++ b/drivers/gpu/drm/vmwgfx/ttm_lock.c
@@ -29,6 +29,7 @@
  * Authors: Thomas Hellstrom <thellstrom-at-vmware-dot-com>
  */
 
+#include <drm/ttm/ttm_module.h>
 #include <linux/atomic.h>
 #include <linux/errno.h>
 #include <linux/wait.h>
@@ -48,6 +49,8 @@ void ttm_lock_init(struct ttm_lock *lock)
 	init_waitqueue_head(&lock->queue);
 	lock->rw = 0;
 	lock->flags = 0;
+	lock->kill_takers = false;
+	lock->signal = SIGKILL;
 }
 
 void ttm_read_unlock(struct ttm_lock *lock)
@@ -63,6 +66,11 @@ static bool __ttm_read_lock(struct ttm_lock *lock)
 	bool locked = false;
 
 	spin_lock(&lock->lock);
+	if (unlikely(lock->kill_takers)) {
+		send_sig(lock->signal, current, 0);
+		spin_unlock(&lock->lock);
+		return false;
+	}
 	if (lock->rw >= 0 && lock->flags == 0) {
 		++lock->rw;
 		locked = true;
@@ -90,6 +98,11 @@ static bool __ttm_read_trylock(struct ttm_lock *lock, bool *locked)
 	*locked = false;
 
 	spin_lock(&lock->lock);
+	if (unlikely(lock->kill_takers)) {
+		send_sig(lock->signal, current, 0);
+		spin_unlock(&lock->lock);
+		return false;
+	}
 	if (lock->rw >= 0 && lock->flags == 0) {
 		++lock->rw;
 		block = false;
@@ -134,6 +147,11 @@ static bool __ttm_write_lock(struct ttm_lock *lock)
 	bool locked = false;
 
 	spin_lock(&lock->lock);
+	if (unlikely(lock->kill_takers)) {
+		send_sig(lock->signal, current, 0);
+		spin_unlock(&lock->lock);
+		return false;
+	}
 	if (lock->rw == 0 && ((lock->flags & ~TTM_WRITE_LOCK_PENDING) == 0)) {
 		lock->rw = -1;
 		lock->flags &= ~TTM_WRITE_LOCK_PENDING;
@@ -164,6 +182,88 @@ int ttm_write_lock(struct ttm_lock *lock, bool interruptible)
 	return ret;
 }
 
+static int __ttm_vt_unlock(struct ttm_lock *lock)
+{
+	int ret = 0;
+
+	spin_lock(&lock->lock);
+	if (unlikely(!(lock->flags & TTM_VT_LOCK)))
+		ret = -EINVAL;
+	lock->flags &= ~TTM_VT_LOCK;
+	wake_up_all(&lock->queue);
+	spin_unlock(&lock->lock);
+
+	return ret;
+}
+
+static void ttm_vt_lock_remove(struct ttm_base_object **p_base)
+{
+	struct ttm_base_object *base = *p_base;
+	struct ttm_lock *lock = container_of(base, struct ttm_lock, base);
+	int ret;
+
+	*p_base = NULL;
+	ret = __ttm_vt_unlock(lock);
+	BUG_ON(ret != 0);
+}
+
+static bool __ttm_vt_lock(struct ttm_lock *lock)
+{
+	bool locked = false;
+
+	spin_lock(&lock->lock);
+	if (lock->rw == 0) {
+		lock->flags &= ~TTM_VT_LOCK_PENDING;
+		lock->flags |= TTM_VT_LOCK;
+		locked = true;
+	} else {
+		lock->flags |= TTM_VT_LOCK_PENDING;
+	}
+	spin_unlock(&lock->lock);
+	return locked;
+}
+
+int ttm_vt_lock(struct ttm_lock *lock,
+		bool interruptible,
+		struct ttm_object_file *tfile)
+{
+	int ret = 0;
+
+	if (interruptible) {
+		ret = wait_event_interruptible(lock->queue,
+					       __ttm_vt_lock(lock));
+		if (unlikely(ret != 0)) {
+			spin_lock(&lock->lock);
+			lock->flags &= ~TTM_VT_LOCK_PENDING;
+			wake_up_all(&lock->queue);
+			spin_unlock(&lock->lock);
+			return ret;
+		}
+	} else
+		wait_event(lock->queue, __ttm_vt_lock(lock));
+
+	/*
+	 * Add a base-object, the destructor of which will
+	 * make sure the lock is released if the client dies
+	 * while holding it.
+	 */
+
+	ret = ttm_base_object_init(tfile, &lock->base, false,
+				   ttm_lock_type, &ttm_vt_lock_remove, NULL);
+	if (ret)
+		(void)__ttm_vt_unlock(lock);
+	else
+		lock->vt_holder = tfile;
+
+	return ret;
+}
+
+int ttm_vt_unlock(struct ttm_lock *lock)
+{
+	return ttm_ref_object_base_unref(lock->vt_holder,
+					 lock->base.handle, TTM_REF_USAGE);
+}
+
 void ttm_suspend_unlock(struct ttm_lock *lock)
 {
 	spin_lock(&lock->lock);
diff --git a/drivers/gpu/drm/vmwgfx/ttm_lock.h b/drivers/gpu/drm/vmwgfx/ttm_lock.h
index 3d454e8b491f..0c3af9836863 100644
--- a/drivers/gpu/drm/vmwgfx/ttm_lock.h
+++ b/drivers/gpu/drm/vmwgfx/ttm_lock.h
@@ -63,6 +63,8 @@
  * @lock: Spinlock protecting some lock members.
  * @rw: Read-write lock counter. Protected by @lock.
  * @flags: Lock state. Protected by @lock.
+ * @kill_takers: Boolean whether to kill takers of the lock.
+ * @signal: Signal to send when kill_takers is true.
  */
 
 struct ttm_lock {
@@ -71,6 +73,9 @@ struct ttm_lock {
 	spinlock_t lock;
 	int32_t rw;
 	uint32_t flags;
+	bool kill_takers;
+	int signal;
+	struct ttm_object_file *vt_holder;
 };
 
 
@@ -215,4 +220,29 @@ extern void ttm_write_unlock(struct ttm_lock *lock);
  */
 extern int ttm_write_lock(struct ttm_lock *lock, bool interruptible);
 
+/**
+ * ttm_lock_set_kill
+ *
+ * @lock: Pointer to a struct ttm_lock
+ * @val: Boolean whether to kill processes taking the lock.
+ * @signal: Signal to send to the process taking the lock.
+ *
+ * The kill-when-taking-lock functionality is used to kill processes that keep
+ * on using the TTM functionality when its resources has been taken down, for
+ * example when the X server exits. A typical sequence would look like this:
+ * - X server takes lock in write mode.
+ * - ttm_lock_set_kill() is called with @val set to true.
+ * - As part of X server exit, TTM resources are taken down.
+ * - X server releases the lock on file release.
+ * - Another dri client wants to render, takes the lock and is killed.
+ *
+ */
+static inline void ttm_lock_set_kill(struct ttm_lock *lock, bool val,
+				     int signal)
+{
+	lock->kill_takers = val;
+	if (val)
+		lock->signal = signal;
+}
+
 #endif
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
index e8bc7a7ac031..5d5c2bce01f3 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
@@ -463,8 +463,6 @@ void vmw_bo_bo_free(struct ttm_buffer_object *bo)
 {
 	struct vmw_buffer_object *vmw_bo = vmw_buffer_object(bo);
 
-	WARN_ON(vmw_bo->dirty);
-	WARN_ON(!RB_EMPTY_ROOT(&vmw_bo->res_tree));
 	vmw_bo_unmap(vmw_bo);
 	kfree(vmw_bo);
 }
@@ -478,11 +476,8 @@ void vmw_bo_bo_free(struct ttm_buffer_object *bo)
 static void vmw_user_bo_destroy(struct ttm_buffer_object *bo)
 {
 	struct vmw_user_buffer_object *vmw_user_bo = vmw_user_buffer_object(bo);
-	struct vmw_buffer_object *vbo = &vmw_user_bo->vbo;
 
-	WARN_ON(vbo->dirty);
-	WARN_ON(!RB_EMPTY_ROOT(&vbo->res_tree));
-	vmw_bo_unmap(vbo);
+	vmw_bo_unmap(&vmw_user_bo->vbo);
 	ttm_prime_object_kfree(vmw_user_bo, prime);
 }
 
@@ -515,9 +510,8 @@ int vmw_bo_init(struct vmw_private *dev_priv,
 
 	acc_size = vmw_bo_acc_size(dev_priv, size, user);
 	memset(vmw_bo, 0, sizeof(*vmw_bo));
-	BUILD_BUG_ON(TTM_MAX_BO_PRIORITY <= 3);
-	vmw_bo->base.priority = 3;
-	vmw_bo->res_tree = RB_ROOT;
+
+	INIT_LIST_HEAD(&vmw_bo->res_list);
 
 	ret = ttm_bo_init(bdev, &vmw_bo->base, size,
 			  ttm_bo_type_device, placement,
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_context.c b/drivers/gpu/drm/vmwgfx/vmwgfx_context.c
index a56c9d802382..63f111068a44 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_context.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_context.c
@@ -88,8 +88,6 @@ static const struct vmw_res_func vmw_gb_context_func = {
 	.res_type = vmw_res_context,
 	.needs_backup = true,
 	.may_evict = true,
-	.prio = 3,
-	.dirty_prio = 3,
 	.type_name = "guest backed contexts",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_gb_context_create,
@@ -102,8 +100,6 @@ static const struct vmw_res_func vmw_dx_context_func = {
 	.res_type = vmw_res_dx_context,
 	.needs_backup = true,
 	.may_evict = true,
-	.prio = 3,
-	.dirty_prio = 3,
 	.type_name = "dx contexts",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_dx_context_create,
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c
index 8c699cb2565b..b4f6e1217c9d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c
@@ -116,8 +116,6 @@ static const struct vmw_res_func vmw_cotable_func = {
 	.res_type = vmw_res_cotable,
 	.needs_backup = true,
 	.may_evict = true,
-	.prio = 3,
-	.dirty_prio = 3,
 	.type_name = "context guest backed object tables",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_cotable_create,
@@ -309,7 +307,7 @@ static int vmw_cotable_unbind(struct vmw_resource *res,
 	struct ttm_buffer_object *bo = val_buf->bo;
 	struct vmw_fence_obj *fence;
 
-	if (!vmw_resource_mob_attached(res))
+	if (list_empty(&res->mob_head))
 		return 0;
 
 	WARN_ON_ONCE(bo->mem.mem_type != VMW_PL_MOB);
@@ -455,7 +453,6 @@ static int vmw_cotable_resize(struct vmw_resource *res, size_t new_size)
 		goto out_wait;
 	}
 
-	vmw_resource_mob_detach(res);
 	res->backup = buf;
 	res->backup_size = new_size;
 	vcotbl->size_read_back = cur_size_read_back;
@@ -470,12 +467,12 @@ static int vmw_cotable_resize(struct vmw_resource *res, size_t new_size)
 		res->backup = old_buf;
 		res->backup_size = old_size;
 		vcotbl->size_read_back = old_size_read_back;
-		vmw_resource_mob_attach(res);
 		goto out_wait;
 	}
 
-	vmw_resource_mob_attach(res);
 	/* Let go of the old mob. */
+	list_del(&res->mob_head);
+	list_add_tail(&res->mob_head, &buf->res_list);
 	vmw_bo_unreference(&old_buf);
 	res->id = vcotbl->type;
 
@@ -499,7 +496,7 @@ out_wait:
  * is called before bind() in the validation sequence is instead used for two
  * things.
  * 1) Unscrub the cotable if it is scrubbed and still attached to a backup
- *    buffer.
+ *    buffer, that is, if @res->mob_head is non-empty.
  * 2) Resize the cotable if needed.
  */
 static int vmw_cotable_create(struct vmw_resource *res)
@@ -515,7 +512,7 @@ static int vmw_cotable_create(struct vmw_resource *res)
 		new_size *= 2;
 
 	if (likely(new_size <= res->backup_size)) {
-		if (vcotbl->scrubbed && vmw_resource_mob_attached(res)) {
+		if (vcotbl->scrubbed && !list_empty(&res->mob_head)) {
 			ret = vmw_cotable_unscrub(res);
 			if (ret)
 				return ret;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 8349a6cc126f..4ff11a0077e1 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -254,6 +254,7 @@ static int vmw_restrict_dma_mask;
 static int vmw_assume_16bpp;
 
 static int vmw_probe(struct pci_dev *, const struct pci_device_id *);
+static void vmw_master_init(struct vmw_master *);
 static int vmwgfx_pm_notifier(struct notifier_block *nb, unsigned long val,
 			      void *ptr);
 
@@ -761,6 +762,10 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
 	DRM_INFO("MMIO at 0x%08x size is %u kiB\n",
 		 dev_priv->mmio_start, dev_priv->mmio_size / 1024);
 
+	vmw_master_init(&dev_priv->fbdev_master);
+	ttm_lock_set_kill(&dev_priv->fbdev_master.lock, false, SIGTERM);
+	dev_priv->active_master = &dev_priv->fbdev_master;
+
 	dev_priv->mmio_virt = memremap(dev_priv->mmio_start,
 				       dev_priv->mmio_size, MEMREMAP_WB);
 
@@ -828,11 +833,6 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
 		DRM_ERROR("Failed initializing TTM buffer object driver.\n");
 		goto out_no_bdev;
 	}
-	dev_priv->vm_ops = *dev_priv->bdev.vm_ops;
-	dev_priv->vm_ops.fault = vmw_bo_vm_fault;
-	dev_priv->vm_ops.pfn_mkwrite = vmw_bo_vm_mkwrite;
-	dev_priv->vm_ops.page_mkwrite = vmw_bo_vm_mkwrite;
-	dev_priv->bdev.vm_ops = &dev_priv->vm_ops;
 
 	/*
 	 * Enable VRAM, but initially don't use it until SVGA is enabled and
@@ -1007,7 +1007,18 @@ static void vmw_driver_unload(struct drm_device *dev)
 static void vmw_postclose(struct drm_device *dev,
 			 struct drm_file *file_priv)
 {
-	struct vmw_fpriv *vmw_fp = vmw_fpriv(file_priv);
+	struct vmw_fpriv *vmw_fp;
+
+	vmw_fp = vmw_fpriv(file_priv);
+
+	if (vmw_fp->locked_master) {
+		struct vmw_master *vmaster =
+			vmw_master(vmw_fp->locked_master);
+
+		ttm_lock_set_kill(&vmaster->lock, true, SIGTERM);
+		ttm_vt_unlock(&vmaster->lock);
+		drm_master_put(&vmw_fp->locked_master);
+	}
 
 	ttm_object_file_release(&vmw_fp->tfile);
 	kfree(vmw_fp);
@@ -1036,6 +1047,55 @@ out_no_tfile:
 	return ret;
 }
 
+static struct vmw_master *vmw_master_check(struct drm_device *dev,
+					   struct drm_file *file_priv,
+					   unsigned int flags)
+{
+	int ret;
+	struct vmw_fpriv *vmw_fp = vmw_fpriv(file_priv);
+	struct vmw_master *vmaster;
+
+	if (!drm_is_primary_client(file_priv) || !(flags & DRM_AUTH))
+		return NULL;
+
+	ret = mutex_lock_interruptible(&dev->master_mutex);
+	if (unlikely(ret != 0))
+		return ERR_PTR(-ERESTARTSYS);
+
+	if (drm_is_current_master(file_priv)) {
+		mutex_unlock(&dev->master_mutex);
+		return NULL;
+	}
+
+	/*
+	 * Check if we were previously master, but now dropped. In that
+	 * case, allow at least render node functionality.
+	 */
+	if (vmw_fp->locked_master) {
+		mutex_unlock(&dev->master_mutex);
+
+		if (flags & DRM_RENDER_ALLOW)
+			return NULL;
+
+		DRM_ERROR("Dropped master trying to access ioctl that "
+			  "requires authentication.\n");
+		return ERR_PTR(-EACCES);
+	}
+	mutex_unlock(&dev->master_mutex);
+
+	/*
+	 * Take the TTM lock. Possibly sleep waiting for the authenticating
+	 * master to become master again, or for a SIGTERM if the
+	 * authenticating master exits.
+	 */
+	vmaster = vmw_master(file_priv->master);
+	ret = ttm_read_lock(&vmaster->lock, true);
+	if (unlikely(ret != 0))
+		vmaster = ERR_PTR(ret);
+
+	return vmaster;
+}
+
 static long vmw_generic_ioctl(struct file *filp, unsigned int cmd,
 			      unsigned long arg,
 			      long (*ioctl_func)(struct file *, unsigned int,
@@ -1044,6 +1104,7 @@ static long vmw_generic_ioctl(struct file *filp, unsigned int cmd,
 	struct drm_file *file_priv = filp->private_data;
 	struct drm_device *dev = file_priv->minor->dev;
 	unsigned int nr = DRM_IOCTL_NR(cmd);
+	struct vmw_master *vmaster;
 	unsigned int flags;
 	long ret;
 
@@ -1079,7 +1140,21 @@ static long vmw_generic_ioctl(struct file *filp, unsigned int cmd,
 	} else if (!drm_ioctl_flags(nr, &flags))
 		return -EINVAL;
 
-	return ioctl_func(filp, cmd, arg);
+	vmaster = vmw_master_check(dev, file_priv, flags);
+	if (IS_ERR(vmaster)) {
+		ret = PTR_ERR(vmaster);
+
+		if (ret != -ERESTARTSYS)
+			DRM_INFO("IOCTL ERROR Command %d, Error %ld.\n",
+				 nr, ret);
+		return ret;
+	}
+
+	ret = ioctl_func(filp, cmd, arg);
+	if (vmaster)
+		ttm_read_unlock(&vmaster->lock);
+
+	return ret;
 
 out_io_encoding:
 	DRM_ERROR("Invalid command format, ioctl %d\n",
@@ -1106,10 +1181,65 @@ static void vmw_lastclose(struct drm_device *dev)
 {
 }
 
+static void vmw_master_init(struct vmw_master *vmaster)
+{
+	ttm_lock_init(&vmaster->lock);
+}
+
+static int vmw_master_create(struct drm_device *dev,
+			     struct drm_master *master)
+{
+	struct vmw_master *vmaster;
+
+	vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL);
+	if (unlikely(!vmaster))
+		return -ENOMEM;
+
+	vmw_master_init(vmaster);
+	ttm_lock_set_kill(&vmaster->lock, true, SIGTERM);
+	master->driver_priv = vmaster;
+
+	return 0;
+}
+
+static void vmw_master_destroy(struct drm_device *dev,
+			       struct drm_master *master)
+{
+	struct vmw_master *vmaster = vmw_master(master);
+
+	master->driver_priv = NULL;
+	kfree(vmaster);
+}
+
 static int vmw_master_set(struct drm_device *dev,
 			  struct drm_file *file_priv,
 			  bool from_open)
 {
+	struct vmw_private *dev_priv = vmw_priv(dev);
+	struct vmw_fpriv *vmw_fp = vmw_fpriv(file_priv);
+	struct vmw_master *active = dev_priv->active_master;
+	struct vmw_master *vmaster = vmw_master(file_priv->master);
+	int ret = 0;
+
+	if (active) {
+		BUG_ON(active != &dev_priv->fbdev_master);
+		ret = ttm_vt_lock(&active->lock, false, vmw_fp->tfile);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ttm_lock_set_kill(&active->lock, true, SIGTERM);
+		dev_priv->active_master = NULL;
+	}
+
+	ttm_lock_set_kill(&vmaster->lock, false, SIGTERM);
+	if (!from_open) {
+		ttm_vt_unlock(&vmaster->lock);
+		BUG_ON(vmw_fp->locked_master != file_priv->master);
+		drm_master_put(&vmw_fp->locked_master);
+	}
+
+	dev_priv->active_master = vmaster;
+
 	/*
 	 * Inform a new master that the layout may have changed while
 	 * it was gone.
@@ -1124,10 +1254,31 @@ static void vmw_master_drop(struct drm_device *dev,
 			    struct drm_file *file_priv)
 {
 	struct vmw_private *dev_priv = vmw_priv(dev);
+	struct vmw_fpriv *vmw_fp = vmw_fpriv(file_priv);
+	struct vmw_master *vmaster = vmw_master(file_priv->master);
+	int ret;
+
+	/**
+	 * Make sure the master doesn't disappear while we have
+	 * it locked.
+	 */
 
+	vmw_fp->locked_master = drm_master_get(file_priv->master);
+	ret = ttm_vt_lock(&vmaster->lock, false, vmw_fp->tfile);
 	vmw_kms_legacy_hotspot_clear(dev_priv);
+	if (unlikely((ret != 0))) {
+		DRM_ERROR("Unable to lock TTM at VT switch.\n");
+		drm_master_put(&vmw_fp->locked_master);
+	}
+
+	ttm_lock_set_kill(&vmaster->lock, false, SIGTERM);
+
 	if (!dev_priv->enable_fb)
 		vmw_svga_disable(dev_priv);
+
+	dev_priv->active_master = &dev_priv->fbdev_master;
+	ttm_lock_set_kill(&dev_priv->fbdev_master.lock, false, SIGTERM);
+	ttm_vt_unlock(&dev_priv->fbdev_master.lock);
 }
 
 /**
@@ -1406,6 +1557,8 @@ static struct drm_driver driver = {
 	.disable_vblank = vmw_disable_vblank,
 	.ioctls = vmw_ioctls,
 	.num_ioctls = ARRAY_SIZE(vmw_ioctls),
+	.master_create = vmw_master_create,
+	.master_destroy = vmw_master_destroy,
 	.master_set = vmw_master_set,
 	.master_drop = vmw_master_drop,
 	.open = vmw_driver_open,
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
index 3a358a5495e4..366dcfc1f9bb 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
@@ -44,9 +44,9 @@
 #include <linux/sync_file.h>
 
 #define VMWGFX_DRIVER_NAME "vmwgfx"
-#define VMWGFX_DRIVER_DATE "20190328"
+#define VMWGFX_DRIVER_DATE "20180704"
 #define VMWGFX_DRIVER_MAJOR 2
-#define VMWGFX_DRIVER_MINOR 16
+#define VMWGFX_DRIVER_MINOR 15
 #define VMWGFX_DRIVER_PATCHLEVEL 0
 #define VMWGFX_FIFO_STATIC_SIZE (1024*1024)
 #define VMWGFX_MAX_RELOCATIONS 2048
@@ -81,30 +81,19 @@
 #define VMW_RES_SHADER ttm_driver_type4
 
 struct vmw_fpriv {
+	struct drm_master *locked_master;
 	struct ttm_object_file *tfile;
 	bool gb_aware; /* user-space is guest-backed aware */
 };
 
-/**
- * struct vmw_buffer_object - TTM buffer object with vmwgfx additions
- * @base: The TTM buffer object
- * @res_tree: RB tree of resources using this buffer object as a backing MOB
- * @pin_count: pin depth
- * @dx_query_ctx: DX context if this buffer object is used as a DX query MOB
- * @map: Kmap object for semi-persistent mappings
- * @res_prios: Eviction priority counts for attached resources
- * @dirty: structure for user-space dirty-tracking
- */
 struct vmw_buffer_object {
 	struct ttm_buffer_object base;
-	struct rb_root res_tree;
+	struct list_head res_list;
 	s32 pin_count;
 	/* Not ref-counted.  Protected by binding_mutex */
 	struct vmw_resource *dx_query_ctx;
 	/* Protected by reservation */
 	struct ttm_bo_kmap_obj map;
-	u32 res_prios[TTM_MAX_BO_PRIORITY];
-	struct vmw_bo_dirty *dirty;
 };
 
 /**
@@ -135,8 +124,7 @@ struct vmw_res_func;
  * @res_dirty: Resource contains data not yet in the backup buffer. Protected
  * by resource reserved.
  * @backup_dirty: Backup buffer contains data not yet in the HW resource.
- * Protected by resource reserved.
- * @coherent: Emulate coherency by tracking vm accesses.
+ * Protecte by resource reserved.
  * @backup: The backup buffer if any. Protected by resource reserved.
  * @backup_offset: Offset into the backup buffer if any. Protected by resource
  * reserved. Note that only a few resource types can have a @backup_offset
@@ -145,32 +133,28 @@ struct vmw_res_func;
  * pin-count greater than zero. It is not on the resource LRU lists and its
  * backup buffer is pinned. Hence it can't be evicted.
  * @func: Method vtable for this resource. Immutable.
- * @mob_node; Node for the MOB backup rbtree. Protected by @backup reserved.
  * @lru_head: List head for the LRU list. Protected by @dev_priv::resource_lock.
+ * @mob_head: List head for the MOB backup list. Protected by @backup reserved.
  * @binding_head: List head for the context binding list. Protected by
  * the @dev_priv::binding_mutex
  * @res_free: The resource destructor.
  * @hw_destroy: Callback to destroy the resource on the device, as part of
  * resource destruction.
  */
-struct vmw_resource_dirty;
 struct vmw_resource {
 	struct kref kref;
 	struct vmw_private *dev_priv;
 	int id;
-	u32 used_prio;
 	unsigned long backup_size;
-	u32 res_dirty : 1;
-	u32 backup_dirty : 1;
-	u32 coherent : 1;
+	bool res_dirty;
+	bool backup_dirty;
 	struct vmw_buffer_object *backup;
 	unsigned long backup_offset;
 	unsigned long pin_count;
 	const struct vmw_res_func *func;
-	struct rb_node mob_node;
 	struct list_head lru_head;
+	struct list_head mob_head;
 	struct list_head binding_head;
-	struct vmw_resource_dirty *dirty;
 	void (*res_free) (struct vmw_resource *res);
 	void (*hw_destroy) (struct vmw_resource *res);
 };
@@ -392,6 +376,10 @@ struct vmw_sw_context{
 struct vmw_legacy_display;
 struct vmw_overlay;
 
+struct vmw_master {
+	struct ttm_lock lock;
+};
+
 struct vmw_vga_topology_state {
 	uint32_t width;
 	uint32_t height;
@@ -554,8 +542,11 @@ struct vmw_private {
 	spinlock_t svga_lock;
 
 	/**
-	 * PM management.
+	 * Master management.
 	 */
+
+	struct vmw_master *active_master;
+	struct vmw_master fbdev_master;
 	struct notifier_block pm_nb;
 	bool refuse_hibernation;
 	bool suspend_locked;
@@ -604,9 +595,6 @@ struct vmw_private {
 
 	/* Validation memory reservation */
 	struct vmw_validation_mem vvm;
-
-	/* VM operations */
-	struct vm_operations_struct vm_ops;
 };
 
 static inline struct vmw_surface *vmw_res_to_srf(struct vmw_resource *res)
@@ -624,6 +612,11 @@ static inline struct vmw_fpriv *vmw_fpriv(struct drm_file *file_priv)
 	return (struct vmw_fpriv *)file_priv->driver_priv;
 }
 
+static inline struct vmw_master *vmw_master(struct drm_master *master)
+{
+	return (struct vmw_master *) master->driver_priv;
+}
+
 /*
  * The locking here is fine-grained, so that it is performed once
  * for every read- and write operation. This is of course costly, but we
@@ -676,8 +669,7 @@ extern void vmw_resource_unreference(struct vmw_resource **p_res);
 extern struct vmw_resource *vmw_resource_reference(struct vmw_resource *res);
 extern struct vmw_resource *
 vmw_resource_reference_unless_doomed(struct vmw_resource *res);
-extern int vmw_resource_validate(struct vmw_resource *res, bool intr,
-				 bool dirtying);
+extern int vmw_resource_validate(struct vmw_resource *res, bool intr);
 extern int vmw_resource_reserve(struct vmw_resource *res, bool interruptible,
 				bool no_backup);
 extern bool vmw_resource_needs_backup(const struct vmw_resource *res);
@@ -717,23 +709,6 @@ extern void vmw_query_move_notify(struct ttm_buffer_object *bo,
 extern int vmw_query_readback_all(struct vmw_buffer_object *dx_query_mob);
 extern void vmw_resource_evict_all(struct vmw_private *dev_priv);
 extern void vmw_resource_unbind_list(struct vmw_buffer_object *vbo);
-void vmw_resource_mob_attach(struct vmw_resource *res);
-void vmw_resource_mob_detach(struct vmw_resource *res);
-void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t start,
-			       pgoff_t end);
-int vmw_resources_clean(struct vmw_buffer_object *vbo, pgoff_t start,
-			pgoff_t end, pgoff_t *num_prefault);
-
-/**
- * vmw_resource_mob_attached - Whether a resource currently has a mob attached
- * @res: The resource
- *
- * Return: true if the resource has a mob attached, false otherwise.
- */
-static inline bool vmw_resource_mob_attached(const struct vmw_resource *res)
-{
-	return !RB_EMPTY_NODE(&res->mob_node);
-}
 
 /**
  * vmw_user_resource_noref_release - release a user resource pointer looked up
@@ -812,54 +787,6 @@ static inline void vmw_user_bo_noref_release(void)
 	ttm_base_object_noref_release();
 }
 
-/**
- * vmw_bo_adjust_prio - Adjust the buffer object eviction priority
- * according to attached resources
- * @vbo: The struct vmw_buffer_object
- */
-static inline void vmw_bo_prio_adjust(struct vmw_buffer_object *vbo)
-{
-	int i = ARRAY_SIZE(vbo->res_prios);
-
-	while (i--) {
-		if (vbo->res_prios[i]) {
-			vbo->base.priority = i;
-			return;
-		}
-	}
-
-	vbo->base.priority = 3;
-}
-
-/**
- * vmw_bo_prio_add - Notify a buffer object of a newly attached resource
- * eviction priority
- * @vbo: The struct vmw_buffer_object
- * @prio: The resource priority
- *
- * After being notified, the code assigns the highest resource eviction priority
- * to the backing buffer object (mob).
- */
-static inline void vmw_bo_prio_add(struct vmw_buffer_object *vbo, int prio)
-{
-	if (vbo->res_prios[prio]++ == 0)
-		vmw_bo_prio_adjust(vbo);
-}
-
-/**
- * vmw_bo_prio_del - Notify a buffer object of a resource with a certain
- * priority being removed
- * @vbo: The struct vmw_buffer_object
- * @prio: The resource priority
- *
- * After being notified, the code assigns the highest resource eviction priority
- * to the backing buffer object (mob).
- */
-static inline void vmw_bo_prio_del(struct vmw_buffer_object *vbo, int prio)
-{
-	if (--vbo->res_prios[prio] == 0)
-		vmw_bo_prio_adjust(vbo);
-}
 
 /**
  * Misc Ioctl functionality - vmwgfx_ioctl.c
@@ -1089,6 +1016,7 @@ void vmw_kms_cursor_snoop(struct vmw_surface *srf,
 int vmw_kms_write_svga(struct vmw_private *vmw_priv,
 		       unsigned width, unsigned height, unsigned pitch,
 		       unsigned bpp, unsigned depth);
+void vmw_kms_idle_workqueues(struct vmw_master *vmaster);
 bool vmw_kms_validate_mode_vram(struct vmw_private *dev_priv,
 				uint32_t pitch,
 				uint32_t height);
@@ -1410,25 +1338,6 @@ int vmw_host_log(const char *log);
 #define VMW_DEBUG_USER(fmt, ...)                                              \
 	DRM_DEBUG_DRIVER(fmt, ##__VA_ARGS__)
 
-/**
- * VMW_DEBUG_KMS - Debug output for kernel mode-setting
- *
- * This macro is for debugging vmwgfx mode-setting code.
- */
-#define VMW_DEBUG_KMS(fmt, ...)                                               \
-	DRM_DEBUG_DRIVER(fmt, ##__VA_ARGS__)
-
-/* Resource dirtying - vmwgfx_page_dirty.c */
-void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo);
-int vmw_bo_dirty_add(struct vmw_buffer_object *vbo);
-void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res);
-void vmw_bo_dirty_clear_res(struct vmw_resource *res);
-void vmw_bo_dirty_release(struct vmw_buffer_object *vbo);
-void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
-			pgoff_t start, pgoff_t end);
-vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
-vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
-
 /**
  * Inline helper functions
  */
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index 319c1ca35663..33533d126277 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -2560,6 +2560,7 @@ static int vmw_cmd_dx_check_subresource(struct vmw_private *dev_priv,
 		     offsetof(typeof(*cmd), sid));
 
 	cmd = container_of(header, typeof(*cmd), header);
+
 	return vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface,
 				 VMW_RES_DIRTY_NONE, user_surface_converter,
 				 &cmd->sid, NULL);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
index e7222fa2cfdf..b97bc8e5944b 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
@@ -1462,7 +1462,7 @@ static int vmw_kms_check_display_memory(struct drm_device *dev,
 		if (dev_priv->active_display_unit == vmw_du_screen_target &&
 		    (drm_rect_width(&rects[i]) > dev_priv->stdu_max_width ||
 		     drm_rect_height(&rects[i]) > dev_priv->stdu_max_height)) {
-			VMW_DEBUG_KMS("Screen size not supported.\n");
+			DRM_ERROR("Screen size not supported.\n");
 			return -EINVAL;
 		}
 
@@ -1486,7 +1486,7 @@ static int vmw_kms_check_display_memory(struct drm_device *dev,
 	 * limit on primary bounding box
 	 */
 	if (pixel_mem > dev_priv->prim_bb_mem) {
-		VMW_DEBUG_KMS("Combined output size too large.\n");
+		DRM_ERROR("Combined output size too large.\n");
 		return -EINVAL;
 	}
 
@@ -1496,7 +1496,7 @@ static int vmw_kms_check_display_memory(struct drm_device *dev,
 		bb_mem = (u64) bounding_box.x2 * bounding_box.y2 * 4;
 
 		if (bb_mem > dev_priv->prim_bb_mem) {
-			VMW_DEBUG_KMS("Topology is beyond supported limits.\n");
+			DRM_ERROR("Topology is beyond supported limits.\n");
 			return -EINVAL;
 		}
 	}
@@ -1645,7 +1645,6 @@ static int vmw_kms_check_topology(struct drm_device *dev,
 		struct vmw_connector_state *vmw_conn_state;
 
 		if (!du->pref_active && new_crtc_state->enable) {
-			VMW_DEBUG_KMS("Enabling a disabled display unit\n");
 			ret = -EINVAL;
 			goto clean;
 		}
@@ -1702,10 +1701,8 @@ vmw_kms_atomic_check_modeset(struct drm_device *dev,
 		return ret;
 
 	ret = vmw_kms_check_implicit(dev, state);
-	if (ret) {
-		VMW_DEBUG_KMS("Invalid implicit state\n");
+	if (ret)
 		return ret;
-	}
 
 	if (!state->allow_modeset)
 		return ret;
@@ -2350,9 +2347,6 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data,
 
 	if (!arg->num_outputs) {
 		struct drm_rect def_rect = {0, 0, 800, 600};
-		VMW_DEBUG_KMS("Default layout x1 = %d y1 = %d x2 = %d y2 = %d\n",
-			      def_rect.x1, def_rect.y1,
-			      def_rect.x2, def_rect.y2);
 		vmw_du_update_layout(dev_priv, 1, &def_rect);
 		return 0;
 	}
@@ -2373,7 +2367,6 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data,
 
 	drm_rects = (struct drm_rect *)rects;
 
-	VMW_DEBUG_KMS("Layout count = %u\n", arg->num_outputs);
 	for (i = 0; i < arg->num_outputs; i++) {
 		struct drm_vmw_rect curr_rect;
 
@@ -2390,10 +2383,6 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data,
 		drm_rects[i].x2 = curr_rect.x + curr_rect.w;
 		drm_rects[i].y2 = curr_rect.y + curr_rect.h;
 
-		VMW_DEBUG_KMS("  x1 = %d y1 = %d x2 = %d y2 = %d\n",
-			      drm_rects[i].x1, drm_rects[i].y1,
-			      drm_rects[i].x2, drm_rects[i].y2);
-
 		/*
 		 * Currently this check is limiting the topology within
 		 * mode_config->max (which actually is max texture size
@@ -2404,9 +2393,7 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data,
 		if (drm_rects[i].x1 < 0 ||  drm_rects[i].y1 < 0 ||
 		    drm_rects[i].x2 > mode_config->max_width ||
 		    drm_rects[i].y2 > mode_config->max_height) {
-			VMW_DEBUG_KMS("Invalid layout %d %d %d %d\n",
-				      drm_rects[i].x1, drm_rects[i].y1,
-				      drm_rects[i].x2, drm_rects[i].y2);
+			DRM_ERROR("Invalid GUI layout.\n");
 			ret = -EINVAL;
 			goto out_free;
 		}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
deleted file mode 100644
index 730c51e397dd..000000000000
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
+++ /dev/null
@@ -1,472 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/**************************************************************************
- *
- * Copyright 2019 VMware, Inc., Palo Alto, CA., USA
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-#include "vmwgfx_drv.h"
-
-/*
- * Different methods for tracking dirty:
- * VMW_BO_DIRTY_PAGETABLE - Scan the pagetable for hardware dirty bits
- * VMW_BO_DIRTY_MKWRITE - Write-protect page table entries and record write-
- * accesses in the VM mkwrite() callback
- */
-enum vmw_bo_dirty_method {
-	VMW_BO_DIRTY_PAGETABLE,
-	VMW_BO_DIRTY_MKWRITE,
-};
-
-/*
- * No dirtied pages at scan trigger a transition to the _MKWRITE method,
- * similarly a certain percentage of dirty pages trigger a transition to
- * the _PAGETABLE method. How many triggers should we wait for before
- * changing method?
- */
-#define VMW_DIRTY_NUM_CHANGE_TRIGGERS 2
-
-/* Percentage to trigger a transition to the _PAGETABLE method */
-#define VMW_DIRTY_PERCENTAGE 10
-
-/**
- * struct vmw_bo_dirty - Dirty information for buffer objects
- * @start: First currently dirty bit
- * @end: Last currently dirty bit + 1
- * @method: The currently used dirty method
- * @change_count: Number of consecutive method change triggers
- * @ref_count: Reference count for this structure
- * @bitmap_size: The size of the bitmap in bits. Typically equal to the
- * nuber of pages in the bo.
- * @size: The accounting size for this struct.
- * @bitmap: A bitmap where each bit represents a page. A set bit means a
- * dirty page.
- */
-struct vmw_bo_dirty {
-	unsigned long start;
-	unsigned long end;
-	enum vmw_bo_dirty_method method;
-	unsigned int change_count;
-	unsigned int ref_count;
-	unsigned long bitmap_size;
-	size_t size;
-	unsigned long bitmap[0];
-};
-
-/**
- * vmw_bo_dirty_scan_pagetable - Perform a pagetable scan for dirty bits
- * @vbo: The buffer object to scan
- *
- * Scans the pagetable for dirty bits. Clear those bits and modify the
- * dirty structure with the results. This function may change the
- * dirty-tracking method.
- */
-static void vmw_bo_dirty_scan_pagetable(struct vmw_buffer_object *vbo)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-	pgoff_t offset = drm_vma_node_start(&vbo->base.vma_node);
-	struct address_space *mapping = vbo->base.bdev->dev_mapping;
-	pgoff_t num_marked;
-
-	num_marked = apply_as_clean(mapping,
-				    offset, dirty->bitmap_size,
-				    offset, &dirty->bitmap[0],
-				    &dirty->start, &dirty->end);
-	if (num_marked == 0)
-		dirty->change_count++;
-	else
-		dirty->change_count = 0;
-
-	if (dirty->change_count > VMW_DIRTY_NUM_CHANGE_TRIGGERS) {
-		dirty->change_count = 0;
-		dirty->method = VMW_BO_DIRTY_MKWRITE;
-		apply_as_wrprotect(mapping,
-				   offset, dirty->bitmap_size);
-		apply_as_clean(mapping,
-			       offset, dirty->bitmap_size,
-			       offset, &dirty->bitmap[0],
-			       &dirty->start, &dirty->end);
-	}
-}
-
-/**
- * vmw_bo_dirty_scan_mkwrite - Reset the mkwrite dirty-tracking method
- * @vbo: The buffer object to scan
- *
- * Write-protect pages written to so that consecutive write accesses will
- * trigger a call to mkwrite.
- *
- * This function may change the dirty-tracking method.
- */
-static void vmw_bo_dirty_scan_mkwrite(struct vmw_buffer_object *vbo)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-	unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
-	struct address_space *mapping = vbo->base.bdev->dev_mapping;
-	pgoff_t num_marked;
-
-	if (dirty->end <= dirty->start)
-		return;
-
-	num_marked = apply_as_wrprotect(vbo->base.bdev->dev_mapping,
-					dirty->start + offset,
-					dirty->end - dirty->start);
-
-	if (100UL * num_marked / dirty->bitmap_size >
-	    VMW_DIRTY_PERCENTAGE) {
-		dirty->change_count++;
-	} else {
-		dirty->change_count = 0;
-	}
-
-	if (dirty->change_count > VMW_DIRTY_NUM_CHANGE_TRIGGERS) {
-		pgoff_t start = 0;
-		pgoff_t end = dirty->bitmap_size;
-
-		dirty->method = VMW_BO_DIRTY_PAGETABLE;
-		apply_as_clean(mapping, offset, end, offset, &dirty->bitmap[0],
-			       &start, &end);
-		bitmap_clear(&dirty->bitmap[0], 0, dirty->bitmap_size);
-		if (dirty->start < dirty->end)
-			bitmap_set(&dirty->bitmap[0], dirty->start,
-				   dirty->end - dirty->start);
-		dirty->change_count = 0;
-	}
-}
-
-/**
- * vmw_bo_dirty_scan - Scan for dirty pages and add them to the dirty
- * tracking structure
- * @vbo: The buffer object to scan
- *
- * This function may change the dirty tracking method.
- */
-void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-
-	if (dirty->method == VMW_BO_DIRTY_PAGETABLE)
-		vmw_bo_dirty_scan_pagetable(vbo);
-	else
-		vmw_bo_dirty_scan_mkwrite(vbo);
-}
-
-/**
- * vmw_bo_dirty_pre_unmap - write-protect and pick up dirty pages before
- * an unmap_mapping_range operation.
- * @vbo: The buffer object,
- * @start: First page of the range within the buffer object.
- * @end: Last page of the range within the buffer object + 1.
- *
- * If we're using the _PAGETABLE scan method, we may leak dirty pages
- * when calling unmap_mapping_range(). This function makes sure we pick
- * up all dirty pages.
- */
-static void vmw_bo_dirty_pre_unmap(struct vmw_buffer_object *vbo,
-				   pgoff_t start, pgoff_t end)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-	unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
-	struct address_space *mapping = vbo->base.bdev->dev_mapping;
-
-	if (dirty->method != VMW_BO_DIRTY_PAGETABLE || start >= end)
-		return;
-
-	apply_as_wrprotect(mapping, start + offset, end - start);
-	apply_as_clean(mapping, start + offset, end - start, offset,
-		       &dirty->bitmap[0], &dirty->start, &dirty->end);
-}
-
-/**
- * vmw_bo_dirty_unmap - Clear all ptes pointing to a range within a bo
- * @vbo: The buffer object,
- * @start: First page of the range within the buffer object.
- * @end: Last page of the range within the buffer object + 1.
- *
- * This is similar to ttm_bo_unmap_virtual_locked() except it takes a subrange.
- */
-void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
-			pgoff_t start, pgoff_t end)
-{
-	unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
-	struct address_space *mapping = vbo->base.bdev->dev_mapping;
-
-	vmw_bo_dirty_pre_unmap(vbo, start, end);
-	unmap_shared_mapping_range(mapping, (offset + start) << PAGE_SHIFT,
-				   (loff_t) (end - start) << PAGE_SHIFT);
-}
-
-/**
- * vmw_bo_dirty_add - Add a dirty-tracking user to a buffer object
- * @vbo: The buffer object
- *
- * This function registers a dirty-tracking user to a buffer object.
- * A user can be for example a resource or a vma in a special user-space
- * mapping.
- *
- * Return: Zero on success, -ENOMEM on memory allocation failure.
- */
-int vmw_bo_dirty_add(struct vmw_buffer_object *vbo)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-	pgoff_t num_pages = vbo->base.num_pages;
-	size_t size, acc_size;
-	int ret;
-	static struct ttm_operation_ctx ctx = {
-		.interruptible = false,
-		.no_wait_gpu = false
-	};
-
-	if (dirty) {
-		dirty->ref_count++;
-		return 0;
-	}
-
-	size = sizeof(*dirty) + BITS_TO_LONGS(num_pages) * sizeof(long);
-	acc_size = ttm_round_pot(size);
-	ret = ttm_mem_global_alloc(&ttm_mem_glob, acc_size, &ctx);
-	if (ret) {
-		VMW_DEBUG_USER("Out of graphics memory for buffer object "
-			       "dirty tracker.\n");
-		return ret;
-	}
-	dirty = kvzalloc(size, GFP_KERNEL);
-	if (!dirty) {
-		ret = -ENOMEM;
-		goto out_no_dirty;
-	}
-
-	dirty->size = acc_size;
-	dirty->bitmap_size = num_pages;
-	dirty->start = dirty->bitmap_size;
-	dirty->end = 0;
-	dirty->ref_count = 1;
-	if (num_pages < PAGE_SIZE / sizeof(pte_t)) {
-		dirty->method = VMW_BO_DIRTY_PAGETABLE;
-	} else {
-		struct address_space *mapping = vbo->base.bdev->dev_mapping;
-		pgoff_t offset = drm_vma_node_start(&vbo->base.vma_node);
-
-		dirty->method = VMW_BO_DIRTY_MKWRITE;
-
-		/* Write-protect and then pick up already dirty bits */
-		apply_as_wrprotect(mapping, offset, num_pages);
-		apply_as_clean(mapping, offset, num_pages, offset,
-			       &dirty->bitmap[0], &dirty->start, &dirty->end);
-	}
-
-	vbo->dirty = dirty;
-
-	return 0;
-
-out_no_dirty:
-	ttm_mem_global_free(&ttm_mem_glob, acc_size);
-	return ret;
-}
-
-/**
- * vmw_bo_dirty_release - Release a dirty-tracking user from a buffer object
- * @vbo: The buffer object
- *
- * This function releases a dirty-tracking user from a buffer object.
- * If the reference count reaches zero, then the dirty-tracking object is
- * freed and the pointer to it cleared.
- *
- * Return: Zero on success, -ENOMEM on memory allocation failure.
- */
-void vmw_bo_dirty_release(struct vmw_buffer_object *vbo)
-{
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-
-	if (dirty && --dirty->ref_count == 0) {
-		size_t acc_size = dirty->size;
-
-		kvfree(dirty);
-		ttm_mem_global_free(&ttm_mem_glob, acc_size);
-		vbo->dirty = NULL;
-	}
-}
-
-/**
- * vmw_bo_dirty_transfer_to_res - Pick up a resource's dirty region from
- * its backing mob.
- * @res: The resource
- *
- * This function will pick up all dirty ranges affecting the resource from
- * it's backup mob, and call vmw_resource_dirty_update() once for each
- * range. The transferred ranges will be cleared from the backing mob's
- * dirty tracking.
- */
-void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res)
-{
-	struct vmw_buffer_object *vbo = res->backup;
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-	pgoff_t start, cur, end;
-	unsigned long res_start = res->backup_offset;
-	unsigned long res_end = res->backup_offset + res->backup_size;
-
-	WARN_ON_ONCE(res_start & ~PAGE_MASK);
-	res_start >>= PAGE_SHIFT;
-	res_end = DIV_ROUND_UP(res_end, PAGE_SIZE);
-
-	if (res_start >= dirty->end || res_end <= dirty->start)
-		return;
-
-	cur = max(res_start, dirty->start);
-	res_end = max(res_end, dirty->end);
-	while (cur < res_end) {
-		unsigned long num;
-
-		start = find_next_bit(&dirty->bitmap[0], res_end, cur);
-		if (start >= res_end)
-			break;
-
-		end = find_next_zero_bit(&dirty->bitmap[0], res_end, start + 1);
-		cur = end + 1;
-		num = end - start;
-		bitmap_clear(&dirty->bitmap[0], start, num);
-		vmw_resource_dirty_update(res, start, end);
-	}
-
-	if (res_start <= dirty->start && res_end > dirty->start)
-		dirty->start = res_end;
-	if (res_start < dirty->end && res_end >= dirty->end)
-		dirty->end = res_start;
-}
-
-/**
- * vmw_bo_dirty_clear_res - Clear a resource's dirty region from
- * its backing mob.
- * @res: The resource
- *
- * This function will clear all dirty ranges affecting the resource from
- * it's backup mob's dirty tracking.
- */
-void vmw_bo_dirty_clear_res(struct vmw_resource *res)
-{
-	unsigned long res_start = res->backup_offset;
-	unsigned long res_end = res->backup_offset + res->backup_size;
-	struct vmw_buffer_object *vbo = res->backup;
-	struct vmw_bo_dirty *dirty = vbo->dirty;
-
-	res_start >>= PAGE_SHIFT;
-	res_end = DIV_ROUND_UP(res_end, PAGE_SIZE);
-
-	if (res_start >= dirty->end || res_end <= dirty->start)
-		return;
-
-	res_start = max(res_start, dirty->start);
-	res_end = min(res_end, dirty->end);
-	bitmap_clear(&dirty->bitmap[0], res_start, res_end - res_start);
-
-	if (res_start <= dirty->start && res_end > dirty->start)
-		dirty->start = res_end;
-	if (res_start < dirty->end && res_end >= dirty->end)
-		dirty->end = res_start;
-}
-
-vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
-	    vma->vm_private_data;
-	vm_fault_t ret;
-	unsigned long page_offset;
-	struct vmw_buffer_object *vbo =
-		container_of(bo, typeof(*vbo), base);
-
-	ret = ttm_bo_vm_reserve(bo, vmf);
-	if (ret)
-		return ret;
-
-	page_offset = vmf->pgoff - drm_vma_node_start(&bo->vma_node);
-	if (unlikely(page_offset >= bo->num_pages)) {
-		ret = VM_FAULT_SIGBUS;
-		goto out_unlock;
-	}
-
-	if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE &&
-	    !test_bit(page_offset, &vbo->dirty->bitmap[0])) {
-		struct vmw_bo_dirty *dirty = vbo->dirty;
-
-		__set_bit(page_offset, &dirty->bitmap[0]);
-		dirty->start = min(dirty->start, page_offset);
-		dirty->end = max(dirty->end, page_offset + 1);
-	}
-
-out_unlock:
-	reservation_object_unlock(bo->resv);
-	return ret;
-}
-
-vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
-	    vma->vm_private_data;
-	struct vmw_buffer_object *vbo =
-		container_of(bo, struct vmw_buffer_object, base);
-	pgoff_t num_prefault;
-	pgprot_t prot;
-	vm_fault_t ret;
-
-	ret = ttm_bo_vm_reserve(bo, vmf);
-	if (ret)
-		return ret;
-
-	num_prefault = (vma->vm_flags & VM_RAND_READ) ? 1 :
-		TTM_BO_VM_NUM_PREFAULT;
-
-	if (vbo->dirty) {
-		pgoff_t allowed_prefault;
-		unsigned long page_offset;
-
-		page_offset = vmf->pgoff - drm_vma_node_start(&bo->vma_node);
-		if (page_offset >= bo->num_pages ||
-		    vmw_resources_clean(vbo, page_offset,
-					page_offset + PAGE_SIZE,
-					&allowed_prefault)) {
-			ret = VM_FAULT_SIGBUS;
-			goto out_unlock;
-		}
-
-		num_prefault = min(num_prefault, allowed_prefault);
-	}
-
-	/*
-	 * If we don't track dirty using the MKWRITE method, make sure
-	 * sure the page protection is write-enabled so we don't get
-	 * a lot of unnecessary write faults.
-	 */
-	if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
-		prot = vma->vm_page_prot;
-	else
-		prot = vm_get_page_prot(vma->vm_flags);
-
-	ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
-	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
-		return ret;
-
-out_unlock:
-	reservation_object_unlock(bo->resv);
-	return ret;
-}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
index d70ee0df5c13..1d38a8b2f2ec 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
@@ -34,51 +34,6 @@
 
 #define VMW_RES_EVICT_ERR_COUNT 10
 
-/**
- * vmw_resource_mob_attach - Mark a resource as attached to its backing mob
- * @res: The resource
- */
-void vmw_resource_mob_attach(struct vmw_resource *res)
-{
-	struct vmw_buffer_object *backup = res->backup;
-	struct rb_node **new = &backup->res_tree.rb_node, *parent = NULL;
-
-	lockdep_assert_held(&backup->base.resv->lock.base);
-	res->used_prio = (res->res_dirty) ? res->func->dirty_prio :
-		res->func->prio;
-
-	while (*new) {
-		struct vmw_resource *this =
-			container_of(*new, struct vmw_resource, mob_node);
-
-		parent = *new;
-		new = (res->backup_offset < this->backup_offset) ?
-			&((*new)->rb_left) : &((*new)->rb_right);
-	}
-
-	rb_link_node(&res->mob_node, parent, new);
-	rb_insert_color(&res->mob_node, &backup->res_tree);
-
-	vmw_bo_prio_add(backup, res->used_prio);
-}
-
-/**
- * vmw_resource_mob_detach - Mark a resource as detached from its backing mob
- * @res: The resource
- */
-void vmw_resource_mob_detach(struct vmw_resource *res)
-{
-	struct vmw_buffer_object *backup = res->backup;
-
-	lockdep_assert_held(&backup->base.resv->lock.base);
-	if (vmw_resource_mob_attached(res)) {
-		rb_erase(&res->mob_node, &backup->res_tree);
-		RB_CLEAR_NODE(&res->mob_node);
-		vmw_bo_prio_del(backup, res->used_prio);
-	}
-}
-
-
 struct vmw_resource *vmw_resource_reference(struct vmw_resource *res)
 {
 	kref_get(&res->kref);
@@ -125,7 +80,7 @@ static void vmw_resource_release(struct kref *kref)
 		struct ttm_buffer_object *bo = &res->backup->base;
 
 		ttm_bo_reserve(bo, false, false, NULL);
-		if (vmw_resource_mob_attached(res) &&
+		if (!list_empty(&res->mob_head) &&
 		    res->func->unbind != NULL) {
 			struct ttm_validate_buffer val_buf;
 
@@ -134,11 +89,7 @@ static void vmw_resource_release(struct kref *kref)
 			res->func->unbind(res, false, &val_buf);
 		}
 		res->backup_dirty = false;
-		vmw_resource_mob_detach(res);
-		if (res->dirty)
-			res->func->dirty_free(res);
-		if (res->coherent)
-			vmw_bo_dirty_release(res->backup);
+		list_del_init(&res->mob_head);
 		ttm_bo_unreserve(bo);
 		vmw_bo_unreference(&res->backup);
 	}
@@ -220,17 +171,14 @@ int vmw_resource_init(struct vmw_private *dev_priv, struct vmw_resource *res,
 	res->res_free = res_free;
 	res->dev_priv = dev_priv;
 	res->func = func;
-	RB_CLEAR_NODE(&res->mob_node);
 	INIT_LIST_HEAD(&res->lru_head);
+	INIT_LIST_HEAD(&res->mob_head);
 	INIT_LIST_HEAD(&res->binding_head);
 	res->id = -1;
 	res->backup = NULL;
 	res->backup_offset = 0;
 	res->backup_dirty = false;
 	res->res_dirty = false;
-	res->coherent = false;
-	res->used_prio = 3;
-	res->dirty = NULL;
 	if (delay_id)
 		return 0;
 	else
@@ -395,8 +343,7 @@ out_no_bo:
  * should be retried once resources have been freed up.
  */
 static int vmw_resource_do_validate(struct vmw_resource *res,
-				    struct ttm_validate_buffer *val_buf,
-				    bool dirtying)
+				    struct ttm_validate_buffer *val_buf)
 {
 	int ret = 0;
 	const struct vmw_res_func *func = res->func;
@@ -408,47 +355,14 @@ static int vmw_resource_do_validate(struct vmw_resource *res,
 	}
 
 	if (func->bind &&
-	    ((func->needs_backup && !vmw_resource_mob_attached(res) &&
+	    ((func->needs_backup && list_empty(&res->mob_head) &&
 	      val_buf->bo != NULL) ||
 	     (!func->needs_backup && val_buf->bo != NULL))) {
 		ret = func->bind(res, val_buf);
 		if (unlikely(ret != 0))
 			goto out_bind_failed;
 		if (func->needs_backup)
-			vmw_resource_mob_attach(res);
-	}
-
-	/*
-	 * Handle the case where the backup mob is marked coherent but
-	 * the resource isn't.
-	 */
-	if (func->dirty_alloc && vmw_resource_mob_attached(res) &&
-	    !res->coherent) {
-		if (res->backup->dirty && !res->dirty) {
-			ret = func->dirty_alloc(res);
-			if (ret)
-				return ret;
-		} else if (!res->backup->dirty && res->dirty) {
-			func->dirty_free(res);
-		}
-	}
-
-	/*
-	 * Transfer the dirty regions to the resource and update
-	 * the resource.
-	 */
-	if (res->dirty) {
-		if (dirtying && !res->res_dirty) {
-			pgoff_t start = res->backup_offset >> PAGE_SHIFT;
-			pgoff_t end = __KERNEL_DIV_ROUND_UP
-				(res->backup_offset + res->backup_size,
-				 PAGE_SIZE);
-
-			vmw_bo_dirty_unmap(res->backup, start, end);
-		}
-
-		vmw_bo_dirty_transfer_to_res(res);
-		return func->dirty_sync(res);
+			list_add_tail(&res->mob_head, &res->backup->res_list);
 	}
 
 	return 0;
@@ -488,29 +402,19 @@ void vmw_resource_unreserve(struct vmw_resource *res,
 
 	if (switch_backup && new_backup != res->backup) {
 		if (res->backup) {
-			vmw_resource_mob_detach(res);
-			if (res->coherent)
-				vmw_bo_dirty_release(res->backup);
+			lockdep_assert_held(&res->backup->base.resv->lock.base);
+			list_del_init(&res->mob_head);
 			vmw_bo_unreference(&res->backup);
 		}
 
 		if (new_backup) {
 			res->backup = vmw_bo_reference(new_backup);
-
-			/*
-			 * The validation code should already have added a
-			 * dirty tracker here.
-			 */
-			WARN_ON(res->coherent && !new_backup->dirty);
-
-			vmw_resource_mob_attach(res);
+			lockdep_assert_held(&new_backup->base.resv->lock.base);
+			list_add_tail(&res->mob_head, &new_backup->res_list);
 		} else {
 			res->backup = NULL;
 		}
-	} else if (switch_backup && res->coherent) {
-		vmw_bo_dirty_release(res->backup);
 	}
-
 	if (switch_backup)
 		res->backup_offset = new_backup_offset;
 
@@ -565,7 +469,7 @@ vmw_resource_check_buffer(struct ww_acquire_ctx *ticket,
 	if (unlikely(ret != 0))
 		goto out_no_reserve;
 
-	if (res->func->needs_backup && !vmw_resource_mob_attached(res))
+	if (res->func->needs_backup && list_empty(&res->mob_head))
 		return 0;
 
 	backup_dirty = res->backup_dirty;
@@ -670,11 +574,11 @@ static int vmw_resource_do_evict(struct ww_acquire_ctx *ticket,
 		return ret;
 
 	if (unlikely(func->unbind != NULL &&
-		     (!func->needs_backup || vmw_resource_mob_attached(res)))) {
+		     (!func->needs_backup || !list_empty(&res->mob_head)))) {
 		ret = func->unbind(res, res->res_dirty, &val_buf);
 		if (unlikely(ret != 0))
 			goto out_no_unbind;
-		vmw_resource_mob_detach(res);
+		list_del_init(&res->mob_head);
 	}
 	ret = func->destroy(res);
 	res->backup_dirty = true;
@@ -691,7 +595,6 @@ out_no_unbind:
  *                         to the device.
  * @res: The resource to make visible to the device.
  * @intr: Perform waits interruptible if possible.
- * @dirtying: Pending GPU operation will dirty the resource
  *
  * On succesful return, any backup DMA buffer pointed to by @res->backup will
  * be reserved and validated.
@@ -701,8 +604,7 @@ out_no_unbind:
  * Return: Zero on success, -ERESTARTSYS if interrupted, negative error code
  * on failure.
  */
-int vmw_resource_validate(struct vmw_resource *res, bool intr,
-			  bool dirtying)
+int vmw_resource_validate(struct vmw_resource *res, bool intr)
 {
 	int ret;
 	struct vmw_resource *evict_res;
@@ -719,7 +621,7 @@ int vmw_resource_validate(struct vmw_resource *res, bool intr,
 	if (res->backup)
 		val_buf.bo = &res->backup->base;
 	do {
-		ret = vmw_resource_do_validate(res, &val_buf, dirtying);
+		ret = vmw_resource_do_validate(res, &val_buf);
 		if (likely(ret != -EBUSY))
 			break;
 
@@ -758,7 +660,7 @@ int vmw_resource_validate(struct vmw_resource *res, bool intr,
 	if (unlikely(ret != 0))
 		goto out_no_validate;
 	else if (!res->func->needs_backup && res->backup) {
-		WARN_ON_ONCE(vmw_resource_mob_attached(res));
+		list_del_init(&res->mob_head);
 		vmw_bo_unreference(&res->backup);
 	}
 
@@ -782,23 +684,22 @@ out_no_validate:
  */
 void vmw_resource_unbind_list(struct vmw_buffer_object *vbo)
 {
+
+	struct vmw_resource *res, *next;
 	struct ttm_validate_buffer val_buf = {
 		.bo = &vbo->base,
 		.num_shared = 0
 	};
 
 	lockdep_assert_held(&vbo->base.resv->lock.base);
-	while (!RB_EMPTY_ROOT(&vbo->res_tree)) {
-		struct rb_node *node = vbo->res_tree.rb_node;
-		struct vmw_resource *res =
-			container_of(node, struct vmw_resource, mob_node);
-
-		if (!WARN_ON_ONCE(!res->func->unbind))
-			(void) res->func->unbind(res, res->res_dirty, &val_buf);
+	list_for_each_entry_safe(res, next, &vbo->res_list, mob_head) {
+		if (!res->func->unbind)
+			continue;
 
+		(void) res->func->unbind(res, res->res_dirty, &val_buf);
 		res->backup_dirty = true;
 		res->res_dirty = false;
-		vmw_resource_mob_detach(res);
+		list_del_init(&res->mob_head);
 	}
 
 	(void) ttm_bo_wait(&vbo->base, false, false);
@@ -1019,7 +920,7 @@ int vmw_resource_pin(struct vmw_resource *res, bool interruptible)
 			/* Do we really need to pin the MOB as well? */
 			vmw_bo_pin_reserved(vbo, true);
 		}
-		ret = vmw_resource_validate(res, interruptible, true);
+		ret = vmw_resource_validate(res, interruptible);
 		if (vbo)
 			ttm_bo_unreserve(&vbo->base);
 		if (ret)
@@ -1079,101 +980,3 @@ enum vmw_res_type vmw_res_type(const struct vmw_resource *res)
 {
 	return res->func->res_type;
 }
-
-/**
- * vmw_resource_update_dirty - Update a resource's dirty tracker with a
- * sequential range of touched backing store memory.
- * @res: The resource.
- * @start: The first page touched.
- * @end: The last page touched + 1.
- */
-void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t start,
-			       pgoff_t end)
-{
-	if (res->dirty)
-		res->func->dirty_range_add(res, start << PAGE_SHIFT,
-					   end << PAGE_SHIFT);
-}
-
-/**
- * vmw_resources_clean - Clean resources intersecting a mob range
- * @vbo: The mob buffer object
- * @start: The mob page offset starting the range
- * @end: The mob page offset ending the range
- * @num_prefault: Returns how many pages including the first have been
- * cleaned and are ok to prefault
- */
-int vmw_resources_clean(struct vmw_buffer_object *vbo, pgoff_t start,
-			pgoff_t end, pgoff_t *num_prefault)
-{
-	struct rb_node *cur = vbo->res_tree.rb_node;
-	struct vmw_resource *found = NULL;
-	unsigned long res_start = start << PAGE_SHIFT;
-	unsigned long res_end = end << PAGE_SHIFT;
-	unsigned long last_cleaned = 0;
-
-	/*
-	 * Find the resource with lowest backup_offset that intersects the
-	 * range.
-	 */
-	while (cur) {
-		struct vmw_resource *cur_res =
-			container_of(cur, struct vmw_resource, mob_node);
-
-		if (cur_res->backup_offset >= res_end) {
-			cur = cur->rb_left;
-		} else if (cur_res->backup_offset + cur_res->backup_size <=
-			   res_start) {
-			cur = cur->rb_right;
-		} else {
-			found = cur_res;
-			cur = cur->rb_left;
-			/* Continue to look for resources with lower offsets */
-		}
-	}
-
-	/*
-	 * In order of increasing backup_offset, clean dirty resorces
-	 * intersecting the range.
-	 */
-	while (found) {
-		if (found->res_dirty) {
-			int ret;
-
-			if (!found->func->clean)
-				return -EINVAL;
-
-			ret = found->func->clean(found);
-			if (ret)
-				return ret;
-
-			found->res_dirty = false;
-		}
-		last_cleaned = found->backup_offset + found->backup_size;
-		cur = rb_next(&found->mob_node);
-		if (!cur)
-			break;
-
-		found = container_of(cur, struct vmw_resource, mob_node);
-		if (found->backup_offset >= res_end)
-			break;
-	}
-
-	/*
-	 * Set number of pages allowed prefaulting and fence the buffer object
-	 */
-	*num_prefault = 1;
-	if (last_cleaned > res_start) {
-		struct ttm_buffer_object *bo = &vbo->base;
-
-		*num_prefault = __KERNEL_DIV_ROUND_UP(last_cleaned - res_start,
-						      PAGE_SIZE);
-		vmw_bo_fence_single(bo, NULL);
-		if (bo->moving)
-			dma_fence_put(bo->moving);
-		bo->moving = dma_fence_get
-			(reservation_object_get_excl(bo->resv));
-	}
-
-	return 0;
-}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
index 3b7438b2d289..7e19eba0b0b8 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
@@ -71,13 +71,6 @@ struct vmw_user_resource_conv {
  * @commit_notify:     If the resource is a command buffer managed resource,
  *                     callback to notify that a define or remove command
  *                     has been committed to the device.
- * @dirty_alloc:       Allocate a dirty tracker. NULL if dirty-tracking is not
- *                     supported.
- * @dirty_free:        Free the dirty tracker.
- * @dirty_sync:        Upload the dirty mob contents to the resource.
- * @dirty_add_range:   Add a sequential dirty range to the resource
- *                     dirty tracker.
- * @clean:             Clean the resource.
  */
 struct vmw_res_func {
 	enum vmw_res_type res_type;
@@ -85,8 +78,6 @@ struct vmw_res_func {
 	const char *type_name;
 	struct ttm_placement *backup_placement;
 	bool may_evict;
-	u32 prio;
-	u32 dirty_prio;
 
 	int (*create) (struct vmw_resource *res);
 	int (*destroy) (struct vmw_resource *res);
@@ -97,12 +88,6 @@ struct vmw_res_func {
 		       struct ttm_validate_buffer *val_buf);
 	void (*commit_notify)(struct vmw_resource *res,
 			      enum vmw_cmdbuf_res_state state);
-	int (*dirty_alloc)(struct vmw_resource *res);
-	void (*dirty_free)(struct vmw_resource *res);
-	int (*dirty_sync)(struct vmw_resource *res);
-	void (*dirty_range_add)(struct vmw_resource *res, size_t start,
-				 size_t end);
-	int (*clean)(struct vmw_resource *res);
 };
 
 /**
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c
index e139fdfd1635..d310d21f0d54 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c
@@ -95,8 +95,6 @@ static const struct vmw_res_func vmw_gb_shader_func = {
 	.res_type = vmw_res_shader,
 	.needs_backup = true,
 	.may_evict = true,
-	.prio = 3,
-	.dirty_prio = 3,
 	.type_name = "guest backed shaders",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_gb_shader_create,
@@ -108,9 +106,7 @@ static const struct vmw_res_func vmw_gb_shader_func = {
 static const struct vmw_res_func vmw_dx_shader_func = {
 	.res_type = vmw_res_shader,
 	.needs_backup = true,
-	.may_evict = true,
-	.prio = 3,
-	.dirty_prio = 3,
+	.may_evict = false,
 	.type_name = "dx shaders",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_dx_shader_create,
@@ -427,7 +423,7 @@ static int vmw_dx_shader_create(struct vmw_resource *res)
 
 	WARN_ON_ONCE(!shader->committed);
 
-	if (vmw_resource_mob_attached(res)) {
+	if (!list_empty(&res->mob_head)) {
 		mutex_lock(&dev_priv->binding_mutex);
 		ret = vmw_dx_shader_unscrub(res);
 		mutex_unlock(&dev_priv->binding_mutex);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
index 862ca44680ca..219471903bc1 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
@@ -68,20 +68,6 @@ struct vmw_surface_offset {
 	uint32_t bo_offset;
 };
 
-/**
- * vmw_surface_dirty - Surface dirty-tracker
- * @cache: Cached layout information of the surface.
- * @size: Accounting size for the struct vmw_surface_dirty.
- * @num_subres: Number of subresources.
- * @boxes: Array of SVGA3dBoxes indicating dirty regions. One per subresource.
- */
-struct vmw_surface_dirty {
-	struct svga3dsurface_cache cache;
-	size_t size;
-	u32 num_subres;
-	SVGA3dBox boxes[0];
-};
-
 static void vmw_user_surface_free(struct vmw_resource *res);
 static struct vmw_resource *
 vmw_user_surface_base_to_res(struct ttm_base_object *base);
@@ -110,13 +96,6 @@ vmw_gb_surface_reference_internal(struct drm_device *dev,
 				  struct drm_vmw_gb_surface_ref_ext_rep *rep,
 				  struct drm_file *file_priv);
 
-static void vmw_surface_dirty_free(struct vmw_resource *res);
-static int vmw_surface_dirty_alloc(struct vmw_resource *res);
-static int vmw_surface_dirty_sync(struct vmw_resource *res);
-static void vmw_surface_dirty_range_add(struct vmw_resource *res, size_t start,
-					size_t end);
-static int vmw_surface_clean(struct vmw_resource *res);
-
 static const struct vmw_user_resource_conv user_surface_conv = {
 	.object_type = VMW_RES_SURFACE,
 	.base_obj_to_res = vmw_user_surface_base_to_res,
@@ -133,8 +112,6 @@ static const struct vmw_res_func vmw_legacy_surface_func = {
 	.res_type = vmw_res_surface,
 	.needs_backup = false,
 	.may_evict = true,
-	.prio = 1,
-	.dirty_prio = 1,
 	.type_name = "legacy surfaces",
 	.backup_placement = &vmw_srf_placement,
 	.create = &vmw_legacy_srf_create,
@@ -147,19 +124,12 @@ static const struct vmw_res_func vmw_gb_surface_func = {
 	.res_type = vmw_res_surface,
 	.needs_backup = true,
 	.may_evict = true,
-	.prio = 1,
-	.dirty_prio = 2,
 	.type_name = "guest backed surfaces",
 	.backup_placement = &vmw_mob_placement,
 	.create = vmw_gb_surface_create,
 	.destroy = vmw_gb_surface_destroy,
 	.bind = vmw_gb_surface_bind,
-	.unbind = vmw_gb_surface_unbind,
-	.dirty_alloc = vmw_surface_dirty_alloc,
-	.dirty_free = vmw_surface_dirty_free,
-	.dirty_sync = vmw_surface_dirty_sync,
-	.dirty_range_add = vmw_surface_dirty_range_add,
-	.clean = vmw_surface_clean,
+	.unbind = vmw_gb_surface_unbind
 };
 
 /**
@@ -667,7 +637,6 @@ static void vmw_user_surface_free(struct vmw_resource *res)
 	struct vmw_private *dev_priv = srf->res.dev_priv;
 	uint32_t size = user_srf->size;
 
-	WARN_ON_ONCE(res->dirty);
 	if (user_srf->master)
 		drm_master_put(&user_srf->master);
 	kfree(srf->offsets);
@@ -946,6 +915,12 @@ vmw_surface_handle_reference(struct vmw_private *dev_priv,
 		if (unlikely(drm_is_render_client(file_priv)))
 			require_exist = true;
 
+		if (READ_ONCE(vmw_fpriv(file_priv)->locked_master)) {
+			DRM_ERROR("Locked master refused legacy "
+				  "surface reference.\n");
+			return -EACCES;
+		}
+
 		handle = u_handle;
 	}
 
@@ -1195,16 +1170,10 @@ static int vmw_gb_surface_bind(struct vmw_resource *res,
 		cmd2->header.id = SVGA_3D_CMD_UPDATE_GB_SURFACE;
 		cmd2->header.size = sizeof(cmd2->body);
 		cmd2->body.sid = res->id;
+		res->backup_dirty = false;
 	}
 	vmw_fifo_commit(dev_priv, submit_size);
 
-	if (res->backup->dirty && res->backup_dirty) {
-		/* We've just made a full upload. Cear dirty regions. */
-		vmw_bo_dirty_clear_res(res);
-	}
-
-	res->backup_dirty = false;
-
 	return 0;
 }
 
@@ -1669,8 +1638,7 @@ vmw_gb_surface_define_internal(struct drm_device *dev,
 			}
 		}
 	} else if (req->base.drm_surface_flags &
-		   (drm_vmw_surface_flag_create_buffer |
-		    drm_vmw_surface_flag_coherent))
+		   drm_vmw_surface_flag_create_buffer)
 		ret = vmw_user_bo_alloc(dev_priv, tfile,
 					res->backup_size,
 					req->base.drm_surface_flags &
@@ -1684,26 +1652,6 @@ vmw_gb_surface_define_internal(struct drm_device *dev,
 		goto out_unlock;
 	}
 
-	if (req->base.drm_surface_flags & drm_vmw_surface_flag_coherent) {
-		struct vmw_buffer_object *backup = res->backup;
-
-		ttm_bo_reserve(&backup->base, false, false, NULL);
-		if (!res->func->dirty_alloc)
-			ret = -EINVAL;
-		if (!ret)
-			ret = vmw_bo_dirty_add(backup);
-		if (!ret) {
-			res->coherent = true;
-			ret = res->func->dirty_alloc(res);
-		}
-		ttm_bo_unreserve(&backup->base);
-		if (ret) {
-			vmw_resource_unreference(&res);
-			goto out_unlock;
-		}
-
-	}
-
 	tmp = vmw_resource_reference(res);
 	ret = ttm_prime_object_init(tfile, res->backup_size, &user_srf->prime,
 				    req->base.drm_surface_flags &
@@ -1812,338 +1760,3 @@ out_bad_resource:
 
 	return ret;
 }
-
-/**
- * vmw_subres_dirty_add - Add a dirty region to a subresource
- * @dirty: The surfaces's dirty tracker.
- * @loc_start: The location corresponding to the start of the region.
- * @loc_end: The location corresponding to the end of the region.
- *
- * As we are assuming that @loc_start and @loc_end represent a sequential
- * range of backing store memory, if the region spans multiple lines then
- * regardless of the x coordinate, the full lines are dirtied.
- * Correspondingly if the region spans multiple z slices, then full rather
- * than partial z slices are dirtied.
- */
-static void vmw_subres_dirty_add(struct vmw_surface_dirty *dirty,
-				 const struct svga3dsurface_loc *loc_start,
-				 const struct svga3dsurface_loc *loc_end)
-{
-	const struct svga3dsurface_cache *cache = &dirty->cache;
-	SVGA3dBox *box = &dirty->boxes[loc_start->sub_resource];
-	u32 mip = loc_start->sub_resource % cache->num_mip_levels;
-	const struct drm_vmw_size *size = &cache->mip[mip].size;
-	u32 box_c2 = box->z + box->d;
-
-	if (WARN_ON(loc_start->sub_resource >= dirty->num_subres))
-		return;
-
-	if (box->d == 0 || box->z > loc_start->z)
-		box->z = loc_start->z;
-	if (box_c2 < loc_end->z)
-		box->d = loc_end->z - box->z;
-
-	if (loc_start->z + 1 == loc_end->z) {
-		box_c2 = box->y + box->h;
-		if (box->h == 0 || box->y > loc_start->y)
-			box->y = loc_start->y;
-		if (box_c2 < loc_end->y)
-			box->h = loc_end->y - box->y;
-
-		if (loc_start->y + 1 == loc_end->y) {
-			box_c2 = box->x + box->w;
-			if (box->w == 0 || box->x > loc_start->x)
-				box->x = loc_start->x;
-			if (box_c2 < loc_end->x)
-				box->w = loc_end->x - box->x;
-		} else {
-			box->x = 0;
-			box->w = size->width;
-		}
-	} else {
-		box->y = 0;
-		box->h = size->height;
-		box->x = 0;
-		box->w = size->width;
-	}
-}
-
-/**
- * vmw_subres_dirty_full - Mark a full subresource as dirty
- * @dirty: The surface's dirty tracker.
- * @subres: The subresource
- */
-static void vmw_subres_dirty_full(struct vmw_surface_dirty *dirty, u32 subres)
-{
-	const struct svga3dsurface_cache *cache = &dirty->cache;
-	u32 mip = subres % cache->num_mip_levels;
-	const struct drm_vmw_size *size = &cache->mip[mip].size;
-	SVGA3dBox *box = &dirty->boxes[subres];
-
-	box->x = 0;
-	box->y = 0;
-	box->z = 0;
-	box->w = size->width;
-	box->h = size->height;
-	box->d = size->depth;
-}
-
-/*
- * vmw_surface_tex_dirty_add_range - The dirty_add_range callback for texture
- * surfaces.
- */
-static void vmw_surface_tex_dirty_range_add(struct vmw_resource *res,
-					    size_t start, size_t end)
-{
-	struct vmw_surface_dirty *dirty =
-		(struct vmw_surface_dirty *) res->dirty;
-	size_t backup_end = res->backup_offset + res->backup_size;
-	struct svga3dsurface_loc loc1, loc2;
-	const struct svga3dsurface_cache *cache;
-
-	start = max_t(size_t, start, res->backup_offset) - res->backup_offset;
-	end = min(end, backup_end) - res->backup_offset;
-	cache = &dirty->cache;
-	svga3dsurface_get_loc(cache, &loc1, start);
-	svga3dsurface_get_loc(cache, &loc2, end - 1);
-	svga3dsurface_inc_loc(cache, &loc2);
-
-	if (loc1.sub_resource + 1 == loc2.sub_resource) {
-		/* Dirty range covers a single sub-resource */
-		vmw_subres_dirty_add(dirty, &loc1, &loc2);
-	} else {
-		/* Dirty range covers multiple sub-resources */
-		struct svga3dsurface_loc loc_min, loc_max;
-		u32 sub_res = loc1.sub_resource;
-
-		svga3dsurface_max_loc(cache, loc1.sub_resource, &loc_max);
-		vmw_subres_dirty_add(dirty, &loc1, &loc_max);
-		svga3dsurface_min_loc(cache, loc2.sub_resource - 1, &loc_min);
-		vmw_subres_dirty_add(dirty, &loc_min, &loc2);
-		for (sub_res = loc1.sub_resource + 1;
-		     sub_res < loc2.sub_resource - 1; ++sub_res)
-			vmw_subres_dirty_full(dirty, sub_res);
-	}
-}
-
-/*
- * vmw_surface_tex_dirty_add_range - The dirty_add_range callback for buffer
- * surfaces.
- */
-static void vmw_surface_buf_dirty_range_add(struct vmw_resource *res,
-					    size_t start, size_t end)
-{
-	struct vmw_surface_dirty *dirty =
-		(struct vmw_surface_dirty *) res->dirty;
-	const struct svga3dsurface_cache *cache = &dirty->cache;
-	size_t backup_end = res->backup_offset + cache->mip_chain_bytes;
-	SVGA3dBox *box = &dirty->boxes[0];
-	u32 box_c2;
-
-	box->h = box->d = 1;
-	start = max_t(size_t, start, res->backup_offset) - res->backup_offset;
-	end = min(end, backup_end) - res->backup_offset;
-	box_c2 = box->x + box->w;
-	if (box->w == 0 || box->x > start)
-		box->x = start;
-	if (box_c2 < end)
-		box->w = end - box->x;
-}
-
-/*
- * vmw_surface_tex_dirty_add_range - The dirty_add_range callback for surfaces
- */
-static void vmw_surface_dirty_range_add(struct vmw_resource *res, size_t start,
-					size_t end)
-{
-	struct vmw_surface *srf = vmw_res_to_srf(res);
-
-	if (WARN_ON(end <= res->backup_offset ||
-		    start >= res->backup_offset + res->backup_size))
-		return;
-
-	if (srf->format == SVGA3D_BUFFER)
-		vmw_surface_buf_dirty_range_add(res, start, end);
-	else
-		vmw_surface_tex_dirty_range_add(res, start, end);
-}
-
-/*
- * vmw_surface_dirty_sync - The surface's dirty_sync callback.
- */
-static int vmw_surface_dirty_sync(struct vmw_resource *res)
-{
-	struct vmw_private *dev_priv = res->dev_priv;
-	bool has_dx = 0;
-	u32 i, num_dirty;
-	struct vmw_surface_dirty *dirty =
-		(struct vmw_surface_dirty *) res->dirty;
-	size_t alloc_size;
-	const struct svga3dsurface_cache *cache = &dirty->cache;
-	struct {
-		SVGA3dCmdHeader header;
-		SVGA3dCmdDXUpdateSubResource body;
-	} *cmd1;
-	struct {
-		SVGA3dCmdHeader header;
-		SVGA3dCmdUpdateGBImage body;
-	} *cmd2;
-	void *cmd;
-
-	num_dirty = 0;
-	for (i = 0; i < dirty->num_subres; ++i) {
-		const SVGA3dBox *box = &dirty->boxes[i];
-
-		if (box->d)
-			num_dirty++;
-	}
-
-	if (!num_dirty)
-		goto out;
-
-	alloc_size = num_dirty * ((has_dx) ? sizeof(*cmd1) : sizeof(*cmd2));
-	cmd = VMW_FIFO_RESERVE(dev_priv, alloc_size);
-	if (!cmd)
-		return -ENOMEM;
-
-	cmd1 = cmd;
-	cmd2 = cmd;
-
-	for (i = 0; i < dirty->num_subres; ++i) {
-		const SVGA3dBox *box = &dirty->boxes[i];
-
-		if (!box->d)
-			continue;
-
-		/*
-		 * DX_UPDATE_SUBRESOURCE is aware of array surfaces.
-		 * UPDATE_GB_IMAGE is not.
-		 */
-		if (has_dx) {
-			cmd1->header.id = SVGA_3D_CMD_DX_UPDATE_SUBRESOURCE;
-			cmd1->header.size = sizeof(cmd1->body);
-			cmd1->body.sid = res->id;
-			cmd1->body.subResource = i;
-			cmd1->body.box = *box;
-			cmd1++;
-		} else {
-			cmd2->header.id = SVGA_3D_CMD_UPDATE_GB_IMAGE;
-			cmd2->header.size = sizeof(cmd2->body);
-			cmd2->body.image.sid = res->id;
-			cmd2->body.image.face = i / cache->num_mip_levels;
-			cmd2->body.image.mipmap = i -
-				(cache->num_mip_levels * cmd2->body.image.face);
-			cmd2->body.box = *box;
-			cmd2++;
-		}
-
-	}
-	vmw_fifo_commit(dev_priv, alloc_size);
- out:
-	memset(&dirty->boxes[0], 0, sizeof(dirty->boxes[0]) *
-	       dirty->num_subres);
-
-	return 0;
-}
-
-/*
- * vmw_surface_dirty_alloc - The surface's dirty_alloc callback.
- */
-static int vmw_surface_dirty_alloc(struct vmw_resource *res)
-{
-	struct vmw_surface *srf = vmw_res_to_srf(res);
-	struct vmw_surface_dirty *dirty;
-	u32 num_layers = 1;
-	u32 num_mip;
-	u32 num_subres;
-	u32 num_samples;
-	size_t dirty_size, acc_size;
-	static struct ttm_operation_ctx ctx = {
-		.interruptible = false,
-		.no_wait_gpu = false
-	};
-	int ret;
-
-	if (srf->array_size)
-		num_layers = srf->array_size;
-	else if (srf->flags & SVGA3D_SURFACE_CUBEMAP)
-		num_layers *= SVGA3D_MAX_SURFACE_FACES;
-
-	num_mip = srf->mip_levels[0];
-	if (!num_mip)
-		num_mip = 1;
-
-	num_subres = num_layers * num_mip;
-	dirty_size = sizeof(*dirty) + num_subres * sizeof(dirty->boxes[0]);
-	acc_size = ttm_round_pot(dirty_size);
-	ret = ttm_mem_global_alloc(vmw_mem_glob(res->dev_priv),
-				   acc_size, &ctx);
-	if (ret) {
-		VMW_DEBUG_USER("Out of graphics memory for surface "
-			       "dirty tracker.\n");
-		return ret;
-	}
-
-	dirty = kvzalloc(dirty_size, GFP_KERNEL);
-	if (!dirty) {
-		ret = -ENOMEM;
-		goto out_no_dirty;
-	}
-
-	num_samples = max_t(u32, 1, srf->multisample_count);
-	ret = svga3dsurface_setup_cache(&srf->base_size, srf->format, num_mip,
-					num_layers, num_samples, &dirty->cache);
-	if (ret)
-		goto out_no_cache;
-
-	dirty->num_subres = num_subres;
-	dirty->size = acc_size;
-	res->dirty = (struct vmw_resource_dirty *) dirty;
-
-	return 0;
-
-out_no_cache:
-	kvfree(dirty);
-out_no_dirty:
-	ttm_mem_global_free(vmw_mem_glob(res->dev_priv), acc_size);
-	return ret;
-}
-
-/*
- * vmw_surface_dirty_free - The surface's dirty_free callback
- */
-static void vmw_surface_dirty_free(struct vmw_resource *res)
-{
-	struct vmw_surface_dirty *dirty =
-		(struct vmw_surface_dirty *) res->dirty;
-	size_t acc_size = dirty->size;
-
-	kvfree(dirty);
-	ttm_mem_global_free(vmw_mem_glob(res->dev_priv), acc_size);
-	res->dirty = NULL;
-}
-
-/*
- * vmw_surface_clean - The surface's clean callback
- */
-static int vmw_surface_clean(struct vmw_resource *res)
-{
-	struct vmw_private *dev_priv = res->dev_priv;
-	size_t alloc_size;
-	struct {
-		SVGA3dCmdHeader header;
-		SVGA3dCmdReadbackGBSurface body;
-	} *cmd;
-
-	alloc_size = sizeof(*cmd);
-	cmd = VMW_FIFO_RESERVE(dev_priv, alloc_size);
-	if (!cmd)
-		return -ENOMEM;
-
-	cmd->header.id = SVGA_3D_CMD_READBACK_GB_SURFACE;
-	cmd->header.size = sizeof(cmd->body);
-	cmd->body.sid = res->id;
-	vmw_fifo_commit(dev_priv, alloc_size);
-
-	return 0;
-}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
index 9aaf807ed73c..f611b2290a1b 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
@@ -33,8 +33,6 @@
  * struct vmw_validation_bo_node - Buffer object validation metadata.
  * @base: Metadata used for TTM reservation- and validation.
  * @hash: A hash entry used for the duplicate detection hash table.
- * @coherent_count: If switching backup buffers, number of new coherent
- * resources that will have this buffer as a backup buffer.
  * @as_mob: Validate as mob.
  * @cpu_blit: Validate for cpu blit access.
  *
@@ -44,7 +42,6 @@
 struct vmw_validation_bo_node {
 	struct ttm_validate_buffer base;
 	struct drm_hash_item hash;
-	unsigned int coherent_count;
 	u32 as_mob : 1;
 	u32 cpu_blit : 1;
 };
@@ -462,19 +459,6 @@ int vmw_validation_res_reserve(struct vmw_validation_context *ctx,
 			if (ret)
 				goto out_unreserve;
 		}
-
-		if (val->switching_backup && val->new_backup &&
-		    res->coherent) {
-			struct vmw_validation_bo_node *bo_node =
-				vmw_validation_find_bo_dup(ctx,
-							   val->new_backup);
-
-			if (WARN_ON(!bo_node)) {
-				ret = -EINVAL;
-				goto out_unreserve;
-			}
-			bo_node->coherent_count++;
-		}
 	}
 
 	return 0;
@@ -578,9 +562,6 @@ int vmw_validation_bo_validate(struct vmw_validation_context *ctx, bool intr)
 	int ret;
 
 	list_for_each_entry(entry, &ctx->bo_list, base.head) {
-		struct vmw_buffer_object *vbo =
-			container_of(entry->base.bo, typeof(*vbo), base);
-
 		if (entry->cpu_blit) {
 			struct ttm_operation_ctx ctx = {
 				.interruptible = intr,
@@ -595,27 +576,6 @@ int vmw_validation_bo_validate(struct vmw_validation_context *ctx, bool intr)
 		}
 		if (ret)
 			return ret;
-
-		/*
-		 * Rather than having the resource code allocating the bo
-		 * dirty tracker in resource_unreserve() where we can't fail,
-		 * Do it here when validating the buffer object.
-		 */
-		if (entry->coherent_count) {
-			unsigned int coherent_count = entry->coherent_count;
-
-			while (coherent_count) {
-				ret = vmw_bo_dirty_add(vbo);
-				if (ret)
-					return ret;
-
-				coherent_count--;
-			}
-			entry->coherent_count -= coherent_count;
-		}
-
-		if (vbo->dirty)
-			vmw_bo_dirty_scan(vbo);
 	}
 	return 0;
 }
@@ -641,8 +601,7 @@ int vmw_validation_res_validate(struct vmw_validation_context *ctx, bool intr)
 		struct vmw_resource *res = val->res;
 		struct vmw_buffer_object *backup = res->backup;
 
-		ret = vmw_resource_validate(res, intr, val->dirty_set &&
-					    val->dirty);
+		ret = vmw_resource_validate(res, intr);
 		if (ret) {
 			if (ret != -ERESTARTSYS)
 				DRM_ERROR("Failed to validate resource.\n");
@@ -869,34 +828,3 @@ int vmw_validation_preload_res(struct vmw_validation_context *ctx,
 	ctx->mem_size_left += size;
 	return 0;
 }
-
-/**
- * vmw_validation_bo_backoff - Unreserve buffer objects registered with a
- * validation context
- * @ctx: The validation context
- *
- * This function unreserves the buffer objects previously reserved using
- * vmw_validation_bo_reserve. It's typically used as part of an error path
- */
-void vmw_validation_bo_backoff(struct vmw_validation_context *ctx)
-{
-	struct vmw_validation_bo_node *entry;
-
-	/*
-	 * Switching coherent resource backup buffers failed.
-	 * Release corresponding buffer object dirty trackers.
-	 */
-	list_for_each_entry(entry, &ctx->bo_list, base.head) {
-		if (entry->coherent_count) {
-			unsigned int coherent_count = entry->coherent_count;
-			struct vmw_buffer_object *vbo =
-				container_of(entry->base.bo, typeof(*vbo),
-					     base);
-
-			while (coherent_count--)
-				vmw_bo_dirty_release(vbo);
-		}
-	}
-
-	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->bo_list);
-}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
index fd83e017c2a5..1d2322ad6fd5 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
@@ -172,6 +172,20 @@ vmw_validation_bo_reserve(struct vmw_validation_context *ctx,
 				      NULL, true);
 }
 
+/**
+ * vmw_validation_bo_backoff - Unreserve buffer objects registered with a
+ * validation context
+ * @ctx: The validation context
+ *
+ * This function unreserves the buffer objects previously reserved using
+ * vmw_validation_bo_reserve. It's typically used as part of an error path
+ */
+static inline void
+vmw_validation_bo_backoff(struct vmw_validation_context *ctx)
+{
+	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->bo_list);
+}
+
 /**
  * vmw_validation_bo_fence - Unreserve and fence buffer objects registered
  * with a validation context
@@ -254,6 +268,4 @@ int vmw_validation_preload_res(struct vmw_validation_context *ctx,
 			       unsigned int size);
 void vmw_validation_res_set_dirty(struct vmw_validation_context *ctx,
 				  void *val_private, u32 dirty);
-void vmw_validation_bo_backoff(struct vmw_validation_context *ctx);
-
 #endif
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 435d02f719a8..49d9cdfc58f2 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -768,14 +768,4 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
 			struct ttm_operation_ctx *ctx);
 void ttm_bo_swapout_all(struct ttm_bo_device *bdev);
 int ttm_bo_wait_unreserved(struct ttm_buffer_object *bo);
-
-/* Default number of pre-faulted pages in the TTM fault handler */
-#define TTM_BO_VM_NUM_PREFAULT 16
-
-vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
-			     struct vm_fault *vmf);
-
-vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
-				    pgprot_t prot,
-				    pgoff_t num_prefault);
 #endif
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index a2d810a2504d..c9b8ba492f24 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -442,9 +442,6 @@ extern struct ttm_bo_global {
  * @driver: Pointer to a struct ttm_bo_driver struct setup by the driver.
  * @man: An array of mem_type_managers.
  * @vma_manager: Address space manager
- * @vm_ops: Pointer to the struct vm_operations_struct used for this
- * device's VM operations. The driver may override this before the first
- * mmap() call.
  * lru_lock: Spinlock that protects the buffer+device lru lists and
  * ddestroy lists.
  * @dev_mapping: A pointer to the struct address_space representing the
@@ -463,7 +460,6 @@ struct ttm_bo_device {
 	struct ttm_bo_global *glob;
 	struct ttm_bo_driver *driver;
 	struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
-	const struct vm_operations_struct *vm_ops;
 
 	/*
 	 * Protected by internal locks.
@@ -492,8 +488,6 @@ struct ttm_bo_device {
 	bool no_retry;
 };
 
-extern const struct vm_operations_struct ttm_bo_vm_ops;
-
 /**
  * struct ttm_lru_bulk_move_pos
  *
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 798cdda9560e..dd0b5f4e1e45 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2686,24 +2686,7 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
 
-struct pfn_range_apply;
-typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
-			 struct pfn_range_apply *closure);
-struct pfn_range_apply {
-	struct mm_struct *mm;
-	pter_fn_t ptefn;
-	unsigned int alloc;
-};
-extern int apply_to_pfn_range(struct pfn_range_apply *closure,
-			      unsigned long address, unsigned long size);
-unsigned long apply_as_wrprotect(struct address_space *mapping,
-				 pgoff_t first_index, pgoff_t nr);
-unsigned long apply_as_clean(struct address_space *mapping,
-			     pgoff_t first_index, pgoff_t nr,
-			     pgoff_t bitmap_pgoff,
-			     unsigned long *bitmap,
-			     pgoff_t *start,
-			     pgoff_t *end);
+
 #ifdef CONFIG_PAGE_POISONING
 extern bool page_poisoning_enabled(void);
 extern void kernel_poison_pages(struct page *page, int numpages, int enable);
diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h
index 02cab33f2f25..399f58317cff 100644
--- a/include/uapi/drm/vmwgfx_drm.h
+++ b/include/uapi/drm/vmwgfx_drm.h
@@ -891,13 +891,11 @@ struct drm_vmw_shader_arg {
  *                                      surface.
  * @drm_vmw_surface_flag_create_buffer: Create a backup buffer if none is
  *                                      given.
- * @drm_vmw_surface_flag_coherent:      Back surface with coherent memory.
  */
 enum drm_vmw_surface_flags {
 	drm_vmw_surface_flag_shareable = (1 << 0),
 	drm_vmw_surface_flag_scanout = (1 << 1),
-	drm_vmw_surface_flag_create_buffer = (1 << 2),
-	drm_vmw_surface_flag_coherent = (1 << 3),
+	drm_vmw_surface_flag_create_buffer = (1 << 2)
 };
 
 /**
diff --git a/mm/Kconfig b/mm/Kconfig
index 5006d0e6a5c7..f0c76ba47695 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -765,7 +765,4 @@ config GUP_BENCHMARK
 config ARCH_HAS_PTE_SPECIAL
 	bool
 
-config AS_DIRTY_HELPERS
-        bool
-
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index f5d412bbc2f7..ac5e5ba78874 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,4 +104,3 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
 obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
 obj-$(CONFIG_HMM) += hmm.o
 obj-$(CONFIG_MEMFD_CREATE) += memfd.o
-obj-$(CONFIG_AS_DIRTY_HELPERS) += as_dirty_helpers.o
diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c
deleted file mode 100644
index f600e31534fb..000000000000
--- a/mm/as_dirty_helpers.c
+++ /dev/null
@@ -1,300 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-#include <linux/mm_types.h>
-#include <linux/hugetlb.h>
-#include <linux/bitops.h>
-#include <linux/mmu_notifier.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-/**
- * struct apply_as - Closure structure for apply_as_range
- * @base: struct pfn_range_apply we derive from
- * @start: Address of first modified pte
- * @end: Address of last modified pte + 1
- * @total: Total number of modified ptes
- * @vma: Pointer to the struct vm_area_struct we're currently operating on
- */
-struct apply_as {
-	struct pfn_range_apply base;
-	unsigned long start;
-	unsigned long end;
-	unsigned long total;
-	struct vm_area_struct *vma;
-};
-
-/**
- * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
- * @pte: Pointer to the pte
- * @token: Page table token, see apply_to_pfn_range()
- * @addr: The virtual page address
- * @closure: Pointer to a struct pfn_range_apply embedded in a
- * struct apply_as
- *
- * The function write-protects a pte and records the range in
- * virtual address space of touched ptes for efficient range TLB flushes.
- *
- * Return: Always zero.
- */
-static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
-			      unsigned long addr,
-			      struct pfn_range_apply *closure)
-{
-	struct apply_as *aas = container_of(closure, typeof(*aas), base);
-	pte_t ptent = *pte;
-
-	if (pte_write(ptent)) {
-		pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte);
-
-		ptent = pte_wrprotect(old_pte);
-		ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent);
-		aas->total++;
-		aas->start = min(aas->start, addr);
-		aas->end = max(aas->end, addr + PAGE_SIZE);
-	}
-
-	return 0;
-}
-
-/**
- * struct apply_as_clean - Closure structure for apply_as_clean
- * @base: struct apply_as we derive from
- * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap
- * @bitmap: Bitmap with one bit for each page offset in the address_space range
- * covered.
- * @start: Address_space page offset of first modified pte relative
- * to @bitmap_pgoff
- * @end: Address_space page offset of last modified pte relative
- * to @bitmap_pgoff
- */
-struct apply_as_clean {
-	struct apply_as base;
-	pgoff_t bitmap_pgoff;
-	unsigned long *bitmap;
-	pgoff_t start;
-	pgoff_t end;
-};
-
-/**
- * apply_pt_clean - Leaf pte callback to clean a pte
- * @pte: Pointer to the pte
- * @token: Page table token, see apply_to_pfn_range()
- * @addr: The virtual page address
- * @closure: Pointer to a struct pfn_range_apply embedded in a
- * struct apply_as_clean
- *
- * The function cleans a pte and records the range in
- * virtual address space of touched ptes for efficient TLB flushes.
- * It also records dirty ptes in a bitmap representing page offsets
- * in the address_space, as well as the first and last of the bits
- * touched.
- *
- * Return: Always zero.
- */
-static int apply_pt_clean(pte_t *pte, pgtable_t token,
-			  unsigned long addr,
-			  struct pfn_range_apply *closure)
-{
-	struct apply_as *aas = container_of(closure, typeof(*aas), base);
-	struct apply_as_clean *clean = container_of(aas, typeof(*clean), base);
-	pte_t ptent = *pte;
-
-	if (pte_dirty(ptent)) {
-		pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) +
-			aas->vma->vm_pgoff - clean->bitmap_pgoff;
-		pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte);
-
-		ptent = pte_mkclean(old_pte);
-		ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent);
-
-		aas->total++;
-		aas->start = min(aas->start, addr);
-		aas->end = max(aas->end, addr + PAGE_SIZE);
-
-		__set_bit(pgoff, clean->bitmap);
-		clean->start = min(clean->start, pgoff);
-		clean->end = max(clean->end, pgoff + 1);
-	}
-
-	return 0;
-}
-
-/**
- * apply_as_range - Apply a pte callback to all PTEs pointing into a range
- * of an address_space.
- * @mapping: Pointer to the struct address_space
- * @aas: Closure structure
- * @first_index: First page offset in the address_space
- * @nr: Number of incremental page offsets to cover
- *
- * Return: Number of ptes touched. Note that this number might be larger
- * than @nr if there are overlapping vmas
- */
-static unsigned long apply_as_range(struct address_space *mapping,
-				    struct apply_as *aas,
-				    pgoff_t first_index, pgoff_t nr)
-{
-	struct vm_area_struct *vma;
-	pgoff_t vba, vea, cba, cea;
-	unsigned long start_addr, end_addr;
-	struct mmu_notifier_range range;
-
-	i_mmap_lock_read(mapping);
-	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
-				  first_index + nr - 1) {
-		unsigned long vm_flags = READ_ONCE(vma->vm_flags);
-
-		/*
-		 * We can only do advisory flag tests below, since we can't
-		 * require the vm's mmap_sem to be held to protect the flags.
-		 * Therefore, callers that strictly depend on specific mmap
-		 * flags to remain constant throughout the operation must
-		 * either ensure those flags are immutable for all relevant
-		 * vmas or can't use this function. Fixing this properly would
-		 * require the vma::vm_flags to be protected by a separate
-		 * lock taken after the i_mmap_lock
-		 */
-
-		/* Skip non-applicable VMAs */
-		if ((vm_flags & (VM_SHARED | VM_WRITE)) !=
-		    (VM_SHARED | VM_WRITE))
-			continue;
-
-		/* Warn on and skip VMAs whose flags indicate illegal usage */
-		if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO))
-			continue;
-
-		/* Clip to the vma */
-		vba = vma->vm_pgoff;
-		vea = vba + vma_pages(vma);
-		cba = first_index;
-		cba = max(cba, vba);
-		cea = first_index + nr;
-		cea = min(cea, vea);
-
-		/* Translate to virtual address */
-		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
-		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
-		if (start_addr >= end_addr)
-			continue;
-
-		aas->base.mm = vma->vm_mm;
-		aas->vma = vma;
-		aas->start = end_addr;
-		aas->end = start_addr;
-
-		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
-					vma, vma->vm_mm, start_addr, end_addr);
-		mmu_notifier_invalidate_range_start(&range);
-
-		/* Needed when we only change protection? */
-		flush_cache_range(vma, start_addr, end_addr);
-
-		/*
-		 * We're not using tlb_gather_mmu() since typically
-		 * only a small subrange of PTEs are affected.
-		 */
-		inc_tlb_flush_pending(vma->vm_mm);
-
-		/* Should not error since aas->base.alloc == 0 */
-		WARN_ON(apply_to_pfn_range(&aas->base, start_addr,
-					   end_addr - start_addr));
-		if (aas->end > aas->start)
-			flush_tlb_range(vma, aas->start, aas->end);
-
-		mmu_notifier_invalidate_range_end(&range);
-		dec_tlb_flush_pending(vma->vm_mm);
-	}
-	i_mmap_unlock_read(mapping);
-
-	return aas->total;
-}
-
-/**
- * apply_as_wrprotect - Write-protect all ptes in an address_space range
- * @mapping: The address_space we want to write protect
- * @first_index: The first page offset in the range
- * @nr: Number of incremental page offsets to cover
- *
- * WARNING: This function should only be used for address spaces whose
- * vmas are marked VM_IO and that do not contain huge pages.
- * To avoid interference with COW'd pages, vmas not marked VM_SHARED are
- * simply skipped.
- *
- * Return: The number of ptes actually write-protected. Note that
- * already write-protected ptes are not counted.
- */
-unsigned long apply_as_wrprotect(struct address_space *mapping,
-				 pgoff_t first_index, pgoff_t nr)
-{
-	struct apply_as aas = {
-		.base = {
-			.alloc = 0,
-			.ptefn = apply_pt_wrprotect,
-		},
-		.total = 0,
-	};
-
-	return apply_as_range(mapping, &aas, first_index, nr);
-}
-EXPORT_SYMBOL_GPL(apply_as_wrprotect);
-
-/**
- * apply_as_clean - Clean all ptes in an address_space range
- * @mapping: The address_space we want to clean
- * @first_index: The first page offset in the range
- * @nr: Number of incremental page offsets to cover
- * @bitmap_pgoff: The page offset of the first bit in @bitmap
- * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
- * cover the whole range @first_index..@first_index + @nr.
- * @start: Pointer to number of the first set bit in @bitmap.
- * is modified as new bits are set by the function.
- * @end: Pointer to the number of the last set bit in @bitmap.
- * none set. The value is modified as new bits are set by the function.
- *
- * Note: When this function returns there is no guarantee that a CPU has
- * not already dirtied new ptes. However it will not clean any ptes not
- * reported in the bitmap.
- *
- * If a caller needs to make sure all dirty ptes are picked up and none
- * additional are added, it first needs to write-protect the address-space
- * range and make sure new writers are blocked in page_mkwrite() or
- * pfn_mkwrite(). And then after a TLB flush following the write-protection
- * pick up all dirty bits.
- *
- * WARNING: This function should only be used for address spaces whose
- * vmas are marked VM_IO and that do not contain huge pages.
- * To avoid interference with COW'd pages, vmas not marked VM_SHARED are
- * simply skipped.
- *
- * Return: The number of dirty ptes actually cleaned.
- */
-unsigned long apply_as_clean(struct address_space *mapping,
-			     pgoff_t first_index, pgoff_t nr,
-			     pgoff_t bitmap_pgoff,
-			     unsigned long *bitmap,
-			     pgoff_t *start,
-			     pgoff_t *end)
-{
-	bool none_set = (*start >= *end);
-	struct apply_as_clean clean = {
-		.base = {
-			.base = {
-				.alloc = 0,
-				.ptefn = apply_pt_clean,
-			},
-			.total = 0,
-		},
-		.bitmap_pgoff = bitmap_pgoff,
-		.bitmap = bitmap,
-		.start = none_set ? nr : *start,
-		.end = none_set ? 0 : *end,
-	};
-	unsigned long ret = apply_as_range(mapping, &clean.base, first_index,
-					   nr);
-
-	*start = clean.start;
-	*end = clean.end;
-	return ret;
-}
-EXPORT_SYMBOL_GPL(apply_as_clean);
diff --git a/mm/memory.c b/mm/memory.c
index 462aa47f8878..ddf20bd0c317 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2032,17 +2032,18 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long
 }
 EXPORT_SYMBOL(vm_iomap_memory);
 
-static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
-			      unsigned long addr, unsigned long end)
+static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
 {
 	pte_t *pte;
 	int err;
 	pgtable_t token;
 	spinlock_t *uninitialized_var(ptl);
 
-	pte = (closure->mm == &init_mm) ?
+	pte = (mm == &init_mm) ?
 		pte_alloc_kernel(pmd, addr) :
-		pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
+		pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
 
@@ -2053,109 +2054,86 @@ static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
 	token = pmd_pgtable(*pmd);
 
 	do {
-		err = closure->ptefn(pte++, token, addr, closure);
+		err = fn(pte++, token, addr, data);
 		if (err)
 			break;
 	} while (addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
 
-	if (closure->mm != &init_mm)
+	if (mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
 	return err;
 }
 
-static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud,
-			      unsigned long addr, unsigned long end)
+static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
 {
 	pmd_t *pmd;
 	unsigned long next;
-	int err = 0;
+	int err;
 
 	BUG_ON(pud_huge(*pud));
 
-	pmd = pmd_alloc(closure->mm, pud, addr);
+	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
-
 	do {
 		next = pmd_addr_end(addr, end);
-		if (!closure->alloc && pmd_none_or_clear_bad(pmd))
-			continue;
-		err = apply_to_pte_range(closure, pmd, addr, next);
+		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
 		if (err)
 			break;
 	} while (pmd++, addr = next, addr != end);
 	return err;
 }
 
-static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d,
-			      unsigned long addr, unsigned long end)
+static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
 {
 	pud_t *pud;
 	unsigned long next;
-	int err = 0;
+	int err;
 
-	pud = pud_alloc(closure->mm, p4d, addr);
+	pud = pud_alloc(mm, p4d, addr);
 	if (!pud)
 		return -ENOMEM;
-
 	do {
 		next = pud_addr_end(addr, end);
-		if (!closure->alloc && pud_none_or_clear_bad(pud))
-			continue;
-		err = apply_to_pmd_range(closure, pud, addr, next);
+		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
 		if (err)
 			break;
 	} while (pud++, addr = next, addr != end);
 	return err;
 }
 
-static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd,
-			      unsigned long addr, unsigned long end)
+static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
 {
 	p4d_t *p4d;
 	unsigned long next;
-	int err = 0;
+	int err;
 
-	p4d = p4d_alloc(closure->mm, pgd, addr);
+	p4d = p4d_alloc(mm, pgd, addr);
 	if (!p4d)
 		return -ENOMEM;
-
 	do {
 		next = p4d_addr_end(addr, end);
-		if (!closure->alloc && p4d_none_or_clear_bad(p4d))
-			continue;
-		err = apply_to_pud_range(closure, p4d, addr, next);
+		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
 		if (err)
 			break;
 	} while (p4d++, addr = next, addr != end);
 	return err;
 }
 
-/**
- * apply_to_pfn_range - Scan a region of virtual memory, calling a provided
- * function on each leaf page table entry
- * @closure: Details about how to scan and what function to apply
- * @addr: Start virtual address
- * @size: Size of the region
- *
- * If @closure->alloc is set to 1, the function will fill in the page table
- * as necessary. Otherwise it will skip non-present parts.
- * Note: The caller must ensure that the range does not contain huge pages.
- * The caller must also assure that the proper mmu_notifier functions are
- * called before and after the call to apply_to_pfn_range.
- *
- * WARNING: Do not use this function unless you know exactly what you are
- * doing. It is lacking support for huge pages and transparent huge pages.
- *
- * Return: Zero on success. If the provided function returns a non-zero status,
- * the page table walk will terminate and that status will be returned.
- * If @closure->alloc is set to 1, then this function may also return memory
- * allocation errors arising from allocating page table memory.
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
  */
-int apply_to_pfn_range(struct pfn_range_apply *closure,
-		       unsigned long addr, unsigned long size)
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+			unsigned long size, pte_fn_t fn, void *data)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -2165,65 +2143,16 @@ int apply_to_pfn_range(struct pfn_range_apply *closure,
 	if (WARN_ON(addr >= end))
 		return -EINVAL;
 
-	pgd = pgd_offset(closure->mm, addr);
+	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		if (!closure->alloc && pgd_none_or_clear_bad(pgd))
-			continue;
-		err = apply_to_p4d_range(closure, pgd, addr, next);
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
 
 	return err;
 }
-
-/**
- * struct page_range_apply - Closure structure for apply_to_page_range()
- * @pter: The base closure structure we derive from
- * @fn: The leaf pte function to call
- * @data: The leaf pte function closure
- */
-struct page_range_apply {
-	struct pfn_range_apply pter;
-	pte_fn_t fn;
-	void *data;
-};
-
-/*
- * Callback wrapper to enable use of apply_to_pfn_range for
- * the apply_to_page_range interface
- */
-static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
-				       unsigned long addr,
-				       struct pfn_range_apply *pter)
-{
-	struct page_range_apply *pra =
-		container_of(pter, typeof(*pra), pter);
-
-	return pra->fn(pte, token, addr, pra->data);
-}
-
-/*
- * Scan a region of virtual memory, filling in page tables as necessary
- * and calling a provided function on each leaf page table.
- *
- * WARNING: Do not use this function unless you know exactly what you are
- * doing. It is lacking support for huge pages and transparent huge pages.
- */
-int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
-			unsigned long size, pte_fn_t fn, void *data)
-{
-	struct page_range_apply pra = {
-		.pter = {.mm = mm,
-			 .alloc = 1,
-			 .ptefn = apply_to_page_range_wrapper },
-		.fn = fn,
-		.data = data
-	};
-
-	return apply_to_pfn_range(&pra.pter, addr, size);
-}
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
@@ -2309,7 +2238,7 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
 	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
 	/* Restore original flags so that caller is not surprised */
 	vmf->flags = old_flags;
-	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
 		return ret;
 	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
 		lock_page(page);
@@ -2586,7 +2515,7 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		vmf->flags |= FAULT_FLAG_MKWRITE;
 		ret = vma->vm_ops->pfn_mkwrite(vmf);
-		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))
+		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
 			return ret;
 		return finish_mkwrite_fault(vmf);
 	}
@@ -2607,8 +2536,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		tmp = do_page_mkwrite(vmf);
 		if (unlikely(!tmp || (tmp &
-				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
-				       VM_FAULT_RETRY)))) {
+				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
 			put_page(vmf->page);
 			return tmp;
 		}
@@ -3673,8 +3601,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 		unlock_page(vmf->page);
 		tmp = do_page_mkwrite(vmf);
 		if (unlikely(!tmp ||
-				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
-					VM_FAULT_RETRY)))) {
+				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
 			put_page(vmf->page);
 			return tmp;
 		}
-- 
cgit v1.2.3


From b43995469e5804636a55372e9bbb17ccb22441c5 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 15 Jul 2019 09:39:52 -0700
Subject: bpf: rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok

Rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok to indicate
that it can be used for both loads and stores.

Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h |  2 +-
 net/core/filter.c      | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6d944369ca87..ff65d22cf336 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -747,7 +747,7 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 	return size <= size_default && (size & (size - 1)) == 0;
 }
 
-#define bpf_ctx_wide_store_ok(off, size, type, field)			\
+#define bpf_ctx_wide_access_ok(off, size, type, field)			\
 	(size == sizeof(__u64) &&					\
 	off >= offsetof(type, field) &&					\
 	off + sizeof(__u64) <= offsetofend(type, field) &&		\
diff --git a/net/core/filter.c b/net/core/filter.c
index 47f6386fb17a..c5983ddb1a9f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6890,14 +6890,14 @@ static bool sock_addr_is_valid_access(int off, int size,
 			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
 				return false;
 		} else {
-			if (bpf_ctx_wide_store_ok(off, size,
-						  struct bpf_sock_addr,
-						  user_ip6))
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   user_ip6))
 				return true;
 
-			if (bpf_ctx_wide_store_ok(off, size,
-						  struct bpf_sock_addr,
-						  msg_src_ip6))
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   msg_src_ip6))
 				return true;
 
 			if (size != size_default)
-- 
cgit v1.2.3


From d4ecfeb15494ec261fef2d25d96eecba66f0b182 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 15 Jul 2019 09:39:53 -0700
Subject: bpf: allow wide aligned loads for bpf_sock_addr user_ip6 and
 msg_src_ip6

Add explicit check for u64 loads of user_ip6 and msg_src_ip6 and
update the comment.

Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  4 ++--
 net/core/filter.c        | 12 +++++++++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6f68438aa4ed..81be929b89fc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3248,7 +3248,7 @@ struct bpf_sock_addr {
 	__u32 user_ip4;		/* Allows 1,2,4-byte read and 4-byte write.
 				 * Stored in network byte order.
 				 */
-	__u32 user_ip6[4];	/* Allows 1,2,4-byte read and 4,8-byte write.
+	__u32 user_ip6[4];	/* Allows 1,2,4,8-byte read and 4,8-byte write.
 				 * Stored in network byte order.
 				 */
 	__u32 user_port;	/* Allows 4-byte read and write.
@@ -3260,7 +3260,7 @@ struct bpf_sock_addr {
 	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read and 4-byte write.
 				 * Stored in network byte order.
 				 */
-	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read and 4,8-byte write.
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4,8-byte read and 4,8-byte write.
 				 * Stored in network byte order.
 				 */
 	__bpf_md_ptr(struct bpf_sock *, sk);
diff --git a/net/core/filter.c b/net/core/filter.c
index c5983ddb1a9f..0f6854ccf894 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6884,9 +6884,19 @@ static bool sock_addr_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
 	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
 				msg_src_ip6[3]):
-		/* Only narrow read access allowed for now. */
 		if (type == BPF_READ) {
 			bpf_ctx_record_field_size(info, size_default);
+
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   user_ip6))
+				return true;
+
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   msg_src_ip6))
+				return true;
+
 			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
 				return false;
 		} else {
-- 
cgit v1.2.3


From c4dcc8a162784c1f827c7f6d8409598f19708fe6 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 16 Jul 2019 09:36:08 +0530
Subject: cpufreq: Make cpufreq_generic_init() return void

It always returns 0 (success) and its return type should really be void.

Over that, many drivers have added error handling code based on its
return value, which is not required at all.

Change its return type to void and update all the callers.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/bmips-cpufreq.c     | 17 ++++++-----------
 drivers/cpufreq/cpufreq.c           |  4 +---
 drivers/cpufreq/davinci-cpufreq.c   |  3 ++-
 drivers/cpufreq/imx6q-cpufreq.c     |  6 ++----
 drivers/cpufreq/kirkwood-cpufreq.c  |  3 ++-
 drivers/cpufreq/loongson1-cpufreq.c |  8 +++-----
 drivers/cpufreq/loongson2_cpufreq.c |  3 ++-
 drivers/cpufreq/maple-cpufreq.c     |  3 ++-
 drivers/cpufreq/omap-cpufreq.c      | 15 +++++----------
 drivers/cpufreq/pasemi-cpufreq.c    |  3 ++-
 drivers/cpufreq/pmac32-cpufreq.c    |  3 ++-
 drivers/cpufreq/pmac64-cpufreq.c    |  3 ++-
 drivers/cpufreq/s3c2416-cpufreq.c   |  9 ++-------
 drivers/cpufreq/s3c64xx-cpufreq.c   | 15 +++------------
 drivers/cpufreq/s5pv210-cpufreq.c   |  3 ++-
 drivers/cpufreq/sa1100-cpufreq.c    |  3 ++-
 drivers/cpufreq/sa1110-cpufreq.c    |  3 ++-
 drivers/cpufreq/spear-cpufreq.c     |  3 ++-
 drivers/cpufreq/tegra20-cpufreq.c   |  8 +-------
 include/linux/cpufreq.h             |  2 +-
 20 files changed, 46 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/bmips-cpufreq.c b/drivers/cpufreq/bmips-cpufreq.c
index 56a4ebbf00e0..f7c23fa468f0 100644
--- a/drivers/cpufreq/bmips-cpufreq.c
+++ b/drivers/cpufreq/bmips-cpufreq.c
@@ -131,23 +131,18 @@ static int bmips_cpufreq_exit(struct cpufreq_policy *policy)
 static int bmips_cpufreq_init(struct cpufreq_policy *policy)
 {
 	struct cpufreq_frequency_table *freq_table;
-	int ret;
 
 	freq_table = bmips_cpufreq_get_freq_table(policy);
 	if (IS_ERR(freq_table)) {
-		ret = PTR_ERR(freq_table);
-		pr_err("%s: couldn't determine frequency table (%d).\n",
-			BMIPS_CPUFREQ_NAME, ret);
-		return ret;
+		pr_err("%s: couldn't determine frequency table (%ld).\n",
+			BMIPS_CPUFREQ_NAME, PTR_ERR(freq_table));
+		return PTR_ERR(freq_table);
 	}
 
-	ret = cpufreq_generic_init(policy, freq_table, TRANSITION_LATENCY);
-	if (ret)
-		bmips_cpufreq_exit(policy);
-	else
-		pr_info("%s: registered\n", BMIPS_CPUFREQ_NAME);
+	cpufreq_generic_init(policy, freq_table, TRANSITION_LATENCY);
+	pr_info("%s: registered\n", BMIPS_CPUFREQ_NAME);
 
-	return ret;
+	return 0;
 }
 
 static struct cpufreq_driver bmips_cpufreq_driver = {
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 99aa7d20b458..efab334d6ab2 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(arch_set_freq_scale);
  * - set policies transition latency
  * - policy->cpus with all possible CPUs
  */
-int cpufreq_generic_init(struct cpufreq_policy *policy,
+void cpufreq_generic_init(struct cpufreq_policy *policy,
 		struct cpufreq_frequency_table *table,
 		unsigned int transition_latency)
 {
@@ -174,8 +174,6 @@ int cpufreq_generic_init(struct cpufreq_policy *policy,
 	 * share the clock and voltage and clock.
 	 */
 	cpumask_setall(policy->cpus);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(cpufreq_generic_init);
 
diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c
index 940fe85db97a..664fa4ab9d1c 100644
--- a/drivers/cpufreq/davinci-cpufreq.c
+++ b/drivers/cpufreq/davinci-cpufreq.c
@@ -93,7 +93,8 @@ static int davinci_cpu_init(struct cpufreq_policy *policy)
 	 * Setting the latency to 2000 us to accommodate addition of drivers
 	 * to pre/post change notification list.
 	 */
-	return cpufreq_generic_init(policy, freq_table, 2000 * 1000);
+	cpufreq_generic_init(policy, freq_table, 2000 * 1000);
+	return 0;
 }
 
 static struct cpufreq_driver davinci_driver = {
diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 3e17560b1efe..91ea95c97bb2 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -193,14 +193,12 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index)
 
 static int imx6q_cpufreq_init(struct cpufreq_policy *policy)
 {
-	int ret;
-
 	policy->clk = clks[ARM].clk;
-	ret = cpufreq_generic_init(policy, freq_table, transition_latency);
+	cpufreq_generic_init(policy, freq_table, transition_latency);
 	policy->suspend_freq = max_freq;
 	dev_pm_opp_of_register_em(policy->cpus);
 
-	return ret;
+	return 0;
 }
 
 static struct cpufreq_driver imx6q_cpufreq_driver = {
diff --git a/drivers/cpufreq/kirkwood-cpufreq.c b/drivers/cpufreq/kirkwood-cpufreq.c
index 7ab564c1f7ae..cb74bdc5baaa 100644
--- a/drivers/cpufreq/kirkwood-cpufreq.c
+++ b/drivers/cpufreq/kirkwood-cpufreq.c
@@ -85,7 +85,8 @@ static int kirkwood_cpufreq_target(struct cpufreq_policy *policy,
 /* Module init and exit code */
 static int kirkwood_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, kirkwood_freq_table, 5000);
+	cpufreq_generic_init(policy, kirkwood_freq_table, 5000);
+	return 0;
 }
 
 static struct cpufreq_driver kirkwood_cpufreq_driver = {
diff --git a/drivers/cpufreq/loongson1-cpufreq.c b/drivers/cpufreq/loongson1-cpufreq.c
index 21c9ce8526c0..0ea88778882a 100644
--- a/drivers/cpufreq/loongson1-cpufreq.c
+++ b/drivers/cpufreq/loongson1-cpufreq.c
@@ -81,7 +81,7 @@ static int ls1x_cpufreq_init(struct cpufreq_policy *policy)
 	struct device *cpu_dev = get_cpu_device(policy->cpu);
 	struct cpufreq_frequency_table *freq_tbl;
 	unsigned int pll_freq, freq;
-	int steps, i, ret;
+	int steps, i;
 
 	pll_freq = clk_get_rate(cpufreq->pll_clk) / 1000;
 
@@ -103,11 +103,9 @@ static int ls1x_cpufreq_init(struct cpufreq_policy *policy)
 	freq_tbl[i].frequency = CPUFREQ_TABLE_END;
 
 	policy->clk = cpufreq->clk;
-	ret = cpufreq_generic_init(policy, freq_tbl, 0);
-	if (ret)
-		kfree(freq_tbl);
+	cpufreq_generic_init(policy, freq_tbl, 0);
 
-	return ret;
+	return 0;
 }
 
 static int ls1x_cpufreq_exit(struct cpufreq_policy *policy)
diff --git a/drivers/cpufreq/loongson2_cpufreq.c b/drivers/cpufreq/loongson2_cpufreq.c
index da344696beed..890813e0bb76 100644
--- a/drivers/cpufreq/loongson2_cpufreq.c
+++ b/drivers/cpufreq/loongson2_cpufreq.c
@@ -95,7 +95,8 @@ static int loongson2_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	}
 
 	policy->clk = cpuclk;
-	return cpufreq_generic_init(policy, &loongson2_clockmod_table[0], 0);
+	cpufreq_generic_init(policy, &loongson2_clockmod_table[0], 0);
+	return 0;
 }
 
 static int loongson2_cpufreq_exit(struct cpufreq_policy *policy)
diff --git a/drivers/cpufreq/maple-cpufreq.c b/drivers/cpufreq/maple-cpufreq.c
index a94355723ef8..a03cd3ad170f 100644
--- a/drivers/cpufreq/maple-cpufreq.c
+++ b/drivers/cpufreq/maple-cpufreq.c
@@ -143,7 +143,8 @@ static unsigned int maple_cpufreq_get_speed(unsigned int cpu)
 
 static int maple_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, maple_cpu_freqs, 12000);
+	cpufreq_generic_init(policy, maple_cpu_freqs, 12000);
+	return 0;
 }
 
 static struct cpufreq_driver maple_cpufreq_driver = {
diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c
index 68052b74d28f..edda20119cfd 100644
--- a/drivers/cpufreq/omap-cpufreq.c
+++ b/drivers/cpufreq/omap-cpufreq.c
@@ -125,23 +125,18 @@ static int omap_cpu_init(struct cpufreq_policy *policy)
 			dev_err(mpu_dev,
 				"%s: cpu%d: failed creating freq table[%d]\n",
 				__func__, policy->cpu, result);
-			goto fail;
+			clk_put(policy->clk);
+			return result;
 		}
 	}
 
 	atomic_inc_return(&freq_table_users);
 
 	/* FIXME: what's the actual transition time? */
-	result = cpufreq_generic_init(policy, freq_table, 300 * 1000);
-	if (!result) {
-		dev_pm_opp_of_register_em(policy->cpus);
-		return 0;
-	}
+	cpufreq_generic_init(policy, freq_table, 300 * 1000);
+	dev_pm_opp_of_register_em(policy->cpus);
 
-	freq_table_free();
-fail:
-	clk_put(policy->clk);
-	return result;
+	return 0;
 }
 
 static int omap_cpu_exit(struct cpufreq_policy *policy)
diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c
index 6b1e4abe3248..93f39a1d4c3d 100644
--- a/drivers/cpufreq/pasemi-cpufreq.c
+++ b/drivers/cpufreq/pasemi-cpufreq.c
@@ -196,7 +196,8 @@ static int pas_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	policy->cur = pas_freqs[cur_astate].frequency;
 	ppc_proc_freq = policy->cur * 1000ul;
 
-	return cpufreq_generic_init(policy, pas_freqs, get_gizmo_latency());
+	cpufreq_generic_init(policy, pas_freqs, get_gizmo_latency());
+	return 0;
 
 out_unmap_sdcpwr:
 	iounmap(sdcpwr_mapbase);
diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c
index 9b4ce2eb8222..bc7fc930294e 100644
--- a/drivers/cpufreq/pmac32-cpufreq.c
+++ b/drivers/cpufreq/pmac32-cpufreq.c
@@ -376,7 +376,8 @@ static int pmac_cpufreq_target(	struct cpufreq_policy *policy,
 
 static int pmac_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, pmac_cpu_freqs, transition_latency);
+	cpufreq_generic_init(policy, pmac_cpu_freqs, transition_latency);
+	return 0;
 }
 
 static u32 read_gpio(struct device_node *np)
diff --git a/drivers/cpufreq/pmac64-cpufreq.c b/drivers/cpufreq/pmac64-cpufreq.c
index 1d32a863332d..045881494cc9 100644
--- a/drivers/cpufreq/pmac64-cpufreq.c
+++ b/drivers/cpufreq/pmac64-cpufreq.c
@@ -324,7 +324,8 @@ static unsigned int g5_cpufreq_get_speed(unsigned int cpu)
 
 static int g5_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, g5_cpu_freqs, transition_latency);
+	cpufreq_generic_init(policy, g5_cpu_freqs, transition_latency);
+	return 0;
 }
 
 static struct cpufreq_driver g5_cpufreq_driver = {
diff --git a/drivers/cpufreq/s3c2416-cpufreq.c b/drivers/cpufreq/s3c2416-cpufreq.c
index 5b2db3c6568f..124a4c68c5ec 100644
--- a/drivers/cpufreq/s3c2416-cpufreq.c
+++ b/drivers/cpufreq/s3c2416-cpufreq.c
@@ -450,21 +450,16 @@ static int s3c2416_cpufreq_driver_init(struct cpufreq_policy *policy)
 	/* Datasheet says PLL stabalisation time must be at least 300us,
 	 * so but add some fudge. (reference in LOCKCON0 register description)
 	 */
-	ret = cpufreq_generic_init(policy, s3c_freq->freq_table,
+	cpufreq_generic_init(policy, s3c_freq->freq_table,
 			(500 * 1000) + s3c_freq->regulator_latency);
-	if (ret)
-		goto err_freq_table;
-
 	register_reboot_notifier(&s3c2416_cpufreq_reboot_notifier);
 
 	return 0;
 
-err_freq_table:
 #ifdef CONFIG_ARM_S3C2416_CPUFREQ_VCORESCALE
-	regulator_put(s3c_freq->vddarm);
 err_vddarm:
-#endif
 	clk_put(s3c_freq->armclk);
+#endif
 err_armclk:
 	clk_put(s3c_freq->hclk);
 err_hclk:
diff --git a/drivers/cpufreq/s3c64xx-cpufreq.c b/drivers/cpufreq/s3c64xx-cpufreq.c
index 0cb9040eca49..40aafa8299a0 100644
--- a/drivers/cpufreq/s3c64xx-cpufreq.c
+++ b/drivers/cpufreq/s3c64xx-cpufreq.c
@@ -147,7 +147,6 @@ out:
 
 static int s3c64xx_cpufreq_driver_init(struct cpufreq_policy *policy)
 {
-	int ret;
 	struct cpufreq_frequency_table *freq;
 
 	if (policy->cpu != 0)
@@ -168,8 +167,7 @@ static int s3c64xx_cpufreq_driver_init(struct cpufreq_policy *policy)
 #ifdef CONFIG_REGULATOR
 	vddarm = regulator_get(NULL, "vddarm");
 	if (IS_ERR(vddarm)) {
-		ret = PTR_ERR(vddarm);
-		pr_err("Failed to obtain VDDARM: %d\n", ret);
+		pr_err("Failed to obtain VDDARM: %ld\n", PTR_ERR(vddarm));
 		pr_err("Only frequency scaling available\n");
 		vddarm = NULL;
 	} else {
@@ -199,16 +197,9 @@ static int s3c64xx_cpufreq_driver_init(struct cpufreq_policy *policy)
 	 * the PLLs, which we don't currently) is ~300us worst case,
 	 * but add some fudge.
 	 */
-	ret = cpufreq_generic_init(policy, s3c64xx_freq_table,
+	cpufreq_generic_init(policy, s3c64xx_freq_table,
 			(500 * 1000) + regulator_latency);
-	if (ret != 0) {
-		pr_err("Failed to configure frequency table: %d\n",
-		       ret);
-		regulator_put(vddarm);
-		clk_put(policy->clk);
-	}
-
-	return ret;
+	return 0;
 }
 
 static struct cpufreq_driver s3c64xx_cpufreq_driver = {
diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c
index c7b7d1e65b08..0663cc935fa6 100644
--- a/drivers/cpufreq/s5pv210-cpufreq.c
+++ b/drivers/cpufreq/s5pv210-cpufreq.c
@@ -544,7 +544,8 @@ static int s5pv210_cpu_init(struct cpufreq_policy *policy)
 	s5pv210_dram_conf[1].freq = clk_get_rate(dmc1_clk);
 
 	policy->suspend_freq = SLEEP_FREQ;
-	return cpufreq_generic_init(policy, s5pv210_freq_table, 40000);
+	cpufreq_generic_init(policy, s5pv210_freq_table, 40000);
+	return 0;
 
 out_dmc1:
 	clk_put(dmc0_clk);
diff --git a/drivers/cpufreq/sa1100-cpufreq.c b/drivers/cpufreq/sa1100-cpufreq.c
index ab5cab93e638..5c075ef6adc0 100644
--- a/drivers/cpufreq/sa1100-cpufreq.c
+++ b/drivers/cpufreq/sa1100-cpufreq.c
@@ -181,7 +181,8 @@ static int sa1100_target(struct cpufreq_policy *policy, unsigned int ppcr)
 
 static int __init sa1100_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+	cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+	return 0;
 }
 
 static struct cpufreq_driver sa1100_driver __refdata = {
diff --git a/drivers/cpufreq/sa1110-cpufreq.c b/drivers/cpufreq/sa1110-cpufreq.c
index 66e5fb088ecc..1057d7f65118 100644
--- a/drivers/cpufreq/sa1110-cpufreq.c
+++ b/drivers/cpufreq/sa1110-cpufreq.c
@@ -306,7 +306,8 @@ static int sa1110_target(struct cpufreq_policy *policy, unsigned int ppcr)
 
 static int __init sa1110_cpu_init(struct cpufreq_policy *policy)
 {
-	return cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+	cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+	return 0;
 }
 
 /* sa1110_driver needs __refdata because it must remain after init registers
diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c
index 4074e2615522..73bd8dc47074 100644
--- a/drivers/cpufreq/spear-cpufreq.c
+++ b/drivers/cpufreq/spear-cpufreq.c
@@ -153,8 +153,9 @@ static int spear_cpufreq_target(struct cpufreq_policy *policy,
 static int spear_cpufreq_init(struct cpufreq_policy *policy)
 {
 	policy->clk = spear_cpufreq.clk;
-	return cpufreq_generic_init(policy, spear_cpufreq.freq_tbl,
+	cpufreq_generic_init(policy, spear_cpufreq.freq_tbl,
 			spear_cpufreq.transition_latency);
+	return 0;
 }
 
 static struct cpufreq_driver spear_cpufreq_driver = {
diff --git a/drivers/cpufreq/tegra20-cpufreq.c b/drivers/cpufreq/tegra20-cpufreq.c
index 3c32cc7b0671..f84ecd22f488 100644
--- a/drivers/cpufreq/tegra20-cpufreq.c
+++ b/drivers/cpufreq/tegra20-cpufreq.c
@@ -118,17 +118,11 @@ static int tegra_target(struct cpufreq_policy *policy, unsigned int index)
 static int tegra_cpu_init(struct cpufreq_policy *policy)
 {
 	struct tegra20_cpufreq *cpufreq = cpufreq_get_driver_data();
-	int ret;
 
 	clk_prepare_enable(cpufreq->cpu_clk);
 
 	/* FIXME: what's the actual transition time? */
-	ret = cpufreq_generic_init(policy, freq_table, 300 * 1000);
-	if (ret) {
-		clk_disable_unprepare(cpufreq->cpu_clk);
-		return ret;
-	}
-
+	cpufreq_generic_init(policy, freq_table, 300 * 1000);
 	policy->clk = cpufreq->cpu_clk;
 	policy->suspend_freq = freq_table[0].frequency;
 	return 0;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index afc683021ac5..441ff15b7768 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -995,7 +995,7 @@ extern struct freq_attr *cpufreq_generic_attr[];
 int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy);
 
 unsigned int cpufreq_generic_get(unsigned int cpu);
-int cpufreq_generic_init(struct cpufreq_policy *policy,
+void cpufreq_generic_init(struct cpufreq_policy *policy,
 		struct cpufreq_frequency_table *table,
 		unsigned int transition_latency);
 #endif /* _LINUX_CPUFREQ_H */
-- 
cgit v1.2.3


From cf034477321e8c88d667514923763b93a0892f49 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.velikov@collabora.com>
Date: Fri, 14 Jun 2019 18:33:35 +0100
Subject: drm/amdgpu: extend AMDGPU_CTX_PRIORITY_NORMAL comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the AMDGPU_CTX_PRIORITY_* defines are used in both
drm_amdgpu_ctx_in::priority and drm_amdgpu_sched_in::priority.

Extend the comment to mention the CAP_SYS_NICE or DRM_MASTER requirement
is only applicable with the former.

Cc: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/uapi/drm/amdgpu_drm.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index d799858b9e53..11cc57322962 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -219,7 +219,10 @@ union drm_amdgpu_bo_list {
 #define AMDGPU_CTX_PRIORITY_VERY_LOW    -1023
 #define AMDGPU_CTX_PRIORITY_LOW         -512
 #define AMDGPU_CTX_PRIORITY_NORMAL      0
-/* Selecting a priority above NORMAL requires CAP_SYS_NICE or DRM_MASTER */
+/*
+ * When used in struct drm_amdgpu_ctx_in, a priority above NORMAL requires
+ * CAP_SYS_NICE or DRM_MASTER
+*/
 #define AMDGPU_CTX_PRIORITY_HIGH        512
 #define AMDGPU_CTX_PRIORITY_VERY_HIGH   1023
 
@@ -229,6 +232,7 @@ struct drm_amdgpu_ctx_in {
 	/** For future use, no flags defined so far */
 	__u32	flags;
 	__u32	ctx_id;
+	/** AMDGPU_CTX_PRIORITY_* */
 	__s32	priority;
 };
 
@@ -281,6 +285,7 @@ struct drm_amdgpu_sched_in {
 	/* AMDGPU_SCHED_OP_* */
 	__u32	op;
 	__u32	fd;
+	/** AMDGPU_CTX_PRIORITY_* */
 	__s32	priority;
 	__u32   ctx_id;
 };
-- 
cgit v1.2.3


From 46710f3a34b592ac5c51a95f696b2d2a2a0d9419 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Sat, 25 May 2019 09:57:59 -0700
Subject: tracing: Pass type into tracing_generic_entry_update()

All callers of tracing_generic_entry_update() have to initialize
entry->type, so let's just simply move it inside.
Link: http://lkml.kernel.org/r/20190525165802.25944-2-xiyou.wangcong@gmail.com

Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h    | 1 +
 kernel/trace/trace.c            | 8 ++++----
 kernel/trace/trace_event_perf.c | 3 +--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8a62731673f7..5c6f2a6c8cd2 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -142,6 +142,7 @@ enum print_line_t {
 enum print_line_t trace_handle_return(struct trace_seq *s);
 
 void tracing_generic_entry_update(struct trace_entry *entry,
+				  unsigned short type,
 				  unsigned long flags,
 				  int pc);
 struct trace_event_file;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 77b9c4ca5faa..6b62e1718548 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -743,8 +743,7 @@ trace_event_setup(struct ring_buffer_event *event,
 {
 	struct trace_entry *ent = ring_buffer_event_data(event);
 
-	tracing_generic_entry_update(ent, flags, pc);
-	ent->type = type;
+	tracing_generic_entry_update(ent, type, flags, pc);
 }
 
 static __always_inline struct ring_buffer_event *
@@ -2312,13 +2311,14 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
 EXPORT_SYMBOL_GPL(trace_handle_return);
 
 void
-tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
-			     int pc)
+tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
+			     unsigned long flags, int pc)
 {
 	struct task_struct *tsk = current;
 
 	entry->preempt_count		= pc & 0xff;
 	entry->pid			= (tsk) ? tsk->pid : 0;
+	entry->type			= type;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4629a6104474..0892e38ed6fb 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -416,8 +416,7 @@ void perf_trace_buf_update(void *record, u16 type)
 	unsigned long flags;
 
 	local_save_flags(flags);
-	tracing_generic_entry_update(entry, flags, pc);
-	entry->type = type;
+	tracing_generic_entry_update(entry, type, flags, pc);
 }
 NOKPROBE_SYMBOL(perf_trace_buf_update);
 
-- 
cgit v1.2.3


From 0aeb1def44169cbe7119f26cf10b974a2046142e Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Sat, 25 May 2019 09:58:01 -0700
Subject: tracing: Make trace_get_fields() global

trace_get_fields() is the only way to read tracepoint fields at
run time, as their fields are defined at compile-time with macros.
Make this function visible to all users and it will be used by
trace event injection code to calculate the size of a tracepoint
entry.
Link: http://lkml.kernel.org/r/20190525165802.25944-4-xiyou.wangcong@gmail.com

Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 8 ++++++++
 kernel/trace/trace_events.c  | 8 --------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 5c6f2a6c8cd2..5150436783e8 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -318,6 +318,14 @@ trace_event_name(struct trace_event_call *call)
 		return call->name;
 }
 
+static inline struct list_head *
+trace_get_fields(struct trace_event_call *event_call)
+{
+	if (!event_call->class->get_fields)
+		return &event_call->class->fields;
+	return event_call->class->get_fields(event_call);
+}
+
 struct trace_array;
 struct trace_subsystem_dir;
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index edc72f3b080c..c7506bc81b75 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -70,14 +70,6 @@ static int system_refcount_dec(struct event_subsystem *system)
 #define while_for_each_event_file()		\
 	}
 
-static struct list_head *
-trace_get_fields(struct trace_event_call *event_call)
-{
-	if (!event_call->class->get_fields)
-		return &event_call->class->fields;
-	return event_call->class->get_fields(event_call);
-}
-
 static struct ftrace_event_field *
 __find_event_field(struct list_head *head, char *name)
 {
-- 
cgit v1.2.3


From 9087c37584fb7d8315877bb55f85e4268cc0b4f4 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 10 Jul 2019 19:01:19 +0000
Subject: dma-direct: Force unencrypted DMA under SME for certain DMA masks

If a device doesn't support DMA to a physical address that includes the
encryption bit (currently bit 47, so 48-bit DMA), then the DMA must
occur to unencrypted memory. SWIOTLB is used to satisfy that requirement
if an IOMMU is not active (enabled or configured in passthrough mode).

However, commit fafadcd16595 ("swiotlb: don't dip into swiotlb pool for
coherent allocations") modified the coherent allocation support in
SWIOTLB to use the DMA direct coherent allocation support. When an IOMMU
is not active, this resulted in dma_alloc_coherent() failing for devices
that didn't support DMA addresses that included the encryption bit.

Addressing this requires changes to the force_dma_unencrypted() function
in kernel/dma/direct.c. Since the function is now non-trivial and
SME/SEV specific, update the DMA direct support to add an arch override
for the force_dma_unencrypted() function. The arch override is selected
when CONFIG_AMD_MEM_ENCRYPT is set. The arch override function resides in
the arch/x86/mm/mem_encrypt.c file and forces unencrypted DMA when either
SEV is active or SME is active and the device does not support DMA to
physical addresses that include the encryption bit.

Fixes: fafadcd16595 ("swiotlb: don't dip into swiotlb pool for coherent allocations")
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
[hch: moved the force_dma_unencrypted declaration to dma-mapping.h,
      fold the s390 fix from Halil Pasic]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/s390/Kconfig          |  1 +
 arch/s390/mm/init.c        |  7 ++++++-
 arch/x86/Kconfig           |  1 +
 arch/x86/mm/mem_encrypt.c  | 30 ++++++++++++++++++++++++++++++
 include/linux/dma-direct.h |  9 +++++++++
 kernel/dma/Kconfig         |  3 +++
 kernel/dma/direct.c        | 16 ++++------------
 7 files changed, 54 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 5d8570ed6cab..a4ad2733eedf 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -189,6 +189,7 @@ config S390
 	select VIRT_CPU_ACCOUNTING
 	select ARCH_HAS_SCALED_CPUTIME
 	select HAVE_NMI
+	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
 	select SWIOTLB
 	select GENERIC_ALLOCATOR
 
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index f0bee6af3960..78c319c5ce48 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -30,7 +30,7 @@
 #include <linux/export.h>
 #include <linux/cma.h>
 #include <linux/gfp.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
 #include <asm/processor.h>
 #include <linux/uaccess.h>
 #include <asm/pgtable.h>
@@ -161,6 +161,11 @@ bool sev_active(void)
 	return is_prot_virt_guest();
 }
 
+bool force_dma_unencrypted(struct device *dev)
+{
+	return sev_active();
+}
+
 /* protected virtualization */
 static void pv_init(void)
 {
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 879741336771..d1afe92bf994 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1528,6 +1528,7 @@ config AMD_MEM_ENCRYPT
 	depends on X86_64 && CPU_SUP_AMD
 	select DYNAMIC_PHYSICAL_MASK
 	select ARCH_USE_MEMREMAP_PROT
+	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
 	---help---
 	  Say yes to enable support for the encryption of system memory.
 	  This requires an AMD processor that supports Secure Memory
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index e0df96fdfe46..c805f0a5c16e 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -15,6 +15,10 @@
 #include <linux/dma-direct.h>
 #include <linux/swiotlb.h>
 #include <linux/mem_encrypt.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/dma-mapping.h>
 
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
@@ -348,6 +352,32 @@ bool sev_active(void)
 }
 EXPORT_SYMBOL(sev_active);
 
+/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
+bool force_dma_unencrypted(struct device *dev)
+{
+	/*
+	 * For SEV, all DMA must be to unencrypted addresses.
+	 */
+	if (sev_active())
+		return true;
+
+	/*
+	 * For SME, all DMA must be to unencrypted addresses if the
+	 * device does not support DMA to addresses that include the
+	 * encryption mask.
+	 */
+	if (sme_active()) {
+		u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask));
+		u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask,
+						dev->bus_dma_mask);
+
+		if (dma_dev_mask <= dma_enc_mask)
+			return true;
+	}
+
+	return false;
+}
+
 /* Architecture __weak replacement functions */
 void __init mem_encrypt_free_decrypted_mem(void)
 {
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index b7338702592a..adf993a3bd58 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -32,6 +32,15 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 }
 #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */
 
+#ifdef CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED
+bool force_dma_unencrypted(struct device *dev);
+#else
+static inline bool force_dma_unencrypted(struct device *dev)
+{
+	return false;
+}
+#endif /* CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED */
+
 /*
  * If memory encryption is supported, phys_to_dma will set the memory encryption
  * bit in the DMA address, and dma_to_phys will clear it.  The raw __phys_to_dma
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 70f8f8d9200e..9decbba255fc 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -48,6 +48,9 @@ config ARCH_HAS_DMA_COHERENT_TO_PFN
 config ARCH_HAS_DMA_MMAP_PGPROT
 	bool
 
+config ARCH_HAS_FORCE_DMA_UNENCRYPTED
+	bool
+
 config DMA_NONCOHERENT_CACHE_SYNC
 	bool
 
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index b90e1aede743..d7cec866d16b 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -23,14 +23,6 @@
 #define ARCH_ZONE_DMA_BITS 24
 #endif
 
-/*
- * For AMD SEV all DMA must be to unencrypted addresses.
- */
-static inline bool force_dma_unencrypted(void)
-{
-	return sev_active();
-}
-
 static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
 {
 	if (!dev->dma_mask) {
@@ -46,7 +38,7 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
 static inline dma_addr_t phys_to_dma_direct(struct device *dev,
 		phys_addr_t phys)
 {
-	if (force_dma_unencrypted())
+	if (force_dma_unencrypted(dev))
 		return __phys_to_dma(dev, phys);
 	return phys_to_dma(dev, phys);
 }
@@ -67,7 +59,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
 	if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask)
 		dma_mask = dev->bus_dma_mask;
 
-	if (force_dma_unencrypted())
+	if (force_dma_unencrypted(dev))
 		*phys_mask = __dma_to_phys(dev, dma_mask);
 	else
 		*phys_mask = dma_to_phys(dev, dma_mask);
@@ -159,7 +151,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
 	}
 
 	ret = page_address(page);
-	if (force_dma_unencrypted()) {
+	if (force_dma_unencrypted(dev)) {
 		set_memory_decrypted((unsigned long)ret, 1 << get_order(size));
 		*dma_handle = __phys_to_dma(dev, page_to_phys(page));
 	} else {
@@ -192,7 +184,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 		return;
 	}
 
-	if (force_dma_unencrypted())
+	if (force_dma_unencrypted(dev))
 		set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
 
 	if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
-- 
cgit v1.2.3


From 4914da2fb0c89205790503f20dfdde854f3afdd8 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Tue, 16 Jul 2019 08:56:51 +0200
Subject: ALSA: hda - Don't resume forcibly i915 HDMI/DP codec

We apply the codec resume forcibly at system resume callback for
updating and syncing the jack detection state that may have changed
during sleeping.  This is, however, superfluous for the codec like
Intel HDMI/DP, where the jack detection is managed via the audio
component notification; i.e. the jack state change shall be reported
sooner or later from the graphics side at mode change.

This patch changes the codec resume callback to avoid the forcible
resume conditionally with a new flag, codec->relaxed_resume, for
reducing the resume time.  The flag is set in the codec probe.

Although this doesn't fix the entire bug mentioned in the bugzilla
entry below, it's still a good optimization and some improvements are
seen.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=201901
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/hda_codec.h  | 2 ++
 sound/pci/hda/hda_codec.c  | 8 ++++++--
 sound/pci/hda/patch_hdmi.c | 6 +++++-
 3 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/sound/hda_codec.h b/include/sound/hda_codec.h
index 8f46ff3449d5..871993696c5f 100644
--- a/include/sound/hda_codec.h
+++ b/include/sound/hda_codec.h
@@ -252,6 +252,8 @@ struct hda_codec {
 	unsigned int auto_runtime_pm:1; /* enable automatic codec runtime pm */
 	unsigned int force_pin_prefix:1; /* Add location prefix */
 	unsigned int link_down_at_suspend:1; /* link down at runtime suspend */
+	unsigned int relaxed_resume:1;	/* don't resume forcibly for jack */
+
 #ifdef CONFIG_PM
 	unsigned long power_on_acct;
 	unsigned long power_off_acct;
diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c
index 5346631df1ec..e30e86ca6b72 100644
--- a/sound/pci/hda/hda_codec.c
+++ b/sound/pci/hda/hda_codec.c
@@ -2941,15 +2941,19 @@ static int hda_codec_runtime_resume(struct device *dev)
 #ifdef CONFIG_PM_SLEEP
 static int hda_codec_force_resume(struct device *dev)
 {
+	struct hda_codec *codec = dev_to_hda_codec(dev);
+	bool forced_resume = !codec->relaxed_resume;
 	int ret;
 
 	/* The get/put pair below enforces the runtime resume even if the
 	 * device hasn't been used at suspend time.  This trick is needed to
 	 * update the jack state change during the sleep.
 	 */
-	pm_runtime_get_noresume(dev);
+	if (forced_resume)
+		pm_runtime_get_noresume(dev);
 	ret = pm_runtime_force_resume(dev);
-	pm_runtime_put(dev);
+	if (forced_resume)
+		pm_runtime_put(dev);
 	return ret;
 }
 
diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c
index 0b2a26e2c5f1..bea7b0961080 100644
--- a/sound/pci/hda/patch_hdmi.c
+++ b/sound/pci/hda/patch_hdmi.c
@@ -2292,8 +2292,10 @@ static void generic_hdmi_free(struct hda_codec *codec)
 	struct hdmi_spec *spec = codec->spec;
 	int pin_idx, pcm_idx;
 
-	if (codec_has_acomp(codec))
+	if (codec_has_acomp(codec)) {
 		snd_hdac_acomp_register_notifier(&codec->bus->core, NULL);
+		codec->relaxed_resume = 0;
+	}
 
 	for (pin_idx = 0; pin_idx < spec->num_pins; pin_idx++) {
 		struct hdmi_spec_per_pin *per_pin = get_pin(spec, pin_idx);
@@ -2579,6 +2581,8 @@ static void register_i915_notifier(struct hda_codec *codec)
 	spec->drm_audio_ops.pin_eld_notify = intel_pin_eld_notify;
 	snd_hdac_acomp_register_notifier(&codec->bus->core,
 					&spec->drm_audio_ops);
+	/* no need for forcible resume for jack check thanks to notifier */
+	codec->relaxed_resume = 1;
 }
 
 /* setup_stream ops override for HSW+ */
-- 
cgit v1.2.3


From a5b647007e9d794956dbed9339a3354a9fc4d5c3 Mon Sep 17 00:00:00 2001
From: Vedang Patel <vedang.patel@intel.com>
Date: Tue, 16 Jul 2019 12:52:18 -0700
Subject: fix: taprio: Change type of txtime-delay parameter to u32

During the review of the iproute2 patches for txtime-assist mode, it was
pointed out that it does not make sense for the txtime-delay parameter to
be negative. So, change the type of the parameter from s32 to u32.

Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode")
Reported-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Vedang Patel <vedang.patel@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 2 +-
 net/sched/sch_taprio.c         | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 1f623252abe8..18f185299f47 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1174,7 +1174,7 @@ enum {
 	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */
 	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */
 	TCA_TAPRIO_ATTR_FLAGS, /* u32 */
-	TCA_TAPRIO_ATTR_TXTIME_DELAY, /* s32 */
+	TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */
 	__TCA_TAPRIO_ATTR_MAX,
 };
 
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 388750ddc57a..c39db507ba3f 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -75,7 +75,7 @@ struct taprio_sched {
 	struct sched_gate_list __rcu *admin_sched;
 	struct hrtimer advance_timer;
 	struct list_head taprio_list;
-	int txtime_delay;
+	u32 txtime_delay;
 };
 
 static ktime_t sched_base_time(const struct sched_gate_list *sched)
@@ -1113,7 +1113,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 			goto unlock;
 		}
 
-		q->txtime_delay = nla_get_s32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
+		q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
 	}
 
 	if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) &&
@@ -1430,7 +1430,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 		goto options_error;
 
 	if (q->txtime_delay &&
-	    nla_put_s32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
+	    nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
 		goto options_error;
 
 	if (oper && dump_schedule(skb, oper))
-- 
cgit v1.2.3


From 0bf5f9492389aa8df5c8e38fcb4488802d24504d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 16 Jul 2019 16:26:27 -0700
Subject: mm: fix the MAP_UNINITIALIZED flag

We can't expose UAPI symbols differently based on CONFIG_ symbols, as
userspace won't have them available.  Instead always define the flag,
but only respect it based on the config option.

Link: http://lkml.kernel.org/r/20190703122359.18200-2-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Vladimir Murzin <vladimir.murzin@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/xtensa/include/uapi/asm/mman.h    | 6 +-----
 include/uapi/asm-generic/mman-common.h | 8 +++-----
 mm/nommu.c                             | 4 +++-
 3 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index be726062412b..ebbb48842190 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -56,12 +56,8 @@
 #define MAP_STACK	0x40000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x80000		/* create a huge page mapping */
 #define MAP_FIXED_NOREPLACE 0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
-#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
-# define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
+#define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
 					 * uninitialized */
-#else
-# define MAP_UNINITIALIZED 0x0		/* Don't support this flag */
-#endif
 
 /*
  * Flags for msync
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index abd238d0f7a4..cb556b430e71 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -19,15 +19,13 @@
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
-#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
-# define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be uninitialized */
-#else
-# define MAP_UNINITIALIZED 0x0		/* Don't support this flag */
-#endif
 
 /* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */
 #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
 
+#define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
+					 * uninitialized */
+
 /*
  * Flags for mlock
  */
diff --git a/mm/nommu.c b/mm/nommu.c
index eb3e2e558da1..fed1b6e9c89b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1261,7 +1261,9 @@ unsigned long do_mmap(struct file *file,
 	add_nommu_region(region);
 
 	/* clear anonymous mappings that don't ask for uninitialized data */
-	if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
+	if (!vma->vm_file &&
+	    (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) ||
+	     !(flags & MAP_UNINITIALIZED)))
 		memset((void *)region->vm_start, 0,
 		       region->vm_end - region->vm_start);
 
-- 
cgit v1.2.3


From 89165b8b0ee97bd775ac4376b932fd030f7462bd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 16 Jul 2019 16:26:30 -0700
Subject: mm: provide a print_vma_addr stub for !CONFIG_MMU

Link: http://lkml.kernel.org/r/20190703122359.18200-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Vladimir Murzin <vladimir.murzin@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0389c34ac529..74797ed20c2c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2767,7 +2767,13 @@ extern int randomize_va_space;
 #endif
 
 const char * arch_vma_name(struct vm_area_struct *vma);
+#ifdef CONFIG_MMU
 void print_vma_addr(char *prefix, unsigned long rip);
+#else
+static inline void print_vma_addr(char *prefix, unsigned long rip)
+{
+}
+#endif
 
 void *sparse_buffer_alloc(unsigned long size);
 struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
-- 
cgit v1.2.3


From 9b98fa22948551e20a15b0b9d22589e3724c361a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 16 Jul 2019 16:26:33 -0700
Subject: mm: stub out all of swapops.h for !CONFIG_MMU

The whole header file deals with swap entries and PTEs, none of which
can exist for nommu builds.  The current nommu ports have lots of stubs
to allow the inline functions in swapops.h to compile, but as none of
this functionality is actually used there is no point in even providing
it.  This way we don't have to provide the stubs for the upcoming RISC-V
nommu port, and can eventually remove it from the existing ports.

Link: http://lkml.kernel.org/r/20190703122359.18200-4-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swapops.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 15bdb6fe71e5..877fd239b6ff 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -6,6 +6,8 @@
 #include <linux/bug.h>
 #include <linux/mm_types.h>
 
+#ifdef CONFIG_MMU
+
 /*
  * swapcache pages are stored in the swapper_space radix tree.  We want to
  * get good packing density in that tree, so the index should be dense in
@@ -50,13 +52,11 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
 	return entry.val & SWP_OFFSET_MASK;
 }
 
-#ifdef CONFIG_MMU
 /* check whether a pte points to a swap entry */
 static inline int is_swap_pte(pte_t pte)
 {
 	return !pte_none(pte) && !pte_present(pte);
 }
-#endif
 
 /*
  * Convert the arch-dependent pte representation of a swp_entry_t into an
@@ -360,4 +360,5 @@ static inline int non_swap_entry(swp_entry_t entry)
 }
 #endif
 
+#endif /* CONFIG_MMU */
 #endif /* _LINUX_SWAPOPS_H */
-- 
cgit v1.2.3


From ce251e0e3c0597ea8cab5787df579bd1f9c1aca1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 16 Jul 2019 16:26:42 -0700
Subject: include/linux/kernel.h: add typeof_member() macro

Add typeof_member() macro so that types can be extracted without
introducing dummy variables.

Link: http://lkml.kernel.org/r/20190529190720.GA5703@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 0c9bc231107f..4fa360a13c1e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -88,6 +88,8 @@
  */
 #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
 
+#define typeof_member(T, m)	typeof(((T*)0)->m)
+
 #define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP
 
 #define DIV_ROUND_DOWN_ULL(ll, d) \
-- 
cgit v1.2.3


From 95b980d62d52c4c1768ee719e8db3efe27ef52b2 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 16 Jul 2019 16:26:57 -0700
Subject: linux/bits.h: make BIT(), GENMASK(), and friends available in
 assembly

BIT(),  GENMASK(), etc. are useful to define register bits of hardware.
However, low-level code is often written in assembly, where they are
not available due to the hard-coded 1UL, 0UL.

In fact, in-kernel headers such as arch/arm64/include/asm/sysreg.h
use _BITUL() instead of BIT() so that the register bit macros are
available in assembly.

Using macros in include/uapi/linux/const.h have two reasons:

[1] For use in uapi headers
  We should use underscore-prefixed variants for user-space.

[2] For use in assembly code
  Since _BITUL() uses UL(1) instead of 1UL, it can be used as an
  alternative of BIT().

For [2], it is pretty easy to change BIT() etc. for use in assembly.

This allows to replace _BUTUL() in kernel-space headers with BIT().

Link: http://lkml.kernel.org/r/20190609153941.17249-1-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bits.h | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/bits.h b/include/linux/bits.h
index 2b7b532c1d51..669d69441a62 100644
--- a/include/linux/bits.h
+++ b/include/linux/bits.h
@@ -1,13 +1,15 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __LINUX_BITS_H
 #define __LINUX_BITS_H
+
+#include <linux/const.h>
 #include <asm/bitsperlong.h>
 
-#define BIT(nr)			(1UL << (nr))
-#define BIT_ULL(nr)		(1ULL << (nr))
-#define BIT_MASK(nr)		(1UL << ((nr) % BITS_PER_LONG))
+#define BIT(nr)			(UL(1) << (nr))
+#define BIT_ULL(nr)		(ULL(1) << (nr))
+#define BIT_MASK(nr)		(UL(1) << ((nr) % BITS_PER_LONG))
 #define BIT_WORD(nr)		((nr) / BITS_PER_LONG)
-#define BIT_ULL_MASK(nr)	(1ULL << ((nr) % BITS_PER_LONG_LONG))
+#define BIT_ULL_MASK(nr)	(ULL(1) << ((nr) % BITS_PER_LONG_LONG))
 #define BIT_ULL_WORD(nr)	((nr) / BITS_PER_LONG_LONG)
 #define BITS_PER_BYTE		8
 
@@ -17,10 +19,11 @@
  * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
  */
 #define GENMASK(h, l) \
-	(((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
+	(((~UL(0)) - (UL(1) << (l)) + 1) & \
+	 (~UL(0) >> (BITS_PER_LONG - 1 - (h))))
 
 #define GENMASK_ULL(h, l) \
-	(((~0ULL) - (1ULL << (l)) + 1) & \
-	 (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
+	(((~ULL(0)) - (ULL(1) << (l)) + 1) & \
+	 (~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h))))
 
 #endif	/* __LINUX_BITS_H */
-- 
cgit v1.2.3


From c296d4dc13aefe96792538a949996b8938f28f13 Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Tue, 16 Jul 2019 16:27:06 -0700
Subject: asm-generic: fix a compilation warning

Fix this compilation warning on x86 by making flush_cache_vmap() inline.

  lib/ioremap.c: In function 'ioremap_page_range':
  lib/ioremap.c:214:16: warning: variable 'start' set but not used [-Wunused-but-set-variable]
    unsigned long start;
                  ^~~~~

While at it, convert all other similar functions to inline for
consistency.

Link: http://lkml.kernel.org/r/1562594592-15228-1-git-send-email-cai@lca.pw
Signed-off-by: Qian Cai <cai@lca.pw>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/cacheflush.h | 74 ++++++++++++++++++++++++++++++++--------
 1 file changed, 60 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h
index 0dd47a6db2cf..a950a22c4890 100644
--- a/include/asm-generic/cacheflush.h
+++ b/include/asm-generic/cacheflush.h
@@ -5,24 +5,70 @@
 /* Keep includes the same across arches.  */
 #include <linux/mm.h>
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
+
 /*
  * The cache doesn't need to be flushed when TLB entries change when
  * the cache is mapped to physical memory, not virtual memory
  */
-#define flush_cache_all()			do { } while (0)
-#define flush_cache_mm(mm)			do { } while (0)
-#define flush_cache_dup_mm(mm)			do { } while (0)
-#define flush_cache_range(vma, start, end)	do { } while (0)
-#define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
-#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
-#define flush_dcache_page(page)			do { } while (0)
-#define flush_dcache_mmap_lock(mapping)		do { } while (0)
-#define flush_dcache_mmap_unlock(mapping)	do { } while (0)
-#define flush_icache_range(start, end)		do { } while (0)
-#define flush_icache_page(vma,pg)		do { } while (0)
-#define flush_icache_user_range(vma,pg,adr,len)	do { } while (0)
-#define flush_cache_vmap(start, end)		do { } while (0)
-#define flush_cache_vunmap(start, end)		do { } while (0)
+static inline void flush_cache_all(void)
+{
+}
+
+static inline void flush_cache_mm(struct mm_struct *mm)
+{
+}
+
+static inline void flush_cache_dup_mm(struct mm_struct *mm)
+{
+}
+
+static inline void flush_cache_range(struct vm_area_struct *vma,
+				     unsigned long start,
+				     unsigned long end)
+{
+}
+
+static inline void flush_cache_page(struct vm_area_struct *vma,
+				    unsigned long vmaddr,
+				    unsigned long pfn)
+{
+}
+
+static inline void flush_dcache_page(struct page *page)
+{
+}
+
+static inline void flush_dcache_mmap_lock(struct address_space *mapping)
+{
+}
+
+static inline void flush_dcache_mmap_unlock(struct address_space *mapping)
+{
+}
+
+static inline void flush_icache_range(unsigned long start, unsigned long end)
+{
+}
+
+static inline void flush_icache_page(struct vm_area_struct *vma,
+				     struct page *page)
+{
+}
+
+static inline void flush_icache_user_range(struct vm_area_struct *vma,
+					   struct page *page,
+					   unsigned long addr, int len)
+{
+}
+
+static inline void flush_cache_vmap(unsigned long start, unsigned long end)
+{
+}
+
+static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
+{
+}
 
 #define copy_to_user_page(vma, page, vaddr, dst, src, len) \
 	do { \
-- 
cgit v1.2.3


From 4c6080cd6f8baad9f7faa3deac9a90e59726b119 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 16 Jul 2019 16:27:12 -0700
Subject: lib/list: tweak LIST_POISON2 for better code generation on x86_64

list_del() poisoning can generate 2 64-bit immediate loads but it also can
generate one 64-bit immediate load and an addition:

	48 b8 00 01 00 00 00 00 ad de	movabs rax,0xdead000000000100
	48 89 47 58			mov    QWORD PTR [rdi+0x58],rax
	48 05 00 01 00 00   <=====>	add    rax,0x100
	48 89 47 60			mov    QWORD PTR [rdi+0x60],rax

However on x86_64 not all constants are equal: those within [-128, 127]
range can be added with shorter "add r64, imm32" instruction:

	48 b8 00 01 00 00 00 00 ad de	movabs rax,0xdead000000000100
	48 89 47 58			mov    QWORD PTR [rdi+0x58],rax
	48 83 c0 22	<======>	add    rax,0x22
	48 89 47 60			mov    QWORD PTR [rdi+0x60],rax

Patch saves 2 bytes per some LIST_POISON2 usage.

(Slightly disappointing) space savings on F29 x86_64 config:

	add/remove: 0/0 grow/shrink: 0/2164 up/down: 0/-5184 (-5184)
	Function                                     old     new   delta
	zstd_get_workspace                           548     546      -2
		...
	mlx4_delete_all_resources_for_slave         4826    4804     -22
	Total: Before=83304131, After=83298947, chg -0.01%

New constants are:

	0xdead000000000100
	0xdead000000000122

Note: LIST_POISON1 can't be changed to ...11 because something in page
allocator requires low bit unset.

Link: http://lkml.kernel.org/r/20190513191502.GA8492@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Vasiliy Kulikov <segoon@openwall.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/poison.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/poison.h b/include/linux/poison.h
index d6d980a681c7..df34330b4e34 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -21,7 +21,7 @@
  * non-initialized list entries.
  */
 #define LIST_POISON1  ((void *) 0x100 + POISON_POINTER_DELTA)
-#define LIST_POISON2  ((void *) 0x200 + POISON_POINTER_DELTA)
+#define LIST_POISON2  ((void *) 0x122 + POISON_POINTER_DELTA)
 
 /********** include/linux/timer.h **********/
 /*
-- 
cgit v1.2.3


From 0f472d04f59ff89d15b2a1c4eafde7317ddd67a2 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Tue, 16 Jul 2019 16:27:33 -0700
Subject: mm/ioremap: probe platform for p4d huge map support

Finish up what commit c2febafc6773 ("mm: convert generic code to 5-level
paging") started while levelling up P4D huge mapping support at par with
PUD and PMD.  A new arch call back arch_ioremap_p4d_supported() is added
which just maintains status quo (P4D huge map not supported) on x86,
arm64 and powerpc.

When HAVE_ARCH_HUGE_VMAP is enabled its just a simple check from the
arch about the support, hence runtime effects are minimal.

Link: http://lkml.kernel.org/r/1561699231-20991-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/mm/mmu.c                      | 5 +++++
 arch/powerpc/mm/book3s64/radix_pgtable.c | 5 +++++
 arch/x86/mm/ioremap.c                    | 5 +++++
 include/linux/io.h                       | 1 +
 lib/ioremap.c                            | 2 ++
 5 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 1b49c08dfa2b..e661469cabdd 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -942,6 +942,11 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
 	return dt_virt;
 }
 
+int __init arch_ioremap_p4d_supported(void)
+{
+	return 0;
+}
+
 int __init arch_ioremap_pud_supported(void)
 {
 	/*
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 65c2ba1e1783..b4ca9e95e678 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1237,3 +1237,8 @@ int radix__ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size,
 		return 0;
 	}
 }
+
+int __init arch_ioremap_p4d_supported(void)
+{
+	return 0;
+}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index e500f1df1140..63e99f15d7cf 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -459,6 +459,11 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);
 
+int __init arch_ioremap_p4d_supported(void)
+{
+	return 0;
+}
+
 int __init arch_ioremap_pud_supported(void)
 {
 #ifdef CONFIG_X86_64
diff --git a/include/linux/io.h b/include/linux/io.h
index 9876e5801a9d..accac822336a 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -33,6 +33,7 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 void __init ioremap_huge_init(void);
+int arch_ioremap_p4d_supported(void);
 int arch_ioremap_pud_supported(void);
 int arch_ioremap_pmd_supported(void);
 #else
diff --git a/lib/ioremap.c b/lib/ioremap.c
index a95161d9c883..0a2ffadc6d71 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -30,6 +30,8 @@ early_param("nohugeiomap", set_nohugeiomap);
 void __init ioremap_huge_init(void)
 {
 	if (!ioremap_huge_disabled) {
+		if (arch_ioremap_p4d_supported())
+			ioremap_p4d_capable = 1;
 		if (arch_ioremap_pud_supported())
 			ioremap_pud_capable = 1;
 		if (arch_ioremap_pmd_supported())
-- 
cgit v1.2.3


From 9f973cb38088e0cf42e0bae97ff140813e623f13 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Tue, 16 Jul 2019 16:27:45 -0700
Subject: lib/rbtree: avoid generating code twice for the cached versions

As was already noted in rbtree.h, the logic to cache rb_first (or
rb_last) can easily be implemented externally to the core rbtree api.

Change the implementation to do just that.  Previously the update of
rb_leftmost was wired deeper into the implmentation, but there were some
disadvantages to that - mostly, lib/rbtree.c had separate instantiations
for rb_insert_color() vs rb_insert_color_cached(), as well as rb_erase()
vs rb_erase_cached(), which were doing exactly the same thing save for
the rb_leftmost update at the start of either function.

   text	   data	    bss	    dec	    hex	filename
   5405	    120	      0	   5525	   1595	lib/rbtree.o-vanilla
   3827	     96	      0	   3923	    f53	lib/rbtree.o-patch

[dave@stgolabs.net: changelog addition]
  Link: http://lkml.kernel.org/r/20190628171416.by5gdizl3rcxk5h5@linux-r8p5
[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20190628045008.39926-1-walken@google.com
Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rbtree.h           | 70 ++++++++++++++++++++++++++--------------
 include/linux/rbtree_augmented.h | 27 ++++++----------
 lib/rbtree.c                     | 40 ++---------------------
 3 files changed, 59 insertions(+), 78 deletions(-)

(limited to 'include')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index e6337fce08f2..1fd61a9af45c 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -32,25 +32,9 @@ struct rb_root {
 	struct rb_node *rb_node;
 };
 
-/*
- * Leftmost-cached rbtrees.
- *
- * We do not cache the rightmost node based on footprint
- * size vs number of potential users that could benefit
- * from O(1) rb_last(). Just not worth it, users that want
- * this feature can always implement the logic explicitly.
- * Furthermore, users that want to cache both pointers may
- * find it a bit asymmetric, but that's ok.
- */
-struct rb_root_cached {
-	struct rb_root rb_root;
-	struct rb_node *rb_leftmost;
-};
-
 #define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))
 
 #define RB_ROOT	(struct rb_root) { NULL, }
-#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
 #define	rb_entry(ptr, type, member) container_of(ptr, type, member)
 
 #define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)
@@ -72,12 +56,6 @@ extern struct rb_node *rb_prev(const struct rb_node *);
 extern struct rb_node *rb_first(const struct rb_root *);
 extern struct rb_node *rb_last(const struct rb_root *);
 
-extern void rb_insert_color_cached(struct rb_node *,
-				   struct rb_root_cached *, bool);
-extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
-/* Same as rb_first(), but O(1) */
-#define rb_first_cached(root) (root)->rb_leftmost
-
 /* Postorder iteration - always visit the parent after its children */
 extern struct rb_node *rb_first_postorder(const struct rb_root *);
 extern struct rb_node *rb_next_postorder(const struct rb_node *);
@@ -87,8 +65,6 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 			    struct rb_root *root);
 extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
 				struct rb_root *root);
-extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new,
-				   struct rb_root_cached *root);
 
 static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
 				struct rb_node **rb_link)
@@ -136,4 +112,50 @@ static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent
 			typeof(*pos), field); 1; }); \
 	     pos = n)
 
+/*
+ * Leftmost-cached rbtrees.
+ *
+ * We do not cache the rightmost node based on footprint
+ * size vs number of potential users that could benefit
+ * from O(1) rb_last(). Just not worth it, users that want
+ * this feature can always implement the logic explicitly.
+ * Furthermore, users that want to cache both pointers may
+ * find it a bit asymmetric, but that's ok.
+ */
+struct rb_root_cached {
+	struct rb_root rb_root;
+	struct rb_node *rb_leftmost;
+};
+
+#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
+
+/* Same as rb_first(), but O(1) */
+#define rb_first_cached(root) (root)->rb_leftmost
+
+static inline void rb_insert_color_cached(struct rb_node *node,
+					  struct rb_root_cached *root,
+					  bool leftmost)
+{
+	if (leftmost)
+		root->rb_leftmost = node;
+	rb_insert_color(node, &root->rb_root);
+}
+
+static inline void rb_erase_cached(struct rb_node *node,
+				   struct rb_root_cached *root)
+{
+	if (root->rb_leftmost == node)
+		root->rb_leftmost = rb_next(node);
+	rb_erase(node, &root->rb_root);
+}
+
+static inline void rb_replace_node_cached(struct rb_node *victim,
+					  struct rb_node *new,
+					  struct rb_root_cached *root)
+{
+	if (root->rb_leftmost == victim)
+		root->rb_leftmost = new;
+	rb_replace_node(victim, new, &root->rb_root);
+}
+
 #endif	/* _LINUX_RBTREE_H */
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 0f902ccb48b0..179faab29f52 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -30,10 +30,9 @@ struct rb_augment_callbacks {
 	void (*rotate)(struct rb_node *old, struct rb_node *new);
 };
 
-extern void __rb_insert_augmented(struct rb_node *node,
-				  struct rb_root *root,
-				  bool newleft, struct rb_node **leftmost,
+extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+
 /*
  * Fixup the rbtree and update the augmented information when rebalancing.
  *
@@ -48,7 +47,7 @@ static inline void
 rb_insert_augmented(struct rb_node *node, struct rb_root *root,
 		    const struct rb_augment_callbacks *augment)
 {
-	__rb_insert_augmented(node, root, false, NULL, augment->rotate);
+	__rb_insert_augmented(node, root, augment->rotate);
 }
 
 static inline void
@@ -56,8 +55,9 @@ rb_insert_augmented_cached(struct rb_node *node,
 			   struct rb_root_cached *root, bool newleft,
 			   const struct rb_augment_callbacks *augment)
 {
-	__rb_insert_augmented(node, &root->rb_root,
-			      newleft, &root->rb_leftmost, augment->rotate);
+	if (newleft)
+		root->rb_leftmost = node;
+	rb_insert_augmented(node, &root->rb_root, augment);
 }
 
 #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield,	\
@@ -150,7 +150,6 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
 
 static __always_inline struct rb_node *
 __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
-		     struct rb_node **leftmost,
 		     const struct rb_augment_callbacks *augment)
 {
 	struct rb_node *child = node->rb_right;
@@ -158,9 +157,6 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 	struct rb_node *parent, *rebalance;
 	unsigned long pc;
 
-	if (leftmost && node == *leftmost)
-		*leftmost = rb_next(node);
-
 	if (!tmp) {
 		/*
 		 * Case 1: node to erase has no more than 1 child (easy!)
@@ -260,8 +256,7 @@ static __always_inline void
 rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 		   const struct rb_augment_callbacks *augment)
 {
-	struct rb_node *rebalance = __rb_erase_augmented(node, root,
-							 NULL, augment);
+	struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
 	if (rebalance)
 		__rb_erase_color(rebalance, root, augment->rotate);
 }
@@ -270,11 +265,9 @@ static __always_inline void
 rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
 			  const struct rb_augment_callbacks *augment)
 {
-	struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root,
-							 &root->rb_leftmost,
-							 augment);
-	if (rebalance)
-		__rb_erase_color(rebalance, &root->rb_root, augment->rotate);
+	if (root->rb_leftmost == node)
+		root->rb_leftmost = rb_next(node);
+	rb_erase_augmented(node, &root->rb_root, augment);
 }
 
 #endif	/* _LINUX_RBTREE_AUGMENTED_H */
diff --git a/lib/rbtree.c b/lib/rbtree.c
index 1ef6e25d031c..abc86c6a3177 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -83,14 +83,10 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
 
 static __always_inline void
 __rb_insert(struct rb_node *node, struct rb_root *root,
-	    bool newleft, struct rb_node **leftmost,
 	    void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
 	struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
 
-	if (newleft)
-		*leftmost = node;
-
 	while (true) {
 		/*
 		 * Loop invariant: node is red.
@@ -437,38 +433,19 @@ static const struct rb_augment_callbacks dummy_callbacks = {
 
 void rb_insert_color(struct rb_node *node, struct rb_root *root)
 {
-	__rb_insert(node, root, false, NULL, dummy_rotate);
+	__rb_insert(node, root, dummy_rotate);
 }
 EXPORT_SYMBOL(rb_insert_color);
 
 void rb_erase(struct rb_node *node, struct rb_root *root)
 {
 	struct rb_node *rebalance;
-	rebalance = __rb_erase_augmented(node, root,
-					 NULL, &dummy_callbacks);
+	rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
 	if (rebalance)
 		____rb_erase_color(rebalance, root, dummy_rotate);
 }
 EXPORT_SYMBOL(rb_erase);
 
-void rb_insert_color_cached(struct rb_node *node,
-			    struct rb_root_cached *root, bool leftmost)
-{
-	__rb_insert(node, &root->rb_root, leftmost,
-		    &root->rb_leftmost, dummy_rotate);
-}
-EXPORT_SYMBOL(rb_insert_color_cached);
-
-void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
-{
-	struct rb_node *rebalance;
-	rebalance = __rb_erase_augmented(node, &root->rb_root,
-					 &root->rb_leftmost, &dummy_callbacks);
-	if (rebalance)
-		____rb_erase_color(rebalance, &root->rb_root, dummy_rotate);
-}
-EXPORT_SYMBOL(rb_erase_cached);
-
 /*
  * Augmented rbtree manipulation functions.
  *
@@ -477,10 +454,9 @@ EXPORT_SYMBOL(rb_erase_cached);
  */
 
 void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
-			   bool newleft, struct rb_node **leftmost,
 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
-	__rb_insert(node, root, newleft, leftmost, augment_rotate);
+	__rb_insert(node, root, augment_rotate);
 }
 EXPORT_SYMBOL(__rb_insert_augmented);
 
@@ -591,16 +567,6 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 }
 EXPORT_SYMBOL(rb_replace_node);
 
-void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new,
-			    struct rb_root_cached *root)
-{
-	rb_replace_node(victim, new, &root->rb_root);
-
-	if (root->rb_leftmost == victim)
-		root->rb_leftmost = new;
-}
-EXPORT_SYMBOL(rb_replace_node_cached);
-
 void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
 			 struct rb_root *root)
 {
-- 
cgit v1.2.3


From b98cca444d287a63dd96df04af7fb9793567599e Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Tue, 16 Jul 2019 16:28:00 -0700
Subject: mm, kprobes: generalize and rename notify_page_fault() as
 kprobe_page_fault()

Architectures which support kprobes have very similar boilerplate around
calling kprobe_fault_handler().  Use a helper function in kprobes.h to
unify them, based on the x86 code.

This changes the behaviour for other architectures when preemption is
enabled.  Previously, they would have disabled preemption while calling
the kprobe handler.  However, preemption would be disabled if this fault
was due to a kprobe, so we know the fault was not due to a kprobe
handler and can simply return failure.

This behaviour was introduced in commit a980c0ef9f6d ("x86/kprobes:
Refactor kprobes_fault() like kprobe_exceptions_notify()")

[anshuman.khandual@arm.com: export kprobe_fault_handler()]
  Link: http://lkml.kernel.org/r/1561133358-8876-1-git-send-email-anshuman.khandual@arm.com
Link: http://lkml.kernel.org/r/1560420444-25737-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mm/fault.c             | 24 +-----------------------
 arch/arm64/mm/fault.c           | 24 +-----------------------
 arch/ia64/mm/fault.c            | 24 +-----------------------
 arch/mips/include/asm/kprobes.h |  1 +
 arch/mips/kernel/kprobes.c      |  2 +-
 arch/powerpc/mm/fault.c         | 23 ++---------------------
 arch/s390/mm/fault.c            | 16 +---------------
 arch/sh/mm/fault.c              | 18 ++----------------
 arch/sparc/mm/fault_64.c        | 16 +---------------
 arch/x86/mm/fault.c             | 21 ++-------------------
 include/linux/kprobes.h         | 19 +++++++++++++++++++
 11 files changed, 32 insertions(+), 156 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 0e417233dad7..890eeaac3cbb 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -27,28 +27,6 @@
 
 #ifdef CONFIG_MMU
 
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int fsr)
-{
-	int ret = 0;
-
-	if (!user_mode(regs)) {
-		/* kprobe_running() needs smp_processor_id() */
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, fsr))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int fsr)
-{
-	return 0;
-}
-#endif
-
 /*
  * This is useful to dump out the page tables associated with
  * 'addr' in mm 'mm'.
@@ -265,7 +243,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	vm_fault_t fault;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
-	if (notify_page_fault(regs, fsr))
+	if (kprobe_page_fault(regs, fsr))
 		return 0;
 
 	tsk = current;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c8c61b1eb479..9568c116ac7f 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -59,28 +59,6 @@ static inline const struct fault_info *esr_to_debug_fault_info(unsigned int esr)
 	return debug_fault_info + DBG_ESR_EVT(esr);
 }
 
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
-{
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, esr))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
-{
-	return 0;
-}
-#endif
-
 static void data_abort_decode(unsigned int esr)
 {
 	pr_alert("Data abort info:\n");
@@ -434,7 +412,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	unsigned long vm_flags = VM_READ | VM_WRITE;
 	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
-	if (notify_page_fault(regs, esr))
+	if (kprobe_page_fault(regs, esr))
 		return 0;
 
 	/*
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 3c3a283d3172..c2f299fe9e04 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -21,28 +21,6 @@
 
 extern int die(char *, struct pt_regs *, long);
 
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs, int trap)
-{
-	int ret = 0;
-
-	if (!user_mode(regs)) {
-		/* kprobe_running() needs smp_processor_id() */
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, trap))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs, int trap)
-{
-	return 0;
-}
-#endif
-
 /*
  * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
  * (inside region 5, on ia64) and that page is present.
@@ -116,7 +94,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	/*
 	 * This is to handle the kprobes on user space access instructions
 	 */
-	if (notify_page_fault(regs, TRAP_BRKPT))
+	if (kprobe_page_fault(regs, TRAP_BRKPT))
 		return;
 
 	if (user_mode(regs))
diff --git a/arch/mips/include/asm/kprobes.h b/arch/mips/include/asm/kprobes.h
index 3cf8e4d5fa28..68b1e5d458cf 100644
--- a/arch/mips/include/asm/kprobes.h
+++ b/arch/mips/include/asm/kprobes.h
@@ -41,6 +41,7 @@ do {									\
 #define kretprobe_blacklist_size 0
 
 void arch_remove_kprobe(struct kprobe *p);
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c
index 81ba1d3c367c..6cfae2411c04 100644
--- a/arch/mips/kernel/kprobes.c
+++ b/arch/mips/kernel/kprobes.c
@@ -398,7 +398,7 @@ out:
 	return 1;
 }
 
-static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 {
 	struct kprobe *cur = kprobe_running();
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index d989592b6fc8..8432c281de92 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -42,26 +42,6 @@
 #include <asm/debug.h>
 #include <asm/kup.h>
 
-static inline bool notify_page_fault(struct pt_regs *regs)
-{
-	bool ret = false;
-
-#ifdef CONFIG_KPROBES
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 11))
-			ret = true;
-		preempt_enable();
-	}
-#endif /* CONFIG_KPROBES */
-
-	if (unlikely(debugger_fault_handler(regs)))
-		ret = true;
-
-	return ret;
-}
-
 /*
  * Check whether the instruction inst is a store using
  * an update addressing form which will update r1.
@@ -461,8 +441,9 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 	int is_write = page_fault_is_write(error_code);
 	vm_fault_t fault, major = 0;
 	bool must_retry = false;
+	bool kprobe_fault = kprobe_page_fault(regs, 11);
 
-	if (notify_page_fault(regs))
+	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
 		return 0;
 
 	if (unlikely(page_fault_is_bad(error_code))) {
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 0ba174f779da..63507662828f 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -67,20 +67,6 @@ static int __init fault_init(void)
 }
 early_initcall(fault_init);
 
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (kprobes_built_in() && !user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 14))
-			ret = 1;
-		preempt_enable();
-	}
-	return ret;
-}
-
 /*
  * Find out which address space caused the exception.
  */
@@ -412,7 +398,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
 	 */
 	clear_pt_regs_flag(regs, PIF_PER_TRAP);
 
-	if (notify_page_fault(regs))
+	if (kprobe_page_fault(regs, 14))
 		return 0;
 
 	mm = tsk->mm;
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 3093bc372138..5f51456f4fc7 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -24,20 +24,6 @@
 #include <asm/tlbflush.h>
 #include <asm/traps.h>
 
-static inline int notify_page_fault(struct pt_regs *regs, int trap)
-{
-	int ret = 0;
-
-	if (kprobes_built_in() && !user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, trap))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address)
 {
@@ -412,14 +398,14 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	if (unlikely(fault_in_kernel_space(address))) {
 		if (vmalloc_fault(address) >= 0)
 			return;
-		if (notify_page_fault(regs, vec))
+		if (kprobe_page_fault(regs, vec))
 			return;
 
 		bad_area_nosemaphore(regs, error_code, address);
 		return;
 	}
 
-	if (unlikely(notify_page_fault(regs, vec)))
+	if (unlikely(kprobe_page_fault(regs, vec)))
 		return;
 
 	/* Only enable interrupts if they were on before the fault */
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 83fda4d9c3b2..2371fb6b97e4 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -38,20 +38,6 @@
 
 int show_unhandled_signals = 1;
 
-static inline __kprobes int notify_page_fault(struct pt_regs *regs)
-{
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (kprobes_built_in() && !user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 0))
-			ret = 1;
-		preempt_enable();
-	}
-	return ret;
-}
-
 static void __kprobes unhandled_fault(unsigned long address,
 				      struct task_struct *tsk,
 				      struct pt_regs *regs)
@@ -285,7 +271,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 
 	fault_code = get_thread_fault_code();
 
-	if (notify_page_fault(regs))
+	if (kprobe_page_fault(regs, 0))
 		goto exit_exception;
 
 	si_code = SEGV_MAPERR;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 794f364cb882..d1634c59ed56 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -46,23 +46,6 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
 	return 0;
 }
 
-static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
-{
-	if (!kprobes_built_in())
-		return 0;
-	if (user_mode(regs))
-		return 0;
-	/*
-	 * To be potentially processing a kprobe fault and to be allowed to call
-	 * kprobe_running(), we have to be non-preemptible.
-	 */
-	if (preemptible())
-		return 0;
-	if (!kprobe_running())
-		return 0;
-	return kprobe_fault_handler(regs, X86_TRAP_PF);
-}
-
 /*
  * Prefetch quirks:
  *
@@ -1282,7 +1265,7 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 		return;
 
 	/* kprobes don't want to hook the spurious faults: */
-	if (kprobes_fault(regs))
+	if (kprobe_page_fault(regs, X86_TRAP_PF))
 		return;
 
 	/*
@@ -1313,7 +1296,7 @@ void do_user_addr_fault(struct pt_regs *regs,
 	mm = tsk->mm;
 
 	/* kprobes don't want to hook the spurious faults: */
-	if (unlikely(kprobes_fault(regs)))
+	if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
 		return;
 
 	/*
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 443d9800ca3f..04bdaf01112c 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -458,4 +458,23 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr)
 }
 #endif
 
+/* Returns true if kprobes handled the fault */
+static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs,
+					      unsigned int trap)
+{
+	if (!kprobes_built_in())
+		return false;
+	if (user_mode(regs))
+		return false;
+	/*
+	 * To be potentially processing a kprobe fault and to be allowed
+	 * to call kprobe_running(), we have to be non-preemptible.
+	 */
+	if (preemptible())
+		return false;
+	if (!kprobe_running())
+		return false;
+	return kprobe_fault_handler(regs, trap);
+}
+
 #endif /* _LINUX_KPROBES_H */
-- 
cgit v1.2.3


From 694a58e29ef27c4c26f103a9decfd053f94dd34c Mon Sep 17 00:00:00 2001
From: Mikko Rapeli <mikko.rapeli@iki.fi>
Date: Tue, 16 Jul 2019 16:28:07 -0700
Subject: uapi linux/coda.h: use __kernel_pid_t for userspace

Part of a patch by Mikko Rapeli, as Arnd Bergman commented on the
original patch.

   pid_t might differ between libc and the kernel, so the kernel
   interface has to use types that the kernel defines.

Link: http://lkml.kernel.org/r/f374a71f4d351bc8c8b3ac18ad7765c88d806d10.1558117389.git.jaharkes@cs.cmu.edu
Signed-off-by: Mikko Rapeli <mikko.rapeli@iki.fi>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/coda.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h
index 695fade33c64..ed8cb263e482 100644
--- a/include/uapi/linux/coda.h
+++ b/include/uapi/linux/coda.h
@@ -295,8 +295,8 @@ struct coda_statfs {
 struct coda_in_hdr {
     u_int32_t opcode;
     u_int32_t unique;	    /* Keep multiple outstanding msgs distinct */
-    pid_t pid;
-    pid_t pgid;
+    __kernel_pid_t pid;
+    __kernel_pid_t pgid;
     vuid_t uid;
 };
 
-- 
cgit v1.2.3


From f90fb3c7e2c13ae829db2274b88b845a75038b8a Mon Sep 17 00:00:00 2001
From: Mikko Rapeli <mikko.rapeli@iki.fi>
Date: Tue, 16 Jul 2019 16:28:10 -0700
Subject: uapi linux/coda_psdev.h: move upc_req definition from uapi to kernel
 side headers

Only users of upc_req in kernel side fs/coda/psdev.c and
fs/coda/upcall.c already include linux/coda_psdev.h.

Suggested by Jan Harkes <jaharkes@cs.cmu.edu> in
  https://lore.kernel.org/lkml/20150531111913.GA23377@cs.cmu.edu/

Fixes these include/uapi/linux/coda_psdev.h compilation errors in userspace:

  linux/coda_psdev.h:12:19: error: field `uc_chain' has incomplete type
  struct list_head    uc_chain;
                   ^
  linux/coda_psdev.h:13:2: error: unknown type name `caddr_t'
  caddr_t             uc_data;
  ^
  linux/coda_psdev.h:14:2: error: unknown type name `u_short'
  u_short             uc_flags;
  ^
  linux/coda_psdev.h:15:2: error: unknown type name `u_short'
  u_short             uc_inSize;  /* Size is at most 5000 bytes */
  ^
  linux/coda_psdev.h:16:2: error: unknown type name `u_short'
  u_short             uc_outSize;
  ^
  linux/coda_psdev.h:17:2: error: unknown type name `u_short'
  u_short             uc_opcode;  /* copied from data to save lookup */
  ^
  linux/coda_psdev.h:19:2: error: unknown type name `wait_queue_head_t'
  wait_queue_head_t   uc_sleep;   /* process' wait queue */
  ^

Link: http://lkml.kernel.org/r/9f99f5ce6a0563d5266e6cf7aa9585aac2cae971.1558117389.git.jaharkes@cs.cmu.edu
Signed-off-by: Mikko Rapeli <mikko.rapeli@iki.fi>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/coda_psdev.h      | 11 +++++++++++
 include/uapi/linux/coda_psdev.h | 13 -------------
 2 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 15170954aa2b..57d2b2faf6a3 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -19,6 +19,17 @@ struct venus_comm {
 	struct mutex	    vc_mutex;
 };
 
+/* messages between coda filesystem in kernel and Venus */
+struct upc_req {
+	struct list_head	uc_chain;
+	caddr_t			uc_data;
+	u_short			uc_flags;
+	u_short			uc_inSize;  /* Size is at most 5000 bytes */
+	u_short			uc_outSize;
+	u_short			uc_opcode;  /* copied from data to save lookup */
+	int			uc_unique;
+	wait_queue_head_t	uc_sleep;   /* process' wait queue */
+};
 
 static inline struct venus_comm *coda_vcp(struct super_block *sb)
 {
diff --git a/include/uapi/linux/coda_psdev.h b/include/uapi/linux/coda_psdev.h
index aa6623efd2dd..d50d51a57fe4 100644
--- a/include/uapi/linux/coda_psdev.h
+++ b/include/uapi/linux/coda_psdev.h
@@ -7,19 +7,6 @@
 #define CODA_PSDEV_MAJOR 67
 #define MAX_CODADEVS  5	   /* how many do we allow */
 
-
-/* messages between coda filesystem in kernel and Venus */
-struct upc_req {
-	struct list_head    uc_chain;
-	caddr_t	            uc_data;
-	u_short	            uc_flags;
-	u_short             uc_inSize;  /* Size is at most 5000 bytes */
-	u_short	            uc_outSize;
-	u_short	            uc_opcode;  /* copied from data to save lookup */
-	int		    uc_unique;
-	wait_queue_head_t   uc_sleep;   /* process' wait queue */
-};
-
 #define CODA_REQ_ASYNC  0x1
 #define CODA_REQ_READ   0x2
 #define CODA_REQ_WRITE  0x4
-- 
cgit v1.2.3


From 6e51f8aa76b67d0a6eb168fd41a81e8478ae07a9 Mon Sep 17 00:00:00 2001
From: Jan Harkes <jaharkes@cs.cmu.edu>
Date: Tue, 16 Jul 2019 16:28:16 -0700
Subject: coda: potential buffer overflow in coda_psdev_write()

Add checks to make sure the downcall message we got from the Coda cache
manager is large enough to contain the data it is supposed to have.
i.e.  when we get a CODA_ZAPDIR we can access &out->coda_zapdir.CodaFid.

Link: http://lkml.kernel.org/r/894fb6b250add09e4e3935f14649f21284a5cb18.1558117389.git.jaharkes@cs.cmu.edu
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/psdev.c            |  8 ++++++--
 fs/coda/upcall.c           | 34 +++++++++++++++++++++++++++++++++-
 include/linux/coda_psdev.h |  3 ++-
 3 files changed, 41 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 241f7e04ad04..b4da2812499e 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -100,8 +100,12 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 	ssize_t retval = 0, count = 0;
 	int error;
 
+	/* make sure there is enough to copy out the (opcode, unique) values */
+	if (nbytes < (2 * sizeof(u_int32_t)))
+		return -EINVAL;
+
         /* Peek at the opcode, uniquefier */
-	if (copy_from_user(&hdr, buf, 2 * sizeof(u_long)))
+	if (copy_from_user(&hdr, buf, 2 * sizeof(u_int32_t)))
 	        return -EFAULT;
 
         if (DOWNCALL(hdr.opcode)) {
@@ -127,7 +131,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 		}
 
 		/* what downcall errors does Venus handle ? */
-		error = coda_downcall(vcp, hdr.opcode, dcbuf);
+		error = coda_downcall(vcp, hdr.opcode, dcbuf, nbytes);
 
 		CODA_FREE(dcbuf, nbytes);
 		if (error) {
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 1175a1722411..cf1e662681a5 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -804,12 +804,44 @@ exit:
  *
  * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
 
-int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out,
+		  size_t nbytes)
 {
 	struct inode *inode = NULL;
 	struct CodaFid *fid = NULL, *newfid;
 	struct super_block *sb;
 
+	/*
+	 * Make sure we have received enough data from the cache
+	 * manager to populate the necessary fields in the buffer
+	 */
+	switch (opcode) {
+	case CODA_PURGEUSER:
+		if (nbytes < sizeof(struct coda_purgeuser_out))
+			return -EINVAL;
+		break;
+
+	case CODA_ZAPDIR:
+		if (nbytes < sizeof(struct coda_zapdir_out))
+			return -EINVAL;
+		break;
+
+	case CODA_ZAPFILE:
+		if (nbytes < sizeof(struct coda_zapfile_out))
+			return -EINVAL;
+		break;
+
+	case CODA_PURGEFID:
+		if (nbytes < sizeof(struct coda_purgefid_out))
+			return -EINVAL;
+		break;
+
+	case CODA_REPLACE:
+		if (nbytes < sizeof(struct coda_replace_out))
+			return -EINVAL;
+		break;
+	}
+
 	/* Handle invalidation requests. */
 	mutex_lock(&vcp->vc_mutex);
 	sb = vcp->vc_sb;
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 57d2b2faf6a3..d1672fd5e638 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -71,7 +71,8 @@ int venus_symlink(struct super_block *sb, struct CodaFid *fid,
 int venus_access(struct super_block *sb, struct CodaFid *fid, int mask);
 int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
 		 unsigned int cmd, struct PioctlData *data);
-int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out);
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out,
+		  size_t nbytes);
 int venus_fsync(struct super_block *sb, struct CodaFid *fid);
 int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
 
-- 
cgit v1.2.3


From b2a57e334086602be56b74958d9f29b955cd157f Mon Sep 17 00:00:00 2001
From: Sam Protsenko <semen.protsenko@linaro.org>
Date: Tue, 16 Jul 2019 16:28:20 -0700
Subject: coda: fix build using bare-metal toolchain

The kernel is self-contained project and can be built with bare-metal
toolchain.  But bare-metal toolchain doesn't define __linux__.  Because
of this u_quad_t type is not defined when using bare-metal toolchain and
codafs build fails.  This patch fixes it by defining u_quad_t type
unconditionally.

Link: http://lkml.kernel.org/r/3cbb40b0a57b6f9923a9d67b53473c0b691a3eaa.1558117389.git.jaharkes@cs.cmu.edu
Signed-off-by: Sam Protsenko <semen.protsenko@linaro.org>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/coda.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/coda.h b/include/linux/coda.h
index d30209b9cef8..0ca0c83fdb1c 100644
--- a/include/linux/coda.h
+++ b/include/linux/coda.h
@@ -58,8 +58,7 @@ Mellon the rights to redistribute these changes without encumbrance.
 #ifndef _CODA_HEADER_
 #define _CODA_HEADER_
 
-#if defined(__linux__)
 typedef unsigned long long u_quad_t;
-#endif
+
 #include <uapi/linux/coda.h>
 #endif 
-- 
cgit v1.2.3


From 2fe7491d219428a32f09948e88bfaf8e71b9a66b Mon Sep 17 00:00:00 2001
From: Jan Harkes <jaharkes@cs.cmu.edu>
Date: Tue, 16 Jul 2019 16:28:26 -0700
Subject: uapi linux/coda_psdev.h: move CODA_REQ_ from uapi to kernel side
 headers

These constants only used internally and not exposed to userspace.

Link: http://lkml.kernel.org/r/baeafc30dad70d8b422ee679420099c2d8aa7da0.1558117389.git.jaharkes@cs.cmu.edu
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/coda_psdev.h      | 5 +++++
 include/uapi/linux/coda_psdev.h | 5 -----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index d1672fd5e638..9487f792770c 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -31,6 +31,11 @@ struct upc_req {
 	wait_queue_head_t	uc_sleep;   /* process' wait queue */
 };
 
+#define CODA_REQ_ASYNC  0x1
+#define CODA_REQ_READ   0x2
+#define CODA_REQ_WRITE  0x4
+#define CODA_REQ_ABORT  0x8
+
 static inline struct venus_comm *coda_vcp(struct super_block *sb)
 {
 	return (struct venus_comm *)((sb)->s_fs_info);
diff --git a/include/uapi/linux/coda_psdev.h b/include/uapi/linux/coda_psdev.h
index d50d51a57fe4..3dacb7fad66a 100644
--- a/include/uapi/linux/coda_psdev.h
+++ b/include/uapi/linux/coda_psdev.h
@@ -7,9 +7,4 @@
 #define CODA_PSDEV_MAJOR 67
 #define MAX_CODADEVS  5	   /* how many do we allow */
 
-#define CODA_REQ_ASYNC  0x1
-#define CODA_REQ_READ   0x2
-#define CODA_REQ_WRITE  0x4
-#define CODA_REQ_ABORT  0x8
-
 #endif /* _UAPI__CODA_PSDEV_H */
-- 
cgit v1.2.3


From 6ced9aa7b56baeb241a715df4539e60d5e3118e2 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 16 Jul 2019 16:28:32 -0700
Subject: coda: stop using 'struct timespec' in user API

We exchange file timestamps with user space using psdev device
read/write operations with a fixed but architecture specific binary
layout.

On 32-bit systems, this uses a 'timespec' structure that is defined by
the C library to contain two 32-bit values for seconds and nanoseconds.
As we get ready for the year 2038 overflow of the 32-bit signed seconds,
the kernel now uses 64-bit timestamps internally, and user space will do
the same change by changing the 'timespec' definition in the future.

Unfortunately, this breaks the layout of the coda_vattr structure, so we
need to redefine that in terms of something that does not change.  I'm
introducing a new 'struct vtimespec' structure here that keeps the
existing layout, and the same change has to be done in the coda user
space copy of linux/coda.h before anyone can use that on a 32-bit
architecture with 64-bit time_t.

An open question is what should happen to actual times past y2038, as
they are now truncated to the last valid date when sent to user space,
and interpreted as pre-1970 times when a timestamp with the MSB set is
read back into the kernel.  Alternatively, we could change the new
timespec64_to_coda()/coda_to_timespec64() functions to use a different
interpretation and extend the available range further to the future by
disallowing past timestamps.  This would require more changes in the
user space side though.

Link: http://lkml.kernel.org/r/562b7324149461743e4fbe2fedbf7c242f7e274a.1558117389.git.jaharkes@cs.cmu.edu
Link: https://patchwork.kernel.org/patch/10474735/
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Acked-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/coda.txt | 11 ++++++---
 fs/coda/coda_linux.c               | 50 +++++++++++++++++++++++++++++---------
 include/uapi/linux/coda.h          | 20 ++++++++++++---
 3 files changed, 62 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt
index 61311356025d..ea5969068895 100644
--- a/Documentation/filesystems/coda.txt
+++ b/Documentation/filesystems/coda.txt
@@ -481,7 +481,10 @@ kernel support.
 
 
-
+  struct vtimespec {
+          long            tv_sec;         /* seconds */
+          long            tv_nsec;        /* nanoseconds */
+  };
 
   struct coda_vattr {
           enum coda_vtype va_type;        /* vnode type (for create) */
@@ -493,9 +496,9 @@ kernel support.
           long            va_fileid;      /* file id */
           u_quad_t        va_size;        /* file size in bytes */
           long            va_blocksize;   /* blocksize preferred for i/o */
-          struct timespec va_atime;       /* time of last access */
-          struct timespec va_mtime;       /* time of last modification */
-          struct timespec va_ctime;       /* time file changed */
+          struct vtimespec va_atime;      /* time of last access */
+          struct vtimespec va_mtime;      /* time of last modification */
+          struct vtimespec va_ctime;      /* time file changed */
           u_long          va_gen;         /* generation number of file */
           u_long          va_flags;       /* flags defined for file */
           dev_t           va_rdev;        /* device special file represents */
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index f3d543dd9a98..8addcd166908 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -66,6 +66,32 @@ unsigned short coda_flags_to_cflags(unsigned short flags)
 	return coda_flags;
 }
 
+static struct timespec64 coda_to_timespec64(struct vtimespec ts)
+{
+	/*
+	 * We interpret incoming timestamps as 'signed' to match traditional
+	 * usage and support pre-1970 timestamps, but this breaks in y2038
+	 * on 32-bit machines.
+	 */
+	struct timespec64 ts64 = {
+		.tv_sec = ts.tv_sec,
+		.tv_nsec = ts.tv_nsec,
+	};
+
+	return ts64;
+}
+
+static struct vtimespec timespec64_to_coda(struct timespec64 ts64)
+{
+	/* clamp the timestamps to the maximum range rather than wrapping */
+	struct vtimespec ts = {
+		.tv_sec = lower_32_bits(clamp_t(time64_t, ts64.tv_sec,
+						LONG_MIN, LONG_MAX)),
+		.tv_nsec = ts64.tv_nsec,
+	};
+
+	return ts;
+}
 
 /* utility functions below */
 void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
@@ -105,11 +131,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
 	if (attr->va_size != -1)
 		inode->i_blocks = (attr->va_size + 511) >> 9;
 	if (attr->va_atime.tv_sec != -1) 
-		inode->i_atime = timespec_to_timespec64(attr->va_atime);
+		inode->i_atime = coda_to_timespec64(attr->va_atime);
 	if (attr->va_mtime.tv_sec != -1)
-		inode->i_mtime = timespec_to_timespec64(attr->va_mtime);
+		inode->i_mtime = coda_to_timespec64(attr->va_mtime);
         if (attr->va_ctime.tv_sec != -1)
-		inode->i_ctime = timespec_to_timespec64(attr->va_ctime);
+		inode->i_ctime = coda_to_timespec64(attr->va_ctime);
 }
 
 
@@ -130,12 +156,12 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr)
         vattr->va_uid = (vuid_t) -1; 
         vattr->va_gid = (vgid_t) -1;
         vattr->va_size = (off_t) -1;
-	vattr->va_atime.tv_sec = (time_t) -1;
-	vattr->va_atime.tv_nsec =  (time_t) -1;
-        vattr->va_mtime.tv_sec = (time_t) -1;
-        vattr->va_mtime.tv_nsec = (time_t) -1;
-	vattr->va_ctime.tv_sec = (time_t) -1;
-	vattr->va_ctime.tv_nsec = (time_t) -1;
+	vattr->va_atime.tv_sec = (long) -1;
+	vattr->va_atime.tv_nsec = (long) -1;
+	vattr->va_mtime.tv_sec = (long) -1;
+	vattr->va_mtime.tv_nsec = (long) -1;
+	vattr->va_ctime.tv_sec = (long) -1;
+	vattr->va_ctime.tv_nsec = (long) -1;
         vattr->va_type = C_VNON;
 	vattr->va_fileid = -1;
 	vattr->va_gen = -1;
@@ -175,13 +201,13 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr)
                 vattr->va_size = iattr->ia_size;
 	}
         if ( valid & ATTR_ATIME ) {
-		vattr->va_atime = timespec64_to_timespec(iattr->ia_atime);
+		vattr->va_atime = timespec64_to_coda(iattr->ia_atime);
 	}
         if ( valid & ATTR_MTIME ) {
-		vattr->va_mtime = timespec64_to_timespec(iattr->ia_mtime);
+		vattr->va_mtime = timespec64_to_coda(iattr->ia_mtime);
 	}
         if ( valid & ATTR_CTIME ) {
-		vattr->va_ctime = timespec64_to_timespec(iattr->ia_ctime);
+		vattr->va_ctime = timespec64_to_coda(iattr->ia_ctime);
 	}
 }
 
diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h
index ed8cb263e482..fc5f7874208a 100644
--- a/include/uapi/linux/coda.h
+++ b/include/uapi/linux/coda.h
@@ -211,6 +211,20 @@ struct CodaFid {
  */
 enum coda_vtype	{ C_VNON, C_VREG, C_VDIR, C_VBLK, C_VCHR, C_VLNK, C_VSOCK, C_VFIFO, C_VBAD };
 
+#ifdef __linux__
+/*
+ * This matches the traditional Linux 'timespec' structure binary layout,
+ * before using 64-bit time_t everywhere. Overflows in y2038 on 32-bit
+ * architectures.
+ */
+struct vtimespec {
+	long		tv_sec;		/* seconds */
+	long		tv_nsec;	/* nanoseconds */
+};
+#else
+#define vtimespec timespec
+#endif
+
 struct coda_vattr {
 	long     	va_type;	/* vnode type (for create) */
 	u_short		va_mode;	/* files access mode and type */
@@ -220,9 +234,9 @@ struct coda_vattr {
 	long		va_fileid;	/* file id */
 	u_quad_t	va_size;	/* file size in bytes */
 	long		va_blocksize;	/* blocksize preferred for i/o */
-	struct timespec	va_atime;	/* time of last access */
-	struct timespec	va_mtime;	/* time of last modification */
-	struct timespec	va_ctime;	/* time file changed */
+	struct vtimespec va_atime;	/* time of last access */
+	struct vtimespec va_mtime;	/* time of last modification */
+	struct vtimespec va_ctime;	/* time file changed */
 	u_long		va_gen;		/* generation number of file */
 	u_long		va_flags;	/* flags defined for file */
 	cdev_t	        va_rdev;	/* device special file represents */
-- 
cgit v1.2.3


From 5e7c31dfe74703f428220384b2863525957cc160 Mon Sep 17 00:00:00 2001
From: Jan Harkes <jaharkes@cs.cmu.edu>
Date: Tue, 16 Jul 2019 16:28:35 -0700
Subject: coda: change Coda's user api to use 64-bit time_t in timespec

Move the 32-bit time_t problems to userspace.

Link: http://lkml.kernel.org/r/8d089068823bfb292a4020f773922fbd82ffad39.1558117389.git.jaharkes@cs.cmu.edu
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/coda.txt | 10 +++++-----
 fs/coda/coda_linux.c               | 21 +++++++--------------
 include/uapi/linux/coda.h          | 33 +++++++--------------------------
 3 files changed, 19 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt
index ea5969068895..545262c167c3 100644
--- a/Documentation/filesystems/coda.txt
+++ b/Documentation/filesystems/coda.txt
@@ -481,8 +481,8 @@ kernel support.
 
 
-  struct vtimespec {
-          long            tv_sec;         /* seconds */
+  struct coda_timespec {
+          int64_t         tv_sec;         /* seconds */
           long            tv_nsec;        /* nanoseconds */
   };
 
@@ -496,9 +496,9 @@ kernel support.
           long            va_fileid;      /* file id */
           u_quad_t        va_size;        /* file size in bytes */
           long            va_blocksize;   /* blocksize preferred for i/o */
-          struct vtimespec va_atime;      /* time of last access */
-          struct vtimespec va_mtime;      /* time of last modification */
-          struct vtimespec va_ctime;      /* time file changed */
+          struct coda_timespec va_atime;  /* time of last access */
+          struct coda_timespec va_mtime;  /* time of last modification */
+          struct coda_timespec va_ctime;  /* time file changed */
           u_long          va_gen;         /* generation number of file */
           u_long          va_flags;       /* flags defined for file */
           dev_t           va_rdev;        /* device special file represents */
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 8addcd166908..e4b5f02f0dd4 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -66,13 +66,8 @@ unsigned short coda_flags_to_cflags(unsigned short flags)
 	return coda_flags;
 }
 
-static struct timespec64 coda_to_timespec64(struct vtimespec ts)
+static struct timespec64 coda_to_timespec64(struct coda_timespec ts)
 {
-	/*
-	 * We interpret incoming timestamps as 'signed' to match traditional
-	 * usage and support pre-1970 timestamps, but this breaks in y2038
-	 * on 32-bit machines.
-	 */
 	struct timespec64 ts64 = {
 		.tv_sec = ts.tv_sec,
 		.tv_nsec = ts.tv_nsec,
@@ -81,12 +76,10 @@ static struct timespec64 coda_to_timespec64(struct vtimespec ts)
 	return ts64;
 }
 
-static struct vtimespec timespec64_to_coda(struct timespec64 ts64)
+static struct coda_timespec timespec64_to_coda(struct timespec64 ts64)
 {
-	/* clamp the timestamps to the maximum range rather than wrapping */
-	struct vtimespec ts = {
-		.tv_sec = lower_32_bits(clamp_t(time64_t, ts64.tv_sec,
-						LONG_MIN, LONG_MAX)),
+	struct coda_timespec ts = {
+		.tv_sec = ts64.tv_sec,
 		.tv_nsec = ts64.tv_nsec,
 	};
 
@@ -156,11 +149,11 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr)
         vattr->va_uid = (vuid_t) -1; 
         vattr->va_gid = (vgid_t) -1;
         vattr->va_size = (off_t) -1;
-	vattr->va_atime.tv_sec = (long) -1;
+	vattr->va_atime.tv_sec = (int64_t) -1;
 	vattr->va_atime.tv_nsec = (long) -1;
-	vattr->va_mtime.tv_sec = (long) -1;
+	vattr->va_mtime.tv_sec = (int64_t) -1;
 	vattr->va_mtime.tv_nsec = (long) -1;
-	vattr->va_ctime.tv_sec = (long) -1;
+	vattr->va_ctime.tv_sec = (int64_t) -1;
 	vattr->va_ctime.tv_nsec = (long) -1;
         vattr->va_type = C_VNON;
 	vattr->va_fileid = -1;
diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h
index fc5f7874208a..5dba636b6e11 100644
--- a/include/uapi/linux/coda.h
+++ b/include/uapi/linux/coda.h
@@ -86,10 +86,6 @@ typedef unsigned long long u_quad_t;
 
 #define inline
 
-struct timespec {
-        long       ts_sec;
-        long       ts_nsec;
-};
 #else  /* DJGPP but not KERNEL */
 #include <sys/time.h>
 typedef unsigned long long u_quad_t;
@@ -110,13 +106,6 @@ typedef unsigned long long u_quad_t;
 #define cdev_t dev_t
 #endif
 
-#ifdef __CYGWIN32__
-struct timespec {
-        time_t  tv_sec;         /* seconds */
-        long    tv_nsec;        /* nanoseconds */
-};
-#endif
-
 #ifndef __BIT_TYPES_DEFINED__
 #define __BIT_TYPES_DEFINED__
 typedef signed char	      int8_t;
@@ -211,19 +200,10 @@ struct CodaFid {
  */
 enum coda_vtype	{ C_VNON, C_VREG, C_VDIR, C_VBLK, C_VCHR, C_VLNK, C_VSOCK, C_VFIFO, C_VBAD };
 
-#ifdef __linux__
-/*
- * This matches the traditional Linux 'timespec' structure binary layout,
- * before using 64-bit time_t everywhere. Overflows in y2038 on 32-bit
- * architectures.
- */
-struct vtimespec {
-	long		tv_sec;		/* seconds */
+struct coda_timespec {
+	int64_t		tv_sec;		/* seconds */
 	long		tv_nsec;	/* nanoseconds */
 };
-#else
-#define vtimespec timespec
-#endif
 
 struct coda_vattr {
 	long     	va_type;	/* vnode type (for create) */
@@ -234,9 +214,9 @@ struct coda_vattr {
 	long		va_fileid;	/* file id */
 	u_quad_t	va_size;	/* file size in bytes */
 	long		va_blocksize;	/* blocksize preferred for i/o */
-	struct vtimespec va_atime;	/* time of last access */
-	struct vtimespec va_mtime;	/* time of last modification */
-	struct vtimespec va_ctime;	/* time file changed */
+	struct coda_timespec va_atime;	/* time of last access */
+	struct coda_timespec va_mtime;	/* time of last modification */
+	struct coda_timespec va_ctime;	/* time file changed */
 	u_long		va_gen;		/* generation number of file */
 	u_long		va_flags;	/* flags defined for file */
 	cdev_t	        va_rdev;	/* device special file represents */
@@ -301,7 +281,8 @@ struct coda_statfs {
 
 #define CIOC_KERNEL_VERSION _IOWR('c', 10, size_t)
 
-#define CODA_KERNEL_VERSION 3 /* 128-bit file identifiers */
+//      CODA_KERNEL_VERSION 3 /* 128-bit file identifiers */
+#define CODA_KERNEL_VERSION 4 /* 64-bit timespec */
 
 /*
  *        Venus <-> Coda  RPC arguments
-- 
cgit v1.2.3


From 8fc8b9df831387e0d02c1d0f5bb53d327e0d477a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 16 Jul 2019 16:28:47 -0700
Subject: coda: move internal defs out of include/linux/ [ver #2]

Move include/linux/coda_psdev.h to fs/coda/ as there's nothing else that
uses it.

Link: http://lkml.kernel.org/r/3ceeee0415a929b89fb02700b6b4b3a07938acb8.1558117389.git.jaharkes@cs.cmu.edu
Link: https://patchwork.kernel.org/patch/10590257/
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/cache.c            |  2 +-
 fs/coda/cnode.c            |  2 +-
 fs/coda/coda_linux.c       |  2 +-
 fs/coda/coda_psdev.h       | 89 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/coda/dir.c              |  2 +-
 fs/coda/file.c             |  3 +-
 fs/coda/inode.c            |  2 +-
 fs/coda/pioctl.c           |  3 +-
 fs/coda/psdev.c            |  3 +-
 fs/coda/symlink.c          |  3 +-
 fs/coda/upcall.c           |  2 +-
 include/linux/coda_psdev.h | 89 ----------------------------------------------
 12 files changed, 99 insertions(+), 103 deletions(-)
 create mode 100644 fs/coda/coda_psdev.h
 delete mode 100644 include/linux/coda_psdev.h

(limited to 'include')

diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 201fc08a8b4f..3b8c4513118f 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -21,7 +21,7 @@
 #include <linux/spinlock.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 845b5a66952a..2e5badf67f98 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -8,8 +8,8 @@
 #include <linux/time.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
 #include <linux/pagemap.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index e4b5f02f0dd4..2e1a5a192074 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -18,7 +18,7 @@
 #include <linux/string.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 /* initialize the debugging variables */
diff --git a/fs/coda/coda_psdev.h b/fs/coda/coda_psdev.h
new file mode 100644
index 000000000000..012e16f741a6
--- /dev/null
+++ b/fs/coda/coda_psdev.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __CODA_PSDEV_H
+#define __CODA_PSDEV_H
+
+#include <linux/backing-dev.h>
+#include <linux/mutex.h>
+#include <linux/coda_psdev.h>
+
+struct kstatfs;
+
+/* messages between coda filesystem in kernel and Venus */
+struct upc_req {
+	struct list_head	uc_chain;
+	caddr_t			uc_data;
+	u_short			uc_flags;
+	u_short			uc_inSize;  /* Size is at most 5000 bytes */
+	u_short			uc_outSize;
+	u_short			uc_opcode;  /* copied from data to save lookup */
+	int			uc_unique;
+	wait_queue_head_t	uc_sleep;   /* process' wait queue */
+};
+
+#define CODA_REQ_ASYNC  0x1
+#define CODA_REQ_READ   0x2
+#define CODA_REQ_WRITE  0x4
+#define CODA_REQ_ABORT  0x8
+
+/* communication pending/processing queues */
+struct venus_comm {
+	u_long		    vc_seq;
+	wait_queue_head_t   vc_waitq; /* Venus wait queue */
+	struct list_head    vc_pending;
+	struct list_head    vc_processing;
+	int                 vc_inuse;
+	struct super_block *vc_sb;
+	struct mutex	    vc_mutex;
+};
+
+static inline struct venus_comm *coda_vcp(struct super_block *sb)
+{
+	return (struct venus_comm *)((sb)->s_fs_info);
+}
+
+/* upcalls */
+int venus_rootfid(struct super_block *sb, struct CodaFid *fidp);
+int venus_getattr(struct super_block *sb, struct CodaFid *fid,
+		  struct coda_vattr *attr);
+int venus_setattr(struct super_block *, struct CodaFid *, struct coda_vattr *);
+int venus_lookup(struct super_block *sb, struct CodaFid *fid,
+		 const char *name, int length, int *type,
+		 struct CodaFid *resfid);
+int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
+		kuid_t uid);
+int venus_open(struct super_block *sb, struct CodaFid *fid, int flags,
+	       struct file **f);
+int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid,
+		const char *name, int length,
+		struct CodaFid *newfid, struct coda_vattr *attrs);
+int venus_create(struct super_block *sb, struct CodaFid *dirfid,
+		 const char *name, int length, int excl, int mode,
+		 struct CodaFid *newfid, struct coda_vattr *attrs);
+int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid,
+		const char *name, int length);
+int venus_remove(struct super_block *sb, struct CodaFid *dirfid,
+		 const char *name, int length);
+int venus_readlink(struct super_block *sb, struct CodaFid *fid,
+		   char *buffer, int *length);
+int venus_rename(struct super_block *sb, struct CodaFid *new_fid,
+		 struct CodaFid *old_fid, size_t old_length,
+		 size_t new_length, const char *old_name,
+		 const char *new_name);
+int venus_link(struct super_block *sb, struct CodaFid *fid,
+		  struct CodaFid *dirfid, const char *name, int len );
+int venus_symlink(struct super_block *sb, struct CodaFid *fid,
+		  const char *name, int len, const char *symname, int symlen);
+int venus_access(struct super_block *sb, struct CodaFid *fid, int mask);
+int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
+		 unsigned int cmd, struct PioctlData *data);
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out,
+		  size_t nbytes);
+int venus_fsync(struct super_block *sb, struct CodaFid *fid);
+int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
+
+/*
+ * Statistics
+ */
+
+extern struct venus_comm coda_comms[];
+#endif
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 7e103eb8ffcd..716a0b932ec0 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -23,7 +23,7 @@
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 43d371551d2b..a6b32c883a50 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -22,8 +22,7 @@
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_int.h"
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 23f6ebd08e80..96d832ed23b5 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -27,7 +27,7 @@
 #include <linux/vmalloc.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index e0c17b7dccce..644d48c12ce8 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -20,8 +20,7 @@
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 /* pioctl ops */
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index e80bda1de6c5..0a61e949a430 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -38,8 +38,7 @@
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 #include "coda_int.h"
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 202297d156df..8907d0508198 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -17,8 +17,7 @@
 #include <linux/pagemap.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 static int coda_symlink_filler(struct file *file, struct page *page)
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 1e2f50722107..eb8cc30f2589 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -33,7 +33,7 @@
 #include <linux/vfs.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
deleted file mode 100644
index 9487f792770c..000000000000
--- a/include/linux/coda_psdev.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __CODA_PSDEV_H
-#define __CODA_PSDEV_H
-
-#include <linux/backing-dev.h>
-#include <linux/mutex.h>
-#include <uapi/linux/coda_psdev.h>
-
-struct kstatfs;
-
-/* communication pending/processing queues */
-struct venus_comm {
-	u_long		    vc_seq;
-	wait_queue_head_t   vc_waitq; /* Venus wait queue */
-	struct list_head    vc_pending;
-	struct list_head    vc_processing;
-	int                 vc_inuse;
-	struct super_block *vc_sb;
-	struct mutex	    vc_mutex;
-};
-
-/* messages between coda filesystem in kernel and Venus */
-struct upc_req {
-	struct list_head	uc_chain;
-	caddr_t			uc_data;
-	u_short			uc_flags;
-	u_short			uc_inSize;  /* Size is at most 5000 bytes */
-	u_short			uc_outSize;
-	u_short			uc_opcode;  /* copied from data to save lookup */
-	int			uc_unique;
-	wait_queue_head_t	uc_sleep;   /* process' wait queue */
-};
-
-#define CODA_REQ_ASYNC  0x1
-#define CODA_REQ_READ   0x2
-#define CODA_REQ_WRITE  0x4
-#define CODA_REQ_ABORT  0x8
-
-static inline struct venus_comm *coda_vcp(struct super_block *sb)
-{
-	return (struct venus_comm *)((sb)->s_fs_info);
-}
-
-/* upcalls */
-int venus_rootfid(struct super_block *sb, struct CodaFid *fidp);
-int venus_getattr(struct super_block *sb, struct CodaFid *fid,
-		  struct coda_vattr *attr);
-int venus_setattr(struct super_block *, struct CodaFid *, struct coda_vattr *);
-int venus_lookup(struct super_block *sb, struct CodaFid *fid, 
-		 const char *name, int length, int *type, 
-		 struct CodaFid *resfid);
-int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
-		kuid_t uid);
-int venus_open(struct super_block *sb, struct CodaFid *fid, int flags,
-	       struct file **f);
-int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid, 
-		const char *name, int length, 
-		struct CodaFid *newfid, struct coda_vattr *attrs);
-int venus_create(struct super_block *sb, struct CodaFid *dirfid, 
-		 const char *name, int length, int excl, int mode,
-		 struct CodaFid *newfid, struct coda_vattr *attrs) ;
-int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid, 
-		const char *name, int length);
-int venus_remove(struct super_block *sb, struct CodaFid *dirfid, 
-		 const char *name, int length);
-int venus_readlink(struct super_block *sb, struct CodaFid *fid, 
-		   char *buffer, int *length);
-int venus_rename(struct super_block *, struct CodaFid *new_fid, 
-		 struct CodaFid *old_fid, size_t old_length, 
-		 size_t new_length, const char *old_name, 
-		 const char *new_name);
-int venus_link(struct super_block *sb, struct CodaFid *fid, 
-		  struct CodaFid *dirfid, const char *name, int len );
-int venus_symlink(struct super_block *sb, struct CodaFid *fid,
-		  const char *name, int len, const char *symname, int symlen);
-int venus_access(struct super_block *sb, struct CodaFid *fid, int mask);
-int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
-		 unsigned int cmd, struct PioctlData *data);
-int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out,
-		  size_t nbytes);
-int venus_fsync(struct super_block *sb, struct CodaFid *fid);
-int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
-
-/*
- * Statistics
- */
-
-extern struct venus_comm coda_comms[];
-#endif
-- 
cgit v1.2.3


From 6dc280ebeed2c96a2fb933103dafe655a922b9c1 Mon Sep 17 00:00:00 2001
From: Jan Harkes <jaharkes@cs.cmu.edu>
Date: Tue, 16 Jul 2019 16:28:51 -0700
Subject: coda: remove uapi/linux/coda_psdev.h

Nothing is left in this header that is used by userspace.

Link: http://lkml.kernel.org/r/bb11378cef94739f2cf89425dd6d302a52c64480.1558117389.git.jaharkes@cs.cmu.edu
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Zhouyang Jia <jiazhouyang09@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/coda_psdev.h            |  5 ++++-
 include/uapi/linux/coda_psdev.h | 10 ----------
 2 files changed, 4 insertions(+), 11 deletions(-)
 delete mode 100644 include/uapi/linux/coda_psdev.h

(limited to 'include')

diff --git a/fs/coda/coda_psdev.h b/fs/coda/coda_psdev.h
index 012e16f741a6..801423cbbdfc 100644
--- a/fs/coda/coda_psdev.h
+++ b/fs/coda/coda_psdev.h
@@ -3,8 +3,11 @@
 #define __CODA_PSDEV_H
 
 #include <linux/backing-dev.h>
+#include <linux/magic.h>
 #include <linux/mutex.h>
-#include <linux/coda_psdev.h>
+
+#define CODA_PSDEV_MAJOR 67
+#define MAX_CODADEVS  5	   /* how many do we allow */
 
 struct kstatfs;
 
diff --git a/include/uapi/linux/coda_psdev.h b/include/uapi/linux/coda_psdev.h
deleted file mode 100644
index 3dacb7fad66a..000000000000
--- a/include/uapi/linux/coda_psdev.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI__CODA_PSDEV_H
-#define _UAPI__CODA_PSDEV_H
-
-#include <linux/magic.h>
-
-#define CODA_PSDEV_MAJOR 67
-#define MAX_CODADEVS  5	   /* how many do we allow */
-
-#endif /* _UAPI__CODA_PSDEV_H */
-- 
cgit v1.2.3


From a9fba24c6ac9b66c09dfc2a0e845ecace187e89c Mon Sep 17 00:00:00 2001
From: Pedro Cuadra <pjcuadra@gmail.com>
Date: Tue, 16 Jul 2019 16:29:13 -0700
Subject: coda: add hinting support for partial file caching

This adds support for partial file caching in Coda.  Every read, write
and mmap informs the userspace cache manager about what part of a file
is about to be accessed so that the cache manager can ensure the
relevant parts are available before the operation is allowed to proceed.

When a read or write operation completes, this is also reported to allow
the cache manager to track when partially cached content can be
released.

If the cache manager does not support partial file caching, or when the
entire file has been fetched into the local cache, the cache manager may
return an EOPNOTSUPP error to indicate that intent upcalls are no longer
necessary until the file is closed.

[akpm@linux-foundation.org: little whitespace fixup]
Link: http://lkml.kernel.org/r/20190618181301.6960-1-jaharkes@cs.cmu.edu
Signed-off-by: Pedro Cuadra <pjcuadra@gmail.com>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/coda_fs_i.h       |  1 +
 fs/coda/coda_psdev.h      |  3 ++
 fs/coda/file.c            | 61 +++++++++++++++++++++++++++++++++--------
 fs/coda/psdev.c           |  2 +-
 fs/coda/upcall.c          | 70 +++++++++++++++++++++++++++++++++++++++--------
 include/uapi/linux/coda.h | 29 ++++++++++++++++++--
 6 files changed, 139 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index c99d574d1c43..1763ff95d865 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -40,6 +40,7 @@ struct coda_file_info {
 	int		   cfi_magic;	  /* magic number */
 	struct file	  *cfi_container; /* container file for this cnode */
 	unsigned int	   cfi_mapcount;  /* nr of times this file is mapped */
+	bool		   cfi_access_intent; /* is access intent supported */
 };
 
 /* flags */
diff --git a/fs/coda/coda_psdev.h b/fs/coda/coda_psdev.h
index 801423cbbdfc..52da08c770b0 100644
--- a/fs/coda/coda_psdev.h
+++ b/fs/coda/coda_psdev.h
@@ -83,6 +83,9 @@ int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out,
 		  size_t nbytes);
 int venus_fsync(struct super_block *sb, struct CodaFid *fid);
 int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
+int venus_access_intent(struct super_block *sb, struct CodaFid *fid,
+			bool *access_intent_supported,
+			size_t count, loff_t ppos, int type);
 
 /*
  * Statistics
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 0dbd13ab72e3..128d63df5bfb 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -20,6 +20,7 @@
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/uio.h>
 
 #include <linux/coda.h>
 #include "coda_psdev.h"
@@ -37,9 +38,25 @@ static ssize_t
 coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *coda_file = iocb->ki_filp;
+	struct inode *coda_inode = file_inode(coda_file);
 	struct coda_file_info *cfi = coda_ftoc(coda_file);
+	loff_t ki_pos = iocb->ki_pos;
+	size_t count = iov_iter_count(to);
+	ssize_t ret;
+
+	ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+				  &cfi->cfi_access_intent,
+				  count, ki_pos, CODA_ACCESS_TYPE_READ);
+	if (ret)
+		goto finish_read;
 
-	return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0);
+	ret = vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0);
+
+finish_read:
+	venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+			    &cfi->cfi_access_intent,
+			    count, ki_pos, CODA_ACCESS_TYPE_READ_FINISH);
+	return ret;
 }
 
 static ssize_t
@@ -48,10 +65,17 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
 	struct file *coda_file = iocb->ki_filp;
 	struct inode *coda_inode = file_inode(coda_file);
 	struct coda_file_info *cfi = coda_ftoc(coda_file);
-	struct file *host_file;
+	struct file *host_file = cfi->cfi_container;
+	loff_t ki_pos = iocb->ki_pos;
+	size_t count = iov_iter_count(to);
 	ssize_t ret;
 
-	host_file = cfi->cfi_container;
+	ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+				  &cfi->cfi_access_intent,
+				  count, ki_pos, CODA_ACCESS_TYPE_WRITE);
+	if (ret)
+		goto finish_write;
+
 	file_start_write(host_file);
 	inode_lock(coda_inode);
 	ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
@@ -60,6 +84,11 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
 	coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode);
 	inode_unlock(coda_inode);
 	file_end_write(host_file);
+
+finish_write:
+	venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+			    &cfi->cfi_access_intent,
+			    count, ki_pos, CODA_ACCESS_TYPE_WRITE_FINISH);
 	return ret;
 }
 
@@ -94,29 +123,35 @@ coda_vm_close(struct vm_area_struct *vma)
 static int
 coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
 {
-	struct coda_file_info *cfi;
+	struct inode *coda_inode = file_inode(coda_file);
+	struct coda_file_info *cfi = coda_ftoc(coda_file);
+	struct file *host_file = cfi->cfi_container;
+	struct inode *host_inode = file_inode(host_file);
 	struct coda_inode_info *cii;
-	struct file *host_file;
-	struct inode *coda_inode, *host_inode;
 	struct coda_vm_ops *cvm_ops;
+	loff_t ppos;
+	size_t count;
 	int ret;
 
-	cfi = coda_ftoc(coda_file);
-	host_file = cfi->cfi_container;
-
 	if (!host_file->f_op->mmap)
 		return -ENODEV;
 
 	if (WARN_ON(coda_file != vma->vm_file))
 		return -EIO;
 
+	count = vma->vm_end - vma->vm_start;
+	ppos = vma->vm_pgoff * PAGE_SIZE;
+
+	ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+				  &cfi->cfi_access_intent,
+				  count, ppos, CODA_ACCESS_TYPE_MMAP);
+	if (ret)
+		return ret;
+
 	cvm_ops = kmalloc(sizeof(struct coda_vm_ops), GFP_KERNEL);
 	if (!cvm_ops)
 		return -ENOMEM;
 
-	coda_inode = file_inode(coda_file);
-	host_inode = file_inode(host_file);
-
 	cii = ITOC(coda_inode);
 	spin_lock(&cii->c_lock);
 	coda_file->f_mapping = host_file->f_mapping;
@@ -188,6 +223,8 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 	cfi->cfi_magic = CODA_MAGIC;
 	cfi->cfi_mapcount = 0;
 	cfi->cfi_container = host_file;
+	/* assume access intents are supported unless we hear otherwise */
+	cfi->cfi_access_intent = true;
 
 	BUG_ON(coda_file->private_data != NULL);
 	coda_file->private_data = cfi;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index ebfbbea9fa48..240669f51eac 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -388,7 +388,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam");
 MODULE_DESCRIPTION("Coda Distributed File System VFS interface");
 MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR);
 MODULE_LICENSE("GPL");
-MODULE_VERSION("6.11");
+MODULE_VERSION("7.0");
 
 static int __init init_coda(void)
 {
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 15c0e4fdb0e3..eb3b1898da46 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -569,6 +569,47 @@ int venus_statfs(struct dentry *dentry, struct kstatfs *sfs)
         return error;
 }
 
+int venus_access_intent(struct super_block *sb, struct CodaFid *fid,
+			bool *access_intent_supported,
+			size_t count, loff_t ppos, int type)
+{
+	union inputArgs *inp;
+	union outputArgs *outp;
+	int insize, outsize, error;
+	bool finalizer =
+		type == CODA_ACCESS_TYPE_READ_FINISH ||
+		type == CODA_ACCESS_TYPE_WRITE_FINISH;
+
+	if (!*access_intent_supported && !finalizer)
+		return 0;
+
+	insize = SIZE(access_intent);
+	UPARG(CODA_ACCESS_INTENT);
+
+	inp->coda_access_intent.VFid = *fid;
+	inp->coda_access_intent.count = count;
+	inp->coda_access_intent.pos = ppos;
+	inp->coda_access_intent.type = type;
+
+	error = coda_upcall(coda_vcp(sb), insize,
+			    finalizer ? NULL : &outsize, inp);
+
+	/*
+	 * we have to free the request buffer for synchronous upcalls
+	 * or when asynchronous upcalls fail, but not when asynchronous
+	 * upcalls succeed
+	 */
+	if (!finalizer || error)
+		kvfree(inp);
+
+	/* Chunked access is not supported or an old Coda client */
+	if (error == -EOPNOTSUPP) {
+		*access_intent_supported = false;
+		error = 0;
+	}
+	return error;
+}
+
 /*
  * coda_upcall and coda_downcall routines.
  */
@@ -598,10 +639,12 @@ static void coda_unblock_signals(sigset_t *old)
  * has seen them,
  * - CODA_CLOSE or CODA_RELEASE upcall  (to avoid reference count problems)
  * - CODA_STORE				(to avoid data loss)
+ * - CODA_ACCESS_INTENT                 (to avoid reference count problems)
  */
 #define CODA_INTERRUPTIBLE(r) (!coda_hard && \
 			       (((r)->uc_opcode != CODA_CLOSE && \
 				 (r)->uc_opcode != CODA_STORE && \
+				 (r)->uc_opcode != CODA_ACCESS_INTENT && \
 				 (r)->uc_opcode != CODA_RELEASE) || \
 				(r)->uc_flags & CODA_REQ_READ))
 
@@ -687,21 +730,25 @@ static int coda_upcall(struct venus_comm *vcp,
 		goto exit;
 	}
 
+	buffer->ih.unique = ++vcp->vc_seq;
+
 	req->uc_data = (void *)buffer;
-	req->uc_flags = 0;
+	req->uc_flags = outSize ? 0 : CODA_REQ_ASYNC;
 	req->uc_inSize = inSize;
-	req->uc_outSize = *outSize ? *outSize : inSize;
-	req->uc_opcode = ((union inputArgs *)buffer)->ih.opcode;
-	req->uc_unique = ++vcp->vc_seq;
+	req->uc_outSize = (outSize && *outSize) ? *outSize : inSize;
+	req->uc_opcode = buffer->ih.opcode;
+	req->uc_unique = buffer->ih.unique;
 	init_waitqueue_head(&req->uc_sleep);
 
-	/* Fill in the common input args. */
-	((union inputArgs *)buffer)->ih.unique = req->uc_unique;
-
 	/* Append msg to pending queue and poke Venus. */
 	list_add_tail(&req->uc_chain, &vcp->vc_pending);
-
 	wake_up_interruptible(&vcp->vc_waitq);
+
+	if (req->uc_flags & CODA_REQ_ASYNC) {
+		mutex_unlock(&vcp->vc_mutex);
+		return 0;
+	}
+
 	/* We can be interrupted while we wait for Venus to process
 	 * our request.  If the interrupt occurs before Venus has read
 	 * the request, we dequeue and return. If it occurs after the
@@ -743,20 +790,20 @@ static int coda_upcall(struct venus_comm *vcp,
 	sig_req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
 	if (!sig_req) goto exit;
 
-	sig_req->uc_data = kvzalloc(sizeof(struct coda_in_hdr), GFP_KERNEL);
-	if (!sig_req->uc_data) {
+	sig_inputArgs = kvzalloc(sizeof(struct coda_in_hdr), GFP_KERNEL);
+	if (!sig_inputArgs) {
 		kfree(sig_req);
 		goto exit;
 	}
 
 	error = -EINTR;
-	sig_inputArgs = (union inputArgs *)sig_req->uc_data;
 	sig_inputArgs->ih.opcode = CODA_SIGNAL;
 	sig_inputArgs->ih.unique = req->uc_unique;
 
 	sig_req->uc_flags = CODA_REQ_ASYNC;
 	sig_req->uc_opcode = sig_inputArgs->ih.opcode;
 	sig_req->uc_unique = sig_inputArgs->ih.unique;
+	sig_req->uc_data = (void *)sig_inputArgs;
 	sig_req->uc_inSize = sizeof(struct coda_in_hdr);
 	sig_req->uc_outSize = sizeof(struct coda_in_hdr);
 
@@ -911,4 +958,3 @@ unlock_out:
 	iput(inode);
 	return 0;
 }
-
diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h
index 5dba636b6e11..aa34c2dcae8d 100644
--- a/include/uapi/linux/coda.h
+++ b/include/uapi/linux/coda.h
@@ -271,7 +271,8 @@ struct coda_statfs {
 #define CODA_STATFS	 34
 #define CODA_STORE	 35
 #define CODA_RELEASE	 36
-#define CODA_NCALLS 37
+#define CODA_ACCESS_INTENT 37
+#define CODA_NCALLS 38
 
 #define DOWNCALL(opcode) (opcode >= CODA_REPLACE && opcode <= CODA_PURGEFID)
 
@@ -281,8 +282,12 @@ struct coda_statfs {
 
 #define CIOC_KERNEL_VERSION _IOWR('c', 10, size_t)
 
+//      CODA_KERNEL_VERSION 0 /* don't care about kernel version number */
+//      CODA_KERNEL_VERSION 1 /* The old venus 4.6 compatible interface */
+//      CODA_KERNEL_VERSION 2 /* venus_lookup gets an extra parameter */
 //      CODA_KERNEL_VERSION 3 /* 128-bit file identifiers */
-#define CODA_KERNEL_VERSION 4 /* 64-bit timespec */
+//      CODA_KERNEL_VERSION 4 /* 64-bit timespec */
+#define CODA_KERNEL_VERSION 5 /* access intent support */
 
 /*
  *        Venus <-> Coda  RPC arguments
@@ -637,6 +642,25 @@ struct coda_statfs_out {
     struct coda_statfs stat;
 };
 
+#define CODA_ACCESS_TYPE_READ		1
+#define CODA_ACCESS_TYPE_WRITE		2
+#define CODA_ACCESS_TYPE_MMAP		3
+#define CODA_ACCESS_TYPE_READ_FINISH	4
+#define CODA_ACCESS_TYPE_WRITE_FINISH	5
+
+/* coda_access_intent: NO_OUT */
+struct coda_access_intent_in {
+	struct coda_in_hdr ih;
+	struct CodaFid VFid;
+	int count;
+	int pos;
+	int type;
+};
+
+struct coda_access_intent_out {
+	struct coda_out_hdr out;
+};
+
 /* 
  * Occasionally, we don't cache the fid returned by CODA_LOOKUP. 
  * For instance, if the fid is inconsistent. 
@@ -668,6 +692,7 @@ union inputArgs {
     struct coda_open_by_fd_in coda_open_by_fd;
     struct coda_open_by_path_in coda_open_by_path;
     struct coda_statfs_in coda_statfs;
+    struct coda_access_intent_in coda_access_intent;
 };
 
 union outputArgs {
-- 
cgit v1.2.3


From 201766a20e30f982ccfe36bebfad9602c3ff574a Mon Sep 17 00:00:00 2001
From: Elvira Khabirova <lineprinter@altlinux.org>
Date: Tue, 16 Jul 2019 16:29:42 -0700
Subject: ptrace: add PTRACE_GET_SYSCALL_INFO request

PTRACE_GET_SYSCALL_INFO is a generic ptrace API that lets ptracer obtain
details of the syscall the tracee is blocked in.

There are two reasons for a special syscall-related ptrace request.

Firstly, with the current ptrace API there are cases when ptracer cannot
retrieve necessary information about syscalls.  Some examples include:

 * The notorious int-0x80-from-64-bit-task issue. See [1] for details.
   In short, if a 64-bit task performs a syscall through int 0x80, its
   tracer has no reliable means to find out that the syscall was, in
   fact, a compat syscall, and misidentifies it.

 * Syscall-enter-stop and syscall-exit-stop look the same for the
   tracer. Common practice is to keep track of the sequence of
   ptrace-stops in order not to mix the two syscall-stops up. But it is
   not as simple as it looks; for example, strace had a (just recently
   fixed) long-standing bug where attaching strace to a tracee that is
   performing the execve system call led to the tracer identifying the
   following syscall-exit-stop as syscall-enter-stop, which messed up
   all the state tracking.

 * Since the introduction of commit 84d77d3f06e7 ("ptrace: Don't allow
   accessing an undumpable mm"), both PTRACE_PEEKDATA and
   process_vm_readv become unavailable when the process dumpable flag is
   cleared. On such architectures as ia64 this results in all syscall
   arguments being unavailable for the tracer.

Secondly, ptracers also have to support a lot of arch-specific code for
obtaining information about the tracee.  For some architectures, this
requires a ptrace(PTRACE_PEEKUSER, ...) invocation for every syscall
argument and return value.

ptrace(2) man page:

long ptrace(enum __ptrace_request request, pid_t pid,
            void *addr, void *data);
...
PTRACE_GET_SYSCALL_INFO
       Retrieve information about the syscall that caused the stop.
       The information is placed into the buffer pointed by "data"
       argument, which should be a pointer to a buffer of type
       "struct ptrace_syscall_info".
       The "addr" argument contains the size of the buffer pointed to
       by "data" argument (i.e., sizeof(struct ptrace_syscall_info)).
       The return value contains the number of bytes available
       to be written by the kernel.
       If the size of data to be written by the kernel exceeds the size
       specified by "addr" argument, the output is truncated.

[ldv@altlinux.org: selftests/seccomp/seccomp_bpf: update for PTRACE_GET_SYSCALL_INFO]
  Link: http://lkml.kernel.org/r/20190708182904.GA12332@altlinux.org
Link: http://lkml.kernel.org/r/20190510152842.GF28558@altlinux.org
Signed-off-by: Elvira Khabirova <lineprinter@altlinux.org>
Co-developed-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Cc: Eugene Syromyatnikov <esyr@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Greentime Hu <greentime@andestech.com>
Cc: Helge Deller <deller@gmx.de>	[parisc]
Cc: James E.J. Bottomley <jejb@parisc-linux.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: kbuild test robot <lkp@intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vincent Chen <deanbo422@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h                     |   9 ++-
 include/uapi/linux/ptrace.h                   |  35 +++++++++
 kernel/ptrace.c                               | 101 +++++++++++++++++++++++++-
 tools/testing/selftests/seccomp/seccomp_bpf.c |  13 +++-
 4 files changed, 150 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 8446573cc682..36fb3bbed6b2 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -54,13 +54,15 @@ struct linux_binprm;
 /*
  * ptrace report for syscall entry and exit looks identical.
  */
-static inline int ptrace_report_syscall(struct pt_regs *regs)
+static inline int ptrace_report_syscall(struct pt_regs *regs,
+					unsigned long message)
 {
 	int ptrace = current->ptrace;
 
 	if (!(ptrace & PT_PTRACED))
 		return 0;
 
+	current->ptrace_message = message;
 	ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
 
 	/*
@@ -73,6 +75,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs)
 		current->exit_code = 0;
 	}
 
+	current->ptrace_message = 0;
 	return fatal_signal_pending(current);
 }
 
@@ -98,7 +101,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs)
 static inline __must_check int tracehook_report_syscall_entry(
 	struct pt_regs *regs)
 {
-	return ptrace_report_syscall(regs);
+	return ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_ENTRY);
 }
 
 /**
@@ -123,7 +126,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
 	if (step)
 		user_single_step_report(regs);
 	else
-		ptrace_report_syscall(regs);
+		ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT);
 }
 
 /**
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index d5a1b8a492b9..a71b6e3b03eb 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -73,6 +73,41 @@ struct seccomp_metadata {
 	__u64 flags;		/* Output: filter's flags */
 };
 
+#define PTRACE_GET_SYSCALL_INFO		0x420e
+#define PTRACE_SYSCALL_INFO_NONE	0
+#define PTRACE_SYSCALL_INFO_ENTRY	1
+#define PTRACE_SYSCALL_INFO_EXIT	2
+#define PTRACE_SYSCALL_INFO_SECCOMP	3
+
+struct ptrace_syscall_info {
+	__u8 op;	/* PTRACE_SYSCALL_INFO_* */
+	__u32 arch __attribute__((__aligned__(sizeof(__u32))));
+	__u64 instruction_pointer;
+	__u64 stack_pointer;
+	union {
+		struct {
+			__u64 nr;
+			__u64 args[6];
+		} entry;
+		struct {
+			__s64 rval;
+			__u8 is_error;
+		} exit;
+		struct {
+			__u64 nr;
+			__u64 args[6];
+			__u32 ret_data;
+		} seccomp;
+	};
+};
+
+/*
+ * These values are stored in task->ptrace_message
+ * by tracehook_report_syscall_* to describe the current syscall-stop.
+ */
+#define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
+#define PTRACE_EVENTMSG_SYSCALL_EXIT	2
+
 /* Read signals from a shared (process wide) queue */
 #define PTRACE_PEEKSIGINFO_SHARED	(1 << 0)
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 83a531cea2f3..cb9ddcc08119 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -32,6 +32,8 @@
 #include <linux/compat.h>
 #include <linux/sched/signal.h>
 
+#include <asm/syscall.h>	/* for syscall_get_* */
+
 /*
  * Access another process' address space via ptrace.
  * Source/target buffer must be kernel space,
@@ -897,7 +899,100 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
  * to ensure no machine forgets it.
  */
 EXPORT_SYMBOL_GPL(task_user_regset_view);
-#endif
+
+static unsigned long
+ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
+			      struct ptrace_syscall_info *info)
+{
+	unsigned long args[ARRAY_SIZE(info->entry.args)];
+	int i;
+
+	info->op = PTRACE_SYSCALL_INFO_ENTRY;
+	info->entry.nr = syscall_get_nr(child, regs);
+	syscall_get_arguments(child, regs, args);
+	for (i = 0; i < ARRAY_SIZE(args); i++)
+		info->entry.args[i] = args[i];
+
+	/* args is the last field in struct ptrace_syscall_info.entry */
+	return offsetofend(struct ptrace_syscall_info, entry.args);
+}
+
+static unsigned long
+ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
+				struct ptrace_syscall_info *info)
+{
+	/*
+	 * As struct ptrace_syscall_info.entry is currently a subset
+	 * of struct ptrace_syscall_info.seccomp, it makes sense to
+	 * initialize that subset using ptrace_get_syscall_info_entry().
+	 * This can be reconsidered in the future if these structures
+	 * diverge significantly enough.
+	 */
+	ptrace_get_syscall_info_entry(child, regs, info);
+	info->op = PTRACE_SYSCALL_INFO_SECCOMP;
+	info->seccomp.ret_data = child->ptrace_message;
+
+	/* ret_data is the last field in struct ptrace_syscall_info.seccomp */
+	return offsetofend(struct ptrace_syscall_info, seccomp.ret_data);
+}
+
+static unsigned long
+ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
+			     struct ptrace_syscall_info *info)
+{
+	info->op = PTRACE_SYSCALL_INFO_EXIT;
+	info->exit.rval = syscall_get_error(child, regs);
+	info->exit.is_error = !!info->exit.rval;
+	if (!info->exit.is_error)
+		info->exit.rval = syscall_get_return_value(child, regs);
+
+	/* is_error is the last field in struct ptrace_syscall_info.exit */
+	return offsetofend(struct ptrace_syscall_info, exit.is_error);
+}
+
+static int
+ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
+			void __user *datavp)
+{
+	struct pt_regs *regs = task_pt_regs(child);
+	struct ptrace_syscall_info info = {
+		.op = PTRACE_SYSCALL_INFO_NONE,
+		.arch = syscall_get_arch(child),
+		.instruction_pointer = instruction_pointer(regs),
+		.stack_pointer = user_stack_pointer(regs),
+	};
+	unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry);
+	unsigned long write_size;
+
+	/*
+	 * This does not need lock_task_sighand() to access
+	 * child->last_siginfo because ptrace_freeze_traced()
+	 * called earlier by ptrace_check_attach() ensures that
+	 * the tracee cannot go away and clear its last_siginfo.
+	 */
+	switch (child->last_siginfo ? child->last_siginfo->si_code : 0) {
+	case SIGTRAP | 0x80:
+		switch (child->ptrace_message) {
+		case PTRACE_EVENTMSG_SYSCALL_ENTRY:
+			actual_size = ptrace_get_syscall_info_entry(child, regs,
+								    &info);
+			break;
+		case PTRACE_EVENTMSG_SYSCALL_EXIT:
+			actual_size = ptrace_get_syscall_info_exit(child, regs,
+								   &info);
+			break;
+		}
+		break;
+	case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8):
+		actual_size = ptrace_get_syscall_info_seccomp(child, regs,
+							      &info);
+		break;
+	}
+
+	write_size = min(actual_size, user_size);
+	return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size;
+}
+#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 
 int ptrace_request(struct task_struct *child, long request,
 		   unsigned long addr, unsigned long data)
@@ -1114,6 +1209,10 @@ int ptrace_request(struct task_struct *child, long request,
 			ret = __put_user(kiov.iov_len, &uiov->iov_len);
 		break;
 	}
+
+	case PTRACE_GET_SYSCALL_INFO:
+		ret = ptrace_get_syscall_info(child, addr, datavp);
+		break;
 #endif
 
 	case PTRACE_SECCOMP_GET_FILTER:
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index dc66fe852768..6ef7f16c4cf5 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -1775,13 +1775,18 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
 	unsigned long msg;
 	static bool entry;
 
-	/* Make sure we got an empty message. */
+	/*
+	 * The traditional way to tell PTRACE_SYSCALL entry/exit
+	 * is by counting.
+	 */
+	entry = !entry;
+
+	/* Make sure we got an appropriate message. */
 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
 	EXPECT_EQ(0, ret);
-	EXPECT_EQ(0, msg);
+	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
+			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
 
-	/* The only way to tell PTRACE_SYSCALL entry/exit is by counting. */
-	entry = !entry;
 	if (!entry)
 		return;
 
-- 
cgit v1.2.3


From e2d9018e81ba9357d3bb8bddc0ee58d460d092fe Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 16 Jul 2019 16:29:50 -0700
Subject: signal: reorder struct sighand_struct

struct sighand_struct::siglock field is the most used field by far, put
it first so that is can be accessed without IMM8 or IMM32 encoding on
x86_64.

Space savings (on trimmed down VM test config):

add/remove: 0/0 grow/shrink: 8/68 up/down: 49/-1147 (-1098)
Function                                     old     new   delta
complete_signal                              512     533     +21
do_signalfd4                                 335     346     +11
__cleanup_sighand                             39      43      +4
unhandled_signal                              49      52      +3
prepare_signal                               692     695      +3
ignore_signals                                37      40      +3
__tty_check_change.part                      248     251      +3
ksys_unshare                                 780     781      +1
sighand_ctor                                  33      29      -4
ptrace_trap_notify                            60      56      -4
sigqueue_free                                 98      91      -7
run_posix_cpu_timers                        1389    1382      -7
proc_pid_status                             2448    2441      -7
proc_pid_limits                              344     337      -7
posix_cpu_timer_rearm                        222     215      -7
posix_cpu_timer_get                          249     242      -7
kill_pid_info_as_cred                        243     236      -7
freeze_task                                  197     190      -7
flush_old_exec                              1873    1866      -7
do_task_stat                                3363    3356      -7
do_send_sig_info                              98      91      -7
do_group_exit                                147     140      -7
init_sighand                                2088    2080      -8
do_notify_parent_cldstop                     399     391      -8
signalfd_cleanup                              50      41      -9
do_notify_parent                             557     545     -12
__send_signal                               1029    1017     -12
ptrace_stop                                  590     577     -13
get_signal                                  1576    1563     -13
__lock_task_sighand                          112      99     -13
zap_pid_ns_processes                         391     377     -14
update_rlimit_cpu                             78      64     -14
tty_signal_session_leader                    413     399     -14
tty_open_proc_set_tty                        149     135     -14
tty_jobctrl_ioctl                            936     922     -14
set_cpu_itimer                               339     325     -14
ptrace_resume                                226     212     -14
ptrace_notify                                110      96     -14
proc_clear_tty                                81      67     -14
posix_cpu_timer_del                          229     215     -14
kernel_sigaction                             156     142     -14
getrusage                                    977     963     -14
get_current_tty                               98      84     -14
force_sigsegv                                 89      75     -14
force_sig_info                               205     191     -14
flush_signals                                 83      69     -14
flush_itimer_signals                          85      71     -14
do_timer_create                             1120    1106     -14
do_sigpending                                 88      74     -14
do_signal_stop                               537     523     -14
cgroup_init_fs_context                       644     630     -14
call_usermodehelper_exec_async               402     388     -14
calculate_sigpending                          58      44     -14
__x64_sys_timer_delete                       248     234     -14
__set_current_blocked                         80      66     -14
__ptrace_unlink                              310     296     -14
__ptrace_detach.part                         187     173     -14
send_sigqueue                                362     347     -15
get_cpu_itimer                               214     199     -15
signalfd_poll                                175     159     -16
dequeue_signal                               340     323     -17
do_getitimer                                 192     174     -18
release_task.part                           1060    1040     -20
ptrace_peek_siginfo                          408     387     -21
posix_cpu_timer_set                          827     806     -21
exit_signals                                 437     416     -21
do_sigaction                                 541     520     -21
do_setitimer                                 485     464     -21
disassociate_ctty.part                       545     517     -28
__x64_sys_rt_sigtimedwait                    721     679     -42
__x64_sys_ptrace                            1319    1277     -42
ptrace_request                              1828    1782     -46
signalfd_read                                507     459     -48
wait_consider_task                          2027    1971     -56
do_coredump                                 3672    3616     -56
copy_process.part                           6936    6871     -65

Link: http://lkml.kernel.org/r/20190503192800.GA18004@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/signal.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 532458698bde..01add55a609b 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -15,10 +15,10 @@
  */
 
 struct sighand_struct {
-	refcount_t		count;
-	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
+	refcount_t		count;
 	wait_queue_head_t	signalfd_wqh;
+	struct k_sigaction	action[_NSIG];
 };
 
 /*
-- 
cgit v1.2.3


From b772434be0891ed1081a08ae7cfd4666728f8e82 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 16 Jul 2019 16:29:53 -0700
Subject: signal: simplify set_user_sigmask/restore_user_sigmask

task->saved_sigmask and ->restore_sigmask are only used in the ret-from-
syscall paths.  This means that set_user_sigmask() can save ->blocked in
->saved_sigmask and do set_restore_sigmask() to indicate that ->blocked
was modified.

This way the callers do not need 2 sigset_t's passed to set/restore and
restore_user_sigmask() renamed to restore_saved_sigmask_unless() turns
into the trivial helper which just calls restore_saved_sigmask().

Link: http://lkml.kernel.org/r/20190606113206.GA9464@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Eric Wong <e@80x24.org>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: David Laight <David.Laight@aculab.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c                     | 20 +++++--------
 fs/eventpoll.c               | 12 +++-----
 fs/io_uring.c                | 11 ++-----
 fs/select.c                  | 34 ++++++++--------------
 include/linux/compat.h       |  3 +-
 include/linux/sched/signal.h | 12 ++++++--
 include/linux/signal.h       |  4 ---
 kernel/signal.c              | 69 ++++++++++++--------------------------------
 8 files changed, 57 insertions(+), 108 deletions(-)

(limited to 'include')

diff --git a/fs/aio.c b/fs/aio.c
index 2d405733a8c6..8327db0c8e08 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2094,7 +2094,6 @@ SYSCALL_DEFINE6(io_pgetevents,
 		const struct __aio_sigset __user *, usig)
 {
 	struct __aio_sigset	ksig = { NULL, };
-	sigset_t		ksigmask, sigsaved;
 	struct timespec64	ts;
 	bool interrupted;
 	int ret;
@@ -2105,14 +2104,14 @@ SYSCALL_DEFINE6(io_pgetevents,
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
 
 	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
 	if (interrupted && !ret)
 		ret = -ERESTARTNOHAND;
 
@@ -2130,7 +2129,6 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
 		const struct __aio_sigset __user *, usig)
 {
 	struct __aio_sigset	ksig = { NULL, };
-	sigset_t		ksigmask, sigsaved;
 	struct timespec64	ts;
 	bool interrupted;
 	int ret;
@@ -2142,14 +2140,14 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
 		return -EFAULT;
 
 
-	ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
 
 	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
 	if (interrupted && !ret)
 		ret = -ERESTARTNOHAND;
 
@@ -2198,7 +2196,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
 		const struct __compat_aio_sigset __user *, usig)
 {
 	struct __compat_aio_sigset ksig = { NULL, };
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 t;
 	bool interrupted;
 	int ret;
@@ -2209,14 +2206,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
 
 	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
 	if (interrupted && !ret)
 		ret = -ERESTARTNOHAND;
 
@@ -2234,7 +2231,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
 		const struct __compat_aio_sigset __user *, usig)
 {
 	struct __compat_aio_sigset ksig = { NULL, };
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 t;
 	bool interrupted;
 	int ret;
@@ -2245,14 +2241,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
 
 	interrupted = signal_pending(current);
-	restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+	restore_saved_sigmask_unless(interrupted);
 	if (interrupted && !ret)
 		ret = -ERESTARTNOHAND;
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4c74c768ae43..0f9c073d78d5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2313,19 +2313,17 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 		size_t, sigsetsize)
 {
 	int error;
-	sigset_t ksigmask, sigsaved;
 
 	/*
 	 * If the caller wants a certain signal mask to be set during the wait,
 	 * we apply it here.
 	 */
-	error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	error = set_user_sigmask(sigmask, sigsetsize);
 	if (error)
 		return error;
 
 	error = do_epoll_wait(epfd, events, maxevents, timeout);
-
-	restore_user_sigmask(sigmask, &sigsaved, error == -EINTR);
+	restore_saved_sigmask_unless(error == -EINTR);
 
 	return error;
 }
@@ -2338,19 +2336,17 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 			compat_size_t, sigsetsize)
 {
 	long err;
-	sigset_t ksigmask, sigsaved;
 
 	/*
 	 * If the caller wants a certain signal mask to be set during the wait,
 	 * we apply it here.
 	 */
-	err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	err = set_compat_user_sigmask(sigmask, sigsetsize);
 	if (err)
 		return err;
 
 	err = do_epoll_wait(epfd, events, maxevents, timeout);
-
-	restore_user_sigmask(sigmask, &sigsaved, err == -EINTR);
+	restore_saved_sigmask_unless(err == -EINTR);
 
 	return err;
 }
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d682049c07b2..e2a66e12fbc6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2400,7 +2400,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz)
 {
 	struct io_cq_ring *ring = ctx->cq_ring;
-	sigset_t ksigmask, sigsaved;
 	int ret;
 
 	if (io_cqring_events(ring) >= min_events)
@@ -2410,21 +2409,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 #ifdef CONFIG_COMPAT
 		if (in_compat_syscall())
 			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
-						      &ksigmask, &sigsaved, sigsz);
+						      sigsz);
 		else
 #endif
-			ret = set_user_sigmask(sig, &ksigmask,
-					       &sigsaved, sigsz);
+			ret = set_user_sigmask(sig, sigsz);
 
 		if (ret)
 			return ret;
 	}
 
 	ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
-
-	if (sig)
-		restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS);
-
+	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
 
diff --git a/fs/select.c b/fs/select.c
index a4d8f6e8b63c..1fc1b247fede 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -730,7 +730,6 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
 		       const sigset_t __user *sigmask, size_t sigsetsize,
 		       enum poll_time_type type)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -753,12 +752,12 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
 			return -EINVAL;
 	}
 
-	ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = core_sys_select(n, inp, outp, exp, to);
-	restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
+	restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
 	ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
 
 	return ret;
@@ -1086,7 +1085,6 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
 		struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
 		size_t, sigsetsize)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -1099,17 +1097,16 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
 			return -EINVAL;
 	}
 
-	ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+	restore_saved_sigmask_unless(ret == -EINTR);
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR)
 		ret = -ERESTARTNOHAND;
-
 	ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
 
 	return ret;
@@ -1121,7 +1118,6 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
 		struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
 		size_t, sigsetsize)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -1134,17 +1130,16 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
 			return -EINVAL;
 	}
 
-	ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+	restore_saved_sigmask_unless(ret == -EINTR);
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR)
 		ret = -ERESTARTNOHAND;
-
 	ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
 
 	return ret;
@@ -1319,7 +1314,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	void __user *tsp, compat_sigset_t __user *sigmask,
 	compat_size_t sigsetsize, enum poll_time_type type)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -1342,12 +1336,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 			return -EINVAL;
 	}
 
-	ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_compat_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = compat_core_sys_select(n, inp, outp, exp, to);
-	restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
+	restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
 	ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
 
 	return ret;
@@ -1402,7 +1396,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
 	unsigned int,  nfds, struct old_timespec32 __user *, tsp,
 	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -1415,17 +1408,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
 			return -EINVAL;
 	}
 
-	ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_compat_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+	restore_saved_sigmask_unless(ret == -EINTR);
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR)
 		ret = -ERESTARTNOHAND;
-
 	ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
 
 	return ret;
@@ -1437,7 +1429,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
 	unsigned int,  nfds, struct __kernel_timespec __user *, tsp,
 	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -1450,17 +1441,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
 			return -EINVAL;
 	}
 
-	ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	ret = set_compat_user_sigmask(sigmask, sigsetsize);
 	if (ret)
 		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+	restore_saved_sigmask_unless(ret == -EINTR);
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR)
 		ret = -ERESTARTNOHAND;
-
 	ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
 
 	return ret;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index ebddcb6cfcf8..16dafd9f4b86 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -138,8 +138,7 @@ typedef struct {
 	compat_sigset_word	sig[_COMPAT_NSIG_WORDS];
 } compat_sigset_t;
 
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
-			    sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
 			    size_t sigsetsize);
 
 struct compat_sigaction {
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 01add55a609b..efd8ce7675ed 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -420,7 +420,6 @@ void task_join_group_stop(struct task_struct *task);
 static inline void set_restore_sigmask(void)
 {
 	set_thread_flag(TIF_RESTORE_SIGMASK);
-	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
 
 static inline void clear_tsk_restore_sigmask(struct task_struct *task)
@@ -451,7 +450,6 @@ static inline bool test_and_clear_restore_sigmask(void)
 static inline void set_restore_sigmask(void)
 {
 	current->restore_sigmask = true;
-	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
 static inline void clear_tsk_restore_sigmask(struct task_struct *task)
 {
@@ -484,6 +482,16 @@ static inline void restore_saved_sigmask(void)
 		__set_current_blocked(&current->saved_sigmask);
 }
 
+extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);
+
+static inline void restore_saved_sigmask_unless(bool interrupted)
+{
+	if (interrupted)
+		WARN_ON(!test_thread_flag(TIF_SIGPENDING));
+	else
+		restore_saved_sigmask();
+}
+
 static inline sigset_t *sigmask_to_save(void)
 {
 	sigset_t *res = &current->blocked;
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 78c2bb376954..b5d99482d3fe 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -273,10 +273,6 @@ extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
 			       struct task_struct *p, enum pid_type type);
 extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
 extern int sigprocmask(int, sigset_t *, sigset_t *);
-extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
-	sigset_t *oldset, size_t sigsetsize);
-extern void restore_user_sigmask(const void __user *usigmask,
-				 sigset_t *sigsaved, bool interrupted);
 extern void set_current_blocked(sigset_t *);
 extern void __set_current_blocked(const sigset_t *);
 extern int show_unhandled_signals;
diff --git a/kernel/signal.c b/kernel/signal.c
index dabe100d2091..91b789dd6e72 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2951,80 +2951,49 @@ EXPORT_SYMBOL(sigprocmask);
  *
  * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
  * epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ *
+ * Note that it does set_restore_sigmask() in advance, so it must be always
+ * paired with restore_saved_sigmask_unless() before return from syscall.
  */
-int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
-		     sigset_t *oldset, size_t sigsetsize)
+int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
 {
-	if (!usigmask)
-		return 0;
+	sigset_t kmask;
 
+	if (!umask)
+		return 0;
 	if (sigsetsize != sizeof(sigset_t))
 		return -EINVAL;
-	if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+	if (copy_from_user(&kmask, umask, sizeof(sigset_t)))
 		return -EFAULT;
 
-	*oldset = current->blocked;
-	set_current_blocked(set);
+	set_restore_sigmask();
+	current->saved_sigmask = current->blocked;
+	set_current_blocked(&kmask);
 
 	return 0;
 }
-EXPORT_SYMBOL(set_user_sigmask);
 
 #ifdef CONFIG_COMPAT
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
-			    sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
 			    size_t sigsetsize)
 {
-	if (!usigmask)
-		return 0;
+	sigset_t kmask;
 
+	if (!umask)
+		return 0;
 	if (sigsetsize != sizeof(compat_sigset_t))
 		return -EINVAL;
-	if (get_compat_sigset(set, usigmask))
+	if (get_compat_sigset(&kmask, umask))
 		return -EFAULT;
 
-	*oldset = current->blocked;
-	set_current_blocked(set);
+	set_restore_sigmask();
+	current->saved_sigmask = current->blocked;
+	set_current_blocked(&kmask);
 
 	return 0;
 }
-EXPORT_SYMBOL(set_compat_user_sigmask);
 #endif
 
-/*
- * restore_user_sigmask:
- * usigmask: sigmask passed in from userland.
- * sigsaved: saved sigmask when the syscall started and changed the sigmask to
- *           usigmask.
- *
- * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
- * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
- */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
-				bool interrupted)
-{
-
-	if (!usigmask)
-		return;
-	/*
-	 * When signals are pending, do not restore them here.
-	 * Restoring sigmask here can lead to delivering signals that the above
-	 * syscalls are intended to block because of the sigmask passed in.
-	 */
-	if (interrupted) {
-		current->saved_sigmask = *sigsaved;
-		set_restore_sigmask();
-		return;
-	}
-
-	/*
-	 * This is needed because the fast syscall return path does not restore
-	 * saved_sigmask when signals are not pending.
-	 */
-	set_current_blocked(sigsaved);
-}
-EXPORT_SYMBOL(restore_user_sigmask);
-
 /**
  *  sys_rt_sigprocmask - change the list of currently blocked signals
  *  @how: whether to add, remove, or set signals
-- 
cgit v1.2.3


From f57e515a1b56325a28a0972c632a623a9c84590c Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Tue, 16 Jul 2019 16:30:06 -0700
Subject: kernel/pid.c: convert struct pid count to refcount_t

struct pid's count is an atomic_t field used as a refcount.  Use
refcount_t for it which is basically atomic_t but does additional
checking to prevent use-after-free bugs.

For memory ordering, the only change is with the following:

 -	if ((atomic_read(&pid->count) == 1) ||
 -	     atomic_dec_and_test(&pid->count)) {
 +	if (refcount_dec_and_test(&pid->count)) {
 		kmem_cache_free(ns->pid_cachep, pid);

Here the change is from: Fully ordered --> RELEASE + ACQUIRE (as per
refcount-vs-atomic.rst) This ACQUIRE should take care of making sure the
free happens after the refcount_dec_and_test().

The above hunk also removes atomic_read() since it is not needed for the
code to work and it is unclear how beneficial it is.  The removal lets
refcount_dec_and_test() check for cases where get_pid() happened before
the object was freed.

Link: http://lkml.kernel.org/r/20190701183826.191936-1-joel@joelfernandes.org
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Andrea Parri <andrea.parri@amarulasolutions.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Elena Reshetova <elena.reshetova@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: KJ Tsanaktsidis <ktsanaktsidis@zendesk.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pid.h | 5 +++--
 kernel/pid.c        | 9 ++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 1484db6ca8d1..2a83e434db9d 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -4,6 +4,7 @@
 
 #include <linux/rculist.h>
 #include <linux/wait.h>
+#include <linux/refcount.h>
 
 enum pid_type
 {
@@ -57,7 +58,7 @@ struct upid {
 
 struct pid
 {
-	atomic_t count;
+	refcount_t count;
 	unsigned int level;
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
@@ -74,7 +75,7 @@ extern const struct file_operations pidfd_fops;
 static inline struct pid *get_pid(struct pid *pid)
 {
 	if (pid)
-		atomic_inc(&pid->count);
+		refcount_inc(&pid->count);
 	return pid;
 }
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 16263b526560..0a9f2e437217 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -37,14 +37,14 @@
 #include <linux/init_task.h>
 #include <linux/syscalls.h>
 #include <linux/proc_ns.h>
-#include <linux/proc_fs.h>
+#include <linux/refcount.h>
 #include <linux/anon_inodes.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 
 struct pid init_struct_pid = {
-	.count 		= ATOMIC_INIT(1),
+	.count		= REFCOUNT_INIT(1),
 	.tasks		= {
 		{ .first = NULL },
 		{ .first = NULL },
@@ -108,8 +108,7 @@ void put_pid(struct pid *pid)
 		return;
 
 	ns = pid->numbers[pid->level].ns;
-	if ((atomic_read(&pid->count) == 1) ||
-	     atomic_dec_and_test(&pid->count)) {
+	if (refcount_dec_and_test(&pid->count)) {
 		kmem_cache_free(ns->pid_cachep, pid);
 		put_pid_ns(ns);
 	}
@@ -212,7 +211,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	}
 
 	get_pid_ns(ns);
-	atomic_set(&pid->count, 1);
+	refcount_set(&pid->count, 1);
 	for (type = 0; type < PIDTYPE_MAX; ++type)
 		INIT_HLIST_HEAD(&pid->tasks[type]);
 
-- 
cgit v1.2.3


From 6b15f678fb7d5ef54e089e6ace72f007fe6e9895 Mon Sep 17 00:00:00 2001
From: Drew Davenport <ddavenport@chromium.org>
Date: Tue, 16 Jul 2019 16:30:18 -0700
Subject: include/asm-generic/bug.h: fix "cut here" for WARN_ON for
 __WARN_TAINT architectures

For architectures using __WARN_TAINT, the WARN_ON macro did not print
out the "cut here" string.  The other WARN_XXX macros would print "cut
here" inside __warn_printk, which is not called for WARN_ON since it
doesn't have a message to print.

Link: http://lkml.kernel.org/r/20190624154831.163888-1-ddavenport@chromium.org
Fixes: a7bed27af194 ("bug: fix "cut here" location for __WARN_TAINT architectures")
Signed-off-by: Drew Davenport <ddavenport@chromium.org>
Acked-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/bug.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 0e9bd9c83870..aa6c093d9ce9 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -104,8 +104,10 @@ extern void warn_slowpath_null(const char *file, const int line);
 	warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg)
 #else
 extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
-#define __WARN()		__WARN_TAINT(TAINT_WARN)
-#define __WARN_printf(arg...)	do { __warn_printk(arg); __WARN(); } while (0)
+#define __WARN() do { \
+	printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \
+} while (0)
+#define __WARN_printf(arg...)	__WARN_printf_taint(TAINT_WARN, arg)
 #define __WARN_printf_taint(taint, arg...)				\
 	do { __warn_printk(arg); __WARN_TAINT(taint); } while (0)
 #endif
-- 
cgit v1.2.3


From 97a0efea657e986322b09b99016b3f7d2ce37021 Mon Sep 17 00:00:00 2001
From: Tom Levy <tomlevy93@gmail.com>
Date: Tue, 16 Jul 2019 16:30:24 -0700
Subject: include/linux/lz4.h: fix spelling and copy-paste errors in
 documentation

Fix a few spelling and grammar errors, and two places where fast/safe in
the documentation did not match the function.

Link: http://lkml.kernel.org/r/20190321014452.13297-1-tomlevy93@gmail.com
Signed-off-by: Tom Levy <tomlevy93@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Jiri Kosina <trivial@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/lz4.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/lz4.h b/include/linux/lz4.h
index 394e3d9213b8..b16e15b9587a 100644
--- a/include/linux/lz4.h
+++ b/include/linux/lz4.h
@@ -278,7 +278,7 @@ int LZ4_decompress_fast(const char *source, char *dest, int originalSize);
  * @compressedSize: is the precise full size of the compressed block
  * @maxDecompressedSize: is the size of 'dest' buffer
  *
- * Decompresses data fom 'source' into 'dest'.
+ * Decompresses data from 'source' into 'dest'.
  * If the source stream is detected malformed, the function will
  * stop decoding and return a negative result.
  * This function is protected against buffer overflow exploits,
@@ -522,7 +522,7 @@ int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode,
 	const char *dictionary, int dictSize);
 
 /**
- * LZ4_decompress_fast_continue() - Decompress blocks in streaming mode
+ * LZ4_decompress_safe_continue() - Decompress blocks in streaming mode
  * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure
  * @source: source address of the compressed data
  * @dest: output buffer address of the uncompressed data
@@ -530,7 +530,7 @@ int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode,
  * @compressedSize: is the precise full size of the compressed block
  * @maxDecompressedSize: is the size of 'dest' buffer
  *
- * These decoding function allows decompression of multiple blocks
+ * This decoding function allows decompression of multiple blocks
  * in "streaming" mode.
  * Previously decoded blocks *must* remain available at the memory position
  * where they were decoded (up to 64 KB)
@@ -569,7 +569,7 @@ int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode,
  *	which must be already allocated with 'originalSize' bytes
  * @originalSize: is the original and therefore uncompressed size
  *
- * These decoding function allows decompression of multiple blocks
+ * This decoding function allows decompression of multiple blocks
  * in "streaming" mode.
  * Previously decoded blocks *must* remain available at the memory position
  * where they were decoded (up to 64 KB)
@@ -610,10 +610,10 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
  * @dictStart: pointer to the start of the dictionary in memory
  * @dictSize: size of dictionary
  *
- * These decoding function works the same as
+ * This decoding function works the same as
  * a combination of LZ4_setStreamDecode() followed by
  * LZ4_decompress_safe_continue()
- * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure.
+ * It is stand-alone, and doesn't need an LZ4_streamDecode_t structure.
  *
  * Return: number of bytes decompressed into destination buffer
  *	(necessarily <= maxDecompressedSize)
@@ -633,10 +633,10 @@ int LZ4_decompress_safe_usingDict(const char *source, char *dest,
  * @dictStart: pointer to the start of the dictionary in memory
  * @dictSize: size of dictionary
  *
- * These decoding function works the same as
+ * This decoding function works the same as
  * a combination of LZ4_setStreamDecode() followed by
- * LZ4_decompress_safe_continue()
- * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure.
+ * LZ4_decompress_fast_continue()
+ * It is stand-alone, and doesn't need an LZ4_streamDecode_t structure.
  *
  * Return: number of bytes decompressed into destination buffer
  *	(necessarily <= maxDecompressedSize)
-- 
cgit v1.2.3


From eca499ab3749a4537dee77ffead47a1a2c0dee19 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 16 Jul 2019 16:30:31 -0700
Subject: mm/hotplug: make remove_memory() interface usable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Presently the remove_memory() interface is inherently broken.  It tries
to remove memory but panics if some memory is not offline.  The problem
is that it is impossible to ensure that all memory blocks are offline as
this function also takes lock_device_hotplug that is required to change
memory state via sysfs.

So, between calling this function and offlining all memory blocks there
is always a window when lock_device_hotplug is released, and therefore,
there is always a chance for a panic during this window.

Make this interface to return an error if memory removal fails.  This
way it is safe to call this function without panicking machine, and also
makes it symmetric to add_memory() which already returns an error.

Link: http://lkml.kernel.org/r/20190517215438.6487-3-pasha.tatashin@soleen.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Ross Zwisler <zwisler@kernel.org>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h |  8 ++++--
 mm/memory_hotplug.c            | 64 ++++++++++++++++++++++++++++--------------
 2 files changed, 49 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ae892eef8b82..988fde33cd7f 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -324,7 +324,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
 extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
 extern void try_offline_node(int nid);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
-extern void remove_memory(int nid, u64 start, u64 size);
+extern int remove_memory(int nid, u64 start, u64 size);
 extern void __remove_memory(int nid, u64 start, u64 size);
 
 #else
@@ -341,7 +341,11 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 	return -EINVAL;
 }
 
-static inline void remove_memory(int nid, u64 start, u64 size) {}
+static inline int remove_memory(int nid, u64 start, u64 size)
+{
+	return -EBUSY;
+}
+
 static inline void __remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6166ba5a15f3..4ebe696138e8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1734,9 +1734,10 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
 		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
 			&beginpa, &endpa);
-	}
 
-	return ret;
+		return -EBUSY;
+	}
+	return 0;
 }
 
 static int check_cpu_on_node(pg_data_t *pgdat)
@@ -1819,19 +1820,9 @@ static void __release_memory_resource(resource_size_t start,
 	}
 }
 
-/**
- * remove_memory
- * @nid: the node ID
- * @start: physical address of the region to remove
- * @size: size of the region to remove
- *
- * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
- * and online/offline operations before this call, as required by
- * try_offline_node().
- */
-void __ref __remove_memory(int nid, u64 start, u64 size)
+static int __ref try_remove_memory(int nid, u64 start, u64 size)
 {
-	int ret;
+	int rc = 0;
 
 	BUG_ON(check_hotplug_memory_range(start, size));
 
@@ -1839,13 +1830,13 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
 
 	/*
 	 * All memory blocks must be offlined before removing memory.  Check
-	 * whether all memory blocks in question are offline and trigger a BUG()
+	 * whether all memory blocks in question are offline and return error
 	 * if this is not the case.
 	 */
-	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
-				check_memblock_offlined_cb);
-	if (ret)
-		BUG();
+	rc = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
+			       check_memblock_offlined_cb);
+	if (rc)
+		goto done;
 
 	/* remove memmap entry */
 	firmware_map_remove(start, start + size, "System RAM");
@@ -1857,14 +1848,45 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
 
 	try_offline_node(nid);
 
+done:
 	mem_hotplug_done();
+	return rc;
 }
 
-void remove_memory(int nid, u64 start, u64 size)
+/**
+ * remove_memory
+ * @nid: the node ID
+ * @start: physical address of the region to remove
+ * @size: size of the region to remove
+ *
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations before this call, as required by
+ * try_offline_node().
+ */
+void __remove_memory(int nid, u64 start, u64 size)
+{
+
+	/*
+	 * trigger BUG() is some memory is not offlined prior to calling this
+	 * function
+	 */
+	if (try_remove_memory(nid, start, size))
+		BUG();
+}
+
+/*
+ * Remove memory if every memory block is offline, otherwise return -EBUSY is
+ * some memory is not offline
+ */
+int remove_memory(int nid, u64 start, u64 size)
 {
+	int rc;
+
 	lock_device_hotplug();
-	__remove_memory(nid, start, size);
+	rc  = try_remove_memory(nid, start, size);
 	unlock_device_hotplug();
+
+	return rc;
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
-- 
cgit v1.2.3


From 22fcea6f85f2cc74e61bd8b3640faa8467553c24 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Tue, 16 Jul 2019 16:30:38 -0700
Subject: mm: move MAP_SYNC to asm-generic/mman-common.h

This enables support for synchronous DAX fault on powerpc

The generic changes are added as part of b6fb293f2497 ("mm: Define
MAP_SYNC and VM_SYNC flags")

Without this, mmap returns EOPNOTSUPP for MAP_SYNC with
MAP_SHARED_VALIDATE

Instead of adding MAP_SYNC with same value to
arch/powerpc/include/uapi/asm/mman.h, I am moving the #define to
asm-generic/mman-common.h.  Two architectures using mman-common.h
directly are sparc and powerpc.  We should be able to consloidate more
#defines to mman-common.h.  That can be done as a separate patch.

Link: http://lkml.kernel.org/r/20190528091120.13322-1-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/asm-generic/mman-common.h | 3 ++-
 include/uapi/asm-generic/mman.h        | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index cb556b430e71..93a8b420ce1e 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -20,7 +20,8 @@
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
 
-/* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */
+/* 0x0100 - 0x40000 flags are defined in asm-generic/mman.h */
+#define MAP_SYNC		0x080000 /* perform synchronous page faults for the mapping */
 #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
 
 #define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index 653687d9771b..2dffcbf705b3 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -13,7 +13,6 @@
 #define MAP_NONBLOCK	0x10000		/* do not block on IO */
 #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
-#define MAP_SYNC	0x80000		/* perform synchronous page faults for the mapping */
 
 /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
 
-- 
cgit v1.2.3


From 8aa3c927ec10d1230c3ace8357f624479665f701 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Tue, 16 Jul 2019 16:30:41 -0700
Subject: mm/mmap: move common defines to mman-common.h

Two architecture that use arch specific MMAP flags are powerpc and
sparc.  We still have few flag values common across them and other
architectures.  Consolidate this in mman-common.h.

Also update the comment to indicate where to find HugeTLB specific
reserved values

Link: http://lkml.kernel.org/r/20190604090950.31417-1-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/include/uapi/asm/mman.h   | 6 +-----
 arch/sparc/include/uapi/asm/mman.h     | 6 ------
 include/uapi/asm-generic/mman-common.h | 6 +++++-
 include/uapi/asm-generic/mman.h        | 9 ++++-----
 4 files changed, 10 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index 65065ce32814..c0c737215b00 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -21,15 +21,11 @@
 #define MAP_DENYWRITE	0x0800		/* ETXTBSY */
 #define MAP_EXECUTABLE	0x1000		/* mark it as an executable */
 
+
 #define MCL_CURRENT     0x2000          /* lock all currently mapped pages */
 #define MCL_FUTURE      0x4000          /* lock all additions to address space */
 #define MCL_ONFAULT	0x8000		/* lock all pages that are faulted in */
 
-#define MAP_POPULATE	0x8000		/* populate (prefault) pagetables */
-#define MAP_NONBLOCK	0x10000		/* do not block on IO */
-#define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
-#define MAP_HUGETLB	0x40000		/* create a huge page mapping */
-
 /* Override any generic PKEY permission defines */
 #define PKEY_DISABLE_EXECUTE   0x4
 #undef PKEY_ACCESS_MASK
diff --git a/arch/sparc/include/uapi/asm/mman.h b/arch/sparc/include/uapi/asm/mman.h
index f6f99ec65bb3..cec9f4109687 100644
--- a/arch/sparc/include/uapi/asm/mman.h
+++ b/arch/sparc/include/uapi/asm/mman.h
@@ -22,10 +22,4 @@
 #define MCL_FUTURE      0x4000          /* lock all additions to address space */
 #define MCL_ONFAULT	0x8000		/* lock all pages that are faulted in */
 
-#define MAP_POPULATE	0x8000		/* populate (prefault) pagetables */
-#define MAP_NONBLOCK	0x10000		/* do not block on IO */
-#define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
-#define MAP_HUGETLB	0x40000		/* create a huge page mapping */
-
-
 #endif /* _UAPI__SPARC_MMAN_H__ */
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 93a8b420ce1e..63b1f506ea67 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -20,7 +20,11 @@
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
 
-/* 0x0100 - 0x40000 flags are defined in asm-generic/mman.h */
+/* 0x0100 - 0x4000 flags are defined in asm-generic/mman.h */
+#define MAP_POPULATE		0x008000	/* populate (prefault) pagetables */
+#define MAP_NONBLOCK		0x010000	/* do not block on IO */
+#define MAP_STACK		0x020000	/* give out an address that is best suited for process/thread stacks */
+#define MAP_HUGETLB		0x040000	/* create a huge page mapping */
 #define MAP_SYNC		0x080000 /* perform synchronous page faults for the mapping */
 #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
 
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index 2dffcbf705b3..57e8195d0b53 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -9,12 +9,11 @@
 #define MAP_EXECUTABLE	0x1000		/* mark it as an executable */
 #define MAP_LOCKED	0x2000		/* pages are locked */
 #define MAP_NORESERVE	0x4000		/* don't check for reservations */
-#define MAP_POPULATE	0x8000		/* populate (prefault) pagetables */
-#define MAP_NONBLOCK	0x10000		/* do not block on IO */
-#define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
-#define MAP_HUGETLB	0x40000		/* create a huge page mapping */
 
-/* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
+/*
+ * Bits [26:31] are reserved, see asm-generic/hugetlb_encode.h
+ * for MAP_HUGETLB usage
+ */
 
 #define MCL_CURRENT	1		/* lock all current mappings */
 #define MCL_FUTURE	2		/* lock all future mappings */
-- 
cgit v1.2.3


From 7588adf8dff12c4b358557a13796a25fef796548 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 16 Jul 2019 16:30:44 -0700
Subject: mm: clean up is_device_*_page() definitions

Refactor is_device_{public,private}_page() with is_pci_p2pdma_page() to
make them all consistent in depending on their respective config options
even when CONFIG_DEV_PAGEMAP_OPS is enabled for other reasons.  This
allows a little more compile-time optimisation as well as the conceptual
and cosmetic cleanup.

Link: http://lkml.kernel.org/r/187c2ab27dea70635d375a61b2f2076d26c032b0.1558547956.git.robin.murphy@arm.com
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Suggested-by: Jerome Glisse <jglisse@redhat.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oliver O'Halloran <oohall@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 74797ed20c2c..baa8b8761d8c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -956,41 +956,28 @@ static inline bool put_devmap_managed_page(struct page *page)
 	return false;
 }
 
-static inline bool is_device_private_page(const struct page *page)
-{
-	return is_zone_device_page(page) &&
-		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
-}
-
-#ifdef CONFIG_PCI_P2PDMA
-static inline bool is_pci_p2pdma_page(const struct page *page)
-{
-	return is_zone_device_page(page) &&
-		page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
-}
-#else /* CONFIG_PCI_P2PDMA */
-static inline bool is_pci_p2pdma_page(const struct page *page)
-{
-	return false;
-}
-#endif /* CONFIG_PCI_P2PDMA */
-
 #else /* CONFIG_DEV_PAGEMAP_OPS */
 static inline bool put_devmap_managed_page(struct page *page)
 {
 	return false;
 }
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
 
 static inline bool is_device_private_page(const struct page *page)
 {
-	return false;
+	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+		IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+		is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
 
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
-	return false;
+	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+		IS_ENABLED(CONFIG_PCI_P2PDMA) &&
+		is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
 }
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
 
 /* 127: arbitrary random number, small enough to assemble well */
 #define page_ref_zero_or_close_to_overflow(page) \
-- 
cgit v1.2.3


From 175967318c3018d01931ac950c82adab5deb47ca Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 16 Jul 2019 16:30:47 -0700
Subject: mm: introduce ARCH_HAS_PTE_DEVMAP

ARCH_HAS_ZONE_DEVICE is somewhat meaningless in itself, and combined
with the long-out-of-date comment can lead to the impression than an
architecture may just enable it (since __add_pages() now "comprehends
device memory" for itself) and expect things to work.

In practice, however, ZONE_DEVICE users have little chance of
functioning correctly without __HAVE_ARCH_PTE_DEVMAP, so let's clean
that up the same way as ARCH_HAS_PTE_SPECIAL and make it the proper
dependency so the real situation is clearer.

Link: http://lkml.kernel.org/r/87554aa78478a02a63f2c4cf60a847279ae3eb3b.1558547956.git.robin.murphy@arm.com
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Acked-by: Oliver O'Halloran <oohall@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/Kconfig                         | 2 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h | 1 -
 arch/x86/Kconfig                             | 2 +-
 arch/x86/include/asm/pgtable.h               | 4 ++--
 arch/x86/include/asm/pgtable_types.h         | 1 -
 include/linux/mm.h                           | 4 ++--
 include/linux/pfn_t.h                        | 4 ++--
 mm/Kconfig                                   | 5 ++---
 mm/gup.c                                     | 2 +-
 9 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f516796dd819..d8dcd8820369 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -129,6 +129,7 @@ config PPC
 	select ARCH_HAS_MMIOWB			if PPC64
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PMEM_API                if PPC64
+	select ARCH_HAS_PTE_DEVMAP		if PPC_BOOK3S_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC64
@@ -136,7 +137,6 @@ config PPC
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if PPC64
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_ZONE_DEVICE		if PPC_BOOK3S_64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_KEEP_MEMBLOCK
 	select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 62e6ea0a7650..8308f32e9782 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -90,7 +90,6 @@
 #define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
 #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
 #define _PAGE_DEVMAP		_RPAGE_SW1 /* software: ZONE_DEVICE page */
-#define __HAVE_ARCH_PTE_DEVMAP
 
 /*
  * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 879741336771..4a55bd01e918 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -70,6 +70,7 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_PMEM_API		if X86_64
+	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
@@ -80,7 +81,6 @@ config X86
 	select ARCH_HAS_STRICT_MODULE_RWX
 	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_ZONE_DEVICE		if X86_64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_MIGHT_HAVE_ACPI_PDC		if ACPI
 	select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5e0509b41986..0bc530c4eb13 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -271,7 +271,7 @@ static inline int has_transparent_hugepage(void)
 	return boot_cpu_has(X86_FEATURE_PSE);
 }
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pmd_devmap(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_DEVMAP);
@@ -732,7 +732,7 @@ static inline int pte_present(pte_t a)
 	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pte_devmap(pte_t a)
 {
 	return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d6ff0bbdb394..b5e49e6bac63 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -103,7 +103,6 @@
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
-#define __HAVE_ARCH_PTE_DEVMAP
 #else
 #define _PAGE_NX	(_AT(pteval_t, 0))
 #define _PAGE_DEVMAP	(_AT(pteval_t, 0))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index baa8b8761d8c..f43f4de4de68 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -547,7 +547,7 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
 struct mmu_gather;
 struct inode;
 
-#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static inline int pmd_devmap(pmd_t pmd)
 {
 	return 0;
@@ -1750,7 +1750,7 @@ static inline void sync_mm_rss(struct mm_struct *mm)
 }
 #endif
 
-#ifndef __HAVE_ARCH_PTE_DEVMAP
+#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pte_devmap(pte_t pte)
 {
 	return 0;
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 01e8037023f7..2d9148221e9a 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -97,7 +97,7 @@ static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot)
 #endif
 #endif
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline bool pfn_t_devmap(pfn_t pfn)
 {
 	const u64 flags = PFN_DEV|PFN_MAP;
@@ -115,7 +115,7 @@ pmd_t pmd_mkdevmap(pmd_t pmd);
 	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
 pud_t pud_mkdevmap(pud_t pud);
 #endif
-#endif /* __HAVE_ARCH_PTE_DEVMAP */
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
 
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static inline bool pfn_t_special(pfn_t pfn)
diff --git a/mm/Kconfig b/mm/Kconfig
index 495d7368ced8..56cec636a1fc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,8 +649,7 @@ config IDLE_PAGE_TRACKING
 	  See Documentation/admin-guide/mm/idle_page_tracking.rst for
 	  more details.
 
-# arch_add_memory() comprehends device memory
-config ARCH_HAS_ZONE_DEVICE
+config ARCH_HAS_PTE_DEVMAP
 	bool
 
 config ZONE_DEVICE
@@ -658,7 +657,7 @@ config ZONE_DEVICE
 	depends on MEMORY_HOTPLUG
 	depends on MEMORY_HOTREMOVE
 	depends on SPARSEMEM_VMEMMAP
-	depends on ARCH_HAS_ZONE_DEVICE
+	depends on ARCH_HAS_PTE_DEVMAP
 	select XARRAY_MULTI
 
 	help
diff --git a/mm/gup.c b/mm/gup.c
index 8bbaa5523116..98f13ab37bac 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1895,7 +1895,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 }
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
-#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 		unsigned long end, struct page **pages, int *nr)
 {
-- 
cgit v1.2.3


From 79eb597cba06c435b72f220e9d426ae413fc2579 Mon Sep 17 00:00:00 2001
From: Daniel Jordan <daniel.m.jordan@oracle.com>
Date: Tue, 16 Jul 2019 16:30:54 -0700
Subject: mm: add account_locked_vm utility function

locked_vm accounting is done roughly the same way in five places, so
unify them in a helper.

Include the helper's caller in the debug print to distinguish between
callsites.

Error codes stay the same, so user-visible behavior does too.  The one
exception is that the -EPERM case in tce_account_locked_vm is removed
because Alexey has never seen it triggered.

[daniel.m.jordan@oracle.com: v3]
  Link: http://lkml.kernel.org/r/20190529205019.20927-1-daniel.m.jordan@oracle.com
[sfr@canb.auug.org.au: fix mm/util.c]
Link: http://lkml.kernel.org/r/20190524175045.26897-1-daniel.m.jordan@oracle.com
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Cc: Alan Tull <atull@kernel.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Moritz Fischer <mdf@kernel.org>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Steve Sistare <steven.sistare@oracle.com>
Cc: Wu Hao <hao.wu@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kvm/book3s_64_vio.c     | 44 ++-------------------
 arch/powerpc/mm/book3s64/iommu_api.c | 41 ++------------------
 drivers/fpga/dfl-afu-dma-region.c    | 53 ++-----------------------
 drivers/vfio/vfio_iommu_spapr_tce.c  | 54 +++-----------------------
 drivers/vfio/vfio_iommu_type1.c      | 17 +-------
 include/linux/mm.h                   |  4 ++
 mm/util.c                            | 75 ++++++++++++++++++++++++++++++++++++
 7 files changed, 98 insertions(+), 190 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 5bf05cc774e2..e99a14798ab0 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -19,6 +19,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/iommu.h>
 #include <linux/file.h>
+#include <linux/mm.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -45,43 +46,6 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
 	return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
 }
 
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
-{
-	long ret = 0;
-
-	if (!current || !current->mm)
-		return ret; /* process exited */
-
-	down_write(&current->mm->mmap_sem);
-
-	if (inc) {
-		unsigned long locked, lock_limit;
-
-		locked = current->mm->locked_vm + stt_pages;
-		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			ret = -ENOMEM;
-		else
-			current->mm->locked_vm += stt_pages;
-	} else {
-		if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
-			stt_pages = current->mm->locked_vm;
-
-		current->mm->locked_vm -= stt_pages;
-	}
-
-	pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
-			inc ? '+' : '-',
-			stt_pages << PAGE_SHIFT,
-			current->mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK),
-			ret ? " - exceeded" : "");
-
-	up_write(&current->mm->mmap_sem);
-
-	return ret;
-}
-
 static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
 {
 	struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
@@ -291,7 +255,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 
 	kvm_put_kvm(stt->kvm);
 
-	kvmppc_account_memlimit(
+	account_locked_vm(current->mm,
 		kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
 	call_rcu(&stt->rcu, release_spapr_tce_table);
 
@@ -316,7 +280,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 		return -EINVAL;
 
 	npages = kvmppc_tce_pages(size);
-	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+	ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true);
 	if (ret)
 		return ret;
 
@@ -362,7 +326,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 
 	kfree(stt);
  fail_acct:
-	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
+	account_locked_vm(current->mm, kvmppc_stt_pages(npages), false);
 	return ret;
 }
 
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 90ee3a89722c..b056cae3388b 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -14,6 +14,7 @@
 #include <linux/hugetlb.h>
 #include <linux/swap.h>
 #include <linux/sizes.h>
+#include <linux/mm.h>
 #include <asm/mmu_context.h>
 #include <asm/pte-walk.h>
 #include <linux/mm_inline.h>
@@ -46,40 +47,6 @@ struct mm_iommu_table_group_mem_t {
 	u64 dev_hpa;		/* Device memory base address */
 };
 
-static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
-		unsigned long npages, bool incr)
-{
-	long ret = 0, locked, lock_limit;
-
-	if (!npages)
-		return 0;
-
-	down_write(&mm->mmap_sem);
-
-	if (incr) {
-		locked = mm->locked_vm + npages;
-		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			ret = -ENOMEM;
-		else
-			mm->locked_vm += npages;
-	} else {
-		if (WARN_ON_ONCE(npages > mm->locked_vm))
-			npages = mm->locked_vm;
-		mm->locked_vm -= npages;
-	}
-
-	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
-			current ? current->pid : 0,
-			incr ? '+' : '-',
-			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK));
-	up_write(&mm->mmap_sem);
-
-	return ret;
-}
-
 bool mm_iommu_preregistered(struct mm_struct *mm)
 {
 	return !list_empty(&mm->context.iommu_group_mem_list);
@@ -96,7 +63,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
 	unsigned long entry, chunk;
 
 	if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
-		ret = mm_iommu_adjust_locked_vm(mm, entries, true);
+		ret = account_locked_vm(mm, entries, true);
 		if (ret)
 			return ret;
 
@@ -211,7 +178,7 @@ free_exit:
 	kfree(mem);
 
 unlock_exit:
-	mm_iommu_adjust_locked_vm(mm, locked_entries, false);
+	account_locked_vm(mm, locked_entries, false);
 
 	return ret;
 }
@@ -311,7 +278,7 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
 unlock_exit:
 	mutex_unlock(&mem_list_mutex);
 
-	mm_iommu_adjust_locked_vm(mm, unlock_entries, false);
+	account_locked_vm(mm, unlock_entries, false);
 
 	return ret;
 }
diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c
index dcd80b088c7b..62f924489db5 100644
--- a/drivers/fpga/dfl-afu-dma-region.c
+++ b/drivers/fpga/dfl-afu-dma-region.c
@@ -12,6 +12,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/sched/signal.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 
 #include "dfl-afu.h"
 
@@ -31,52 +32,6 @@ void afu_dma_region_init(struct dfl_feature_platform_data *pdata)
 	afu->dma_regions = RB_ROOT;
 }
 
-/**
- * afu_dma_adjust_locked_vm - adjust locked memory
- * @dev: port device
- * @npages: number of pages
- * @incr: increase or decrease locked memory
- *
- * Increase or decrease the locked memory size with npages input.
- *
- * Return 0 on success.
- * Return -ENOMEM if locked memory size is over the limit and no CAP_IPC_LOCK.
- */
-static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr)
-{
-	unsigned long locked, lock_limit;
-	int ret = 0;
-
-	/* the task is exiting. */
-	if (!current->mm)
-		return 0;
-
-	down_write(&current->mm->mmap_sem);
-
-	if (incr) {
-		locked = current->mm->locked_vm + npages;
-		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			ret = -ENOMEM;
-		else
-			current->mm->locked_vm += npages;
-	} else {
-		if (WARN_ON_ONCE(npages > current->mm->locked_vm))
-			npages = current->mm->locked_vm;
-		current->mm->locked_vm -= npages;
-	}
-
-	dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid,
-		incr ? '+' : '-', npages << PAGE_SHIFT,
-		current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK),
-		ret ? "- exceeded" : "");
-
-	up_write(&current->mm->mmap_sem);
-
-	return ret;
-}
-
 /**
  * afu_dma_pin_pages - pin pages of given dma memory region
  * @pdata: feature device platform data
@@ -92,7 +47,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
 	struct device *dev = &pdata->dev->dev;
 	int ret, pinned;
 
-	ret = afu_dma_adjust_locked_vm(dev, npages, true);
+	ret = account_locked_vm(current->mm, npages, true);
 	if (ret)
 		return ret;
 
@@ -121,7 +76,7 @@ put_pages:
 free_pages:
 	kfree(region->pages);
 unlock_vm:
-	afu_dma_adjust_locked_vm(dev, npages, false);
+	account_locked_vm(current->mm, npages, false);
 	return ret;
 }
 
@@ -141,7 +96,7 @@ static void afu_dma_unpin_pages(struct dfl_feature_platform_data *pdata,
 
 	put_all_pages(region->pages, npages);
 	kfree(region->pages);
-	afu_dma_adjust_locked_vm(dev, npages, false);
+	account_locked_vm(current->mm, npages, false);
 
 	dev_dbg(dev, "%ld pages unpinned\n", npages);
 }
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 7048c9198c21..8ce9ad21129f 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -19,6 +19,7 @@
 #include <linux/vmalloc.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
+#include <linux/mm.h>
 
 #include <asm/iommu.h>
 #include <asm/tce.h>
@@ -31,51 +32,6 @@
 static void tce_iommu_detach_group(void *iommu_data,
 		struct iommu_group *iommu_group);
 
-static long try_increment_locked_vm(struct mm_struct *mm, long npages)
-{
-	long ret = 0, locked, lock_limit;
-
-	if (WARN_ON_ONCE(!mm))
-		return -EPERM;
-
-	if (!npages)
-		return 0;
-
-	down_write(&mm->mmap_sem);
-	locked = mm->locked_vm + npages;
-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-		ret = -ENOMEM;
-	else
-		mm->locked_vm += npages;
-
-	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
-			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK),
-			ret ? " - exceeded" : "");
-
-	up_write(&mm->mmap_sem);
-
-	return ret;
-}
-
-static void decrement_locked_vm(struct mm_struct *mm, long npages)
-{
-	if (!mm || !npages)
-		return;
-
-	down_write(&mm->mmap_sem);
-	if (WARN_ON_ONCE(npages > mm->locked_vm))
-		npages = mm->locked_vm;
-	mm->locked_vm -= npages;
-	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
-			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK));
-	up_write(&mm->mmap_sem);
-}
-
 /*
  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  *
@@ -333,7 +289,7 @@ static int tce_iommu_enable(struct tce_container *container)
 		return ret;
 
 	locked = table_group->tce32_size >> PAGE_SHIFT;
-	ret = try_increment_locked_vm(container->mm, locked);
+	ret = account_locked_vm(container->mm, locked, true);
 	if (ret)
 		return ret;
 
@@ -352,7 +308,7 @@ static void tce_iommu_disable(struct tce_container *container)
 	container->enabled = false;
 
 	BUG_ON(!container->mm);
-	decrement_locked_vm(container->mm, container->locked_pages);
+	account_locked_vm(container->mm, container->locked_pages, false);
 }
 
 static void *tce_iommu_open(unsigned long arg)
@@ -656,7 +612,7 @@ static long tce_iommu_create_table(struct tce_container *container,
 	if (!table_size)
 		return -EINVAL;
 
-	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
+	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
 	if (ret)
 		return ret;
 
@@ -675,7 +631,7 @@ static void tce_iommu_free_table(struct tce_container *container,
 	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 
 	iommu_tce_table_put(tbl);
-	decrement_locked_vm(container->mm, pages);
+	account_locked_vm(container->mm, pages, false);
 }
 
 static long tce_iommu_create_window(struct tce_container *container,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index add34adfadc7..054391f30fa8 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -272,21 +272,8 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 
 	ret = down_write_killable(&mm->mmap_sem);
 	if (!ret) {
-		if (npage > 0) {
-			if (!dma->lock_cap) {
-				unsigned long limit;
-
-				limit = task_rlimit(dma->task,
-						RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-				if (mm->locked_vm + npage > limit)
-					ret = -ENOMEM;
-			}
-		}
-
-		if (!ret)
-			mm->locked_vm += npage;
-
+		ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
+					  dma->lock_cap);
 		up_write(&mm->mmap_sem);
 	}
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f43f4de4de68..bd6512559bed 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1543,6 +1543,10 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 int get_user_pages_fast(unsigned long start, int nr_pages,
 			unsigned int gup_flags, struct page **pages);
 
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+			struct task_struct *task, bool bypass_rlim);
+
 /* Container for pinned pfns / pages */
 struct frame_vector {
 	unsigned int nr_allocated;	/* Number of frames we have space for */
diff --git a/mm/util.c b/mm/util.c
index 68575a315dc5..e6351a80f248 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,7 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/task_stack.h>
 #include <linux/security.h>
 #include <linux/swap.h>
@@ -300,6 +301,80 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 }
 #endif
 
+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ * @task:        task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_sem is held as writer.
+ *
+ * Return:
+ * * 0       on success
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+			struct task_struct *task, bool bypass_rlim)
+{
+	unsigned long locked_vm, limit;
+	int ret = 0;
+
+	lockdep_assert_held_write(&mm->mmap_sem);
+
+	locked_vm = mm->locked_vm;
+	if (inc) {
+		if (!bypass_rlim) {
+			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+			if (locked_vm + pages > limit)
+				ret = -ENOMEM;
+		}
+		if (!ret)
+			mm->locked_vm = locked_vm + pages;
+	} else {
+		WARN_ON_ONCE(pages > locked_vm);
+		mm->locked_vm = locked_vm - pages;
+	}
+
+	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+		 ret ? " - exceeded" : "");
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__account_locked_vm);
+
+/**
+ * account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against, may be NULL
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ *
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
+ *
+ * Return:
+ * * 0       on success, or if mm is NULL
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
+{
+	int ret;
+
+	if (pages == 0 || !mm)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+	ret = __account_locked_vm(mm, pages, inc, current,
+				  capable(CAP_IPC_LOCK));
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(account_locked_vm);
+
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
 	unsigned long flag, unsigned long pgoff)
-- 
cgit v1.2.3


From 8c2e408e73f735d2e6e8b43f9b038c9abb082939 Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Fri, 12 Jul 2019 10:46:10 +0530
Subject: virtio_pmem: fix sparse warning

This patch fixes below sparse warning related to __virtio
type in virtio pmem driver. This is reported by Intel test
bot on linux-next tree.

nd_virtio.c:56:28: warning: incorrect type in assignment
                                (different base types)
nd_virtio.c:56:28:    expected unsigned int [unsigned] [usertype] type
nd_virtio.c:56:28:    got restricted __virtio32
nd_virtio.c:93:59: warning: incorrect type in argument 2
                                (different base types)
nd_virtio.c:93:59:    expected restricted __virtio32 [usertype] val
nd_virtio.c:93:59:    got unsigned int [unsigned] [usertype] ret

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/nd_virtio.c       | 4 ++--
 include/uapi/linux/virtio_pmem.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
index 8645275c08c2..10351d5b49fa 100644
--- a/drivers/nvdimm/nd_virtio.c
+++ b/drivers/nvdimm/nd_virtio.c
@@ -53,7 +53,7 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
 	init_waitqueue_head(&req_data->host_acked);
 	init_waitqueue_head(&req_data->wq_buf);
 	INIT_LIST_HEAD(&req_data->list);
-	req_data->req.type = cpu_to_virtio32(vdev, VIRTIO_PMEM_REQ_TYPE_FLUSH);
+	req_data->req.type = cpu_to_le32(VIRTIO_PMEM_REQ_TYPE_FLUSH);
 	sg_init_one(&sg, &req_data->req, sizeof(req_data->req));
 	sgs[0] = &sg;
 	sg_init_one(&ret, &req_data->resp.ret, sizeof(req_data->resp));
@@ -90,7 +90,7 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
 	} else {
 		/* A host repsonse results in "host_ack" getting called */
 		wait_event(req_data->host_acked, req_data->done);
-		err = virtio32_to_cpu(vdev, req_data->resp.ret);
+		err = le32_to_cpu(req_data->resp.ret);
 	}
 
 	kfree(req_data);
diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h
index efcd72f2d20d..9a63ed6d062f 100644
--- a/include/uapi/linux/virtio_pmem.h
+++ b/include/uapi/linux/virtio_pmem.h
@@ -23,12 +23,12 @@ struct virtio_pmem_config {
 
 struct virtio_pmem_resp {
 	/* Host return status corresponding to flush request */
-	__u32 ret;
+	__le32 ret;
 };
 
 struct virtio_pmem_req {
 	/* command type */
-	__u32 type;
+	__le32 type;
 };
 
 #endif
-- 
cgit v1.2.3


From 56cbb429d911991170fe867b4bba14f0efed5829 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Jul 2019 16:57:51 -0400
Subject: switch the remnants of releasing the mountpoint away from fs_pin

We used to need rather convoluted ordering trickery to guarantee
that dput() of ex-mountpoints happens before the final mntput()
of the same.  Since we don't need that anymore, there's no point
playing with fs_pin for that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs_pin.c            | 10 ++--------
 fs/mount.h             |  7 +++++--
 fs/namespace.c         | 37 +++++++++++++++++++------------------
 include/linux/fs_pin.h |  1 -
 4 files changed, 26 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index a6497cf8ae53..47ef3c71ce90 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -19,20 +19,14 @@ void pin_remove(struct fs_pin *pin)
 	spin_unlock_irq(&pin->wait.lock);
 }
 
-void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
+void pin_insert(struct fs_pin *pin, struct vfsmount *m)
 {
 	spin_lock(&pin_lock);
-	if (p)
-		hlist_add_head(&pin->s_list, p);
+	hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
 	hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
 	spin_unlock(&pin_lock);
 }
 
-void pin_insert(struct fs_pin *pin, struct vfsmount *m)
-{
-	pin_insert_group(pin, m, &m->mnt_sb->s_pins);
-}
-
 void pin_kill(struct fs_pin *p)
 {
 	wait_queue_entry_t wait;
diff --git a/fs/mount.h b/fs/mount.h
index 84aa8cdf4971..711a4093e475 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,7 +58,10 @@ struct mount {
 	struct mount *mnt_master;	/* slave is on master->mnt_slave_list */
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct mountpoint *mnt_mp;	/* where is it mounted */
-	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+	union {
+		struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+		struct hlist_node mnt_umount;
+	};
 	struct list_head mnt_umounting; /* list entry for umount propagation */
 #ifdef CONFIG_FSNOTIFY
 	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
@@ -68,7 +71,7 @@ struct mount {
 	int mnt_group_id;		/* peer group identifier */
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	struct hlist_head mnt_pins;
-	struct fs_pin mnt_umount;
+	struct hlist_head mnt_stuck_children;
 } __randomize_layout;
 
 #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
diff --git a/fs/namespace.c b/fs/namespace.c
index 46316ba15615..54a815e48ead 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -171,13 +171,6 @@ unsigned int mnt_get_count(struct mount *mnt)
 #endif
 }
 
-static void drop_mountpoint(struct fs_pin *p)
-{
-	struct mount *m = container_of(p, struct mount, mnt_umount);
-	pin_remove(p);
-	mntput(&m->mnt);
-}
-
 static struct mount *alloc_vfsmnt(const char *name)
 {
 	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -215,7 +208,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 		INIT_LIST_HEAD(&mnt->mnt_umounting);
-		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
+		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
 	}
 	return mnt;
 
@@ -1087,19 +1080,22 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 
 static void cleanup_mnt(struct mount *mnt)
 {
+	struct hlist_node *p;
+	struct mount *m;
 	/*
-	 * This probably indicates that somebody messed
-	 * up a mnt_want/drop_write() pair.  If this
-	 * happens, the filesystem was probably unable
-	 * to make r/w->r/o transitions.
-	 */
-	/*
+	 * The warning here probably indicates that somebody messed
+	 * up a mnt_want/drop_write() pair.  If this happens, the
+	 * filesystem was probably unable to make r/w->r/o transitions.
 	 * The locking used to deal with mnt_count decrement provides barriers,
 	 * so mnt_get_writers() below is safe.
 	 */
 	WARN_ON(mnt_get_writers(mnt));
 	if (unlikely(mnt->mnt_pins.first))
 		mnt_pin_kill(mnt);
+	hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		mntput(&m->mnt);
+	}
 	fsnotify_vfsmount_delete(&mnt->mnt);
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
@@ -1168,6 +1164,7 @@ static void mntput_no_expire(struct mount *mnt)
 		struct mount *p, *tmp;
 		list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
 			__put_mountpoint(unhash_mnt(p), &list);
+			hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
 		}
 	}
 	unlock_mount_hash();
@@ -1360,6 +1357,8 @@ EXPORT_SYMBOL(may_umount);
 static void namespace_unlock(void)
 {
 	struct hlist_head head;
+	struct hlist_node *p;
+	struct mount *m;
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
@@ -1374,7 +1373,10 @@ static void namespace_unlock(void)
 
 	synchronize_rcu_expedited();
 
-	group_pin_kill(&head);
+	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		mntput(&m->mnt);
+	}
 }
 
 static inline void namespace_lock(void)
@@ -1461,8 +1463,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 
 		disconnect = disconnect_mount(p, how);
 
-		pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
-				 disconnect ? &unmounted : NULL);
 		if (mnt_has_parent(p)) {
 			mnt_add_count(p->mnt_parent, -1);
 			if (!disconnect) {
@@ -1470,6 +1470,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
 			} else {
 				umount_mnt(p);
+				hlist_add_head(&p->mnt_umount, &unmounted);
 			}
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
@@ -1622,8 +1623,8 @@ void __detach_mounts(struct dentry *dentry)
 	while (!hlist_empty(&mp->m_list)) {
 		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
 		if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
-			hlist_add_head(&mnt->mnt_umount.s_list, &unmounted);
 			umount_mnt(mnt);
+			hlist_add_head(&mnt->mnt_umount, &unmounted);
 		}
 		else umount_tree(mnt, UMOUNT_CONNECTED);
 	}
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
index 7cab74d66f85..bdd09fd2520c 100644
--- a/include/linux/fs_pin.h
+++ b/include/linux/fs_pin.h
@@ -20,6 +20,5 @@ static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *))
 }
 
 void pin_remove(struct fs_pin *);
-void pin_insert_group(struct fs_pin *, struct vfsmount *, struct hlist_head *);
 void pin_insert(struct fs_pin *, struct vfsmount *);
 void pin_kill(struct fs_pin *);
-- 
cgit v1.2.3


From 7ad388d8e4c703980b7018b938cdeec58832d78d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 17 Jun 2019 14:19:53 +0200
Subject: scsi: core: add a host / host template field for the virt boundary

This allows drivers setting it up easily instead of branching out to block
layer calls in slave_alloc, and ensures the upgraded max_segment_size
setting gets picked up by the DMA layer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Kashyap Desai < kashyap.desai@broadcom.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/hosts.c     | 3 +++
 drivers/scsi/scsi_lib.c  | 3 ++-
 include/scsi/scsi_host.h | 3 +++
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 96ed24841c33..f98509d717a0 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -462,6 +462,9 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	else
 		shost->dma_boundary = 0xffffffff;
 
+	if (sht->virt_boundary_mask)
+		shost->virt_boundary_mask = sht->virt_boundary_mask;
+
 	device_initialize(&shost->shost_gendev);
 	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
 	shost->shost_gendev.bus = &scsi_bus_type;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 420e2354b36b..404e5e28ef62 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1791,7 +1791,8 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
 	dma_set_seg_boundary(dev, shost->dma_boundary);
 
 	blk_queue_max_segment_size(q, shost->max_segment_size);
-	dma_set_max_seg_size(dev, shost->max_segment_size);
+	blk_queue_virt_boundary(q, shost->virt_boundary_mask);
+	dma_set_max_seg_size(dev, queue_max_segment_size(q));
 
 	/*
 	 * Set a reasonable default alignment:  The larger of 32-byte (dword),
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index a5fcdad4a03e..cc139dbd71e5 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -369,6 +369,8 @@ struct scsi_host_template {
 	 */
 	unsigned long dma_boundary;
 
+	unsigned long virt_boundary_mask;
+
 	/*
 	 * This specifies "machine infinity" for host templates which don't
 	 * limit the transfer size.  Note this limit represents an absolute
@@ -587,6 +589,7 @@ struct Scsi_Host {
 	unsigned int max_sectors;
 	unsigned int max_segment_size;
 	unsigned long dma_boundary;
+	unsigned long virt_boundary_mask;
 	/*
 	 * In scsi-mq mode, the number of hardware queues supported by the LLD.
 	 *
-- 
cgit v1.2.3


From bce5963bcb4f9934faa52be323994511d59fd13c Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 21 Jun 2019 20:47:03 +0200
Subject: xen/events: fix binding user event channels to cpus

When binding an interdomain event channel to a vcpu via
IOCTL_EVTCHN_BIND_INTERDOMAIN not only the event channel needs to be
bound, but the affinity of the associated IRQi must be changed, too.
Otherwise the IRQ and the event channel won't be moved to another vcpu
in case the original vcpu they were bound to is going offline.

Cc: <stable@vger.kernel.org> # 4.13
Fixes: c48f64ab472389df ("xen-evtchn: Bind dyn evtchn:qemu-dm interrupt to next online VCPU")
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 drivers/xen/events/events_base.c | 12 ++++++++++--
 drivers/xen/evtchn.c             |  2 +-
 include/xen/events.h             |  3 ++-
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index ff9b51055b14..2e8570c09789 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1294,7 +1294,7 @@ void rebind_evtchn_irq(int evtchn, int irq)
 }
 
 /* Rebind an evtchn so that it gets delivered to a specific cpu */
-int xen_rebind_evtchn_to_cpu(int evtchn, unsigned tcpu)
+static int xen_rebind_evtchn_to_cpu(int evtchn, unsigned int tcpu)
 {
 	struct evtchn_bind_vcpu bind_vcpu;
 	int masked;
@@ -1328,7 +1328,6 @@ int xen_rebind_evtchn_to_cpu(int evtchn, unsigned tcpu)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(xen_rebind_evtchn_to_cpu);
 
 static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
 			    bool force)
@@ -1342,6 +1341,15 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
 	return ret;
 }
 
+/* To be called with desc->lock held. */
+int xen_set_affinity_evtchn(struct irq_desc *desc, unsigned int tcpu)
+{
+	struct irq_data *d = irq_desc_get_irq_data(desc);
+
+	return set_affinity_irq(d, cpumask_of(tcpu), false);
+}
+EXPORT_SYMBOL_GPL(xen_set_affinity_evtchn);
+
 static void enable_dynirq(struct irq_data *data)
 {
 	int evtchn = evtchn_from_irq(data->irq);
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index f341b016672f..052b55a14ebc 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -447,7 +447,7 @@ static void evtchn_bind_interdom_next_vcpu(int evtchn)
 	this_cpu_write(bind_last_selected_cpu, selected_cpu);
 
 	/* unmask expects irqs to be disabled */
-	xen_rebind_evtchn_to_cpu(evtchn, selected_cpu);
+	xen_set_affinity_evtchn(desc, selected_cpu);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
 
diff --git a/include/xen/events.h b/include/xen/events.h
index a48897199975..c0e6a0598397 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -3,6 +3,7 @@
 #define _XEN_EVENTS_H
 
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #ifdef CONFIG_PCI_MSI
 #include <linux/msi.h>
 #endif
@@ -59,7 +60,7 @@ void evtchn_put(unsigned int evtchn);
 
 void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector);
 void rebind_evtchn_irq(int evtchn, int irq);
-int xen_rebind_evtchn_to_cpu(int evtchn, unsigned tcpu);
+int xen_set_affinity_evtchn(struct irq_desc *desc, unsigned int tcpu);
 
 static inline void notify_remote_via_evtchn(int port)
 {
-- 
cgit v1.2.3


From 814bbf49dcd0ad642e7ceb8991e57555c5472cce Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Sun, 14 Jul 2019 14:04:14 +0200
Subject: xen: remove tmem driver

The Xen tmem (transcendent memory) driver can be removed, as the
related Xen hypervisor feature never made it past the "experimental"
state and will be removed in future Xen versions (>= 4.13).

The xen-selfballoon driver depends on tmem, so it can be removed, too.

Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  21 -
 drivers/xen/Kconfig                             |  23 -
 drivers/xen/Makefile                            |   2 -
 drivers/xen/tmem.c                              | 419 -----------------
 drivers/xen/xen-balloon.c                       |   2 -
 drivers/xen/xen-selfballoon.c                   | 579 ------------------------
 include/xen/balloon.h                           |  10 -
 include/xen/tmem.h                              |  18 -
 8 files changed, 1074 deletions(-)
 delete mode 100644 drivers/xen/tmem.c
 delete mode 100644 drivers/xen/xen-selfballoon.c
 delete mode 100644 include/xen/tmem.h

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f8b62360b18c..99db4975e6b5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4674,27 +4674,6 @@
 			Force threading of all interrupt handlers except those
 			marked explicitly IRQF_NO_THREAD.
 
-	tmem		[KNL,XEN]
-			Enable the Transcendent memory driver if built-in.
-
-	tmem.cleancache=0|1 [KNL, XEN]
-			Default is on (1). Disable the usage of the cleancache
-			API to send anonymous pages to the hypervisor.
-
-	tmem.frontswap=0|1 [KNL, XEN]
-			Default is on (1). Disable the usage of the frontswap
-			API to send swap pages to the hypervisor. If disabled
-			the selfballooning and selfshrinking are force disabled.
-
-	tmem.selfballooning=0|1 [KNL, XEN]
-			Default is on (1). Disable the driving of swap pages
-			to the hypervisor.
-
-	tmem.selfshrinking=0|1 [KNL, XEN]
-			Default is on (1). Partial swapoff that immediately
-			transfers pages from Xen hypervisor back to the
-			kernel based on different criteria.
-
 	topology=	[S390]
 			Format: {off | on}
 			Specify if the kernel should make use of the cpu
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index ec6558b79e9d..79cc75096f42 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -10,21 +10,6 @@ config XEN_BALLOON
 	  the system to expand the domain's memory allocation, or alternatively
 	  return unneeded memory to the system.
 
-config XEN_SELFBALLOONING
-	bool "Dynamically self-balloon kernel memory to target"
-	depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP && XEN_TMEM
-	help
-	  Self-ballooning dynamically balloons available kernel memory driven
-	  by the current usage of anonymous memory ("committed AS") and
-	  controlled by various sysfs-settable parameters.  Configuring
-	  FRONTSWAP is highly recommended; if it is not configured, self-
-	  ballooning is disabled by default. If FRONTSWAP is configured,
-	  frontswap-selfshrinking is enabled by default but can be disabled
-	  with the 'tmem.selfshrink=0' kernel boot parameter; and self-ballooning
-	  is enabled by default but can be disabled with the 'tmem.selfballooning=0'
-	  kernel boot parameter.  Note that systems without a sufficiently
-	  large swap device should not enable self-ballooning.
-
 config XEN_BALLOON_MEMORY_HOTPLUG
 	bool "Memory hotplug support for Xen balloon driver"
 	depends on XEN_BALLOON && MEMORY_HOTPLUG
@@ -191,14 +176,6 @@ config SWIOTLB_XEN
 	def_bool y
 	select SWIOTLB
 
-config XEN_TMEM
-	tristate
-	depends on !ARM && !ARM64
-	default m if (CLEANCACHE || FRONTSWAP)
-	help
-	  Shim to interface in-kernel Transcendent Memory hooks
-	  (e.g. cleancache and frontswap) to Xen tmem hypercalls.
-
 config XEN_PCIDEV_BACKEND
 	tristate "Xen PCI-device backend driver"
 	depends on PCI && X86 && XEN
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index ad3844d9f876..0c4efa6fe450 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -17,14 +17,12 @@ dom0-$(CONFIG_X86) += pcpu.o
 obj-$(CONFIG_XEN_DOM0)			+= $(dom0-y)
 obj-$(CONFIG_BLOCK)			+= biomerge.o
 obj-$(CONFIG_XEN_BALLOON)		+= xen-balloon.o
-obj-$(CONFIG_XEN_SELFBALLOONING)	+= xen-selfballoon.o
 obj-$(CONFIG_XEN_DEV_EVTCHN)		+= xen-evtchn.o
 obj-$(CONFIG_XEN_GNTDEV)		+= xen-gntdev.o
 obj-$(CONFIG_XEN_GRANT_DEV_ALLOC)	+= xen-gntalloc.o
 obj-$(CONFIG_XENFS)			+= xenfs/
 obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
 obj-$(CONFIG_XEN_PVHVM)			+= platform-pci.o
-obj-$(CONFIG_XEN_TMEM)			+= tmem.o
 obj-$(CONFIG_SWIOTLB_XEN)		+= swiotlb-xen.o
 obj-$(CONFIG_XEN_MCE_LOG)		+= mcelog.o
 obj-$(CONFIG_XEN_PCIDEV_BACKEND)	+= xen-pciback/
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
deleted file mode 100644
index 64d7479ad5ad..000000000000
--- a/drivers/xen/tmem.c
+++ /dev/null
@@ -1,419 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Xen implementation for transcendent memory (tmem)
- *
- * Copyright (C) 2009-2011 Oracle Corp.  All rights reserved.
- * Author: Dan Magenheimer
- */
-
-#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/pagemap.h>
-#include <linux/cleancache.h>
-#include <linux/frontswap.h>
-
-#include <xen/xen.h>
-#include <xen/interface/xen.h>
-#include <xen/page.h>
-#include <asm/xen/hypercall.h>
-#include <asm/xen/hypervisor.h>
-#include <xen/tmem.h>
-
-#ifndef CONFIG_XEN_TMEM_MODULE
-bool __read_mostly tmem_enabled = false;
-
-static int __init enable_tmem(char *s)
-{
-	tmem_enabled = true;
-	return 1;
-}
-__setup("tmem", enable_tmem);
-#endif
-
-#ifdef CONFIG_CLEANCACHE
-static bool cleancache __read_mostly = true;
-module_param(cleancache, bool, S_IRUGO);
-static bool selfballooning __read_mostly = true;
-module_param(selfballooning, bool, S_IRUGO);
-#endif /* CONFIG_CLEANCACHE */
-
-#ifdef CONFIG_FRONTSWAP
-static bool frontswap __read_mostly = true;
-module_param(frontswap, bool, S_IRUGO);
-#else /* CONFIG_FRONTSWAP */
-#define frontswap (0)
-#endif /* CONFIG_FRONTSWAP */
-
-#ifdef CONFIG_XEN_SELFBALLOONING
-static bool selfshrinking __read_mostly = true;
-module_param(selfshrinking, bool, S_IRUGO);
-#endif /* CONFIG_XEN_SELFBALLOONING */
-
-#define TMEM_CONTROL               0
-#define TMEM_NEW_POOL              1
-#define TMEM_DESTROY_POOL          2
-#define TMEM_NEW_PAGE              3
-#define TMEM_PUT_PAGE              4
-#define TMEM_GET_PAGE              5
-#define TMEM_FLUSH_PAGE            6
-#define TMEM_FLUSH_OBJECT          7
-#define TMEM_READ                  8
-#define TMEM_WRITE                 9
-#define TMEM_XCHG                 10
-
-/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
-#define TMEM_POOL_PERSIST          1
-#define TMEM_POOL_SHARED           2
-#define TMEM_POOL_PAGESIZE_SHIFT   4
-#define TMEM_VERSION_SHIFT        24
-
-
-struct tmem_pool_uuid {
-	u64 uuid_lo;
-	u64 uuid_hi;
-};
-
-struct tmem_oid {
-	u64 oid[3];
-};
-
-#define TMEM_POOL_PRIVATE_UUID	{ 0, 0 }
-
-/* flags for tmem_ops.new_pool */
-#define TMEM_POOL_PERSIST          1
-#define TMEM_POOL_SHARED           2
-
-/* xen tmem foundation ops/hypercalls */
-
-static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid,
-	u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len)
-{
-	struct tmem_op op;
-	int rc = 0;
-
-	op.cmd = tmem_cmd;
-	op.pool_id = tmem_pool;
-	op.u.gen.oid[0] = oid.oid[0];
-	op.u.gen.oid[1] = oid.oid[1];
-	op.u.gen.oid[2] = oid.oid[2];
-	op.u.gen.index = index;
-	op.u.gen.tmem_offset = tmem_offset;
-	op.u.gen.pfn_offset = pfn_offset;
-	op.u.gen.len = len;
-	set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn);
-	rc = HYPERVISOR_tmem_op(&op);
-	return rc;
-}
-
-static int xen_tmem_new_pool(struct tmem_pool_uuid uuid,
-				u32 flags, unsigned long pagesize)
-{
-	struct tmem_op op;
-	int rc = 0, pageshift;
-
-	for (pageshift = 0; pagesize != 1; pageshift++)
-		pagesize >>= 1;
-	flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT;
-	flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT;
-	op.cmd = TMEM_NEW_POOL;
-	op.u.new.uuid[0] = uuid.uuid_lo;
-	op.u.new.uuid[1] = uuid.uuid_hi;
-	op.u.new.flags = flags;
-	rc = HYPERVISOR_tmem_op(&op);
-	return rc;
-}
-
-/* xen generic tmem ops */
-
-static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid,
-			     u32 index, struct page *page)
-{
-	return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index,
-			   xen_page_to_gfn(page), 0, 0, 0);
-}
-
-static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid,
-			     u32 index, struct page *page)
-{
-	return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index,
-			   xen_page_to_gfn(page), 0, 0, 0);
-}
-
-static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index)
-{
-	return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index,
-		0, 0, 0, 0);
-}
-
-static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid)
-{
-	return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0);
-}
-
-
-#ifdef CONFIG_CLEANCACHE
-static int xen_tmem_destroy_pool(u32 pool_id)
-{
-	struct tmem_oid oid = { { 0 } };
-
-	return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0);
-}
-
-/* cleancache ops */
-
-static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
-				     pgoff_t index, struct page *page)
-{
-	u32 ind = (u32) index;
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-
-	if (pool < 0)
-		return;
-	if (ind != index)
-		return;
-	mb(); /* ensure page is quiescent; tmem may address it with an alias */
-	(void)xen_tmem_put_page((u32)pool, oid, ind, page);
-}
-
-static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
-				    pgoff_t index, struct page *page)
-{
-	u32 ind = (u32) index;
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-	int ret;
-
-	/* translate return values to linux semantics */
-	if (pool < 0)
-		return -1;
-	if (ind != index)
-		return -1;
-	ret = xen_tmem_get_page((u32)pool, oid, ind, page);
-	if (ret == 1)
-		return 0;
-	else
-		return -1;
-}
-
-static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key,
-				       pgoff_t index)
-{
-	u32 ind = (u32) index;
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-
-	if (pool < 0)
-		return;
-	if (ind != index)
-		return;
-	(void)xen_tmem_flush_page((u32)pool, oid, ind);
-}
-
-static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key)
-{
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-
-	if (pool < 0)
-		return;
-	(void)xen_tmem_flush_object((u32)pool, oid);
-}
-
-static void tmem_cleancache_flush_fs(int pool)
-{
-	if (pool < 0)
-		return;
-	(void)xen_tmem_destroy_pool((u32)pool);
-}
-
-static int tmem_cleancache_init_fs(size_t pagesize)
-{
-	struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID;
-
-	return xen_tmem_new_pool(uuid_private, 0, pagesize);
-}
-
-static int tmem_cleancache_init_shared_fs(uuid_t *uuid, size_t pagesize)
-{
-	struct tmem_pool_uuid shared_uuid;
-
-	shared_uuid.uuid_lo = *(u64 *)&uuid->b[0];
-	shared_uuid.uuid_hi = *(u64 *)&uuid->b[8];
-	return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize);
-}
-
-static const struct cleancache_ops tmem_cleancache_ops = {
-	.put_page = tmem_cleancache_put_page,
-	.get_page = tmem_cleancache_get_page,
-	.invalidate_page = tmem_cleancache_flush_page,
-	.invalidate_inode = tmem_cleancache_flush_inode,
-	.invalidate_fs = tmem_cleancache_flush_fs,
-	.init_shared_fs = tmem_cleancache_init_shared_fs,
-	.init_fs = tmem_cleancache_init_fs
-};
-#endif
-
-#ifdef CONFIG_FRONTSWAP
-/* frontswap tmem operations */
-
-/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
-static int tmem_frontswap_poolid;
-
-/*
- * Swizzling increases objects per swaptype, increasing tmem concurrency
- * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
- */
-#define SWIZ_BITS		4
-#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
-#define _oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
-#define iswiz(_ind)		(_ind >> SWIZ_BITS)
-
-static inline struct tmem_oid oswiz(unsigned type, u32 ind)
-{
-	struct tmem_oid oid = { .oid = { 0 } };
-	oid.oid[0] = _oswiz(type, ind);
-	return oid;
-}
-
-/* returns 0 if the page was successfully put into frontswap, -1 if not */
-static int tmem_frontswap_store(unsigned type, pgoff_t offset,
-				   struct page *page)
-{
-	u64 ind64 = (u64)offset;
-	u32 ind = (u32)offset;
-	int pool = tmem_frontswap_poolid;
-	int ret;
-
-	/* THP isn't supported */
-	if (PageTransHuge(page))
-		return -1;
-
-	if (pool < 0)
-		return -1;
-	if (ind64 != ind)
-		return -1;
-	mb(); /* ensure page is quiescent; tmem may address it with an alias */
-	ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), page);
-	/* translate Xen tmem return values to linux semantics */
-	if (ret == 1)
-		return 0;
-	else
-		return -1;
-}
-
-/*
- * returns 0 if the page was successfully gotten from frontswap, -1 if
- * was not present (should never happen!)
- */
-static int tmem_frontswap_load(unsigned type, pgoff_t offset,
-				   struct page *page)
-{
-	u64 ind64 = (u64)offset;
-	u32 ind = (u32)offset;
-	int pool = tmem_frontswap_poolid;
-	int ret;
-
-	if (pool < 0)
-		return -1;
-	if (ind64 != ind)
-		return -1;
-	ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), page);
-	/* translate Xen tmem return values to linux semantics */
-	if (ret == 1)
-		return 0;
-	else
-		return -1;
-}
-
-/* flush a single page from frontswap */
-static void tmem_frontswap_flush_page(unsigned type, pgoff_t offset)
-{
-	u64 ind64 = (u64)offset;
-	u32 ind = (u32)offset;
-	int pool = tmem_frontswap_poolid;
-
-	if (pool < 0)
-		return;
-	if (ind64 != ind)
-		return;
-	(void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind));
-}
-
-/* flush all pages from the passed swaptype */
-static void tmem_frontswap_flush_area(unsigned type)
-{
-	int pool = tmem_frontswap_poolid;
-	int ind;
-
-	if (pool < 0)
-		return;
-	for (ind = SWIZ_MASK; ind >= 0; ind--)
-		(void)xen_tmem_flush_object(pool, oswiz(type, ind));
-}
-
-static void tmem_frontswap_init(unsigned ignored)
-{
-	struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID;
-
-	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
-	if (tmem_frontswap_poolid < 0)
-		tmem_frontswap_poolid =
-		    xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE);
-}
-
-static struct frontswap_ops tmem_frontswap_ops = {
-	.store = tmem_frontswap_store,
-	.load = tmem_frontswap_load,
-	.invalidate_page = tmem_frontswap_flush_page,
-	.invalidate_area = tmem_frontswap_flush_area,
-	.init = tmem_frontswap_init
-};
-#endif
-
-static int __init xen_tmem_init(void)
-{
-	if (!xen_domain())
-		return 0;
-#ifdef CONFIG_FRONTSWAP
-	if (tmem_enabled && frontswap) {
-		char *s = "";
-
-		tmem_frontswap_poolid = -1;
-		frontswap_register_ops(&tmem_frontswap_ops);
-		pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n",
-			s);
-	}
-#endif
-#ifdef CONFIG_CLEANCACHE
-	BUILD_BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
-	if (tmem_enabled && cleancache) {
-		int err;
-
-		err = cleancache_register_ops(&tmem_cleancache_ops);
-		if (err)
-			pr_warn("xen-tmem: failed to enable cleancache: %d\n",
-				err);
-		else
-			pr_info("cleancache enabled, RAM provided by "
-				"Xen Transcendent Memory\n");
-	}
-#endif
-#ifdef CONFIG_XEN_SELFBALLOONING
-	/*
-	 * There is no point of driving pages to the swap system if they
-	 * aren't going anywhere in tmem universe.
-	 */
-	if (!frontswap) {
-		selfshrinking = false;
-		selfballooning = false;
-	}
-	xen_selfballoon_init(selfballooning, selfshrinking);
-#endif
-	return 0;
-}
-
-module_init(xen_tmem_init)
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>");
-MODULE_DESCRIPTION("Shim to Xen transcendent memory");
diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c
index a67236b02452..6d12fc368210 100644
--- a/drivers/xen/xen-balloon.c
+++ b/drivers/xen/xen-balloon.c
@@ -129,8 +129,6 @@ void xen_balloon_init(void)
 {
 	register_balloon(&balloon_dev);
 
-	register_xen_selfballooning(&balloon_dev);
-
 	register_xenstore_notifier(&xenstore_notifier);
 }
 EXPORT_SYMBOL_GPL(xen_balloon_init);
diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c
deleted file mode 100644
index 246f6122c9ee..000000000000
--- a/drivers/xen/xen-selfballoon.c
+++ /dev/null
@@ -1,579 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/******************************************************************************
- * Xen selfballoon driver (and optional frontswap self-shrinking driver)
- *
- * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
- *
- * This code complements the cleancache and frontswap patchsets to optimize
- * support for Xen Transcendent Memory ("tmem").  The policy it implements
- * is rudimentary and will likely improve over time, but it does work well
- * enough today.
- *
- * Two functionalities are implemented here which both use "control theory"
- * (feedback) to optimize memory utilization. In a virtualized environment
- * such as Xen, RAM is often a scarce resource and we would like to ensure
- * that each of a possibly large number of virtual machines is using RAM
- * efficiently, i.e. using as little as possible when under light load
- * and obtaining as much as possible when memory demands are high.
- * Since RAM needs vary highly dynamically and sometimes dramatically,
- * "hysteresis" is used, that is, memory target is determined not just
- * on current data but also on past data stored in the system.
- *
- * "Selfballooning" creates memory pressure by managing the Xen balloon
- * driver to decrease and increase available kernel memory, driven
- * largely by the target value of "Committed_AS" (see /proc/meminfo).
- * Since Committed_AS does not account for clean mapped pages (i.e. pages
- * in RAM that are identical to pages on disk), selfballooning has the
- * affect of pushing less frequently used clean pagecache pages out of
- * kernel RAM and, presumably using cleancache, into Xen tmem where
- * Xen can more efficiently optimize RAM utilization for such pages.
- *
- * When kernel memory demand unexpectedly increases faster than Xen, via
- * the selfballoon driver, is able to (or chooses to) provide usable RAM,
- * the kernel may invoke swapping.  In most cases, frontswap is able
- * to absorb this swapping into Xen tmem.  However, due to the fact
- * that the kernel swap subsystem assumes swapping occurs to a disk,
- * swapped pages may sit on the disk for a very long time; even if
- * the kernel knows the page will never be used again.  This is because
- * the disk space costs very little and can be overwritten when
- * necessary.  When such stale pages are in frontswap, however, they
- * are taking up valuable real estate.  "Frontswap selfshrinking" works
- * to resolve this:  When frontswap activity is otherwise stable
- * and the guest kernel is not under memory pressure, the "frontswap
- * selfshrinking" accounts for this by providing pressure to remove some
- * pages from frontswap and return them to kernel memory.
- *
- * For both "selfballooning" and "frontswap-selfshrinking", a worker
- * thread is used and sysfs tunables are provided to adjust the frequency
- * and rate of adjustments to achieve the goal, as well as to disable one
- * or both functions independently.
- *
- * While some argue that this functionality can and should be implemented
- * in userspace, it has been observed that bad things happen (e.g. OOMs).
- *
- * System configuration note: Selfballooning should not be enabled on
- * systems without a sufficiently large swap device configured; for best
- * results, it is recommended that total swap be increased by the size
- * of the guest memory. Note, that selfballooning should be disabled by default
- * if frontswap is not configured.  Similarly selfballooning should be enabled
- * by default if frontswap is configured and can be disabled with the
- * "tmem.selfballooning=0" kernel boot option.  Finally, when frontswap is
- * configured, frontswap-selfshrinking can be disabled  with the
- * "tmem.selfshrink=0" kernel boot option.
- *
- * Selfballooning is disallowed in domain0 and force-disabled.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/memblock.h>
-#include <linux/swap.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/workqueue.h>
-#include <linux/device.h>
-#include <xen/balloon.h>
-#include <xen/tmem.h>
-#include <xen/xen.h>
-
-/* Enable/disable with sysfs. */
-static int xen_selfballooning_enabled __read_mostly;
-
-/*
- * Controls rate at which memory target (this iteration) approaches
- * ultimate goal when memory need is increasing (up-hysteresis) or
- * decreasing (down-hysteresis). Higher values of hysteresis cause
- * slower increases/decreases. The default values for the various
- * parameters were deemed reasonable by experimentation, may be
- * workload-dependent, and can all be adjusted via sysfs.
- */
-static unsigned int selfballoon_downhysteresis __read_mostly = 8;
-static unsigned int selfballoon_uphysteresis __read_mostly = 1;
-
-/* In HZ, controls frequency of worker invocation. */
-static unsigned int selfballoon_interval __read_mostly = 5;
-
-/*
- * Minimum usable RAM in MB for selfballooning target for balloon.
- * If non-zero, it is added to totalreserve_pages and self-ballooning
- * will not balloon below the sum.  If zero, a piecewise linear function
- * is calculated as a minimum and added to totalreserve_pages.  Note that
- * setting this value indiscriminately may cause OOMs and crashes.
- */
-static unsigned int selfballoon_min_usable_mb;
-
-/*
- * Amount of RAM in MB to add to the target number of pages.
- * Can be used to reserve some more room for caches and the like.
- */
-static unsigned int selfballoon_reserved_mb;
-
-static void selfballoon_process(struct work_struct *work);
-static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process);
-
-#ifdef CONFIG_FRONTSWAP
-#include <linux/frontswap.h>
-
-/* Enable/disable with sysfs. */
-static bool frontswap_selfshrinking __read_mostly;
-
-/*
- * The default values for the following parameters were deemed reasonable
- * by experimentation, may be workload-dependent, and can all be
- * adjusted via sysfs.
- */
-
-/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
-static unsigned int frontswap_hysteresis __read_mostly = 20;
-
-/*
- * Number of selfballoon worker invocations to wait before observing that
- * frontswap selfshrinking should commence. Note that selfshrinking does
- * not use a separate worker thread.
- */
-static unsigned int frontswap_inertia __read_mostly = 3;
-
-/* Countdown to next invocation of frontswap_shrink() */
-static unsigned long frontswap_inertia_counter;
-
-/*
- * Invoked by the selfballoon worker thread, uses current number of pages
- * in frontswap (frontswap_curr_pages()), previous status, and control
- * values (hysteresis and inertia) to determine if frontswap should be
- * shrunk and what the new frontswap size should be.  Note that
- * frontswap_shrink is essentially a partial swapoff that immediately
- * transfers pages from the "swap device" (frontswap) back into kernel
- * RAM; despite the name, frontswap "shrinking" is very different from
- * the "shrinker" interface used by the kernel MM subsystem to reclaim
- * memory.
- */
-static void frontswap_selfshrink(void)
-{
-	static unsigned long cur_frontswap_pages;
-	unsigned long last_frontswap_pages;
-	unsigned long tgt_frontswap_pages;
-
-	last_frontswap_pages = cur_frontswap_pages;
-	cur_frontswap_pages = frontswap_curr_pages();
-	if (!cur_frontswap_pages ||
-			(cur_frontswap_pages > last_frontswap_pages)) {
-		frontswap_inertia_counter = frontswap_inertia;
-		return;
-	}
-	if (frontswap_inertia_counter && --frontswap_inertia_counter)
-		return;
-	if (cur_frontswap_pages <= frontswap_hysteresis)
-		tgt_frontswap_pages = 0;
-	else
-		tgt_frontswap_pages = cur_frontswap_pages -
-			(cur_frontswap_pages / frontswap_hysteresis);
-	frontswap_shrink(tgt_frontswap_pages);
-	frontswap_inertia_counter = frontswap_inertia;
-}
-
-#endif /* CONFIG_FRONTSWAP */
-
-#define MB2PAGES(mb)	((mb) << (20 - PAGE_SHIFT))
-#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT))
-
-/*
- * Use current balloon size, the goal (vm_committed_as), and hysteresis
- * parameters to set a new target balloon size
- */
-static void selfballoon_process(struct work_struct *work)
-{
-	unsigned long cur_pages, goal_pages, tgt_pages, floor_pages;
-	unsigned long useful_pages;
-	bool reset_timer = false;
-
-	if (xen_selfballooning_enabled) {
-		cur_pages = totalram_pages();
-		tgt_pages = cur_pages; /* default is no change */
-		goal_pages = vm_memory_committed() +
-				totalreserve_pages +
-				MB2PAGES(selfballoon_reserved_mb);
-#ifdef CONFIG_FRONTSWAP
-		/* allow space for frontswap pages to be repatriated */
-		if (frontswap_selfshrinking)
-			goal_pages += frontswap_curr_pages();
-#endif
-		if (cur_pages > goal_pages)
-			tgt_pages = cur_pages -
-				((cur_pages - goal_pages) /
-				  selfballoon_downhysteresis);
-		else if (cur_pages < goal_pages)
-			tgt_pages = cur_pages +
-				((goal_pages - cur_pages) /
-				  selfballoon_uphysteresis);
-		/* else if cur_pages == goal_pages, no change */
-		useful_pages = max_pfn - totalreserve_pages;
-		if (selfballoon_min_usable_mb != 0)
-			floor_pages = totalreserve_pages +
-					MB2PAGES(selfballoon_min_usable_mb);
-		/* piecewise linear function ending in ~3% slope */
-		else if (useful_pages < MB2PAGES(16))
-			floor_pages = max_pfn; /* not worth ballooning */
-		else if (useful_pages < MB2PAGES(64))
-			floor_pages = totalreserve_pages + MB2PAGES(16) +
-					((useful_pages - MB2PAGES(16)) >> 1);
-		else if (useful_pages < MB2PAGES(512))
-			floor_pages = totalreserve_pages + MB2PAGES(40) +
-					((useful_pages - MB2PAGES(40)) >> 3);
-		else /* useful_pages >= MB2PAGES(512) */
-			floor_pages = totalreserve_pages + MB2PAGES(99) +
-					((useful_pages - MB2PAGES(99)) >> 5);
-		if (tgt_pages < floor_pages)
-			tgt_pages = floor_pages;
-		balloon_set_new_target(tgt_pages +
-			balloon_stats.current_pages - totalram_pages());
-		reset_timer = true;
-	}
-#ifdef CONFIG_FRONTSWAP
-	if (frontswap_selfshrinking) {
-		frontswap_selfshrink();
-		reset_timer = true;
-	}
-#endif
-	if (reset_timer)
-		schedule_delayed_work(&selfballoon_worker,
-			selfballoon_interval * HZ);
-}
-
-#ifdef CONFIG_SYSFS
-
-#include <linux/capability.h>
-
-#define SELFBALLOON_SHOW(name, format, args...)				\
-	static ssize_t show_##name(struct device *dev,	\
-					  struct device_attribute *attr, \
-					  char *buf) \
-	{ \
-		return sprintf(buf, format, ##args); \
-	}
-
-SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled);
-
-static ssize_t store_selfballooning(struct device *dev,
-			    struct device_attribute *attr,
-			    const char *buf,
-			    size_t count)
-{
-	bool was_enabled = xen_selfballooning_enabled;
-	unsigned long tmp;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	err = kstrtoul(buf, 10, &tmp);
-	if (err)
-		return err;
-	if ((tmp != 0) && (tmp != 1))
-		return -EINVAL;
-
-	xen_selfballooning_enabled = !!tmp;
-	if (!was_enabled && xen_selfballooning_enabled)
-		schedule_delayed_work(&selfballoon_worker,
-			selfballoon_interval * HZ);
-
-	return count;
-}
-
-static DEVICE_ATTR(selfballooning, S_IRUGO | S_IWUSR,
-		   show_selfballooning, store_selfballooning);
-
-SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval);
-
-static ssize_t store_selfballoon_interval(struct device *dev,
-					  struct device_attribute *attr,
-					  const char *buf,
-					  size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	err = kstrtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val == 0)
-		return -EINVAL;
-	selfballoon_interval = val;
-	return count;
-}
-
-static DEVICE_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR,
-		   show_selfballoon_interval, store_selfballoon_interval);
-
-SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis);
-
-static ssize_t store_selfballoon_downhys(struct device *dev,
-					 struct device_attribute *attr,
-					 const char *buf,
-					 size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	err = kstrtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val == 0)
-		return -EINVAL;
-	selfballoon_downhysteresis = val;
-	return count;
-}
-
-static DEVICE_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR,
-		   show_selfballoon_downhys, store_selfballoon_downhys);
-
-
-SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis);
-
-static ssize_t store_selfballoon_uphys(struct device *dev,
-				       struct device_attribute *attr,
-				       const char *buf,
-				       size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	err = kstrtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val == 0)
-		return -EINVAL;
-	selfballoon_uphysteresis = val;
-	return count;
-}
-
-static DEVICE_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR,
-		   show_selfballoon_uphys, store_selfballoon_uphys);
-
-SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n",
-				selfballoon_min_usable_mb);
-
-static ssize_t store_selfballoon_min_usable_mb(struct device *dev,
-					       struct device_attribute *attr,
-					       const char *buf,
-					       size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	err = kstrtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val == 0)
-		return -EINVAL;
-	selfballoon_min_usable_mb = val;
-	return count;
-}
-
-static DEVICE_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR,
-		   show_selfballoon_min_usable_mb,
-		   store_selfballoon_min_usable_mb);
-
-SELFBALLOON_SHOW(selfballoon_reserved_mb, "%d\n",
-				selfballoon_reserved_mb);
-
-static ssize_t store_selfballoon_reserved_mb(struct device *dev,
-					     struct device_attribute *attr,
-					     const char *buf,
-					     size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	err = kstrtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val == 0)
-		return -EINVAL;
-	selfballoon_reserved_mb = val;
-	return count;
-}
-
-static DEVICE_ATTR(selfballoon_reserved_mb, S_IRUGO | S_IWUSR,
-		   show_selfballoon_reserved_mb,
-		   store_selfballoon_reserved_mb);
-
-
-#ifdef CONFIG_FRONTSWAP
-SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking);
-
-static ssize_t store_frontswap_selfshrinking(struct device *dev,
-					     struct device_attribute *attr,
-					     const char *buf,
-					     size_t count)
-{
-	bool was_enabled = frontswap_selfshrinking;
-	unsigned long tmp;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	err = kstrtoul(buf, 10, &tmp);
-	if (err)
-		return err;
-	if ((tmp != 0) && (tmp != 1))
-		return -EINVAL;
-	frontswap_selfshrinking = !!tmp;
-	if (!was_enabled && !xen_selfballooning_enabled &&
-	     frontswap_selfshrinking)
-		schedule_delayed_work(&selfballoon_worker,
-			selfballoon_interval * HZ);
-
-	return count;
-}
-
-static DEVICE_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR,
-		   show_frontswap_selfshrinking, store_frontswap_selfshrinking);
-
-SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia);
-
-static ssize_t store_frontswap_inertia(struct device *dev,
-				       struct device_attribute *attr,
-				       const char *buf,
-				       size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	err = kstrtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val == 0)
-		return -EINVAL;
-	frontswap_inertia = val;
-	frontswap_inertia_counter = val;
-	return count;
-}
-
-static DEVICE_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR,
-		   show_frontswap_inertia, store_frontswap_inertia);
-
-SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis);
-
-static ssize_t store_frontswap_hysteresis(struct device *dev,
-					  struct device_attribute *attr,
-					  const char *buf,
-					  size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	err = kstrtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val == 0)
-		return -EINVAL;
-	frontswap_hysteresis = val;
-	return count;
-}
-
-static DEVICE_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR,
-		   show_frontswap_hysteresis, store_frontswap_hysteresis);
-
-#endif /* CONFIG_FRONTSWAP */
-
-static struct attribute *selfballoon_attrs[] = {
-	&dev_attr_selfballooning.attr,
-	&dev_attr_selfballoon_interval.attr,
-	&dev_attr_selfballoon_downhysteresis.attr,
-	&dev_attr_selfballoon_uphysteresis.attr,
-	&dev_attr_selfballoon_min_usable_mb.attr,
-	&dev_attr_selfballoon_reserved_mb.attr,
-#ifdef CONFIG_FRONTSWAP
-	&dev_attr_frontswap_selfshrinking.attr,
-	&dev_attr_frontswap_hysteresis.attr,
-	&dev_attr_frontswap_inertia.attr,
-#endif
-	NULL
-};
-
-static const struct attribute_group selfballoon_group = {
-	.name = "selfballoon",
-	.attrs = selfballoon_attrs
-};
-#endif
-
-int register_xen_selfballooning(struct device *dev)
-{
-	int error = -1;
-
-#ifdef CONFIG_SYSFS
-	error = sysfs_create_group(&dev->kobj, &selfballoon_group);
-#endif
-	return error;
-}
-EXPORT_SYMBOL(register_xen_selfballooning);
-
-int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink)
-{
-	bool enable = false;
-	unsigned long reserve_pages;
-
-	if (!xen_domain())
-		return -ENODEV;
-
-	if (xen_initial_domain()) {
-		pr_info("Xen selfballooning driver disabled for domain0\n");
-		return -ENODEV;
-	}
-
-	xen_selfballooning_enabled = tmem_enabled && use_selfballooning;
-	if (xen_selfballooning_enabled) {
-		pr_info("Initializing Xen selfballooning driver\n");
-		enable = true;
-	}
-#ifdef CONFIG_FRONTSWAP
-	frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink;
-	if (frontswap_selfshrinking) {
-		pr_info("Initializing frontswap selfshrinking driver\n");
-		enable = true;
-	}
-#endif
-	if (!enable)
-		return -ENODEV;
-
-	/*
-	 * Give selfballoon_reserved_mb a default value(10% of total ram pages)
-	 * to make selfballoon not so aggressive.
-	 *
-	 * There are mainly two reasons:
-	 * 1) The original goal_page didn't consider some pages used by kernel
-	 *    space, like slab pages and memory used by device drivers.
-	 *
-	 * 2) The balloon driver may not give back memory to guest OS fast
-	 *    enough when the workload suddenly aquries a lot of physical memory.
-	 *
-	 * In both cases, the guest OS will suffer from memory pressure and
-	 * OOM killer may be triggered.
-	 * By reserving extra 10% of total ram pages, we can keep the system
-	 * much more reliably and response faster in some cases.
-	 */
-	if (!selfballoon_reserved_mb) {
-		reserve_pages = totalram_pages() / 10;
-		selfballoon_reserved_mb = PAGES2MB(reserve_pages);
-	}
-	schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ);
-
-	return 0;
-}
-EXPORT_SYMBOL(xen_selfballoon_init);
diff --git a/include/xen/balloon.h b/include/xen/balloon.h
index 4914b93a23f2..6fb95aa19405 100644
--- a/include/xen/balloon.h
+++ b/include/xen/balloon.h
@@ -27,16 +27,6 @@ void balloon_set_new_target(unsigned long target);
 int alloc_xenballooned_pages(int nr_pages, struct page **pages);
 void free_xenballooned_pages(int nr_pages, struct page **pages);
 
-struct device;
-#ifdef CONFIG_XEN_SELFBALLOONING
-extern int register_xen_selfballooning(struct device *dev);
-#else
-static inline int register_xen_selfballooning(struct device *dev)
-{
-	return -ENOSYS;
-}
-#endif
-
 #ifdef CONFIG_XEN_BALLOON
 void xen_balloon_init(void);
 #else
diff --git a/include/xen/tmem.h b/include/xen/tmem.h
deleted file mode 100644
index c80bafe31f14..000000000000
--- a/include/xen/tmem.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _XEN_TMEM_H
-#define _XEN_TMEM_H
-
-#include <linux/types.h>
-
-#ifdef CONFIG_XEN_TMEM_MODULE
-#define tmem_enabled true
-#else
-/* defined in drivers/xen/tmem.c */
-extern bool tmem_enabled;
-#endif
-
-#ifdef CONFIG_XEN_SELFBALLOONING
-extern int xen_selfballoon_init(bool, bool);
-#endif
-
-#endif /* _XEN_TMEM_H */
-- 
cgit v1.2.3


From b866455423e040813f113d8b87e8297778ee2014 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 16 Jul 2019 21:59:11 +0200
Subject: dma-mapping: add a dma_addressing_limited helper

This helper returns if the device has issues addressing all present
memory in the system.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 8d13e28a8e07..e11b115dd0e4 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -679,6 +679,20 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask)
 	return dma_set_mask_and_coherent(dev, mask);
 }
 
+/**
+ * dma_addressing_limited - return if the device is addressing limited
+ * @dev:	device to check
+ *
+ * Return %true if the devices DMA mask is too small to address all memory in
+ * the system, else %false.  Lack of addressing bits is the prime reason for
+ * bounce buffering, but might not be the only one.
+ */
+static inline bool dma_addressing_limited(struct device *dev)
+{
+	return min_not_zero(*dev->dma_mask, dev->bus_dma_mask) <
+		dma_get_required_mask(dev);
+}
+
 #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 		const struct iommu_ops *iommu, bool coherent);
-- 
cgit v1.2.3


From db074436f421967f4f30cfbb6fbc2a728f3e62b3 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 15 Jul 2019 08:50:59 -0700
Subject: iomap: move the direct IO code into a separate file

Move the direct IO code into a separate file so that we can group
related functions in a single file instead of having a single enormous
source file.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c            | 554 -------------------------------------------------
 fs/iomap/Makefile     |   1 +
 fs/iomap/direct-io.c  | 562 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iomap.h |   7 +
 4 files changed, 570 insertions(+), 554 deletions(-)
 create mode 100644 fs/iomap/direct-io.c

(limited to 'include')

diff --git a/fs/iomap.c b/fs/iomap.c
index ad994c408cb8..c983fedc7081 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -90,12 +90,6 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 	return written ? written : ret;
 }
 
-static sector_t
-iomap_sector(struct iomap *iomap, loff_t pos)
-{
-	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
-}
-
 static struct iomap_page *
 iomap_page_create(struct inode *inode, struct page *page)
 {
@@ -1148,551 +1142,3 @@ out_unlock:
 	return block_page_mkwrite_return(ret);
 }
 EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
-
-/*
- * Private flags for iomap_dio, must not overlap with the public ones in
- * iomap.h:
- */
-#define IOMAP_DIO_WRITE_FUA	(1 << 28)
-#define IOMAP_DIO_NEED_SYNC	(1 << 29)
-#define IOMAP_DIO_WRITE		(1 << 30)
-#define IOMAP_DIO_DIRTY		(1 << 31)
-
-struct iomap_dio {
-	struct kiocb		*iocb;
-	iomap_dio_end_io_t	*end_io;
-	loff_t			i_size;
-	loff_t			size;
-	atomic_t		ref;
-	unsigned		flags;
-	int			error;
-	bool			wait_for_completion;
-
-	union {
-		/* used during submission and for synchronous completion: */
-		struct {
-			struct iov_iter		*iter;
-			struct task_struct	*waiter;
-			struct request_queue	*last_queue;
-			blk_qc_t		cookie;
-		} submit;
-
-		/* used for aio completion: */
-		struct {
-			struct work_struct	work;
-		} aio;
-	};
-};
-
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
-{
-	struct request_queue *q = READ_ONCE(kiocb->private);
-
-	if (!q)
-		return 0;
-	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
-}
-EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
-
-static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
-		struct bio *bio)
-{
-	atomic_inc(&dio->ref);
-
-	if (dio->iocb->ki_flags & IOCB_HIPRI)
-		bio_set_polled(bio, dio->iocb);
-
-	dio->submit.last_queue = bdev_get_queue(iomap->bdev);
-	dio->submit.cookie = submit_bio(bio);
-}
-
-static ssize_t iomap_dio_complete(struct iomap_dio *dio)
-{
-	struct kiocb *iocb = dio->iocb;
-	struct inode *inode = file_inode(iocb->ki_filp);
-	loff_t offset = iocb->ki_pos;
-	ssize_t ret;
-
-	if (dio->end_io) {
-		ret = dio->end_io(iocb,
-				dio->error ? dio->error : dio->size,
-				dio->flags);
-	} else {
-		ret = dio->error;
-	}
-
-	if (likely(!ret)) {
-		ret = dio->size;
-		/* check for short read */
-		if (offset + ret > dio->i_size &&
-		    !(dio->flags & IOMAP_DIO_WRITE))
-			ret = dio->i_size - offset;
-		iocb->ki_pos += ret;
-	}
-
-	/*
-	 * Try again to invalidate clean pages which might have been cached by
-	 * non-direct readahead, or faulted in by get_user_pages() if the source
-	 * of the write was an mmap'ed region of the file we're writing.  Either
-	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
-	 * this invalidation fails, tough, the write still worked...
-	 *
-	 * And this page cache invalidation has to be after dio->end_io(), as
-	 * some filesystems convert unwritten extents to real allocations in
-	 * end_io() when necessary, otherwise a racing buffer read would cache
-	 * zeros from unwritten extents.
-	 */
-	if (!dio->error &&
-	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
-		int err;
-		err = invalidate_inode_pages2_range(inode->i_mapping,
-				offset >> PAGE_SHIFT,
-				(offset + dio->size - 1) >> PAGE_SHIFT);
-		if (err)
-			dio_warn_stale_pagecache(iocb->ki_filp);
-	}
-
-	/*
-	 * If this is a DSYNC write, make sure we push it to stable storage now
-	 * that we've written data.
-	 */
-	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
-		ret = generic_write_sync(iocb, ret);
-
-	inode_dio_end(file_inode(iocb->ki_filp));
-	kfree(dio);
-
-	return ret;
-}
-
-static void iomap_dio_complete_work(struct work_struct *work)
-{
-	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
-	struct kiocb *iocb = dio->iocb;
-
-	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
-}
-
-/*
- * Set an error in the dio if none is set yet.  We have to use cmpxchg
- * as the submission context and the completion context(s) can race to
- * update the error.
- */
-static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
-{
-	cmpxchg(&dio->error, 0, ret);
-}
-
-static void iomap_dio_bio_end_io(struct bio *bio)
-{
-	struct iomap_dio *dio = bio->bi_private;
-	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
-
-	if (bio->bi_status)
-		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
-
-	if (atomic_dec_and_test(&dio->ref)) {
-		if (dio->wait_for_completion) {
-			struct task_struct *waiter = dio->submit.waiter;
-			WRITE_ONCE(dio->submit.waiter, NULL);
-			blk_wake_io_task(waiter);
-		} else if (dio->flags & IOMAP_DIO_WRITE) {
-			struct inode *inode = file_inode(dio->iocb->ki_filp);
-
-			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
-			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
-		} else {
-			iomap_dio_complete_work(&dio->aio.work);
-		}
-	}
-
-	if (should_dirty) {
-		bio_check_pages_dirty(bio);
-	} else {
-		bio_release_pages(bio, false);
-		bio_put(bio);
-	}
-}
-
-static void
-iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
-		unsigned len)
-{
-	struct page *page = ZERO_PAGE(0);
-	int flags = REQ_SYNC | REQ_IDLE;
-	struct bio *bio;
-
-	bio = bio_alloc(GFP_KERNEL, 1);
-	bio_set_dev(bio, iomap->bdev);
-	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
-	bio->bi_private = dio;
-	bio->bi_end_io = iomap_dio_bio_end_io;
-
-	get_page(page);
-	__bio_add_page(bio, page, len, 0);
-	bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
-	iomap_dio_submit_bio(dio, iomap, bio);
-}
-
-static loff_t
-iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
-		struct iomap_dio *dio, struct iomap *iomap)
-{
-	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
-	unsigned int fs_block_size = i_blocksize(inode), pad;
-	unsigned int align = iov_iter_alignment(dio->submit.iter);
-	struct iov_iter iter;
-	struct bio *bio;
-	bool need_zeroout = false;
-	bool use_fua = false;
-	int nr_pages, ret = 0;
-	size_t copied = 0;
-
-	if ((pos | length | align) & ((1 << blkbits) - 1))
-		return -EINVAL;
-
-	if (iomap->type == IOMAP_UNWRITTEN) {
-		dio->flags |= IOMAP_DIO_UNWRITTEN;
-		need_zeroout = true;
-	}
-
-	if (iomap->flags & IOMAP_F_SHARED)
-		dio->flags |= IOMAP_DIO_COW;
-
-	if (iomap->flags & IOMAP_F_NEW) {
-		need_zeroout = true;
-	} else if (iomap->type == IOMAP_MAPPED) {
-		/*
-		 * Use a FUA write if we need datasync semantics, this is a pure
-		 * data IO that doesn't require any metadata updates (including
-		 * after IO completion such as unwritten extent conversion) and
-		 * the underlying device supports FUA. This allows us to avoid
-		 * cache flushes on IO completion.
-		 */
-		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
-		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
-		    blk_queue_fua(bdev_get_queue(iomap->bdev)))
-			use_fua = true;
-	}
-
-	/*
-	 * Operate on a partial iter trimmed to the extent we were called for.
-	 * We'll update the iter in the dio once we're done with this extent.
-	 */
-	iter = *dio->submit.iter;
-	iov_iter_truncate(&iter, length);
-
-	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-	if (nr_pages <= 0)
-		return nr_pages;
-
-	if (need_zeroout) {
-		/* zero out from the start of the block to the write offset */
-		pad = pos & (fs_block_size - 1);
-		if (pad)
-			iomap_dio_zero(dio, iomap, pos - pad, pad);
-	}
-
-	do {
-		size_t n;
-		if (dio->error) {
-			iov_iter_revert(dio->submit.iter, copied);
-			return 0;
-		}
-
-		bio = bio_alloc(GFP_KERNEL, nr_pages);
-		bio_set_dev(bio, iomap->bdev);
-		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
-		bio->bi_write_hint = dio->iocb->ki_hint;
-		bio->bi_ioprio = dio->iocb->ki_ioprio;
-		bio->bi_private = dio;
-		bio->bi_end_io = iomap_dio_bio_end_io;
-
-		ret = bio_iov_iter_get_pages(bio, &iter);
-		if (unlikely(ret)) {
-			/*
-			 * We have to stop part way through an IO. We must fall
-			 * through to the sub-block tail zeroing here, otherwise
-			 * this short IO may expose stale data in the tail of
-			 * the block we haven't written data to.
-			 */
-			bio_put(bio);
-			goto zero_tail;
-		}
-
-		n = bio->bi_iter.bi_size;
-		if (dio->flags & IOMAP_DIO_WRITE) {
-			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
-			if (use_fua)
-				bio->bi_opf |= REQ_FUA;
-			else
-				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
-			task_io_account_write(n);
-		} else {
-			bio->bi_opf = REQ_OP_READ;
-			if (dio->flags & IOMAP_DIO_DIRTY)
-				bio_set_pages_dirty(bio);
-		}
-
-		iov_iter_advance(dio->submit.iter, n);
-
-		dio->size += n;
-		pos += n;
-		copied += n;
-
-		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-		iomap_dio_submit_bio(dio, iomap, bio);
-	} while (nr_pages);
-
-	/*
-	 * We need to zeroout the tail of a sub-block write if the extent type
-	 * requires zeroing or the write extends beyond EOF. If we don't zero
-	 * the block tail in the latter case, we can expose stale data via mmap
-	 * reads of the EOF block.
-	 */
-zero_tail:
-	if (need_zeroout ||
-	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
-		/* zero out from the end of the write to the end of the block */
-		pad = pos & (fs_block_size - 1);
-		if (pad)
-			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
-	}
-	return copied ? copied : ret;
-}
-
-static loff_t
-iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
-{
-	length = iov_iter_zero(length, dio->submit.iter);
-	dio->size += length;
-	return length;
-}
-
-static loff_t
-iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
-		struct iomap_dio *dio, struct iomap *iomap)
-{
-	struct iov_iter *iter = dio->submit.iter;
-	size_t copied;
-
-	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
-
-	if (dio->flags & IOMAP_DIO_WRITE) {
-		loff_t size = inode->i_size;
-
-		if (pos > size)
-			memset(iomap->inline_data + size, 0, pos - size);
-		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
-		if (copied) {
-			if (pos + copied > size)
-				i_size_write(inode, pos + copied);
-			mark_inode_dirty(inode);
-		}
-	} else {
-		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
-	}
-	dio->size += copied;
-	return copied;
-}
-
-static loff_t
-iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
-		void *data, struct iomap *iomap)
-{
-	struct iomap_dio *dio = data;
-
-	switch (iomap->type) {
-	case IOMAP_HOLE:
-		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
-			return -EIO;
-		return iomap_dio_hole_actor(length, dio);
-	case IOMAP_UNWRITTEN:
-		if (!(dio->flags & IOMAP_DIO_WRITE))
-			return iomap_dio_hole_actor(length, dio);
-		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
-	case IOMAP_MAPPED:
-		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
-	case IOMAP_INLINE:
-		return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
-	default:
-		WARN_ON_ONCE(1);
-		return -EIO;
-	}
-}
-
-/*
- * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
- * is being issued as AIO or not.  This allows us to optimise pure data writes
- * to use REQ_FUA rather than requiring generic_write_sync() to issue a
- * REQ_FLUSH post write. This is slightly tricky because a single request here
- * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
- * may be pure data writes. In that case, we still need to do a full data sync
- * completion.
- */
-ssize_t
-iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
-		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
-{
-	struct address_space *mapping = iocb->ki_filp->f_mapping;
-	struct inode *inode = file_inode(iocb->ki_filp);
-	size_t count = iov_iter_count(iter);
-	loff_t pos = iocb->ki_pos, start = pos;
-	loff_t end = iocb->ki_pos + count - 1, ret = 0;
-	unsigned int flags = IOMAP_DIRECT;
-	bool wait_for_completion = is_sync_kiocb(iocb);
-	struct blk_plug plug;
-	struct iomap_dio *dio;
-
-	lockdep_assert_held(&inode->i_rwsem);
-
-	if (!count)
-		return 0;
-
-	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
-	if (!dio)
-		return -ENOMEM;
-
-	dio->iocb = iocb;
-	atomic_set(&dio->ref, 1);
-	dio->size = 0;
-	dio->i_size = i_size_read(inode);
-	dio->end_io = end_io;
-	dio->error = 0;
-	dio->flags = 0;
-
-	dio->submit.iter = iter;
-	dio->submit.waiter = current;
-	dio->submit.cookie = BLK_QC_T_NONE;
-	dio->submit.last_queue = NULL;
-
-	if (iov_iter_rw(iter) == READ) {
-		if (pos >= dio->i_size)
-			goto out_free_dio;
-
-		if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
-			dio->flags |= IOMAP_DIO_DIRTY;
-	} else {
-		flags |= IOMAP_WRITE;
-		dio->flags |= IOMAP_DIO_WRITE;
-
-		/* for data sync or sync, we need sync completion processing */
-		if (iocb->ki_flags & IOCB_DSYNC)
-			dio->flags |= IOMAP_DIO_NEED_SYNC;
-
-		/*
-		 * For datasync only writes, we optimistically try using FUA for
-		 * this IO.  Any non-FUA write that occurs will clear this flag,
-		 * hence we know before completion whether a cache flush is
-		 * necessary.
-		 */
-		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
-			dio->flags |= IOMAP_DIO_WRITE_FUA;
-	}
-
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (filemap_range_has_page(mapping, start, end)) {
-			ret = -EAGAIN;
-			goto out_free_dio;
-		}
-		flags |= IOMAP_NOWAIT;
-	}
-
-	ret = filemap_write_and_wait_range(mapping, start, end);
-	if (ret)
-		goto out_free_dio;
-
-	/*
-	 * Try to invalidate cache pages for the range we're direct
-	 * writing.  If this invalidation fails, tough, the write will
-	 * still work, but racing two incompatible write paths is a
-	 * pretty crazy thing to do, so we don't support it 100%.
-	 */
-	ret = invalidate_inode_pages2_range(mapping,
-			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-	if (ret)
-		dio_warn_stale_pagecache(iocb->ki_filp);
-	ret = 0;
-
-	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
-	    !inode->i_sb->s_dio_done_wq) {
-		ret = sb_init_dio_done_wq(inode->i_sb);
-		if (ret < 0)
-			goto out_free_dio;
-	}
-
-	inode_dio_begin(inode);
-
-	blk_start_plug(&plug);
-	do {
-		ret = iomap_apply(inode, pos, count, flags, ops, dio,
-				iomap_dio_actor);
-		if (ret <= 0) {
-			/* magic error code to fall back to buffered I/O */
-			if (ret == -ENOTBLK) {
-				wait_for_completion = true;
-				ret = 0;
-			}
-			break;
-		}
-		pos += ret;
-
-		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
-			break;
-	} while ((count = iov_iter_count(iter)) > 0);
-	blk_finish_plug(&plug);
-
-	if (ret < 0)
-		iomap_dio_set_error(dio, ret);
-
-	/*
-	 * If all the writes we issued were FUA, we don't need to flush the
-	 * cache on IO completion. Clear the sync flag for this case.
-	 */
-	if (dio->flags & IOMAP_DIO_WRITE_FUA)
-		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
-
-	WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
-	WRITE_ONCE(iocb->private, dio->submit.last_queue);
-
-	/*
-	 * We are about to drop our additional submission reference, which
-	 * might be the last reference to the dio.  There are three three
-	 * different ways we can progress here:
-	 *
-	 *  (a) If this is the last reference we will always complete and free
-	 *	the dio ourselves.
-	 *  (b) If this is not the last reference, and we serve an asynchronous
-	 *	iocb, we must never touch the dio after the decrement, the
-	 *	I/O completion handler will complete and free it.
-	 *  (c) If this is not the last reference, but we serve a synchronous
-	 *	iocb, the I/O completion handler will wake us up on the drop
-	 *	of the final reference, and we will complete and free it here
-	 *	after we got woken by the I/O completion handler.
-	 */
-	dio->wait_for_completion = wait_for_completion;
-	if (!atomic_dec_and_test(&dio->ref)) {
-		if (!wait_for_completion)
-			return -EIOCBQUEUED;
-
-		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			if (!READ_ONCE(dio->submit.waiter))
-				break;
-
-			if (!(iocb->ki_flags & IOCB_HIPRI) ||
-			    !dio->submit.last_queue ||
-			    !blk_poll(dio->submit.last_queue,
-					 dio->submit.cookie, true))
-				io_schedule();
-		}
-		__set_current_state(TASK_RUNNING);
-	}
-
-	return iomap_dio_complete(dio);
-
-out_free_dio:
-	kfree(dio);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iomap_dio_rw);
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 5dfe8b5cf330..a67a97758858 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -6,6 +6,7 @@
 obj-$(CONFIG_FS_IOMAP)		+= iomap.o
 
 iomap-y				+= \
+					direct-io.o \
 					fiemap.o \
 					seek.o
 
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
new file mode 100644
index 000000000000..10517cea9682
--- /dev/null
+++ b/fs/iomap/direct-io.c
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016-2018 Christoph Hellwig.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/backing-dev.h>
+#include <linux/uio.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "../internal.h"
+
+/*
+ * Private flags for iomap_dio, must not overlap with the public ones in
+ * iomap.h:
+ */
+#define IOMAP_DIO_WRITE_FUA	(1 << 28)
+#define IOMAP_DIO_NEED_SYNC	(1 << 29)
+#define IOMAP_DIO_WRITE		(1 << 30)
+#define IOMAP_DIO_DIRTY		(1 << 31)
+
+struct iomap_dio {
+	struct kiocb		*iocb;
+	iomap_dio_end_io_t	*end_io;
+	loff_t			i_size;
+	loff_t			size;
+	atomic_t		ref;
+	unsigned		flags;
+	int			error;
+	bool			wait_for_completion;
+
+	union {
+		/* used during submission and for synchronous completion: */
+		struct {
+			struct iov_iter		*iter;
+			struct task_struct	*waiter;
+			struct request_queue	*last_queue;
+			blk_qc_t		cookie;
+		} submit;
+
+		/* used for aio completion: */
+		struct {
+			struct work_struct	work;
+		} aio;
+	};
+};
+
+int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
+{
+	struct request_queue *q = READ_ONCE(kiocb->private);
+
+	if (!q)
+		return 0;
+	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
+}
+EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
+
+static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
+		struct bio *bio)
+{
+	atomic_inc(&dio->ref);
+
+	if (dio->iocb->ki_flags & IOCB_HIPRI)
+		bio_set_polled(bio, dio->iocb);
+
+	dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+	dio->submit.cookie = submit_bio(bio);
+}
+
+static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+{
+	struct kiocb *iocb = dio->iocb;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	loff_t offset = iocb->ki_pos;
+	ssize_t ret;
+
+	if (dio->end_io) {
+		ret = dio->end_io(iocb,
+				dio->error ? dio->error : dio->size,
+				dio->flags);
+	} else {
+		ret = dio->error;
+	}
+
+	if (likely(!ret)) {
+		ret = dio->size;
+		/* check for short read */
+		if (offset + ret > dio->i_size &&
+		    !(dio->flags & IOMAP_DIO_WRITE))
+			ret = dio->i_size - offset;
+		iocb->ki_pos += ret;
+	}
+
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing.  Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
+	 * this invalidation fails, tough, the write still worked...
+	 *
+	 * And this page cache invalidation has to be after dio->end_io(), as
+	 * some filesystems convert unwritten extents to real allocations in
+	 * end_io() when necessary, otherwise a racing buffer read would cache
+	 * zeros from unwritten extents.
+	 */
+	if (!dio->error &&
+	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
+		int err;
+		err = invalidate_inode_pages2_range(inode->i_mapping,
+				offset >> PAGE_SHIFT,
+				(offset + dio->size - 1) >> PAGE_SHIFT);
+		if (err)
+			dio_warn_stale_pagecache(iocb->ki_filp);
+	}
+
+	/*
+	 * If this is a DSYNC write, make sure we push it to stable storage now
+	 * that we've written data.
+	 */
+	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
+		ret = generic_write_sync(iocb, ret);
+
+	inode_dio_end(file_inode(iocb->ki_filp));
+	kfree(dio);
+
+	return ret;
+}
+
+static void iomap_dio_complete_work(struct work_struct *work)
+{
+	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
+	struct kiocb *iocb = dio->iocb;
+
+	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
+}
+
+/*
+ * Set an error in the dio if none is set yet.  We have to use cmpxchg
+ * as the submission context and the completion context(s) can race to
+ * update the error.
+ */
+static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
+{
+	cmpxchg(&dio->error, 0, ret);
+}
+
+static void iomap_dio_bio_end_io(struct bio *bio)
+{
+	struct iomap_dio *dio = bio->bi_private;
+	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+	if (bio->bi_status)
+		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+
+	if (atomic_dec_and_test(&dio->ref)) {
+		if (dio->wait_for_completion) {
+			struct task_struct *waiter = dio->submit.waiter;
+			WRITE_ONCE(dio->submit.waiter, NULL);
+			blk_wake_io_task(waiter);
+		} else if (dio->flags & IOMAP_DIO_WRITE) {
+			struct inode *inode = file_inode(dio->iocb->ki_filp);
+
+			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+		} else {
+			iomap_dio_complete_work(&dio->aio.work);
+		}
+	}
+
+	if (should_dirty) {
+		bio_check_pages_dirty(bio);
+	} else {
+		bio_release_pages(bio, false);
+		bio_put(bio);
+	}
+}
+
+static void
+iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
+		unsigned len)
+{
+	struct page *page = ZERO_PAGE(0);
+	int flags = REQ_SYNC | REQ_IDLE;
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	bio_set_dev(bio, iomap->bdev);
+	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+	bio->bi_private = dio;
+	bio->bi_end_io = iomap_dio_bio_end_io;
+
+	get_page(page);
+	__bio_add_page(bio, page, len, 0);
+	bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
+	iomap_dio_submit_bio(dio, iomap, bio);
+}
+
+static loff_t
+iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
+		struct iomap_dio *dio, struct iomap *iomap)
+{
+	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
+	unsigned int fs_block_size = i_blocksize(inode), pad;
+	unsigned int align = iov_iter_alignment(dio->submit.iter);
+	struct iov_iter iter;
+	struct bio *bio;
+	bool need_zeroout = false;
+	bool use_fua = false;
+	int nr_pages, ret = 0;
+	size_t copied = 0;
+
+	if ((pos | length | align) & ((1 << blkbits) - 1))
+		return -EINVAL;
+
+	if (iomap->type == IOMAP_UNWRITTEN) {
+		dio->flags |= IOMAP_DIO_UNWRITTEN;
+		need_zeroout = true;
+	}
+
+	if (iomap->flags & IOMAP_F_SHARED)
+		dio->flags |= IOMAP_DIO_COW;
+
+	if (iomap->flags & IOMAP_F_NEW) {
+		need_zeroout = true;
+	} else if (iomap->type == IOMAP_MAPPED) {
+		/*
+		 * Use a FUA write if we need datasync semantics, this is a pure
+		 * data IO that doesn't require any metadata updates (including
+		 * after IO completion such as unwritten extent conversion) and
+		 * the underlying device supports FUA. This allows us to avoid
+		 * cache flushes on IO completion.
+		 */
+		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
+		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
+		    blk_queue_fua(bdev_get_queue(iomap->bdev)))
+			use_fua = true;
+	}
+
+	/*
+	 * Operate on a partial iter trimmed to the extent we were called for.
+	 * We'll update the iter in the dio once we're done with this extent.
+	 */
+	iter = *dio->submit.iter;
+	iov_iter_truncate(&iter, length);
+
+	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+	if (nr_pages <= 0)
+		return nr_pages;
+
+	if (need_zeroout) {
+		/* zero out from the start of the block to the write offset */
+		pad = pos & (fs_block_size - 1);
+		if (pad)
+			iomap_dio_zero(dio, iomap, pos - pad, pad);
+	}
+
+	do {
+		size_t n;
+		if (dio->error) {
+			iov_iter_revert(dio->submit.iter, copied);
+			return 0;
+		}
+
+		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		bio_set_dev(bio, iomap->bdev);
+		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+		bio->bi_write_hint = dio->iocb->ki_hint;
+		bio->bi_ioprio = dio->iocb->ki_ioprio;
+		bio->bi_private = dio;
+		bio->bi_end_io = iomap_dio_bio_end_io;
+
+		ret = bio_iov_iter_get_pages(bio, &iter);
+		if (unlikely(ret)) {
+			/*
+			 * We have to stop part way through an IO. We must fall
+			 * through to the sub-block tail zeroing here, otherwise
+			 * this short IO may expose stale data in the tail of
+			 * the block we haven't written data to.
+			 */
+			bio_put(bio);
+			goto zero_tail;
+		}
+
+		n = bio->bi_iter.bi_size;
+		if (dio->flags & IOMAP_DIO_WRITE) {
+			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+			if (use_fua)
+				bio->bi_opf |= REQ_FUA;
+			else
+				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
+			task_io_account_write(n);
+		} else {
+			bio->bi_opf = REQ_OP_READ;
+			if (dio->flags & IOMAP_DIO_DIRTY)
+				bio_set_pages_dirty(bio);
+		}
+
+		iov_iter_advance(dio->submit.iter, n);
+
+		dio->size += n;
+		pos += n;
+		copied += n;
+
+		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+		iomap_dio_submit_bio(dio, iomap, bio);
+	} while (nr_pages);
+
+	/*
+	 * We need to zeroout the tail of a sub-block write if the extent type
+	 * requires zeroing or the write extends beyond EOF. If we don't zero
+	 * the block tail in the latter case, we can expose stale data via mmap
+	 * reads of the EOF block.
+	 */
+zero_tail:
+	if (need_zeroout ||
+	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
+		/* zero out from the end of the write to the end of the block */
+		pad = pos & (fs_block_size - 1);
+		if (pad)
+			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
+	}
+	return copied ? copied : ret;
+}
+
+static loff_t
+iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
+{
+	length = iov_iter_zero(length, dio->submit.iter);
+	dio->size += length;
+	return length;
+}
+
+static loff_t
+iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
+		struct iomap_dio *dio, struct iomap *iomap)
+{
+	struct iov_iter *iter = dio->submit.iter;
+	size_t copied;
+
+	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
+
+	if (dio->flags & IOMAP_DIO_WRITE) {
+		loff_t size = inode->i_size;
+
+		if (pos > size)
+			memset(iomap->inline_data + size, 0, pos - size);
+		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
+		if (copied) {
+			if (pos + copied > size)
+				i_size_write(inode, pos + copied);
+			mark_inode_dirty(inode);
+		}
+	} else {
+		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
+	}
+	dio->size += copied;
+	return copied;
+}
+
+static loff_t
+iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	struct iomap_dio *dio = data;
+
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
+			return -EIO;
+		return iomap_dio_hole_actor(length, dio);
+	case IOMAP_UNWRITTEN:
+		if (!(dio->flags & IOMAP_DIO_WRITE))
+			return iomap_dio_hole_actor(length, dio);
+		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
+	case IOMAP_MAPPED:
+		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
+	case IOMAP_INLINE:
+		return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
+	default:
+		WARN_ON_ONCE(1);
+		return -EIO;
+	}
+}
+
+/*
+ * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
+ * is being issued as AIO or not.  This allows us to optimise pure data writes
+ * to use REQ_FUA rather than requiring generic_write_sync() to issue a
+ * REQ_FLUSH post write. This is slightly tricky because a single request here
+ * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
+ * may be pure data writes. In that case, we still need to do a full data sync
+ * completion.
+ */
+ssize_t
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	size_t count = iov_iter_count(iter);
+	loff_t pos = iocb->ki_pos, start = pos;
+	loff_t end = iocb->ki_pos + count - 1, ret = 0;
+	unsigned int flags = IOMAP_DIRECT;
+	bool wait_for_completion = is_sync_kiocb(iocb);
+	struct blk_plug plug;
+	struct iomap_dio *dio;
+
+	lockdep_assert_held(&inode->i_rwsem);
+
+	if (!count)
+		return 0;
+
+	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+	if (!dio)
+		return -ENOMEM;
+
+	dio->iocb = iocb;
+	atomic_set(&dio->ref, 1);
+	dio->size = 0;
+	dio->i_size = i_size_read(inode);
+	dio->end_io = end_io;
+	dio->error = 0;
+	dio->flags = 0;
+
+	dio->submit.iter = iter;
+	dio->submit.waiter = current;
+	dio->submit.cookie = BLK_QC_T_NONE;
+	dio->submit.last_queue = NULL;
+
+	if (iov_iter_rw(iter) == READ) {
+		if (pos >= dio->i_size)
+			goto out_free_dio;
+
+		if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
+			dio->flags |= IOMAP_DIO_DIRTY;
+	} else {
+		flags |= IOMAP_WRITE;
+		dio->flags |= IOMAP_DIO_WRITE;
+
+		/* for data sync or sync, we need sync completion processing */
+		if (iocb->ki_flags & IOCB_DSYNC)
+			dio->flags |= IOMAP_DIO_NEED_SYNC;
+
+		/*
+		 * For datasync only writes, we optimistically try using FUA for
+		 * this IO.  Any non-FUA write that occurs will clear this flag,
+		 * hence we know before completion whether a cache flush is
+		 * necessary.
+		 */
+		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
+			dio->flags |= IOMAP_DIO_WRITE_FUA;
+	}
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (filemap_range_has_page(mapping, start, end)) {
+			ret = -EAGAIN;
+			goto out_free_dio;
+		}
+		flags |= IOMAP_NOWAIT;
+	}
+
+	ret = filemap_write_and_wait_range(mapping, start, end);
+	if (ret)
+		goto out_free_dio;
+
+	/*
+	 * Try to invalidate cache pages for the range we're direct
+	 * writing.  If this invalidation fails, tough, the write will
+	 * still work, but racing two incompatible write paths is a
+	 * pretty crazy thing to do, so we don't support it 100%.
+	 */
+	ret = invalidate_inode_pages2_range(mapping,
+			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
+	if (ret)
+		dio_warn_stale_pagecache(iocb->ki_filp);
+	ret = 0;
+
+	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
+	    !inode->i_sb->s_dio_done_wq) {
+		ret = sb_init_dio_done_wq(inode->i_sb);
+		if (ret < 0)
+			goto out_free_dio;
+	}
+
+	inode_dio_begin(inode);
+
+	blk_start_plug(&plug);
+	do {
+		ret = iomap_apply(inode, pos, count, flags, ops, dio,
+				iomap_dio_actor);
+		if (ret <= 0) {
+			/* magic error code to fall back to buffered I/O */
+			if (ret == -ENOTBLK) {
+				wait_for_completion = true;
+				ret = 0;
+			}
+			break;
+		}
+		pos += ret;
+
+		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
+			break;
+	} while ((count = iov_iter_count(iter)) > 0);
+	blk_finish_plug(&plug);
+
+	if (ret < 0)
+		iomap_dio_set_error(dio, ret);
+
+	/*
+	 * If all the writes we issued were FUA, we don't need to flush the
+	 * cache on IO completion. Clear the sync flag for this case.
+	 */
+	if (dio->flags & IOMAP_DIO_WRITE_FUA)
+		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+
+	WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
+	WRITE_ONCE(iocb->private, dio->submit.last_queue);
+
+	/*
+	 * We are about to drop our additional submission reference, which
+	 * might be the last reference to the dio.  There are three three
+	 * different ways we can progress here:
+	 *
+	 *  (a) If this is the last reference we will always complete and free
+	 *	the dio ourselves.
+	 *  (b) If this is not the last reference, and we serve an asynchronous
+	 *	iocb, we must never touch the dio after the decrement, the
+	 *	I/O completion handler will complete and free it.
+	 *  (c) If this is not the last reference, but we serve a synchronous
+	 *	iocb, the I/O completion handler will wake us up on the drop
+	 *	of the final reference, and we will complete and free it here
+	 *	after we got woken by the I/O completion handler.
+	 */
+	dio->wait_for_completion = wait_for_completion;
+	if (!atomic_dec_and_test(&dio->ref)) {
+		if (!wait_for_completion)
+			return -EIOCBQUEUED;
+
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!READ_ONCE(dio->submit.waiter))
+				break;
+
+			if (!(iocb->ki_flags & IOCB_HIPRI) ||
+			    !dio->submit.last_queue ||
+			    !blk_poll(dio->submit.last_queue,
+					 dio->submit.cookie, true))
+				io_schedule();
+		}
+		__set_current_state(TASK_RUNNING);
+	}
+
+	return iomap_dio_complete(dio);
+
+out_free_dio:
+	kfree(dio);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dio_rw);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 1df9ea187a9a..baa1e2d31f05 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/types.h>
 #include <linux/mm_types.h>
+#include <linux/blkdev.h>
 
 struct address_space;
 struct fiemap_extent_info;
@@ -69,6 +70,12 @@ struct iomap {
 	const struct iomap_page_ops *page_ops;
 };
 
+static inline sector_t
+iomap_sector(struct iomap *iomap, loff_t pos)
+{
+	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
+}
+
 /*
  * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare
  * and page_done will be called for each page written to.  This only applies to
-- 
cgit v1.2.3


From 5d907307adc14cd5148b07629c2b4535acd06062 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 15 Jul 2019 08:51:01 -0700
Subject: iomap: move internal declarations into fs/iomap/

Move internal function declarations out of fs/internal.h into
include/linux/iomap.h so that our transition is complete.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/dax.c              |  1 -
 fs/internal.h         | 10 ----------
 fs/iomap/apply.c      |  2 --
 fs/iomap/fiemap.c     |  2 --
 fs/iomap/seek.c       |  2 --
 fs/iomap/swapfile.c   |  2 --
 include/linux/iomap.h | 10 ++++++++++
 7 files changed, 10 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/fs/dax.c b/fs/dax.c
index fe5e33810cd4..cb53f9bd6fd7 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -26,7 +26,6 @@
 #include <linux/mmu_notifier.h>
 #include <linux/iomap.h>
 #include <asm/pgalloc.h>
-#include "internal.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/fs_dax.h>
diff --git a/fs/internal.h b/fs/internal.h
index 2f3c3de51fad..2b0bebd67904 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -182,15 +182,5 @@ extern const struct dentry_operations ns_dentry_operations;
 extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
 
-/*
- * iomap support:
- */
-typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
-		void *data, struct iomap *iomap);
-
-loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
-		unsigned flags, const struct iomap_ops *ops, void *data,
-		iomap_actor_t actor);
-
 /* direct-io.c: */
 int sb_init_dio_done_wq(struct super_block *sb);
diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
index 9f956cf23867..54c02aecf3cd 100644
--- a/fs/iomap/apply.c
+++ b/fs/iomap/apply.c
@@ -8,8 +8,6 @@
 #include <linux/fs.h>
 #include <linux/iomap.h>
 
-#include "../internal.h"
-
 /*
  * Execute a iomap write on a segment of the mapping that spans a
  * contiguous range of pages that have identical block mapping state.
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index 1fc88ec1584d..f26fdd36e383 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -7,8 +7,6 @@
 #include <linux/fs.h>
 #include <linux/iomap.h>
 
-#include "../internal.h"
-
 struct fiemap_ctx {
 	struct fiemap_extent_info *fi;
 	struct iomap prev;
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index 715442eb71aa..c04bad4b2b43 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -10,8 +10,6 @@
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
 
-#include "../internal.h"
-
 /*
  * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
  * Returns true if found and updates @lastoff to the offset in file.
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index b79c33631263..152a230f668d 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -9,8 +9,6 @@
 #include <linux/iomap.h>
 #include <linux/swap.h>
 
-#include "../internal.h"
-
 /* Swapfile activation */
 
 struct iomap_swapfile_info {
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index baa1e2d31f05..bc499ceae392 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -122,6 +122,16 @@ struct iomap_ops {
 			ssize_t written, unsigned flags, struct iomap *iomap);
 };
 
+/*
+ * Main iomap iterator function.
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+		void *data, struct iomap *iomap);
+
+loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+		unsigned flags, const struct iomap_ops *ops, void *data,
+		iomap_actor_t actor);
+
 /*
  * Structure allocate for each page when block size < PAGE_SIZE to track
  * sub-page uptodate status and I/O completions.
-- 
cgit v1.2.3


From 9af93db9e140a4e6e79cdb098919bc928a72cd59 Mon Sep 17 00:00:00 2001
From: Daniel Drake <drake@endlessm.com>
Date: Wed, 17 Jul 2019 13:10:58 +0800
Subject: platform/x86: asus: Rename "fan mode" to "fan boost mode"

The Asus WMI spec indicates that the function being controlled here
is called "Fan Boost Mode". The user-facing documentation also calls it
this.

The spec uses the term "fan mode" is used to refer to other things,
including functionality expected to appear on future products.
We missed this before as we are not dealing with the most readable of
specs, and didn't forsee any confusion around shortening the name.

Rename "fan mode" to "fan boost mode" to improve consistency with the
spec and to avoid a future naming conflict.

There is no interface breakage here since this has yet to be included
in an official kernel release. I also updated the kernel version listed
under ABI accordingly.

Signed-off-by: Daniel Drake <drake@endlessm.com>
Acked-by: Yurii Pavlovskyi <yurii.pavlovskyi@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 Documentation/ABI/testing/sysfs-platform-asus-wmi |   6 +-
 drivers/platform/x86/asus-wmi.c                   | 118 ++++++++++++----------
 include/linux/platform_data/x86/asus-wmi.h        |   2 +-
 3 files changed, 66 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index 87ae5cc983bf..9e99f2909612 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -37,9 +37,9 @@ Contact:	"AceLan Kao" <acelan.kao@canonical.com>
 Description:
 		Resume on lid open. 1 means on, 0 means off.
 
-What:		/sys/devices/platform/<platform>/fan_mode
-Date:		Apr 2019
-KernelVersion:	5.2
+What:		/sys/devices/platform/<platform>/fan_boost_mode
+Date:		Sep 2019
+KernelVersion:	5.3
 Contact:	"Yurii Pavlovskyi" <yurii.pavlovskyi@gmail.com>
 Description:
 		Fan boost mode:
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 18f3a8bad52f..ca28d27dae63 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -68,12 +68,12 @@ MODULE_LICENSE("GPL");
 #define ASUS_FAN_CTRL_MANUAL		1
 #define ASUS_FAN_CTRL_AUTO		2
 
-#define ASUS_FAN_MODE_NORMAL		0
-#define ASUS_FAN_MODE_OVERBOOST		1
-#define ASUS_FAN_MODE_OVERBOOST_MASK	0x01
-#define ASUS_FAN_MODE_SILENT		2
-#define ASUS_FAN_MODE_SILENT_MASK	0x02
-#define ASUS_FAN_MODES_MASK		0x03
+#define ASUS_FAN_BOOST_MODE_NORMAL		0
+#define ASUS_FAN_BOOST_MODE_OVERBOOST		1
+#define ASUS_FAN_BOOST_MODE_OVERBOOST_MASK	0x01
+#define ASUS_FAN_BOOST_MODE_SILENT		2
+#define ASUS_FAN_BOOST_MODE_SILENT_MASK		0x02
+#define ASUS_FAN_BOOST_MODES_MASK		0x03
 
 #define USB_INTEL_XUSB2PR		0xD0
 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI	0x9c31
@@ -182,9 +182,9 @@ struct asus_wmi {
 	int asus_hwmon_num_fans;
 	int asus_hwmon_pwm;
 
-	bool fan_mode_available;
-	u8 fan_mode_mask;
-	u8 fan_mode;
+	bool fan_boost_mode_available;
+	u8 fan_boost_mode_mask;
+	u8 fan_boost_mode;
 
 	struct hotplug_slot hotplug_slot;
 	struct mutex hotplug_lock;
@@ -1487,14 +1487,15 @@ static int asus_wmi_fan_init(struct asus_wmi *asus)
 
 /* Fan mode *******************************************************************/
 
-static int fan_mode_check_present(struct asus_wmi *asus)
+static int fan_boost_mode_check_present(struct asus_wmi *asus)
 {
 	u32 result;
 	int err;
 
-	asus->fan_mode_available = false;
+	asus->fan_boost_mode_available = false;
 
-	err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_MODE, &result);
+	err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_BOOST_MODE,
+				    &result);
 	if (err) {
 		if (err == -ENODEV)
 			return 0;
@@ -1503,72 +1504,77 @@ static int fan_mode_check_present(struct asus_wmi *asus)
 	}
 
 	if ((result & ASUS_WMI_DSTS_PRESENCE_BIT) &&
-			(result & ASUS_FAN_MODES_MASK)) {
-		asus->fan_mode_available = true;
-		asus->fan_mode_mask = result & ASUS_FAN_MODES_MASK;
+			(result & ASUS_FAN_BOOST_MODES_MASK)) {
+		asus->fan_boost_mode_available = true;
+		asus->fan_boost_mode_mask = result & ASUS_FAN_BOOST_MODES_MASK;
 	}
 
 	return 0;
 }
 
-static int fan_mode_write(struct asus_wmi *asus)
+static int fan_boost_mode_write(struct asus_wmi *asus)
 {
 	int err;
 	u8 value;
 	u32 retval;
 
-	value = asus->fan_mode;
+	value = asus->fan_boost_mode;
 
-	pr_info("Set fan mode: %u\n", value);
-	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_FAN_MODE, value, &retval);
+	pr_info("Set fan boost mode: %u\n", value);
+	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_FAN_BOOST_MODE, value,
+				    &retval);
 
 	if (err) {
-		pr_warn("Failed to set fan mode: %d\n", err);
+		pr_warn("Failed to set fan boost mode: %d\n", err);
 		return err;
 	}
 
 	if (retval != 1) {
-		pr_warn("Failed to set fan mode (retval): 0x%x\n", retval);
+		pr_warn("Failed to set fan boost mode (retval): 0x%x\n",
+			retval);
 		return -EIO;
 	}
 
 	return 0;
 }
 
-static int fan_mode_switch_next(struct asus_wmi *asus)
+static int fan_boost_mode_switch_next(struct asus_wmi *asus)
 {
-	if (asus->fan_mode == ASUS_FAN_MODE_NORMAL) {
-		if (asus->fan_mode_mask & ASUS_FAN_MODE_OVERBOOST_MASK)
-			asus->fan_mode = ASUS_FAN_MODE_OVERBOOST;
-		else if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK)
-			asus->fan_mode = ASUS_FAN_MODE_SILENT;
-	} else if (asus->fan_mode == ASUS_FAN_MODE_OVERBOOST) {
-		if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK)
-			asus->fan_mode = ASUS_FAN_MODE_SILENT;
+	u8 mask = asus->fan_boost_mode_mask;
+
+	if (asus->fan_boost_mode == ASUS_FAN_BOOST_MODE_NORMAL) {
+		if (mask & ASUS_FAN_BOOST_MODE_OVERBOOST_MASK)
+			asus->fan_boost_mode = ASUS_FAN_BOOST_MODE_OVERBOOST;
+		else if (mask & ASUS_FAN_BOOST_MODE_SILENT_MASK)
+			asus->fan_boost_mode = ASUS_FAN_BOOST_MODE_SILENT;
+	} else if (asus->fan_boost_mode == ASUS_FAN_BOOST_MODE_OVERBOOST) {
+		if (mask & ASUS_FAN_BOOST_MODE_SILENT_MASK)
+			asus->fan_boost_mode = ASUS_FAN_BOOST_MODE_SILENT;
 		else
-			asus->fan_mode = ASUS_FAN_MODE_NORMAL;
+			asus->fan_boost_mode = ASUS_FAN_BOOST_MODE_NORMAL;
 	} else {
-		asus->fan_mode = ASUS_FAN_MODE_NORMAL;
+		asus->fan_boost_mode = ASUS_FAN_BOOST_MODE_NORMAL;
 	}
 
-	return fan_mode_write(asus);
+	return fan_boost_mode_write(asus);
 }
 
-static ssize_t fan_mode_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
+static ssize_t fan_boost_mode_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
 {
 	struct asus_wmi *asus = dev_get_drvdata(dev);
 
-	return scnprintf(buf, PAGE_SIZE, "%d\n", asus->fan_mode);
+	return scnprintf(buf, PAGE_SIZE, "%d\n", asus->fan_boost_mode);
 }
 
-static ssize_t fan_mode_store(struct device *dev, struct device_attribute *attr,
-		const char *buf, size_t count)
+static ssize_t fan_boost_mode_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
 {
 	int result;
 	u8 new_mode;
-
 	struct asus_wmi *asus = dev_get_drvdata(dev);
+	u8 mask = asus->fan_boost_mode_mask;
 
 	result = kstrtou8(buf, 10, &new_mode);
 	if (result < 0) {
@@ -1576,24 +1582,24 @@ static ssize_t fan_mode_store(struct device *dev, struct device_attribute *attr,
 		return result;
 	}
 
-	if (new_mode == ASUS_FAN_MODE_OVERBOOST) {
-		if (!(asus->fan_mode_mask & ASUS_FAN_MODE_OVERBOOST_MASK))
+	if (new_mode == ASUS_FAN_BOOST_MODE_OVERBOOST) {
+		if (!(mask & ASUS_FAN_BOOST_MODE_OVERBOOST_MASK))
 			return -EINVAL;
-	} else if (new_mode == ASUS_FAN_MODE_SILENT) {
-		if (!(asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK))
+	} else if (new_mode == ASUS_FAN_BOOST_MODE_SILENT) {
+		if (!(mask & ASUS_FAN_BOOST_MODE_SILENT_MASK))
 			return -EINVAL;
-	} else if (new_mode != ASUS_FAN_MODE_NORMAL) {
+	} else if (new_mode != ASUS_FAN_BOOST_MODE_NORMAL) {
 		return -EINVAL;
 	}
 
-	asus->fan_mode = new_mode;
-	fan_mode_write(asus);
+	asus->fan_boost_mode = new_mode;
+	fan_boost_mode_write(asus);
 
 	return result;
 }
 
-// Fan mode: 0 - normal, 1 - overboost, 2 - silent
-static DEVICE_ATTR_RW(fan_mode);
+// Fan boost mode: 0 - normal, 1 - overboost, 2 - silent
+static DEVICE_ATTR_RW(fan_boost_mode);
 
 /* Backlight ******************************************************************/
 
@@ -1873,8 +1879,8 @@ static void asus_wmi_handle_event_code(int code, struct asus_wmi *asus)
 		return;
 	}
 
-	if (asus->fan_mode_available && code == NOTIFY_KBD_FBM) {
-		fan_mode_switch_next(asus);
+	if (asus->fan_boost_mode_available && code == NOTIFY_KBD_FBM) {
+		fan_boost_mode_switch_next(asus);
 		return;
 	}
 
@@ -2034,7 +2040,7 @@ static struct attribute *platform_attributes[] = {
 	&dev_attr_touchpad.attr,
 	&dev_attr_lid_resume.attr,
 	&dev_attr_als_enable.attr,
-	&dev_attr_fan_mode.attr,
+	&dev_attr_fan_boost_mode.attr,
 	NULL
 };
 
@@ -2056,8 +2062,8 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj,
 		devid = ASUS_WMI_DEVID_LID_RESUME;
 	else if (attr == &dev_attr_als_enable.attr)
 		devid = ASUS_WMI_DEVID_ALS_ENABLE;
-	else if (attr == &dev_attr_fan_mode.attr)
-		ok = asus->fan_mode_available;
+	else if (attr == &dev_attr_fan_boost_mode.attr)
+		ok = asus->fan_boost_mode_available;
 
 	if (devid != -1)
 		ok = !(asus_wmi_get_devstate_simple(asus, devid) < 0);
@@ -2315,9 +2321,9 @@ static int asus_wmi_add(struct platform_device *pdev)
 	if (err)
 		goto fail_platform;
 
-	err = fan_mode_check_present(asus);
+	err = fan_boost_mode_check_present(asus);
 	if (err)
-		goto fail_fan_mode;
+		goto fail_fan_boost_mode;
 
 	err = asus_wmi_sysfs_init(asus->platform_device);
 	if (err)
@@ -2402,7 +2408,7 @@ fail_hwmon:
 fail_input:
 	asus_wmi_sysfs_exit(asus->platform_device);
 fail_sysfs:
-fail_fan_mode:
+fail_fan_boost_mode:
 fail_platform:
 	kfree(asus);
 	return err;
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 8551156b8dca..4802cd2c7309 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -57,7 +57,7 @@
 #define ASUS_WMI_DEVID_KBD_BACKLIGHT	0x00050021
 #define ASUS_WMI_DEVID_LIGHT_SENSOR	0x00050022 /* ?? */
 #define ASUS_WMI_DEVID_LIGHTBAR		0x00050025
-#define ASUS_WMI_DEVID_FAN_MODE		0x00110018
+#define ASUS_WMI_DEVID_FAN_BOOST_MODE	0x00110018
 
 /* Misc */
 #define ASUS_WMI_DEVID_CAMERA		0x00060013
-- 
cgit v1.2.3


From 733232f8c852bcc2ad6fc1db7f4c43eb01c7c217 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 17 Jul 2019 12:57:06 -0400
Subject: dm: use printk ratelimiting functions

DM provided its own ratelimiting printk wrapper but given printk
advances this is no longer needed.

Also, switching DMDEBUG_LIMIT to using pr_debug_ratelimited() fixes the
reported issue where DMDEBUG_LIMIT() still caused a flood of "callbacks
suppressed" messages.

Reported-by: Milan Broz <gmazyland@gmail.com>
Depends-on: 29fc2bc7539386 ("printk: pr_debug_ratelimited: check state first to reduce "callbacks suppressed" messages")
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 include/linux/device-mapper.h | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e1f51d607cc5..603ce5bb4fac 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -530,29 +530,20 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
  *---------------------------------------------------------------*/
 #define DM_NAME "device-mapper"
 
-#define DM_RATELIMIT(pr_func, fmt, ...)					\
-do {									\
-	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,	\
-				      DEFAULT_RATELIMIT_BURST);		\
-									\
-	if (__ratelimit(&rs))						\
-		pr_func(DM_FMT(fmt), ##__VA_ARGS__);			\
-} while (0)
-
 #define DM_FMT(fmt) DM_NAME ": " DM_MSG_PREFIX ": " fmt "\n"
 
 #define DMCRIT(fmt, ...) pr_crit(DM_FMT(fmt), ##__VA_ARGS__)
 
 #define DMERR(fmt, ...) pr_err(DM_FMT(fmt), ##__VA_ARGS__)
-#define DMERR_LIMIT(fmt, ...) DM_RATELIMIT(pr_err, fmt, ##__VA_ARGS__)
+#define DMERR_LIMIT(fmt, ...) pr_err_ratelimited(DM_FMT(fmt), ##__VA_ARGS__)
 #define DMWARN(fmt, ...) pr_warn(DM_FMT(fmt), ##__VA_ARGS__)
-#define DMWARN_LIMIT(fmt, ...) DM_RATELIMIT(pr_warn, fmt, ##__VA_ARGS__)
+#define DMWARN_LIMIT(fmt, ...) pr_warn_ratelimited(DM_FMT(fmt), ##__VA_ARGS__)
 #define DMINFO(fmt, ...) pr_info(DM_FMT(fmt), ##__VA_ARGS__)
-#define DMINFO_LIMIT(fmt, ...) DM_RATELIMIT(pr_info, fmt, ##__VA_ARGS__)
+#define DMINFO_LIMIT(fmt, ...) pr_info_ratelimited(DM_FMT(fmt), ##__VA_ARGS__)
 
 #ifdef CONFIG_DM_DEBUG
 #define DMDEBUG(fmt, ...) printk(KERN_DEBUG DM_FMT(fmt), ##__VA_ARGS__)
-#define DMDEBUG_LIMIT(fmt, ...) DM_RATELIMIT(pr_debug, fmt, ##__VA_ARGS__)
+#define DMDEBUG_LIMIT(fmt, ...) pr_debug_ratelimited(DM_FMT(fmt), ##__VA_ARGS__)
 #else
 #define DMDEBUG(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
 #define DMDEBUG_LIMIT(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
-- 
cgit v1.2.3


From 7402a4fedc2bc448100c2d086406c708451b16dc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 16 Jul 2019 13:51:29 -0400
Subject: SUNRPC: Fix up backchannel slot table accounting

Add a per-transport maximum limit in the socket case, and add
helpers to allow the NFSv4 code to discover that limit.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c                 |  3 +++
 include/linux/sunrpc/bc_xprt.h    |  1 +
 include/linux/sunrpc/clnt.h       |  1 +
 include/linux/sunrpc/xprt.h       |  6 ++++--
 net/sunrpc/backchannel_rqst.c     | 40 +++++++++++++++++++++------------------
 net/sunrpc/clnt.c                 | 13 +++++++++++++
 net/sunrpc/svc.c                  |  2 +-
 net/sunrpc/xprtrdma/backchannel.c |  7 +++++++
 net/sunrpc/xprtrdma/transport.c   |  1 +
 net/sunrpc/xprtrdma/xprt_rdma.h   |  1 +
 net/sunrpc/xprtsock.c             |  1 +
 11 files changed, 55 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 52de7245a2ee..39896afc6edf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8380,6 +8380,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
 {
 	unsigned int max_rqst_sz, max_resp_sz;
 	unsigned int max_bc_payload = rpc_max_bc_payload(clnt);
+	unsigned int max_bc_slots = rpc_num_bc_slots(clnt);
 
 	max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
 	max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
@@ -8402,6 +8403,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
 	args->bc_attrs.max_resp_sz_cached = 0;
 	args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
 	args->bc_attrs.max_reqs = max_t(unsigned short, max_session_cb_slots, 1);
+	if (args->bc_attrs.max_reqs > max_bc_slots)
+		args->bc_attrs.max_reqs = max_bc_slots;
 
 	dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
 		"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index d4229a78524a..87d27e13d885 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -43,6 +43,7 @@ void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
 int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs);
 void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs);
 void xprt_free_bc_rqst(struct rpc_rqst *req);
+unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt);
 
 /*
  * Determine if a shared backchannel is in use
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 4e070e00c143..abc63bd1be2b 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -194,6 +194,7 @@ void		rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 struct net *	rpc_net_ns(struct rpc_clnt *);
 size_t		rpc_max_payload(struct rpc_clnt *);
 size_t		rpc_max_bc_payload(struct rpc_clnt *);
+unsigned int	rpc_num_bc_slots(struct rpc_clnt *);
 void		rpc_force_rebind(struct rpc_clnt *);
 size_t		rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
 const char	*rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index ed76e5fb36c1..13e108bcc9eb 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -158,6 +158,7 @@ struct rpc_xprt_ops {
 	int		(*bc_setup)(struct rpc_xprt *xprt,
 				    unsigned int min_reqs);
 	size_t		(*bc_maxpayload)(struct rpc_xprt *xprt);
+	unsigned int	(*bc_num_slots)(struct rpc_xprt *xprt);
 	void		(*bc_free_rqst)(struct rpc_rqst *rqst);
 	void		(*bc_destroy)(struct rpc_xprt *xprt,
 				      unsigned int max_reqs);
@@ -251,8 +252,9 @@ struct rpc_xprt {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	struct svc_serv		*bc_serv;       /* The RPC service which will */
 						/* process the callback */
-	int			bc_alloc_count;	/* Total number of preallocs */
-	atomic_t		bc_free_slots;
+	unsigned int		bc_alloc_max;
+	unsigned int		bc_alloc_count;	/* Total number of preallocs */
+	atomic_t		bc_slot_count;	/* Number of allocated slots */
 	spinlock_t		bc_pa_lock;	/* Protects the preallocated
 						 * items */
 	struct list_head	bc_pa_list;	/* List of preallocated
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index c47d82622fd1..339e8c077c2d 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -31,25 +31,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
+#define BC_MAX_SLOTS	64U
+
+unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt)
+{
+	return BC_MAX_SLOTS;
+}
+
 /*
  * Helper routines that track the number of preallocation elements
  * on the transport.
  */
 static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
 {
-	return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots);
-}
-
-static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
-{
-	atomic_add(n, &xprt->bc_free_slots);
-	xprt->bc_alloc_count += n;
-}
-
-static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
-{
-	atomic_sub(n, &xprt->bc_free_slots);
-	return xprt->bc_alloc_count -= n;
+	return xprt->bc_alloc_count < xprt->bc_alloc_max;
 }
 
 /*
@@ -145,6 +140,9 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
 
 	dprintk("RPC:       setup backchannel transport\n");
 
+	if (min_reqs > BC_MAX_SLOTS)
+		min_reqs = BC_MAX_SLOTS;
+
 	/*
 	 * We use a temporary list to keep track of the preallocated
 	 * buffers.  Once we're done building the list we splice it
@@ -172,7 +170,9 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
 	 */
 	spin_lock(&xprt->bc_pa_lock);
 	list_splice(&tmp_list, &xprt->bc_pa_list);
-	xprt_inc_alloc_count(xprt, min_reqs);
+	xprt->bc_alloc_count += min_reqs;
+	xprt->bc_alloc_max += min_reqs;
+	atomic_add(min_reqs, &xprt->bc_slot_count);
 	spin_unlock(&xprt->bc_pa_lock);
 
 	dprintk("RPC:       setup backchannel transport done\n");
@@ -220,11 +220,13 @@ void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs)
 		goto out;
 
 	spin_lock_bh(&xprt->bc_pa_lock);
-	xprt_dec_alloc_count(xprt, max_reqs);
+	xprt->bc_alloc_max -= max_reqs;
 	list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
 		dprintk("RPC:        req=%p\n", req);
 		list_del(&req->rq_bc_pa_list);
 		xprt_free_allocation(req);
+		xprt->bc_alloc_count--;
+		atomic_dec(&xprt->bc_slot_count);
 		if (--max_reqs == 0)
 			break;
 	}
@@ -241,13 +243,14 @@ static struct rpc_rqst *xprt_get_bc_request(struct rpc_xprt *xprt, __be32 xid,
 	struct rpc_rqst *req = NULL;
 
 	dprintk("RPC:       allocate a backchannel request\n");
-	if (atomic_read(&xprt->bc_free_slots) <= 0)
-		goto not_found;
 	if (list_empty(&xprt->bc_pa_list)) {
 		if (!new)
 			goto not_found;
+		if (atomic_read(&xprt->bc_slot_count) >= BC_MAX_SLOTS)
+			goto not_found;
 		list_add_tail(&new->rq_bc_pa_list, &xprt->bc_pa_list);
 		xprt->bc_alloc_count++;
+		atomic_inc(&xprt->bc_slot_count);
 	}
 	req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
 				rq_bc_pa_list);
@@ -291,6 +294,7 @@ void xprt_free_bc_rqst(struct rpc_rqst *req)
 	if (xprt_need_to_requeue(xprt)) {
 		list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
 		xprt->bc_alloc_count++;
+		atomic_inc(&xprt->bc_slot_count);
 		req = NULL;
 	}
 	spin_unlock_bh(&xprt->bc_pa_lock);
@@ -357,7 +361,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 
 	spin_lock(&xprt->bc_pa_lock);
 	list_del(&req->rq_bc_pa_list);
-	xprt_dec_alloc_count(xprt, 1);
+	xprt->bc_alloc_count--;
 	spin_unlock(&xprt->bc_pa_lock);
 
 	req->rq_private_buf.len = copied;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 383555d2b522..79c849391cb9 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1526,6 +1526,19 @@ size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
 
+unsigned int rpc_num_bc_slots(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt *xprt;
+	unsigned int ret;
+
+	rcu_read_lock();
+	xprt = rcu_dereference(clnt->cl_xprt);
+	ret = xprt->ops->bc_num_slots(xprt);
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_num_bc_slots);
+
 /**
  * rpc_force_rebind - force transport to check that remote port is unchanged
  * @clnt: client to rebind
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index e15cb704453e..220b79988000 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1595,7 +1595,7 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	/* Parse and execute the bc call */
 	proc_error = svc_process_common(rqstp, argv, resv);
 
-	atomic_inc(&req->rq_xprt->bc_free_slots);
+	atomic_dec(&req->rq_xprt->bc_slot_count);
 	if (!proc_error) {
 		/* Processing error: drop the request */
 		xprt_free_bc_request(req);
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index ce986591f213..59e624b1d7a0 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -52,6 +52,13 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
 	return maxmsg - RPCRDMA_HDRLEN_MIN;
 }
 
+unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	return r_xprt->rx_buf.rb_bc_srv_max_requests;
+}
+
 static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 {
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 4993aa49ecbe..52abddac19e5 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -812,6 +812,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	.bc_setup		= xprt_rdma_bc_setup,
 	.bc_maxpayload		= xprt_rdma_bc_maxpayload,
+	.bc_num_slots		= xprt_rdma_bc_max_slots,
 	.bc_free_rqst		= xprt_rdma_bc_free_rqst,
 	.bc_destroy		= xprt_rdma_bc_destroy,
 #endif
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 8378f45d2da7..92ce09fcea74 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -605,6 +605,7 @@ void xprt_rdma_cleanup(void);
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
 size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
+unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *);
 int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
 void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
 int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 3c2cc96afcaa..6b1fca51028a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2788,6 +2788,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = {
 #ifdef CONFIG_SUNRPC_BACKCHANNEL
 	.bc_setup		= xprt_setup_bc,
 	.bc_maxpayload		= xs_tcp_bc_maxpayload,
+	.bc_num_slots		= xprt_bc_max_slots,
 	.bc_free_rqst		= xprt_free_bc_rqst,
 	.bc_destroy		= xprt_destroy_bc,
 #endif
-- 
cgit v1.2.3


From 3193c0836f203a91bef96d88c64cccf0be090d9c Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 17 Jul 2019 20:36:45 -0500
Subject: bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()

On x86-64, with CONFIG_RETPOLINE=n, GCC's "global common subexpression
elimination" optimization results in ___bpf_prog_run()'s jumptable code
changing from this:

	select_insn:
		jmp *jumptable(, %rax, 8)
		...
	ALU64_ADD_X:
		...
		jmp *jumptable(, %rax, 8)
	ALU_ADD_X:
		...
		jmp *jumptable(, %rax, 8)

to this:

	select_insn:
		mov jumptable, %r12
		jmp *(%r12, %rax, 8)
		...
	ALU64_ADD_X:
		...
		jmp *(%r12, %rax, 8)
	ALU_ADD_X:
		...
		jmp *(%r12, %rax, 8)

The jumptable address is placed in a register once, at the beginning of
the function.  The function execution can then go through multiple
indirect jumps which rely on that same register value.  This has a few
issues:

1) Objtool isn't smart enough to be able to track such a register value
   across multiple recursive indirect jumps through the jump table.

2) With CONFIG_RETPOLINE enabled, this optimization actually results in
   a small slowdown.  I measured a ~4.7% slowdown in the test_bpf
   "tcpdump port 22" selftest.

   This slowdown is actually predicted by the GCC manual:

     Note: When compiling a program using computed gotos, a GCC
     extension, you may get better run-time performance if you
     disable the global common subexpression elimination pass by
     adding -fno-gcse to the command line.

So just disable the optimization for this function.

Fixes: e55a73251da3 ("bpf: Fix ORC unwinding in non-JIT BPF code")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/30c3ca29ba037afcbd860a8672eef0021addf9fe.1563413318.git.jpoimboe@redhat.com
---
 include/linux/compiler-gcc.h   | 2 ++
 include/linux/compiler_types.h | 4 ++++
 kernel/bpf/core.c              | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index e8579412ad21..d7ee4c6bad48 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -170,3 +170,5 @@
 #else
 #define __diag_GCC_8(s)
 #endif
+
+#define __no_fgcse __attribute__((optimize("-fno-gcse")))
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 095d55c3834d..599c27b56c29 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -189,6 +189,10 @@ struct ftrace_likely_data {
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
 
+#ifndef __no_fgcse
+# define __no_fgcse
+#endif
+
 /* Are two types/vars the same type (ignoring qualifiers)? */
 #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7e98f36a14e2..8191a7db2777 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1295,7 +1295,7 @@ bool bpf_opcode_in_insntable(u8 code)
  *
  * Decode and execute eBPF instructions.
  */
-static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
+static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
 {
 #define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y
 #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
-- 
cgit v1.2.3


From d5b9216fd5114be4ed98ca9c1ecc5f164cd8cf5e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 18 Jul 2019 09:32:17 -0400
Subject: pnfs/flexfiles: Add tracepoints for detecting pnfs fallback to MDS

Add tracepoints to allow debugging of the event chain leading to
a pnfs fallback to doing I/O through the MDS.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 26 ++++++++++++
 fs/nfs/nfs4trace.c                     |  8 ++++
 fs/nfs/nfs4trace.h                     | 76 +++++++++++++++++++++++++++++++++-
 fs/nfs/pnfs.c                          |  2 +
 include/linux/nfs4.h                   |  1 +
 5 files changed, 112 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index bcff3bf5ae09..b04e20d28162 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -934,6 +934,10 @@ out_nolseg:
 	if (pgio->pg_error < 0)
 		return;
 out_mds:
+	trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode,
+			0, NFS4_MAX_UINT64, IOMODE_READ,
+			NFS_I(pgio->pg_inode)->layout,
+			pgio->pg_lseg);
 	pnfs_put_lseg(pgio->pg_lseg);
 	pgio->pg_lseg = NULL;
 	nfs_pageio_reset_read_mds(pgio);
@@ -1000,6 +1004,10 @@ retry:
 	return;
 
 out_mds:
+	trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode,
+			0, NFS4_MAX_UINT64, IOMODE_RW,
+			NFS_I(pgio->pg_inode)->layout,
+			pgio->pg_lseg);
 	pnfs_put_lseg(pgio->pg_lseg);
 	pgio->pg_lseg = NULL;
 	nfs_pageio_reset_write_mds(pgio);
@@ -1026,6 +1034,10 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
 	if (pgio->pg_lseg)
 		return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
 
+	trace_pnfs_mds_fallback_pg_get_mirror_count(pgio->pg_inode,
+			0, NFS4_MAX_UINT64, IOMODE_RW,
+			NFS_I(pgio->pg_inode)->layout,
+			pgio->pg_lseg);
 	/* no lseg means that pnfs is not in use, so no mirroring here */
 	nfs_pageio_reset_write_mds(pgio);
 out:
@@ -1075,6 +1087,10 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
 			hdr->args.count,
 			(unsigned long long)hdr->args.offset);
 
+		trace_pnfs_mds_fallback_write_done(hdr->inode,
+				hdr->args.offset, hdr->args.count,
+				IOMODE_RW, NFS_I(hdr->inode)->layout,
+				hdr->lseg);
 		task->tk_status = pnfs_write_done_resend_to_mds(hdr);
 	}
 }
@@ -1094,6 +1110,10 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
 			hdr->args.count,
 			(unsigned long long)hdr->args.offset);
 
+		trace_pnfs_mds_fallback_read_done(hdr->inode,
+				hdr->args.offset, hdr->args.count,
+				IOMODE_READ, NFS_I(hdr->inode)->layout,
+				hdr->lseg);
 		task->tk_status = pnfs_read_done_resend_to_mds(hdr);
 	}
 }
@@ -1827,6 +1847,9 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
 out_failed:
 	if (ff_layout_avoid_mds_available_ds(lseg))
 		return PNFS_TRY_AGAIN;
+	trace_pnfs_mds_fallback_read_pagelist(hdr->inode,
+			hdr->args.offset, hdr->args.count,
+			IOMODE_READ, NFS_I(hdr->inode)->layout, lseg);
 	return PNFS_NOT_ATTEMPTED;
 }
 
@@ -1892,6 +1915,9 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
 out_failed:
 	if (ff_layout_avoid_mds_available_ds(lseg))
 		return PNFS_TRY_AGAIN;
+	trace_pnfs_mds_fallback_write_pagelist(hdr->inode,
+			hdr->args.offset, hdr->args.count,
+			IOMODE_RW, NFS_I(hdr->inode)->layout, lseg);
 	return PNFS_NOT_ATTEMPTED;
 }
 
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index e9fb3e50a999..1a8f376b3f73 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -16,4 +16,12 @@
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_get_mirror_count);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_pagelist);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist);
 #endif
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index d85f20945a2b..b2f395fa7350 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1771,6 +1771,7 @@ TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BLOCKED);
 TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_INVALID_OPEN);
 TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETRY);
 TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_EXIT);
 
 #define show_pnfs_update_layout_reason(reason)				\
 	__print_symbolic(reason,					\
@@ -1786,7 +1787,8 @@ TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
 		{ PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" },	\
 		{ PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" },	\
 		{ PNFS_UPDATE_LAYOUT_RETRY, "retrying" },	\
-		{ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
+		{ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }, \
+		{ PNFS_UPDATE_LAYOUT_EXIT, "exit" })
 
 TRACE_EVENT(pnfs_update_layout,
 		TP_PROTO(struct inode *inode,
@@ -1845,6 +1847,78 @@ TRACE_EVENT(pnfs_update_layout,
 		)
 );
 
+DECLARE_EVENT_CLASS(pnfs_layout_event,
+		TP_PROTO(struct inode *inode,
+			loff_t pos,
+			u64 count,
+			enum pnfs_iomode iomode,
+			struct pnfs_layout_hdr *lo,
+			struct pnfs_layout_segment *lseg
+		),
+		TP_ARGS(inode, pos, count, iomode, lo, lseg),
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u64, fileid)
+			__field(u32, fhandle)
+			__field(loff_t, pos)
+			__field(u64, count)
+			__field(enum pnfs_iomode, iomode)
+			__field(int, layoutstateid_seq)
+			__field(u32, layoutstateid_hash)
+			__field(long, lseg)
+		),
+		TP_fast_assign(
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->pos = pos;
+			__entry->count = count;
+			__entry->iomode = iomode;
+			if (lo != NULL) {
+				__entry->layoutstateid_seq =
+				be32_to_cpu(lo->plh_stateid.seqid);
+				__entry->layoutstateid_hash =
+				nfs_stateid_hash(&lo->plh_stateid);
+			} else {
+				__entry->layoutstateid_seq = 0;
+				__entry->layoutstateid_hash = 0;
+			}
+			__entry->lseg = (long)lseg;
+		),
+		TP_printk(
+			"fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"iomode=%s pos=%llu count=%llu "
+			"layoutstateid=%d:0x%08x lseg=0x%lx",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			show_pnfs_iomode(__entry->iomode),
+			(unsigned long long)__entry->pos,
+			(unsigned long long)__entry->count,
+			__entry->layoutstateid_seq, __entry->layoutstateid_hash,
+			__entry->lseg
+		)
+);
+
+#define DEFINE_PNFS_LAYOUT_EVENT(name) \
+	DEFINE_EVENT(pnfs_layout_event, name, \
+		TP_PROTO(struct inode *inode, \
+			loff_t pos, \
+			u64 count, \
+			enum pnfs_iomode iomode, \
+			struct pnfs_layout_hdr *lo, \
+			struct pnfs_layout_segment *lseg \
+		), \
+		TP_ARGS(inode, pos, count, iomode, lo, lseg))
+
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_read);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_write);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_get_mirror_count);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_done);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_done);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_pagelist);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_pagelist);
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* _TRACE_NFS4_H */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 758917463700..75bd5b552ba4 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2037,6 +2037,8 @@ lookup_again:
 out_put_layout_hdr:
 	if (first)
 		pnfs_clear_first_layoutget(lo);
+	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+				 PNFS_UPDATE_LAYOUT_EXIT);
 	pnfs_put_layout_hdr(lo);
 out:
 	dprintk("%s: inode %s/%llu pNFS layout segment %s for "
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 22494d170619..fd59904a282c 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -660,6 +660,7 @@ enum pnfs_update_layout_reason {
 	PNFS_UPDATE_LAYOUT_BLOCKED,
 	PNFS_UPDATE_LAYOUT_INVALID_OPEN,
 	PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET,
+	PNFS_UPDATE_LAYOUT_EXIT,
 };
 
 #define NFS4_OP_MAP_NUM_LONGS					\
-- 
cgit v1.2.3


From 80ec922dbd87fd38d15719c86a94457204648aeb Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:56:51 -0700
Subject: mm/memory_hotplug: allow arch_remove_memory() without
 CONFIG_MEMORY_HOTREMOVE

We want to improve error handling while adding memory by allowing to use
arch_remove_memory() and __remove_pages() even if
CONFIG_MEMORY_HOTREMOVE is not set to e.g., implement something like:

	arch_add_memory()
	rc = do_something();
	if (rc) {
		arch_remove_memory();
	}

We won't get rid of CONFIG_MEMORY_HOTREMOVE for now, as it will require
quite some dependencies for memory offlining.

Link: http://lkml.kernel.org/r/20190527111152.16324-7-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Mark Brown <broonie@kernel.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/mm/mmu.c            | 2 --
 arch/ia64/mm/init.c            | 2 --
 arch/powerpc/mm/mem.c          | 2 --
 arch/s390/mm/init.c            | 2 --
 arch/sh/mm/init.c              | 2 --
 arch/x86/mm/init_32.c          | 2 --
 arch/x86/mm/init_64.c          | 2 --
 drivers/base/memory.c          | 2 --
 include/linux/memory.h         | 2 --
 include/linux/memory_hotplug.h | 2 --
 mm/memory_hotplug.c            | 2 --
 mm/sparse.c                    | 6 ------
 12 files changed, 28 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index a21fa7e1167d..750a69dde39b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1074,7 +1074,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
 			   restrictions);
 }
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
@@ -1093,4 +1092,3 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 }
 #endif
-#endif
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index d28e29103bdb..aae75fd7b810 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -681,7 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return ret;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
@@ -693,4 +692,3 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 }
 #endif
-#endif
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 26a8da3723bb..9259337d7374 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -125,7 +125,6 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
 	return __add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void __ref arch_remove_memory(int nid, u64 start, u64 size,
 			     struct vmem_altmap *altmap)
 {
@@ -151,7 +150,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 		pr_warn("Hash collision while resizing HPT\n");
 }
 #endif
-#endif /* CONFIG_MEMORY_HOTPLUG */
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 void __init mem_topology_setup(void)
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 5b1ec2f532e0..4e5bbe328594 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -286,7 +286,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return rc;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
@@ -298,5 +297,4 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 	vmem_remove_mapping(start, size);
 }
-#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 13c6a6bb5fd9..dfdbaa50946e 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -429,7 +429,6 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
@@ -440,5 +439,4 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	zone = page_zone(pfn_to_page(start_pfn));
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 }
-#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index f265a4316179..4068abb9427f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -860,7 +860,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return __add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
@@ -872,7 +871,6 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 }
 #endif
-#endif
 
 int kernel_set_to_readonly __read_mostly;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 08bbf648827b..5a289a2ab108 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1198,7 +1198,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
 	remove_pagetable(start, end, false, altmap);
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static void __meminit
 kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 {
@@ -1219,7 +1218,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 	kernel_physical_mapping_remove(start, start + size);
 }
-#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 static struct kcore_list kcore_vsyscall;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index e0aa7f9abb36..92459d6f12be 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -723,7 +723,6 @@ out:
 	return ret;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static void
 unregister_memory(struct memory_block *memory)
 {
@@ -762,7 +761,6 @@ void unregister_memory_section(struct mem_section *section)
 out_unlock:
 	mutex_unlock(&mem_sysfs_mutex);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /* return true if the memory block is offlined, otherwise, return false */
 bool is_memblock_offlined(struct memory_block *mem)
diff --git a/include/linux/memory.h b/include/linux/memory.h
index e1dc1bb2b787..474c7c60c8f2 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -112,9 +112,7 @@ extern void unregister_memory_notifier(struct notifier_block *nb);
 extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
 int hotplug_memory_register(int nid, struct mem_section *section);
-#ifdef CONFIG_MEMORY_HOTREMOVE
 extern void unregister_memory_section(struct mem_section *);
-#endif
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
 extern int memory_isolate_notify(unsigned long val, void *v);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 988fde33cd7f..87bf9c4a889e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -123,12 +123,10 @@ static inline bool movable_node_is_enabled(void)
 	return movable_node_enabled;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 extern void arch_remove_memory(int nid, u64 start, u64 size,
 			       struct vmem_altmap *altmap);
 extern void __remove_pages(struct zone *zone, unsigned long start_pfn,
 			   unsigned long nr_pages, struct vmem_altmap *altmap);
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /*
  * Do we want sysfs memblock files created. This will allow userspace to online
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a8c25fd85ee3..bc11888d5d7e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -318,7 +318,6 @@ out:
 	return err;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */
 static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
 				     unsigned long start_pfn,
@@ -580,7 +579,6 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 
 	set_zone_contiguous(zone);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 
 int set_online_page_callback(online_page_callback_t callback)
 {
diff --git a/mm/sparse.c b/mm/sparse.c
index fd13166949b5..d1d5e05f5b8d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -604,7 +604,6 @@ static void __kfree_section_memmap(struct page *memmap,
 
 	vmemmap_free(start, end, altmap);
 }
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static void free_map_bootmem(struct page *memmap)
 {
 	unsigned long start = (unsigned long)memmap;
@@ -612,7 +611,6 @@ static void free_map_bootmem(struct page *memmap)
 
 	vmemmap_free(start, end, NULL);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 #else
 static struct page *__kmalloc_section_memmap(void)
 {
@@ -651,7 +649,6 @@ static void __kfree_section_memmap(struct page *memmap,
 			   get_order(sizeof(struct page) * PAGES_PER_SECTION));
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static void free_map_bootmem(struct page *memmap)
 {
 	unsigned long maps_section_nr, removing_section_nr, i;
@@ -681,7 +678,6 @@ static void free_map_bootmem(struct page *memmap)
 			put_page_bootmem(page);
 	}
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 /**
@@ -746,7 +742,6 @@ out:
 	return ret;
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 #ifdef CONFIG_MEMORY_FAILURE
 static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 {
@@ -823,5 +818,4 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
 			PAGES_PER_SECTION - map_offset);
 	free_section_usemap(memmap, usemap, altmap);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From db051a0dac13db24d58470d75cee0ce7c6b031a1 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:56:56 -0700
Subject: mm/memory_hotplug: create memory block devices after
 arch_add_memory()

Only memory to be added to the buddy and to be onlined/offlined by user
space using /sys/devices/system/memory/...  needs (and should have!)
memory block devices.

Factor out creation of memory block devices.  Create all devices after
arch_add_memory() succeeded.  We can later drop the want_memblock
parameter, because it is now effectively stale.

Only after memory block devices have been added, memory can be onlined
by user space.  This implies, that memory is not visible to user space
at all before arch_add_memory() succeeded.

While at it
 - use WARN_ON_ONCE instead of BUG_ON in moved unregister_memory()
 - introduce find_memory_block_by_id() to search via block id
 - Use find_memory_block_by_id() in init_memory_block() to catch
   duplicates

Link: http://lkml.kernel.org/r/20190527111152.16324-8-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Qian Cai <cai@lca.pw>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c  | 82 +++++++++++++++++++++++++++++++++-----------------
 include/linux/memory.h |  2 +-
 mm/memory_hotplug.c    | 15 ++++-----
 3 files changed, 63 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 92459d6f12be..18a30c3ac0ef 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -39,6 +39,11 @@ static inline int base_memory_block_id(int section_nr)
 	return section_nr / sections_per_block;
 }
 
+static inline int pfn_to_block_id(unsigned long pfn)
+{
+	return base_memory_block_id(pfn_to_section_nr(pfn));
+}
+
 static int memory_subsys_online(struct device *dev);
 static int memory_subsys_offline(struct device *dev);
 
@@ -582,10 +587,9 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
  * A reference for the returned object is held and the reference for the
  * hinted object is released.
  */
-struct memory_block *find_memory_block_hinted(struct mem_section *section,
-					      struct memory_block *hint)
+static struct memory_block *find_memory_block_by_id(int block_id,
+						    struct memory_block *hint)
 {
-	int block_id = base_memory_block_id(__section_nr(section));
 	struct device *hintdev = hint ? &hint->dev : NULL;
 	struct device *dev;
 
@@ -597,6 +601,14 @@ struct memory_block *find_memory_block_hinted(struct mem_section *section,
 	return to_memory_block(dev);
 }
 
+struct memory_block *find_memory_block_hinted(struct mem_section *section,
+					      struct memory_block *hint)
+{
+	int block_id = base_memory_block_id(__section_nr(section));
+
+	return find_memory_block_by_id(block_id, hint);
+}
+
 /*
  * For now, we have a linear search to go find the appropriate
  * memory_block corresponding to a particular phys_index. If
@@ -658,6 +670,11 @@ static int init_memory_block(struct memory_block **memory, int block_id,
 	unsigned long start_pfn;
 	int ret = 0;
 
+	mem = find_memory_block_by_id(block_id, NULL);
+	if (mem) {
+		put_device(&mem->dev);
+		return -EEXIST;
+	}
 	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
 	if (!mem)
 		return -ENOMEM;
@@ -695,44 +712,53 @@ static int add_memory_block(int base_section_nr)
 	return 0;
 }
 
+static void unregister_memory(struct memory_block *memory)
+{
+	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
+		return;
+
+	/* drop the ref. we got via find_memory_block() */
+	put_device(&memory->dev);
+	device_unregister(&memory->dev);
+}
+
 /*
- * need an interface for the VM to add new memory regions,
- * but without onlining it.
+ * Create memory block devices for the given memory area. Start and size
+ * have to be aligned to memory block granularity. Memory block devices
+ * will be initialized as offline.
  */
-int hotplug_memory_register(int nid, struct mem_section *section)
+int create_memory_block_devices(unsigned long start, unsigned long size)
 {
-	int block_id = base_memory_block_id(__section_nr(section));
-	int ret = 0;
+	const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
+	int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
 	struct memory_block *mem;
+	unsigned long block_id;
+	int ret = 0;
 
-	mutex_lock(&mem_sysfs_mutex);
+	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
+			 !IS_ALIGNED(size, memory_block_size_bytes())))
+		return -EINVAL;
 
-	mem = find_memory_block(section);
-	if (mem) {
-		mem->section_count++;
-		put_device(&mem->dev);
-	} else {
+	mutex_lock(&mem_sysfs_mutex);
+	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
 		ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
 		if (ret)
-			goto out;
-		mem->section_count++;
+			break;
+		mem->section_count = sections_per_block;
+	}
+	if (ret) {
+		end_block_id = block_id;
+		for (block_id = start_block_id; block_id != end_block_id;
+		     block_id++) {
+			mem = find_memory_block_by_id(block_id, NULL);
+			mem->section_count = 0;
+			unregister_memory(mem);
+		}
 	}
-
-out:
 	mutex_unlock(&mem_sysfs_mutex);
 	return ret;
 }
 
-static void
-unregister_memory(struct memory_block *memory)
-{
-	BUG_ON(memory->dev.bus != &memory_subsys);
-
-	/* drop the ref. we got via find_memory_block() */
-	put_device(&memory->dev);
-	device_unregister(&memory->dev);
-}
-
 void unregister_memory_section(struct mem_section *section)
 {
 	struct memory_block *mem;
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 474c7c60c8f2..db3e8567f900 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -111,7 +111,7 @@ extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
 extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
-int hotplug_memory_register(int nid, struct mem_section *section);
+int create_memory_block_devices(unsigned long start, unsigned long size);
 extern void unregister_memory_section(struct mem_section *);
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index bc11888d5d7e..78291526eb4d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -259,13 +259,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
 		return -EEXIST;
 
 	ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
-	if (ret < 0)
-		return ret;
-
-	if (!want_memblock)
-		return 0;
-
-	return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
+	return ret < 0 ? ret : 0;
 }
 
 /*
@@ -1105,6 +1099,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
 	if (ret < 0)
 		goto error;
 
+	/* create memory block devices after memory was added */
+	ret = create_memory_block_devices(start, size);
+	if (ret) {
+		arch_remove_memory(nid, start, size, NULL);
+		goto error;
+	}
+
 	if (new_node) {
 		/* If sysfs file of new node can't be created, cpu on the node
 		 * can't be hot-added. There is no rollback way now.
-- 
cgit v1.2.3


From 05f800a0bd08e14606ac63e0a5c63ed6880acaab Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:01 -0700
Subject: mm/memory_hotplug: drop MHP_MEMBLOCK_API

No longer needed, the callers of arch_add_memory() can handle this
manually.

Link: http://lkml.kernel.org/r/20190527111152.16324-9-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Wei Yang <richardw.yang@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Qian Cai <cai@lca.pw>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 8 --------
 mm/memory_hotplug.c            | 9 +++------
 2 files changed, 3 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 87bf9c4a889e..36c514b80cf1 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -128,14 +128,6 @@ extern void arch_remove_memory(int nid, u64 start, u64 size,
 extern void __remove_pages(struct zone *zone, unsigned long start_pfn,
 			   unsigned long nr_pages, struct vmem_altmap *altmap);
 
-/*
- * Do we want sysfs memblock files created. This will allow userspace to online
- * and offline memory explicitly. Lack of this bit means that the caller has to
- * call move_pfn_range_to_zone to finish the initialization.
- */
-
-#define MHP_MEMBLOCK_API               (1<<0)
-
 /* reasonably generic interface to expand the physical pages */
 extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
 		       struct mhp_restrictions *restrictions);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 78291526eb4d..fb9dc3fa1138 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -251,7 +251,7 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
 
 static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
-		struct vmem_altmap *altmap, bool want_memblock)
+				   struct vmem_altmap *altmap)
 {
 	int ret;
 
@@ -294,8 +294,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
 	}
 
 	for (i = start_sec; i <= end_sec; i++) {
-		err = __add_section(nid, section_nr_to_pfn(i), altmap,
-				restrictions->flags & MHP_MEMBLOCK_API);
+		err = __add_section(nid, section_nr_to_pfn(i), altmap);
 
 		/*
 		 * EEXIST is finally dealt with by ioresource collision
@@ -1065,9 +1064,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
  */
 int __ref add_memory_resource(int nid, struct resource *res)
 {
-	struct mhp_restrictions restrictions = {
-		.flags = MHP_MEMBLOCK_API,
-	};
+	struct mhp_restrictions restrictions = {};
 	u64 start, size;
 	bool new_node = false;
 	int ret;
-- 
cgit v1.2.3


From 4c4b7f9ba9486c565aead99a198ceeef73ae81f6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:06 -0700
Subject: mm/memory_hotplug: remove memory block devices before
 arch_remove_memory()

Let's factor out removing of memory block devices, which is only
necessary for memory added via add_memory() and friends that created
memory block devices.  Remove the devices before calling
arch_remove_memory().

This finishes factoring out memory block device handling from
arch_add_memory() and arch_remove_memory().

Link: http://lkml.kernel.org/r/20190527111152.16324-10-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Mark Brown <broonie@kernel.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c  | 37 ++++++++++++++++++-------------------
 drivers/base/node.c    | 11 ++++++-----
 include/linux/memory.h |  2 +-
 include/linux/node.h   |  6 ++----
 mm/memory_hotplug.c    |  5 +++--
 5 files changed, 30 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 18a30c3ac0ef..826dd76f662e 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -759,32 +759,31 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
 	return ret;
 }
 
-void unregister_memory_section(struct mem_section *section)
+/*
+ * Remove memory block devices for the given memory area. Start and size
+ * have to be aligned to memory block granularity. Memory block devices
+ * have to be offline.
+ */
+void remove_memory_block_devices(unsigned long start, unsigned long size)
 {
+	const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
+	const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
 	struct memory_block *mem;
+	int block_id;
 
-	if (WARN_ON_ONCE(!present_section(section)))
+	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
+			 !IS_ALIGNED(size, memory_block_size_bytes())))
 		return;
 
 	mutex_lock(&mem_sysfs_mutex);
-
-	/*
-	 * Some users of the memory hotplug do not want/need memblock to
-	 * track all sections. Skip over those.
-	 */
-	mem = find_memory_block(section);
-	if (!mem)
-		goto out_unlock;
-
-	unregister_mem_sect_under_nodes(mem, __section_nr(section));
-
-	mem->section_count--;
-	if (mem->section_count == 0)
+	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
+		mem = find_memory_block_by_id(block_id, NULL);
+		if (WARN_ON_ONCE(!mem))
+			continue;
+		mem->section_count = 0;
+		unregister_memory_block_under_nodes(mem);
 		unregister_memory(mem);
-	else
-		put_device(&mem->dev);
-
-out_unlock:
+	}
 	mutex_unlock(&mem_sysfs_mutex);
 }
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index aa878fbcf705..0b0f38c2c7cd 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -802,9 +802,10 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
 	return 0;
 }
 
-/* unregister memory section under all nodes that it spans */
-int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
-				    unsigned long phys_index)
+/*
+ * Unregister memory block device under all nodes that it spans.
+ */
+int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
 	NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
 	unsigned long pfn, sect_start_pfn, sect_end_pfn;
@@ -817,8 +818,8 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
 		return -ENOMEM;
 	nodes_clear(*unlinked_nodes);
 
-	sect_start_pfn = section_nr_to_pfn(phys_index);
-	sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
+	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
+	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
 	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
 		int nid;
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index db3e8567f900..f26a5417ec5d 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -112,7 +112,7 @@ extern void unregister_memory_notifier(struct notifier_block *nb);
 extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
 int create_memory_block_devices(unsigned long start, unsigned long size);
-extern void unregister_memory_section(struct mem_section *);
+void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
 extern int memory_isolate_notify(unsigned long val, void *v);
diff --git a/include/linux/node.h b/include/linux/node.h
index 1a557c589ecb..02a29e71b175 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -139,8 +139,7 @@ extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int register_mem_sect_under_node(struct memory_block *mem_blk,
 						void *arg);
-extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
-					   unsigned long phys_index);
+extern int unregister_memory_block_under_nodes(struct memory_block *mem_blk);
 
 extern int register_memory_node_under_compute_node(unsigned int mem_nid,
 						   unsigned int cpu_nid,
@@ -176,8 +175,7 @@ static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
 {
 	return 0;
 }
-static inline int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
-						  unsigned long phys_index)
+static inline int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
 	return 0;
 }
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fb9dc3fa1138..37c861e7a717 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -520,8 +520,6 @@ static void __remove_section(struct zone *zone, struct mem_section *ms,
 	if (WARN_ON_ONCE(!valid_section(ms)))
 		return;
 
-	unregister_memory_section(ms);
-
 	scn_nr = __section_nr(ms);
 	start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
 	__remove_zone(zone, start_pfn);
@@ -1834,6 +1832,9 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 	memblock_free(start, size);
 	memblock_remove(start, size);
 
+	/* remove memory block devices before removing memory */
+	remove_memory_block_devices(start, size);
+
 	arch_remove_memory(nid, start, size, NULL);
 	__release_memory_resource(start, size);
 
-- 
cgit v1.2.3


From a31b264c2b415b29660da0bc2ba291a98629ce51 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:12 -0700
Subject: mm/memory_hotplug: make unregister_memory_block_under_nodes() never
 fail

We really don't want anything during memory hotunplug to fail.  We
always pass a valid memory block device, that check can go.  Avoid
allocating memory and eventually failing.  As we are always called under
lock, we can use a static piece of memory.  This avoids having to put
the structure onto the stack, having to guess about the stack size of
callers.

Patch inspired by a patch from Oscar Salvador.

In the future, there might be no need to iterate over nodes at all.
mem->nid should tell us exactly what to remove.  Memory block devices
with mixed nodes (added during boot) should properly fenced off and
never removed.

Link: http://lkml.kernel.org/r/20190527111152.16324-11-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Wei Yang <richardw.yang@linux.intel.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Mark Brown <broonie@kernel.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c  | 18 +++++-------------
 include/linux/node.h |  5 ++---
 2 files changed, 7 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0b0f38c2c7cd..beec80649b33 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -804,20 +804,14 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
 
 /*
  * Unregister memory block device under all nodes that it spans.
+ * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
  */
-int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
+void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
-	NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
 	unsigned long pfn, sect_start_pfn, sect_end_pfn;
+	static nodemask_t unlinked_nodes;
 
-	if (!mem_blk) {
-		NODEMASK_FREE(unlinked_nodes);
-		return -EFAULT;
-	}
-	if (!unlinked_nodes)
-		return -ENOMEM;
-	nodes_clear(*unlinked_nodes);
-
+	nodes_clear(unlinked_nodes);
 	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
 	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
 	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
@@ -828,15 +822,13 @@ int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 			continue;
 		if (!node_online(nid))
 			continue;
-		if (node_test_and_set(nid, *unlinked_nodes))
+		if (node_test_and_set(nid, unlinked_nodes))
 			continue;
 		sysfs_remove_link(&node_devices[nid]->dev.kobj,
 			 kobject_name(&mem_blk->dev.kobj));
 		sysfs_remove_link(&mem_blk->dev.kobj,
 			 kobject_name(&node_devices[nid]->dev.kobj));
 	}
-	NODEMASK_FREE(unlinked_nodes);
-	return 0;
 }
 
 int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
diff --git a/include/linux/node.h b/include/linux/node.h
index 02a29e71b175..548c226966a2 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -139,7 +139,7 @@ extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int register_mem_sect_under_node(struct memory_block *mem_blk,
 						void *arg);
-extern int unregister_memory_block_under_nodes(struct memory_block *mem_blk);
+extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
 
 extern int register_memory_node_under_compute_node(unsigned int mem_nid,
 						   unsigned int cpu_nid,
@@ -175,9 +175,8 @@ static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
 {
 	return 0;
 }
-static inline int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
+static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
-	return 0;
 }
 
 static inline void register_hugetlbfs_with_node(node_registration_func_t reg,
-- 
cgit v1.2.3


From b9bf8d342d9b443c0d19aa57883d8ddb38d965de Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:17 -0700
Subject: mm/memory_hotplug: remove "zone" parameter from
 sparse_remove_one_section

The parameter is unused, so let's drop it.  Memory removal paths should
never care about zones.  This is the job of memory offlining and will
require more refactorings.

Link: http://lkml.kernel.org/r/20190527111152.16324-12-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Wei Yang <richardw.yang@linux.intel.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qian Cai <cai@lca.pw>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 2 +-
 mm/memory_hotplug.c            | 2 +-
 mm/sparse.c                    | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 36c514b80cf1..79e0add6a597 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -350,7 +350,7 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 extern bool is_memblock_offlined(struct memory_block *mem);
 extern int sparse_add_one_section(int nid, unsigned long start_pfn,
 				  struct vmem_altmap *altmap);
-extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
+extern void sparse_remove_one_section(struct mem_section *ms,
 		unsigned long map_offset, struct vmem_altmap *altmap);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 37c861e7a717..d1d0ceaaca88 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -524,7 +524,7 @@ static void __remove_section(struct zone *zone, struct mem_section *ms,
 	start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
 	__remove_zone(zone, start_pfn);
 
-	sparse_remove_one_section(zone, ms, map_offset, altmap);
+	sparse_remove_one_section(ms, map_offset, altmap);
 }
 
 /**
diff --git a/mm/sparse.c b/mm/sparse.c
index d1d5e05f5b8d..1552c855d62a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -800,8 +800,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap,
 		free_map_bootmem(memmap);
 }
 
-void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
-		unsigned long map_offset, struct vmem_altmap *altmap)
+void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
+			       struct vmem_altmap *altmap)
 {
 	struct page *memmap = NULL;
 	unsigned long *usemap = NULL;
-- 
cgit v1.2.3


From 43675e6fbbeadca90c6c5031557ff95e217e6d2f Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@linux.alibaba.com>
Date: Thu, 18 Jul 2019 15:57:24 -0700
Subject: mm: thp: make transhuge_vma_suitable available for anonymous THP

transhuge_vma_suitable() was only available for shmem THP, but anonymous
THP has the same check except pgoff check.  And, it will be used for THP
eligible check in the later patch, so make it available for all kind of
THPs.  This also helps reduce code duplication slightly.

Since anonymous THP doesn't have to check pgoff, so make pgoff check
shmem vma only.

And regroup some functions in include/linux/mm.h to solve compile issue
since transhuge_vma_suitable() needs call vma_is_anonymous() which was
defined after huge_mm.h is included.

[akpm@linux-foundation.org: fix typo]
[yang.shi@linux.alibaba.com: v4]
  Link: http://lkml.kernel.org/r/1563400758-124759-2-git-send-email-yang.shi@linux.alibaba.com
Link: http://lkml.kernel.org/r/1560401041-32207-2-git-send-email-yang.shi@linux.alibaba.com
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 23 +++++++++++++++++++++++
 include/linux/mm.h      | 34 +++++++++++++++++-----------------
 mm/huge_memory.c        |  2 +-
 mm/memory.c             | 13 -------------
 4 files changed, 41 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7cd5c150c21d..45ede62aa85b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -121,6 +121,23 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 
 bool transparent_hugepage_enabled(struct vm_area_struct *vma);
 
+#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
+
+static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
+		unsigned long haddr)
+{
+	/* Don't have to check pgoff for anonymous vma */
+	if (!vma_is_anonymous(vma)) {
+		if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
+			(vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
+			return false;
+	}
+
+	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+		return false;
+	return true;
+}
+
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
 	 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
@@ -271,6 +288,12 @@ static inline bool transparent_hugepage_enabled(struct vm_area_struct *vma)
 	return false;
 }
 
+static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
+		unsigned long haddr)
+{
+	return false;
+}
+
 static inline void prep_transhuge_page(struct page *page) {}
 
 #define transparent_hugepage_flags 0UL
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bd6512559bed..48ab7b982d82 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -541,6 +541,23 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
 	vma->vm_ops = NULL;
 }
 
+static inline bool vma_is_anonymous(struct vm_area_struct *vma)
+{
+	return !vma->vm_ops;
+}
+
+#ifdef CONFIG_SHMEM
+/*
+ * The vma_is_shmem is not inline because it is used only by slow
+ * paths in userfault.
+ */
+bool vma_is_shmem(struct vm_area_struct *vma);
+#else
+static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
+#endif
+
+int vma_is_stack_for_current(struct vm_area_struct *vma);
+
 /* flush_tlb_range() takes a vma, not a mm, and can care about flags */
 #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }
 
@@ -1620,23 +1637,6 @@ int clear_page_dirty_for_io(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 
-static inline bool vma_is_anonymous(struct vm_area_struct *vma)
-{
-	return !vma->vm_ops;
-}
-
-#ifdef CONFIG_SHMEM
-/*
- * The vma_is_shmem is not inline because it is used only by slow
- * paths in userfault.
- */
-bool vma_is_shmem(struct vm_area_struct *vma);
-#else
-static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
-#endif
-
-int vma_is_stack_for_current(struct vm_area_struct *vma);
-
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 885642c82aaa..782dd1446a3e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -689,7 +689,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 
-	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+	if (!transhuge_vma_suitable(vma, haddr))
 		return VM_FAULT_FALLBACK;
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
diff --git a/mm/memory.c b/mm/memory.c
index 89325f9c6173..e2bb51b6242e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3162,19 +3162,6 @@ map_pte:
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
-
-#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
-		unsigned long haddr)
-{
-	if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
-			(vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
-		return false;
-	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
-		return false;
-	return true;
-}
-
 static void deposit_prealloc_pte(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-- 
cgit v1.2.3


From 2491f0a2c0b117b9097e9c9eee0c21f2e5f716d7 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:37 -0700
Subject: mm: section numbers use the type "unsigned long"

Patch series "mm: Further memory block device cleanups", v1.

Some further cleanups around memory block devices.  Especially, clean up
and simplify walk_memory_range().  Including some other minor cleanups.

This patch (of 6):

We are using a mixture of "int" and "unsigned long".  Let's make this
consistent by using "unsigned long" everywhere.  We'll do the same with
memory block ids next.

While at it, turn the "unsigned long i" in removable_show() into an int
- sections_per_block is an int.

[akpm@linux-foundation.org: s/unsigned long i/unsigned long nr/]
[david@redhat.com: v3]
  Link: http://lkml.kernel.org/r/20190620183139.4352-2-david@redhat.com
Link: http://lkml.kernel.org/r/20190614100114.311-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c  | 27 +++++++++++++--------------
 include/linux/mmzone.h |  4 ++--
 mm/sparse.c            | 12 ++++++------
 3 files changed, 21 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 826dd76f662e..5947b5a5686d 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -34,7 +34,7 @@ static DEFINE_MUTEX(mem_sysfs_mutex);
 
 static int sections_per_block;
 
-static inline int base_memory_block_id(int section_nr)
+static inline int base_memory_block_id(unsigned long section_nr)
 {
 	return section_nr / sections_per_block;
 }
@@ -131,9 +131,9 @@ static ssize_t phys_index_show(struct device *dev,
 static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
 			      char *buf)
 {
-	unsigned long i, pfn;
-	int ret = 1;
 	struct memory_block *mem = to_memory_block(dev);
+	unsigned long pfn;
+	int ret = 1, i;
 
 	if (mem->state != MEM_ONLINE)
 		goto out;
@@ -691,15 +691,15 @@ static int init_memory_block(struct memory_block **memory, int block_id,
 	return ret;
 }
 
-static int add_memory_block(int base_section_nr)
+static int add_memory_block(unsigned long base_section_nr)
 {
+	int ret, section_count = 0;
 	struct memory_block *mem;
-	int i, ret, section_count = 0;
+	unsigned long nr;
 
-	for (i = base_section_nr;
-	     i < base_section_nr + sections_per_block;
-	     i++)
-		if (present_section_nr(i))
+	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
+	     nr++)
+		if (present_section_nr(nr))
 			section_count++;
 
 	if (section_count == 0)
@@ -822,10 +822,9 @@ static const struct attribute_group *memory_root_attr_groups[] = {
  */
 int __init memory_dev_init(void)
 {
-	unsigned int i;
 	int ret;
 	int err;
-	unsigned long block_sz;
+	unsigned long block_sz, nr;
 
 	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
 	if (ret)
@@ -839,9 +838,9 @@ int __init memory_dev_init(void)
 	 * during boot and have been initialized
 	 */
 	mutex_lock(&mem_sysfs_mutex);
-	for (i = 0; i <= __highest_present_section_nr;
-		i += sections_per_block) {
-		err = add_memory_block(i);
+	for (nr = 0; nr <= __highest_present_section_nr;
+	     nr += sections_per_block) {
+		err = add_memory_block(nr);
 		if (!ret)
 			ret = err;
 	}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 70394cabaf4e..298d1c3e4c2e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1219,7 +1219,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
 		return NULL;
 	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
-extern int __section_nr(struct mem_section* ms);
+extern unsigned long __section_nr(struct mem_section *ms);
 extern unsigned long usemap_size(void);
 
 /*
@@ -1291,7 +1291,7 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 	return __nr_to_section(pfn_to_section_nr(pfn));
 }
 
-extern int __highest_present_section_nr;
+extern unsigned long __highest_present_section_nr;
 
 #ifndef CONFIG_HAVE_ARCH_PFN_VALID
 static inline int pfn_valid(unsigned long pfn)
diff --git a/mm/sparse.c b/mm/sparse.c
index fe44b2d3bd7e..b29534cea8c0 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -102,7 +102,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
 #endif
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
 {
 	unsigned long root_nr;
 	struct mem_section *root = NULL;
@@ -121,9 +121,9 @@ int __section_nr(struct mem_section* ms)
 	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
 }
 #else
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
 {
-	return (int)(ms - mem_section[0]);
+	return (unsigned long)(ms - mem_section[0]);
 }
 #endif
 
@@ -178,10 +178,10 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
  * Keeping track of this gives us an easy way to break out of
  * those loops early.
  */
-int __highest_present_section_nr;
+unsigned long __highest_present_section_nr;
 static void section_mark_present(struct mem_section *ms)
 {
-	int section_nr = __section_nr(ms);
+	unsigned long section_nr = __section_nr(ms);
 
 	if (section_nr > __highest_present_section_nr)
 		__highest_present_section_nr = section_nr;
@@ -189,7 +189,7 @@ static void section_mark_present(struct mem_section *ms)
 	ms->section_mem_map |= SECTION_MARKED_PRESENT;
 }
 
-static inline int next_present_section_nr(int section_nr)
+static inline unsigned long next_present_section_nr(unsigned long section_nr)
 {
 	do {
 		section_nr++;
-- 
cgit v1.2.3


From 8d595c4c0f768f19db043d378b22e98405f9fd47 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:43 -0700
Subject: mm: make register_mem_sect_under_node() static

It is only used internally.

Link: http://lkml.kernel.org/r/20190614100114.311-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c  | 3 ++-
 include/linux/node.h | 7 -------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index beec80649b33..27391f1e8f60 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -753,7 +753,8 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
 }
 
 /* register memory section under specified node if it spans that node */
-int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
+static int register_mem_sect_under_node(struct memory_block *mem_blk,
+					 void *arg)
 {
 	int ret, nid = *(int *)arg;
 	unsigned long pfn, sect_start_pfn, sect_end_pfn;
diff --git a/include/linux/node.h b/include/linux/node.h
index 548c226966a2..4866f32a02d8 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -137,8 +137,6 @@ static inline int register_one_node(int nid)
 extern void unregister_one_node(int nid);
 extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
-extern int register_mem_sect_under_node(struct memory_block *mem_blk,
-						void *arg);
 extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
 
 extern int register_memory_node_under_compute_node(unsigned int mem_nid,
@@ -170,11 +168,6 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
 {
 	return 0;
 }
-static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
-							void *arg)
-{
-	return 0;
-}
 static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
 }
-- 
cgit v1.2.3


From fbcf73ce65827c3d8935f38b832a43153a0c78d1 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:46 -0700
Subject: mm/memory_hotplug: rename walk_memory_range() and pass start+size
 instead of pfns

walk_memory_range() was once used to iterate over sections.  Now, it
iterates over memory blocks.  Rename the function, fixup the
documentation.

Also, pass start+size instead of PFNs, which is what most callers
already have at hand.  (we'll rework link_mem_sections() most probably
soon)

Follow-up patches will rework, simplify, and move walk_memory_blocks()
to drivers/base/memory.c.

Note: walk_memory_blocks() only works correctly right now if the
start_pfn is aligned to a section start.  This is the case right now,
but we'll generalize the function in a follow up patch so the semantics
match the documentation.

[akpm@linux-foundation.org: remove unused variable]
Link: http://lkml.kernel.org/r/20190614100114.311-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Len Brown <lenb@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Rashmica Gupta <rashmica.g@gmail.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Qian Cai <cai@lca.pw>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/platforms/powernv/memtrace.c | 23 +++++++++++------------
 drivers/acpi/acpi_memhotplug.c            | 19 ++++---------------
 drivers/base/node.c                       |  5 +++--
 include/linux/memory_hotplug.h            |  2 +-
 mm/memory_hotplug.c                       | 24 +++++++++++++-----------
 5 files changed, 32 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 5e53c1392d3b..eb2e75dac369 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -70,23 +70,23 @@ static int change_memblock_state(struct memory_block *mem, void *arg)
 /* called with device_hotplug_lock held */
 static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
 {
-	u64 end_pfn = start_pfn + nr_pages - 1;
+	const unsigned long start = PFN_PHYS(start_pfn);
+	const unsigned long size = PFN_PHYS(nr_pages);
 
-	if (walk_memory_range(start_pfn, end_pfn, NULL,
-	    check_memblock_online))
+	if (walk_memory_blocks(start, size, NULL, check_memblock_online))
 		return false;
 
-	walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
-			  change_memblock_state);
+	walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
+			   change_memblock_state);
 
 	if (offline_pages(start_pfn, nr_pages)) {
-		walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
-				  change_memblock_state);
+		walk_memory_blocks(start, size, (void *)MEM_ONLINE,
+				   change_memblock_state);
 		return false;
 	}
 
-	walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
-			  change_memblock_state);
+	walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
+			   change_memblock_state);
 
 
 	return true;
@@ -242,9 +242,8 @@ static int memtrace_online(void)
 		 */
 		if (!memhp_auto_online) {
 			lock_device_hotplug();
-			walk_memory_range(PFN_DOWN(ent->start),
-					  PFN_UP(ent->start + ent->size - 1),
-					  NULL, online_mem_block);
+			walk_memory_blocks(ent->start, ent->size, NULL,
+					   online_mem_block);
 			unlock_device_hotplug();
 		}
 
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index db013dc21c02..e294f44a7850 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -155,16 +155,6 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
 	return 0;
 }
 
-static unsigned long acpi_meminfo_start_pfn(struct acpi_memory_info *info)
-{
-	return PFN_DOWN(info->start_addr);
-}
-
-static unsigned long acpi_meminfo_end_pfn(struct acpi_memory_info *info)
-{
-	return PFN_UP(info->start_addr + info->length-1);
-}
-
 static int acpi_bind_memblk(struct memory_block *mem, void *arg)
 {
 	return acpi_bind_one(&mem->dev, arg);
@@ -173,9 +163,8 @@ static int acpi_bind_memblk(struct memory_block *mem, void *arg)
 static int acpi_bind_memory_blocks(struct acpi_memory_info *info,
 				   struct acpi_device *adev)
 {
-	return walk_memory_range(acpi_meminfo_start_pfn(info),
-				 acpi_meminfo_end_pfn(info), adev,
-				 acpi_bind_memblk);
+	return walk_memory_blocks(info->start_addr, info->length, adev,
+				  acpi_bind_memblk);
 }
 
 static int acpi_unbind_memblk(struct memory_block *mem, void *arg)
@@ -186,8 +175,8 @@ static int acpi_unbind_memblk(struct memory_block *mem, void *arg)
 
 static void acpi_unbind_memory_blocks(struct acpi_memory_info *info)
 {
-	walk_memory_range(acpi_meminfo_start_pfn(info),
-			  acpi_meminfo_end_pfn(info), NULL, acpi_unbind_memblk);
+	walk_memory_blocks(info->start_addr, info->length, NULL,
+			   acpi_unbind_memblk);
 }
 
 static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 27391f1e8f60..75b7e6f6535b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -834,8 +834,9 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 
 int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
 {
-	return walk_memory_range(start_pfn, end_pfn, (void *)&nid,
-					register_mem_sect_under_node);
+	return walk_memory_blocks(PFN_PHYS(start_pfn),
+				  PFN_PHYS(end_pfn - start_pfn), (void *)&nid,
+				  register_mem_sect_under_node);
 }
 
 #ifdef CONFIG_HUGETLBFS
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 79e0add6a597..d9fffc34949f 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -340,7 +340,7 @@ static inline void __remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 extern void __ref free_area_init_core_hotplug(int nid);
-extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+extern int walk_memory_blocks(unsigned long start, unsigned long size,
 		void *arg, int (*func)(struct memory_block *, void *));
 extern int __add_memory(int nid, u64 start, u64 size);
 extern int add_memory(int nid, u64 start, u64 size);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d1d0ceaaca88..b3ef84e408fa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1124,8 +1124,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
 
 	/* online pages if requested */
 	if (memhp_auto_online)
-		walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
-				  NULL, online_memory_block);
+		walk_memory_blocks(start, size, NULL, online_memory_block);
 
 	return ret;
 error:
@@ -1663,20 +1662,24 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /**
- * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
- * @start_pfn: start pfn of the memory range
- * @end_pfn: end pfn of the memory range
+ * walk_memory_blocks - walk through all present memory blocks overlapped
+ *			by the range [start, start + size)
+ *
+ * @start: start address of the memory range
+ * @size: size of the memory range
  * @arg: argument passed to func
- * @func: callback for each memory section walked
+ * @func: callback for each memory block walked
  *
- * This function walks through all present mem sections in range
- * [start_pfn, end_pfn) and call func on each mem section.
+ * This function walks through all present memory blocks overlapped by the
+ * range [start, start + size), calling func on each memory block.
  *
  * Returns the return value of func.
  */
-int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+int walk_memory_blocks(unsigned long start, unsigned long size,
 		void *arg, int (*func)(struct memory_block *, void *))
 {
+	const unsigned long start_pfn = PFN_DOWN(start);
+	const unsigned long end_pfn = PFN_UP(start + size - 1);
 	struct memory_block *mem = NULL;
 	struct mem_section *section;
 	unsigned long pfn, section_nr;
@@ -1822,8 +1825,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 	 * whether all memory blocks in question are offline and return error
 	 * if this is not the case.
 	 */
-	rc = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
-			       check_memblock_offlined_cb);
+	rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
 	if (rc)
 		goto done;
 
-- 
cgit v1.2.3


From ea8846411ad686ff626e00bb2c3821b3db2ab56a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:50 -0700
Subject: mm/memory_hotplug: move and simplify walk_memory_blocks()

Let's move walk_memory_blocks() to the place where memory block logic
resides and simplify it.  While at it, add a type for the callback
function.

Link: http://lkml.kernel.org/r/20190614100114.311-6-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Mike Travis <mike.travis@hpe.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c          | 42 ++++++++++++++++++++++++++++++++
 include/linux/memory.h         |  3 +++
 include/linux/memory_hotplug.h |  2 --
 mm/memory_hotplug.c            | 55 ------------------------------------------
 4 files changed, 45 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c54e80fd25a8..0204384b4d1d 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -44,6 +44,11 @@ static inline unsigned long pfn_to_block_id(unsigned long pfn)
 	return base_memory_block_id(pfn_to_section_nr(pfn));
 }
 
+static inline unsigned long phys_to_block_id(unsigned long phys)
+{
+	return pfn_to_block_id(PFN_DOWN(phys));
+}
+
 static int memory_subsys_online(struct device *dev);
 static int memory_subsys_offline(struct device *dev);
 
@@ -851,3 +856,40 @@ out:
 		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
 	return ret;
 }
+
+/**
+ * walk_memory_blocks - walk through all present memory blocks overlapped
+ *			by the range [start, start + size)
+ *
+ * @start: start address of the memory range
+ * @size: size of the memory range
+ * @arg: argument passed to func
+ * @func: callback for each memory section walked
+ *
+ * This function walks through all present memory blocks overlapped by the
+ * range [start, start + size), calling func on each memory block.
+ *
+ * In case func() returns an error, walking is aborted and the error is
+ * returned.
+ */
+int walk_memory_blocks(unsigned long start, unsigned long size,
+		       void *arg, walk_memory_blocks_func_t func)
+{
+	const unsigned long start_block_id = phys_to_block_id(start);
+	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
+	struct memory_block *mem;
+	unsigned long block_id;
+	int ret = 0;
+
+	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
+		mem = find_memory_block_by_id(block_id, NULL);
+		if (!mem)
+			continue;
+
+		ret = func(mem, arg);
+		put_device(&mem->dev);
+		if (ret)
+			break;
+	}
+	return ret;
+}
diff --git a/include/linux/memory.h b/include/linux/memory.h
index f26a5417ec5d..b3b388775a30 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -119,6 +119,9 @@ extern int memory_isolate_notify(unsigned long val, void *v);
 extern struct memory_block *find_memory_block_hinted(struct mem_section *,
 							struct memory_block *);
 extern struct memory_block *find_memory_block(struct mem_section *);
+typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
+extern int walk_memory_blocks(unsigned long start, unsigned long size,
+			      void *arg, walk_memory_blocks_func_t func);
 #define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index d9fffc34949f..475aff8efbf8 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -340,8 +340,6 @@ static inline void __remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 extern void __ref free_area_init_core_hotplug(int nid);
-extern int walk_memory_blocks(unsigned long start, unsigned long size,
-		void *arg, int (*func)(struct memory_block *, void *));
 extern int __add_memory(int nid, u64 start, u64 size);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int add_memory_resource(int nid, struct resource *resource);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b3ef84e408fa..fafee5f13ef2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1659,62 +1659,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return __offline_pages(start_pfn, start_pfn + nr_pages);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 
-/**
- * walk_memory_blocks - walk through all present memory blocks overlapped
- *			by the range [start, start + size)
- *
- * @start: start address of the memory range
- * @size: size of the memory range
- * @arg: argument passed to func
- * @func: callback for each memory block walked
- *
- * This function walks through all present memory blocks overlapped by the
- * range [start, start + size), calling func on each memory block.
- *
- * Returns the return value of func.
- */
-int walk_memory_blocks(unsigned long start, unsigned long size,
-		void *arg, int (*func)(struct memory_block *, void *))
-{
-	const unsigned long start_pfn = PFN_DOWN(start);
-	const unsigned long end_pfn = PFN_UP(start + size - 1);
-	struct memory_block *mem = NULL;
-	struct mem_section *section;
-	unsigned long pfn, section_nr;
-	int ret;
-
-	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
-		section_nr = pfn_to_section_nr(pfn);
-		if (!present_section_nr(section_nr))
-			continue;
-
-		section = __nr_to_section(section_nr);
-		/* same memblock? */
-		if (mem)
-			if ((section_nr >= mem->start_section_nr) &&
-			    (section_nr <= mem->end_section_nr))
-				continue;
-
-		mem = find_memory_block_hinted(section, mem);
-		if (!mem)
-			continue;
-
-		ret = func(mem, arg);
-		if (ret) {
-			kobject_put(&mem->dev.kobj);
-			return ret;
-		}
-	}
-
-	if (mem)
-		kobject_put(&mem->dev.kobj);
-
-	return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 {
 	int ret = !is_memblock_offlined(mem);
-- 
cgit v1.2.3


From dd625285910d3cff535fa76355e49949513918a4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:57:53 -0700
Subject: drivers/base/memory.c: get rid of find_memory_block_hinted()

No longer needed, let's remove it.  Also, drop the "hint" parameter
completely from "find_memory_block_by_id", as nobody needs it anymore.

[david@redhat.com: v3]
  Link: http://lkml.kernel.org/r/20190620183139.4352-7-david@redhat.com
[david@redhat.com: handle zero-length walks]
  Link: http://lkml.kernel.org/r/1c2edc22-afd7-2211-c4c7-40e54e5007e8@redhat.com
Link: http://lkml.kernel.org/r/20190614100114.311-7-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Tested-by: Qian Cai <cai@lca.pw>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Mike Travis <mike.travis@hpe.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c  | 40 ++++++++++++++--------------------------
 include/linux/memory.h |  2 --
 2 files changed, 14 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 0204384b4d1d..20c39d1bcef8 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -588,30 +588,13 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
 	return 0;
 }
 
-/*
- * A reference for the returned object is held and the reference for the
- * hinted object is released.
- */
-static struct memory_block *find_memory_block_by_id(unsigned long block_id,
-						    struct memory_block *hint)
+/* A reference for the returned memory block device is acquired. */
+static struct memory_block *find_memory_block_by_id(unsigned long block_id)
 {
-	struct device *hintdev = hint ? &hint->dev : NULL;
 	struct device *dev;
 
-	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
-	if (hint)
-		put_device(&hint->dev);
-	if (!dev)
-		return NULL;
-	return to_memory_block(dev);
-}
-
-struct memory_block *find_memory_block_hinted(struct mem_section *section,
-					      struct memory_block *hint)
-{
-	unsigned long block_id = base_memory_block_id(__section_nr(section));
-
-	return find_memory_block_by_id(block_id, hint);
+	dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL);
+	return dev ? to_memory_block(dev) : NULL;
 }
 
 /*
@@ -624,7 +607,9 @@ struct memory_block *find_memory_block_hinted(struct mem_section *section,
  */
 struct memory_block *find_memory_block(struct mem_section *section)
 {
-	return find_memory_block_hinted(section, NULL);
+	unsigned long block_id = base_memory_block_id(__section_nr(section));
+
+	return find_memory_block_by_id(block_id);
 }
 
 static struct attribute *memory_memblk_attrs[] = {
@@ -675,7 +660,7 @@ static int init_memory_block(struct memory_block **memory,
 	unsigned long start_pfn;
 	int ret = 0;
 
-	mem = find_memory_block_by_id(block_id, NULL);
+	mem = find_memory_block_by_id(block_id);
 	if (mem) {
 		put_device(&mem->dev);
 		return -EEXIST;
@@ -755,7 +740,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
 		end_block_id = block_id;
 		for (block_id = start_block_id; block_id != end_block_id;
 		     block_id++) {
-			mem = find_memory_block_by_id(block_id, NULL);
+			mem = find_memory_block_by_id(block_id);
 			mem->section_count = 0;
 			unregister_memory(mem);
 		}
@@ -782,7 +767,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
 
 	mutex_lock(&mem_sysfs_mutex);
 	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-		mem = find_memory_block_by_id(block_id, NULL);
+		mem = find_memory_block_by_id(block_id);
 		if (WARN_ON_ONCE(!mem))
 			continue;
 		mem->section_count = 0;
@@ -881,8 +866,11 @@ int walk_memory_blocks(unsigned long start, unsigned long size,
 	unsigned long block_id;
 	int ret = 0;
 
+	if (!size)
+		return 0;
+
 	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
-		mem = find_memory_block_by_id(block_id, NULL);
+		mem = find_memory_block_by_id(block_id);
 		if (!mem)
 			continue;
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index b3b388775a30..02e633f3ede0 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -116,8 +116,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
 extern int memory_isolate_notify(unsigned long val, void *v);
-extern struct memory_block *find_memory_block_hinted(struct mem_section *,
-							struct memory_block *);
 extern struct memory_block *find_memory_block(struct mem_section *);
 typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
 extern int walk_memory_blocks(unsigned long start, unsigned long size,
-- 
cgit v1.2.3


From f1eca35a0dc7cb3cdb00c88c8c5e5138a65face0 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:57:57 -0700
Subject: mm/sparsemem: introduce struct mem_section_usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "mm: Sub-section memory hotplug support", v10.

The memory hotplug section is an arbitrary / convenient unit for memory
hotplug.  'Section-size' units have bled into the user interface
('memblock' sysfs) and can not be changed without breaking existing
userspace.  The section-size constraint, while mostly benign for typical
memory hotplug, has and continues to wreak havoc with 'device-memory'
use cases, persistent memory (pmem) in particular.  Recall that pmem
uses devm_memremap_pages(), and subsequently arch_add_memory(), to
allocate a 'struct page' memmap for pmem.  However, it does not use the
'bottom half' of memory hotplug, i.e.  never marks pmem pages online and
never exposes the userspace memblock interface for pmem.  This leaves an
opening to redress the section-size constraint.

To date, the libnvdimm subsystem has attempted to inject padding to
satisfy the internal constraints of arch_add_memory().  Beyond
complicating the code, leading to bugs [2], wasting memory, and limiting
configuration flexibility, the padding hack is broken when the platform
changes this physical memory alignment of pmem from one boot to the
next.  Device failure (intermittent or permanent) and physical
reconfiguration are events that can cause the platform firmware to
change the physical placement of pmem on a subsequent boot, and device
failure is an everyday event in a data-center.

It turns out that sections are only a hard requirement of the
user-facing interface for memory hotplug and with a bit more
infrastructure sub-section arch_add_memory() support can be added for
kernel internal usages like devm_memremap_pages().  Here is an analysis
of the current design assumptions in the current code and how they are
addressed in the new implementation:

Current design assumptions:

 - Sections that describe boot memory (early sections) are never
   unplugged / removed.

 - pfn_valid(), in the CONFIG_SPARSEMEM_VMEMMAP=y, case devolves to a
   valid_section() check

 - __add_pages() and helper routines assume all operations occur in
   PAGES_PER_SECTION units.

 - The memblock sysfs interface only comprehends full sections

New design assumptions:

 - Sections are instrumented with a sub-section bitmask to track (on
   x86) individual 2MB sub-divisions of a 128MB section.

 - Partially populated early sections can be extended with additional
   sub-sections, and those sub-sections can be removed with
   arch_remove_memory(). With this in place we no longer lose usable
   memory capacity to padding.

 - pfn_valid() is updated to look deeper than valid_section() to also
   check the active-sub-section mask. This indication is in the same
   cacheline as the valid_section() so the performance impact is
   expected to be negligible. So far the lkp robot has not reported any
   regressions.

 - Outside of the core vmemmap population routines which are replaced,
   other helper routines like shrink_{zone,pgdat}_span() are updated to
   handle the smaller granularity. Core memory hotplug routines that
   deal with online memory are not touched.

 - The existing memblock sysfs user api guarantees / assumptions are not
   touched since this capability is limited to !online
   !memblock-sysfs-accessible sections.

Meanwhile the issue reports continue to roll in from users that do not
understand when and how the 128MB constraint will bite them.  The current
implementation relied on being able to support at least one misaligned
namespace, but that immediately falls over on any moderately complex
namespace creation attempt.  Beyond the initial problem of 'System RAM'
colliding with pmem, and the unsolvable problem of physical alignment
changes, Linux is now being exposed to platforms that collide pmem ranges
with other pmem ranges by default [3].  In short, devm_memremap_pages()
has pushed the venerable section-size constraint past the breaking point,
and the simplicity of section-aligned arch_add_memory() is no longer
tenable.

These patches are exposed to the kbuild robot on a subsection-v10 branch
[4], and a preview of the unit test for this functionality is available
on the 'subsection-pending' branch of ndctl [5].

[2]: https://lore.kernel.org/r/155000671719.348031.2347363160141119237.stgit@dwillia2-desk3.amr.corp.intel.com
[3]: https://github.com/pmem/ndctl/issues/76
[4]: https://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm.git/log/?h=subsection-v10
[5]: https://github.com/pmem/ndctl/commit/7c59b4867e1c

This patch (of 13):

Towards enabling memory hotplug to track partial population of a section,
introduce 'struct mem_section_usage'.

A pointer to a 'struct mem_section_usage' instance replaces the existing
pointer to a 'pageblock_flags' bitmap.  Effectively it adds one more
'unsigned long' beyond the 'pageblock_flags' (usemap) allocation to house
a new 'subsection_map' bitmap.  The new bitmap enables the memory
hot{plug,remove} implementation to act on incremental sub-divisions of a
section.

SUBSECTION_SHIFT is defined as global constant instead of per-architecture
value like SECTION_SIZE_BITS in order to allow cross-arch compatibility of
subsection users.  Specifically a common subsection size allows for the
possibility that persistent memory namespace configurations be made
compatible across architectures.

The primary motivation for this functionality is to support platforms that
mix "System RAM" and "Persistent Memory" within a single section, or
multiple PMEM ranges with different mapping lifetimes within a single
section.  The section restriction for hotplug has caused an ongoing saga
of hacks and bugs for devm_memremap_pages() users.

Beyond the fixups to teach existing paths how to retrieve the 'usemap'
from a section, and updates to usemap allocation path, there are no
expected behavior changes.

Link: http://lkml.kernel.org/r/156092349845.979959.73333291612799019.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Wei Yang <richardw.yang@linux.intel.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Qian Cai <cai@lca.pw>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 28 +++++++++++++++--
 mm/memory_hotplug.c    | 18 ++++++-----
 mm/page_alloc.c        |  2 +-
 mm/sparse.c            | 81 +++++++++++++++++++++++++-------------------------
 4 files changed, 76 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 298d1c3e4c2e..2520336bdfd1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1160,6 +1160,24 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 #define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
 #define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)
 
+#define SUBSECTION_SHIFT 21
+
+#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
+#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
+#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))
+
+#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
+#error Subsection size exceeds section size
+#else
+#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
+#endif
+
+struct mem_section_usage {
+	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
+	/* See declaration of similar field in struct zone */
+	unsigned long pageblock_flags[0];
+};
+
 struct page;
 struct page_ext;
 struct mem_section {
@@ -1177,8 +1195,7 @@ struct mem_section {
 	 */
 	unsigned long section_mem_map;
 
-	/* See declaration of similar field in struct zone */
-	unsigned long *pageblock_flags;
+	struct mem_section_usage *usage;
 #ifdef CONFIG_PAGE_EXTENSION
 	/*
 	 * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
@@ -1209,6 +1226,11 @@ extern struct mem_section **mem_section;
 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
 #endif
 
+static inline unsigned long *section_to_usemap(struct mem_section *ms)
+{
+	return ms->usage->pageblock_flags;
+}
+
 static inline struct mem_section *__nr_to_section(unsigned long nr)
 {
 #ifdef CONFIG_SPARSEMEM_EXTREME
@@ -1220,7 +1242,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
 	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
 extern unsigned long __section_nr(struct mem_section *ms);
-extern unsigned long usemap_size(void);
+extern size_t mem_section_usage_size(void);
 
 /*
  * We use the lower bits of the mem_map pointer to store
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fafee5f13ef2..cf9d979a6498 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -166,9 +166,10 @@ void put_page_bootmem(struct page *page)
 #ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
-	unsigned long *usemap, mapsize, section_nr, i;
+	unsigned long mapsize, section_nr, i;
 	struct mem_section *ms;
 	struct page *page, *memmap;
+	struct mem_section_usage *usage;
 
 	section_nr = pfn_to_section_nr(start_pfn);
 	ms = __nr_to_section(section_nr);
@@ -188,10 +189,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 	for (i = 0; i < mapsize; i++, page++)
 		get_page_bootmem(section_nr, page, SECTION_INFO);
 
-	usemap = ms->pageblock_flags;
-	page = virt_to_page(usemap);
+	usage = ms->usage;
+	page = virt_to_page(usage);
 
-	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
 
 	for (i = 0; i < mapsize; i++, page++)
 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -200,9 +201,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 #else /* CONFIG_SPARSEMEM_VMEMMAP */
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
-	unsigned long *usemap, mapsize, section_nr, i;
+	unsigned long mapsize, section_nr, i;
 	struct mem_section *ms;
 	struct page *page, *memmap;
+	struct mem_section_usage *usage;
 
 	section_nr = pfn_to_section_nr(start_pfn);
 	ms = __nr_to_section(section_nr);
@@ -211,10 +213,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 
 	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
 
-	usemap = ms->pageblock_flags;
-	page = virt_to_page(usemap);
+	usage = ms->usage;
+	page = virt_to_page(usage);
 
-	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
 
 	for (i = 0; i < mapsize; i++, page++)
 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e515bfcf7f28..be78bafbfe3a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -450,7 +450,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page,
 							unsigned long pfn)
 {
 #ifdef CONFIG_SPARSEMEM
-	return __pfn_to_section(pfn)->pageblock_flags;
+	return section_to_usemap(__pfn_to_section(pfn));
 #else
 	return page_zone(page)->pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */
diff --git a/mm/sparse.c b/mm/sparse.c
index b29534cea8c0..41bef8e1f65c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -288,33 +288,31 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
 
 static void __meminit sparse_init_one_section(struct mem_section *ms,
 		unsigned long pnum, struct page *mem_map,
-		unsigned long *pageblock_bitmap)
+		struct mem_section_usage *usage)
 {
 	ms->section_mem_map &= ~SECTION_MAP_MASK;
 	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
 							SECTION_HAS_MEM_MAP;
- 	ms->pageblock_flags = pageblock_bitmap;
+	ms->usage = usage;
 }
 
-unsigned long usemap_size(void)
+static unsigned long usemap_size(void)
 {
 	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-static unsigned long *__kmalloc_section_usemap(void)
+size_t mem_section_usage_size(void)
 {
-	return kmalloc(usemap_size(), GFP_KERNEL);
+	return sizeof(struct mem_section_usage) + usemap_size();
 }
-#endif /* CONFIG_MEMORY_HOTPLUG */
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-static unsigned long * __init
+static struct mem_section_usage * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 					 unsigned long size)
 {
+	struct mem_section_usage *usage;
 	unsigned long goal, limit;
-	unsigned long *p;
 	int nid;
 	/*
 	 * A page may contain usemaps for other sections preventing the
@@ -330,15 +328,16 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 	limit = goal + (1UL << PA_SECTION_SHIFT);
 	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
-	p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
-	if (!p && limit) {
+	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
+	if (!usage && limit) {
 		limit = 0;
 		goto again;
 	}
-	return p;
+	return usage;
 }
 
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+		struct mem_section_usage *usage)
 {
 	unsigned long usemap_snr, pgdat_snr;
 	static unsigned long old_usemap_snr;
@@ -352,7 +351,7 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
 		old_pgdat_snr = NR_MEM_SECTIONS;
 	}
 
-	usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
+	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
 	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
 	if (usemap_snr == pgdat_snr)
 		return;
@@ -380,14 +379,15 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
 		usemap_snr, pgdat_snr, nid);
 }
 #else
-static unsigned long * __init
+static struct mem_section_usage * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 					 unsigned long size)
 {
 	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
 }
 
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+		struct mem_section_usage *usage)
 {
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -474,14 +474,13 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 				   unsigned long pnum_end,
 				   unsigned long map_count)
 {
-	unsigned long pnum, usemap_longs, *usemap;
+	struct mem_section_usage *usage;
+	unsigned long pnum;
 	struct page *map;
 
-	usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
-	usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
-							  usemap_size() *
-							  map_count);
-	if (!usemap) {
+	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+			mem_section_usage_size() * map_count);
+	if (!usage) {
 		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
 		goto failed;
 	}
@@ -497,9 +496,9 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 			pnum_begin = pnum;
 			goto failed;
 		}
-		check_usemap_section_nr(nid, usemap);
-		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
-		usemap += usemap_longs;
+		check_usemap_section_nr(nid, usage);
+		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage);
+		usage = (void *) usage + mem_section_usage_size();
 	}
 	sparse_buffer_fini();
 	return;
@@ -697,9 +696,9 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 				     struct vmem_altmap *altmap)
 {
 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
+	struct mem_section_usage *usage;
 	struct mem_section *ms;
 	struct page *memmap;
-	unsigned long *usemap;
 	int ret;
 
 	/*
@@ -713,8 +712,8 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 	memmap = kmalloc_section_memmap(section_nr, nid, altmap);
 	if (!memmap)
 		return -ENOMEM;
-	usemap = __kmalloc_section_usemap();
-	if (!usemap) {
+	usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+	if (!usage) {
 		__kfree_section_memmap(memmap, altmap);
 		return -ENOMEM;
 	}
@@ -733,11 +732,11 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 
 	set_section_nid(section_nr, nid);
 	section_mark_present(ms);
-	sparse_init_one_section(ms, section_nr, memmap, usemap);
+	sparse_init_one_section(ms, section_nr, memmap, usage);
 
 out:
 	if (ret < 0) {
-		kfree(usemap);
+		kfree(usage);
 		__kfree_section_memmap(memmap, altmap);
 	}
 	return ret;
@@ -773,20 +772,20 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 }
 #endif
 
-static void free_section_usemap(struct page *memmap, unsigned long *usemap,
-		struct vmem_altmap *altmap)
+static void free_section_usage(struct page *memmap,
+		struct mem_section_usage *usage, struct vmem_altmap *altmap)
 {
-	struct page *usemap_page;
+	struct page *usage_page;
 
-	if (!usemap)
+	if (!usage)
 		return;
 
-	usemap_page = virt_to_page(usemap);
+	usage_page = virt_to_page(usage);
 	/*
 	 * Check to see if allocation came from hot-plug-add
 	 */
-	if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
-		kfree(usemap);
+	if (PageSlab(usage_page) || PageCompound(usage_page)) {
+		kfree(usage);
 		if (memmap)
 			__kfree_section_memmap(memmap, altmap);
 		return;
@@ -805,18 +804,18 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
 			       struct vmem_altmap *altmap)
 {
 	struct page *memmap = NULL;
-	unsigned long *usemap = NULL;
+	struct mem_section_usage *usage = NULL;
 
 	if (ms->section_mem_map) {
-		usemap = ms->pageblock_flags;
+		usage = ms->usage;
 		memmap = sparse_decode_mem_map(ms->section_mem_map,
 						__section_nr(ms));
 		ms->section_mem_map = 0;
-		ms->pageblock_flags = NULL;
+		ms->usage = NULL;
 	}
 
 	clear_hwpoisoned_pages(memmap + map_offset,
 			PAGES_PER_SECTION - map_offset);
-	free_section_usemap(memmap, usemap, altmap);
+	free_section_usage(memmap, usage, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From 326e1b8f83a4318b09033ef754f40c785aed5e68 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:00 -0700
Subject: mm/sparsemem: introduce a SECTION_IS_EARLY flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In preparation for sub-section hotplug, track whether a given section
was created during early memory initialization, or later via memory
hotplug.  This distinction is needed to maintain the coarse expectation
that pfn_valid() returns true for any pfn within a given section even if
that section has pages that are reserved from the page allocator.

For example one of the of goals of subsection hotplug is to support
cases where the system physical memory layout collides System RAM and
PMEM within a section.  Several pfn_valid() users expect to just check
if a section is valid, but they are not careful to check if the given
pfn is within a "System RAM" boundary and instead expect pgdat
information to further validate the pfn.

Rather than unwind those paths to make their pfn_valid() queries more
precise a follow on patch uses the SECTION_IS_EARLY flag to maintain the
traditional expectation that pfn_valid() returns true for all early
sections.

Link: https://lore.kernel.org/lkml/1560366952-10660-1-git-send-email-cai@lca.pw/
Link: http://lkml.kernel.org/r/156092350358.979959.5817209875548072819.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Qian Cai <cai@lca.pw>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  8 +++++++-
 mm/sparse.c            | 20 +++++++++-----------
 2 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2520336bdfd1..4be40634238b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1260,7 +1260,8 @@ extern size_t mem_section_usage_size(void);
 #define	SECTION_MARKED_PRESENT	(1UL<<0)
 #define SECTION_HAS_MEM_MAP	(1UL<<1)
 #define SECTION_IS_ONLINE	(1UL<<2)
-#define SECTION_MAP_LAST_BIT	(1UL<<3)
+#define SECTION_IS_EARLY	(1UL<<3)
+#define SECTION_MAP_LAST_BIT	(1UL<<4)
 #define SECTION_MAP_MASK	(~(SECTION_MAP_LAST_BIT-1))
 #define SECTION_NID_SHIFT	3
 
@@ -1286,6 +1287,11 @@ static inline int valid_section(struct mem_section *section)
 	return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
 }
 
+static inline int early_section(struct mem_section *section)
+{
+	return (section && (section->section_mem_map & SECTION_IS_EARLY));
+}
+
 static inline int valid_section_nr(unsigned long nr)
 {
 	return valid_section(__nr_to_section(nr));
diff --git a/mm/sparse.c b/mm/sparse.c
index 41bef8e1f65c..6d23a526279a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -288,11 +288,11 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
 
 static void __meminit sparse_init_one_section(struct mem_section *ms,
 		unsigned long pnum, struct page *mem_map,
-		struct mem_section_usage *usage)
+		struct mem_section_usage *usage, unsigned long flags)
 {
 	ms->section_mem_map &= ~SECTION_MAP_MASK;
-	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
-							SECTION_HAS_MEM_MAP;
+	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
+		| SECTION_HAS_MEM_MAP | flags;
 	ms->usage = usage;
 }
 
@@ -497,7 +497,8 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 			goto failed;
 		}
 		check_usemap_section_nr(nid, usage);
-		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage);
+		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
+				SECTION_IS_EARLY);
 		usage = (void *) usage + mem_section_usage_size();
 	}
 	sparse_buffer_fini();
@@ -732,7 +733,7 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 
 	set_section_nid(section_nr, nid);
 	section_mark_present(ms);
-	sparse_init_one_section(ms, section_nr, memmap, usage);
+	sparse_init_one_section(ms, section_nr, memmap, usage, 0);
 
 out:
 	if (ret < 0) {
@@ -772,19 +773,16 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 }
 #endif
 
-static void free_section_usage(struct page *memmap,
+static void free_section_usage(struct mem_section *ms, struct page *memmap,
 		struct mem_section_usage *usage, struct vmem_altmap *altmap)
 {
-	struct page *usage_page;
-
 	if (!usage)
 		return;
 
-	usage_page = virt_to_page(usage);
 	/*
 	 * Check to see if allocation came from hot-plug-add
 	 */
-	if (PageSlab(usage_page) || PageCompound(usage_page)) {
+	if (!early_section(ms)) {
 		kfree(usage);
 		if (memmap)
 			__kfree_section_memmap(memmap, altmap);
@@ -816,6 +814,6 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
 
 	clear_hwpoisoned_pages(memmap + map_offset,
 			PAGES_PER_SECTION - map_offset);
-	free_section_usage(memmap, usage, altmap);
+	free_section_usage(ms, memmap, usage, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From f46edbd1b1516da1fb34c917775168d5df576f78 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:04 -0700
Subject: mm/sparsemem: add helpers track active portions of a section at boot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prepare for hot{plug,remove} of sub-ranges of a section by tracking a
sub-section active bitmask, each bit representing a PMD_SIZE span of the
architecture's memory hotplug section size.

The implications of a partially populated section is that pfn_valid()
needs to go beyond a valid_section() check and either determine that the
section is an "early section", or read the sub-section active ranges
from the bitmask.  The expectation is that the bitmask (subsection_map)
fits in the same cacheline as the valid_section() / early_section()
data, so the incremental performance overhead to pfn_valid() should be
negligible.

The rationale for using early_section() to short-ciruit the
subsection_map check is that there are legacy code paths that use
pfn_valid() at section granularity before validating the pfn against
pgdat data.  So, the early_section() check allows those traditional
assumptions to persist while also permitting subsection_map to tell the
truth for purposes of populating the unused portions of early sections
with PMEM and other ZONE_DEVICE mappings.

Link: http://lkml.kernel.org/r/156092350874.979959.18185938451405518285.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Qian Cai <cai@lca.pw>
Tested-by: Jane Chu <jane.chu@oracle.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 33 ++++++++++++++++++++++++++++++++-
 mm/page_alloc.c        | 10 ++++++++--
 mm/sparse.c            | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4be40634238b..7747ec9de588 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1178,6 +1178,8 @@ struct mem_section_usage {
 	unsigned long pageblock_flags[0];
 };
 
+void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
+
 struct page;
 struct page_ext;
 struct mem_section {
@@ -1321,12 +1323,40 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 
 extern unsigned long __highest_present_section_nr;
 
+static inline int subsection_map_index(unsigned long pfn)
+{
+	return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION;
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
+{
+	int idx = subsection_map_index(pfn);
+
+	return test_bit(idx, ms->usage->subsection_map);
+}
+#else
+static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
+{
+	return 1;
+}
+#endif
+
 #ifndef CONFIG_HAVE_ARCH_PFN_VALID
 static inline int pfn_valid(unsigned long pfn)
 {
+	struct mem_section *ms;
+
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
-	return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
+	ms = __nr_to_section(pfn_to_section_nr(pfn));
+	if (!valid_section(ms))
+		return 0;
+	/*
+	 * Traditionally early sections always returned pfn_valid() for
+	 * the entire section-sized span.
+	 */
+	return early_section(ms) || pfn_section_valid(ms, pfn);
 }
 #endif
 
@@ -1358,6 +1388,7 @@ void sparse_init(void);
 #define sparse_init()	do {} while (0)
 #define sparse_index_init(_sec, _nid)  do {} while (0)
 #define pfn_present pfn_valid
+#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index be78bafbfe3a..c4cdd3954804 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7351,12 +7351,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 			       (u64)zone_movable_pfn[i] << PAGE_SHIFT);
 	}
 
-	/* Print out the early node map */
+	/*
+	 * Print out the early node map, and initialize the
+	 * subsection-map relative to active online memory ranges to
+	 * enable future "sub-section" extensions of the memory map.
+	 */
 	pr_info("Early memory node ranges\n");
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
 		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
 			(u64)start_pfn << PAGE_SHIFT,
 			((u64)end_pfn << PAGE_SHIFT) - 1);
+		subsection_map_init(start_pfn, end_pfn - start_pfn);
+	}
 
 	/* Initialise every node */
 	mminit_verify_pageflags_layout();
diff --git a/mm/sparse.c b/mm/sparse.c
index 6d23a526279a..26b48ee1a262 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -210,6 +210,41 @@ static inline unsigned long first_present_section_nr(void)
 	return next_present_section_nr(-1);
 }
 
+void subsection_mask_set(unsigned long *map, unsigned long pfn,
+		unsigned long nr_pages)
+{
+	int idx = subsection_map_index(pfn);
+	int end = subsection_map_index(pfn + nr_pages - 1);
+
+	bitmap_set(map, idx, end - idx + 1);
+}
+
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+	int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+	int i, start_sec = pfn_to_section_nr(pfn);
+
+	if (!nr_pages)
+		return;
+
+	for (i = start_sec; i <= end_sec; i++) {
+		struct mem_section *ms;
+		unsigned long pfns;
+
+		pfns = min(nr_pages, PAGES_PER_SECTION
+				- (pfn & ~PAGE_SECTION_MASK));
+		ms = __nr_to_section(i);
+		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
+
+		pr_debug("%s: sec: %d pfns: %ld set(%d, %d)\n", __func__, i,
+				pfns, subsection_map_index(pfn),
+				subsection_map_index(pfn + pfns - 1));
+
+		pfn += pfns;
+		nr_pages -= pfns;
+	}
+}
+
 /* Record a memory area against a node. */
 void __init memory_present(int nid, unsigned long start, unsigned long end)
 {
-- 
cgit v1.2.3


From e9c0a3f05477e18d2dae816cb61b62be1b7e90d3 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:11 -0700
Subject: mm/sparsemem: convert kmalloc_section_memmap() to
 populate_section_memmap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow sub-section sized ranges to be added to the memmap.

populate_section_memmap() takes an explict pfn range rather than
assuming a full section, and those parameters are plumbed all the way
through to vmmemap_populate().  There should be no sub-section usage in
current deployments.  New warnings are added to clarify which memmap
allocation paths are sub-section capable.

Link: http://lkml.kernel.org/r/156092352058.979959.6551283472062305149.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/init_64.c |  4 +++-
 include/linux/mm.h    |  4 ++--
 mm/sparse-vmemmap.c   | 21 ++++++++++++++-------
 mm/sparse.c           | 50 +++++++++++++++++++++++++++-----------------------
 4 files changed, 46 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a289a2ab108..a6b5c653727b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1518,7 +1518,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 {
 	int err;
 
-	if (boot_cpu_has(X86_FEATURE_PSE))
+	if (end - start < PAGES_PER_SECTION * sizeof(struct page))
+		err = vmemmap_populate_basepages(start, end, node);
+	else if (boot_cpu_has(X86_FEATURE_PSE))
 		err = vmemmap_populate_hugepages(start, end, node, altmap);
 	else if (altmap) {
 		pr_err_once("%s: no cpu support for altmap allocations\n",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 48ab7b982d82..0334ca97c584 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2767,8 +2767,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 #endif
 
 void *sparse_buffer_alloc(unsigned long size);
-struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
-		struct vmem_altmap *altmap);
+struct page * __populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap);
 pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
 p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
 pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 7fec05796796..200aef686722 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -245,19 +245,26 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
 	return 0;
 }
 
-struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
-		struct vmem_altmap *altmap)
+struct page * __meminit __populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
 	unsigned long start;
 	unsigned long end;
-	struct page *map;
 
-	map = pfn_to_page(pnum * PAGES_PER_SECTION);
-	start = (unsigned long)map;
-	end = (unsigned long)(map + PAGES_PER_SECTION);
+	/*
+	 * The minimum granularity of memmap extensions is
+	 * PAGES_PER_SUBSECTION as allocations are tracked in the
+	 * 'subsection_map' bitmap of the section.
+	 */
+	end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION);
+	pfn &= PAGE_SUBSECTION_MASK;
+	nr_pages = end - pfn;
+
+	start = (unsigned long) pfn_to_page(pfn);
+	end = start + nr_pages * sizeof(struct page);
 
 	if (vmemmap_populate(start, end, nid, altmap))
 		return NULL;
 
-	return map;
+	return pfn_to_page(pfn);
 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 26b48ee1a262..6b01022e23a9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -439,8 +439,8 @@ static unsigned long __init section_map_size(void)
 	return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
 }
 
-struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
-		struct vmem_altmap *altmap)
+struct page __init *__populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
 	unsigned long size = section_map_size();
 	struct page *map = sparse_buffer_alloc(size);
@@ -521,10 +521,13 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 	}
 	sparse_buffer_init(map_count * section_map_size(), nid);
 	for_each_present_section_nr(pnum_begin, pnum) {
+		unsigned long pfn = section_nr_to_pfn(pnum);
+
 		if (pnum >= pnum_end)
 			break;
 
-		map = sparse_mem_map_populate(pnum, nid, NULL);
+		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
+				nid, NULL);
 		if (!map) {
 			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
 			       __func__, nid);
@@ -625,17 +628,17 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 #endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
-		struct vmem_altmap *altmap)
+static struct page *populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
-	/* This will make the necessary allocations eventually. */
-	return sparse_mem_map_populate(pnum, nid, altmap);
+	return __populate_section_memmap(pfn, nr_pages, nid, altmap);
 }
-static void __kfree_section_memmap(struct page *memmap,
+
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
 		struct vmem_altmap *altmap)
 {
-	unsigned long start = (unsigned long)memmap;
-	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+	unsigned long start = (unsigned long) pfn_to_page(pfn);
+	unsigned long end = start + nr_pages * sizeof(struct page);
 
 	vmemmap_free(start, end, altmap);
 }
@@ -647,7 +650,8 @@ static void free_map_bootmem(struct page *memmap)
 	vmemmap_free(start, end, NULL);
 }
 #else
-static struct page *__kmalloc_section_memmap(void)
+struct page *populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
 	struct page *page, *ret;
 	unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
@@ -668,15 +672,11 @@ got_map_ptr:
 	return ret;
 }
 
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
 		struct vmem_altmap *altmap)
 {
-	return __kmalloc_section_memmap();
-}
+	struct page *memmap = pfn_to_page(pfn);
 
-static void __kfree_section_memmap(struct page *memmap,
-		struct vmem_altmap *altmap)
-{
 	if (is_vmalloc_addr(memmap))
 		vfree(memmap);
 	else
@@ -745,12 +745,13 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 	if (ret < 0 && ret != -EEXIST)
 		return ret;
 	ret = 0;
-	memmap = kmalloc_section_memmap(section_nr, nid, altmap);
+	memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid,
+			altmap);
 	if (!memmap)
 		return -ENOMEM;
 	usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
 	if (!usage) {
-		__kfree_section_memmap(memmap, altmap);
+		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
 		return -ENOMEM;
 	}
 
@@ -773,7 +774,7 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
 out:
 	if (ret < 0) {
 		kfree(usage);
-		__kfree_section_memmap(memmap, altmap);
+		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
 	}
 	return ret;
 }
@@ -809,7 +810,8 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 #endif
 
 static void free_section_usage(struct mem_section *ms, struct page *memmap,
-		struct mem_section_usage *usage, struct vmem_altmap *altmap)
+		struct mem_section_usage *usage, unsigned long pfn,
+		unsigned long nr_pages, struct vmem_altmap *altmap)
 {
 	if (!usage)
 		return;
@@ -820,7 +822,7 @@ static void free_section_usage(struct mem_section *ms, struct page *memmap,
 	if (!early_section(ms)) {
 		kfree(usage);
 		if (memmap)
-			__kfree_section_memmap(memmap, altmap);
+			depopulate_section_memmap(pfn, nr_pages, altmap);
 		return;
 	}
 
@@ -849,6 +851,8 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
 
 	clear_hwpoisoned_pages(memmap + map_offset,
 			PAGES_PER_SECTION - map_offset);
-	free_section_usage(ms, memmap, usage, altmap);
+	free_section_usage(ms, memmap, usage,
+			section_nr_to_pfn(__section_nr(ms)),
+			PAGES_PER_SECTION, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From 46d945aeab4d7dd837bd0724662de2caf712f047 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:18 -0700
Subject: mm: kill is_dev_zone() helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Given there are no more usages of is_dev_zone() outside of 'ifdef
CONFIG_ZONE_DEVICE' protection, kill off the compilation helper.

Link: http://lkml.kernel.org/r/156092353211.979959.1489004866360828964.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Wei Yang <richardw.yang@linux.intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Cc: Michal Hocko <mhocko@suse.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 12 ------------
 mm/page_alloc.c        |  2 +-
 2 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7747ec9de588..8331e76677c0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -855,18 +855,6 @@ static inline int local_memory_node(int node_id) { return node_id; };
  */
 #define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)
 
-#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_dev_zone(const struct zone *zone)
-{
-	return zone_idx(zone) == ZONE_DEVICE;
-}
-#else
-static inline bool is_dev_zone(const struct zone *zone)
-{
-	return false;
-}
-#endif
-
 /*
  * Returns true if a zone has pages managed by the buddy allocator.
  * All the reclaim decisions have to use this function rather than
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c4cdd3954804..2c74367a8eba 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5926,7 +5926,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 	unsigned long start = jiffies;
 	int nid = pgdat->node_id;
 
-	if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
+	if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
 		return;
 
 	/*
-- 
cgit v1.2.3


From 7ea6216049ff9cf250a6722cd766d99c8d1424e5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:22 -0700
Subject: mm/sparsemem: prepare for sub-section ranges
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prepare the memory hot-{add,remove} paths for handling sub-section
ranges by plumbing the starting page frame and number of pages being
handled through arch_{add,remove}_memory() to
sparse_{add,remove}_one_section().

This is simply plumbing, small cleanups, and some identifier renames.
No intended functional changes.

Link: http://lkml.kernel.org/r/156092353780.979959.9713046515562743194.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h |   5 +-
 mm/memory_hotplug.c            | 114 +++++++++++++++++++++++++----------------
 mm/sparse.c                    |  16 +++---
 3 files changed, 81 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 475aff8efbf8..2d636a7491a4 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -346,9 +346,10 @@ extern int add_memory_resource(int nid, struct resource *resource);
 extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 		unsigned long nr_pages, struct vmem_altmap *altmap);
 extern bool is_memblock_offlined(struct memory_block *mem);
-extern int sparse_add_one_section(int nid, unsigned long start_pfn,
-				  struct vmem_altmap *altmap);
+extern int sparse_add_section(int nid, unsigned long pfn,
+		unsigned long nr_pages, struct vmem_altmap *altmap);
 extern void sparse_remove_one_section(struct mem_section *ms,
+		unsigned long pfn, unsigned long nr_pages,
 		unsigned long map_offset, struct vmem_altmap *altmap);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 11220044b01a..3fbb2cfab126 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -252,51 +252,84 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 }
 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
 
-static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
-				   struct vmem_altmap *altmap)
+static int __meminit __add_section(int nid, unsigned long pfn,
+		unsigned long nr_pages,	struct vmem_altmap *altmap)
 {
 	int ret;
 
-	if (pfn_valid(phys_start_pfn))
+	if (pfn_valid(pfn))
 		return -EEXIST;
 
-	ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
+	ret = sparse_add_section(nid, pfn, nr_pages, altmap);
 	return ret < 0 ? ret : 0;
 }
 
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
+		const char *reason)
+{
+	/*
+	 * Disallow all operations smaller than a sub-section and only
+	 * allow operations smaller than a section for
+	 * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
+	 * enforces a larger memory_block_size_bytes() granularity for
+	 * memory that will be marked online, so this check should only
+	 * fire for direct arch_{add,remove}_memory() users outside of
+	 * add_memory_resource().
+	 */
+	unsigned long min_align;
+
+	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+		min_align = PAGES_PER_SUBSECTION;
+	else
+		min_align = PAGES_PER_SECTION;
+	if (!IS_ALIGNED(pfn, min_align)
+			|| !IS_ALIGNED(nr_pages, min_align)) {
+		WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
+				reason, pfn, pfn + nr_pages - 1);
+		return -EINVAL;
+	}
+	return 0;
+}
+
 /*
  * Reasonably generic function for adding memory.  It is
  * expected that archs that support memory hotplug will
  * call this function after deciding the zone to which to
  * add the new pages.
  */
-int __ref __add_pages(int nid, unsigned long phys_start_pfn,
-		unsigned long nr_pages, struct mhp_restrictions *restrictions)
+int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+		struct mhp_restrictions *restrictions)
 {
 	unsigned long i;
-	int err = 0;
-	int start_sec, end_sec;
+	int start_sec, end_sec, err;
 	struct vmem_altmap *altmap = restrictions->altmap;
 
-	/* during initialize mem_map, align hot-added range to section */
-	start_sec = pfn_to_section_nr(phys_start_pfn);
-	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
-
 	if (altmap) {
 		/*
 		 * Validate altmap is within bounds of the total request
 		 */
-		if (altmap->base_pfn != phys_start_pfn
+		if (altmap->base_pfn != pfn
 				|| vmem_altmap_offset(altmap) > nr_pages) {
 			pr_warn_once("memory add fail, invalid altmap\n");
-			err = -EINVAL;
-			goto out;
+			return -EINVAL;
 		}
 		altmap->alloc = 0;
 	}
 
+	err = check_pfn_span(pfn, nr_pages, "add");
+	if (err)
+		return err;
+
+	start_sec = pfn_to_section_nr(pfn);
+	end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
 	for (i = start_sec; i <= end_sec; i++) {
-		err = __add_section(nid, section_nr_to_pfn(i), altmap);
+		unsigned long pfns;
+
+		pfns = min(nr_pages, PAGES_PER_SECTION
+				- (pfn & ~PAGE_SECTION_MASK));
+		err = __add_section(nid, pfn, pfns, altmap);
+		pfn += pfns;
+		nr_pages -= pfns;
 
 		/*
 		 * EEXIST is finally dealt with by ioresource collision
@@ -309,7 +342,6 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
 		cond_resched();
 	}
 	vmemmap_populate_print_last();
-out:
 	return err;
 }
 
@@ -487,10 +519,10 @@ static void shrink_pgdat_span(struct pglist_data *pgdat,
 	pgdat->node_spanned_pages = 0;
 }
 
-static void __remove_zone(struct zone *zone, unsigned long start_pfn)
+static void __remove_zone(struct zone *zone, unsigned long start_pfn,
+		unsigned long nr_pages)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
-	int nr_pages = PAGES_PER_SECTION;
 	unsigned long flags;
 
 	pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -499,27 +531,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
 }
 
-static void __remove_section(struct zone *zone, struct mem_section *ms,
-			     unsigned long map_offset,
-			     struct vmem_altmap *altmap)
+static void __remove_section(struct zone *zone, unsigned long pfn,
+		unsigned long nr_pages, unsigned long map_offset,
+		struct vmem_altmap *altmap)
 {
-	unsigned long start_pfn;
-	int scn_nr;
+	struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
 
 	if (WARN_ON_ONCE(!valid_section(ms)))
 		return;
 
-	scn_nr = __section_nr(ms);
-	start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
-	__remove_zone(zone, start_pfn);
-
-	sparse_remove_one_section(ms, map_offset, altmap);
+	__remove_zone(zone, pfn, nr_pages);
+	sparse_remove_one_section(ms, pfn, nr_pages, map_offset, altmap);
 }
 
 /**
  * __remove_pages() - remove sections of pages from a zone
  * @zone: zone from which pages need to be removed
- * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
+ * @pfn: starting pageframe (must be aligned to start of a section)
  * @nr_pages: number of pages to remove (must be multiple of section size)
  * @altmap: alternative device page map or %NULL if default memmap is used
  *
@@ -528,30 +556,30 @@ static void __remove_section(struct zone *zone, struct mem_section *ms,
  * sure that pages are marked reserved and zones are adjust properly by
  * calling offline_pages().
  */
-void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+void __remove_pages(struct zone *zone, unsigned long pfn,
 		    unsigned long nr_pages, struct vmem_altmap *altmap)
 {
-	unsigned long i;
 	unsigned long map_offset = 0;
-	int sections_to_remove;
+	int i, start_sec, end_sec;
 
 	map_offset = vmem_altmap_offset(altmap);
 
 	clear_zone_contiguous(zone);
 
-	/*
-	 * We can only remove entire sections
-	 */
-	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
-	BUG_ON(nr_pages % PAGES_PER_SECTION);
+	if (check_pfn_span(pfn, nr_pages, "remove"))
+		return;
 
-	sections_to_remove = nr_pages / PAGES_PER_SECTION;
-	for (i = 0; i < sections_to_remove; i++) {
-		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+	start_sec = pfn_to_section_nr(pfn);
+	end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+	for (i = start_sec; i <= end_sec; i++) {
+		unsigned long pfns;
 
 		cond_resched();
-		__remove_section(zone, __pfn_to_section(pfn), map_offset,
-				 altmap);
+		pfns = min(nr_pages, PAGES_PER_SECTION
+				- (pfn & ~PAGE_SECTION_MASK));
+		__remove_section(zone, pfn, pfns, map_offset, altmap);
+		pfn += pfns;
+		nr_pages -= pfns;
 		map_offset = 0;
 	}
 
diff --git a/mm/sparse.c b/mm/sparse.c
index 6b01022e23a9..41579b66fff1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -728,8 +728,8 @@ static void free_map_bootmem(struct page *memmap)
  * * -EEXIST	- Section has been present.
  * * -ENOMEM	- Out of memory.
  */
-int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
-				     struct vmem_altmap *altmap)
+int __meminit sparse_add_section(int nid, unsigned long start_pfn,
+		unsigned long nr_pages, struct vmem_altmap *altmap)
 {
 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
 	struct mem_section_usage *usage;
@@ -835,8 +835,9 @@ static void free_section_usage(struct mem_section *ms, struct page *memmap,
 		free_map_bootmem(memmap);
 }
 
-void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
-			       struct vmem_altmap *altmap)
+void sparse_remove_one_section(struct mem_section *ms, unsigned long pfn,
+		unsigned long nr_pages, unsigned long map_offset,
+		struct vmem_altmap *altmap)
 {
 	struct page *memmap = NULL;
 	struct mem_section_usage *usage = NULL;
@@ -849,10 +850,7 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
 		ms->usage = NULL;
 	}
 
-	clear_hwpoisoned_pages(memmap + map_offset,
-			PAGES_PER_SECTION - map_offset);
-	free_section_usage(ms, memmap, usage,
-			section_nr_to_pfn(__section_nr(ms)),
-			PAGES_PER_SECTION, altmap);
+	clear_hwpoisoned_pages(memmap + map_offset, nr_pages - map_offset);
+	free_section_usage(ms, memmap, usage, pfn, nr_pages, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From ba72b4c8cf60e452cf6f0258ed9ee697957b7dfd Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:26 -0700
Subject: mm/sparsemem: support sub-section hotplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The libnvdimm sub-system has suffered a series of hacks and broken
workarounds for the memory-hotplug implementation's awkward
section-aligned (128MB) granularity.

For example the following backtrace is emitted when attempting
arch_add_memory() with physical address ranges that intersect 'System
RAM' (RAM) with 'Persistent Memory' (PMEM) within a given section:

    # cat /proc/iomem | grep -A1 -B1 Persistent\ Memory
    100000000-1ffffffff : System RAM
    200000000-303ffffff : Persistent Memory (legacy)
    304000000-43fffffff : System RAM
    440000000-23ffffffff : Persistent Memory
    2400000000-43bfffffff : Persistent Memory
      2400000000-43bfffffff : namespace2.0

    WARNING: CPU: 38 PID: 928 at arch/x86/mm/init_64.c:850 add_pages+0x5c/0x60
    [..]
    RIP: 0010:add_pages+0x5c/0x60
    [..]
    Call Trace:
     devm_memremap_pages+0x460/0x6e0
     pmem_attach_disk+0x29e/0x680 [nd_pmem]
     ? nd_dax_probe+0xfc/0x120 [libnvdimm]
     nvdimm_bus_probe+0x66/0x160 [libnvdimm]

It was discovered that the problem goes beyond RAM vs PMEM collisions as
some platform produce PMEM vs PMEM collisions within a given section.
The libnvdimm workaround for that case revealed that the libnvdimm
section-alignment-padding implementation has been broken for a long
while.

A fix for that long-standing breakage introduces as many problems as it
solves as it would require a backward-incompatible change to the
namespace metadata interpretation.  Instead of that dubious route [1],
address the root problem in the memory-hotplug implementation.

Note that EEXIST is no longer treated as success as that is how
sparse_add_section() reports subsection collisions, it was also obviated
by recent changes to perform the request_region() for 'System RAM'
before arch_add_memory() in the add_memory() sequence.

[1] https://lore.kernel.org/r/155000671719.348031.2347363160141119237.stgit@dwillia2-desk3.amr.corp.intel.com

[osalvador@suse.de: fix deactivate_section for early sections]
  Link: http://lkml.kernel.org/r/20190715081549.32577-2-osalvador@suse.de
Link: http://lkml.kernel.org/r/156092354368.979959.6232443923440952359.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h |   2 +-
 mm/memory_hotplug.c            |  27 +-----
 mm/page_alloc.c                |   2 +-
 mm/sparse.c                    | 206 +++++++++++++++++++++++++++--------------
 4 files changed, 141 insertions(+), 96 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 2d636a7491a4..f46ea71b4ffd 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -348,7 +348,7 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 extern bool is_memblock_offlined(struct memory_block *mem);
 extern int sparse_add_section(int nid, unsigned long pfn,
 		unsigned long nr_pages, struct vmem_altmap *altmap);
-extern void sparse_remove_one_section(struct mem_section *ms,
+extern void sparse_remove_section(struct mem_section *ms,
 		unsigned long pfn, unsigned long nr_pages,
 		unsigned long map_offset, struct vmem_altmap *altmap);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3fbb2cfab126..aafb71594ee3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -252,18 +252,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 }
 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
 
-static int __meminit __add_section(int nid, unsigned long pfn,
-		unsigned long nr_pages,	struct vmem_altmap *altmap)
-{
-	int ret;
-
-	if (pfn_valid(pfn))
-		return -EEXIST;
-
-	ret = sparse_add_section(nid, pfn, nr_pages, altmap);
-	return ret < 0 ? ret : 0;
-}
-
 static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
 		const char *reason)
 {
@@ -327,18 +315,11 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
 
 		pfns = min(nr_pages, PAGES_PER_SECTION
 				- (pfn & ~PAGE_SECTION_MASK));
-		err = __add_section(nid, pfn, pfns, altmap);
+		err = sparse_add_section(nid, pfn, pfns, altmap);
+		if (err)
+			break;
 		pfn += pfns;
 		nr_pages -= pfns;
-
-		/*
-		 * EEXIST is finally dealt with by ioresource collision
-		 * check. see add_memory() => register_memory_resource()
-		 * Warning will be printed if there is collision.
-		 */
-		if (err && (err != -EEXIST))
-			break;
-		err = 0;
 		cond_resched();
 	}
 	vmemmap_populate_print_last();
@@ -541,7 +522,7 @@ static void __remove_section(struct zone *zone, unsigned long pfn,
 		return;
 
 	__remove_zone(zone, pfn, nr_pages);
-	sparse_remove_one_section(ms, pfn, nr_pages, map_offset, altmap);
+	sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
 }
 
 /**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c74367a8eba..272c6de1bf4e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5974,7 +5974,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 		 * pfn out of zone.
 		 *
 		 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
-		 * because this is done early in sparse_add_one_section
+		 * because this is done early in section_activate()
 		 */
 		if (!(pfn & (pageblock_nr_pages - 1))) {
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
diff --git a/mm/sparse.c b/mm/sparse.c
index 41579b66fff1..a205a2ac66a4 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -83,8 +83,15 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
 	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
 	struct mem_section *section;
 
+	/*
+	 * An existing section is possible in the sub-section hotplug
+	 * case. First hot-add instantiates, follow-on hot-add reuses
+	 * the existing section.
+	 *
+	 * The mem_hotplug_lock resolves the apparent race below.
+	 */
 	if (mem_section[root])
-		return -EEXIST;
+		return 0;
 
 	section = sparse_index_alloc(nid);
 	if (!section)
@@ -715,10 +722,120 @@ static void free_map_bootmem(struct page *memmap)
 }
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
+static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
+		struct vmem_altmap *altmap)
+{
+	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+	struct mem_section *ms = __pfn_to_section(pfn);
+	bool section_is_early = early_section(ms);
+	struct page *memmap = NULL;
+	unsigned long *subsection_map = ms->usage
+		? &ms->usage->subsection_map[0] : NULL;
+
+	subsection_mask_set(map, pfn, nr_pages);
+	if (subsection_map)
+		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
+
+	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+				"section already deactivated (%#lx + %ld)\n",
+				pfn, nr_pages))
+		return;
+
+	/*
+	 * There are 3 cases to handle across two configurations
+	 * (SPARSEMEM_VMEMMAP={y,n}):
+	 *
+	 * 1/ deactivation of a partial hot-added section (only possible
+	 * in the SPARSEMEM_VMEMMAP=y case).
+	 *    a/ section was present at memory init
+	 *    b/ section was hot-added post memory init
+	 * 2/ deactivation of a complete hot-added section
+	 * 3/ deactivation of a complete section from memory init
+	 *
+	 * For 1/, when subsection_map does not empty we will not be
+	 * freeing the usage map, but still need to free the vmemmap
+	 * range.
+	 *
+	 * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
+	 */
+	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+		unsigned long section_nr = pfn_to_section_nr(pfn);
+
+		if (!section_is_early) {
+			kfree(ms->usage);
+			ms->usage = NULL;
+		}
+		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
+	}
+
+	if (section_is_early && memmap)
+		free_map_bootmem(memmap);
+	else
+		depopulate_section_memmap(pfn, nr_pages, altmap);
+}
+
+static struct page * __meminit section_activate(int nid, unsigned long pfn,
+		unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+	struct mem_section *ms = __pfn_to_section(pfn);
+	struct mem_section_usage *usage = NULL;
+	unsigned long *subsection_map;
+	struct page *memmap;
+	int rc = 0;
+
+	subsection_mask_set(map, pfn, nr_pages);
+
+	if (!ms->usage) {
+		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+		if (!usage)
+			return ERR_PTR(-ENOMEM);
+		ms->usage = usage;
+	}
+	subsection_map = &ms->usage->subsection_map[0];
+
+	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+		rc = -EINVAL;
+	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+		rc = -EEXIST;
+	else
+		bitmap_or(subsection_map, map, subsection_map,
+				SUBSECTIONS_PER_SECTION);
+
+	if (rc) {
+		if (usage)
+			ms->usage = NULL;
+		kfree(usage);
+		return ERR_PTR(rc);
+	}
+
+	/*
+	 * The early init code does not consider partially populated
+	 * initial sections, it simply assumes that memory will never be
+	 * referenced.  If we hot-add memory into such a section then we
+	 * do not need to populate the memmap and can simply reuse what
+	 * is already there.
+	 */
+	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
+		return pfn_to_page(pfn);
+
+	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
+	if (!memmap) {
+		section_deactivate(pfn, nr_pages, altmap);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return memmap;
+}
+
 /**
- * sparse_add_one_section - add a memory section
+ * sparse_add_section - add a memory section, or populate an existing one
  * @nid: The node to add section on
  * @start_pfn: start pfn of the memory range
+ * @nr_pages: number of pfns to add in the section
  * @altmap: device page map
  *
  * This is only intended for hotplug.
@@ -732,51 +849,34 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
 		unsigned long nr_pages, struct vmem_altmap *altmap)
 {
 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
-	struct mem_section_usage *usage;
 	struct mem_section *ms;
 	struct page *memmap;
 	int ret;
 
-	/*
-	 * no locking for this, because it does its own
-	 * plus, it does a kmalloc
-	 */
 	ret = sparse_index_init(section_nr, nid);
-	if (ret < 0 && ret != -EEXIST)
+	if (ret < 0)
 		return ret;
-	ret = 0;
-	memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid,
-			altmap);
-	if (!memmap)
-		return -ENOMEM;
-	usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
-	if (!usage) {
-		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
-		return -ENOMEM;
-	}
 
-	ms = __pfn_to_section(start_pfn);
-	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
-		ret = -EEXIST;
-		goto out;
-	}
+	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
+	if (IS_ERR(memmap))
+		return PTR_ERR(memmap);
 
 	/*
 	 * Poison uninitialized struct pages in order to catch invalid flags
 	 * combinations.
 	 */
-	page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
+	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
 
+	ms = __pfn_to_section(start_pfn);
 	set_section_nid(section_nr, nid);
 	section_mark_present(ms);
-	sparse_init_one_section(ms, section_nr, memmap, usage, 0);
 
-out:
-	if (ret < 0) {
-		kfree(usage);
-		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
-	}
-	return ret;
+	/* Align memmap to section boundary in the subsection case */
+	if (section_nr_to_pfn(section_nr) != start_pfn)
+		memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr));
+	sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
+
+	return 0;
 }
 
 #ifdef CONFIG_MEMORY_FAILURE
@@ -809,48 +909,12 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 }
 #endif
 
-static void free_section_usage(struct mem_section *ms, struct page *memmap,
-		struct mem_section_usage *usage, unsigned long pfn,
-		unsigned long nr_pages, struct vmem_altmap *altmap)
-{
-	if (!usage)
-		return;
-
-	/*
-	 * Check to see if allocation came from hot-plug-add
-	 */
-	if (!early_section(ms)) {
-		kfree(usage);
-		if (memmap)
-			depopulate_section_memmap(pfn, nr_pages, altmap);
-		return;
-	}
-
-	/*
-	 * The usemap came from bootmem. This is packed with other usemaps
-	 * on the section which has pgdat at boot time. Just keep it as is now.
-	 */
-
-	if (memmap)
-		free_map_bootmem(memmap);
-}
-
-void sparse_remove_one_section(struct mem_section *ms, unsigned long pfn,
+void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
 		unsigned long nr_pages, unsigned long map_offset,
 		struct vmem_altmap *altmap)
 {
-	struct page *memmap = NULL;
-	struct mem_section_usage *usage = NULL;
-
-	if (ms->section_mem_map) {
-		usage = ms->usage;
-		memmap = sparse_decode_mem_map(ms->section_mem_map,
-						__section_nr(ms));
-		ms->section_mem_map = 0;
-		ms->usage = NULL;
-	}
-
-	clear_hwpoisoned_pages(memmap + map_offset, nr_pages - map_offset);
-	free_section_usage(ms, memmap, usage, pfn, nr_pages, altmap);
+	clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
+			nr_pages - map_offset);
+	section_deactivate(pfn, nr_pages, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From a3619190d62ed9d66416891be2416f6bea2b3ca4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Jul 2019 15:58:40 -0700
Subject: libnvdimm/pfn: stop padding pmem namespaces to section alignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that the mm core supports section-unaligned hotplug of ZONE_DEVICE
memory, we no longer need to add padding at pfn/dax device creation
time.  The kernel will still honor padding established by older kernels.

Link: http://lkml.kernel.org/r/156092356588.979959.6793371748950931916.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Jeff Moyer <jmoyer@redhat.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>	[ppc64]
Cc: David Hildenbrand <david@redhat.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/nvdimm/pfn.h      | 14 ---------
 drivers/nvdimm/pfn_devs.c | 77 ++++++++---------------------------------------
 include/linux/mmzone.h    |  3 ++
 3 files changed, 16 insertions(+), 78 deletions(-)

(limited to 'include')

diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index dfb2bcda8f5a..7381673b7b70 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -33,18 +33,4 @@ struct nd_pfn_sb {
 	__le64 checksum;
 };
 
-#ifdef CONFIG_SPARSEMEM
-#define PFN_SECTION_ALIGN_DOWN(x) SECTION_ALIGN_DOWN(x)
-#define PFN_SECTION_ALIGN_UP(x) SECTION_ALIGN_UP(x)
-#else
-/*
- * In this case ZONE_DEVICE=n and we will disable 'pfn' device support,
- * but we still want pmem to compile.
- */
-#define PFN_SECTION_ALIGN_DOWN(x) (x)
-#define PFN_SECTION_ALIGN_UP(x) (x)
-#endif
-
-#define PHYS_SECTION_ALIGN_DOWN(x) PFN_PHYS(PFN_SECTION_ALIGN_DOWN(PHYS_PFN(x)))
-#define PHYS_SECTION_ALIGN_UP(x) PFN_PHYS(PFN_SECTION_ALIGN_UP(PHYS_PFN(x)))
 #endif /* __NVDIMM_PFN_H */
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 06f465c0baf3..df2bdbd22450 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -587,14 +587,14 @@ static u32 info_block_reserve(void)
 }
 
 /*
- * We hotplug memory at section granularity, pad the reserved area from
- * the previous section base to the namespace base address.
+ * We hotplug memory at sub-section granularity, pad the reserved area
+ * from the previous section base to the namespace base address.
  */
 static unsigned long init_altmap_base(resource_size_t base)
 {
 	unsigned long base_pfn = PHYS_PFN(base);
 
-	return PFN_SECTION_ALIGN_DOWN(base_pfn);
+	return SUBSECTION_ALIGN_DOWN(base_pfn);
 }
 
 static unsigned long init_altmap_reserve(resource_size_t base)
@@ -602,7 +602,7 @@ static unsigned long init_altmap_reserve(resource_size_t base)
 	unsigned long reserve = info_block_reserve() >> PAGE_SHIFT;
 	unsigned long base_pfn = PHYS_PFN(base);
 
-	reserve += base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn);
+	reserve += base_pfn - SUBSECTION_ALIGN_DOWN(base_pfn);
 	return reserve;
 }
 
@@ -632,8 +632,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 			return -EINVAL;
 		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
 	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
-		nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res)
-					- offset) / PAGE_SIZE);
+		nd_pfn->npfns = PHYS_PFN((resource_size(res) - offset));
 		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
 			dev_info(&nd_pfn->dev,
 					"number of pfns truncated from %lld to %ld\n",
@@ -649,54 +648,14 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 	return 0;
 }
 
-static u64 phys_pmem_align_down(struct nd_pfn *nd_pfn, u64 phys)
-{
-	return min_t(u64, PHYS_SECTION_ALIGN_DOWN(phys),
-			ALIGN_DOWN(phys, nd_pfn->align));
-}
-
-/*
- * Check if pmem collides with 'System RAM', or other regions when
- * section aligned.  Trim it accordingly.
- */
-static void trim_pfn_device(struct nd_pfn *nd_pfn, u32 *start_pad, u32 *end_trunc)
-{
-	struct nd_namespace_common *ndns = nd_pfn->ndns;
-	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
-	struct nd_region *nd_region = to_nd_region(nd_pfn->dev.parent);
-	const resource_size_t start = nsio->res.start;
-	const resource_size_t end = start + resource_size(&nsio->res);
-	resource_size_t adjust, size;
-
-	*start_pad = 0;
-	*end_trunc = 0;
-
-	adjust = start - PHYS_SECTION_ALIGN_DOWN(start);
-	size = resource_size(&nsio->res) + adjust;
-	if (region_intersects(start - adjust, size, IORESOURCE_SYSTEM_RAM,
-				IORES_DESC_NONE) == REGION_MIXED
-			|| nd_region_conflict(nd_region, start - adjust, size))
-		*start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
-
-	/* Now check that end of the range does not collide. */
-	adjust = PHYS_SECTION_ALIGN_UP(end) - end;
-	size = resource_size(&nsio->res) + adjust;
-	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
-				IORES_DESC_NONE) == REGION_MIXED
-			|| !IS_ALIGNED(end, nd_pfn->align)
-			|| nd_region_conflict(nd_region, start, size))
-		*end_trunc = end - phys_pmem_align_down(nd_pfn, end);
-}
-
 static int nd_pfn_init(struct nd_pfn *nd_pfn)
 {
 	struct nd_namespace_common *ndns = nd_pfn->ndns;
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
-	u32 start_pad, end_trunc, reserve = info_block_reserve();
 	resource_size_t start, size;
 	struct nd_region *nd_region;
+	unsigned long npfns, align;
 	struct nd_pfn_sb *pfn_sb;
-	unsigned long npfns;
 	phys_addr_t offset;
 	const char *sig;
 	u64 checksum;
@@ -727,43 +686,35 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 		return -ENXIO;
 	}
 
-	memset(pfn_sb, 0, sizeof(*pfn_sb));
-
-	trim_pfn_device(nd_pfn, &start_pad, &end_trunc);
-	if (start_pad + end_trunc)
-		dev_info(&nd_pfn->dev, "%s alignment collision, truncate %d bytes\n",
-				dev_name(&ndns->dev), start_pad + end_trunc);
-
 	/*
 	 * Note, we use 64 here for the standard size of struct page,
 	 * debugging options may cause it to be larger in which case the
 	 * implementation will limit the pfns advertised through
 	 * ->direct_access() to those that are included in the memmap.
 	 */
-	start = nsio->res.start + start_pad;
+	start = nsio->res.start;
 	size = resource_size(&nsio->res);
-	npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - reserve)
-			/ PAGE_SIZE);
+	npfns = PHYS_PFN(size - SZ_8K);
+	align = max(nd_pfn->align, (1UL << SUBSECTION_SHIFT));
 	if (nd_pfn->mode == PFN_MODE_PMEM) {
 		/*
 		 * The altmap should be padded out to the block size used
 		 * when populating the vmemmap. This *should* be equal to
 		 * PMD_SIZE for most architectures.
 		 */
-		offset = ALIGN(start + reserve + 64 * npfns,
-				max(nd_pfn->align, PMD_SIZE)) - start;
+		offset = ALIGN(start + SZ_8K + 64 * npfns, align) - start;
 	} else if (nd_pfn->mode == PFN_MODE_RAM)
-		offset = ALIGN(start + reserve, nd_pfn->align) - start;
+		offset = ALIGN(start + SZ_8K, align) - start;
 	else
 		return -ENXIO;
 
-	if (offset + start_pad + end_trunc >= size) {
+	if (offset >= size) {
 		dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n",
 				dev_name(&ndns->dev));
 		return -ENXIO;
 	}
 
-	npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
+	npfns = PHYS_PFN(size - offset);
 	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
 	pfn_sb->dataoff = cpu_to_le64(offset);
 	pfn_sb->npfns = cpu_to_le64(npfns);
@@ -772,8 +723,6 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
 	pfn_sb->version_major = cpu_to_le16(1);
 	pfn_sb->version_minor = cpu_to_le16(3);
-	pfn_sb->start_pad = cpu_to_le32(start_pad);
-	pfn_sb->end_trunc = cpu_to_le32(end_trunc);
 	pfn_sb->align = cpu_to_le32(nd_pfn->align);
 	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
 	pfn_sb->checksum = cpu_to_le64(checksum);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8331e76677c0..d77d717c620c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1160,6 +1160,9 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
 #endif
 
+#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
+#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
+
 struct mem_section_usage {
 	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
 	/* See declaration of similar field in struct zone */
-- 
cgit v1.2.3


From 371096949f0ad3950b06729989bd27de51b8c5f5 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Thu, 18 Jul 2019 15:58:46 -0700
Subject: mm: migrate: remove unused mode argument

migrate_page_move_mapping() doesn't use the mode argument.  Remove it
and update callers accordingly.

Link: http://lkml.kernel.org/r/20190508210301.8472-1-keith.busch@intel.com
Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c                | 2 +-
 fs/f2fs/data.c          | 2 +-
 fs/iomap.c              | 2 +-
 fs/ubifs/file.c         | 2 +-
 include/linux/migrate.h | 3 +--
 mm/migrate.c            | 7 +++----
 6 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/aio.c b/fs/aio.c
index 8327db0c8e08..8b3aa2739906 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -425,7 +425,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	BUG_ON(PageWriteback(old));
 	get_page(new);
 
-	rc = migrate_page_move_mapping(mapping, new, old, mode, 1);
+	rc = migrate_page_move_mapping(mapping, new, old, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		put_page(new);
 		goto out_unlock;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 4eb2f3920140..abbf14e9bd72 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2919,7 +2919,7 @@ int f2fs_migrate_page(struct address_space *mapping,
 	/* one extra reference was held for atomic_write page */
 	extra_count = atomic_written ? 1 : 0;
 	rc = migrate_page_move_mapping(mapping, newpage,
-				page, mode, extra_count);
+				page, extra_count);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		if (atomic_written)
 			mutex_unlock(&fi->inmem_lock);
diff --git a/fs/iomap.c b/fs/iomap.c
index 217c3e5a13d6..3e7f16a05653 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -566,7 +566,7 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
 {
 	int ret;
 
-	ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
 	if (ret != MIGRATEPAGE_SUCCESS)
 		return ret;
 
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e5f8de62fc51..400970d740bb 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1470,7 +1470,7 @@ static int ubifs_migrate_page(struct address_space *mapping,
 {
 	int rc;
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
 
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index e13d9bf2f9a5..7f04754c7f2b 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -77,8 +77,7 @@ extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode,
-		int extra_count);
+		struct page *newpage, struct page *page, int extra_count);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
diff --git a/mm/migrate.c b/mm/migrate.c
index 3445747e229d..8992741f10aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -394,8 +394,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
 int migrate_page_move_mapping(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode,
-		int extra_count)
+		struct page *newpage, struct page *page, int extra_count)
 {
 	XA_STATE(xas, &mapping->i_pages, page_index(page));
 	struct zone *oldzone, *newzone;
@@ -681,7 +680,7 @@ int migrate_page(struct address_space *mapping,
 
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
 
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
@@ -780,7 +779,7 @@ recheck_buffers:
 		}
 	}
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		goto unlock_buffers;
 
-- 
cgit v1.2.3


From eec4844fae7c033a0c1fc1eb3b8517aeb8b6cc49 Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@redhat.com>
Date: Thu, 18 Jul 2019 15:58:50 -0700
Subject: proc/sysctl: add shared variables for range check

In the sysctl code the proc_dointvec_minmax() function is often used to
validate the user supplied value between an allowed range.  This
function uses the extra1 and extra2 members from struct ctl_table as
minimum and maximum allowed value.

On sysctl handler declaration, in every source file there are some
readonly variables containing just an integer which address is assigned
to the extra1 and extra2 members, so the sysctl range is enforced.

The special values 0, 1 and INT_MAX are very often used as range
boundary, leading duplication of variables like zero=0, one=1,
int_max=INT_MAX in different source files:

    $ git grep -E '\.extra[12].*&(zero|one|int_max)' |wc -l
    248

Add a const int array containing the most commonly used values, some
macros to refer more easily to the correct array member, and use them
instead of creating a local one for every object file.

This is the bloat-o-meter output comparing the old and new binary
compiled with the default Fedora config:

    # scripts/bloat-o-meter -d vmlinux.o.old vmlinux.o
    add/remove: 2/2 grow/shrink: 0/2 up/down: 24/-188 (-164)
    Data                                         old     new   delta
    sysctl_vals                                    -      12     +12
    __kstrtab_sysctl_vals                          -      12     +12
    max                                           14      10      -4
    int_max                                       16       -     -16
    one                                           68       -     -68
    zero                                         128      28    -100
    Total: Before=20583249, After=20583085, chg -0.00%

[mcroce@redhat.com: tipc: remove two unused variables]
  Link: http://lkml.kernel.org/r/20190530091952.4108-1-mcroce@redhat.com
[akpm@linux-foundation.org: fix net/ipv6/sysctl_net_ipv6.c]
[arnd@arndb.de: proc/sysctl: make firmware loader table conditional]
  Link: http://lkml.kernel.org/r/20190617130014.1713870-1-arnd@arndb.de
[akpm@linux-foundation.org: fix fs/eventpoll.c]
Link: http://lkml.kernel.org/r/20190430180111.10688-1-mcroce@redhat.com
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/appldata/appldata_base.c            |  15 +-
 arch/s390/kernel/topology.c                   |   6 +-
 arch/x86/entry/vdso/vdso32-setup.c            |   7 +-
 arch/x86/kernel/itmt.c                        |   6 +-
 drivers/base/firmware_loader/fallback_table.c |  13 +-
 drivers/gpu/drm/i915/i915_perf.c              |   8 +-
 drivers/hv/vmbus_drv.c                        |   6 +-
 drivers/tty/tty_ldisc.c                       |   6 +-
 drivers/xen/balloon.c                         |   7 +-
 fs/eventpoll.c                                |   4 +-
 fs/notify/inotify/inotify_user.c              |   8 +-
 fs/proc/proc_sysctl.c                         |   4 +
 include/linux/sysctl.h                        |   7 +
 ipc/ipc_sysctl.c                              |  35 +++--
 kernel/pid_namespace.c                        |   3 +-
 kernel/sysctl.c                               | 197 +++++++++++++-------------
 kernel/ucount.c                               |   6 +-
 net/core/neighbour.c                          |  20 ++-
 net/core/sysctl_net_core.c                    |  34 +++--
 net/dccp/sysctl.c                             |  16 +--
 net/ipv4/sysctl_net_ipv4.c                    |  60 ++++----
 net/ipv6/addrconf.c                           |   6 +-
 net/ipv6/route.c                              |   7 +-
 net/ipv6/sysctl_net_ipv6.c                    |  10 +-
 net/mpls/af_mpls.c                            |  10 +-
 net/netfilter/ipvs/ip_vs_ctl.c                |   3 +-
 net/rxrpc/sysctl.c                            |   9 +-
 net/sctp/sysctl.c                             |  35 +++--
 net/sunrpc/xprtrdma/transport.c               |   3 +-
 net/tipc/sysctl.c                             |   6 +-
 security/keys/sysctl.c                        |  26 ++--
 security/loadpin/loadpin.c                    |   6 +-
 security/yama/yama_lsm.c                      |   3 +-
 33 files changed, 270 insertions(+), 322 deletions(-)

(limited to 'include')

diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index e4b58240ec53..aa738cad1338 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -220,15 +220,13 @@ appldata_timer_handler(struct ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int timer_active = appldata_timer_active;
-	int zero = 0;
-	int one = 1;
 	int rc;
 	struct ctl_table ctl_entry = {
 		.procname	= ctl->procname,
 		.data		= &timer_active,
 		.maxlen		= sizeof(int),
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	};
 
 	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
@@ -255,13 +253,12 @@ appldata_interval_handler(struct ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int interval = appldata_interval;
-	int one = 1;
 	int rc;
 	struct ctl_table ctl_entry = {
 		.procname	= ctl->procname,
 		.data		= &interval,
 		.maxlen		= sizeof(int),
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	};
 
 	rc = proc_dointvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
@@ -289,13 +286,11 @@ appldata_generic_handler(struct ctl_table *ctl, int write,
 	struct list_head *lh;
 	int rc, found;
 	int active;
-	int zero = 0;
-	int one = 1;
 	struct ctl_table ctl_entry = {
 		.data		= &active,
 		.maxlen		= sizeof(int),
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	};
 
 	found = 0;
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 8964a3f60aad..2db6fb405a9a 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -587,15 +587,13 @@ static int topology_ctl_handler(struct ctl_table *ctl, int write,
 {
 	int enabled = topology_is_enabled();
 	int new_mode;
-	int zero = 0;
-	int one = 1;
 	int rc;
 	struct ctl_table ctl_entry = {
 		.procname	= ctl->procname,
 		.data		= &enabled,
 		.maxlen		= sizeof(int),
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	};
 
 	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 42d4c89f990e..240626e7f55a 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -65,9 +65,6 @@ subsys_initcall(sysenter_setup);
 /* Register vsyscall32 into the ABI table */
 #include <linux/sysctl.h>
 
-static const int zero;
-static const int one = 1;
-
 static struct ctl_table abi_table2[] = {
 	{
 		.procname	= "vsyscall32",
@@ -75,8 +72,8 @@ static struct ctl_table abi_table2[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (int *)&zero,
-		.extra2		= (int *)&one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{}
 };
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 838cf8a32c49..1cb3ca9bba49 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -65,8 +65,6 @@ static int sched_itmt_update_handler(struct ctl_table *table, int write,
 	return ret;
 }
 
-static unsigned int zero;
-static unsigned int one = 1;
 static struct ctl_table itmt_kern_table[] = {
 	{
 		.procname	= "sched_itmt_enabled",
@@ -74,8 +72,8 @@ static struct ctl_table itmt_kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= sched_itmt_update_handler,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{}
 };
diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c
index 776dd69cf5be..ba9d30b28edc 100644
--- a/drivers/base/firmware_loader/fallback_table.c
+++ b/drivers/base/firmware_loader/fallback_table.c
@@ -16,9 +16,6 @@
  * firmware fallback configuration table
  */
 
-static unsigned int zero;
-static unsigned int one = 1;
-
 struct firmware_fallback_config fw_fallback_config = {
 	.force_sysfs_fallback = IS_ENABLED(CONFIG_FW_LOADER_USER_HELPER_FALLBACK),
 	.loading_timeout = 60,
@@ -26,6 +23,7 @@ struct firmware_fallback_config fw_fallback_config = {
 };
 EXPORT_SYMBOL_GPL(fw_fallback_config);
 
+#ifdef CONFIG_SYSCTL
 struct ctl_table firmware_config_table[] = {
 	{
 		.procname	= "force_sysfs_fallback",
@@ -33,8 +31,8 @@ struct ctl_table firmware_config_table[] = {
 		.maxlen         = sizeof(unsigned int),
 		.mode           = 0644,
 		.proc_handler   = proc_douintvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "ignore_sysfs_fallback",
@@ -42,9 +40,10 @@ struct ctl_table firmware_config_table[] = {
 		.maxlen         = sizeof(unsigned int),
 		.mode           = 0644,
 		.proc_handler   = proc_douintvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{ }
 };
 EXPORT_SYMBOL_GPL(firmware_config_table);
+#endif
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 3d8162d28730..a700c5c3d167 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -274,8 +274,6 @@
 #define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY)
 
 /* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */
-static int zero;
-static int one = 1;
 static u32 i915_perf_stream_paranoid = true;
 
 /* The maximum exponent the hardware accepts is 63 (essentially it selects one
@@ -3366,8 +3364,8 @@ static struct ctl_table oa_table[] = {
 	 .maxlen = sizeof(i915_perf_stream_paranoid),
 	 .mode = 0644,
 	 .proc_handler = proc_dointvec_minmax,
-	 .extra1 = &zero,
-	 .extra2 = &one,
+	 .extra1 = SYSCTL_ZERO,
+	 .extra2 = SYSCTL_ONE,
 	 },
 	{
 	 .procname = "oa_max_sample_rate",
@@ -3375,7 +3373,7 @@ static struct ctl_table oa_table[] = {
 	 .maxlen = sizeof(i915_oa_max_sample_rate),
 	 .mode = 0644,
 	 .proc_handler = proc_dointvec_minmax,
-	 .extra1 = &zero,
+	 .extra1 = SYSCTL_ZERO,
 	 .extra2 = &oa_sample_rate_hard_limit,
 	 },
 	{}
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 894da5abdc55..ebd35fc35290 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1197,8 +1197,6 @@ static struct kmsg_dumper hv_kmsg_dumper = {
 };
 
 static struct ctl_table_header *hv_ctl_table_hdr;
-static int zero;
-static int one = 1;
 
 /*
  * sysctl option to allow the user to control whether kmsg data should be
@@ -1211,8 +1209,8 @@ static struct ctl_table hv_ctl_table[] = {
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
 	},
 	{}
 };
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index fde8d4073e74..4c49f53afa3e 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -855,8 +855,6 @@ void tty_ldisc_deinit(struct tty_struct *tty)
 	tty->ldisc = NULL;
 }
 
-static int zero;
-static int one = 1;
 static struct ctl_table tty_table[] = {
 	{
 		.procname	= "ldisc_autoload",
@@ -864,8 +862,8 @@ static struct ctl_table tty_table[] = {
 		.maxlen		= sizeof(tty_ldisc_autoload),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{ }
 };
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index d37dd5bb7a8f..37a36c6b9f93 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -77,9 +77,6 @@ static int xen_hotplug_unpopulated;
 
 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
 
-static int zero;
-static int one = 1;
-
 static struct ctl_table balloon_table[] = {
 	{
 		.procname	= "hotplug_unpopulated",
@@ -87,8 +84,8 @@ static struct ctl_table balloon_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &zero,
-		.extra2         = &one,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
 	},
 	{ }
 };
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 0f9c073d78d5..d7f1f5011fac 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -291,7 +291,7 @@ static LIST_HEAD(tfile_check_list);
 
 #include <linux/sysctl.h>
 
-static long zero;
+static long long_zero;
 static long long_max = LONG_MAX;
 
 struct ctl_table epoll_table[] = {
@@ -301,7 +301,7 @@ struct ctl_table epoll_table[] = {
 		.maxlen		= sizeof(max_user_watches),
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
-		.extra1		= &zero,
+		.extra1		= &long_zero,
 		.extra2		= &long_max,
 	},
 	{ }
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index cce8de32779f..0b815178126e 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -45,8 +45,6 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 
 #include <linux/sysctl.h>
 
-static int zero;
-
 struct ctl_table inotify_table[] = {
 	{
 		.procname	= "max_user_instances",
@@ -54,7 +52,7 @@ struct ctl_table inotify_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "max_user_watches",
@@ -62,7 +60,7 @@ struct ctl_table inotify_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "max_queued_events",
@@ -70,7 +68,7 @@ struct ctl_table inotify_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero
+		.extra1		= SYSCTL_ZERO
 	},
 	{ }
 };
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 36ad1b0d6259..d80989b6c344 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -22,6 +22,10 @@ static const struct inode_operations proc_sys_inode_operations;
 static const struct file_operations proc_sys_dir_file_operations;
 static const struct inode_operations proc_sys_dir_operations;
 
+/* shared constants to be used in various sysctls */
+const int sysctl_vals[] = { 0, 1, INT_MAX };
+EXPORT_SYMBOL(sysctl_vals);
+
 /* Support for permanently empty directories */
 
 struct ctl_table sysctl_mount_point[] = {
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index aadd310769d0..6df477329b76 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -37,6 +37,13 @@ struct ctl_table_root;
 struct ctl_table_header;
 struct ctl_dir;
 
+/* Keep the same order as in fs/proc/proc_sysctl.c */
+#define SYSCTL_ZERO	((void *)&sysctl_vals[0])
+#define SYSCTL_ONE	((void *)&sysctl_vals[1])
+#define SYSCTL_INT_MAX	((void *)&sysctl_vals[2])
+
+extern const int sysctl_vals[];
+
 typedef int proc_handler (struct ctl_table *ctl, int write,
 			  void __user *buffer, size_t *lenp, loff_t *ppos);
 
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 2b14ce8ce73f..affd66537e87 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -113,9 +113,6 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 #define proc_ipc_sem_dointvec	   NULL
 #endif
 
-static int zero;
-static int one = 1;
-static int int_max = INT_MAX;
 int ipc_mni = IPCMNI;
 int ipc_mni_shift = IPCMNI_SHIFT;
 int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
@@ -141,7 +138,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.shm_ctlmni),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ipc_mni,
 	},
 	{
@@ -150,8 +147,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "msgmax",
@@ -159,8 +156,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmax),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "msgmni",
@@ -168,7 +165,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmni),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ipc_mni,
 	},
 	{
@@ -177,8 +174,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_auto_msgmni,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	=  "msgmnb",
@@ -186,8 +183,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmnb),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "sem",
@@ -203,8 +200,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "msg_next_id",
@@ -212,8 +209,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "shm_next_id",
@@ -221,8 +218,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 #endif
 	{}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6d726cef241c..a6a79f85c81a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -291,14 +291,13 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 }
 
 extern int pid_max;
-static int zero = 0;
 static struct ctl_table pid_ns_ctl_table[] = {
 	{
 		.procname = "ns_last_pid",
 		.maxlen = sizeof(int),
 		.mode = 0666, /* permissions are checked in the handler */
 		.proc_handler = pid_ns_ctl_handler,
-		.extra1 = &zero,
+		.extra1 = SYSCTL_ZERO,
 		.extra2 = &pid_max,
 	},
 	{ }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 43186ccfa139..078950d9605b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -125,9 +125,6 @@ static int sixty = 60;
 #endif
 
 static int __maybe_unused neg_one = -1;
-
-static int zero;
-static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
 static unsigned long zero_ul;
@@ -385,8 +382,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= sysctl_schedstats,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_SCHEDSTATS */
 #endif /* CONFIG_SMP */
@@ -418,7 +415,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "numa_balancing",
@@ -426,8 +423,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= sysctl_numa_balancing,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
@@ -475,8 +472,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -486,7 +483,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 #endif
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
@@ -496,8 +493,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= sched_energy_aware_handler,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_PROVE_LOCKING
@@ -562,7 +559,7 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &neg_one,
-		.extra2		= &one,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_LATENCYTOP
@@ -696,8 +693,8 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		/* only handle a transition from default "0" to "1" */
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_MODULES
@@ -715,8 +712,8 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		/* only handle a transition from default "0" to "1" */
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_UEVENT_HELPER
@@ -875,7 +872,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ten_thousand,
 	},
 	{
@@ -891,8 +888,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax_sysadmin,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "kptr_restrict",
@@ -900,7 +897,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax_sysadmin,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 #endif
@@ -925,8 +922,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler   = proc_watchdog,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "watchdog_thresh",
@@ -934,7 +931,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_watchdog_thresh,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &sixty,
 	},
 	{
@@ -943,8 +940,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= NMI_WATCHDOG_SYSCTL_PERM,
 		.proc_handler   = proc_nmi_watchdog,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "watchdog_cpumask",
@@ -960,8 +957,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler   = proc_soft_watchdog,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "softlockup_panic",
@@ -969,8 +966,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #ifdef CONFIG_SMP
 	{
@@ -979,8 +976,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_SMP */
 #endif
@@ -991,8 +988,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #ifdef CONFIG_SMP
 	{
@@ -1001,8 +998,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_SMP */
 #endif
@@ -1115,8 +1112,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "hung_task_check_count",
@@ -1124,7 +1121,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "hung_task_timeout_secs",
@@ -1201,7 +1198,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
 		.mode		= 0644,
 		.proc_handler	= perf_proc_update_handler,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "perf_cpu_time_max_percent",
@@ -1209,7 +1206,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_perf_cpu_time_max_percent),
 		.mode		= 0644,
 		.proc_handler	= perf_cpu_time_max_percent_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 	{
@@ -1218,7 +1215,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_perf_event_max_stack),
 		.mode		= 0644,
 		.proc_handler	= perf_event_max_stack_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &six_hundred_forty_kb,
 	},
 	{
@@ -1227,7 +1224,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_perf_event_max_contexts_per_stack),
 		.mode		= 0644,
 		.proc_handler	= perf_event_max_stack_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_thousand,
 	},
 #endif
@@ -1237,8 +1234,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 	{
@@ -1247,8 +1244,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= timer_migration_handler,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_BPF_SYSCALL
@@ -1259,8 +1256,8 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		/* only handle a transition from default "0" to "1" */
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "bpf_stats_enabled",
@@ -1277,8 +1274,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_panic_on_rcu_stall),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
@@ -1288,8 +1285,8 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= stack_erasing_sysctl,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{ }
@@ -1302,7 +1299,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_overcommit_memory),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -1311,7 +1308,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_panic_on_oom),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -1348,7 +1345,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "dirty_background_ratio",
@@ -1356,7 +1353,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(dirty_background_ratio),
 		.mode		= 0644,
 		.proc_handler	= dirty_background_ratio_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 	{
@@ -1373,7 +1370,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(vm_dirty_ratio),
 		.mode		= 0644,
 		.proc_handler	= dirty_ratio_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 	{
@@ -1397,7 +1394,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "dirtytime_expire_seconds",
@@ -1405,7 +1402,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(dirtytime_expire_interval),
 		.mode		= 0644,
 		.proc_handler	= dirtytime_interval_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "swappiness",
@@ -1413,7 +1410,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(vm_swappiness),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 #ifdef CONFIG_HUGETLB_PAGE
@@ -1438,8 +1435,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen			= sizeof(int),
 		.mode			= 0644,
 		.proc_handler	= sysctl_vm_numa_stat_handler,
-		.extra1			= &zero,
-		.extra2			= &one,
+		.extra1			= SYSCTL_ZERO,
+		.extra2			= SYSCTL_ONE,
 	},
 #endif
 	 {
@@ -1470,7 +1467,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= drop_caches_sysctl_handler,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &four,
 	},
 #ifdef CONFIG_COMPACTION
@@ -1496,8 +1493,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 
 #endif /* CONFIG_COMPACTION */
@@ -1507,7 +1504,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(min_free_kbytes),
 		.mode		= 0644,
 		.proc_handler	= min_free_kbytes_sysctl_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "watermark_boost_factor",
@@ -1515,7 +1512,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(watermark_boost_factor),
 		.mode		= 0644,
 		.proc_handler	= watermark_boost_factor_sysctl_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "watermark_scale_factor",
@@ -1523,7 +1520,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(watermark_scale_factor),
 		.mode		= 0644,
 		.proc_handler	= watermark_scale_factor_sysctl_handler,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &one_thousand,
 	},
 	{
@@ -1532,7 +1529,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(percpu_pagelist_fraction),
 		.mode		= 0644,
 		.proc_handler	= percpu_pagelist_fraction_sysctl_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #ifdef CONFIG_MMU
 	{
@@ -1541,7 +1538,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_max_map_count),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #else
 	{
@@ -1550,7 +1547,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_nr_trim_pages),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #endif
 	{
@@ -1566,7 +1563,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(block_dump),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "vfs_cache_pressure",
@@ -1574,7 +1571,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_vfs_cache_pressure),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 	{
@@ -1583,7 +1580,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_legacy_va_layout),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #endif
 #ifdef CONFIG_NUMA
@@ -1593,7 +1590,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(node_reclaim_mode),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "min_unmapped_ratio",
@@ -1601,7 +1598,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_min_unmapped_ratio),
 		.mode		= 0644,
 		.proc_handler	= sysctl_min_unmapped_ratio_sysctl_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 	{
@@ -1610,7 +1607,7 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_min_slab_ratio),
 		.mode		= 0644,
 		.proc_handler	= sysctl_min_slab_ratio_sysctl_handler,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
 #endif
@@ -1661,7 +1658,7 @@ static struct ctl_table vm_table[] = {
 #endif
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #endif
 #ifdef CONFIG_HIGHMEM
@@ -1671,8 +1668,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(vm_highmem_is_dirtyable),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 #ifdef CONFIG_MEMORY_FAILURE
@@ -1682,8 +1679,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "memory_failure_recovery",
@@ -1691,8 +1688,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_memory_failure_recovery),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{
@@ -1738,8 +1735,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{ }
@@ -1875,8 +1872,8 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "protected_hardlinks",
@@ -1884,8 +1881,8 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "protected_fifos",
@@ -1893,7 +1890,7 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -1902,7 +1899,7 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -1911,7 +1908,7 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax_coredump,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
@@ -1948,7 +1945,7 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 	{ }
 };
@@ -1970,8 +1967,8 @@ static struct ctl_table debug_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_kprobes_optimization_handler,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{ }
@@ -3395,8 +3392,8 @@ int proc_do_static_key(struct ctl_table *table, int write,
 		.data   = &val,
 		.maxlen = sizeof(val),
 		.mode   = table->mode,
-		.extra1 = &zero,
-		.extra2 = &one,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_ONE,
 	};
 
 	if (write && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/ucount.c b/kernel/ucount.c
index feb128c7b5d9..a53cc2b4179c 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -52,16 +52,14 @@ static struct ctl_table_root set_root = {
 	.permissions = set_permissions,
 };
 
-static int zero = 0;
-static int int_max = INT_MAX;
 #define UCOUNT_ENTRY(name)				\
 	{						\
 		.procname	= name,			\
 		.maxlen		= sizeof(int),		\
 		.mode		= 0644,			\
 		.proc_handler	= proc_dointvec_minmax,	\
-		.extra1		= &zero,		\
-		.extra2		= &int_max,		\
+		.extra1		= SYSCTL_ZERO,		\
+		.extra2		= SYSCTL_INT_MAX,	\
 	}
 static struct ctl_table user_table[] = {
 	UCOUNT_ENTRY("max_user_namespaces"),
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 742cea4ce72e..26da97359d5b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -3374,8 +3374,6 @@ void neigh_app_ns(struct neighbour *n)
 EXPORT_SYMBOL(neigh_app_ns);
 
 #ifdef CONFIG_SYSCTL
-static int zero;
-static int int_max = INT_MAX;
 static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
 
 static int proc_unres_qlen(struct ctl_table *ctl, int write,
@@ -3384,7 +3382,7 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write,
 	int size, ret;
 	struct ctl_table tmp = *ctl;
 
-	tmp.extra1 = &zero;
+	tmp.extra1 = SYSCTL_ZERO;
 	tmp.extra2 = &unres_qlen_max;
 	tmp.data = &size;
 
@@ -3449,8 +3447,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
 	struct ctl_table tmp = *ctl;
 	int ret;
 
-	tmp.extra1 = &zero;
-	tmp.extra2 = &int_max;
+	tmp.extra1 = SYSCTL_ZERO;
+	tmp.extra2 = SYSCTL_INT_MAX;
 
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 	neigh_proc_update(ctl, write);
@@ -3595,24 +3593,24 @@ static struct neigh_sysctl_table {
 			.procname	= "gc_thresh1",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
-			.extra1 	= &zero,
-			.extra2		= &int_max,
+			.extra1		= SYSCTL_ZERO,
+			.extra2		= SYSCTL_INT_MAX,
 			.proc_handler	= proc_dointvec_minmax,
 		},
 		[NEIGH_VAR_GC_THRESH2] = {
 			.procname	= "gc_thresh2",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
-			.extra1 	= &zero,
-			.extra2		= &int_max,
+			.extra1		= SYSCTL_ZERO,
+			.extra2		= SYSCTL_INT_MAX,
 			.proc_handler	= proc_dointvec_minmax,
 		},
 		[NEIGH_VAR_GC_THRESH3] = {
 			.procname	= "gc_thresh3",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
-			.extra1 	= &zero,
-			.extra2		= &int_max,
+			.extra1		= SYSCTL_ZERO,
+			.extra2		= SYSCTL_INT_MAX,
 			.proc_handler	= proc_dointvec_minmax,
 		},
 		{},
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index f9204719aeee..8da5b3a54dac 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -22,8 +22,6 @@
 #include <net/busy_poll.h>
 #include <net/pkt_sched.h>
 
-static int zero = 0;
-static int one = 1;
 static int two __maybe_unused = 2;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
@@ -390,10 +388,10 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax_bpf_enable,
 # ifdef CONFIG_BPF_JIT_ALWAYS_ON
-		.extra1		= &one,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_ONE,
 # else
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 # endif
 	},
@@ -404,7 +402,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -413,8 +411,8 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 # endif
 	{
@@ -461,8 +459,8 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
 	},
 #ifdef CONFIG_RPS
 	{
@@ -493,7 +491,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "busy_read",
@@ -501,7 +499,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #endif
 #ifdef CONFIG_NET_SCHED
@@ -533,7 +531,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &max_skb_frags,
 	},
 	{
@@ -542,7 +540,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "fb_tunnels_only_for_init_net",
@@ -550,8 +548,8 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "devconf_inherit_init_net",
@@ -559,7 +557,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -578,7 +576,7 @@ static struct ctl_table netns_core_table[] = {
 		.data		= &init_net.core.sysctl_somaxconn,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.proc_handler	= proc_dointvec_minmax
 	},
 	{ }
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index b59040f268a9..ee8d4f5afa72 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -16,9 +16,7 @@
 #endif
 
 /* Boundary values */
-static int		zero     = 0,
-			one      = 1,
-			u8_max   = 0xFF;
+static int		u8_max   = 0xFF;
 static unsigned long	seqw_min = DCCPF_SEQ_WMIN,
 			seqw_max = 0xFFFFFFFF;		/* maximum on 32 bit */
 
@@ -38,7 +36,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_rx_ccid),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &u8_max,		/* RFC 4340, 10. */
 	},
 	{
@@ -47,7 +45,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_tx_ccid),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &u8_max,		/* RFC 4340, 10. */
 	},
 	{
@@ -56,7 +54,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_request_retries),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &u8_max,
 	},
 	{
@@ -65,7 +63,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_retries1),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &u8_max,
 	},
 	{
@@ -74,7 +72,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_retries2),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &u8_max,
 	},
 	{
@@ -83,7 +81,7 @@ static struct ctl_table dccp_default_table[] = {
 		.maxlen		= sizeof(sysctl_dccp_tx_qlen),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "sync_ratelimit",
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7d66306b5f39..0b980e841927 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,8 +28,6 @@
 #include <net/protocol.h>
 #include <net/netevent.h>
 
-static int zero;
-static int one = 1;
 static int two = 2;
 static int four = 4;
 static int thousand = 1000;
@@ -576,7 +574,7 @@ static struct ctl_table ipv4_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "icmp_msgs_burst",
@@ -584,7 +582,7 @@ static struct ctl_table ipv4_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "udp_mem",
@@ -674,8 +672,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{
@@ -763,8 +761,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler   = ipv4_fwd_update_priority,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "ip_nonlocal_bind",
@@ -794,8 +792,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{
@@ -864,7 +862,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
+		.extra1		= SYSCTL_ONE
 	},
 #endif
 	{
@@ -969,7 +967,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
 	{
@@ -1011,7 +1009,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_tfo_blackhole_detect_timeout,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	{
@@ -1020,8 +1018,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "fib_multipath_hash_policy",
@@ -1029,8 +1027,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_fib_multipath_hash_policy,
-		.extra1		= &zero,
-		.extra2		= &two,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{
@@ -1047,8 +1045,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 #endif
 	{
@@ -1078,7 +1076,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &four,
 	},
 	{
@@ -1222,7 +1220,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &gso_max_segs,
 	},
 	{
@@ -1231,7 +1229,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_day_secs
 	},
 	{
@@ -1240,8 +1238,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "tcp_invalid_ratelimit",
@@ -1256,7 +1254,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &thousand,
 	},
 	{
@@ -1265,7 +1263,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &thousand,
 	},
 	{
@@ -1274,7 +1272,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_wmem),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "tcp_rmem",
@@ -1282,7 +1280,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_rmem),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "tcp_comp_sack_delay_ns",
@@ -1297,7 +1295,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &comp_sack_nr_max,
 	},
 	{
@@ -1306,7 +1304,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(init_net.ipv4.sysctl_udp_rmem_min),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
+		.extra1		= SYSCTL_ONE
 	},
 	{
 		.procname	= "udp_wmem_min",
@@ -1314,7 +1312,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(init_net.ipv4.sysctl_udp_wmem_min),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
+		.extra1		= SYSCTL_ONE
 	},
 	{ }
 };
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 521e3203e83a..dc73888c7859 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6432,8 +6432,6 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write,
 }
 
 static int minus_one = -1;
-static const int zero = 0;
-static const int one = 1;
 static const int two_five_five = 255;
 
 static const struct ctl_table addrconf_sysctl[] = {
@@ -6450,7 +6448,7 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&one,
+		.extra1		= (void *)SYSCTL_ONE,
 		.extra2		= (void *)&two_five_five,
 	},
 	{
@@ -6809,7 +6807,7 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&zero,
+		.extra1		= (void *)SYSCTL_ZERO,
 		.extra2		= (void *)&two_five_five,
 	},
 	{
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4d2e6b31a8d6..8b0c33fb19a2 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -6031,9 +6031,6 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
 	return 0;
 }
 
-static int zero;
-static int one = 1;
-
 static struct ctl_table ipv6_route_table_template[] = {
 	{
 		.procname	=	"flush",
@@ -6111,8 +6108,8 @@ static struct ctl_table ipv6_route_table_template[] = {
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
 		.proc_handler	=	proc_dointvec_minmax,
-		.extra1		=	&zero,
-		.extra2		=	&one,
+		.extra1		=	SYSCTL_ZERO,
+		.extra2		=	SYSCTL_ONE,
 	},
 	{ }
 };
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index dc4c91e0bfb8..ec8fcfc60a27 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -21,8 +21,6 @@
 #include <net/calipso.h>
 #endif
 
-static int zero;
-static int one = 1;
 static int flowlabel_reflect_max = 0x7;
 static int auto_flowlabels_min;
 static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
@@ -115,7 +113,7 @@ static struct ctl_table ipv6_table_template[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &flowlabel_reflect_max,
 	},
 	{
@@ -152,8 +150,8 @@ static struct ctl_table ipv6_table_template[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler   = proc_rt6_multipath_hash_policy,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "seg6_flowlabel",
@@ -179,7 +177,7 @@ static struct ctl_table ipv6_rotable[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
+		.extra1		= SYSCTL_ONE
 	},
 #ifdef CONFIG_NETLABEL
 	{
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 198ec4fe4148..c312741df2ce 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -37,8 +37,6 @@
 
 #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1)
 
-static int zero = 0;
-static int one = 1;
 static int label_limit = (1 << 20) - 1;
 static int ttl_max = 255;
 
@@ -2607,7 +2605,7 @@ static int mpls_platform_labels(struct ctl_table *table, int write,
 		.data		= &platform_labels,
 		.maxlen		= sizeof(int),
 		.mode		= table->mode,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &label_limit,
 	};
 
@@ -2636,8 +2634,8 @@ static const struct ctl_table mpls_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "default_ttl",
@@ -2645,7 +2643,7 @@ static const struct ctl_table mpls_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &ttl_max,
 	},
 	{ }
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 07e0967bf129..060565e7d227 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1726,7 +1726,6 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
 
 #ifdef CONFIG_SYSCTL
 
-static int zero;
 static int three = 3;
 
 static int
@@ -1935,7 +1934,7 @@ static struct ctl_table vs_vars[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &three,
 	},
 	{
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 1e3fa67d91aa..2bbb38161851 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -11,7 +11,6 @@
 #include "ar-internal.h"
 
 static struct ctl_table_header *rxrpc_sysctl_reg_table;
-static const unsigned int one = 1;
 static const unsigned int four = 4;
 static const unsigned int thirtytwo = 32;
 static const unsigned int n_65535 = 65535;
@@ -97,7 +96,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&one,
+		.extra1		= (void *)SYSCTL_ONE,
 		.extra2		= (void *)&rxrpc_max_client_connections,
 	},
 	{
@@ -115,7 +114,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&one,
+		.extra1		= (void *)SYSCTL_ONE,
 		.extra2		= (void *)&n_max_acks,
 	},
 	{
@@ -124,7 +123,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&one,
+		.extra1		= (void *)SYSCTL_ONE,
 		.extra2		= (void *)&n_65535,
 	},
 	{
@@ -133,7 +132,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *)&one,
+		.extra1		= (void *)SYSCTL_ONE,
 		.extra2		= (void *)&four,
 	},
 
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 9a19147902f1..1250751bca1b 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -25,10 +25,7 @@
 #include <net/sctp/sctp.h>
 #include <linux/sysctl.h>
 
-static int zero = 0;
-static int one = 1;
 static int timer_max = 86400000; /* ms in one day */
-static int int_max = INT_MAX;
 static int sack_timer_min = 1;
 static int sack_timer_max = 500;
 static int addr_scope_max = SCTP_SCOPE_POLICY_MAX;
@@ -92,7 +89,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &one,
+		.extra1         = SYSCTL_ONE,
 		.extra2         = &timer_max
 	},
 	{
@@ -101,7 +98,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_sctp_do_rto_min,
-		.extra1         = &one,
+		.extra1         = SYSCTL_ONE,
 		.extra2         = &init_net.sctp.rto_max
 	},
 	{
@@ -137,8 +134,8 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "cookie_preserve_enable",
@@ -160,7 +157,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &one,
+		.extra1         = SYSCTL_ONE,
 		.extra2         = &timer_max
 	},
 	{
@@ -178,7 +175,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &one,
+		.extra1         = SYSCTL_ONE,
 		.extra2         = &timer_max
 	},
 	{
@@ -187,8 +184,8 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &int_max
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "path_max_retrans",
@@ -196,8 +193,8 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &int_max
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "max_init_retransmits",
@@ -205,8 +202,8 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-		.extra2		= &int_max
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "pf_retrans",
@@ -214,8 +211,8 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &int_max
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "sndbuf_policy",
@@ -286,7 +283,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &addr_scope_max,
 	},
 	{
@@ -295,7 +292,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= SYSCTL_ONE,
 		.extra2		= &rwnd_scale_max,
 	},
 	{
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 1f73a6a7e43c..ffb1684c4573 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -80,7 +80,6 @@ static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
 static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
 static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
 static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
-static unsigned int zero;
 static unsigned int max_padding = PAGE_SIZE;
 static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
 static unsigned int max_memreg = RPCRDMA_LAST - 1;
@@ -122,7 +121,7 @@ static struct ctl_table xr_tunables_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 		.extra2		= &max_padding,
 	},
 	{
diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c
index 9df82a573aa7..6159d327db76 100644
--- a/net/tipc/sysctl.c
+++ b/net/tipc/sysctl.c
@@ -38,8 +38,6 @@
 
 #include <linux/sysctl.h>
 
-static int zero;
-static int one = 1;
 static struct ctl_table_header *tipc_ctl_hdr;
 
 static struct ctl_table tipc_table[] = {
@@ -49,7 +47,7 @@ static struct ctl_table tipc_table[] = {
 		.maxlen		= sizeof(sysctl_tipc_rmem),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &one,
+		.extra1         = SYSCTL_ONE,
 	},
 	{
 		.procname	= "named_timeout",
@@ -57,7 +55,7 @@ static struct ctl_table tipc_table[] = {
 		.maxlen		= sizeof(sysctl_tipc_named_timeout),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1         = &zero,
+		.extra1         = SYSCTL_ZERO,
 	},
 	{
 		.procname       = "sk_filter",
diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c
index dd1e21fab827..b46b651b3c4c 100644
--- a/security/keys/sysctl.c
+++ b/security/keys/sysctl.c
@@ -9,8 +9,6 @@
 #include <linux/sysctl.h>
 #include "internal.h"
 
-static const int zero, one = 1, max = INT_MAX;
-
 struct ctl_table key_sysctls[] = {
 	{
 		.procname = "maxkeys",
@@ -18,8 +16,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &one,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ONE,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 	{
 		.procname = "maxbytes",
@@ -27,8 +25,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &one,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ONE,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 	{
 		.procname = "root_maxkeys",
@@ -36,8 +34,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &one,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ONE,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 	{
 		.procname = "root_maxbytes",
@@ -45,8 +43,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &one,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ONE,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 	{
 		.procname = "gc_delay",
@@ -54,8 +52,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &zero,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ZERO,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 	{
@@ -64,8 +62,8 @@ struct ctl_table key_sysctls[] = {
 		.maxlen = sizeof(unsigned),
 		.mode = 0644,
 		.proc_handler = proc_dointvec_minmax,
-		.extra1 = (void *) &zero,
-		.extra2 = (void *) &max,
+		.extra1 = (void *) SYSCTL_ZERO,
+		.extra2 = (void *) SYSCTL_INT_MAX,
 	},
 #endif
 	{ }
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index 81519c804888..ee5cb944f4ad 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -43,8 +43,6 @@ static struct super_block *pinned_root;
 static DEFINE_SPINLOCK(pinned_root_spinlock);
 
 #ifdef CONFIG_SYSCTL
-static int zero;
-static int one = 1;
 
 static struct ctl_path loadpin_sysctl_path[] = {
 	{ .procname = "kernel", },
@@ -59,8 +57,8 @@ static struct ctl_table loadpin_sysctl_table[] = {
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec_minmax,
-		.extra1         = &zero,
-		.extra2         = &one,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
 	},
 	{ }
 };
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 01c6239c4493..94dc346370b1 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -445,7 +445,6 @@ static int yama_dointvec_minmax(struct ctl_table *table, int write,
 	return proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos);
 }
 
-static int zero;
 static int max_scope = YAMA_SCOPE_NO_ATTACH;
 
 static struct ctl_path yama_sysctl_path[] = {
@@ -461,7 +460,7 @@ static struct ctl_table yama_sysctl_table[] = {
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = yama_dointvec_minmax,
-		.extra1         = &zero,
+		.extra1         = SYSCTL_ZERO,
 		.extra2         = &max_scope,
 	},
 	{ }
-- 
cgit v1.2.3


From 8d650cdedaabb33e85e9b7c517c0c71fcecc1de9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 18 Jul 2019 19:28:14 -0700
Subject: tcp: fix tcp_set_congestion_control() use from bpf hook

Neal reported incorrect use of ns_capable() from bpf hook.

bpf_setsockopt(...TCP_CONGESTION...)
  -> tcp_set_congestion_control()
   -> ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)
    -> ns_capable_common()
     -> current_cred()
      -> rcu_dereference_protected(current->cred, 1)

Accessing 'current' in bpf context makes no sense, since packets
are processed from softirq context.

As Neal stated : The capability check in tcp_set_congestion_control()
was written assuming a system call context, and then was reused from
a BPF call site.

The fix is to add a new parameter to tcp_set_congestion_control(),
so that the ns_capable() call is only performed under the right
context.

Fixes: 91b5b21c7c16 ("bpf: Add support for changing congestion control")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Lawrence Brakmo <brakmo@fb.com>
Reported-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   | 3 ++-
 net/core/filter.c   | 2 +-
 net/ipv4/tcp.c      | 4 +++-
 net/ipv4/tcp_cong.c | 6 +++---
 4 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index cca3c59b98bf..f42d300f0cfa 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1064,7 +1064,8 @@ void tcp_get_default_congestion_control(struct net *net, char *name);
 void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
-int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit);
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
+			       bool reinit, bool cap_net_admin);
 u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 0f6854ccf894..4e2a79b2fd77 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4335,7 +4335,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 						    TCP_CA_NAME_MAX-1));
 			name[TCP_CA_NAME_MAX-1] = 0;
 			ret = tcp_set_congestion_control(sk, name, false,
-							 reinit);
+							 reinit, true);
 		} else {
 			struct tcp_sock *tp = tcp_sk(sk);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7846afacdf0b..776905899ac0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2785,7 +2785,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		name[val] = 0;
 
 		lock_sock(sk);
-		err = tcp_set_congestion_control(sk, name, true, true);
+		err = tcp_set_congestion_control(sk, name, true, true,
+						 ns_capable(sock_net(sk)->user_ns,
+							    CAP_NET_ADMIN));
 		release_sock(sk);
 		return err;
 	}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index e1862b64a90f..c445a81d144e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -333,7 +333,8 @@ out:
  * tcp_reinit_congestion_control (if the current congestion control was
  * already initialized.
  */
-int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit)
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
+			       bool reinit, bool cap_net_admin)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_congestion_ops *ca;
@@ -369,8 +370,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo
 		} else {
 			err = -EBUSY;
 		}
-	} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
-		     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
+	} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) {
 		err = -EPERM;
 	} else if (!try_module_get(ca->owner)) {
 		err = -EBUSY;
-- 
cgit v1.2.3


From 67bf47452ea00edd90e796054229b651e64b82c1 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Wed, 17 Jul 2019 15:29:13 +0900
Subject: kbuild: update compile-test header list for v5.3-rc1

 - Some headers graduated from the blacklist

 - hyperv_timer.h joined the header-test when CONFIG_X86=y

 - nf_tables*.h joined the header-test when CONFIG_NF_TABLES is
   enabled.

 - The entry for nf_tables_offload.h was added to fix build error for
   the combination of CONFIG_NF_TABLES=n and CONFIG_KERNEL_HEADER_TEST=y.

 - The entry for iomap.h was added because this header is supposed to
   be included only when CONFIG_BLOCK=y

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 include/Kbuild       | 14 ++++++--------
 usr/include/Makefile |  8 --------
 2 files changed, 6 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/Kbuild b/include/Kbuild
index 7e9f1acb9dd5..c38f0d46b267 100644
--- a/include/Kbuild
+++ b/include/Kbuild
@@ -31,7 +31,7 @@ header-test-			+= acpi/platform/acintel.h
 header-test-			+= acpi/platform/aclinux.h
 header-test-			+= acpi/platform/aclinuxex.h
 header-test-			+= acpi/processor.h
-header-test-			+= clocksource/hyperv_timer.h
+header-test-$(CONFIG_X86)	+= clocksource/hyperv_timer.h
 header-test-			+= clocksource/timer-sp804.h
 header-test-			+= crypto/cast_common.h
 header-test-			+= crypto/internal/cryptouser.h
@@ -246,6 +246,7 @@ header-test-			+= linux/intel-pti.h
 header-test-			+= linux/intel-svm.h
 header-test-			+= linux/interconnect-provider.h
 header-test-			+= linux/ioc3.h
+header-test-$(CONFIG_BLOCK)	+= linux/iomap.h
 header-test-			+= linux/ipack.h
 header-test-			+= linux/irq_cpustat.h
 header-test-			+= linux/irq_poll.h
@@ -454,9 +455,6 @@ header-test-			+= linux/phy/omap_control_phy.h
 header-test-			+= linux/phy/tegra/xusb.h
 header-test-			+= linux/phy/ulpi_phy.h
 header-test-			+= linux/phy_fixed.h
-header-test-			+= linux/pinctrl/pinconf-generic.h
-header-test-			+= linux/pinctrl/pinconf.h
-header-test-			+= linux/pinctrl/pinctrl.h
 header-test-			+= linux/pipe_fs_i.h
 header-test-			+= linux/pktcdvd.h
 header-test-			+= linux/pl320-ipc.h
@@ -905,10 +903,11 @@ header-test-			+= net/netfilter/nf_nat_redirect.h
 header-test-			+= net/netfilter/nf_queue.h
 header-test-			+= net/netfilter/nf_reject.h
 header-test-			+= net/netfilter/nf_synproxy.h
-header-test-			+= net/netfilter/nf_tables.h
-header-test-			+= net/netfilter/nf_tables_core.h
-header-test-			+= net/netfilter/nf_tables_ipv4.h
+header-test-$(CONFIG_NF_TABLES)	+= net/netfilter/nf_tables.h
+header-test-$(CONFIG_NF_TABLES)	+= net/netfilter/nf_tables_core.h
+header-test-$(CONFIG_NF_TABLES)	+= net/netfilter/nf_tables_ipv4.h
 header-test-			+= net/netfilter/nf_tables_ipv6.h
+header-test-$(CONFIG_NF_TABLES)	+= net/netfilter/nf_tables_offload.h
 header-test-			+= net/netfilter/nft_fib.h
 header-test-			+= net/netfilter/nft_meta.h
 header-test-			+= net/netfilter/nft_reject.h
@@ -949,7 +948,6 @@ header-test-			+= pcmcia/ds.h
 header-test-			+= rdma/ib.h
 header-test-			+= rdma/iw_portmap.h
 header-test-			+= rdma/opa_port_info.h
-header-test-			+= rdma/rdma_counter.h
 header-test-			+= rdma/rdmavt_cq.h
 header-test-			+= rdma/restrack.h
 header-test-			+= rdma/signature.h
diff --git a/usr/include/Makefile b/usr/include/Makefile
index cd8daa20d487..aa316d99e035 100644
--- a/usr/include/Makefile
+++ b/usr/include/Makefile
@@ -30,8 +30,6 @@ header-test-$(CONFIG_CPU_BIG_ENDIAN) += linux/byteorder/big_endian.h
 header-test-$(CONFIG_CPU_LITTLE_ENDIAN) += linux/byteorder/little_endian.h
 header-test- += linux/coda.h
 header-test- += linux/coda_psdev.h
-header-test- += linux/dvb/audio.h
-header-test- += linux/dvb/osd.h
 header-test- += linux/elfcore.h
 header-test- += linux/errqueue.h
 header-test- += linux/fsmap.h
@@ -44,7 +42,6 @@ header-test- += linux/netfilter_bridge/ebtables.h
 header-test- += linux/netfilter_ipv4/ipt_LOG.h
 header-test- += linux/netfilter_ipv6/ip6t_LOG.h
 header-test- += linux/nfc.h
-header-test- += linux/nilfs2_ondisk.h
 header-test- += linux/omap3isp.h
 header-test- += linux/omapfb.h
 header-test- += linux/patchkey.h
@@ -59,9 +56,6 @@ header-test- += linux/v4l2-mediabus.h
 header-test- += linux/v4l2-subdev.h
 header-test- += linux/videodev2.h
 header-test- += linux/vm_sockets.h
-header-test- += misc/ocxl.h
-header-test- += mtd/mtd-abi.h
-header-test- += mtd/mtd-user.h
 header-test- += scsi/scsi_bsg_fc.h
 header-test- += scsi/scsi_netlink.h
 header-test- += scsi/scsi_netlink_fc.h
@@ -108,7 +102,6 @@ header-test- += linux/bpf_perf_event.h
 endif
 
 ifeq ($(SRCARCH),s390)
-header-test- += asm/runtime_instr.h
 header-test- += asm/zcrypt.h
 endif
 
@@ -116,7 +109,6 @@ ifeq ($(SRCARCH),sparc)
 header-test- += asm/stat.h
 header-test- += asm/uctx.h
 header-test- += asm/fbio.h
-header-test- += asm/openpromio.h
 endif
 
 # asm-generic/*.h is used by asm/*.h, and should not be included directly
-- 
cgit v1.2.3


From bca031e2c8aa22a978a2452bf959e27e9fa73dc7 Mon Sep 17 00:00:00 2001
From: Zenghui Yu <yuzenghui@huawei.com>
Date: Thu, 18 Jul 2019 08:15:10 +0000
Subject: KVM: arm/arm64: Introduce kvm_pmu_vcpu_init() to setup PMU counter
 index

We use "pmc->idx" and the "chained" bitmap to determine if the pmc is
chained, in kvm_pmu_pmc_is_chained().  But idx might be uninitialized
(and random) when we doing this decision, through a KVM_ARM_VCPU_INIT
ioctl -> kvm_pmu_vcpu_reset(). And the test_bit() against this random
idx will potentially hit a KASAN BUG [1].

In general, idx is the static property of a PMU counter that is not
expected to be modified across resets, as suggested by Julien.  It
looks more reasonable if we can setup the PMU counter idx for a vcpu
in its creation time. Introduce a new function - kvm_pmu_vcpu_init()
for this basic setup. Oh, and the KASAN BUG will get fixed this way.

[1] https://www.spinics.net/lists/kvm-arm/msg36700.html

Fixes: 80f393a23be6 ("KVM: arm/arm64: Support chained PMU counters")
Suggested-by: Andrew Murray <andrew.murray@arm.com>
Suggested-by: Julien Thierry <julien.thierry@arm.com>
Acked-by: Julien Thierry <julien.thierry@arm.com>
Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/kvm/arm_pmu.h |  2 ++
 virt/kvm/arm/arm.c    |  2 ++
 virt/kvm/arm/pmu.c    | 18 +++++++++++++++---
 3 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 16c769a7f979..6db030439e29 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -34,6 +34,7 @@ struct kvm_pmu {
 u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx);
 void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val);
 u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu);
+void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
 void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu);
 void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val);
@@ -71,6 +72,7 @@ static inline u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
 {
 	return 0;
 }
+static inline void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) {}
 static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {}
 static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {}
 static inline void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) {}
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index f645c0fbf7ec..c704fa696184 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -340,6 +340,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	/* Set up the timer */
 	kvm_timer_vcpu_init(vcpu);
 
+	kvm_pmu_vcpu_init(vcpu);
+
 	kvm_arm_reset_debug_ptr(vcpu);
 
 	return kvm_vgic_vcpu_init(vcpu);
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 3dd8238ed246..362a01886bab 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -214,6 +214,20 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
 	kvm_pmu_release_perf_event(pmc);
 }
 
+/**
+ * kvm_pmu_vcpu_init - assign pmu counter idx for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
+		pmu->pmc[i].idx = i;
+}
+
 /**
  * kvm_pmu_vcpu_reset - reset pmu state for cpu
  * @vcpu: The vcpu pointer
@@ -224,10 +238,8 @@ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
 	int i;
 	struct kvm_pmu *pmu = &vcpu->arch.pmu;
 
-	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
+	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
 		kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]);
-		pmu->pmc[i].idx = i;
-	}
 
 	bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS);
 }
-- 
cgit v1.2.3