diff options
author | Dave Airlie <airlied@redhat.com> | 2020-11-06 11:44:30 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2020-11-06 11:45:21 +1000 |
commit | 866bc2d3c40abc044c4ede51529a82dc0d561216 (patch) | |
tree | 0537bafb8e61bfedf52a3eea3c3c05dd8cc28bbd | |
parent | 53aa37fb8dc4bc86cc446169b60e2afe33dc81b4 (diff) | |
parent | 537457a979a02a410b555fab289dcb28b588f33b (diff) | |
download | lwn-866bc2d3c40abc044c4ede51529a82dc0d561216.tar.gz lwn-866bc2d3c40abc044c4ede51529a82dc0d561216.zip |
Merge tag 'drm-intel-fixes-2020-11-05' of git://anongit.freedesktop.org/drm/drm-intel into drm-fixes
- GVT fixes including vGPU suspend/resume fixes and workaround for APL guest GPU hang.
- Fix set domain's cache coherency (Chris)
- Fixes around breadcrumbs (Chris)
- Fix encoder lookup during PSR atomic (Imre)
- Hold onto an explicit ref to i915_vma_work.pinned (Chris)
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20201105173026.GA858446@intel.com
-rw-r--r-- | drivers/gpu/drm/i915/display/intel_psr.c | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_domain.c | 28 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gt/intel_engine.h | 55 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gt/intel_lrc.c | 31 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gt/intel_timeline.c | 18 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gt/intel_timeline_types.h | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gvt/handlers.c | 47 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gvt/scheduler.c | 15 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/i915_vma.c | 6 |
9 files changed, 139 insertions, 65 deletions
diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 8a9d0bdde1bf..40e9cb29233d 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -1754,7 +1754,7 @@ void intel_psr_atomic_check(struct drm_connector *connector, return; intel_connector = to_intel_connector(connector); - dig_port = enc_to_dig_port(intel_attached_encoder(intel_connector)); + dig_port = enc_to_dig_port(to_intel_encoder(new_state->best_encoder)); if (dev_priv->psr.dp != &dig_port->dp) return; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 7c90a63c273d..fcce6909f201 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -509,21 +509,6 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, return -ENOENT; /* - * Already in the desired write domain? Nothing for us to do! - * - * We apply a little bit of cunning here to catch a broader set of - * no-ops. If obj->write_domain is set, we must be in the same - * obj->read_domains, and only that domain. Therefore, if that - * obj->write_domain matches the request read_domains, we are - * already in the same read/write domain and can skip the operation, - * without having to further check the requested write_domain. - */ - if (READ_ONCE(obj->write_domain) == read_domains) { - err = 0; - goto out; - } - - /* * Try to flush the object off the GPU without holding the lock. * We will repeat the flush holding the lock in the normal manner * to catch cases where we are gazumped. @@ -560,6 +545,19 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, if (err) goto out; + /* + * Already in the desired write domain? Nothing for us to do! + * + * We apply a little bit of cunning here to catch a broader set of + * no-ops. If obj->write_domain is set, we must be in the same + * obj->read_domains, and only that domain. Therefore, if that + * obj->write_domain matches the request read_domains, we are + * already in the same read/write domain and can skip the operation, + * without having to further check the requested write_domain. + */ + if (READ_ONCE(obj->write_domain) == read_domains) + goto out_unpin; + err = i915_gem_object_lock_interruptible(obj, NULL); if (err) goto out_unpin; diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index 7c3a1012e702..760fefdfe392 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -245,22 +245,14 @@ static inline u32 *gen12_emit_pipe_control(u32 *batch, u32 flags0, u32 flags1, u } static inline u32 * -__gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags0, u32 flags1) +__gen8_emit_write_rcs(u32 *cs, u32 value, u32 offset, u32 flags0, u32 flags1) { - /* We're using qword write, offset should be aligned to 8 bytes. */ - GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); - - /* w/a for post sync ops following a GPGPU operation we - * need a prior CS_STALL, which is emitted by the flush - * following the batch. - */ *cs++ = GFX_OP_PIPE_CONTROL(6) | flags0; - *cs++ = flags1 | PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_GLOBAL_GTT_IVB; - *cs++ = gtt_offset; + *cs++ = flags1 | PIPE_CONTROL_QW_WRITE; + *cs++ = offset; *cs++ = 0; *cs++ = value; - /* We're thrashing one dword of HWS. */ - *cs++ = 0; + *cs++ = 0; /* We're thrashing one extra dword. */ return cs; } @@ -268,13 +260,38 @@ __gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags0, u32 f static inline u32* gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags) { - return __gen8_emit_ggtt_write_rcs(cs, value, gtt_offset, 0, flags); + /* We're using qword write, offset should be aligned to 8 bytes. */ + GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); + + return __gen8_emit_write_rcs(cs, + value, + gtt_offset, + 0, + flags | PIPE_CONTROL_GLOBAL_GTT_IVB); } static inline u32* gen12_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags0, u32 flags1) { - return __gen8_emit_ggtt_write_rcs(cs, value, gtt_offset, flags0, flags1); + /* We're using qword write, offset should be aligned to 8 bytes. */ + GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); + + return __gen8_emit_write_rcs(cs, + value, + gtt_offset, + flags0, + flags1 | PIPE_CONTROL_GLOBAL_GTT_IVB); +} + +static inline u32 * +__gen8_emit_flush_dw(u32 *cs, u32 value, u32 gtt_offset, u32 flags) +{ + *cs++ = (MI_FLUSH_DW + 1) | flags; + *cs++ = gtt_offset; + *cs++ = 0; + *cs++ = value; + + return cs; } static inline u32 * @@ -285,12 +302,10 @@ gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset, u32 flags) /* Offset should be aligned to 8 bytes for both (QW/DW) write types */ GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); - *cs++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW | flags; - *cs++ = gtt_offset | MI_FLUSH_DW_USE_GTT; - *cs++ = 0; - *cs++ = value; - - return cs; + return __gen8_emit_flush_dw(cs, + value, + gtt_offset | MI_FLUSH_DW_USE_GTT, + flags | MI_FLUSH_DW_OP_STOREDW); } static inline void __intel_engine_reset(struct intel_engine_cs *engine, diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index a32aabce7901..f82c6dd1de18 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -3547,6 +3547,19 @@ static const struct intel_context_ops execlists_context_ops = { .destroy = execlists_context_destroy, }; +static u32 hwsp_offset(const struct i915_request *rq) +{ + const struct intel_timeline_cacheline *cl; + + /* Before the request is executed, the timeline/cachline is fixed */ + + cl = rcu_dereference_protected(rq->hwsp_cacheline, 1); + if (cl) + return cl->ggtt_offset; + + return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset; +} + static int gen8_emit_init_breadcrumb(struct i915_request *rq) { u32 *cs; @@ -3569,7 +3582,7 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq) *cs++ = MI_NOOP; *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; - *cs++ = i915_request_timeline(rq)->hwsp_offset; + *cs++ = hwsp_offset(rq); *cs++ = 0; *cs++ = rq->fence.seqno - 1; @@ -4886,11 +4899,9 @@ gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) return gen8_emit_wa_tail(request, cs); } -static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs) +static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs) { - u32 addr = i915_request_active_timeline(request)->hwsp_offset; - - return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0); + return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0); } static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) @@ -4909,7 +4920,7 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ cs = gen8_emit_ggtt_write_rcs(cs, request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, + hwsp_offset(request), PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_CS_STALL); @@ -4921,7 +4932,7 @@ gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) { cs = gen8_emit_ggtt_write_rcs(cs, request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, + hwsp_offset(request), PIPE_CONTROL_CS_STALL | PIPE_CONTROL_TILE_CACHE_FLUSH | PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | @@ -4983,7 +4994,9 @@ gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) { - return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); + /* XXX Stalling flush before seqno write; post-sync not */ + cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0)); + return gen12_emit_fini_breadcrumb_tail(rq, cs); } static u32 * @@ -4991,7 +5004,7 @@ gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) { cs = gen12_emit_ggtt_write_rcs(cs, request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, + hwsp_offset(request), PIPE_CONTROL0_HDC_PIPELINE_FLUSH, PIPE_CONTROL_CS_STALL | PIPE_CONTROL_TILE_CACHE_FLUSH | diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c index a2f74cefe4c3..7ea94d201fe6 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline.c +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c @@ -188,10 +188,14 @@ cacheline_alloc(struct intel_timeline_hwsp *hwsp, unsigned int cacheline) return cl; } -static void cacheline_acquire(struct intel_timeline_cacheline *cl) +static void cacheline_acquire(struct intel_timeline_cacheline *cl, + u32 ggtt_offset) { - if (cl) - i915_active_acquire(&cl->active); + if (!cl) + return; + + cl->ggtt_offset = ggtt_offset; + i915_active_acquire(&cl->active); } static void cacheline_release(struct intel_timeline_cacheline *cl) @@ -340,7 +344,7 @@ int intel_timeline_pin(struct intel_timeline *tl, struct i915_gem_ww_ctx *ww) GT_TRACE(tl->gt, "timeline:%llx using HWSP offset:%x\n", tl->fence_context, tl->hwsp_offset); - cacheline_acquire(tl->hwsp_cacheline); + cacheline_acquire(tl->hwsp_cacheline, tl->hwsp_offset); if (atomic_fetch_inc(&tl->pin_count)) { cacheline_release(tl->hwsp_cacheline); __i915_vma_unpin(tl->hwsp_ggtt); @@ -515,7 +519,7 @@ __intel_timeline_get_seqno(struct intel_timeline *tl, GT_TRACE(tl->gt, "timeline:%llx using HWSP offset:%x\n", tl->fence_context, tl->hwsp_offset); - cacheline_acquire(cl); + cacheline_acquire(cl, tl->hwsp_offset); tl->hwsp_cacheline = cl; *seqno = timeline_advance(tl); @@ -573,9 +577,7 @@ int intel_timeline_read_hwsp(struct i915_request *from, if (err) goto out; - *hwsp = i915_ggtt_offset(cl->hwsp->vma) + - ptr_unmask_bits(cl->vaddr, CACHELINE_BITS) * CACHELINE_BYTES; - + *hwsp = cl->ggtt_offset; out: i915_active_release(&cl->active); return err; diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h index 02181c5020db..4474f487f589 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h +++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h @@ -94,6 +94,8 @@ struct intel_timeline_cacheline { struct intel_timeline_hwsp *hwsp; void *vaddr; + u32 ggtt_offset; + struct rcu_head rcu; }; diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 3be37e6fe33d..eb342a759943 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -1489,7 +1489,8 @@ static int hws_pga_write(struct intel_vgpu *vgpu, unsigned int offset, const struct intel_engine_cs *engine = intel_gvt_render_mmio_to_engine(vgpu->gvt, offset); - if (!intel_gvt_ggtt_validate_range(vgpu, value, I915_GTT_PAGE_SIZE)) { + if (value != 0 && + !intel_gvt_ggtt_validate_range(vgpu, value, I915_GTT_PAGE_SIZE)) { gvt_vgpu_err("write invalid HWSP address, reg:0x%x, value:0x%x\n", offset, value); return -EINVAL; @@ -1650,6 +1651,34 @@ static int edp_psr_imr_iir_write(struct intel_vgpu *vgpu, return 0; } +/** + * FixMe: + * If guest fills non-priv batch buffer on ApolloLake/Broxton as Mesa i965 did: + * 717e7539124d (i965: Use a WC map and memcpy for the batch instead of pwrite.) + * Due to the missing flush of bb filled by VM vCPU, host GPU hangs on executing + * these MI_BATCH_BUFFER. + * Temporarily workaround this by setting SNOOP bit for PAT3 used by PPGTT + * PML4 PTE: PAT(0) PCD(1) PWT(1). + * The performance is still expected to be low, will need further improvement. + */ +static int bxt_ppat_low_write(struct intel_vgpu *vgpu, unsigned int offset, + void *p_data, unsigned int bytes) +{ + u64 pat = + GEN8_PPAT(0, CHV_PPAT_SNOOP) | + GEN8_PPAT(1, 0) | + GEN8_PPAT(2, 0) | + GEN8_PPAT(3, CHV_PPAT_SNOOP) | + GEN8_PPAT(4, CHV_PPAT_SNOOP) | + GEN8_PPAT(5, CHV_PPAT_SNOOP) | + GEN8_PPAT(6, CHV_PPAT_SNOOP) | + GEN8_PPAT(7, CHV_PPAT_SNOOP); + + vgpu_vreg(vgpu, offset) = lower_32_bits(pat); + + return 0; +} + static int guc_status_read(struct intel_vgpu *vgpu, unsigned int offset, void *p_data, unsigned int bytes) @@ -2812,7 +2841,7 @@ static int init_bdw_mmio_info(struct intel_gvt *gvt) MMIO_DH(GEN6_PCODE_MAILBOX, D_BDW_PLUS, NULL, mailbox_write); - MMIO_D(GEN8_PRIVATE_PAT_LO, D_BDW_PLUS); + MMIO_D(GEN8_PRIVATE_PAT_LO, D_BDW_PLUS & ~D_BXT); MMIO_D(GEN8_PRIVATE_PAT_HI, D_BDW_PLUS); MMIO_D(GAMTARBMODE, D_BDW_PLUS); @@ -3139,7 +3168,7 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) NULL, NULL); MMIO_DFH(GAMT_CHKN_BIT_REG, D_KBL | D_CFL, F_CMD_ACCESS, NULL, NULL); - MMIO_D(GEN9_CTX_PREEMPT_REG, D_SKL_PLUS); + MMIO_D(GEN9_CTX_PREEMPT_REG, D_SKL_PLUS & ~D_BXT); return 0; } @@ -3313,9 +3342,21 @@ static int init_bxt_mmio_info(struct intel_gvt *gvt) MMIO_D(GEN8_PUSHBUS_SHIFT, D_BXT); MMIO_D(GEN6_GFXPAUSE, D_BXT); MMIO_DFH(GEN8_L3SQCREG1, D_BXT, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(GEN8_L3CNTLREG, D_BXT, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x20D8), D_BXT, F_CMD_ACCESS, NULL, NULL); + MMIO_F(GEN8_RING_CS_GPR(RENDER_RING_BASE, 0), 0x40, F_CMD_ACCESS, + 0, 0, D_BXT, NULL, NULL); + MMIO_F(GEN8_RING_CS_GPR(GEN6_BSD_RING_BASE, 0), 0x40, F_CMD_ACCESS, + 0, 0, D_BXT, NULL, NULL); + MMIO_F(GEN8_RING_CS_GPR(BLT_RING_BASE, 0), 0x40, F_CMD_ACCESS, + 0, 0, D_BXT, NULL, NULL); + MMIO_F(GEN8_RING_CS_GPR(VEBOX_RING_BASE, 0), 0x40, F_CMD_ACCESS, + 0, 0, D_BXT, NULL, NULL); MMIO_DFH(GEN9_CTX_PREEMPT_REG, D_BXT, F_CMD_ACCESS, NULL, NULL); + MMIO_DH(GEN8_PRIVATE_PAT_LO, D_BXT, NULL, bxt_ppat_low_write); + return 0; } diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index 1570eb8aa978..aed2ef6466a2 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -1277,7 +1277,7 @@ void intel_vgpu_clean_submission(struct intel_vgpu *vgpu) i915_context_ppgtt_root_restore(s, i915_vm_to_ppgtt(s->shadow[0]->vm)); for_each_engine(engine, vgpu->gvt->gt, id) - intel_context_unpin(s->shadow[id]); + intel_context_put(s->shadow[id]); kmem_cache_destroy(s->workloads); } @@ -1369,11 +1369,6 @@ int intel_vgpu_setup_submission(struct intel_vgpu *vgpu) ce->ring = __intel_context_ring_size(ring_size); } - ret = intel_context_pin(ce); - intel_context_put(ce); - if (ret) - goto out_shadow_ctx; - s->shadow[i] = ce; } @@ -1405,7 +1400,6 @@ out_shadow_ctx: if (IS_ERR(s->shadow[i])) break; - intel_context_unpin(s->shadow[i]); intel_context_put(s->shadow[i]); } i915_vm_put(&ppgtt->vm); @@ -1479,6 +1473,7 @@ void intel_vgpu_destroy_workload(struct intel_vgpu_workload *workload) { struct intel_vgpu_submission *s = &workload->vgpu->submission; + intel_context_unpin(s->shadow[workload->engine->id]); release_shadow_batch_buffer(workload); release_shadow_wa_ctx(&workload->wa_ctx); @@ -1724,6 +1719,12 @@ intel_vgpu_create_workload(struct intel_vgpu *vgpu, return ERR_PTR(ret); } + ret = intel_context_pin(s->shadow[engine->id]); + if (ret) { + intel_vgpu_destroy_workload(workload); + return ERR_PTR(ret); + } + return workload; } diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index ffb5287e055a..caa9b041616b 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -314,8 +314,10 @@ static void __vma_release(struct dma_fence_work *work) { struct i915_vma_work *vw = container_of(work, typeof(*vw), base); - if (vw->pinned) + if (vw->pinned) { __i915_gem_object_unpin_pages(vw->pinned); + i915_gem_object_put(vw->pinned); + } i915_vm_free_pt_stash(vw->vm, &vw->stash); i915_vm_put(vw->vm); @@ -431,7 +433,7 @@ int i915_vma_bind(struct i915_vma *vma, if (vma->obj) { __i915_gem_object_pin_pages(vma->obj); - work->pinned = vma->obj; + work->pinned = i915_gem_object_get(vma->obj); } } else { vma->ops->bind_vma(vma->vm, NULL, vma, cache_level, bind_flags); |