mirror of
git://git.yoctoproject.org/linux-yocto.git
synced 2025-08-22 00:42:01 +02:00
drm/xe: Introduce the wedged_mode debugfs
So, the wedged mode can be selected per device at runtime, before the tests or before reproducing the issue. v2: - s/busted/wedged - some locking consistency v3: - remove mutex - toggle guc reset policy on any mode change Cc: Lucas De Marchi <lucas.demarchi@intel.com> Cc: Alan Previn <alan.previn.teres.alexis@intel.com> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com> Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20240423221817.1285081-4-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
This commit is contained in:
parent
8ed9aaae39
commit
6b8ef44cc0
|
@ -12,6 +12,8 @@
|
|||
#include "xe_bo.h"
|
||||
#include "xe_device.h"
|
||||
#include "xe_gt_debugfs.h"
|
||||
#include "xe_gt_printk.h"
|
||||
#include "xe_guc_ads.h"
|
||||
#include "xe_pm.h"
|
||||
#include "xe_sriov.h"
|
||||
#include "xe_step.h"
|
||||
|
@ -117,6 +119,56 @@ static const struct file_operations forcewake_all_fops = {
|
|||
.release = forcewake_release,
|
||||
};
|
||||
|
||||
/*
 * debugfs read handler for "wedged_mode": formats the device's current
 * wedged.mode as a decimal number followed by a newline and copies it to
 * the user buffer, honouring the file position.
 */
static ssize_t wedged_mode_show(struct file *f, char __user *ubuf,
				size_t size, loff_t *pos)
{
	struct xe_device *xe = file_inode(f)->i_private;
	char text[32];
	int text_len;

	text_len = scnprintf(text, sizeof(text), "%d\n", xe->wedged.mode);

	return simple_read_from_buffer(ubuf, size, pos, text, text_len);
}
|
||||
|
||||
/*
 * debugfs write handler for "wedged_mode".
 *
 * Parses an unsigned integer from the user buffer (base auto-detected),
 * accepts only the values 0, 1 and 2, stores it in xe->wedged.mode and then
 * asks every GT's GuC to refresh its engine reset policy so that it matches
 * the new mode.
 *
 * Return: @size when the input was consumed (including when the requested
 * mode is already active), the kstrtouint_from_user() error on a parse
 * failure, -EINVAL for an out-of-range value, or -EIO if a GuC policy update
 * fails.
 */
static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
			       size_t size, loff_t *pos)
{
	struct xe_device *xe = file_inode(f)->i_private;
	struct xe_gt *gt;
	u32 wedged_mode;
	ssize_t ret;
	u8 id;

	ret = kstrtouint_from_user(ubuf, size, 0, &wedged_mode);
	if (ret)
		return ret;

	if (wedged_mode > 2)
		return -EINVAL;

	/*
	 * Nothing to change, but the input has still been consumed. A write
	 * handler that returns 0 reports "zero bytes written", which makes
	 * userspace writers retry the same data.
	 */
	if (xe->wedged.mode == wedged_mode)
		return size;

	/*
	 * The mode is stored before the GuC is told about it: the policy
	 * toggle below reads xe->wedged.mode to decide which way to flip.
	 */
	xe->wedged.mode = wedged_mode;

	for_each_gt(gt, xe, id) {
		ret = xe_guc_ads_scheduler_policy_toggle_reset(&gt->uc.guc.ads);
		if (ret) {
			xe_gt_err(gt, "Failed to update GuC ADS scheduler policy. GuC may still cause engine reset even with wedged_mode=2\n");
			return -EIO;
		}
	}

	return size;
}
|
||||
|
||||
static const struct file_operations wedged_mode_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.read = wedged_mode_show,
|
||||
.write = wedged_mode_set,
|
||||
};
|
||||
|
||||
void xe_debugfs_register(struct xe_device *xe)
|
||||
{
|
||||
struct ttm_device *bdev = &xe->ttm;
|
||||
|
@ -134,6 +186,9 @@ void xe_debugfs_register(struct xe_device *xe)
|
|||
debugfs_create_file("forcewake_all", 0400, root, xe,
|
||||
&forcewake_all_fops);
|
||||
|
||||
debugfs_create_file("wedged_mode", 0400, root, xe,
|
||||
&wedged_mode_fops);
|
||||
|
||||
for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) {
|
||||
man = ttm_manager_type(bdev, mem_type);
|
||||
|
||||
|
|
|
@ -506,6 +506,8 @@ int xe_device_probe_early(struct xe_device *xe)
|
|||
if (err)
|
||||
return err;
|
||||
|
||||
xe->wedged.mode = xe_modparam.wedged_mode;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -769,7 +771,7 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
|
|||
* xe_device_declare_wedged - Declare device wedged
|
||||
* @xe: xe device instance
|
||||
*
|
||||
* This is a final state that can only be cleared with a module
|
||||
* This is a final state that can only be cleared with a module
|
||||
* re-probe (unbind + bind).
|
||||
* In this state every IOCTL will be blocked so the GT cannot be used.
|
||||
* In general it will be called upon any critical error such as gt reset
|
||||
|
@ -781,10 +783,12 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
|
|||
*/
|
||||
void xe_device_declare_wedged(struct xe_device *xe)
|
||||
{
|
||||
if (xe_modparam.wedged_mode == 0)
|
||||
if (xe->wedged.mode == 0) {
|
||||
drm_dbg(&xe->drm, "Wedged mode is forcebly disabled\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!atomic_xchg(&xe->wedged, 1)) {
|
||||
if (!atomic_xchg(&xe->wedged.flag, 1)) {
|
||||
xe->needs_flr_on_fini = true;
|
||||
drm_err(&xe->drm,
|
||||
"CRITICAL: Xe has declared device %s as wedged.\n"
|
||||
|
|
|
@ -169,7 +169,7 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);
|
|||
|
||||
/*
 * Report whether the device has been declared wedged by reading the wedged
 * indicator atomically.
 *
 * NOTE(review): this span is a rendered diff whose +/- markers were lost, so
 * two return statements appear back to back. Only one of them belongs in the
 * real function - confirm against the tree.
 */
static inline bool xe_device_wedged(struct xe_device *xe)
|
||||
{
|
||||
/* Presumably the pre-patch line: treats xe->wedged itself as the atomic. */
return atomic_read(&xe->wedged);
|
||||
/* Presumably the post-patch line: reads the flag member of xe->wedged. */
return atomic_read(&xe->wedged.flag);
|
||||
}
|
||||
|
||||
void xe_device_declare_wedged(struct xe_device *xe);
|
||||
|
|
|
@ -459,8 +459,13 @@ struct xe_device {
|
|||
/** @needs_flr_on_fini: requests function-reset on fini */
|
||||
bool needs_flr_on_fini;
|
||||
|
||||
/** @wedged: Xe device faced a critical error and is now blocked. */
|
||||
atomic_t wedged;
|
||||
/** @wedged: Struct to control Wedged States and mode */
|
||||
struct {
|
||||
/** @wedged.flag: Xe device faced a critical error and is now blocked. */
|
||||
atomic_t flag;
|
||||
/** @wedged.mode: Mode controlled by kernel parameter and debugfs */
|
||||
int mode;
|
||||
} wedged;
|
||||
|
||||
/* private: */
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
#include <generated/xe_wa_oob.h>
|
||||
|
||||
#include "abi/guc_actions_abi.h"
|
||||
#include "regs/xe_engine_regs.h"
|
||||
#include "regs/xe_gt_regs.h"
|
||||
#include "regs/xe_guc_regs.h"
|
||||
|
@ -16,11 +17,11 @@
|
|||
#include "xe_gt.h"
|
||||
#include "xe_gt_ccs_mode.h"
|
||||
#include "xe_guc.h"
|
||||
#include "xe_guc_ct.h"
|
||||
#include "xe_hw_engine.h"
|
||||
#include "xe_lrc.h"
|
||||
#include "xe_map.h"
|
||||
#include "xe_mmio.h"
|
||||
#include "xe_module.h"
|
||||
#include "xe_platform_types.h"
|
||||
#include "xe_wa.h"
|
||||
|
||||
|
@ -441,6 +442,7 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)
|
|||
|
||||
static void guc_policies_init(struct xe_guc_ads *ads)
|
||||
{
|
||||
struct xe_device *xe = ads_to_xe(ads);
|
||||
u32 global_flags = 0;
|
||||
|
||||
ads_blob_write(ads, policies.dpc_promote_time,
|
||||
|
@ -448,7 +450,7 @@ static void guc_policies_init(struct xe_guc_ads *ads)
|
|||
ads_blob_write(ads, policies.max_num_work_items,
|
||||
GLOBAL_POLICY_MAX_NUM_WI);
|
||||
|
||||
if (xe_modparam.wedged_mode == 2)
|
||||
if (xe->wedged.mode == 2)
|
||||
global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
|
||||
|
||||
ads_blob_write(ads, policies.global_flags, global_flags);
|
||||
|
@ -806,3 +808,57 @@ void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads)
|
|||
{
|
||||
guc_populate_golden_lrc(ads);
|
||||
}
|
||||
|
||||
/*
 * Send the XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE action to the GuC over
 * its CT channel, passing @policy_offset as the single payload dword.
 *
 * Return: whatever xe_guc_ct_send() returns.
 */
static int guc_ads_action_update_policies(struct xe_guc_ads *ads, u32 policy_offset)
{
	u32 request[] = {
		XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE,
		policy_offset,
	};
	struct xe_guc *guc = ads_to_guc(ads);

	return xe_guc_ct_send(&guc->ct, request, ARRAY_SIZE(request), 0, 0);
}
|
||||
|
||||
/**
|
||||
* xe_guc_ads_scheduler_policy_toggle_reset - Toggle reset policy
|
||||
* @ads: Additional data structures object
|
||||
*
|
||||
* This function update the GuC's engine reset policy based on wedged.mode.
|
||||
*
|
||||
* Return: 0 on success, and negative error code otherwise.
|
||||
*/
|
||||
int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads)
|
||||
{
|
||||
struct xe_device *xe = ads_to_xe(ads);
|
||||
struct xe_gt *gt = ads_to_gt(ads);
|
||||
struct xe_tile *tile = gt_to_tile(gt);
|
||||
struct guc_policies *policies;
|
||||
struct xe_bo *bo;
|
||||
int ret = 0;
|
||||
|
||||
policies = kmalloc(sizeof(*policies), GFP_KERNEL);
|
||||
if (!policies)
|
||||
return -ENOMEM;
|
||||
|
||||
policies->dpc_promote_time = ads_blob_read(ads, policies.dpc_promote_time);
|
||||
policies->max_num_work_items = ads_blob_read(ads, policies.max_num_work_items);
|
||||
policies->is_valid = 1;
|
||||
if (xe->wedged.mode == 2)
|
||||
policies->global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
|
||||
else
|
||||
policies->global_flags &= ~GLOBAL_POLICY_DISABLE_ENGINE_RESET;
|
||||
|
||||
bo = xe_managed_bo_create_from_data(xe, tile, policies, sizeof(struct guc_policies),
|
||||
XE_BO_FLAG_VRAM_IF_DGFX(tile) |
|
||||
XE_BO_FLAG_GGTT);
|
||||
if (IS_ERR(bo)) {
|
||||
ret = PTR_ERR(bo);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = guc_ads_action_update_policies(ads, xe_bo_ggtt_addr(bo));
|
||||
out:
|
||||
kfree(policies);
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -13,5 +13,6 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads);
|
|||
void xe_guc_ads_populate(struct xe_guc_ads *ads);
|
||||
void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads);
|
||||
void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads);
|
||||
int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -35,7 +35,6 @@
|
|||
#include "xe_macros.h"
|
||||
#include "xe_map.h"
|
||||
#include "xe_mocs.h"
|
||||
#include "xe_module.h"
|
||||
#include "xe_ring_ops_types.h"
|
||||
#include "xe_sched_job.h"
|
||||
#include "xe_trace.h"
|
||||
|
@ -868,26 +867,38 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
|
|||
xe_sched_tdr_queue_imm(&q->guc->sched);
|
||||
}
|
||||
|
||||
/*
 * Declare the device wedged from the GuC submission path, but only when
 * wedged.mode is 2, and report whether the device is (now) wedged.
 *
 * Returns false when wedged.mode is not 2, true otherwise - including when
 * the device was already wedged and when registering the clean-up action
 * fails.
 *
 * NOTE(review): this span is a rendered diff whose +/- markers were lost, so
 * pre-patch and post-patch lines are interleaved (two signatures, two
 * declare-wedged calls, two "if (err)" branches). The lines marked
 * "presumably pre-patch" below look like the removed side - confirm against
 * the tree.
 */
/* Presumably pre-patch: old name, void return. */
static void guc_submit_wedged(struct xe_guc *guc)
|
||||
static bool guc_submit_hint_wedged(struct xe_guc *guc)
|
||||
{
|
||||
struct xe_device *xe = guc_to_xe(guc);
|
||||
struct xe_exec_queue *q;
|
||||
unsigned long index;
|
||||
int err;
|
||||
|
||||
/* Presumably pre-patch: unconditional declare. */
xe_device_declare_wedged(guc_to_xe(guc));
|
||||
/* Only mode 2 wedges the device from this path. */
if (xe->wedged.mode != 2)
|
||||
return false;
|
||||
|
||||
/* Already wedged: nothing more to do, report it. */
if (xe_device_wedged(xe))
|
||||
return true;
|
||||
|
||||
xe_device_declare_wedged(xe);
|
||||
|
||||
xe_guc_submit_reset_prepare(guc);
|
||||
xe_guc_ct_stop(&guc->ct);
|
||||
|
||||
/* Register guc_submit_wedged_fini as a DRM-managed action on the device. */
err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
|
||||
guc_submit_wedged_fini, guc);
|
||||
/* Presumably pre-patch: silent bail-out on registration failure. */
if (err)
|
||||
return;
|
||||
if (err) {
|
||||
drm_err(&xe->drm, "Failed to register xe_guc_submit clean-up on wedged.mode=2. Although device is wedged.\n");
|
||||
return true; /* Device is wedged anyway */
|
||||
}
|
||||
|
||||
/*
 * Under the submission state lock, walk every registered exec queue; each
 * one on which a reference could still be taken is marked wedged.
 * NOTE(review): the reference taken here is not dropped in this function -
 * presumably guc_submit_wedged_fini releases it; verify.
 */
mutex_lock(&guc->submission_state.lock);
|
||||
xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
|
||||
if (xe_exec_queue_get_unless_zero(q))
|
||||
set_exec_queue_wedged(q);
|
||||
mutex_unlock(&guc->submission_state.lock);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
|
||||
|
@ -898,15 +909,12 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
|
|||
struct xe_guc *guc = exec_queue_to_guc(q);
|
||||
struct xe_device *xe = guc_to_xe(guc);
|
||||
struct xe_gpu_scheduler *sched = &ge->sched;
|
||||
bool wedged = xe_device_wedged(xe);
|
||||
bool wedged;
|
||||
|
||||
xe_assert(xe, xe_exec_queue_is_lr(q));
|
||||
trace_xe_exec_queue_lr_cleanup(q);
|
||||
|
||||
if (!wedged && xe_modparam.wedged_mode == 2) {
|
||||
guc_submit_wedged(exec_queue_to_guc(q));
|
||||
wedged = true;
|
||||
}
|
||||
wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
|
||||
|
||||
/* Kill the run_job / process_msg entry points */
|
||||
xe_sched_submission_stop(sched);
|
||||
|
@ -957,7 +965,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
|
|||
struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
|
||||
int err = -ETIME;
|
||||
int i = 0;
|
||||
bool wedged = xe_device_wedged(xe);
|
||||
bool wedged;
|
||||
|
||||
/*
|
||||
* TDR has fired before free job worker. Common if exec queue
|
||||
|
@ -981,10 +989,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
|
|||
|
||||
trace_xe_sched_job_timedout(job);
|
||||
|
||||
if (!wedged && xe_modparam.wedged_mode == 2) {
|
||||
guc_submit_wedged(exec_queue_to_guc(q));
|
||||
wedged = true;
|
||||
}
|
||||
wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
|
||||
|
||||
/* Kill the run_job entry point */
|
||||
xe_sched_submission_stop(sched);
|
||||
|
|
Loading…
Reference in New Issue
Block a user