xinhui panc030f2e2018-10-31 14:38:28 +08001/*
2 * Copyright 2018 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22 *
23 */
24#include <linux/debugfs.h>
25#include <linux/list.h>
26#include <linux/module.h>
Sam Ravnborgf8677232019-06-10 00:07:51 +020027#include <linux/uaccess.h>
Andrey Grodzovsky7c6e68c2019-09-13 17:40:32 -050028#include <linux/reboot.h>
29#include <linux/syscalls.h>
Sam Ravnborgf8677232019-06-10 00:07:51 +020030
xinhui panc030f2e2018-10-31 14:38:28 +080031#include "amdgpu.h"
32#include "amdgpu_ras.h"
xinhui panb404ae82019-03-07 11:49:26 +080033#include "amdgpu_atomfirmware.h"
Hawking Zhang4e644ff2019-06-05 14:57:00 +080034#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
xinhui panc030f2e2018-10-31 14:38:28 +080035
xinhui panc030f2e2018-10-31 14:38:28 +080036const char *ras_error_string[] = {
37 "none",
38 "parity",
39 "single_correctable",
40 "multi_uncorrectable",
41 "poison",
42};
43
44const char *ras_block_string[] = {
45 "umc",
46 "sdma",
47 "gfx",
48 "mmhub",
49 "athub",
50 "pcie_bif",
51 "hdp",
52 "xgmi_wafl",
53 "df",
54 "smn",
55 "sem",
56 "mp0",
57 "mp1",
58 "fuse",
59};
60
61#define ras_err_str(i) (ras_error_string[ffs(i)])
62#define ras_block_str(i) (ras_block_string[i])
63
xinhui pana5648082019-05-08 19:12:24 +080064#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1
65#define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2
xinhui pan108c6a62019-03-11 15:23:00 +080066#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
67
Tao Zhou7cdc2ee32019-07-24 11:19:56 +080068/* inject address is 52 bits */
69#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
70
Andrey Grodzovsky7c6e68c2019-09-13 17:40:32 -050071
72atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
73
xinhui panefb426d2019-05-28 14:47:31 +080074static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
75 uint64_t offset, uint64_t size,
76 struct amdgpu_bo **bo_ptr);
77static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
78 struct amdgpu_bo **bo_ptr);
79
xinhui panc030f2e2018-10-31 14:38:28 +080080static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
81 size_t size, loff_t *pos)
82{
83 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
84 struct ras_query_if info = {
85 .head = obj->head,
86 };
87 ssize_t s;
88 char val[128];
89
90 if (amdgpu_ras_error_query(obj->adev, &info))
91 return -EINVAL;
92
93 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
94 "ue", info.ue_count,
95 "ce", info.ce_count);
96 if (*pos >= s)
97 return 0;
98
99 s -= *pos;
100 s = min_t(u64, s, size);
101
102
103 if (copy_to_user(buf, &val[*pos], s))
104 return -EINVAL;
105
106 *pos += s;
107
108 return s;
109}
110
xinhui panc030f2e2018-10-31 14:38:28 +0800111static const struct file_operations amdgpu_ras_debugfs_ops = {
112 .owner = THIS_MODULE,
113 .read = amdgpu_ras_debugfs_read,
xinhui pan190211a2019-03-21 15:00:47 +0800114 .write = NULL,
xinhui panc030f2e2018-10-31 14:38:28 +0800115 .llseek = default_llseek
116};
117
xinhui pan96ebb302019-03-01 16:32:11 +0800118static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
119{
120 int i;
121
122 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
123 *block_id = i;
124 if (strcmp(name, ras_block_str(i)) == 0)
125 return 0;
126 }
127 return -EINVAL;
128}
129
130static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
131 const char __user *buf, size_t size,
132 loff_t *pos, struct ras_debug_if *data)
133{
134 ssize_t s = min_t(u64, 64, size);
135 char str[65];
136 char block_name[33];
137 char err[9] = "ue";
138 int op = -1;
139 int block_id;
Tao Zhou44494f92019-08-07 14:27:42 +0800140 uint32_t sub_block;
xinhui pan96ebb302019-03-01 16:32:11 +0800141 u64 address, value;
142
143 if (*pos)
144 return -EINVAL;
145 *pos = size;
146
147 memset(str, 0, sizeof(str));
148 memset(data, 0, sizeof(*data));
149
150 if (copy_from_user(str, buf, s))
151 return -EINVAL;
152
153 if (sscanf(str, "disable %32s", block_name) == 1)
154 op = 0;
155 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
156 op = 1;
157 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
158 op = 2;
Andrey Grodzovskyd5ea0932019-08-22 15:01:37 -0400159 else if (sscanf(str, "reboot %32s", block_name) == 1)
160 op = 3;
xinhui panb0762962019-03-11 18:10:57 +0800161 else if (str[0] && str[1] && str[2] && str[3])
xinhui pan96ebb302019-03-01 16:32:11 +0800162		/* ascii string, but no command matched. */
163 return -EINVAL;
164
165 if (op != -1) {
166 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
167 return -EINVAL;
168
169 data->head.block = block_id;
Tao Zhoue1063492019-07-23 13:07:24 +0800170 /* only ue and ce errors are supported */
171 if (!memcmp("ue", err, 2))
172 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
173 else if (!memcmp("ce", err, 2))
174 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
175 else
176 return -EINVAL;
177
xinhui pan96ebb302019-03-01 16:32:11 +0800178 data->op = op;
179
180 if (op == 2) {
Tao Zhou44494f92019-08-07 14:27:42 +0800181 if (sscanf(str, "%*s %*s %*s %u %llu %llu",
182 &sub_block, &address, &value) != 3)
183 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
184 &sub_block, &address, &value) != 3)
xinhui pan96ebb302019-03-01 16:32:11 +0800185 return -EINVAL;
Tao Zhou44494f92019-08-07 14:27:42 +0800186 data->head.sub_block_index = sub_block;
xinhui pan96ebb302019-03-01 16:32:11 +0800187 data->inject.address = address;
188 data->inject.value = value;
189 }
190 } else {
xinhui pan73aa8e12019-03-19 11:16:32 +0800191 if (size < sizeof(*data))
xinhui pan96ebb302019-03-01 16:32:11 +0800192 return -EINVAL;
193
194 if (copy_from_user(data, buf, sizeof(*data)))
195 return -EINVAL;
196 }
197
198 return 0;
199}
Andrey Grodzovsky7c6e68c2019-09-13 17:40:32 -0500200
201static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
202 struct ras_common_if *head);
203
Tom St Denis74abc222019-05-24 09:21:54 -0400204/**
205 * DOC: AMDGPU RAS debugfs control interface
xinhui pan36ea1bd2019-01-31 16:55:07 +0800206 *
 207 * It accepts struct ras_debug_if, which has two members.
208 *
209 * First member: ras_debug_if::head or ras_debug_if::inject.
xinhui pan96ebb302019-03-01 16:32:11 +0800210 *
211 * head is used to indicate which IP block will be under control.
xinhui pan36ea1bd2019-01-31 16:55:07 +0800212 *
213 * head has four members, they are block, type, sub_block_index, name.
214 * block: which IP will be under control.
215 * type: what kind of error will be enabled/disabled/injected.
 216 * sub_block_index: some IPs have subcomponents, e.g. GFX, SDMA.
217 * name: the name of IP.
218 *
 219 * inject has two more members than head: address and value.
 220 * As their names indicate, the inject operation will write the
 221 * value to the address.
222 *
223 * Second member: struct ras_debug_if::op.
224 * It has three kinds of operations.
225 * 0: disable RAS on the block. Take ::head as its data.
226 * 1: enable RAS on the block. Take ::head as its data.
227 * 2: inject errors on the block. Take ::inject as its data.
228 *
xinhui pan96ebb302019-03-01 16:32:11 +0800229 * How to use the interface?
230 * programs:
 231 * copy the struct ras_debug_if into your code and initialize it,
 232 * then write the struct to the control node (see the sketch after this comment).
233 *
234 * bash:
Tao Zhou44494f92019-08-07 14:27:42 +0800235 * echo op block [error [sub_block address value]] > .../ras/ras_ctrl
xinhui pan96ebb302019-03-01 16:32:11 +0800236 * op: disable, enable, inject
237 * disable: only block is needed
238 * enable: block and error are needed
 239 * inject: error, sub_block, address and value are needed
 240 * block: umc, sdma, gfx, .........
241 * see ras_block_string[] for details
242 * error: ue, ce
243 * ue: multi_uncorrectable
244 * ce: single_correctable
Tao Zhou44494f92019-08-07 14:27:42 +0800245 * sub_block: sub block index, pass 0 if there is no sub block
xinhui pan96ebb302019-03-01 16:32:11 +0800246 *
 247 * here are some examples of bash commands:
Tao Zhou44494f92019-08-07 14:27:42 +0800248 * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
249 * echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
xinhui pan96ebb302019-03-01 16:32:11 +0800250 * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
251 *
xinhui pan36ea1bd2019-01-31 16:55:07 +0800252 * How to check the result?
253 *
254 * For disable/enable, please check ras features at
255 * /sys/class/drm/card[0/1/2...]/device/ras/features
256 *
257 * For inject, please check corresponding err count at
258 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
259 *
260 * NOTE: operation is only allowed on blocks which are supported.
261 * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
262 */
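/*
 * A minimal userspace sketch of the "programs" flow above. This is
 * illustrative only: it assumes a userspace copy of struct ras_debug_if
 * with the same layout as the kernel's, that debugfs is mounted at
 * /sys/kernel/debug, and it omits error handling. op = 1 enables RAS on
 * the block, as described above.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	struct ras_debug_if data = { 0 };
 *	int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *
 *	data.head.block = AMDGPU_RAS_BLOCK__UMC;
 *	data.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
 *	data.op = 1;
 *	if (write(fd, &data, sizeof(data)) != sizeof(data))
 *		perror("ras_ctrl write");
 *	close(fd);
 */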
263static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
264 size_t size, loff_t *pos)
265{
266 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
267 struct ras_debug_if data;
268 int ret = 0;
269
xinhui pan96ebb302019-03-01 16:32:11 +0800270 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
271 if (ret)
xinhui pan36ea1bd2019-01-31 16:55:07 +0800272 return -EINVAL;
273
xinhui pan36ea1bd2019-01-31 16:55:07 +0800274 if (!amdgpu_ras_is_supported(adev, data.head.block))
275 return -EINVAL;
276
277 switch (data.op) {
278 case 0:
279 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
280 break;
281 case 1:
282 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
283 break;
284 case 2:
Tao Zhou7cdc2ee32019-07-24 11:19:56 +0800285 if ((data.inject.address >= adev->gmc.mc_vram_size) ||
286 (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
287 ret = -EINVAL;
288 break;
289 }
290
 291 /* data.inject.address is an offset rather than an absolute gpu address */
xinhui pan36ea1bd2019-01-31 16:55:07 +0800292 ret = amdgpu_ras_error_inject(adev, &data.inject);
293 break;
Andrey Grodzovskyd5ea0932019-08-22 15:01:37 -0400294 case 3:
295 amdgpu_ras_get_context(adev)->reboot = true;
296 break;
xinhui pan96ebb302019-03-01 16:32:11 +0800297 default:
298 ret = -EINVAL;
299 break;
xinhui pan36ea1bd2019-01-31 16:55:07 +0800300 };
301
302 if (ret)
303 return -EINVAL;
304
305 return size;
306}
307
Andrey Grodzovsky084fe132019-09-09 16:00:56 -0400308/**
309 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
310 *
311 * Usage: echo 1 > ../ras/ras_eeprom_reset will reset EEPROM table to 0 entries.
312 */
313static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
314 size_t size, loff_t *pos)
315{
316 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
317 int ret;
318
319 ret = amdgpu_ras_eeprom_reset_table(&adev->psp.ras.ras->eeprom_control);
320
321 return ret == 1 ? size : -EIO;
322}
323
xinhui pan36ea1bd2019-01-31 16:55:07 +0800324static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
325 .owner = THIS_MODULE,
326 .read = NULL,
327 .write = amdgpu_ras_debugfs_ctrl_write,
328 .llseek = default_llseek
329};
330
Andrey Grodzovsky084fe132019-09-09 16:00:56 -0400331static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
332 .owner = THIS_MODULE,
333 .read = NULL,
334 .write = amdgpu_ras_debugfs_eeprom_write,
335 .llseek = default_llseek
336};
337
xinhui panc030f2e2018-10-31 14:38:28 +0800338static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
339 struct device_attribute *attr, char *buf)
340{
341 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
342 struct ras_query_if info = {
343 .head = obj->head,
344 };
345
346 if (amdgpu_ras_error_query(obj->adev, &info))
347 return -EINVAL;
348
349 return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
350 "ue", info.ue_count,
351 "ce", info.ce_count);
352}
353
354/* obj begin */
355
356#define get_obj(obj) do { (obj)->use++; } while (0)
357#define alive_obj(obj) ((obj)->use)
358
359static inline void put_obj(struct ras_manager *obj)
360{
361 if (obj && --obj->use == 0)
362 list_del(&obj->node);
363 if (obj && obj->use < 0) {
364 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
365 }
366}
367
368/* make one obj and return it. */
369static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
370 struct ras_common_if *head)
371{
372 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
373 struct ras_manager *obj;
374
375 if (!con)
376 return NULL;
377
378 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
379 return NULL;
380
381 obj = &con->objs[head->block];
 382 /* already exists, return obj? */
383 if (alive_obj(obj))
384 return NULL;
385
386 obj->head = *head;
387 obj->adev = adev;
388 list_add(&obj->node, &con->head);
389 get_obj(obj);
390
391 return obj;
392}
393
394/* return an obj equal to head, or the first when head is NULL */
395static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
396 struct ras_common_if *head)
397{
398 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
399 struct ras_manager *obj;
400 int i;
401
402 if (!con)
403 return NULL;
404
405 if (head) {
406 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
407 return NULL;
408
409 obj = &con->objs[head->block];
410
411 if (alive_obj(obj)) {
412 WARN_ON(head->block != obj->head.block);
413 return obj;
414 }
415 } else {
416 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
417 obj = &con->objs[i];
418 if (alive_obj(obj)) {
419 WARN_ON(i != obj->head.block);
420 return obj;
421 }
422 }
423 }
424
425 return NULL;
426}
427/* obj end */
428
429/* feature ctl begin */
430static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
431 struct ras_common_if *head)
432{
xinhui pan5caf4662019-03-11 14:12:40 +0800433 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
434
435 return con->hw_supported & BIT(head->block);
xinhui panc030f2e2018-10-31 14:38:28 +0800436}
437
438static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
439 struct ras_common_if *head)
440{
441 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
442
443 return con->features & BIT(head->block);
444}
445
446/*
447 * if obj is not created, then create one.
448 * set feature enable flag.
449 */
450static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
451 struct ras_common_if *head, int enable)
452{
453 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
454 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
455
xinhui pan5caf4662019-03-11 14:12:40 +0800456 /* If hardware does not support ras, then do not create obj.
 457 * But if hardware supports ras, we can create the obj.
 458 * The ras framework checks con->hw_supported to see if it needs to do
 459 * the corresponding initialization.
 460 * The IP checks con->supported to see if it needs to disable ras.
461 */
xinhui panc030f2e2018-10-31 14:38:28 +0800462 if (!amdgpu_ras_is_feature_allowed(adev, head))
463 return 0;
464 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
465 return 0;
466
467 if (enable) {
468 if (!obj) {
469 obj = amdgpu_ras_create_obj(adev, head);
470 if (!obj)
471 return -EINVAL;
472 } else {
473 /* In case we create obj somewhere else */
474 get_obj(obj);
475 }
476 con->features |= BIT(head->block);
477 } else {
478 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
479 con->features &= ~BIT(head->block);
480 put_obj(obj);
481 }
482 }
483
484 return 0;
485}
486
487/* wrapper of psp_ras_enable_features */
488int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
489 struct ras_common_if *head, bool enable)
490{
491 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
492 union ta_ras_cmd_input info;
493 int ret;
494
495 if (!con)
496 return -EINVAL;
497
498 if (!enable) {
499 info.disable_features = (struct ta_ras_disable_features_input) {
xinhui pan828cfa22019-03-21 15:13:38 +0800500 .block_id = amdgpu_ras_block_to_ta(head->block),
501 .error_type = amdgpu_ras_error_to_ta(head->type),
xinhui panc030f2e2018-10-31 14:38:28 +0800502 };
503 } else {
504 info.enable_features = (struct ta_ras_enable_features_input) {
xinhui pan828cfa22019-03-21 15:13:38 +0800505 .block_id = amdgpu_ras_block_to_ta(head->block),
506 .error_type = amdgpu_ras_error_to_ta(head->type),
xinhui panc030f2e2018-10-31 14:38:28 +0800507 };
508 }
509
510 /* Do not enable if it is not allowed. */
511 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
 512 /* Are we already in the state we are going to set? */
513 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
514 return 0;
515
516 ret = psp_ras_enable_features(&adev->psp, &info, enable);
517 if (ret) {
518 DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
519 enable ? "enable":"disable",
520 ras_block_str(head->block),
521 ret);
xinhui pan7af23eb2019-05-08 16:13:03 +0800522 if (ret == TA_RAS_STATUS__RESET_NEEDED)
523 return -EAGAIN;
xinhui panc030f2e2018-10-31 14:38:28 +0800524 return -EINVAL;
525 }
526
527 /* setup the obj */
528 __amdgpu_ras_feature_enable(adev, head, enable);
529
530 return 0;
531}
532
xinhui pan77de5022019-04-08 14:49:37 +0800533/* Only used in device probe stage and called only once. */
534int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
535 struct ras_common_if *head, bool enable)
536{
537 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
538 int ret;
539
540 if (!con)
541 return -EINVAL;
542
543 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
xinhui pan7af23eb2019-05-08 16:13:03 +0800544 if (enable) {
 545 /* There is no harm in issuing a ras TA cmd regardless of
 546 * the current ras state.
 547 * If current state == target state, it will do nothing.
 548 * But sometimes it requests the driver to reset and repost
549 * with error code -EAGAIN.
550 */
551 ret = amdgpu_ras_feature_enable(adev, head, 1);
552 /* With old ras TA, we might fail to enable ras.
 553 * Log it and just set up the object.
 554 * TODO: need to remove this WA in the future.
555 */
556 if (ret == -EINVAL) {
557 ret = __amdgpu_ras_feature_enable(adev, head, 1);
558 if (!ret)
559 DRM_INFO("RAS INFO: %s setup object\n",
560 ras_block_str(head->block));
561 }
562 } else {
563 /* setup the object then issue a ras TA disable cmd.*/
564 ret = __amdgpu_ras_feature_enable(adev, head, 1);
565 if (ret)
566 return ret;
xinhui pan77de5022019-04-08 14:49:37 +0800567
xinhui pan77de5022019-04-08 14:49:37 +0800568 ret = amdgpu_ras_feature_enable(adev, head, 0);
xinhui pan7af23eb2019-05-08 16:13:03 +0800569 }
xinhui pan77de5022019-04-08 14:49:37 +0800570 } else
571 ret = amdgpu_ras_feature_enable(adev, head, enable);
572
573 return ret;
574}
575
xinhui panc030f2e2018-10-31 14:38:28 +0800576static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
577 bool bypass)
578{
579 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
580 struct ras_manager *obj, *tmp;
581
582 list_for_each_entry_safe(obj, tmp, &con->head, node) {
583 /* bypass psp.
584 * aka just release the obj and corresponding flags
585 */
586 if (bypass) {
587 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
588 break;
589 } else {
590 if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
591 break;
592 }
kbuild test robot289d5132019-03-06 13:26:11 +0800593 }
xinhui panc030f2e2018-10-31 14:38:28 +0800594
595 return con->features;
596}
597
598static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
599 bool bypass)
600{
601 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
602 int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
603 int i;
xinhui pan191051a2019-04-03 09:52:59 +0800604 const enum amdgpu_ras_error_type default_ras_type =
605 AMDGPU_RAS_ERROR__NONE;
xinhui panc030f2e2018-10-31 14:38:28 +0800606
607 for (i = 0; i < ras_block_count; i++) {
608 struct ras_common_if head = {
609 .block = i,
xinhui pan191051a2019-04-03 09:52:59 +0800610 .type = default_ras_type,
xinhui panc030f2e2018-10-31 14:38:28 +0800611 .sub_block_index = 0,
612 };
613 strcpy(head.name, ras_block_str(i));
614 if (bypass) {
615 /*
 616 * bypass psp. vbios enables ras for us,
 617 * so just create the obj.
618 */
619 if (__amdgpu_ras_feature_enable(adev, &head, 1))
620 break;
621 } else {
622 if (amdgpu_ras_feature_enable(adev, &head, 1))
623 break;
624 }
kbuild test robot289d5132019-03-06 13:26:11 +0800625 }
xinhui panc030f2e2018-10-31 14:38:28 +0800626
627 return con->features;
628}
629/* feature ctl end */
630
631/* query/inject/cure begin */
632int amdgpu_ras_error_query(struct amdgpu_device *adev,
633 struct ras_query_if *info)
634{
635 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
Tao Zhou6f102db2019-07-22 19:20:29 +0800636 struct ras_err_data err_data = {0, 0, 0, NULL};
xinhui panc030f2e2018-10-31 14:38:28 +0800637
638 if (!obj)
639 return -EINVAL;
xinhui panc030f2e2018-10-31 14:38:28 +0800640
Hawking Zhang939e22582019-07-17 21:49:53 +0800641 switch (info->head.block) {
642 case AMDGPU_RAS_BLOCK__UMC:
Tao Zhou045c0212019-07-23 12:18:39 +0800643 if (adev->umc.funcs->query_ras_error_count)
644 adev->umc.funcs->query_ras_error_count(adev, &err_data);
Tao Zhou13b7c46c2019-08-01 11:41:39 +0800645 /* umc query_ras_error_address is also responsible for clearing
646 * error status
647 */
648 if (adev->umc.funcs->query_ras_error_address)
649 adev->umc.funcs->query_ras_error_address(adev, &err_data);
Hawking Zhang939e22582019-07-17 21:49:53 +0800650 break;
Dennis Li83b05822019-07-31 20:45:50 +0800651 case AMDGPU_RAS_BLOCK__GFX:
652 if (adev->gfx.funcs->query_ras_error_count)
653 adev->gfx.funcs->query_ras_error_count(adev, &err_data);
654 break;
Tao Zhou9fb2d8d2019-08-06 20:22:49 +0800655 case AMDGPU_RAS_BLOCK__MMHUB:
656 if (adev->mmhub_funcs->query_ras_error_count)
657 adev->mmhub_funcs->query_ras_error_count(adev, &err_data);
658 break;
Guchun Chend7bd6802019-09-11 11:07:15 +0800659 case AMDGPU_RAS_BLOCK__PCIE_BIF:
660 if (adev->nbio.funcs->query_ras_error_count)
661 adev->nbio.funcs->query_ras_error_count(adev, &err_data);
662 break;
Hawking Zhang939e22582019-07-17 21:49:53 +0800663 default:
664 break;
665 }
Tao Zhou05a58342019-07-31 20:28:13 +0800666
667 obj->err_data.ue_count += err_data.ue_count;
668 obj->err_data.ce_count += err_data.ce_count;
669
xinhui panc030f2e2018-10-31 14:38:28 +0800670 info->ue_count = obj->err_data.ue_count;
671 info->ce_count = obj->err_data.ce_count;
672
Andrey Grodzovsky7c6e68c2019-09-13 17:40:32 -0500673 if (err_data.ce_count) {
Tao Zhou05a58342019-07-31 20:28:13 +0800674 dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
675 obj->err_data.ce_count, ras_block_str(info->head.block));
Andrey Grodzovsky7c6e68c2019-09-13 17:40:32 -0500676 }
677 if (err_data.ue_count) {
Tao Zhou05a58342019-07-31 20:28:13 +0800678 dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
679 obj->err_data.ue_count, ras_block_str(info->head.block));
Andrey Grodzovsky7c6e68c2019-09-13 17:40:32 -0500680 }
Tao Zhou05a58342019-07-31 20:28:13 +0800681
xinhui panc030f2e2018-10-31 14:38:28 +0800682 return 0;
683}
684
685/* wrapper of psp_ras_trigger_error */
686int amdgpu_ras_error_inject(struct amdgpu_device *adev,
687 struct ras_inject_if *info)
688{
689 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
690 struct ta_ras_trigger_error_input block_info = {
xinhui pan828cfa22019-03-21 15:13:38 +0800691 .block_id = amdgpu_ras_block_to_ta(info->head.block),
692 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
xinhui panc030f2e2018-10-31 14:38:28 +0800693 .sub_block_index = info->head.sub_block_index,
694 .address = info->address,
695 .value = info->value,
696 };
697 int ret = 0;
698
699 if (!obj)
700 return -EINVAL;
701
Dennis Li83b05822019-07-31 20:45:50 +0800702 switch (info->head.block) {
703 case AMDGPU_RAS_BLOCK__GFX:
704 if (adev->gfx.funcs->ras_error_inject)
705 ret = adev->gfx.funcs->ras_error_inject(adev, info);
706 else
707 ret = -EINVAL;
708 break;
709 case AMDGPU_RAS_BLOCK__UMC:
Tao Zhou9fb2d8d2019-08-06 20:22:49 +0800710 case AMDGPU_RAS_BLOCK__MMHUB:
Hawking Zhangf3170352019-09-08 09:09:15 +0800711 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
Guchun Chend7bd6802019-09-11 11:07:15 +0800712 case AMDGPU_RAS_BLOCK__PCIE_BIF:
Dennis Li83b05822019-07-31 20:45:50 +0800713 ret = psp_ras_trigger_error(&adev->psp, &block_info);
714 break;
715 default:
Hawking Zhanga5dd40c2019-07-18 13:59:38 +0800716 DRM_INFO("%s error injection is not supported yet\n",
717 ras_block_str(info->head.block));
Dennis Li83b05822019-07-31 20:45:50 +0800718 ret = -EINVAL;
Hawking Zhanga5dd40c2019-07-18 13:59:38 +0800719 }
720
xinhui panc030f2e2018-10-31 14:38:28 +0800721 if (ret)
722 DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
723 ras_block_str(info->head.block),
724 ret);
725
726 return ret;
727}
728
729int amdgpu_ras_error_cure(struct amdgpu_device *adev,
730 struct ras_cure_if *info)
731{
732 /* psp fw has no cure interface for now. */
733 return 0;
734}
735
736/* get the total error counts on all IPs */
Guchun Chen64cc5412019-08-16 15:06:52 +0800737unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
xinhui panc030f2e2018-10-31 14:38:28 +0800738 bool is_ce)
739{
740 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
741 struct ras_manager *obj;
742 struct ras_err_data data = {0, 0};
743
744 if (!con)
Guchun Chen64cc5412019-08-16 15:06:52 +0800745 return 0;
xinhui panc030f2e2018-10-31 14:38:28 +0800746
747 list_for_each_entry(obj, &con->head, node) {
748 struct ras_query_if info = {
749 .head = obj->head,
750 };
751
752 if (amdgpu_ras_error_query(adev, &info))
Guchun Chen64cc5412019-08-16 15:06:52 +0800753 return 0;
xinhui panc030f2e2018-10-31 14:38:28 +0800754
755 data.ce_count += info.ce_count;
756 data.ue_count += info.ue_count;
757 }
758
759 return is_ce ? data.ce_count : data.ue_count;
760}
761/* query/inject/cure end */
762
763
764/* sysfs begin */
765
xinhui pan466b1792019-05-07 11:53:31 +0800766static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
767 struct ras_badpage **bps, unsigned int *count);
768
769static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
770{
771 switch (flags) {
772 case 0:
773 return "R";
774 case 1:
775 return "P";
776 case 2:
777 default:
778 return "F";
779 };
780}
781
782/*
783 * DOC: ras sysfs gpu_vram_bad_pages interface
784 *
 785 * It allows the user to read the bad pages of vram on the gpu through
786 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
787 *
788 * It outputs multiple lines, and each line stands for one gpu page.
789 *
790 * The format of one line is below,
791 * gpu pfn : gpu page size : flags
792 *
793 * gpu pfn and gpu page size are printed in hex format.
 794 * flags can be one of the characters below,
 795 * R: reserved, this gpu page is reserved and not able to be used.
 796 * P: pending for reserve, this gpu page is marked as bad and will be reserved
 797 * in the next window of page_reserve.
 798 * F: unable to reserve. this gpu page can't be reserved for some reason.
799 *
800 * examples:
801 * 0x00000001 : 0x00001000 : R
802 * 0x00000002 : 0x00001000 : P
803 */
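/*
 * Worked reading of the second example above, purely as an illustration:
 * "0x00000002 : 0x00001000 : P" describes gpu pfn 0x2 with a page size of
 * 0x1000 bytes, i.e. the vram range [0x2000, 0x3000), which is marked bad
 * and still pending reservation.
 */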
804
805static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
806 struct kobject *kobj, struct bin_attribute *attr,
807 char *buf, loff_t ppos, size_t count)
808{
809 struct amdgpu_ras *con =
810 container_of(attr, struct amdgpu_ras, badpages_attr);
811 struct amdgpu_device *adev = con->adev;
812 const unsigned int element_size =
813 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
Slava Abramovd6ee4002019-05-16 16:17:53 -0400814 unsigned int start = div64_ul(ppos + element_size - 1, element_size);
815 unsigned int end = div64_ul(ppos + count - 1, element_size);
xinhui pan466b1792019-05-07 11:53:31 +0800816 ssize_t s = 0;
817 struct ras_badpage *bps = NULL;
818 unsigned int bps_count = 0;
819
820 memset(buf, 0, count);
821
822 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
823 return 0;
824
825 for (; start < end && start < bps_count; start++)
826 s += scnprintf(&buf[s], element_size + 1,
827 "0x%08x : 0x%08x : %1s\n",
828 bps[start].bp,
829 bps[start].size,
830 amdgpu_ras_badpage_flags_str(bps[start].flags));
831
832 kfree(bps);
833
834 return s;
835}
836
xinhui panc030f2e2018-10-31 14:38:28 +0800837static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
838 struct device_attribute *attr, char *buf)
839{
840 struct amdgpu_ras *con =
841 container_of(attr, struct amdgpu_ras, features_attr);
xinhui panc030f2e2018-10-31 14:38:28 +0800842
Tao Zhou5212a3bd2019-08-09 17:39:06 +0800843 return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
xinhui panc030f2e2018-10-31 14:38:28 +0800844}
845
846static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
847{
848 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
849 struct attribute *attrs[] = {
850 &con->features_attr.attr,
851 NULL
852 };
xinhui pan466b1792019-05-07 11:53:31 +0800853 struct bin_attribute *bin_attrs[] = {
854 &con->badpages_attr,
855 NULL
856 };
xinhui panc030f2e2018-10-31 14:38:28 +0800857 struct attribute_group group = {
858 .name = "ras",
859 .attrs = attrs,
xinhui pan466b1792019-05-07 11:53:31 +0800860 .bin_attrs = bin_attrs,
xinhui panc030f2e2018-10-31 14:38:28 +0800861 };
862
863 con->features_attr = (struct device_attribute) {
864 .attr = {
865 .name = "features",
866 .mode = S_IRUGO,
867 },
868 .show = amdgpu_ras_sysfs_features_read,
869 };
xinhui pan466b1792019-05-07 11:53:31 +0800870
871 con->badpages_attr = (struct bin_attribute) {
872 .attr = {
873 .name = "gpu_vram_bad_pages",
874 .mode = S_IRUGO,
875 },
876 .size = 0,
877 .private = NULL,
878 .read = amdgpu_ras_sysfs_badpages_read,
879 };
880
xinhui pan163def42019-03-11 19:34:23 +0800881 sysfs_attr_init(attrs[0]);
xinhui pan466b1792019-05-07 11:53:31 +0800882 sysfs_bin_attr_init(bin_attrs[0]);
xinhui panc030f2e2018-10-31 14:38:28 +0800883
884 return sysfs_create_group(&adev->dev->kobj, &group);
885}
886
887static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
888{
889 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
890 struct attribute *attrs[] = {
891 &con->features_attr.attr,
892 NULL
893 };
xinhui pan466b1792019-05-07 11:53:31 +0800894 struct bin_attribute *bin_attrs[] = {
895 &con->badpages_attr,
896 NULL
897 };
xinhui panc030f2e2018-10-31 14:38:28 +0800898 struct attribute_group group = {
899 .name = "ras",
900 .attrs = attrs,
xinhui pan466b1792019-05-07 11:53:31 +0800901 .bin_attrs = bin_attrs,
xinhui panc030f2e2018-10-31 14:38:28 +0800902 };
903
904 sysfs_remove_group(&adev->dev->kobj, &group);
905
906 return 0;
907}
908
909int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
910 struct ras_fs_if *head)
911{
912 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
913
914 if (!obj || obj->attr_inuse)
915 return -EINVAL;
916
917 get_obj(obj);
918
919 memcpy(obj->fs_data.sysfs_name,
920 head->sysfs_name,
921 sizeof(obj->fs_data.sysfs_name));
922
923 obj->sysfs_attr = (struct device_attribute){
924 .attr = {
925 .name = obj->fs_data.sysfs_name,
926 .mode = S_IRUGO,
927 },
928 .show = amdgpu_ras_sysfs_read,
929 };
xinhui pan163def42019-03-11 19:34:23 +0800930 sysfs_attr_init(&obj->sysfs_attr.attr);
xinhui panc030f2e2018-10-31 14:38:28 +0800931
932 if (sysfs_add_file_to_group(&adev->dev->kobj,
933 &obj->sysfs_attr.attr,
934 "ras")) {
935 put_obj(obj);
936 return -EINVAL;
937 }
938
939 obj->attr_inuse = 1;
940
941 return 0;
942}
943
944int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
945 struct ras_common_if *head)
946{
947 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
948
949 if (!obj || !obj->attr_inuse)
950 return -EINVAL;
951
952 sysfs_remove_file_from_group(&adev->dev->kobj,
953 &obj->sysfs_attr.attr,
954 "ras");
955 obj->attr_inuse = 0;
956 put_obj(obj);
957
958 return 0;
959}
960
961static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
962{
963 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
964 struct ras_manager *obj, *tmp;
965
966 list_for_each_entry_safe(obj, tmp, &con->head, node) {
967 amdgpu_ras_sysfs_remove(adev, &obj->head);
968 }
969
970 amdgpu_ras_sysfs_remove_feature_node(adev);
971
972 return 0;
973}
974/* sysfs end */
975
976/* debugfs begin */
Greg Kroah-Hartman450f30e2019-06-13 15:19:19 +0200977static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
xinhui pan36ea1bd2019-01-31 16:55:07 +0800978{
979 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
980 struct drm_minor *minor = adev->ddev->primary;
xinhui pan36ea1bd2019-01-31 16:55:07 +0800981
Greg Kroah-Hartman450f30e2019-06-13 15:19:19 +0200982 con->dir = debugfs_create_dir("ras", minor->debugfs_root);
Guchun Chen012dd142019-09-16 13:42:46 +0800983 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
984 adev, &amdgpu_ras_debugfs_ctrl_ops);
985 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
986 adev, &amdgpu_ras_debugfs_eeprom_ops);
xinhui pan36ea1bd2019-01-31 16:55:07 +0800987}
988
Greg Kroah-Hartman450f30e2019-06-13 15:19:19 +0200989void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
xinhui panc030f2e2018-10-31 14:38:28 +0800990 struct ras_fs_if *head)
991{
992 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
993 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
xinhui panc030f2e2018-10-31 14:38:28 +0800994
995 if (!obj || obj->ent)
Greg Kroah-Hartman450f30e2019-06-13 15:19:19 +0200996 return;
xinhui panc030f2e2018-10-31 14:38:28 +0800997
998 get_obj(obj);
999
1000 memcpy(obj->fs_data.debugfs_name,
1001 head->debugfs_name,
1002 sizeof(obj->fs_data.debugfs_name));
1003
Greg Kroah-Hartman450f30e2019-06-13 15:19:19 +02001004 obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
1005 S_IWUGO | S_IRUGO, con->dir, obj,
1006 &amdgpu_ras_debugfs_ops);
xinhui panc030f2e2018-10-31 14:38:28 +08001007}
1008
Greg Kroah-Hartman450f30e2019-06-13 15:19:19 +02001009void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
xinhui panc030f2e2018-10-31 14:38:28 +08001010 struct ras_common_if *head)
1011{
1012 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1013
1014 if (!obj || !obj->ent)
Greg Kroah-Hartman450f30e2019-06-13 15:19:19 +02001015 return;
xinhui panc030f2e2018-10-31 14:38:28 +08001016
1017 debugfs_remove(obj->ent);
1018 obj->ent = NULL;
1019 put_obj(obj);
xinhui panc030f2e2018-10-31 14:38:28 +08001020}
1021
Greg Kroah-Hartman450f30e2019-06-13 15:19:19 +02001022static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
xinhui panc030f2e2018-10-31 14:38:28 +08001023{
1024 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1025 struct ras_manager *obj, *tmp;
1026
1027 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1028 amdgpu_ras_debugfs_remove(adev, &obj->head);
1029 }
1030
Guchun Chen012dd142019-09-16 13:42:46 +08001031 debugfs_remove_recursive(con->dir);
xinhui panc030f2e2018-10-31 14:38:28 +08001032 con->dir = NULL;
xinhui panc030f2e2018-10-31 14:38:28 +08001033}
1034/* debugfs end */
1035
1036/* ras fs */
1037
1038static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
1039{
xinhui panc030f2e2018-10-31 14:38:28 +08001040 amdgpu_ras_sysfs_create_feature_node(adev);
xinhui pan36ea1bd2019-01-31 16:55:07 +08001041 amdgpu_ras_debugfs_create_ctrl_node(adev);
xinhui panc030f2e2018-10-31 14:38:28 +08001042
1043 return 0;
1044}
1045
1046static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
1047{
1048 amdgpu_ras_debugfs_remove_all(adev);
1049 amdgpu_ras_sysfs_remove_all(adev);
1050 return 0;
1051}
1052/* ras fs end */
1053
1054/* ih begin */
1055static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
1056{
1057 struct ras_ih_data *data = &obj->ih_data;
1058 struct amdgpu_iv_entry entry;
1059 int ret;
Tao Zhoucf04dfd2019-07-22 20:27:25 +08001060 struct ras_err_data err_data = {0, 0, 0, NULL};
xinhui panc030f2e2018-10-31 14:38:28 +08001061
1062 while (data->rptr != data->wptr) {
1063 rmb();
1064 memcpy(&entry, &data->ring[data->rptr],
1065 data->element_size);
1066
1067 wmb();
1068 data->rptr = (data->aligned_element_size +
1069 data->rptr) % data->ring_size;
1070
 1071 /* Let IP handle its data; maybe we need to get the output
 1072 * from the callback to update the error type/count, etc.
1073 */
1074 if (data->cb) {
Tao Zhoucf04dfd2019-07-22 20:27:25 +08001075 ret = data->cb(obj->adev, &err_data, &entry);
xinhui panc030f2e2018-10-31 14:38:28 +08001076 /* ue will trigger an interrupt, and in that case
 1077 * we need to do a reset to recover the whole system.
 1078 * But leave that recovery to the IP; here we just dispatch
1079 * the error.
1080 */
Tao Zhoubd2280d2019-08-01 17:30:35 +08001081 if (ret == AMDGPU_RAS_SUCCESS) {
Tao Zhou51437622019-07-29 16:04:33 +08001082 /* these counts could be left as 0 if
1083 * some blocks do not count error number
1084 */
Tao Zhoucf04dfd2019-07-22 20:27:25 +08001085 obj->err_data.ue_count += err_data.ue_count;
Tao Zhou51437622019-07-29 16:04:33 +08001086 obj->err_data.ce_count += err_data.ce_count;
xinhui panc030f2e2018-10-31 14:38:28 +08001087 }
xinhui panc030f2e2018-10-31 14:38:28 +08001088 }
1089 }
1090}
1091
1092static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1093{
1094 struct ras_ih_data *data =
1095 container_of(work, struct ras_ih_data, ih_work);
1096 struct ras_manager *obj =
1097 container_of(data, struct ras_manager, ih_data);
1098
1099 amdgpu_ras_interrupt_handler(obj);
1100}
1101
1102int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1103 struct ras_dispatch_if *info)
1104{
1105 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1106 struct ras_ih_data *data = &obj->ih_data;
1107
1108 if (!obj)
1109 return -EINVAL;
1110
1111 if (data->inuse == 0)
1112 return 0;
1113
1114 /* Might be overflow... */
1115 memcpy(&data->ring[data->wptr], info->entry,
1116 data->element_size);
1117
1118 wmb();
1119 data->wptr = (data->aligned_element_size +
1120 data->wptr) % data->ring_size;
1121
1122 schedule_work(&data->ih_work);
1123
1124 return 0;
1125}
1126
1127int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1128 struct ras_ih_if *info)
1129{
1130 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1131 struct ras_ih_data *data;
1132
1133 if (!obj)
1134 return -EINVAL;
1135
1136 data = &obj->ih_data;
1137 if (data->inuse == 0)
1138 return 0;
1139
1140 cancel_work_sync(&data->ih_work);
1141
1142 kfree(data->ring);
1143 memset(data, 0, sizeof(*data));
1144 put_obj(obj);
1145
1146 return 0;
1147}
1148
1149int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1150 struct ras_ih_if *info)
1151{
1152 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1153 struct ras_ih_data *data;
1154
1155 if (!obj) {
 1156 /* in case we register the IH before enabling the ras feature */
1157 obj = amdgpu_ras_create_obj(adev, &info->head);
1158 if (!obj)
1159 return -EINVAL;
1160 } else
1161 get_obj(obj);
1162
1163 data = &obj->ih_data;
 1164 /* add the callback, etc. */
1165 *data = (struct ras_ih_data) {
1166 .inuse = 0,
1167 .cb = info->cb,
1168 .element_size = sizeof(struct amdgpu_iv_entry),
1169 .rptr = 0,
1170 .wptr = 0,
1171 };
1172
1173 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1174
1175 data->aligned_element_size = ALIGN(data->element_size, 8);
1176 /* the ring can store 64 iv entries. */
1177 data->ring_size = 64 * data->aligned_element_size;
1178 data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1179 if (!data->ring) {
1180 put_obj(obj);
1181 return -ENOMEM;
1182 }
1183
1184 /* IH is ready */
1185 data->inuse = 1;
1186
1187 return 0;
1188}
1189
1190static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1191{
1192 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1193 struct ras_manager *obj, *tmp;
1194
1195 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1196 struct ras_ih_if info = {
1197 .head = obj->head,
1198 };
1199 amdgpu_ras_interrupt_remove_handler(adev, &info);
1200 }
1201
1202 return 0;
1203}
1204/* ih end */
1205
1206/* recovery begin */
xinhui pan466b1792019-05-07 11:53:31 +08001207
1208/* return 0 on success.
 1209 * the caller needs to free bps.
1210 */
1211static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1212 struct ras_badpage **bps, unsigned int *count)
1213{
1214 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1215 struct ras_err_handler_data *data;
1216 int i = 0;
1217 int ret = 0;
1218
1219 if (!con || !con->eh_data || !bps || !count)
1220 return -EINVAL;
1221
1222 mutex_lock(&con->recovery_lock);
1223 data = con->eh_data;
1224 if (!data || data->count == 0) {
1225 *bps = NULL;
1226 goto out;
1227 }
1228
1229 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1230 if (!*bps) {
1231 ret = -ENOMEM;
1232 goto out;
1233 }
1234
1235 for (; i < data->count; i++) {
1236 (*bps)[i] = (struct ras_badpage){
Tao Zhou9dc23a62019-08-13 10:39:05 +08001237 .bp = data->bps[i].retired_page,
xinhui pan466b1792019-05-07 11:53:31 +08001238 .size = AMDGPU_GPU_PAGE_SIZE,
1239 .flags = 0,
1240 };
1241
1242 if (data->last_reserved <= i)
1243 (*bps)[i].flags = 1;
Tao Zhou9dc23a62019-08-13 10:39:05 +08001244 else if (data->bps_bo[i] == NULL)
xinhui pan466b1792019-05-07 11:53:31 +08001245 (*bps)[i].flags = 2;
1246 }
1247
1248 *count = data->count;
1249out:
1250 mutex_unlock(&con->recovery_lock);
1251 return ret;
1252}
1253
xinhui panc030f2e2018-10-31 14:38:28 +08001254static void amdgpu_ras_do_recovery(struct work_struct *work)
1255{
1256 struct amdgpu_ras *ras =
1257 container_of(work, struct amdgpu_ras, recovery_work);
1258
1259 amdgpu_device_gpu_recover(ras->adev, 0);
1260 atomic_set(&ras->in_recovery, 0);
1261}
1262
1263static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
1264 struct amdgpu_bo **bo_ptr)
1265{
1266 /* no need to free it actually. */
1267 amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
1268 return 0;
1269}
1270
1271/* reserve vram with size@offset */
1272static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
1273 uint64_t offset, uint64_t size,
1274 struct amdgpu_bo **bo_ptr)
1275{
1276 struct ttm_operation_ctx ctx = { false, false };
1277 struct amdgpu_bo_param bp;
1278 int r = 0;
1279 int i;
1280 struct amdgpu_bo *bo;
1281
1282 if (bo_ptr)
1283 *bo_ptr = NULL;
1284 memset(&bp, 0, sizeof(bp));
1285 bp.size = size;
1286 bp.byte_align = PAGE_SIZE;
1287 bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
1288 bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
1289 AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
1290 bp.type = ttm_bo_type_kernel;
1291 bp.resv = NULL;
1292
1293 r = amdgpu_bo_create(adev, &bp, &bo);
1294 if (r)
1295 return -EINVAL;
1296
1297 r = amdgpu_bo_reserve(bo, false);
1298 if (r)
1299 goto error_reserve;
1300
1301 offset = ALIGN(offset, PAGE_SIZE);
1302 for (i = 0; i < bo->placement.num_placement; ++i) {
1303 bo->placements[i].fpfn = offset >> PAGE_SHIFT;
1304 bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
1305 }
1306
1307 ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
1308 r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
1309 if (r)
1310 goto error_pin;
1311
1312 r = amdgpu_bo_pin_restricted(bo,
1313 AMDGPU_GEM_DOMAIN_VRAM,
1314 offset,
1315 offset + size);
1316 if (r)
1317 goto error_pin;
1318
1319 if (bo_ptr)
1320 *bo_ptr = bo;
1321
1322 amdgpu_bo_unreserve(bo);
1323 return r;
1324
1325error_pin:
1326 amdgpu_bo_unreserve(bo);
1327error_reserve:
1328 amdgpu_bo_unref(&bo);
1329 return r;
1330}
1331
1332/* alloc/realloc bps array */
1333static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1334 struct ras_err_handler_data *data, int pages)
1335{
1336 unsigned int old_space = data->count + data->space_left;
1337 unsigned int new_space = old_space + pages;
Tao Zhou9dc23a62019-08-13 10:39:05 +08001338 unsigned int align_space = ALIGN(new_space, 512);
1339 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1340 struct amdgpu_bo **bps_bo =
1341 kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
xinhui panc030f2e2018-10-31 14:38:28 +08001342
Tao Zhou9dc23a62019-08-13 10:39:05 +08001343 if (!bps || !bps_bo) {
1344 kfree(bps);
1345 kfree(bps_bo);
xinhui panc030f2e2018-10-31 14:38:28 +08001346 return -ENOMEM;
Tao Zhou9dc23a62019-08-13 10:39:05 +08001347 }
xinhui panc030f2e2018-10-31 14:38:28 +08001348
1349 if (data->bps) {
Tao Zhou9dc23a62019-08-13 10:39:05 +08001350 memcpy(bps, data->bps,
xinhui panc030f2e2018-10-31 14:38:28 +08001351 data->count * sizeof(*data->bps));
1352 kfree(data->bps);
1353 }
Tao Zhou9dc23a62019-08-13 10:39:05 +08001354 if (data->bps_bo) {
1355 memcpy(bps_bo, data->bps_bo,
1356 data->count * sizeof(*data->bps_bo));
1357 kfree(data->bps_bo);
1358 }
xinhui panc030f2e2018-10-31 14:38:28 +08001359
Tao Zhou9dc23a62019-08-13 10:39:05 +08001360 data->bps = bps;
1361 data->bps_bo = bps_bo;
xinhui panc030f2e2018-10-31 14:38:28 +08001362 data->space_left += align_space - old_space;
1363 return 0;
1364}
1365
 1366/* it deals with vram only. */
1367int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
Tao Zhou9dc23a62019-08-13 10:39:05 +08001368 struct eeprom_table_record *bps, int pages)
xinhui panc030f2e2018-10-31 14:38:28 +08001369{
1370 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
xinhui pan73aa8e12019-03-19 11:16:32 +08001371 struct ras_err_handler_data *data;
xinhui panc030f2e2018-10-31 14:38:28 +08001372 int ret = 0;
1373
xinhui pan73aa8e12019-03-19 11:16:32 +08001374 if (!con || !con->eh_data || !bps || pages <= 0)
xinhui panc030f2e2018-10-31 14:38:28 +08001375 return 0;
1376
1377 mutex_lock(&con->recovery_lock);
xinhui pan73aa8e12019-03-19 11:16:32 +08001378 data = con->eh_data;
xinhui panc030f2e2018-10-31 14:38:28 +08001379 if (!data)
1380 goto out;
1381
1382 if (data->space_left <= pages)
1383 if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
1384 ret = -ENOMEM;
1385 goto out;
1386 }
1387
Tao Zhou9dc23a62019-08-13 10:39:05 +08001388 memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
1389 data->count += pages;
xinhui panc030f2e2018-10-31 14:38:28 +08001390 data->space_left -= pages;
Tao Zhou9dc23a62019-08-13 10:39:05 +08001391
xinhui panc030f2e2018-10-31 14:38:28 +08001392out:
1393 mutex_unlock(&con->recovery_lock);
1394
1395 return ret;
1396}
1397
Tao Zhou78ad00c2019-08-15 14:55:55 +08001398/*
1399 * write error record array to eeprom, the function should be
1400 * protected by recovery_lock
1401 */
1402static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
1403{
1404 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1405 struct ras_err_handler_data *data;
1406 struct amdgpu_ras_eeprom_control *control =
1407 &adev->psp.ras.ras->eeprom_control;
1408 int save_count;
1409
1410 if (!con || !con->eh_data)
1411 return 0;
1412
1413 data = con->eh_data;
1414 save_count = data->count - control->num_recs;
1415 /* only new entries are saved */
1416 if (save_count > 0)
1417 if (amdgpu_ras_eeprom_process_recods(&con->eeprom_control,
1418 &data->bps[control->num_recs],
1419 true,
1420 save_count)) {
1421 DRM_ERROR("Failed to save EEPROM table data!");
1422 return -EIO;
1423 }
1424
1425 return 0;
1426}
1427
1428/*
1429 * read error record array in eeprom and reserve enough space for
1430 * storing new bad pages
1431 */
1432static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
1433{
1434 struct amdgpu_ras_eeprom_control *control =
1435 &adev->psp.ras.ras->eeprom_control;
1436 struct eeprom_table_record *bps = NULL;
1437 int ret = 0;
1438
1439 /* no bad page record, skip eeprom access */
1440 if (!control->num_recs)
1441 return ret;
1442
1443 bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
1444 if (!bps)
1445 return -ENOMEM;
1446
1447 if (amdgpu_ras_eeprom_process_recods(control, bps, false,
1448 control->num_recs)) {
1449 DRM_ERROR("Failed to load EEPROM table records!");
1450 ret = -EIO;
1451 goto out;
1452 }
1453
1454 ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
1455
1456out:
1457 kfree(bps);
1458 return ret;
1459}
1460
xinhui panc030f2e2018-10-31 14:38:28 +08001461/* called in gpu recovery/init */
1462int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
1463{
1464 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
xinhui pan73aa8e12019-03-19 11:16:32 +08001465 struct ras_err_handler_data *data;
xinhui panc030f2e2018-10-31 14:38:28 +08001466 uint64_t bp;
1467 struct amdgpu_bo *bo;
Tao Zhou78ad00c2019-08-15 14:55:55 +08001468 int i, ret = 0;
xinhui panc030f2e2018-10-31 14:38:28 +08001469
xinhui pan73aa8e12019-03-19 11:16:32 +08001470 if (!con || !con->eh_data)
xinhui panc030f2e2018-10-31 14:38:28 +08001471 return 0;
1472
1473 mutex_lock(&con->recovery_lock);
xinhui pan73aa8e12019-03-19 11:16:32 +08001474 data = con->eh_data;
1475 if (!data)
1476 goto out;
xinhui panc030f2e2018-10-31 14:38:28 +08001477 /* reserve vram at driver post stage. */
1478 for (i = data->last_reserved; i < data->count; i++) {
Tao Zhou9dc23a62019-08-13 10:39:05 +08001479 bp = data->bps[i].retired_page;
xinhui panc030f2e2018-10-31 14:38:28 +08001480
1481 if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
1482 PAGE_SIZE, &bo))
1483 DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
1484
Tao Zhou9dc23a62019-08-13 10:39:05 +08001485 data->bps_bo[i] = bo;
xinhui panc030f2e2018-10-31 14:38:28 +08001486 data->last_reserved = i + 1;
1487 }
Tao Zhou78ad00c2019-08-15 14:55:55 +08001488
 1489 /* continue to save bad pages to eeprom even if reserve_vram fails */
1490 ret = amdgpu_ras_save_bad_pages(adev);
xinhui pan73aa8e12019-03-19 11:16:32 +08001491out:
xinhui panc030f2e2018-10-31 14:38:28 +08001492 mutex_unlock(&con->recovery_lock);
Tao Zhou78ad00c2019-08-15 14:55:55 +08001493 return ret;
xinhui panc030f2e2018-10-31 14:38:28 +08001494}
1495
 1496/* called when the driver unloads */
1497static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
1498{
1499 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
xinhui pan73aa8e12019-03-19 11:16:32 +08001500 struct ras_err_handler_data *data;
xinhui panc030f2e2018-10-31 14:38:28 +08001501 struct amdgpu_bo *bo;
1502 int i;
1503
xinhui pan73aa8e12019-03-19 11:16:32 +08001504 if (!con || !con->eh_data)
xinhui panc030f2e2018-10-31 14:38:28 +08001505 return 0;
1506
1507 mutex_lock(&con->recovery_lock);
xinhui pan73aa8e12019-03-19 11:16:32 +08001508 data = con->eh_data;
1509 if (!data)
1510 goto out;
1511
xinhui panc030f2e2018-10-31 14:38:28 +08001512 for (i = data->last_reserved - 1; i >= 0; i--) {
Tao Zhou9dc23a62019-08-13 10:39:05 +08001513 bo = data->bps_bo[i];
xinhui panc030f2e2018-10-31 14:38:28 +08001514
1515 amdgpu_ras_release_vram(adev, &bo);
1516
Tao Zhou9dc23a62019-08-13 10:39:05 +08001517 data->bps_bo[i] = bo;
xinhui panc030f2e2018-10-31 14:38:28 +08001518 data->last_reserved = i;
1519 }
xinhui pan73aa8e12019-03-19 11:16:32 +08001520out:
xinhui panc030f2e2018-10-31 14:38:28 +08001521 mutex_unlock(&con->recovery_lock);
1522 return 0;
1523}
1524
Tao Zhou1a6fc072019-08-30 19:50:39 +08001525int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
xinhui panc030f2e2018-10-31 14:38:28 +08001526{
1527 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
Andrey Grodzovsky4d1337d2019-09-06 17:23:44 -04001528 struct ras_err_handler_data **data;
Tao Zhou78ad00c2019-08-15 14:55:55 +08001529 int ret;
xinhui panc030f2e2018-10-31 14:38:28 +08001530
Andrey Grodzovsky4d1337d2019-09-06 17:23:44 -04001531 if (con)
1532 data = &con->eh_data;
1533 else
1534 return 0;
1535
Tao Zhou1a6fc072019-08-30 19:50:39 +08001536 *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
1537 if (!*data) {
1538 ret = -ENOMEM;
1539 goto out;
1540 }
xinhui panc030f2e2018-10-31 14:38:28 +08001541
1542 mutex_init(&con->recovery_lock);
1543 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
1544 atomic_set(&con->in_recovery, 0);
1545 con->adev = adev;
1546
Tao Zhou78ad00c2019-08-15 14:55:55 +08001547 ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
1548 if (ret)
Tao Zhou1a6fc072019-08-30 19:50:39 +08001549 goto free;
Tao Zhou78ad00c2019-08-15 14:55:55 +08001550
1551 if (adev->psp.ras.ras->eeprom_control.num_recs) {
1552 ret = amdgpu_ras_load_bad_pages(adev);
1553 if (ret)
Tao Zhou1a6fc072019-08-30 19:50:39 +08001554 goto free;
Tao Zhou78ad00c2019-08-15 14:55:55 +08001555 ret = amdgpu_ras_reserve_bad_pages(adev);
1556 if (ret)
Tao Zhou1a6fc072019-08-30 19:50:39 +08001557 goto release;
Tao Zhou78ad00c2019-08-15 14:55:55 +08001558 }
xinhui panc030f2e2018-10-31 14:38:28 +08001559
1560 return 0;
Tao Zhou1a6fc072019-08-30 19:50:39 +08001561
1562release:
1563 amdgpu_ras_release_bad_pages(adev);
1564free:
1565 con->eh_data = NULL;
1566 kfree((*data)->bps);
1567 kfree((*data)->bps_bo);
1568 kfree(*data);
1569out:
1570 DRM_WARN("Failed to initialize ras recovery!\n");
1571
1572 return ret;
xinhui panc030f2e2018-10-31 14:38:28 +08001573}
1574
1575static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
1576{
1577 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1578 struct ras_err_handler_data *data = con->eh_data;
1579
Tao Zhou1a6fc072019-08-30 19:50:39 +08001580 /* recovery_init failed to init it, fini is useless */
1581 if (!data)
1582 return 0;
1583
xinhui panc030f2e2018-10-31 14:38:28 +08001584 cancel_work_sync(&con->recovery_work);
xinhui panc030f2e2018-10-31 14:38:28 +08001585 amdgpu_ras_release_bad_pages(adev);
1586
1587 mutex_lock(&con->recovery_lock);
1588 con->eh_data = NULL;
1589 kfree(data->bps);
Tao Zhou1a6fc072019-08-30 19:50:39 +08001590 kfree(data->bps_bo);
xinhui panc030f2e2018-10-31 14:38:28 +08001591 kfree(data);
1592 mutex_unlock(&con->recovery_lock);
1593
1594 return 0;
1595}
1596/* recovery end */
1597
xinhui pana5648082019-05-08 19:12:24 +08001598/* return 0 if ras will reset gpu and repost.*/
1599int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
1600 unsigned int block)
1601{
1602 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1603
1604 if (!ras)
1605 return -EINVAL;
1606
1607 ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1608 return 0;
1609}
1610
xinhui pan5caf4662019-03-11 14:12:40 +08001611/*
 1612 * check hardware's ras ability which will be saved in hw_supported.
 1613 * if hardware does not support ras, we can skip some ras initialization and
 1614 * forbid some ras operations from IPs.
 1615 * if software itself, say a boot parameter, limits the ras ability, we still
 1616 * need to allow the IPs to do some limited operations, like disable. In such
 1617 * cases, we have to initialize ras as normal, but need to check whether an
 1618 * operation is allowed or not in each function.
1619 */
1620static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
1621 uint32_t *hw_supported, uint32_t *supported)
xinhui panc030f2e2018-10-31 14:38:28 +08001622{
xinhui pan5caf4662019-03-11 14:12:40 +08001623 *hw_supported = 0;
1624 *supported = 0;
xinhui panc030f2e2018-10-31 14:38:28 +08001625
xinhui pan5caf4662019-03-11 14:12:40 +08001626 if (amdgpu_sriov_vf(adev) ||
xinhui panb404ae82019-03-07 11:49:26 +08001627 adev->asic_type != CHIP_VEGA20)
xinhui pan5caf4662019-03-11 14:12:40 +08001628 return;
xinhui panb404ae82019-03-07 11:49:26 +08001629
xinhui pan5d0f9032019-03-12 17:15:57 +08001630 if (adev->is_atom_fw &&
1631 (amdgpu_atomfirmware_mem_ecc_supported(adev) ||
1632 amdgpu_atomfirmware_sram_ecc_supported(adev)))
xinhui pan5caf4662019-03-11 14:12:40 +08001633 *hw_supported = AMDGPU_RAS_BLOCK_MASK;
xinhui panb404ae82019-03-07 11:49:26 +08001634
xinhui pan5caf4662019-03-11 14:12:40 +08001635 *supported = amdgpu_ras_enable == 0 ?
1636 0 : *hw_supported & amdgpu_ras_mask;
xinhui panc030f2e2018-10-31 14:38:28 +08001637}
1638
int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int r;

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev, &con->hw_supported,
			&con->supported);
	if (!con->hw_supported) {
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
		return 0;
	}

	con->features = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need to get this flag from vbios. */
	con->flags = RAS_DEFAULT_FLAGS;

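	/* Register the nbio RAS controller and ATHUB error-event interrupts
	 * when the nbio implementation provides them.
	 */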
	if (adev->nbio.funcs->init_ras_controller_interrupt) {
		r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
		if (r)
			return r;
	}

	if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
		r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
		if (r)
			return r;
	}

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	DRM_INFO("RAS INFO: ras initialized successfully, "
		 "hardware ability[%x] ras_mask[%x]\n",
		 con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}

/* helper function to handle common stuff in the ip late init phase */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block,
			 struct ras_fs_if *fs_info,
			 struct ras_ih_if *ih_info)
{
	int r;

	/* disable RAS feature per IP block if it is not supported */
	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
		return 0;
	}

	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
	if (r) {
		if (r == -EAGAIN) {
			/* request a gpu reset; this function will run again
			 * after the reset */
			amdgpu_ras_request_reset_on_boot(adev,
					ras_block->block);
			return 0;
		} else if (adev->in_suspend || adev->in_gpu_reset) {
			/* in the resume phase, if enabling ras fails, clean
			 * up all ras fs nodes and disable ras */
			goto cleanup;
		} else
			return r;
	}

	/* in the resume phase there is no need to create ras fs nodes */
	if (adev->in_suspend || adev->in_gpu_reset)
		return 0;

	if (ih_info->cb) {
		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
		if (r)
			goto interrupt;
	}

	amdgpu_ras_debugfs_create(adev, fs_info);

	r = amdgpu_ras_sysfs_create(adev, fs_info);
	if (r)
		goto sysfs;

	return 0;
cleanup:
	amdgpu_ras_sysfs_remove(adev, ras_block);
sysfs:
	amdgpu_ras_debugfs_remove(adev, ras_block);
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
interrupt:
	amdgpu_ras_feature_enable(adev, ras_block, 0);
	return r;
}

/* helper function to remove ras fs node and interrupt handler */
void amdgpu_ras_late_fini(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block,
			  struct ras_ih_if *ih_info)
{
	if (!ras_block || !ih_info)
		return;

	amdgpu_ras_sysfs_remove(adev, ras_block);
	amdgpu_ras_debugfs_remove(adev, ras_block);
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
	amdgpu_ras_feature_enable(adev, ras_block, 0);
}

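/* A minimal sketch of how an IP block might use the two helpers above from
 * its own late_init path, assuming the ras_fs_if/ras_ih_if layouts from
 * amdgpu_ras.h; the sysfs/debugfs names and the callback are illustrative
 * placeholders, not taken from a specific IP implementation:
 *
 *	struct ras_fs_if fs_info = {
 *		.sysfs_name = "gfx_err_count",
 *		.debugfs_name = "gfx_err_inject",
 *	};
 *	struct ras_ih_if ih_info = {
 *		.cb = gfx_process_ras_data_cb,
 *	};
 *	int r;
 *
 *	fs_info.head = ih_info.head = *ras_if;
 *	r = amdgpu_ras_late_init(adev, ras_if, &fs_info, &ih_info);
 *	if (r)
 *		return r;
 *
 *	// and on teardown:
 *	amdgpu_ras_late_fini(adev, ras_if, &ih_info);
 */
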
/* Do some init work after IP late init, as a dependence.
 * It runs in the resume/gpu reset/boot-up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that the IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but since the driver does not handle
		 * it, ERROR_NONE makes sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported blocks, but the boot
		 * parameter might disable some of them and one or more IPs
		 * may not be implemented yet, so we disable those on their
		 * behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should be no reference left. */
				WARN_ON(alive_obj(obj));
			}
		}
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
		con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
		/* Set up the ras obj state as disabled.
		 * This is for the init_by_vbios case.
		 * If we want to enable ras, just enable it in the normal way.
		 * If we want to disable it, we need to set up the ras obj as
		 * enabled, then issue another TA disable cmd.
		 * See feature_enable_on_boot.
		 */
		amdgpu_ras_disable_all_features(adev, 1);
		amdgpu_ras_reset_gpu(adev, 0);
	}
}

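/* Counterpart of amdgpu_ras_resume(): disable every RAS feature before the
 * device suspends, force-disabling anything that is still flagged enabled.
 */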
void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled. */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

/* Do some fini work before IP fini, as a dependence. */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

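/* Final RAS teardown: remove fs nodes and interrupt handlers, warn if any
 * feature is still enabled, then free the RAS context.
 */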
int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}

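/* Global fatal-error interrupt entry: the first caller sets
 * amdgpu_ras_in_intr and triggers a full GPU reset; subsequent calls are
 * ignored while the flag remains set.
 */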
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");

		amdgpu_ras_reset_gpu(adev, false);
	}
}