# Settings for the GPU entry keyed `0` under the `gpus` table.
[gpus.0]
device_name = "A100" # device name
arch = 7 # architecture code, see nvmlDeviceArchitecture_t below (NOTE(review): 7 should be NVML_DEVICE_ARCH_AMPERE in the NVML headers; confirm)
pci = "00:00" # PCI address in bus_id:device_id form
#uuid = "xxxxx" # card uuid
link_gen = 4 # PCIe link generation
link_width_current = 16 # current PCIe link width
link_width_max = 16 # maximum PCIe link width
nvlink_active = [true, true, true, true, true, true] # whether each NVLink lane is active
remapping_failure = false # whether there is a row-remapping failure; available on Ampere and newer architectures
remapping_pending = false # whether a row-remapping is pending; available on Ampere and newer architectures
sram_ue = 0 # count of uncorrectable ECC errors that happened in SRAM
dram_ue = 0 # count of uncorrectable ECC errors that happened in DRAM
dram_ce = 0 # count of correctable ECC errors that happened in DRAM
retired_page_sbe = 0 # count of retired pages caused by ... (comment truncated here; NOTE(review): presumably single-bit ECC errors, per the `sbe` suffix; confirm)
ubuntu@192-222-52-71:~$ ai-accelerator-tool diagnose
I0212 00:46:22.690359 7296 diagnose.go:45] "Diagnose Results"
{
"GPU-231ec9bf-7a01-757f-f112-b0f1782a2c6b": [
{
"Name": "gpu_link_status",
"IsHealthy": true,
"Message": ""
},
{
"Name": "gpu_vram_unrecoverable_errors",
"IsHealthy": true,
"Message": ""
},
{
"Name": "gpu_vram_recoverable_errors",
"IsHealthy": true,
"Message": ""
}
],
"GPU-51617c87-8ba8-6274-2419-d9f235a9b80e": [
{
"Name": "gpu_link_status",
"IsHealthy": true,
"Message": ""
},
{
"Name": "gpu_vram_unrecoverable_errors",
"IsHealthy": true,
"Message": ""
},
{
"Name": "gpu_vram_recoverable_errors",
"IsHealthy": true,
"Message": ""
}
],
"GPU-6b87caa8-dfab-4716-b15c-6f5d95e6c63f": [
{
"Name": "gpu_link_status",
"IsHealthy": true,
"Message": ""
},
{
"Name": "gpu_vram_unrecoverable_errors",
"IsHealthy": true,
"Message": ""
},
{
"Name": "gpu_vram_recoverable_errors",
"IsHealthy": true,
"Message": ""
}
],
"GPU-8cd3fcbe-7b3a-e28c-64c5-d0271d874beb": [
{
"Name": "gpu_link_status",
"IsHealthy": true,
"Message": ""
},
{
"Name": "gpu_vram_unrecoverable_errors",
"IsHealthy": false,
"Message": "VRAM Unrecoverable Errors: get retired pages failed: exit status 1"
},
{
"Name": "gpu_vram_recoverable_errors",
"IsHealthy": false,
"Message": "VRAM Recoverable Errors: get retired pages failed: exit status 1"
}
],
"OVERALL": [
{
"Name": "gpu_driver_status",
"IsHealthy": true,
"Message": "GPU Driver is loaded successfully"
},
{
"Name": "gpu_card_count",
"IsHealthy": true,
"Message": "GPU Card Count: 4"
}
]
}
I expected `gpus.0` to be the first record in the
diagnose result. However, it is not.