Skip to content

[gpus.0] is not aligned with diagnose sequence #9

@Jeffwan

Description

@Jeffwan

I expected `[gpus.0]` to correspond to the first record in the diagnose result, but it does not.

[gpus.0]
device_name = "A100"                                    # device name
arch = 7                                                # architecture, see nvmlDeviceArchitecture_t below
pci = "00:00"                                           # pci bus_id:device_id
#uuid = "xxxxx"                                          # card uuid
link_gen = 4                                            # pcie link generation
link_width_current = 16                                 # pcie link width current
link_width_max = 16                                     # pcie link width max
nvlink_active = [true, true, true, true, true, true]    # if nvlink lane is active
remapping_failure = false                               # if there is a row-remapping failure, available for ampere and newer architecture
remapping_pending = false                               # if there is pending row-remapping, available for ampere and newer architecture
sram_ue = 0                                             # count of uncorrectable ecc errors happened in SRAM
dram_ue = 0                                             # count of uncorrectable ecc errors happened in DRAM
dram_ce = 0                                             # count of correctable ecc errors happened in DRAM
retired_page_sbe = 0                                    # count of retired pages caused by single-bit ecc errors
ubuntu@192-222-52-71:~$ ai-accelerator-tool diagnose
I0212 00:46:22.690359    7296 diagnose.go:45] "Diagnose Results"
{
  "GPU-231ec9bf-7a01-757f-f112-b0f1782a2c6b": [
    {
      "Name": "gpu_link_status",
      "IsHealthy": true,
      "Message": ""
    },
    {
      "Name": "gpu_vram_unrecoverable_errors",
      "IsHealthy": true,
      "Message": ""
    },
    {
      "Name": "gpu_vram_recoverable_errors",
      "IsHealthy": true,
      "Message": ""
    }
  ],
  "GPU-51617c87-8ba8-6274-2419-d9f235a9b80e": [
    {
      "Name": "gpu_link_status",
      "IsHealthy": true,
      "Message": ""
    },
    {
      "Name": "gpu_vram_unrecoverable_errors",
      "IsHealthy": true,
      "Message": ""
    },
    {
      "Name": "gpu_vram_recoverable_errors",
      "IsHealthy": true,
      "Message": ""
    }
  ],
  "GPU-6b87caa8-dfab-4716-b15c-6f5d95e6c63f": [
    {
      "Name": "gpu_link_status",
      "IsHealthy": true,
      "Message": ""
    },
    {
      "Name": "gpu_vram_unrecoverable_errors",
      "IsHealthy": true,
      "Message": ""
    },
    {
      "Name": "gpu_vram_recoverable_errors",
      "IsHealthy": true,
      "Message": ""
    }
  ],
  "GPU-8cd3fcbe-7b3a-e28c-64c5-d0271d874beb": [
    {
      "Name": "gpu_link_status",
      "IsHealthy": true,
      "Message": ""
    },
    {
      "Name": "gpu_vram_unrecoverable_errors",
      "IsHealthy": false,
      "Message": "VRAM Unrecoverable Errors: get retired pages failed: exit status 1"
    },
    {
      "Name": "gpu_vram_recoverable_errors",
      "IsHealthy": false,
      "Message": "VRAM Recoverable Errors: get retired pages failed: exit status 1"
    }
  ],
  "OVERALL": [
    {
      "Name": "gpu_driver_status",
      "IsHealthy": true,
      "Message": "GPU Driver is loaded successfully"
    },
    {
      "Name": "gpu_card_count",
      "IsHealthy": true,
      "Message": "GPU Card Count: 4"
    }
  ]
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions