tenstorrent / tt-smi

Tenstorrent console based hardware information program
Apache License 2.0
18 stars 3 forks source link

tt-smi json output has duplicated entry? #3

Closed warthog9 closed 4 months ago

warthog9 commented 5 months ago

While crunching data from the JSON files produced by tt-smi, I noticed what appears to be a duplicated board in the output. Slightly sanitized example:

 cat 02-01-2024_22:08:18_results.json
{
    "time": "2024-02-01T22:08:18.265900",
    "host_info": {
        "OS": "Linux",
        "Distro": "Ubuntu 20.04.3 LTS",
        "Kernel": "5.4.0-170-generic",
        "Hostname": "tt-metal-2",
        "Platform": "x86_64",
        "Python": "3.8.10",
        "Memory": "125.51 GB",
        "Driver": "TTKMD 1.26"
    },
    "device_info": [
        {
            "board_info": {
                "bus_id": "0000:31:00.0",
                "board_type": "n300",
                "board_id": "1234",
                "coords": "N/A",
                "dram_status": true,
                "dram_speed": "12G",
                "pcie_speed": 1,
                "pcie_width": 1
            },
            "telemetry": {
                "voltage": "0.72",
                "current": " 17.0",
                "power": " 13.0",
                "aiclk": " 500",
                "asic_temperature": "44.9"
            },
            "firmwares": {
                "arc_fw": "2.13.0.0",
                "arc_fw_date": "2023-07-14",
                "eth_fw": "6.2.0",
                "m3_bl_fw": "129.2.0.0",
                "m3_app_fw": "5.5.0.0",
                "tt_flash_version": "7.12.0.0"
            },
            "limits": {
                "vdd_min": "0.72",
                "vdd_max": "0.95",
                "tdp_limit": " 85",
                "tdc_limit": "160",
                "asic_fmax": "1000",
                "therm_trip_l1_limit": "83",
                "thm_limit": "75",
                "bus_peak_limit": null
            }
        },
        {
            "board_info": {
                "bus_id": "0000:4b:00.0",
                "board_type": "n300",
                "board_id": "4567",
                "coords": "N/A",
                "dram_status": true,
                "dram_speed": "12G",
                "pcie_speed": 1,
                "pcie_width": 1
            },
            "telemetry": {
                "voltage": "0.72",
                "current": " 15.0",
                "power": " 11.0",
                "aiclk": " 500",
                "asic_temperature": "44.2"
            },
            "firmwares": {
                "arc_fw": "2.13.0.0",
                "arc_fw_date": "2023-07-14",
                "eth_fw": "6.2.0",
                "m3_bl_fw": "129.2.0.0",
                "m3_app_fw": "5.5.0.0",
                "tt_flash_version": "7.12.0.0"
            },
            "limits": {
                "vdd_min": "0.72",
                "vdd_max": "0.95",
                "tdp_limit": " 85",
                "tdc_limit": "160",
                "asic_fmax": "1000",
                "therm_trip_l1_limit": "83",
                "thm_limit": "75",
                "bus_peak_limit": null
            }
        },
        {
            "board_info": {
                "bus_id": "0000:b1:00.0",
                "board_type": "n300",
                "board_id": "3456",
                "coords": "N/A",
                "dram_status": true,
                "dram_speed": "12G",
                "pcie_speed": 1,
                "pcie_width": 1
            },
            "telemetry": {
                "voltage": "0.72",
                "current": " 15.0",
                "power": " 11.0",
                "aiclk": " 500",
                "asic_temperature": "40.1"
            },
            "firmwares": {
                "arc_fw": "2.13.0.0",
                "arc_fw_date": "2023-07-14",
                "eth_fw": "6.2.0",
                "m3_bl_fw": "129.2.0.0",
                "m3_app_fw": "5.5.0.0",
                "tt_flash_version": "7.12.0.0"
            },
            "limits": {
                "vdd_min": "0.72",
                "vdd_max": "0.95",
                "tdp_limit": " 85",
                "tdc_limit": "160",
                "asic_fmax": "1000",
                "therm_trip_l1_limit": "83",
                "thm_limit": "75",
                "bus_peak_limit": null
            }
        },
        {
            "board_info": {
                "bus_id": "0000:ca:00.0",
                "board_type": "n300",
                "board_id": "2345",
                "coords": "N/A",
                "dram_status": true,
                "dram_speed": "12G",
                "pcie_speed": 1,
                "pcie_width": 1
            },
            "telemetry": {
                "voltage": "0.72",
                "current": " 15.0",
                "power": " 11.0",
                "aiclk": " 500",
                "asic_temperature": "43.9"
            },
            "firmwares": {
                "arc_fw": "2.13.0.0",
                "arc_fw_date": "2023-07-14",
                "eth_fw": "6.2.0",
                "m3_bl_fw": "129.2.0.0",
                "m3_app_fw": "5.5.0.0",
                "tt_flash_version": "7.12.0.0"
            },
            "limits": {
                "vdd_min": "0.72",
                "vdd_max": "0.95",
                "tdp_limit": " 85",
                "tdc_limit": "160",
                "asic_fmax": "1000",
                "therm_trip_l1_limit": "83",
                "thm_limit": "75",
                "bus_peak_limit": null
            }
        },
        {
            "board_info": {
                "bus_id": "N/A",
                "board_type": "n300",
                "board_id": "1234",
                "coords": "N/A",
                "dram_status": true,
                "dram_speed": "12G",
                "pcie_speed": 0,
                "pcie_width": 0
            },
            "telemetry": {
                "voltage": "0.72",
                "current": " 14.0",
                "power": " 10.0",
                "aiclk": " 500",
                "asic_temperature": "35.1"
            },
            "firmwares": {
                "arc_fw": "2.13.0.0",
                "arc_fw_date": "2023-07-14",
                "eth_fw": "6.2.0",
                "m3_bl_fw": "129.2.0.0",
                "m3_app_fw": "5.5.0.0",
                "tt_flash_version": "7.12.0.0"
            },
            "limits": {
                "vdd_min": "0.72",
                "vdd_max": "0.95",
                "tdp_limit": " 85",
                "tdc_limit": "160",
                "asic_fmax": "1000",
                "therm_trip_l1_limit": "83",
                "thm_limit": "75",
                "bus_peak_limit": null
            }
        }
    ]
}

The gotcha is that board_id "1234" appears twice, as both the first and the last entry, and the telemetry data doesn't quite match up between the two either, so it's not an exact 1:1 duplication.

sbansalTT commented 5 months ago

Hey John, now that we have added full WH support, could you try this experiment again? The reporting of the nb300 left and right chips has been improved - they will have the same board ID, but the coordinates and PCI bus ID will be different. That will be reflected in the logs as well. In the next release of SMI I will be adding "nb300 L" and "nb300 R" to the board type to further distinguish between the two.

sbansalTT commented 4 months ago

This has been fixed as of tt-smi 2.0.0

image