open-power / hostboot

System initialization firmware for Power systems
Apache License 2.0
75 stars 97 forks source link

checkstop: PC timebase Facility #234

Closed lili-lilili closed 1 year ago

lili-lilili commented 1 year ago

We encountered a checkstop issue during DC cycle testing. What does “EQ_CORE_FIR(16)[46] PC TimeBase Facility checkstop” mean?

root@NULL:~# peltool -i 0x50007813
{
"Private Header": {
    "Section Version":          "1",
    "Sub-section type":         "0",
    "Created by":               "0xE500",
    "Created at":               "09/09/2023 19:19:27",
    "Committed at":             "09/09/2023 19:19:27",
    "Creator Subsystem":        "BMC",
    "CSSVER":                   "",
    "Platform Log Id":          "0x50007813",
    "Entry Id":                 "0x50007813",
    "BMC Event Log Id":         "2822"
},
"User Header": {
    "Section Version":          "1",
    "Sub-section type":         "0",
    "Log Committed by":         "0x2000",
    "Subsystem":                "Processor Unit (CPU)",
    "Event Scope":              "Entire Platform",
    "Event Severity":           "Unrecoverable Error",
    "Event Type":               "Not Applicable",
    "Action Flags": [
                                "Service Action Required",
                                "Report Externally",
                                "HMC Call Home"
    ],
    "Host Transmission":        "Not Sent",
    "HMC Transmission":         "Not Sent"
},
"Primary SRC": {
    "Section Version":          "1",
    "Sub-section type":         "1",
    "Created by":               "0xE500",
    "SRC Version":              "0x02",
    "SRC Format":               "0x55",
    "Virtual Progress SRC":     "False",
    "I5/OS Service Event Bit":  "False",
    "Hypervisor Dump Initiated":"False",
    "Backplane CCIN":           "2E2D",
    "Terminate FW Error":       "False",
    "Deconfigured":             "False",
    "Guarded":                  "True",
    "Error Details": {
        "Message":              "Error Signature: 0x20DA0020 0x00010001 0x682C102E"
    },
    "Valid Word Count":         "0x09",
    "Reference Code":           "BD13E510",
    "Hex Word 2":               "00080055",
    "Hex Word 3":               "2E2D0010",
    "Hex Word 4":               "CC009544",
    "Hex Word 5":               "01000000",
    "Hex Word 6":               "20DA0020",
    "Hex Word 7":               "00010001",
    "Hex Word 8":               "682C102E",
    "Hex Word 9":               "00000000",
    "Callout Section": {
        "Callout Count":        "1",
        "Callouts": [{
            "FRU Type":         "Normal Hardware FRU",
            "Priority":         "Medium Priority",
            "Location Code":    "U78DA.ND0.WZS01SL-P0-C15",
            "Part Number":      "F210110",
            "CCIN":             "AB42",
            "Serial Number":    "            "
        }]
    },
    "SRC Details": {
        "Primary Attention": "system checkstop",
        "Signature Description": {
            "Chip Desc": "node 0 proc 1 (P10 2.0)",
            "Signature": "EQ_CORE_FIR(16)[46] PC TimeBase Facility checkstop",
            "Attn Type": "checkstop"
        }
    }
},
"Extended User Header": {
    "Section Version":          "1",
    "Sub-section type":         "0",
    "Created by":               "0x2000",
    "Reporting Machine Type":   "K1 Power",
    "Reporting Serial Number":  "NULL",
    "FW Released Ver":          "",
    "FW SubSys Version":        "1.12.00",
    "Common Ref Time":          "00/00/0000 00:00:00",
    "Symptom Id Len":           "36",
    "Symptom Id":               "BD13E510_20DA0020_00010001_682C102E"
},
"Failing MTMS": {
    "Section Version":          "1",
    "Sub-section type":         "0",
    "Created by":               "0x2000",
    "Machine Type Model":       "K1 Power",
    "Serial Number":            "NULL"
},
"User Data 0": {
    "Section Version": "1",
    "Sub-section type": "1",
    "Created by": "0x2000",
    "BMCLoad": "1.37 1.14 1.16",
    "BMCState": "Ready",
    "BMCUptime": "0y 0d 4h 18m 22s",
    "BootState": "SecondaryProcInit",
    "ChassisState": "On",
    "FW Version ID": "1.12.00",
    "HostState": "Running",
    "Process Name": "/usr/bin/openpower-hw-diags",
    "System IM": "60001000"
},
"User Data 1": {
    "Section Version": "1",
    "Sub-section type": "1",
    "Created by": "0x2000",
    "PEL_SUBSYSTEM": "0x13",
    "SRC6": "551157792",
    "SRC7": "65537",
    "SRC8": "1747718190",
    "_PID": "28813"
},
"User Data 2": {
    "Section Version": "1",
    "Sub-section type": "1",
    "Created by": "0x2000",
    "Data": [
        {
            "Deconfigured": false,
            "EntityPath": [
                38,
                1,
                0,
                2,
                0,
                5,
                1,
                35,
                4,
                83,
                0,
                7,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0
            ],
            "GuardType": "GARD_Unrecoverable",
            "Guarded": true,
            "LocationCode": "Ufcs-P0-C15",
            "Priority": "M"
        }
    ]
},
"User Data 3": {
    "Section Version": "1",
    "Sub-section type": "4",
    "Created by": "0xE500",
    "Hostboot Scratch Registers": {
        "0x0000283c": "0xaa811504",
        "0x000000004602f489": "0x0000000000000000"
    }
},
"User Data 4": {
    "Section Version": "1",
    "Sub-section type": "5",
    "Created by": "0xE500",
    "Scratch Register Error Signature": {
        "Chip ID": "0x004b0006",
        "Signature ID": "0x5993000a"
    }
},
"User Data 5": {
    "Section Version": "1",
    "Sub-section type": "3",
    "Created by": "0xE500",
    "Callout List FFDC": [
        {
            "Callout Type": "Hardware Callout",
            "Guard": true,
            "Priority": "medium",
            "Target": "physical:sys-0/node-0/proc-1/eq-4/fc-0/core-0"
        }
    ]
},
"User Data 6": {
    "Section Version": "1",
    "Sub-section type": "1",
    "Created by": "0xE500",
    "Signature List": [
        {
            "Chip Desc": "node 0 proc 0 (P10 2.0)",
            "Signature": "PB_EXT_FIR(0)[2] pb_x2_fir_err",
            "Attn Type": "checkstop"
        },
        {
            "Chip Desc": "node 0 proc 1 (P10 2.0)",
            "Signature": "EQ_CORE_FIR(16)[46] PC TimeBase Facility checkstop",
            "Attn Type": "checkstop"
        },
        {
            "Chip Desc": "node 0 proc 2 (P10 2.0)",
            "Signature": "PB_EXT_FIR(0)[4] pb_x4_fir_err",
            "Attn Type": "checkstop"
        },
        {
            "Chip Desc": "node 0 proc 3 (P10 2.0)",
            "Signature": "PB_EXT_FIR(0)[7] pb_x7_fir_err",
            "Attn Type": "checkstop"
        }
    ]
},
"User Data 7": {
    "Section Version": "1",
    "Sub-section type": "2",
    "Created by": "0xE500",
    "Register Dump": [
        "node 0 proc 0 (P10 2.0) ************************************",
        "  GFIR_CS                   (0x570F001C) 1000 0000 0000 0000",
        "  CFIR_N1_CS                (0x03040000) 8000 0000 4000 0000",
        "  CFIR_N1_CS_MASK           (0x03040040) 2000 0000 0000 0000",
        "  PB_EXT_FIR                (0x030113AE) 2000 0000 0000 0000",
        "  PB_EXT_FIR_MASK           (0x030113B1) D400 0000 0000 0000",
        "node 0 ocmb 6 (Explorer 2.0) *******************************",
        "  CHIPLET_OCMB_FIR_MASK     (0x08040002) 6627 FFE0 0000 0000",
        "node 0 proc 1 (P10 2.0) ************************************",
        "  GFIR_CS                   (0x570F001C) 0000 0000 0800 0000",
        "  CFIR_EQ_CS                (0x24040000) 8400 0000 0000 0000",
        "  CFIR_EQ_CS_MASK           (0x24040040) 2198 1800 0000 0000",
        "  PC_FIR_HOLD_OUT           (0x24028451) 2000 0800 0000 0000",
        "  TFAC_HOLD_OUT             (0x240284B7) 0078 0000 0001 0000",
        "  EQ_CORE_FIR               (0x24028440) 0000 0000 0002 0000",
        "  EQ_CORE_FIR_MASK          (0x24028443) 0221 D81A 71A9 F6FA",
        "  EQ_CORE_FIR_ACT1          (0x24028447) A914 2485 7410 0084",
        "  EQ_CORE_FIR_WOF           (0x24028448) 0000 0000 0002 0000",
        "node 0 proc 2 (P10 2.0) ************************************",
        "  GFIR_CS                   (0x570F001C) 1000 0000 0000 0000",
        "  CFIR_N1_CS                (0x03040000) 8000 0000 4000 0000",
        "  CFIR_N1_CS_MASK           (0x03040040) 2000 0000 0000 0000",
        "  PB_EXT_FIR                (0x030113AE) 0800 0000 0000 0000",
        "  PB_EXT_FIR_MASK           (0x030113B1) D400 0000 0000 0000",
        "node 0 proc 3 (P10 2.0) ************************************",
        "  GFIR_CS                   (0x570F001C) 1000 0000 0000 0000",
        "  CFIR_N1_CS                (0x03040000) 8000 0000 4000 0000",
        "  CFIR_N1_CS_MASK           (0x03040040) 2000 0000 0000 0000",
        "  PB_EXT_FIR                (0x030113AE) 0100 0000 0000 0000",
        "  PB_EXT_FIR_MASK           (0x030113B1) B400 0000 0000 0000",
        "node 0 ocmb 56 (Explorer 2.0) ******************************",
        "  CHIPLET_OCMB_FIR_MASK     (0x08040002) 6627 FFE0 0000 0000"
    ]
}
}
dcrowell77 commented 1 year ago

The timebase is the constantly ticking clock on all of the cores. I don't know the specifics, but this checkstop indicates that the hardware found an issue in that logic. That logic is sourced from an internal processor clock, which is in turn sourced from the system reference clock.