Open ZdayAlster opened 1 year ago
We have conducted two tests, both will report the timer stuck problem, but the register information is different: first: [ 323.289837862,3] SBE: Timer stuck, falling back to OPAL pollers. [ 323.289870355,3] SBE: You will likely have slower I2C and may have experienced increased jitter. [ 323.289945830,7] SBE: MBOX register dump for chip : 0 [ 323.289988745,7] SBE: PSU_SBE_DOORBELL_REG_RW: 0000000000000000 [ 323.290044178,7] SBE: PSU_HOST_SBE_MBOX_REG0: 00000201016cd401 [ 323.290077618,7] SBE: PSU_HOST_SBE_MBOX_REG1: 0000000000030d40 [ 323.290111155,7] SBE: PSU_HOST_SBE_MBOX_REG2: 0000000000000000 [ 323.290143797,7] SBE: PSU_HOST_SBE_MBOX_REG3: 0000000000000000 [ 323.290192471,7] SBE: PSU_HOST_DOORBELL_REG_RW: 0000000000000000 [ 323.290226915,7] SBE: PSU_HOST_SBE_MBOX_REG4: 00000000016cd401 [ 323.290267491,7] SBE: PSU_HOST_SBE_MBOX_REG5: 000000000000005c [ 323.290330197,7] SBE: PSU_HOST_SBE_MBOX_REG6: 00050002c7423213 [ 323.290362325,7] SBE: PSU_HOST_SBE_MBOX_REG7: 0000000000000000
the second time: [ 395.026940883,3] SBE: Timer stuck, falling back to OPAL pollers. [ 395.027001671,3] SBE: You will likely have slower I2C and may have experienced increased jitter. [ 395.027090241,7] SBE: MBOX register dump for chip : 0 [ 395.027132096,7] SBE: PSU_SBE_DOORBELL_REG_RW: 0000000000000000 [ 395.027194132,7] SBE: PSU_HOST_SBE_MBOX_REG0: 000002010298d401 [ 395.027226981,7] SBE: PSU_HOST_SBE_MBOX_REG1: 0000000000030d40 [ 395.027260392,7] SBE: PSU_HOST_SBE_MBOX_REG2: 0000000000000000 [ 395.027293728,7] SBE: PSU_HOST_SBE_MBOX_REG3: 0000000000000000 [ 395.027340497,7] SBE: PSU_HOST_DOORBELL_REG_RW: 0002000000000000 [ 395.027403301,7] SBE: PSU_HOST_SBE_MBOX_REG4: 000000000298d401 [ 395.027450021,7] SBE: PSU_HOST_SBE_MBOX_REG5: 000000000000005c [ 395.027510802,7] SBE: PSU_HOST_SBE_MBOX_REG6: 00050002c7423213 [ 395.027558420,7] SBE: PSU_HOST_SBE_MBOX_REG7: 0000000000000000
We now have two problems:
When we tested skiboot on the Rainer server, we encountered the "timer stuck" problem after entering petitboot. After that, the system stays in the wrong place and cannot operate.
Test environment information: Test server model: 9105-42A The test USES skiboot code: 87 skiboot c199fa017e7d3b1035bc06b306a58fafb56767
Here is the log: `dmesg: klogctl: Operation not permitted )0[1;24r[m[4l[?7h[?25l[?1c[H[J[H[J Petitboot () [2d ──────────────────────────────────────────────────────────────────────────────[4;3HSystem information[5;3HSystem configuration[6;3HSystem status log[7;3HLanguage[8;3HRescan devices[9;3HRetrieve config from URL[10;3HPlugins (0) [11d *[0;10;7mExit to shell [22;2H[m────────────────────────────────────────────────────────────────────────────── [23d Enter=accept, e=edit, n=new, x=exit, l=language, g=log, h=help [24d Welcome to Petitboot Info: Waiting for device discovery[ 260.089025205,7] IPMI: Got error response. cmd=0x9, netfn=0x1, rc=0x80 [ 260.089100568,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 260.089169380,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =133311569365 [ 260.089311687,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133209311671 [ 260.089376608,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133209376595 [ 260.089390535,8] opal_ipmi_send(cmd: 0x09 netfn: 0x00 len: 0x03) [ 260.089407648,8] BT: seq 0x4a netfn 0x00 cmd 0x09: Message sent to host [ 260.198520978,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 260.198599446,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =133420999204 [ 260.198728390,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133318728370 [ 260.198793206,8] opal_ipmi_send(cmd: 0x09 netfn: 0x00 len: 0x03) [ 260.198807125,8] BT: seq 0x4b netfn 0x00 cmd 0x09: Message sent to host [ 260.198896821,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133318896808 [ 260.307956405,7] IPMI: Got error response. cmd=0x9, netfn=0x1, rc=0xff [ 260.308040918,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =199999 [ 260.308110897,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =133530510370 [ 260.308493271,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133428493268 [ 260.308569287,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133428569281 [ 260.417455202,7] IPMI: Got error response. cmd=0x9, netfn=0x1, rc=0x80 [ 260.417545540,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 260.417614659,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =133640014644 [ 260.417708631,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133537708627 [ 260.417753592,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133537753585 [ 260.417770465,8] opal_ipmi_send(cmd: 0x09 netfn: 0x00 len: 0x03) [ 260.417782486,8] BT: seq 0x4d netfn 0x00 cmd 0x09: Message sent to host [ 261.014965779,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 261.015044209,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =133749444183 [ 261.015137588,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133647137574 [ 261.015180130,8] opal_ipmi_send(cmd: 0x09 netfn: 0x00 len: 0x03) [ 261.015196085,8] BT: seq 0x4e netfn 0x00 cmd 0x09: Message sent to host [ 261.015279764,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133647279750 [ 261.124389620,7] IPMI: Got error response. cmd=0x9, netfn=0x1, rc=0xff [ 261.124479620,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 261.124556885,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =133858956867 [ 261.124649298,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133756649283 [ 261.124694354,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133756694339 [ 261.233908056,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 261.233993909,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =133968393892 [ 261.234081954,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133866081921 [ 261.234133221,8] opal_ipmi_send(cmd: 0x01 netfn: 0x06 len: 0x00) [ 261.234144354,8] BT: seq 0x50 netfn 0x06 cmd 0x01: Message sent to host [ 261.234260199,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133866260183 [ 261.343348832,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 261.343426241,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =134077826225 [ 261.343519156,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133975519095 [ 261.343568231,8] opal_ipmi_send(cmd: 0x1a netfn: 0x3a len: 0x00) [ 261.343581315,8] BT: seq 0x51 netfn 0x3a cmd 0x1a: Message sent to host [ 261.343676949,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =133975676935 [ 261.452771744,7] IPMI: Got error response. cmd=0x1a, netfn=0x3b, rc=0xc1 [ 261.452839464,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 261.452916742,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =134187316725 [ 261.453009443,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134085009410 [ 261.453085044,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134085085025 [ 261.453844322,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134085844294 [ 261.453850640,8] OCC: IRQ received: 8001 [ 261.453917969,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134085917954 [ 261.453975908,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134085975890 [ 261.453991073,8] blocklevel_raw_read: 0x0 0xc0001ea90000 0x1000 [ 261.453993970,8] HIOMAP: Flash read at 0 for 1000 [ 261.453995732,8] HIOMAP: Reading at 0x00000000 for 0x00001000 offset: 0x0c000000 [ 261.454708308,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134086708290 [ 261.455250274,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134087250244 [ 261.455281105,8] blocklevel_raw_read: 0x26bd000 0xc000051a0000 0x1c00 [ 261.455284341,8] HIOMAP: Flash read at 26bd000 for 1c00 [ 261.455304323,8] BT: seq 0x52 netfn 0x3a cmd 0x5a: Message sent to host [ 261.455510610,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134087510594 [ 261.465572230,7] HIOMAP: Opened read window from 0x26bd000 for 8192 bytes at 0xc200000 [ 261.467462948,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134099462915 [ 261.467557444,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134099557415 [ 261.478299700,7] HIOMAP: Opened read window from 0x0 for 4096 bytes at 0xc000000 [ 261.478389105,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134110389074 [ 261.478399814,8] blocklevel_raw_read: 0x0 0xc0001ea90000 0x1000 [ 261.478401779,8] HIOMAP: Flash read at 0 for 1000 [ 261.478403045,8] HIOMAP: Reading at 0x00000000 for 0x00001000 offset: 0x0c000000 [ 261.479108279,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134111108274 [ 261.479547846,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134111547830 [ 261.479597285,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134111597281 [ 261.479654755,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134111654723 [ 261.479709925,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =134111709911 [ 264.006158740,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =199999 [ 264.006225217,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =135174225204 [ 264.006271825,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =135276671299 [ 264.006317889,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =135174317876 [ 264.115567395,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 264.115636823,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =135386036816 [ 264.224976035,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 264.225078504,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =135495478497 [ 264.334416900,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 264.334494596,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =135604894581
[J[1;64H9105-42A 783C4C1 [24d [ 264.443833106,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 264.443910630,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =135714310616 [ 265.041249861,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 265.041327574,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =135823727560 [ 265.150665922,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 265.150743415,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =135933143409 [ 265.260082020,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 265.260150868,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =136042550853 [ 265.369489508,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 265.369566898,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =136151966884 [ 265.478905763,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 265.478983479,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =136261383473 [ 266.047114499,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =136239114304 [ 266.047196944,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =136239196930 [ 266.076322851,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 266.076400694,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =136370800679 [ 266.185739267,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 266.185808360,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =136480208353 [ 266.295146725,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 266.295224209,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =136589624195 [ 266.404562838,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 266.404640369,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =136699040355 [ 267.001979378,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 267.002064726,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =136808464720 [ 267.111403045,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 267.111489317,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =136917889302 [ 267.220827635,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 267.220905123,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =137027305108 [ 267.330243428,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 267.330313056,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =137136713042 [ 267.439651799,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 267.439729568,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =137246129554 [ 268.037068423,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 268.037146147,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =137355546132 [ 268.088073126,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =137304073107 [ 268.088139714,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =137304139700 [ 268.146485955,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 268.146564034,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =137464964019 [ 268.255902945,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 //........(Duplicate logs with the words "SBE: djs" were removed) [ 392.497672149,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =201201672129 [ 392.497745106,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =201201745092 [ 393.066274163,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 393.066351800,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =201384751793 [ 393.175690913,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 393.175759300,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =201494159286 [ 393.285098213,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 393.285175713,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =201603575698 [ 393.394514054,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 393.394591475,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =201712991460 [ 393.503930370,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 393.503999079,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =201822399063 [ 394.101337830,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 394.101424003,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =201931823988 [ 394.210762455,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 394.210840100,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =202041240085 [ 394.320178659,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 394.320247254,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =202150647239 [ 394.429586001,6] SBE: dsj Func:p9_sbe_timer_schedule, tick_us =200000 [ 394.429655348,6] SBE: dsj Func:p9_sbe_timer_resp,time sbe_last_gen_stamp =202260055333 [ 395.026635783,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =202266635766 [ 395.026708500,6] SBE: dsj Func:p9_sbe_timeout_poll_one, mftb =202266708484 [ 395.026780071,6] SBE: dsj Func:p9_sbe_timer_poll,chip_id and default_chip_id:0,0 [ 395.026840131,6] SBE: dsj Func:p9_sbe_timer_poll,sbe_last_gen_stamp=202260055333 [ 395.026894246,6] SBE: dsj Func:p9_sbe_timer_poll,mftb()=202266894229 [ 395.026940883,3] SBE: Timer stuck, falling back to OPAL pollers. [ 395.027001671,3] SBE: You will likely have slower I2C and may have experienced increased jitter. [ 395.027090241,7] SBE: MBOX register dump for chip : 0 [ 395.027132096,7] SBE: PSU_SBE_DOORBELL_REG_RW: 0000000000000000 [ 395.027194132,7] SBE: PSU_HOST_SBE_MBOX_REG0: 000002010298d401 [ 395.027226981,7] SBE: PSU_HOST_SBE_MBOX_REG1: 0000000000030d40 [ 395.027260392,7] SBE: PSU_HOST_SBE_MBOX_REG2: 0000000000000000 [ 395.027293728,7] SBE: PSU_HOST_SBE_MBOX_REG3: 0000000000000000 [ 395.027340497,7] SBE: PSU_HOST_DOORBELL_REG_RW: 0002000000000000 [ 395.027403301,7] SBE: PSU_HOST_SBE_MBOX_REG4: 000000000298d401 [ 395.027450021,7] SBE: PSU_HOST_SBE_MBOX_REG5: 000000000000005c [ 395.027510802,7] SBE: PSU_HOST_SBE_MBOX_REG6: 00050002c7423213 [ 395.027558420,7] SBE: PSU_HOST_SBE_MBOX_REG7: 0000000000000000 `
The word "SBE: dsj" in the log is the printed information added by custom. The code is added at the position: `static void p9_sbe_timer_resp(struct p9_sbe_msg msg) { if (msg->state != sbe_msg_done) { prlog(PR_DEBUG, "Failed to schedule timer [chip id %x]\n", sbe_default_chip_id); } else { / Update last scheduled timer value */ sbe_last_gen_stamp = mftb() + usecs_to_tb(timer_ctrl_msg->reg[1]); sbe_timer_in_progress = true; prlog(PR_INFO,"dsj time sbe_last_gen_stamp=llu\n", sbe_last_gen_stamp ); }
}
static void p9_sbe_timer_poll(struct p9_sbe *sbe) { if (sbe->chip_id != sbe_default_chip_id) return;
}`