Open freeNestor opened 2 weeks ago
I've also encountered this situation. Executing the SQL statement below occasionally results in errors. This kind of intermittent bug is very difficult to debug.
with cet_keys as(
select b.COLUMN_NAME,a.OWNER from ALL_CONSTRAINTS a inner join ALL_CONS_COLUMNS b on a.CONSTRAINT_NAME=b.CONSTRAINT_NAME and a.CONSTRAINT_TYPE='P'
where a.OWNER = :p1 and a.TABLE_NAME= :p2
)
select a.COLUMN_NAME column_code,NVL(b.COMMENTS,a.COLUMN_NAME) column_name,case when exists(select * from cet_keys where COLUMN_NAME=a.COLUMN_NAME) then 'yes' else 'no' end is_key,
case when DATA_TYPE='NUMBER' and data_scale=0 then 'int' when DATA_TYPE='NUMBER' and data_scale>0 then 'float'
when DATA_TYPE in('FLOAT','BINARY_FLOAT','BINARY_DOUBLE') then 'float' when DATA_TYPE like '%TIMESTAMP%' then 'timestamp'
when DATA_TYPE = 'DATE' then 'timestamp' when DATA_TYPE like '%CHAR%' then 'varchar' else 'varchar' end DATA_TYPE
from ALL_TAB_COLUMNS a inner join ALL_COL_COMMENTS b on a.OWNER=b.OWNER and a.TABLE_NAME=b.TABLE_NAME and a.COLUMN_NAME=b.COLUMN_NAME
where a.OWNER=:p3 and a.TABLE_NAME= :p4 order by a.COLUMN_ID
After changing: conn, err := sqlx.Open("oracle", dsn) to: conn, err := go_ora.NewConnection(dsn, nil) and adjusting the subsequent code accordingly, the problem no longer occurs. You can give it a try.
After changing: conn, err := sqlx.Open("oracle", dsn) to: conn, err := go_ora.NewConnection(dsn, nil) and adjusting the subsequent code accordingly, the problem no longer occurs. You can give it a try.
Thanks for your suggestion. I evaluate it's hard for me to make such massive adjustment.😮💨
After changing: conn, err := sqlx.Open("oracle", dsn) to: conn, err := go_ora.NewConnection(dsn, nil) and adjusting the subsequent code accordingly, the problem no longer occurs. You can give it a try.
I test this changing not working. But I guess I find the cause and solution. I add some print in source code and add goroutine number with context, found logs below:
2024-06-28 19:40:38.658552663 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.604912ms
[2024-06-28 19:40:38.659069996 +0800 CST]context thseq: 29 error: context canceled, context: context.Background.WithValue(type string, val <not Stringer>).WithDeadline(2024-06-28 19:40:43.653923729 +0800 CST m
=+1452.700190387 [4.993935358s])
2024-06-28 19:40:38.660444875 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.773018ms
2024-06-28 19:40:38.660743282 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.934609ms
2024-06-28 19:40:38.660756345 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.665962ms
2024-06-28 19:40:38.660882689 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.775275ms
2024-06-28 19:40:38.661542983 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.950724ms
2024-06-28 19:40:38.662297683 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 3.57291ms
2024-06-28 19:40:38.66366604 +0800 CST thnum[19]ping db failed[count: 1443] execdur: 3.914246ms, error: [context context.Background.WithValue(type string, val <not Stringer>).WithDeadline(2024-06-28 19:40:43.659889248 +0800 CST m=+1452.706155913 [4.996307919s]) thnum 19]query db current time error: ORA-01013: user requested cancel of current operation
- sql: SELECT TO_CHAR(CURRENT_TIMESTAMP(0),'yyyy-MM-dd HH24:MI:SS') FROM dual
2024-06-28 19:40:38.663816568 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 3.859629ms
2024-06-28 19:40:38.664133996 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 4.304747ms
2024-06-28 19:40:38.664443857 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 5.592071ms
as you can see in logs, thnum and thseq is goroutine sequence, and there's a context cancel event printed when ora-01013 occurred, that the two sequence is not consist with each other. print in source code:
func (session *Session) StartContext(ctx context.Context) {
//session.oldCtx = session.ctx
//session.ctx = ctx
done := make(chan struct{})
session.doneContext = append(session.doneContext, done)
go func(idone chan time.Time, mu *sync.Mutex) {
var err error
mu.Lock()
var tracer = session.tracer
mu.Unlock()
select {
case <-idone:
return
case <-ctx.Done():
fmt.Printf("[%v]context thseq: %v error: %v, context: %v\n", time.Now().Local(), ctx.Value("thnum"), ctx.Err(), ctx)
...
So I guess this is a concurrency issue with session.doneContext which maybe under data race condition. I changed source code like below, and test for longer, the error is gone. I need to keep watching, maybe someone can confirm this fixed.
// using a return chan value to guarantee signal consistently
func (session *Session) StartContext(ctx context.Context) chan time.Time {
//session.oldCtx = session.ctx
//session.ctx = ctx
done := make(chan time.Time)
// session.doneContext = append(session.doneContext, done)
go func(idone chan time.Time, mu *sync.Mutex) {
var err error
mu.Lock()
var tracer = session.tracer
mu.Unlock()
// defer close(idone)
select {
case <-idone:
return
case <-ctx.Done():
// this is a debug print
fmt.Printf("[%v]context thseq: %v error: %v, context: %v\n", time.Now().Local(), ctx.Value("thnum"), ctx.Err(), ctx)
ct := time.Now().Local()
// due to forking or goroutine executing is slowly in high load system, we will receive ctx done and idone signal at the same time
// golang select ctx.Done randomly and we need to check if main goroutine is finished or not.
// if main goroutine finished with context cancel(not deadline exceeded), skip BreakConnection.
select {
case dt := <-idone:
if dt.Before(ct) { // or we may not need to check this time because it always meets condition.
fmt.Println("main goroutine finished without timeout, skip breaking")
return
}
default:
}
session.mu.Lock()
connected := session.Connected
session.mu.Unlock()
if connected {
if err = session.BreakConnection(); err != nil {
tracer.Print("Connection Break Error: ", err)
}
} else {
err = session.WriteFinalPacket()
if err != nil {
tracer.Print("Write Final Packet With Error: ", err)
}
session.Disconnect()
}
}
}(done, &session.mu)
return done
}
func (session *Session) EndContext(done chan time.Time) {
done <- time.Now().Local()
close(done)
// assuming session.doneContext is under data race condition, below index cannot be guaranteed consist with StartContext
// index := len(session.doneContext) - 1
// if index >= 0 {
// done := session.doneContext[index]
// if done != nil {
// close(done)
// done = nil
// }
// }
// if index == 0 {
// session.doneContext = nil
// } else {
// session.doneContext = session.doneContext[:index]
// }
//if session.doneContext != nil {
// close(session.doneContext)
// session.doneContext = nil
//}
//session.ctx = session.oldCtx
}
...
func (stmt *Stmt) QueryContext(ctx context.Context, namedArgs []driver.NamedValue) (driver.Rows, error) {
if stmt.connection.State != Opened {
stmt.connection.setBad()
return nil, driver.ErrBadConn
}
tracer := stmt.connection.tracer
tracer.Print("Query With Context:", stmt.text)
di := stmt.connection.session.StartContext(ctx)
defer stmt.connection.session.EndContext(di)
return stmt.Query_(namedArgs)
}
Hope this help. @sijms @drkisler
I upgraded go-ora recently due to issue #548. My application reproduces this error again after upgrading in production. seems issue is not solved completely. go-ora version: 2.8.17 or 2.8.19 oracle version: 19c
pls try blew sample code to reproduce and run longer, my test results:
reproduce code: