sijms / go-ora

Pure go oracle client
MIT License
766 stars 167 forks source link

ORA-01013 error reproduces again #567

Open freeNestor opened 2 weeks ago

freeNestor commented 2 weeks ago

I upgraded go-ora recently due to issue #548. My application reproduces this error again after upgrading in production. seems issue is not solved completely. go-ora version: 2.8.17 or 2.8.19 oracle version: 19c

pls try blew sample code to reproduce and run longer, my test results:

# ./test > testnew.log
# grep fail testnew.log
2024-06-20 14:05:31.335920972 +0800 CST ping db failed[count: 506] execdur: 3.299548ms, error: query db current time error: ORA-01013: user requested cancel of current operation
2024-06-20 14:10:10.38685891 +0800 CST ping db failed[count: 784] execdur: 54.770632ms, error: query db current time error: ORA-01013: user requested cancel of current operation
2024-06-20 14:18:13.024509272 +0800 CST ping db failed[count: 1265] execdur: 1.554644ms, error: query db current time error: ORA-01013: user requested cancel of current operation
2024-06-20 14:19:57.386923779 +0800 CST ping db failed[count: 1369] execdur: 2.056954ms, error: query db current time error: ORA-01013: user requested cancel of current operation
2024-06-20 14:20:29.485044594 +0800 CST ping db failed[count: 1401] execdur: 1.297867ms, error: query db current time error: ORA-01013: user requested cancel of current operation
2024-06-20 14:25:08.456694193 +0800 CST ping db failed[count: 1679] execdur: 1.881563ms, error: query db current time error: ORA-01013: user requested cancel of current operation
2024-06-20 14:36:14.982894251 +0800 CST ping db failed[count: 2343] execdur: 2.835757ms, error: query db current time error: ORA-01013: user requested cancel of current operation
2024-06-20 14:45:10.060514971 +0800 CST ping db failed[count: 2876] execdur: 2.43114ms, error: query db current time error: ORA-01013: user requested cancel of current operation
2024-06-20 14:50:05.181439867 +0800 CST ping db failed[count: 3170] execdur: 2.501114ms, error: query db current time error: ORA-01013: user requested cancel of current operation
2024-06-20 14:55:10.293671734 +0800 CST ping db failed[count: 3474] execdur: 2.66867ms, error: query db current time error: ORA-01013: user requested cancel of current operation
2024-06-20 14:59:34.279711608 +0800 CST ping db failed[count: 3737] execdur: 2.051858ms, error: query db current time error: ORA-01013: user requested cancel of current operation

# cat testnew.log |wc -l
144462
# grep fail testnew.log |wc -l
11

reproduce code:

package main

import (
    "context"
    "fmt"
    "sync"
    "time"

    "github.com/jmoiron/sqlx"
    go_ora "github.com/sijms/go-ora/v2"
)

func main() {

    options := map[string]string{
        "TIMEOUT":    "60",
        "TRACE FILE": "trace.log",
    }
    dsn := go_ora.BuildUrl("192.168.x.x", 1521, "db", "user", "pass", options)

    conn, err := sqlx.Open("oracle", dsn)
    if err != nil {
        fmt.Printf("create pool fail: %v\n", err)
        return
    }
    conn.SetMaxIdleConns(5)
    conn.SetMaxOpenConns(5)
    defer func() {
        err = conn.Close()
        if err != nil {
            fmt.Println("can't close db:", err)
        }
    }()

    selecttest := "SELECT TO_CHAR(CURRENT_TIMESTAMP(0),'yyyy-MM-dd HH24:MI:SS') FROM dual"
    var wg sync.WaitGroup
    wg.Add(10)
    for i := 0; i < 37; i++ {

        go func(wgc *sync.WaitGroup) {
            defer wgc.Done()
            count := 0
            for {
                p1 := time.Now().Local()
                dbt, err := GetDBCurrTimestamp(conn, selecttest, 5)
                p2 := time.Now().Local()
                count++

                if err != nil {
                    fmt.Printf("%v ping db failed[count: %d] execdur: %v, error: %v\n", time.Now().Local(), count, p2.Sub(p1), err)
                } else {
                    fmt.Printf("%v ping db succeeded[count: %d, val: %s] execdur: %v\n", time.Now().Local(), count, dbt, p2.Sub(p1))
                }
                time.Sleep(time.Second)
            }
        }(&wg)
    }
    wg.Wait()
}
func GetDBCurrTimestamp(dbconn *sqlx.DB, sqlstr string, timeout int) (string, error) {
    ctxTimeout, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout))
    defer cancel()
    getconn, err := dbconn.Connx(ctxTimeout)
    if err != nil {
        return "", fmt.Errorf("query db current time, getconn error: %v - sql: %s", err, sqlstr)
    }
    defer getconn.Close()

    res, err := getconn.QueryxContext(ctxTimeout, sqlstr)
    if err != nil {
        return "", fmt.Errorf("query db current time error: %v - sql: %s", err, sqlstr)
    }
    defer res.Close()

    var currt any
    if res.Next() {
        res.Scan(&currt)
    }
    switch val := currt.(type) {
    case string:
        return val, nil
    case []uint8:
        return string(val), nil
    default:
        return "", fmt.Errorf("dbtime cannot be converted to string: %v", currt)
    }
}
drkisler commented 2 weeks ago

I've also encountered this situation. Executing the SQL statement below occasionally results in errors. This kind of intermittent bug is very difficult to debug.


with cet_keys as(
select b.COLUMN_NAME,a.OWNER from ALL_CONSTRAINTS a inner join ALL_CONS_COLUMNS b on a.CONSTRAINT_NAME=b.CONSTRAINT_NAME and a.CONSTRAINT_TYPE='P' 
where a.OWNER = :p1 and a.TABLE_NAME= :p2
) 
select a.COLUMN_NAME column_code,NVL(b.COMMENTS,a.COLUMN_NAME) column_name,case when exists(select * from cet_keys where COLUMN_NAME=a.COLUMN_NAME) then 'yes' else 'no' end is_key,
case when DATA_TYPE='NUMBER' and data_scale=0 then 'int'  when DATA_TYPE='NUMBER' and data_scale>0 then 'float'
when DATA_TYPE in('FLOAT','BINARY_FLOAT','BINARY_DOUBLE') then 'float'  when DATA_TYPE like '%TIMESTAMP%' then 'timestamp' 
when DATA_TYPE = 'DATE' then 'timestamp'  when DATA_TYPE like '%CHAR%' then 'varchar'  else 'varchar' end DATA_TYPE  
from ALL_TAB_COLUMNS a inner join ALL_COL_COMMENTS b on a.OWNER=b.OWNER and a.TABLE_NAME=b.TABLE_NAME and a.COLUMN_NAME=b.COLUMN_NAME 
where a.OWNER=:p3 and a.TABLE_NAME= :p4 order by a.COLUMN_ID

After changing: conn, err := sqlx.Open("oracle", dsn) to: conn, err := go_ora.NewConnection(dsn, nil) and adjusting the subsequent code accordingly, the problem no longer occurs. You can give it a try.

freeNestor commented 2 weeks ago

After changing: conn, err := sqlx.Open("oracle", dsn) to: conn, err := go_ora.NewConnection(dsn, nil) and adjusting the subsequent code accordingly, the problem no longer occurs. You can give it a try.

Thanks for your suggestion. I evaluate it's hard for me to make such massive adjustment.😮‍💨

freeNestor commented 1 week ago

After changing: conn, err := sqlx.Open("oracle", dsn) to: conn, err := go_ora.NewConnection(dsn, nil) and adjusting the subsequent code accordingly, the problem no longer occurs. You can give it a try.

I test this changing not working. But I guess I find the cause and solution. I add some print in source code and add goroutine number with context, found logs below:

2024-06-28 19:40:38.658552663 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.604912ms
[2024-06-28 19:40:38.659069996 +0800 CST]context thseq: 29 error: context canceled, context: context.Background.WithValue(type string, val <not Stringer>).WithDeadline(2024-06-28 19:40:43.653923729 +0800 CST m
=+1452.700190387 [4.993935358s])
2024-06-28 19:40:38.660444875 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.773018ms
2024-06-28 19:40:38.660743282 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.934609ms
2024-06-28 19:40:38.660756345 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.665962ms
2024-06-28 19:40:38.660882689 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.775275ms
2024-06-28 19:40:38.661542983 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 2.950724ms
2024-06-28 19:40:38.662297683 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 3.57291ms
2024-06-28 19:40:38.66366604 +0800 CST thnum[19]ping db failed[count: 1443] execdur: 3.914246ms, error: [context context.Background.WithValue(type string, val <not Stringer>).WithDeadline(2024-06-28 19:40:43.659889248 +0800 CST m=+1452.706155913 [4.996307919s]) thnum 19]query db current time error: ORA-01013: user requested cancel of current operation
 - sql: SELECT TO_CHAR(CURRENT_TIMESTAMP(0),'yyyy-MM-dd HH24:MI:SS') FROM dual
2024-06-28 19:40:38.663816568 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 3.859629ms
2024-06-28 19:40:38.664133996 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 4.304747ms
2024-06-28 19:40:38.664443857 +0800 CST ping db succeeded[count: 1443, val: 2024-06-28 19:40:39] execdur: 5.592071ms

as you can see in logs, thnum and thseq is goroutine sequence, and there's a context cancel event printed when ora-01013 occurred, that the two sequence is not consist with each other. print in source code:

func (session *Session) StartContext(ctx context.Context)  {
    //session.oldCtx = session.ctx
    //session.ctx = ctx
    done := make(chan struct{})
    session.doneContext = append(session.doneContext, done)
    go func(idone chan time.Time, mu *sync.Mutex) {
        var err error
        mu.Lock()
        var tracer = session.tracer
        mu.Unlock()
        select {
        case <-idone:
            return
        case <-ctx.Done():
            fmt.Printf("[%v]context thseq: %v error: %v, context: %v\n", time.Now().Local(), ctx.Value("thnum"), ctx.Err(), ctx)
...

So I guess this is a concurrency issue with session.doneContext which maybe under data race condition. I changed source code like below, and test for longer, the error is gone. I need to keep watching, maybe someone can confirm this fixed.

// using a return chan value to guarantee signal consistently
func (session *Session) StartContext(ctx context.Context) chan time.Time {
    //session.oldCtx = session.ctx
    //session.ctx = ctx
    done := make(chan time.Time)
    // session.doneContext = append(session.doneContext, done)
    go func(idone chan time.Time, mu *sync.Mutex) {
        var err error
        mu.Lock()
        var tracer = session.tracer
        mu.Unlock()
        // defer close(idone)
        select {
        case <-idone:
            return
        case <-ctx.Done():
                        // this is a debug print
            fmt.Printf("[%v]context thseq: %v error: %v, context: %v\n", time.Now().Local(), ctx.Value("thnum"), ctx.Err(), ctx)
            ct := time.Now().Local()
            // due to forking or goroutine executing is slowly in high load system, we will receive ctx done and idone signal at the same time
            // golang select ctx.Done randomly and we need to check if main goroutine is finished or not.
            // if main goroutine finished with context cancel(not deadline exceeded), skip BreakConnection.
            select {
            case dt := <-idone:
                if dt.Before(ct) {  // or we may not need to check this time because it always meets condition.
                    fmt.Println("main goroutine finished without timeout, skip breaking")
                    return
                }
            default:
            }
            session.mu.Lock()
            connected := session.Connected
            session.mu.Unlock()
            if connected {
                if err = session.BreakConnection(); err != nil {
                    tracer.Print("Connection Break Error: ", err)
                }
            } else {
                err = session.WriteFinalPacket()
                if err != nil {
                    tracer.Print("Write Final Packet With Error: ", err)
                }
                session.Disconnect()
            }
        }
    }(done, &session.mu)
    return done
}
func (session *Session) EndContext(done chan time.Time) {
    done <- time.Now().Local()
        close(done)
    // assuming session.doneContext is under data race condition, below index cannot be guaranteed consist with StartContext

    // index := len(session.doneContext) - 1
    // if index >= 0 {
    //  done := session.doneContext[index]
    //  if done != nil {
    //      close(done)
    //      done = nil
    //  }
    // }
    // if index == 0 {
    //  session.doneContext = nil
    // } else {
    //  session.doneContext = session.doneContext[:index]
    // }
    //if session.doneContext != nil {
    //  close(session.doneContext)
    //  session.doneContext = nil
    //}
    //session.ctx = session.oldCtx
}
...
func (stmt *Stmt) QueryContext(ctx context.Context, namedArgs []driver.NamedValue) (driver.Rows, error) {
    if stmt.connection.State != Opened {
        stmt.connection.setBad()
        return nil, driver.ErrBadConn
    }
    tracer := stmt.connection.tracer
    tracer.Print("Query With Context:", stmt.text)

    di := stmt.connection.session.StartContext(ctx)
    defer stmt.connection.session.EndContext(di)
    return stmt.Query_(namedArgs)
}

Hope this help. @sijms @drkisler