Open fatihusta opened 1 year ago
We are also having trouble with the embeded BoltDB with the same error. But not only limited to parallel execution, it happens unpredictable with cron jobs and manually triggered jobs.
Hi @fatihusta , @nliechti thank for the reporting. I will try to reproduce.
@fiftin, I can reproduce this in my environment. When I schedule tasks to start at the same time, I get the following error in the log and the second task does not start:
panic: Connection schedule already exists
time="2023-08-16T08:45:36+02:00" level=info msg="Release resource locker with TaskRunner 2147483062"
goroutine 54 [running]:
github.com/ansible-semaphore/semaphore/db/bolt.(*BoltDb).Connect(0xc000103920, {0xb9cbfe, 0x8})
#011/home/runner/work/semaphore/semaphore/db/bolt/BoltDb.go:85 +0x3c5
github.com/ansible-semaphore/semaphore/services/schedules.ScheduleRunner.Run({0x0?, 0x0?, 0xc000031a70?})
#011/home/runner/work/semaphore/semaphore/services/schedules/pool.go:56 +0xaa
github.com/robfig/cron/v3.(*Cron).startJob.func1()
#011/home/runner/work/semaphore/semaphore/vendor/github.com/robfig/cron/v3/cron.go:312 +0x6a
created by github.com/robfig/cron/v3.(*Cron).startJob
#011/home/runner/work/semaphore/semaphore/vendor/github.com/robfig/cron/v3/cron.go:310 +0xad
This is still an issue for me
Nov 05 00:00:00 ansible semaphore[38994]: panic: Connection schedule already exists
Nov 05 00:00:00 ansible semaphore[38994]: goroutine 395 [running]:
Nov 05 00:00:00 ansible semaphore[38994]: github.com/ansible-semaphore/semaphore/db/bolt.(*BoltDb).Connect(0xc0000348d0, {0xbfba9c, 0x8})
Nov 05 00:00:00 ansible semaphore[38994]: /home/runner/work/semaphore/semaphore/db/bolt/BoltDb.go:86 +0x3f3
Nov 05 00:00:00 ansible semaphore[38994]: github.com/ansible-semaphore/semaphore/services/schedules.ScheduleRunner.Run({0xc0000f6580?, 0xc0000f6570?, 0xc0000>
Nov 05 00:00:00 ansible semaphore[38994]: /home/runner/work/semaphore/semaphore/services/schedules/SchedulePool.go:56 +0xaa
Nov 05 00:00:00 ansible semaphore[38994]: github.com/robfig/cron/v3.(*Cron).startJob.func1()
Nov 05 00:00:00 ansible semaphore[38994]: /home/runner/work/semaphore/semaphore/vendor/github.com/robfig/cron/v3/cron.go:312 +0x6a
Nov 05 00:00:00 ansible semaphore[38994]: created by github.com/robfig/cron/v3.(*Cron).startJob
Nov 05 00:00:00 ansible semaphore[38994]: /home/runner/work/semaphore/semaphore/vendor/github.com/robfig/cron/v3/cron.go:310 +0xad
v2.9.37
with BoltDB
@fiftin
I managed to find the issue, if 2 or more jobs are scheduled at the same time the cron scheduler executes ScheduleRunner 2 times one after the other and if Bolt database beeing used, second db "schedule" connection fails beeing created because only 1 connection allowed at the same time. I managed to sove the issue by locking a variable at start of the schedule at function Run that is called from cron and unlock it at the end:
package schedules
import (
"sync"
log "github.com/Sirupsen/logrus"
"github.com/ansible-semaphore/semaphore/db"
"github.com/ansible-semaphore/semaphore/lib"
"github.com/ansible-semaphore/semaphore/services/tasks"
"github.com/robfig/cron/v3"
)
type ScheduleRunner struct {
projectID int
scheduleID int
pool *SchedulePool
}
var singleConnectionStoreLocker sync.Locker
func (r ScheduleRunner) tryUpdateScheduleCommitHash(schedule db.Schedule) (updated bool, err error) {
repo, err := r.pool.store.GetRepository(schedule.ProjectID, *schedule.RepositoryID)
if err != nil {
return
}
err = repo.SSHKey.DeserializeSecret()
if err != nil {
return
}
remoteHash, err := lib.GitRepository{
Logger: nil,
TemplateID: schedule.TemplateID,
Repository: repo,
Client: lib.CreateDefaultGitClient(),
}.GetLastRemoteCommitHash()
if err != nil {
return
}
if schedule.LastCommitHash != nil && remoteHash == *schedule.LastCommitHash {
return
}
err = r.pool.store.SetScheduleCommitHash(schedule.ProjectID, schedule.ID, remoteHash)
if err != nil {
return
}
updated = true
return
}
func (r ScheduleRunner) Run() {
if !r.pool.store.PermanentConnection() {
singleConnectionStoreLocker.Lock()
log.Print("singleConnectStoreLocker locked")
r.pool.store.Connect("schedule")
log.Print("SCHEDULERUNNER: Store connected: schedule")
defer log.Print("singleConnectStoreLocker unlocked")
defer singleConnectionStoreLocker.Unlock()
defer log.Print("SCHEDULERUNNER: Store closed: schedule")
defer r.pool.store.Close("schedule")
}
schedule, err := r.pool.store.GetSchedule(r.projectID, r.scheduleID)
if err != nil {
log.Error(err)
return
}
if schedule.RepositoryID != nil {
var updated bool
updated, err = r.tryUpdateScheduleCommitHash(schedule)
if err != nil {
log.Error(err)
return
}
if !updated {
return
}
}
_, err = r.pool.taskPool.AddTask(db.Task{
TemplateID: schedule.TemplateID,
ProjectID: schedule.ProjectID,
}, nil, schedule.ProjectID)
if err != nil {
log.Error(err)
}
}
type SchedulePool struct {
cron *cron.Cron
locker sync.Locker
store db.Store
taskPool *tasks.TaskPool
}
func (p *SchedulePool) init() {
p.cron = cron.New()
p.locker = &sync.Mutex{}
}
func (p *SchedulePool) Refresh() {
defer p.locker.Unlock()
schedules, err := p.store.GetSchedules()
if err != nil {
log.Error(err)
return
}
p.locker.Lock()
p.clear()
for _, schedule := range schedules {
_, err := p.addRunner(ScheduleRunner{
projectID: schedule.ProjectID,
scheduleID: schedule.ID,
pool: p,
}, schedule.CronFormat)
if err != nil {
log.Error(err)
}
}
}
func (p *SchedulePool) addRunner(runner ScheduleRunner, cronFormat string) (int, error) {
id, err := p.cron.AddJob(cronFormat, runner)
if err != nil {
return 0, err
}
return int(id), nil
}
func (p *SchedulePool) Run() {
p.cron.Run()
}
func (p *SchedulePool) clear() {
runners := p.cron.Entries()
for _, r := range runners {
p.cron.Remove(r.ID)
}
}
func (p *SchedulePool) Destroy() {
defer p.locker.Unlock()
p.locker.Lock()
p.cron.Stop()
p.clear()
p.cron = nil
}
func CreateSchedulePool(store db.Store, taskPool *tasks.TaskPool) SchedulePool {
pool := SchedulePool{
store: store,
taskPool: taskPool,
}
singleConnectionStoreLocker = &sync.Mutex{}
pool.init()
pool.Refresh()
return pool
}
func ValidateCronFormat(cronFormat string) error {
_, err := cron.ParseStandard(cronFormat)
return err
}
@fiftin there are propably better solutions for this but it's a start and as far i can tell not disturbing things.
This bug is fixed since Feb 10. See the referenced commit above. And I just verified it on my own instance. After I upgraded to v2.9.75 I'm not able to reproduce the bug any more!
I still have this issue.
semaphore version
v2.9.64
May 21 12:00:00 semaphore semaphore[3287470]: panic: Connection schedule 6 already exists
May 21 12:00:00 semaphore semaphore[3287470]: goroutine 60699 [running]:
May 21 12:00:00 semaphore semaphore[3287470]: github.com/ansible-semaphore/semaphore/db/bolt.(*BoltDb).Connect(0xc0001c1b30, {0xc009e8a0d0, 0xa})
May 21 12:00:00 semaphore semaphore[3287470]: /home/runner/work/semaphore/semaphore/db/bolt/BoltDb.go:86 +0x3b4
May 21 12:00:00 semaphore semaphore[3287470]: github.com/ansible-semaphore/semaphore/services/schedules.ScheduleRunner.Run({0x1e67a40?, 0xc0048e0dc0?, 0xc0064b6f90?})
May 21 12:00:00 semaphore semaphore[3287470]: /home/runner/work/semaphore/semaphore/services/schedules/SchedulePool.go:57 +0xc3
May 21 12:00:00 semaphore semaphore[3287470]: github.com/robfig/cron/v3.(*Cron).startJob.func1()
May 21 12:00:00 semaphore semaphore[3287470]: /home/runner/work/semaphore/semaphore/vendor/github.com/robfig/cron/v3/cron.go:312 +0x5b
May 21 12:00:00 semaphore semaphore[3287470]: created by github.com/robfig/cron/v3.(*Cron).startJob in goroutine 7
May 21 12:00:00 semaphore semaphore[3287470]: /home/runner/work/semaphore/semaphore/vendor/github.com/robfig/cron/v3/cron.go:310 +0xa5
Task1:
0 0,8,16 * * *
Task2:0 0,8,16 * * *