nim-lang / Nim

Nim is a statically typed compiled systems programming language. It combines successful concepts from mature languages like Python, Ada and Modula. Its design focuses on efficiency, expressiveness, and elegance (in that order of priority).
https://nim-lang.org
Other
16.22k stars 1.46k forks source link

Possible heap corruption when allocating after running multiple threads #23206

Open dsemi opened 5 months ago

dsemi commented 5 months ago

Description

I apologize in advance for having a lot of code here, I've been trying to narrow this down to a simpler repro case, but I keep running into situations where removing seemingly unrelated parts of the code stops the bug from triggering.

I'm seeing inconsistent segfaults in system/alloc.nim rawAlloc after running some threaded code. All of the created threads should have been joined by the time the segfault triggers, it's like the heap is getting corrupted.

I've been able to reproduce by running the binary 100 times in a row to have higher confidence in whether a change I made affected it:

nim c main.nim
for i in {1..100}; do
./main || break
done

Repro code: main.nim:

import md5
import std/atomics
import std/cpuinfo
import std/monotimes
import strformat
import strutils
import sugar
import terminal
import times

const batchSize = 8000

type Shared = ref object
  prefix: string
  done: Atomic[bool]
  counter: Atomic[uint32]
  sol: Atomic[uint32]

proc worker(arg: (Shared, int)) {.thread.} =
  let (s, v) = arg
  while not s.done.load(moRelaxed):
    let offset = s.counter.fetchAdd(batchSize, moRelaxed)
    for n in 0'u32 ..< batchSize:
      let i = offset + n
      let h = (s.prefix & $i).toMD5
      if h[0] == 0 and h[1] == 0:
        var val = s.sol.load(moRelaxed)
        while i < val and not s.sol.compareExchangeWeak(val, i, moRelaxed):
          discard
        s.done.store(true, moRelaxed)

proc solve(input: string): uint32 =
  let s = Shared(prefix: input)
  s.sol.store(uint32.high)
  var thr: array[20, Thread[(Shared, int)]]
  doAssert countProcessors() < thr.len
  for i in 0 ..< countProcessors():
    # Needs to have tuple args to fail
    createThread(thr[i], worker, (s, 15))
  joinThreads(thr)
  s.sol.load

proc part1(input: string): string =
  $input.solve

proc colorizeTime(t: float): string =
  # Crash happens here
  let s = fmt"{t:.3f}"
  if t < 0.5:
    ansiForegroundColorCode(fgGreen) & s & ansiResetCode
  elif t < 1.0:
    ansiForegroundColorCode(fgYellow) & s & ansiResetCode
  else:
    ansiForegroundColorCode(fgRed) & s & ansiResetCode

proc timeit(f: (string) -> string, inp: string): (string, float) =
  let startT = getMonoTime()
  var ans = f(inp)
  let endT = getMonoTime()
  let t = endT - startT
  return (ans, float(t.inMicroseconds) / 1_000_000)

proc run() =
  # Needs the strip call to fail
  let contents = readFile("input.txt").strip
  let (ans, t) = timeit(part1, contents)
  stdout.write fmt"Part 1: "
  echo "$1  Elapsed time $2 seconds".format(align(ans, 54), colorizeTime(t))

run()

input.txt:

ckczppom

I tried inlining the contents of input.txt into the source, but that also stopped the crashes.

Nim Version

Nim Compiler Version 2.0.2 [MacOSX: arm64] Compiled at 2023-12-15 Copyright (c) 2006-2023 by Andreas Rumpf

active boot switches: -d:release -d:nimUseLinenoise

Current Output

Traceback (most recent call last)
/Users/dan/Code/nim_threading_test/main.nim(69) main
/Users/dan/Code/nim_threading_test/main.nim(67) run
/Users/dan/Code/nim_threading_test/main.nim(49) colorizeTime
/opt/homebrew/Cellar/nim/2.0.2/nim/lib/system/alloc.nim(1052) alloc
/opt/homebrew/Cellar/nim/2.0.2/nim/lib/system/alloc.nim(868) rawAlloc
SIGSEGV: Illegal storage access. (Attempt to read from nil?)

Expected Output

Part 1:                                                  34311  Elapsed time 0.NNN seconds

Possible Solution

No response

Additional Information

I can also reproduce compiling with -d:useMalloc, which results in the following trace instead:

main(92449,0x1d6c2d000) malloc: Heap corruption detected, free list is damaged at 0x600002ef8360
*** Incorrect guard value: 221136115696464
main(92449,0x1d6c2d000) malloc: *** set a breakpoint in malloc_error_break to debug
Part 1: Traceback (most recent call last)
/Users/dan/Code/nim_threading_test/main.nim(69) main
/Users/dan/Code/nim_threading_test/main.nim(67) run
/Users/dan/Code/nim_threading_test/main.nim(47) colorizeTime
/opt/homebrew/Cellar/nim/2.0.2/nim/lib/pure/strformat.nim(519) formatValue
/opt/homebrew/Cellar/nim/2.0.2/nim/lib/pure/strutils.nim(2485) formatBiggestFloat
SIGABRT: Abnormal termination.

I can also reproduce compiling with --mm:arc, but cannot reproduce running with --mm:refc (though refc seems to be giving an incorrect answer, that seems like a separate issue).

dsemi commented 5 months ago

If anyone is having trouble reproducing and would like extra debugging information, let me know what I can do to supplement.

beef331 commented 5 months ago

I'd like to blame it on sharing a ref across threads given the following works on my end.

import md5
import std/[atomics, cpuinfo, monotimes, strformat, strutils, sugar, terminal, times]

const batchSize = 8000

type
  SharedObj = object
    prefix: string
    done: Atomic[bool]
    counter: Atomic[uint32]
    sol: Atomic[uint32]
  Shared = ref SharedObj

proc worker(arg: (ptr SharedObj, int)) {.thread.} =
  let (s, v) = arg
  while not s.done.load(moRelaxed):
    let offset = s.counter.fetchAdd(batchSize, moRelaxed)
    for n in 0'u32 ..< batchSize:
      let i = offset + n
      let h = (s.prefix & $i).toMD5
      if h[0] == 0 and h[1] == 0:
        var val = s.sol.load(moRelaxed)
        while i < val and not s.sol.compareExchangeWeak(val, i, moRelaxed):
          discard
        s.done.store(true, moRelaxed)

proc solve(input: string): uint32 =
  let s = Shared(prefix: input)
  s.sol.store(uint32.high)
  var thr: array[20, Thread[(ptr SharedObj, int)]]
  doAssert countProcessors() < thr.len
  for thr in thr.toOpenArray(0, countProcessors() - 1).mitems:
    createThread(thr, worker, (cast[ptr SharedObj](s), 15))
  joinThreads(thr.toOpenArray(0, countProcessors() - 1))
  s.sol.load

proc part1(input: string): string =
  $input.solve

proc colorizeTime(t: float): string =
  # Crash happens here
  let s = fmt"{t:.3f}"
  if t < 0.5:
    ansiForegroundColorCode(fgGreen) & s & ansiResetCode
  elif t < 1.0:
    ansiForegroundColorCode(fgYellow) & s & ansiResetCode
  else:
    ansiForegroundColorCode(fgRed) & s & ansiResetCode

proc timeit(f: (string) -> string, inp: string): (string, float) =
  let startT = getMonoTime()
  var ans = f(inp)
  let endT = getMonoTime()
  let t = endT - startT
  return (ans, float(t.inMicroseconds) / 1_000_000)

proc run() =
  # Needs the strip call to fail
  let contents = readFile("input.txt").strip
  let (ans, t) = timeit(part1, contents)
  stdout.write fmt"Part 1: "
  echo "$1  Elapsed time $2 seconds".format(align(ans, 54), colorizeTime(t))

run()
dsemi commented 5 months ago

Ah yeah that seems to be the problem. Switching to a SharedPtr in threading removes the crash. Perhaps this can be signaled more prominently in the threading docs? An example in https://nim-lang.org/docs/typedthreads.html for something that isn't copied by value would be helpful, I had to search around to find that library, and https://forum.nim-lang.org/t/9378 which I found more helpful than the docs.