Tensor to NumpyArray and back requires copy

arkanoid87 commented 2 years ago

I've been doing some memory profiling with valgrind

It seems that Tensor to NumpyArray and back requires a copy for every kind of conversion, and this makes NumpyArray costly to use.

Is there a solution for this?

config.nims

--gc: "arc"
--opt: "speed"
--d: "danger"
--d: "useMalloc"

the results under each line is a different compilation and run. The result shows that an alloc the size of the original Tensor object is required on each transform from Tensor to NumpyArray and vive-versa

import std/[random, hashes]
import arraymancer
import nimpy
import scinim/numpyarrays

var rng {.compileTime.} = initRand(hash(CompileDate & CompileTime) and 0x28037091)

proc rndStr(len: int): string =
  result = newStringOfCap(len)
  for _ in 0..<len:
    result.add char(rng.rand(int('A') .. int('z')))

type Randomizable = float|string|int
proc makeTensor[T: Randomizable](shape: varargs[int]): Tensor[T] =
  result = newTensor[T](shape)
  for i in 0..<shape[0]:
    for j in 0..<shape[1]:
      result[i, j] = block:
        when T is float: rng.rand(0.0 .. 10.0)
        elif T is string: rndStr(5)
        elif T is int: rng.rand(0 .. 10)

proc sequentialAlloc =
  var randomT = makeTensor[int](1000, 1000)
    # total heap usage: 2 allocs, 2 frees, 8,000,095 bytes allocated

  var randomNd = randomT.toNdArray
    # +53,195,577 bytes (python initialization overhead)
    # total heap usage: 17,523 allocs, 13,443 frees, 61,195,672 bytes allocated

  var randomT2 = randomNd.toTensor
    # +16,000,190 bytes
    # total heap usage: 17,525 allocs, 13,445 frees, 69,195,767 bytes allocated

  var randomNd2 = randomT2.toNdArray
    # +8,000,518 bytes
    # total heap usage: 17,539 allocs, 13,458 frees, 77,196,285 bytes allocated

  var randomT3 = randomNd2.toTensor
    # +8,000,095 bytes
    # total heap usage: 17,541 allocs, 13,460 frees, 85,196,380 bytes allocated

  var randomNd3 = randomT3.toNdArray
    # +8,000,518 bytes
    # total heap usage: 17,555 allocs, 13,473 frees, 93,196,898 bytes allocated

  var randomT4 = randomNd3.toTensor
    # +8,000,095 bytes
    #  total heap usage: 17,557 allocs, 13,475 frees, 101,196,993 bytes allocated

  var randomNd4 = randomT4.toNdArray
    # + 8,000,518 bytes
    # total heap usage: 17,571 allocs, 13,488 frees, 109,197,511 bytes allocated

  #[
  =541685== HEAP SUMMARY:
  ==541685==     in use at exit: 5,576,332 bytes in 4,083 blocks
  ==541685==   total heap usage: 17,571 allocs, 13,488 frees, 109,197,511 bytes allocated
  ==541685==
  ==541685== LEAK SUMMARY:
  ==541685==    definitely lost: 64 bytes in 1 blocks
  ==541685==    indirectly lost: 0 bytes in 0 blocks
  ==541685==      possibly lost: 185,881 bytes in 153 blocks
  ==541685==    still reachable: 5,390,387 bytes in 3,929 blocks
  ==541685==         suppressed: 0 bytes in 0 blocks
  ==541685== Rerun with --leak-check=full to see details of leaked memory
  ==541685==
  ==541685== Use --track-origins=yes to see where uninitialised values come from
  ==541685== For lists of detected and suppressed errors, rerun with: -s
  ==541685== ERROR SUMMARY: 9641 errors from 152 contexts (suppressed: 0 from 0)
  ]#

proc sequentialRewriteAlloc =
  var randomT = makeTensor[int](1000, 1000)
    # total heap usage: 2 allocs, 2 frees, 8,000,095 bytes allocated

  var randomNd = randomT.toNdArray
    # +53,195,577 bytes (python initialization overhead)
    # total heap usage: 17,523 allocs, 13,443 frees, 61,195,672 bytes allocated

  randomT = randomNd.toTensor
    # +16,000,190 bytes
    # total heap usage: 17,525 allocs, 13,445 frees, 69,195,767 bytes allocated

  randomNd = randomT.toNdArray
    # +8,000,518 bytes
    # total heap usage: 17,539 allocs, 13,458 frees, 77,196,285 bytes allocated

  randomT = randomNd.toTensor
    # +8,000,095 bytes
    # total heap usage: 17,541 allocs, 13,460 frees, 85,196,380 bytes allocated

  randomNd = randomT.toNdArray
    # +8,000,518 bytes
    # total heap usage: 17,555 allocs, 13,473 frees, 93,196,898 bytes allocated

  randomT = randomNd.toTensor
    # +8,000,095 bytes
    #  total heap usage: 17,557 allocs, 13,475 frees, 101,196,993 bytes allocated

  randomNd = randomT.toNdArray
    # + 8,000,518 bytes
    # total heap usage: 17,571 allocs, 13,488 frees, 109,197,511 bytes allocated

    #[
    ==544560== HEAP SUMMARY:
    ==544560==     in use at exit: 5,576,268 bytes in 4,081 blocks
    ==544560==   total heap usage: 17,569 allocs, 13,488 frees, 109,197,447 bytes allocated
    ==544560==
    ==544560== LEAK SUMMARY:
    ==544560==    definitely lost: 64 bytes in 1 blocks
    ==544560==    indirectly lost: 0 bytes in 0 blocks
    ==544560==      possibly lost: 185,881 bytes in 153 blocks
    ==544560==    still reachable: 5,390,323 bytes in 3,927 blocks
    ==544560==         suppressed: 0 bytes in 0 blocks
    ==544560== Rerun with --leak-check=full to see details of leaked memory
    ==544560==
    ==544560== Use --track-origins=yes to see where uninitialised values come from
    ==544560== For lists of detected and suppressed errors, rerun with: -s
    ==544560== ERROR SUMMARY: 9641 errors from 152 contexts (suppressed: 0 from 0)
    ]#

proc chainAlloc() =
  # discard makeTensor[int](1000, 1000)
  # total heap usage: 2 allocs, 2 frees, 8,000,095 bytes allocated

  # discard makeTensor[int](1000, 1000).toNdArray
  # total heap usage: 17,523 allocs, 13,443 frees, 61,195,672 bytes allocated

  # discard makeTensor[int](1000, 1000).toNdArray.toTensor
  # total heap usage: 17,525 allocs, 13,445 frees, 69,195,767 bytes allocated

  # discard makeTensor[int](1000, 1000).toNdArray.toTensor.toNdArray
  # total heap usage: 17,539 allocs, 13,458 frees, 77,196,285 bytes allocated

  # discard makeTensor[int](1000, 1000).toNdArray.toTensor.toNdArray.toTensor
  # total heap usage: 17,541 allocs, 13,460 frees, 85,196,380 bytes allocated

  # discard makeTensor[int](1000, 1000).toNdArray.toTensor.toNdArray.toTensor.toNdArray
  # total heap usage: 17,555 allocs, 13,473 frees, 93,196,898 bytes allocated

  # discard makeTensor[int](1000, 1000).toNdArray.toTensor.toNdArray.toTensor.toNdArray.toTensor
  #  total heap usage: 17,557 allocs, 13,475 frees, 101,196,993 bytes allocated

  discard makeTensor[int](1000, 1000).toNdArray.toTensor.toNdArray.toTensor.toNdArray.toTensor.toNdArray
  # total heap usage: 17,571 allocs, 13,488 frees, 109,197,511 bytes allocated

  #[
  ==546925== HEAP SUMMARY:
  ==546925==     in use at exit: 5,576,332 bytes in 4,083 blocks
  ==546925==   total heap usage: 17,571 allocs, 13,488 frees, 109,197,511 bytes allocated
  ==546925== 
  ==546925== LEAK SUMMARY:
  ==546925==    definitely lost: 64 bytes in 1 blocks
  ==546925==    indirectly lost: 0 bytes in 0 blocks
  ==546925==      possibly lost: 185,881 bytes in 153 blocks
  ==546925==    still reachable: 5,390,387 bytes in 3,929 blocks
  ==546925==         suppressed: 0 bytes in 0 blocks
  ==546925== Rerun with --leak-check=full to see details of leaked memory
  ==546925== 
  ==546925== Use --track-origins=yes to see where uninitialised values come from
  ==546925== For lists of detected and suppressed errors, rerun with: -s
  ==546925== ERROR SUMMARY: 9641 errors from 152 contexts (suppressed: 0 from 0)
  ]#

chainAlloc()

Clonkk commented 2 years ago

No need for benchmark, the copy are explicit :

And potentially if Tensor is not contiguous :

https://github.com/SciNim/scinim/blob/a6f937ad7a78708333a93e51a1355035491090d7/scinim/numpyarrays.nim#L178

Why is a copy performed ? Because having two objects that refers to the same memory is the best way to end up with dangling pointers. for me, implementing a non-robust solution that I know will crash is bad design.

The goal of this was to potentially extend Arraymancer API through Numpy - since Python is usually quite slow anyway optimising for performance wasn't the top priority.

Solutions ?

Best would be to actually wrap the C API of Numpy, but it's probably a huge time sink.
Otherwise, implement a NumpyArrayView functions conversions (can be on top of an distinct NumpyArray type to avoid confusion) and accept that such functions may be dangerous / will probably crash if not carefully handled.

PRs are welcome if you want to give it a try, I don't really have the time to work on it these days.

arkanoid87 commented 2 years ago

Turning Numpy to Tensor without copy seems already possible with fromBuffer It also seems that 192 bytes are leaked at the let np = pyImport("numpy) line.


import std/dynlib
import nimpy
import nimpy/py_lib
import arraymancer
import scinim/numpyarrays

block:
  type TestType = int64
  const testSize = 1000

  var originalSeq = newSeq[TestType](testSize)
  # total heap usage: 1 allocs, 1 frees, 8,008 bytes allocated
  # All heap blocks were freed -- no leaks are possible

  let np = pyImport("numpy")
  # + 34,790,614
  # total heap usage: 10,577 allocs, 7,255 frees, 34,798,622 bytes allocated
  # definitely lost: 192 bytes in 2 blocks

  var pyObj = np.array(originalSeq)
  # + 16,000
  # total heap usage: 10,579 allocs, 7,257 frees, 34,814,622 bytes allocated
  # definitely lost: 192 bytes in 2 blocks

  # Create Tensor with fromBuffer (zero-copy)
  var pyNd = pyObj.asNumpyArray[: TestType]
  var tensor = fromBuffer(pyNd.data, pyNd.shape)
  # + 308 (!)
  # total heap usage: 10,584 allocs, 7,262 frees, 34,814,930 bytes allocated
  # definitely lost: 192 bytes in 2 blocks

  # Test r/w Numpy Array vs Tensor
  for i in 0..<testSize:
    assert pyObj[i].to(TestType) == tensor[i]
    pyObj[i] = i mod 5
    assert tensor[i] == i mod 5
  # + 0
  # total heap usage: 10,584 allocs, 7,262 frees, 34,814,930 bytes allocated
  # definitely lost: 192 bytes in 2 blocks

{.pragma: pyfunc, cdecl, gcsafe.}
let Py_FinalizeEx = cast[proc():int{.pyfunc.}](py_lib.pyLib.module.symAddr("Py_FinalizeEx"))
assert Py_FinalizeEx != nil
assert Py_FinalizeEx() == 0
# + 19,474
# total heap usage: 10,596 allocs, 9,567 frees, 34,834,404 bytes allocated
# definitely lost: 192 bytes in 2 blocks

--gc: "arc"
--d: "release"
--opt: "speed"
--d: "useMalloc"

I've also wrapped some Numpy C-API but I've not found any valuable advantage from the already implemented Buffer Protocol. I'll check if there's any func there to help moving from Tensor to Numpy, the only sensible way I've found so far seems quite convoluted https://stackoverflow.com/a/2925014/17274026

Clonkk commented 2 years ago

Turning Numpy to Tensor without copy seems already possible with fromBuffer

Yes, but that's not the limitating factor.

The issue is making sure the memory is not free / moved / resized by Python; otherwise your Tensor will points to invalid memory.

Using the C function in conjonction with https://github.com/yglukhov/nimpy/blob/master/nimpy/py_utils.nim#L9-L16 Py_IncRef and Py_DecRef should allow you to bind a C-array (i.e. ptr UncheckedArray) to a NumpyArray that won't get free'ed by Python.

Note that If the PyBuffer API already allows you to do this, then you won't need the C API of Numpy.

See also questions like :

https://stackoverflow.com/questions/52731884/pyarray-simplenewfromdata https://stackoverflow.com/questions/33478046/binding-c-array-to-numpy-array-without-copying?rq=1 https://codereview.stackexchange.com/questions/92266/sending-a-c-array-to-python-numpy-and-back

It also seems that 192 bytes are leaked at the let np = pyImport("numpy) line.

Are you sure it's leaked memory and not just memory allocated once for the lifetime of the program and thus not free'ed because the OS will reclaim it anyway (note : i'm not a fan either of this way of doing things, but it's not wrong).

Doing multiple pyImport("numpy") doesn't seem to increase the amount of memory allocated so I don't think it's an issue.

arkanoid87 commented 2 years ago

I might have a better solution, but it adds a python dependency: apache arrow.

I'm using it to build a pyarrow.Buffer object (that is an object compatible with python Buffer protocol) from a seq/Tensor using the pyarrow.foreign_buffer function, and then build the numpy array via the numpy.frombuffer. This should also protect from buffer resizing on the python size, as the resizable option is a quality that is only available for instances of pyarrow.ResizableBuffer.

See https://arrow.apache.org/docs/python/memory.html

Here's an example where seq -> arraymancer.Tensor -> pyarrow.Buffer -> numpy.array transformation is applied with zero-copy, with valgrind execution after each step. The original buffer generated from nim is still available even after Python environment is completely finalized and removed from scope, while keeping zero leaks on monitor.

overhead by step [bytes]:

seq alloc: 8
seq -> Tensor: 32
Tensor -> pyarrow.Buffer: 232
pyarrow.Buffer -> numpy.array: 0

This approach has a double advantage: arrow is not only a bridge for numpy, but a memory representation mode, a lingua franca. For example it may tackle for us zero-copy integration with pandas.

Moreover as I've experimented here Apache Arrow C-API (gobject based) is already easily wrapped with futhark, and the inner Arrow C Data Interface aims to be a stable abi.

import std/[
  dynlib,
  sequtils
]
import nimpy
import nimpy/py_lib
import arraymancer

proc Py_FinalizeEx =
  {.pragma: pyfunc, cdecl, gcsafe.}
  let aux = cast[proc():int{.pyfunc.}](py_lib.pyLib.module.symAddr("Py_FinalizeEx"))
  assert aux != nil
  assert aux() == 0

proc test(T: typedesc, testSize: int) =
  var allocSeq = toSeq(0.T..testSize.T)
  # data 8000 + overhead 8
  # total heap usage: 1 allocs, 1 frees, 8,008 bytes allocated
  # All heap blocks were freed -- no leaks are possible

  var zcTensor = fromBuffer(cast[ptr UncheckedArray[T]](allocSeq[0].addr), allocSeq.len)
  # overhead 32
  # total heap usage: 2 allocs, 2 frees, 8,040 bytes allocated
  # All heap blocks were freed -- no leaks are possible

  for i in 0..<testSize:
    assert allocSeq[i] == zcTensor[i]

  block:
    let 
      pa = pyImport("pyarrow")
      np = pyImport("numpy")
      sys = pyImport("sys")
      gc = pyImport("gc") 
    # overhead 36.431.083 (python initialization)
    # total heap usage: 14,392 allocs, 10,596 frees, 36,439,123 bytes allocated

    # https://arrow.apache.org/docs/python/generated/pyarrow.foreign_buffer.html
    let paBuffer = pa.foreign_buffer(cast[int](zcTensor.asContiguous(rowMajor).toUnsafeView), T.sizeof * allocSeq.len)
    # overhead 232
    # total heap usage: 14,398 allocs, 10,602 frees, 36,439,355 bytes allocated
    # echo sys.getrefcount(paBuffer) -> 2

    block:
      # https://numpy.org/doc/stable/reference/generated/numpy.frombuffer.html
      let npDtype = np.getAttr($T)
      let npArray = np.callMethod("frombuffer", paBuffer, npDtype)
      # overhead 0
      # total heap usage: 14,398 allocs, 10,602 frees, 36,439,355 bytes allocated
      # echo sys.getrefcount(paBuffer) -> 3

      for i in 0..<testSize:
        assert zcTensor[i] == npArray[i].to(T)

    discard gc.collect()
    # overhead 0
    # total heap usage: 14,398 allocs, 10,602 frees, 36,439,355 bytes allocated
    # npArray has been deallocated
    # echo sys.getrefcount(paBuffer) -> 2

  Py_FinalizeEx()
  # overhead 18.488
  # total heap usage: 14,409 allocs, 13,095 frees, 36,457,843 bytes allocated

  # python interpreter is closed, original buffer is still in place
  for i in 0..<testSize:
    assert allocSeq[i] == zcTensor[i]
  # overhead 0
  # total heap usage: 14,409 allocs, 13,095 frees, 36,457,843 bytes allocated

test(int64, 1000)

SciNim / scinim

Tensor to NumpyArray and back requires copy #8