aws-neuron / nki-samples

MIT No Attribution
7 stars 5 forks source link

[F134] neuronx-cc terminated abnormally #24

Open nandeeka opened 2 weeks ago

nandeeka commented 2 weeks ago

I am trying to run what I think should be a kernel. However, I am getting the opaque error message, [F134] neuronx-cc terminated abnormally. What is the error and/or how do I go about debugging an error message like this?

The full kernel is:

from neuronxcc.nki import baremetal, benchmark
import neuronxcc.nki.language as nl
import neuronxcc.nki.isa as ni
import numpy as np

# @benchmark(save_neff_name='file.neff', save_trace_name='profile.ntff', additional_compile_opt=' --disable-internal-io-dge ')
@baremetal(save_neff_name='file.neff', additional_compile_opt=' --disable-internal-io-dge ')
def lora(I_DRAM, PW_DRAM, A_DRAM, SB_DRAM, O_DRAM, K2, K1, K0, M2, M1, M0, N2, N1, N0, R):
  for n2 in nl.affine_range(N2):
    for m2 in nl.affine_range(M2):

      # O_SBUF = nl.zeros((M1, nl.par_dim(M0), N1 * N0), dtype=O_DRAM.dtype, buffer=nl.sbuf)
      PO_SBUF = nl.zeros((M1, nl.par_dim(M0), N1 * N0), dtype=O_DRAM.dtype, buffer=nl.sbuf)
      DO_SBUF = nl.zeros((M1, nl.par_dim(M0), N1 * N0), dtype=O_DRAM.dtype, buffer=nl.sbuf)

      AI_SBUF = nl.zeros((N1, nl.par_dim(R), N0), dtype=O_DRAM.dtype, buffer=nl.sbuf)

      m_start = m2 * M1 * M0
      m_end = m_start + M1 * M0

      SB_SBUF = nl.load(SB_DRAM[:, m_start:m_end])

      for k2 in nl.affine_range(K2):
        PW_SBUF = nl.ndarray((M1, nl.par_dim(K0), K1, M0), dtype=PW_DRAM.dtype, buffer=nl.sbuf)
        I_SBUF = nl.ndarray((K1, nl.par_dim(K0), N1 * N0), dtype=I_DRAM.dtype, buffer=nl.sbuf)

        for m1 in nl.affine_range(M1):
          PW_SBUF[m1] = nl.load(PW_DRAM[m2, k2, m1])

        for k1 in nl.affine_range(K1):
          k_start = k2 * K1 * K0 + k1 * K0
          k_end = k_start + K0

          n_start = n2 * N1 * N0
          n_end = n_start + N1 * N0

          I_SBUF[k1] = nl.load(I_DRAM[k_start:k_end, n_start:n_end])

        for m1 in nl.affine_range(M1):
          for n1 in nl.affine_range(N1):
            PO_PSUM = nl.zeros((M0, N0), dtype=nl.float32, buffer=nl.psum)

            n_start = n1 * N0
            n_end = n_start + N0

            for k1 in nl.affine_range(K1):
              PO_PSUM += ni.nc_matmul(PW_SBUF[m1, :, k1], I_SBUF[k1, :, n_start:n_end])

            PO_SBUF[m1, :, n_start:n_end] = nl.loop_reduce(PO_PSUM, op=np.add, loop_indices=[k2], dtype=O_DRAM.dtype)

        # if m2 == 0:
        A_SBUF = nl.ndarray((K1, nl.par_dim(K0), R), dtype=A_DRAM.dtype, buffer=nl.sbuf)
        for k1 in nl.affine_range(K1):
          k_start = k2 * K1 * K0 + k1 * K0
          k_end = k_start + K0

          A_SBUF[k1] = nl.load(A_DRAM[k_start:k_end], mask=m2==0)

        for n1 in nl.affine_range(N1):
          AI_PSUM = nl.zeros((nl.par_dim(R), N0), dtype=nl.float32, buffer=nl.psum)

          n_start = n1 * N0
          n_end = n_start + N0

          for k1 in nl.affine_range(K1):
            AI_PSUM += ni.nc_matmul(A_SBUF[k1], I_SBUF[k1, :, n_start:n_end], mask=m2==0)

          AI_SBUF[n1] = nl.loop_reduce(AI_PSUM, op=np.add, loop_indices=[k2], dtype=O_DRAM.dtype, mask=m2==0)

        # endif
      for m1 in nl.affine_range(M1):
        for n1 in nl.affine_range(N1):
          DO_PSUM = nl.zeros((nl.par_dim(M0), N0), dtype=nl.float32, buffer=nl.psum)

          m_start = m1 * M0
          m_end = m_start + M0

          DO_PSUM[:] = ni.nc_matmul(SB_SBUF[:, m_start:m_end], AI_SBUF[n1])

          n_start = n1 * N0
          n_end = n_start + N0

          DO_SBUF[m1, :, n_start:n_end] = nl.loop_reduce(DO_PSUM, op=np.add, loop_indices=[k2], dtype=O_DRAM.dtype)

      for m1 in nl.affine_range(M1):
        m_start = m2 * M1 * M0 + m1 * M0
        m_end = m_start + M0

        n_start = n2 * N1 * N0
        n_end = n_start + N1 * N0

        O_SBUF = nl.add(PO_SBUF[m1], DO_SBUF[m1])

        nl.store(O_DRAM[m_start:m_end, n_start:n_end], value=O_SBUF)

def launch():
  K, M, N, R = (4096, 4096, 2048, 8)

  K0 = 128
  M0 = 128
  N0 = 512

  M1 = 4
  N1 = 4
  K1 = 8

  K2 = K // (K1 * K0)
  M2 = M // (M1 * M0)
  N2 = N // (N1 * N0)

  assert K2 * K1 * K0 == K
  assert M2 * M1 * M0 == M
  assert N2 * N1 * N0 == N

  PW = np.random.random_sample([M2, K2, M1, K0, K1, M0]).astype(np.float16)
  I = np.random.random_sample([K, N]).astype(np.float16)
  A = np.random.random_sample([K, R]).astype(np.float16)
  SB = np.random.random_sample([R, K]).astype(np.float16)
  O = np.ndarray(shape=[M, N], dtype=np.float16)

  lora(I, PW, A, SB, O, K2, K1, K0, M2, M1, M0, N2, N1, N0, R)

  return I, PW, A, SB, O

def main():
  I, PW, A, SB, O = launch()
  print(O[0, 0])

if __name__ == "__main__":
  main()

The full error message is:

[F134] neuronx-cc terminated abnormally - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new
Traceback (most recent call last):
  File "/home/ubuntu/nki-kernels/out/../src/lora/nki_first_pass.py", line 130, in <module>
    main()
  File "/home/ubuntu/nki-kernels/out/../src/lora/nki_first_pass.py", line 126, in main
    I, PW, A, SB, O = launch()
  File "/home/ubuntu/nki-kernels/out/../src/lora/nki_first_pass.py", line 121, in launch
    lora(I, PW, A, SB, O, K2, K1, K0, M2, M1, M0, N2, N1, N0, R)
  File "neuronxcc/starfish/penguin/targets/nki/TraceKernel.py", line 756, in neuronxcc.starfish.penguin.targets.nki.TraceKernel.Kernel.__call__
  File "neuronxcc/starfish/penguin/targets/nki/TraceKernel.py", line 1254, in neuronxcc.starfish.penguin.targets.nki.TraceKernel.BaremetalKernel.post_process_call
  File "neuronxcc/starfish/penguin/targets/nki/TraceKernel.py", line 1257, in neuronxcc.starfish.penguin.targets.nki.TraceKernel.BaremetalKernel.post_process_call
  File "neuronxcc/starfish/penguin/targets/nki/TraceKernel.py", line 1314, in neuronxcc.starfish.penguin.targets.nki.TraceKernel.BaremetalKernel._compile
RuntimeError: Compilation failed for lora with error Command '['neuronx-cc', 'compile', '--framework', 'XLA', 'penguin.py', '--internal-tensorizer-opt-level=nki', '--pipeline', 'compile', 'SaveTemps', '--target', 'trn1', '--disable-internal-io-dge', '--disable-internal-io-dge', '--output=file.neff']' returned non-zero exit status 70.

My pip freeze is:

absl-py==2.1.0
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
amqp==5.2.0
annotated-types==0.7.0
ansicolors==1.1.8
anyio==4.4.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
astroid==3.2.4
asttokens==2.4.1
async-lru==2.0.4
async-timeout==4.0.3
attrs==24.2.0
Automat==24.8.1
aws-neuronx-runtime-discovery==2.9
awscli==1.34.19
babel==2.16.0
beautifulsoup4==4.12.3
billiard==4.2.0
bleach==6.1.0
boto3==1.35.19
botocore==1.35.19
build==1.2.2
cachetools==5.5.0
celery==5.4.0
certifi==2024.8.30
cffi==1.17.1
charset-normalizer==3.3.2
click==8.1.7
click-didyoumean==0.3.1
click-plugins==1.1.1
click-repl==0.3.0
cloud-tpu-client==0.10
cloudpickle==3.0.0
cmake==3.30.3
colorama==0.4.6
comm==0.2.2
constantly==23.10.4
contourpy==1.3.0
cryptography==43.0.1
cssselect==1.2.0
cycler==0.12.1
dask==2024.9.0
debugpy==1.8.5
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.8
distlib==0.3.8
docutils==0.16
dparse==0.6.3
ec2-metadata==2.13.0
entrypoints==0.4
environment-kernels==1.2.0
exceptiongroup==1.2.2
executing==2.1.0
fastapi==0.114.2
fastjsonschema==2.20.0
filelock==3.16.0
fonttools==4.53.1
fqdn==1.5.1
frozenlist==1.4.1
fsspec==2024.9.0
google-api-core==1.34.1
google-api-python-client==1.8.0
google-auth==2.34.0
google-auth-httplib2==0.2.0
googleapis-common-protos==1.65.0
h11==0.14.0
httpcore==1.0.5
httpie==3.2.3
httplib2==0.22.0
httpx==0.27.2
hyperlink==21.0.0
idna==3.10
imageio==2.35.1
importlib_metadata==8.5.0
incremental==24.7.2
iniconfig==2.0.0
ipykernel==6.29.5
ipython==8.27.0
ipywidgets==8.1.5
islpy==2023.2.5
isoduration==20.11.0
isort==5.13.2
itemadapter==0.9.0
itemloaders==1.3.1
jedi==0.19.1
Jinja2==3.1.4
jmespath==1.0.1
joblib==1.4.2
json5==0.9.25
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter==1.1.1
jupyter-console==6.6.3
jupyter-events==0.10.0
jupyter-lsp==2.2.5
jupyter_client==8.6.2
jupyter_core==5.7.2
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
jupyterlab==4.2.5
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.13
kiwisolver==1.4.7
kombu==5.4.1
libneuronxla==2.0.4115.0
llvmlite==0.43.0
locket==1.0.0
lockfile==0.12.2
lxml==5.3.0
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.2
matplotlib-inline==0.1.7
mccabe==0.7.0
mdurl==0.1.2
mistune==3.0.2
ml-dtypes==0.2.0
mpmath==1.3.0
multidict==6.1.0
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==2.8.8
neuronx-cc==2.15.128.0+56dc5a86
neuronx-distributed==0.9.0
neuronx-distributed-training==1.0.0
notebook==7.2.2
notebook_shim==0.2.4
numba==0.60.0
numpy==1.25.2
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.18.1
nvidia-nvjitlink-cu12==12.6.68
nvidia-nvtx-cu12==12.1.105
oauth2client==4.1.3
opencv-python==4.10.0.84
overrides==7.7.0
packaging==21.3
pandas==2.2.2
pandocfilters==1.5.1
papermill==2.6.0
parsel==1.9.1
parso==0.8.4
partd==1.4.2
pexpect==4.9.0
pgzip==0.3.5
pillow==10.4.0
pip-tools==7.4.1
pipenv==2024.0.2
platformdirs==4.3.3
plotly==5.24.1
pluggy==1.5.0
prometheus_client==0.20.0
prompt_toolkit==3.0.47
Protego==0.3.1
protobuf==3.20.3
psutil==6.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pyasn1==0.6.1
pyasn1_modules==0.4.1
pycparser==2.22
pydantic==2.9.1
pydantic_core==2.23.3
PyDispatcher==2.0.7
Pygments==2.18.0
pylint==3.2.7
pyOpenSSL==24.2.1
pyparsing==3.1.4
pyproject_hooks==1.1.0
PySocks==1.7.1
pytest==8.3.3
python-daemon==3.0.1
python-dateutil==2.9.0.post0
python-json-logger==2.0.7
pytz==2024.2
PyYAML==6.0.2
pyzmq==26.2.0
queuelib==1.7.0
referencing==0.35.1
requests==2.31.0
requests-file==2.1.0
requests-toolbelt==1.0.0
requests-unixsocket==0.3.0
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==13.8.1
rpds-py==0.20.0
rsa==4.7.2
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.8
s3transfer==0.10.2
safetensors==0.4.5
safety==2.3.5
scikit-learn==1.5.2
scipy==1.11.2
Scrapy==2.11.2
seaborn==0.13.2
Send2Trash==1.8.3
service-identity==24.1.0
shap==0.46.0
six==1.16.0
slicer==0.0.8
sniffio==1.3.1
soupsieve==2.6
stack-data==0.6.3
starlette==0.38.5
sympy==1.13.2
tenacity==9.0.0
terminado==0.18.1
threadpoolctl==3.5.0
tinycss2==1.3.0
tldextract==5.1.2
tomli==2.0.1
tomlkit==0.13.2
toolz==0.12.1
torch==2.1.2
torch-neuronx==2.1.2.2.3.0
torch-xla==2.1.4
torchvision==0.16.2
tornado==6.4.1
tqdm==4.66.5
traitlets==5.14.3
triton==2.1.0
Twisted==24.7.0
types-python-dateutil==2.9.0.20240906
typing_extensions==4.12.2
tzdata==2024.1
uri-template==1.3.0
uritemplate==3.0.1
urllib3==2.2.3
vine==5.1.0
virtualenv==20.26.4
w3lib==2.2.1
wcwidth==0.2.13
webcolors==24.8.0
webencodings==0.5.1
websocket-client==1.8.0
wget==3.2
widgetsnbextension==4.0.13
yarl==1.11.1
zipp==3.20.2
zope.interface==7.0.3
jeffhataws commented 2 weeks ago

Thank @nandeeka for filing the issue. We will take a look.

JonathanHenson commented 2 weeks ago

Hi Nandeeka, I’m taking a look to reproduce. If you have one could you also provide the contents of the compiler log?

nandeeka commented 2 weeks ago

Hi Jonathan, Where do I see the compiler log? Following the instructions here, I tried printing it to the console with:

export NEURON_RT_LOG_LOCATION=console
export NEURON_RT_LOG_LEVEL=INFO

But this does not seem to have done anything. Thanks!

jeffhataws commented 2 weeks ago

Hi @nandeeka will you try adding adding additional_compile_opt="--verbose debug" argument to the baremetal decorator?

nandeeka commented 2 weeks ago

This worked. It looks like the error was:

2024-10-05T22:49:01Z ERROR 3808 [job.WalrusDriver.0]: Backend exited with code -6 and stderr: No existing axis k2 found in instruction I-33's parent list
walrus_driver: /local/p4clients/pkgbuild-const/workspace/src/KaenaCompiler/neuronxcc/walrus/ir/lib/IR/BasicBlockHolder.cpp:150: bir::LoopAxis* bir::BasicBlockHolder::findAxis(const string&, bir::Instruction*): Assertion `false && "No existing axis found"' failed.

After inspecting all instructions involving k2, I figured out which one was creating the problem, and I fixed it. I guess my remaining question is, is there any way for me to figure out which instruction was instruction I-33? As kernels get bigger, manually inspecting all relevant instructions becomes more and more challenging.

JonathanHenson commented 1 week ago

This actually works with the simulator as is, so will need to look further why it's correct at the nki insertion point but incorrect in the backend:

updated code:

def test_lora(self):
    K, M, N, R = (4096, 4096, 2048, 8)

    K0 = 128
    M0 = 128
    N0 = 512

    M1 = 4
    N1 = 4
    K1 = 8

    K2 = K // (K1 * K0)
    M2 = M // (M1 * M0)
    N2 = N // (N1 * N0)

    assert K2 * K1 * K0 == K
    assert M2 * M1 * M0 == M
    assert N2 * N1 * N0 == N

    PW = np.random.random_sample([M2, K2, M1, K0, K1, M0]).astype(np.float16)
    I = np.random.random_sample([K, N]).astype(np.float16)
    A = np.random.random_sample([K, R]).astype(np.float16)
    SB = np.random.random_sample([R, K]).astype(np.float16)
    O = np.ndarray(shape=[M, N], dtype=np.float16)

    nki.simulate_kernel(lora, I, PW, A, SB, O, K2, K1, K0, M2, M1, M0, N2, N1, N0, R)
    print(O[0,0])
    return I, PW, A, SB, O

output:

4890.0
JonathanHenson commented 1 week ago

This worked. It looks like the error was:

2024-10-05T22:49:01Z ERROR 3808 [job.WalrusDriver.0]: Backend exited with code -6 and stderr: No existing axis k2 found in instruction I-33's parent list
walrus_driver: /local/p4clients/pkgbuild-const/workspace/src/KaenaCompiler/neuronxcc/walrus/ir/lib/IR/BasicBlockHolder.cpp:150: bir::LoopAxis* bir::BasicBlockHolder::findAxis(const string&, bir::Instruction*): Assertion `false && "No existing axis found"' failed.

After inspecting all instructions involving k2, I figured out which one was creating the problem, and I fixed it. I guess my remaining question is, is there any way for me to figure out which instruction was instruction I-33? As kernels get bigger, manually inspecting all relevant instructions becomes more and more challenging.

I-33 would be the 33rd instruction emitted by the Kernel.

As far as a better way to see which instruction maps to what line of code, we should be able to re-correlate it back to the debug info for the kernel. I am adding this to our backlog to make it more clear what went wrong.