exo-explore / exo

Run your own AI cluster at home with everyday devices 📱💻 🖥️⌚

Error loading local model on Mali GPU #95

Open · artistlu opened 2 months ago

artistlu commented 2 months ago

I've been trying to load a local model on a device with a Mali GPU and have run into an issue. After modifying the code in three places, I still get a compile error (full traceback below).

I've made modifications to the following files:

exo/exo/api/chatgpt_api.py: resolve_tinygrad_tokenizer function

def resolve_tinygrad_tokenizer(model_id: str):
  if model_id == "llama3-8b-sfr":
    # here I modified: originally
    # return AutoTokenizer.from_pretrained("TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R")
    return AutoTokenizer.from_pretrained("/nasroot/models/Meta-Llama-3-8B")
  elif model_id == "llama3-70b-sfr":
    return AutoTokenizer.from_pretrained("TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R")
  else:
    raise ValueError(f"tinygrad doesnt currently support arbitrary model downloading. unsupported model: {model_id}")

exo/exo/api/chatgpt_api.py: resolve_tokenizer function

async def resolve_tokenizer(model_id: str):
  try:
    # here I modified: originally
    # if DEBUG >= 2: print(f"Trying AutoTokenizer for {model_id}")
    # return AutoTokenizer.from_pretrained(model_id)
    if DEBUG >= 2: print("Trying AutoTokenizer for /nasroot/models/Meta-Llama-3-8B")
    return AutoTokenizer.from_pretrained("/nasroot/models/Meta-Llama-3-8B")

  except Exception as e:
    if DEBUG >= 2: print(f"Failed to load tokenizer for {model_id}. Falling back to tinygrad tokenizer. Error: {e}")
    import traceback

    if DEBUG >= 2: print(traceback.format_exc())

  try:
    if DEBUG >= 2: print(f"Trying tinygrad tokenizer for {model_id}")
    return resolve_tinygrad_tokenizer(model_id)
  except Exception as e:
    if DEBUG >= 2: print(f"Failed again to load tokenizer for {model_id}. Falling back to mlx tokenizer. Error: {e}")
    import traceback

    if DEBUG >= 2: print(traceback.format_exc())

  if DEBUG >= 2: print(f"Trying mlx tokenizer for {model_id}")
  from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer

  return load_tokenizer(await get_model_path(model_id))
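
Rather than hardcoding the path in these two tokenizer functions, a less invasive override might look like the sketch below (EXO_LOCAL_MODEL_DIR is a made-up environment variable, not an exo option), so the Hugging Face id still works when no local copy is configured:

import os
from transformers import AutoTokenizer

def resolve_local_tokenizer(model_id: str):
  # Hypothetical override: if EXO_LOCAL_MODEL_DIR is set, load the tokenizer
  # from that directory; otherwise fall back to the normal repo id.
  local_dir = os.environ.get("EXO_LOCAL_MODEL_DIR")
  return AutoTokenizer.from_pretrained(local_dir or model_id)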

exo/exo/inference/tinygrad/inference.py: ensure_shard function

async def ensure_shard(self, shard: Shard):
  if self.shard == shard:
    return

  model_path = Path(shard.model_id)

  models_dir = Path(_cache_dir) / "tinygrad" / "downloads"
  model_path = models_dir / shard.model_id
  size = "8B"

  # here I modified: override the cache path with my local model directory
  model_path = Path("/nasroot/models/Meta-Llama-3-8B")

  if Path(model_path / "model.safetensors.index.json").exists():
    model = model_path
  else:
    if DEBUG >= 2: print(f"Downloading tinygrad model {shard.model_id}...")
    if shard.model_id.lower().find("llama3-8b-sfr") != -1:
      await fetch_async(
        "https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model",
        "tokenizer.model",
        subdir=shard.model_id,
      )
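
To avoid hardcoding the override inside ensure_shard, I'm considering a small helper along these lines (pick_local_model_path is hypothetical, not exo API), which prefers a local copy and only falls back to the download cache when the safetensors index is missing:

from pathlib import Path

def pick_local_model_path(local_root: str = "/nasroot/models") -> Path | None:
  # Hypothetical helper, not exo API: return the local model directory if it
  # contains a safetensors index, otherwise None so the caller can download.
  candidate = Path(local_root) / "Meta-Llama-3-8B"
  if (candidate / "model.safetensors.index.json").exists():
    return candidate
  return None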

The error message is as follows:

error lowering MetaOps.SINK
tensor operations:
[cast - exo.inference.tinygrad.models.llama:288::fix_bf16]
loaded weights in 1153.92 ms, 0.03 GB loaded at 0.03 GB/s
Traceback (most recent call last):
  File "/nasroot/code/exo/exo/api/chatgpt_api.py", line 231, in handle_post_chat_completions
    await self.node.process_prompt(shard, prompt, request_id=request_id)
  File "/nasroot/code/exo/exo/orchestration/standard_node.py", line 92, in process_prompt
    resp = await self._process_prompt(base_shard, prompt, request_id, inference_state)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nasroot/code/exo/exo/orchestration/standard_node.py", line 129, in _process_prompt
    result, inference_state, is_finished = await self.inference_engine.infer_prompt(request_id, shard, prompt, inference_state=inference_state)
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nasroot/code/exo/exo/inference/tinygrad/inference.py", line 203, in infer_prompt
    await self.ensure_shard(shard)
  File "/nasroot/code/exo/exo/inference/tinygrad/inference.py", line 295, in ensure_shard
    model = build_transformer(model_path, shard=shard, model_size=size)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nasroot/code/exo/exo/inference/tinygrad/inference.py", line 176, in build_transformer
    load_state_dict(model, weights, strict=False, consume=True)
  File "/root/miniconda3/envs/exo/lib/python3.12/site-packages/tinygrad/nn/state.py", line 129, in load_state_dict
    else: v.replace(state_dict[k].to(v.device)).realize()
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/exo/lib/python3.12/site-packages/tinygrad/tensor.py", line 3123, in _wrapper
    ret = fn(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/exo/lib/python3.12/site-packages/tinygrad/tensor.py", line 203, in realize
    run_schedule(*self.schedule_with_vars(*lst), do_update_stats=do_update_stats)
  File "/root/miniconda3/envs/exo/lib/python3.12/site-packages/tinygrad/engine/realize.py", line 222, in 
run_schedule
    for ei in lower_schedule(schedule):
  File "/root/miniconda3/envs/exo/lib/python3.12/site-packages/tinygrad/engine/realize.py", line 215, in 
lower_schedule
    raise e
  File "/root/miniconda3/envs/exo/lib/python3.12/site-packages/tinygrad/engine/realize.py", line 209, in 
lower_schedule
    try: yield lower_schedule_item(si)
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/exo/lib/python3.12/site-packages/tinygrad/engine/realize.py", line 193, in 
lower_schedule_item
    runner = get_runner(si.outputs[0].device, si.ast)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/exo/lib/python3.12/site-packages/tinygrad/engine/realize.py", line 164, in get_runner
    method_cache[ckey] = method_cache[bkey] = ret = CompiledRunner(replace(prg, dname=dname))
                                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/exo/lib/python3.12/site-packages/tinygrad/engine/realize.py", line 82, in __init__
    self.lib:bytes = precompiled if precompiled is not None else Device[p.dname].compiler.compile_cached(p.src)
                                                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/exo/lib/python3.12/site-packages/tinygrad/device.py", line 177, in compile_cached
    lib = self.compile(src)
          ^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/exo/lib/python3.12/site-packages/tinygrad/runtime/ops_gpu.py", line 26, in compile
    raise CompileError(f"OpenCL Compile Error\n\n{mstr.value.decode()}")
tinygrad.device.CompileError: OpenCL Compile Error

<source>:2:66: error: unknown type name '__bf16'
__kernel void E_131072_32_4(__global half* data0, const __global __bf16* data1) {
                                                                 ^

<source>:9:3: error: use of undeclared identifier '__bf16'
  __bf16 val0 = data1[alu1];
  ^

<source>:10:3: error: use of undeclared identifier '__bf16'
  __bf16 val1 = data1[alu2];
  ^

<source>:11:3: error: use of undeclared identifier '__bf16'
  __bf16 val2 = data1[alu3];
  ^

<source>:12:3: error: use of undeclared identifier '__bf16'
  __bf16 val3 = data1[alu0];
  ^

<source>:13:24: error: use of undeclared identifier 'val0'; did you mean 'alu0'?
  data0[alu1] = (half)(val0);
                       ^

<source>:5:7: note: 'alu0' declared here
  int alu0 = ((gidx0*128)+(lidx1*4));
      ^

<source>:14:24: error: use of undeclared identifier 'val1'; did you mean 'alu1'?
  data0[alu2] = (half)(val1);
                       ^

<source>:6:7: note: 'alu1' declared here
  int alu1 = (alu0+1);
      ^

<source>:15:24: error: use of undeclared identifier 'val2'; did you mean 'alu2'?
  data0[alu3] = (half)(val2);
                       ^

<source>:7:7: note: 'alu2' declared here
  int alu2 = (alu0+2);
      ^

<source>:16:24: error: use of undeclared identifier 'val3'; did you mean 'alu3'?
  data0[alu0] = (half)(val3);
                       ^

<source>:8:7: note: 'alu3' declared here
  int alu3 = (alu0+3);
      ^

error: Compiler frontend failed (error code 62)

I'm not entirely sure whether the way I'm loading the model is the root cause of this problem. Are there more elegant ways to load local models that I could explore?
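
From the traceback, the failing kernel comes from the fix_bf16 cast: the checkpoint weights are stored as bfloat16, and the Mali OpenCL compiler has no __bf16 type, so the generated kernel fails to compile. One workaround I'm considering (a sketch, not tested on this setup; the shard file names are just examples) is converting the safetensors weights to float16 offline so the OpenCL backend never sees bfloat16:

import torch
from safetensors.torch import load_file, save_file

# Sketch: rewrite one safetensors shard with bfloat16 tensors cast to float16.
# File names are examples; a real conversion must cover every shard and copy
# the index/config/tokenizer files alongside.
src = "/nasroot/models/Meta-Llama-3-8B/model-00001-of-00004.safetensors"
dst = "/nasroot/models/Meta-Llama-3-8B-fp16/model-00001-of-00004.safetensors"

weights = load_file(src)
weights = {k: (v.to(torch.float16) if v.dtype == torch.bfloat16 else v) for k, v in weights.items()}
save_file(weights, dst)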

artistlu commented 2 months ago

Earlier, I also modified the exo/exo/topology/device_capabilities.py file:

def linux_device_capabilities() -> DeviceCapabilities:
    if DEBUG >= 2: print(f"tinygrad {Device.DEFAULT=}")

    context = pyudev.Context()

    # Check for Mali GPU
    mali_gpu = next((device for device in context.list_devices(subsystem='mali0') if 'mali' in device.sys_name.lower()), None)
    if mali_gpu:
        gpu_name = mali_gpu.get('DEVNAME', 'Unknown Mali GPU')
        return DeviceCapabilities(
            model=f"Linux Box (ARM Mali)",
            chip=gpu_name,
            memory=psutil.virtual_memory().total // 2**20,
            flops=DeviceFlops(fp32=100, fp16=200, int8=400)  # Placeholder values, adjust as needed
        )

    # Check for NVIDIA GPU
    try:
        import pynvml
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        gpu_name = pynvml.nvmlDeviceGetName(handle)
        gpu_memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)

        if DEBUG >= 2: print(f"NVIDIA device {gpu_name=} {gpu_memory_info=}")

        return DeviceCapabilities(
            model=f"Linux Box ({gpu_name})",
            chip=gpu_name,
            memory=gpu_memory_info.total // 2**20,
            flops=CHIP_FLOPS.get(gpu_name, DeviceFlops(fp32=0, fp16=0, int8=0))
        )
    except ImportError:
        pass  # NVIDIA libraries not available
    except pynvml.NVMLError:
        pass  # NVIDIA GPU not found or error accessing it

    # Fallback for other devices
    return DeviceCapabilities(
        model=f"Linux Box (Device: {Device.DEFAULT})",
        chip=f"Unknown Chip (Device: {Device.DEFAULT})",
        memory=psutil.virtual_memory().total // 2**20,
        flops=DeviceFlops(fp32=0, fp16=0, int8=0)
    )
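
As a quick sanity check that the Mali device is visible through udev at all (using the same pyudev query as in the patch above):

import pyudev

# List devices under the 'mali0' subsystem to confirm the GPU is visible
# to udev before exo's capability detection runs.
ctx = pyudev.Context()
for dev in ctx.list_devices(subsystem='mali0'):
    print(dev.sys_name, dev.get('DEVNAME', 'no DEVNAME'))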