Open FFAMax opened 4 weeks ago
Running exo with DEBUG=8 TINYGRAD_DEBUG=8 DEBUG_DISCOVERY=8, I got the following output:
Broadcasting presence at (127.0.0.1)
Broadcasting presence at (10.1.3.177): {"type": "discovery", "node_id":
"ce0c3546-20d9-4a2c-9e96-16c6894259fa", "grpc_port": 49868, "device_capabilities": {"model": "Linux Box
(NVIDIA GEFORCE GTX 1080 TI)", "chip": "NVIDIA GEFORCE GTX 1080 TI", "memory": 11264, "flops": {"fp32":
11.34, "fp16": 0.177, "int8": 45.36}}, "priority": 1}
received from peer ('127.0.0.1', 55821): {'type': 'discovery', 'node_id':
'ce0c3546-20d9-4a2c-9e96-16c6894259fa', 'grpc_port': 49868, 'device_capabilities': {'model': 'Linux Box
(NVIDIA GEFORCE GTX 1080 TI)', 'chip': 'NVIDIA GEFORCE GTX 1080 TI', 'memory': 11264, 'flops': {'fp32':
11.34, 'fp16': 0.177, 'int8': 45.36}}, 'priority': 1}
Broadcasting presence at (10.1.3.177)
received from peer ('10.1.3.177', 54839): {'type': 'discovery', 'node_id':
'ce0c3546-20d9-4a2c-9e96-16c6894259fa', 'grpc_port': 49868, 'device_capabilities': {'model': 'Linux Box
(NVIDIA GEFORCE GTX 1080 TI)', 'chip': 'NVIDIA GEFORCE GTX 1080 TI', 'memory': 11264, 'flops': {'fp32':
11.34, 'fp16': 0.177, 'int8': 45.36}}, 'priority': 1}
Peer statuses: {'26aa0d12-7bff-433b-89d1-49a0af8485a6': 'is_connected=True, health_check=True,
connected_at=1730615045.907977, last_seen=1730615342.8392212, prio=1',
'81ab223c-d922-4c16-b784-d1dab5c075f9': 'is_connected=True, health_check=True,
connected_at=1730615045.9222374, last_seen=1730615342.8244953, prio=1'}
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM61
EF_CUDA_VIRTUAL_SM(EF_CUDA_SM61)"
.elftype @"ET_EXEC"
//--------------------- .nv.info --------------------------
.section .nv.info,"",@"SHT_CUDA_INFO"
.align 4
//----- nvinfo : EIATTR_REGCOUNT
.align 4
/*0000*/ .byte 0x04, 0x2f
/*0002*/ .short (.L_2 - .L_1)
.align 4
.L_1:
/*0004*/ .word index@(E_101_101_2_2)
/*0008*/ .word 0x0000000a
//----- nvinfo : EIATTR_MAX_STACK_SIZE
.align 4
.L_2:
/*000c*/ .byte 0x04, 0x23
/*000e*/ .short (.L_4 - .L_3)
.align 4
.L_3:
/*0010*/ .word index@(E_101_101_2_2)
/*0014*/ .word 0x00000000
//----- nvinfo : EIATTR_MIN_STACK_SIZE
.align 4
.L_4:
/*0018*/ .byte 0x04, 0x12
/*001a*/ .short (.L_6 - .L_5)
.align 4
.L_5:
/*001c*/ .word index@(E_101_101_2_2)
/*0020*/ .word 0x00000000
//----- nvinfo : EIATTR_FRAME_SIZE
.align 4
.L_6:
/*0024*/ .byte 0x04, 0x11
/*0026*/ .short (.L_8 - .L_7)
.align 4
.L_7:
/*0028*/ .word index@(E_101_101_2_2)
/*002c*/ .word 0x00000000
.L_8:
//--------------------- .nv.info.E_101_101_2_2 --------------------------
.section .nv.info.E_101_101_2_2,"",@"SHT_CUDA_INFO"
.align 4
//----- nvinfo : EIATTR_CUDA_API_VERSION
.align 4
/*0000*/ .byte 0x04, 0x37
/*0002*/ .short (.L_10 - .L_9)
.L_9:
/*0004*/ .word 0x00000073
//----- nvinfo : EIATTR_SW2393858_WAR
.align 4
.L_10:
/*0008*/ .byte 0x01, 0x30
.zero 2
//----- nvinfo : EIATTR_SW1850030_WAR
.align 4
/*000c*/ .byte 0x01, 0x2a
.zero 2
//----- nvinfo : EIATTR_PARAM_CBANK
.align 4
/*0010*/ .byte 0x04, 0x0a
/*0012*/ .short (.L_12 - .L_11)
.align 4
.L_11:
/*0014*/ .word index@(.nv.constant0.E_101_101_2_2)
/*0018*/ .short 0x0140
/*001a*/ .short 0x0008
//----- nvinfo : EIATTR_CBANK_PARAM_SIZE
.align 4
.L_12:
/*001c*/ .byte 0x03, 0x19
/*001e*/ .short 0x0008
//----- nvinfo : EIATTR_KPARAM_INFO
.align 4
/*0020*/ .byte 0x04, 0x17
/*0022*/ .short (.L_14 - .L_13)
.L_13:
/*0024*/ .word 0x00000000
/*0028*/ .short 0x0000
/*002a*/ .short 0x0000
/*002c*/ .byte 0x00, 0xf0, 0x21, 0x00
//----- nvinfo : EIATTR_MAXREG_COUNT
.align 4
.L_14:
/*0030*/ .byte 0x03, 0x1b
/*0032*/ .short 0x00ff
//----- nvinfo : EIATTR_S2RCTAID_INSTR_OFFSETS
.align 4
/*0034*/ .byte 0x04, 0x1d
/*0036*/ .short (.L_16 - .L_15)
// ....[0]....
.L_15:
/*0038*/ .word 0x00000018
// ....[1]....
/*003c*/ .word 0x00000030
//----- nvinfo : EIATTR_EXIT_INSTR_OFFSETS
.align 4
.L_16:
/*0040*/ .byte 0x04, 0x1c
/*0042*/ .short (.L_18 - .L_17)
// ....[0]....
.L_17:
/*0044*/ .word 0x00000168
//----- nvinfo : EIATTR_MAX_THREADS
.align 4
.L_18:
/*0048*/ .byte 0x04, 0x05
/*004a*/ .short (.L_20 - .L_19)
.L_19:
/*004c*/ .word 0x00000004
/*0050*/ .word 0x00000001
/*0054*/ .word 0x00000001
.L_20:
//--------------------- .nv.rel.action --------------------------
.section .nv.rel.action,"",@"SHT_CUDA_RELOCINFO"
.align 8
.sectionentsize 8
/*0000*/ .byte 0x4b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x08,
0x10, 0x0a, 0x2f, 0x22
/*0010*/ .byte 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x08,
0x00, 0x00, 0x00, 0x00
/*0020*/ .byte 0x00, 0x00, 0x10, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x08,
0x00, 0x00, 0x00, 0x00
/*0030*/ .byte 0x00, 0x00, 0x20, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x08,
0x00, 0x00, 0x00, 0x00
/*0040*/ .byte 0x00, 0x00, 0x30, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x08,
0x00, 0x00, 0x00, 0x00
/*0050*/ .byte 0x01, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x08, 0x08,
0x00, 0x00, 0x00, 0x00
/*0060*/ .byte 0x01, 0x00, 0x10, 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x18, 0x08,
0x00, 0x00, 0x00, 0x00
/*0070*/ .byte 0x01, 0x00, 0x20, 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x28, 0x08,
0x00, 0x00, 0x00, 0x00
/*0080*/ .byte 0x01, 0x00, 0x30, 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x38, 0x08,
0x00, 0x00, 0x00, 0x00
/*0090*/ .byte 0x02, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08,
0x00, 0x00, 0x00, 0x00
/*00a0*/ .byte 0x02, 0x00, 0x10, 0x08, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x18, 0x08,
0x00, 0x00, 0x00, 0x00
/*00b0*/ .byte 0x02, 0x00, 0x20, 0x08, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x28, 0x08,
0x00, 0x00, 0x00, 0x00
/*00c0*/ .byte 0x02, 0x00, 0x30, 0x08, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x38, 0x08,
0x00, 0x00, 0x00, 0x00
/*00d0*/ .byte 0x00, 0x00, 0x00, 0x14, 0x2c, 0x00, 0x00, 0x00
//--------------------- .nv.constant0.E_101_101_2_2 --------------------------
.section .nv.constant0.E_101_101_2_2,"a",@progbits
.align 4
.nv.constant0.E_101_101_2_2:
.zero 328
//--------------------- .text.E_101_101_2_2 --------------------------
.section .text.E_101_101_2_2,"ax",@progbits
.sectioninfo @"SHI_REGISTERS=10"
.align 32
.global E_101_101_2_2
.type E_101_101_2_2,@function
.size E_101_101_2_2,(.L_x_1 - E_101_101_2_2)
.other E_101_101_2_2,@"STO_CUDA_ENTRY STV_DEFAULT"
E_101_101_2_2:
.text.E_101_101_2_2:
/*0008*/ MOV R1, c[0x0][0x20] ;
/*0010*/ { MOV32I R9, 0xa29ecf17 ;
/*0018*/ S2R R0, SR_CTAID.X }
/*0028*/ S2R R2, SR_TID.Y ;
/*0030*/ S2R R3, SR_CTAID.Y ;
/*0038*/ S2R R5, SR_TID.X ;
/*0048*/ ISCADD R0, R0, R2, 0x1 ;
/*0050*/ IADD32I R2, R0, 0x325 ;
/*0058*/ XMAD R2, R3, 0x324, R2 ;
/*0068*/ XMAD.PSL R2, R3.H1, 0x324, R2 ;
/*0070*/ XMAD R2, R5, 0x325, R2 ;
/*0078*/ XMAD.PSL R2, R5.H1, 0x325, R2 ;
/*0088*/ XMAD R4, R2, R9, RZ ;
/*0090*/ XMAD.U16.S16 R6, R2, R9.H1, RZ ;
/*0098*/ XMAD.S16.S16.CSFU R7, R2.H1, R9.H1, R2 ;
/*00a8*/ XMAD.S16.U16.CHI R4, R2.H1, R9, R4 ;
/*00b0*/ XMAD R8, R3, 0x194, R0 ;
/*00b8*/ IADD3.RS R0, R4, R6, R7 ;
/*00c8*/ XMAD.PSL R4, R3.H1, 0x194, R8 ;
/*00d0*/ SHR R3, R0, 0x9 ;
/*00d8*/ XMAD R4, R5, 0xca, R4 ;
/*00e8*/ MOV32I R8, 0xfffffcda ;
/*00f0*/ LEA.HI R3, R0, R3, RZ, 0x1 ;
/*00f8*/ XMAD.PSL R0, R5.H1, 0xca, R4 ;
/*0108*/ XMAD R4, R3, R8, R2 ;
/*0110*/ XMAD.MRG R5, R3, R8.H1, RZ ;
/*0118*/ SHR R6, R0.reuse, 0x1e ;
/*0128*/ ISCADD R2.CC, R0, c[0x0][0x140], 0x2 ;
/*0130*/ XMAD.PSL.CBCC R0, R3.H1, R5.H1, R4 ;
/*0138*/ ISET.LT.AND R0, R0, 0x193, PT ;
/*0148*/ IADD.X R3, R6, c[0x0][0x144] ;
/*0150*/ LOP32I.AND R0, R0, 0xfa0a1f00 ;
/*0158*/ STG.E [R2], R0 ;
/*0168*/ EXIT ;
.L_x_0:
/*0170*/ BRA `(.L_x_0) ;
/*0178*/ NOP;
.L_x_1:
*** CUDA 1827 E_101_101_2_2 arg 1 mem 11.52 GB tm 36.45us/ 35303.50ms (
11.20 GFLOPS 4.5|4.5 GB/s) ['realize']
Traceback (most recent call last):
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/device.py", line 152, in alloc
try: return super().alloc(size, options)
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/device.py", line 136, in alloc
return self._alloc(size, options if options is not None else BufferOptions())
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/runtime/ops_cuda.py", line 68, in
_alloc
return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/helpers.py", line 325, in init_c_var
def init_c_var(ctypes_var, creat_cb): return (creat_cb(ctypes_var), ctypes_var)[1]
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/runtime/ops_cuda.py", line 68, in
<lambda>
return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/runtime/ops_cuda.py", line 13, in
check
if status != 0: raise RuntimeError(f"CUDA Error {status},
{ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status,
ctypes.byref(x)))).decode()}") # noqa: E501
RuntimeError: CUDA Error 2, out of memory
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/ffamax/exo/exo/api/chatgpt_api.py", line 273, in handle_post_chat_completions
await asyncio.wait_for(
File "/usr/lib/python3.10/asyncio/tasks.py", line 445, in wait_for
return fut.result()
File "/home/ffamax/exo/exo/orchestration/standard_node.py", line 126, in process_prompt
resp = await self._process_prompt(base_shard, prompt, image_str, request_id, inference_state)
File "/home/ffamax/exo/exo/orchestration/standard_node.py", line 162, in _process_prompt
result, inference_state, is_finished = await self.inference_engine.infer_prompt(request_id, shard,
prompt, image_str, inference_state=inference_state)
File "/home/ffamax/exo/exo/inference/tinygrad/inference.py", line 64, in infer_prompt
h = await asyncio.get_event_loop().run_in_executor(self.executor, lambda: self.model(Tensor([toks]),
start_pos, TEMPERATURE).realize())
File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/ffamax/exo/exo/inference/tinygrad/inference.py", line 64, in <lambda>
h = await asyncio.get_event_loop().run_in_executor(self.executor, lambda: self.model(Tensor([toks]),
start_pos, TEMPERATURE).realize())
File "/home/ffamax/exo/exo/inference/tinygrad/models/llama.py", line 214, in __call__
return self.forward(tokens, start_pos, temperature, top_k, top_p, alpha_f, alpha_p)
File "/home/ffamax/exo/exo/inference/tinygrad/models/llama.py", line 202, in forward
h = layer(h, start_pos, freqs_cis, mask)
File "/home/ffamax/exo/exo/inference/tinygrad/models/llama.py", line 107, in __call__
h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask)
File "/home/ffamax/exo/exo/inference/tinygrad/models/llama.py", line 70, in __call__
self.cache_kv = Tensor.zeros(2, bsz, self.max_context, self.n_kv_heads, self.head_dim,
dtype=x.dtype).contiguous().realize()
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/tensor.py", line 3475, in _wrapper
if _METADATA.get() is not None: return fn(*args, **kwargs)
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/tensor.py", line 213, in realize
run_schedule(*self.schedule_with_vars(*lst), do_update_stats=do_update_stats)
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/engine/realize.py", line 224, in
run_schedule
ei.run(var_vals, do_update_stats=do_update_stats)
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/engine/realize.py", line 173, in run
bufs = [cast(Buffer, x) for x in self.bufs] if jit else [cast(Buffer, x).ensure_allocated() for x in
self.bufs]
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/engine/realize.py", line 173, in
<listcomp>
bufs = [cast(Buffer, x) for x in self.bufs] if jit else [cast(Buffer, x).ensure_allocated() for x in
self.bufs]
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/device.py", line 77, in
ensure_allocated
def ensure_allocated(self) -> Buffer: return self.allocate() if not hasattr(self, '_buf') else self
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/device.py", line 86, in allocate
self._buf = opaque if opaque is not None else self.allocator.alloc(self.nbytes, self.options)
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/device.py", line 155, in alloc
return super().alloc(size, options)
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/device.py", line 136, in alloc
return self._alloc(size, options if options is not None else BufferOptions())
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/runtime/ops_cuda.py", line 68, in
_alloc
return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/helpers.py", line 325, in init_c_var
def init_c_var(ctypes_var, creat_cb): return (creat_cb(ctypes_var), ctypes_var)[1]
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/runtime/ops_cuda.py", line 68, in
<lambda>
return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
File "/home/ffamax/exo/.venv/lib/python3.10/site-packages/tinygrad/runtime/ops_cuda.py", line 13, in
check
if status != 0: raise RuntimeError(f"CUDA Error {status},
{ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status,
ctypes.byref(x)))).decode()}") # noqa: E501
RuntimeError: CUDA Error 2, out of memory
Deregister callback_id='chatgpt-api-wait-response-2a883f12-fc4e-46fa-a5df-1ce276898ef4'
deregistered_callback=None
Received request: GET /v1/download/progress
update_peers: added=[] removed=[] updated=[]
unchanged=[<exo.networking.grpc.grpc_peer_handle.GRPCPeerHandle object at 0x7fdf962d3ca0>,
<exo.networking.grpc.grpc_peer_handle.GRPCPeerHandle object at 0x7fdf962d3790>] to_disconnect=[]
to_connect=[]
did_peers_change=False
Multiple nodes are available, yet the run hits an out-of-memory error on a single GPU — likely due to improper request distribution across the cluster.
The log above is the last meaningful output before the failure.
Any ideas?