xorbitsai / xoscar

Python actor framework for heterogeneous computing.
https://xoscar.dev
Apache License 2.0
89 stars 21 forks source link

[BUG] ucx-py does not support IPV6 address #96

Closed frostyplanet closed 4 weeks ago

frostyplanet commented 2 months ago

Describe the bug

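ucx-py does not support IPv6 addresses: a pool created on an IPv6 address ends up listening on 0.0.0.0, and connecting to `ucx://::1:<port>` fails with `socket.gaierror: [Errno -9] Address family for hostname not supported`.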

To Reproduce

To help us to reproduce this bug, please provide information below:

  1. Your Python version: 3.11

  2. The version of Xoscar you use: found while fixing IPv6 support on the asyncio socket in https://github.com/xorbitsai/xoscar/pull/95

  3. Versions of crucial packages, such as numpy, scipy and pandas

  4. Full stack of the error.

Expected ucx to listen on the IPv6 address ::1, but it actually listened on 0.0.0.0:

>           ref2 = await ctx.actor_ref(address=f"ucx://::1:{port}", uid="test-ucx")                                                                                                                                

test_ucx.py:204:                                                                                         
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/local/lib/python3.11/site-packages/xoscar/backends/context.py:197: in actor_ref                                                                                                    
    future = await self._call(actor_ref.address, message, wait=False)                                                                                                                                              
/usr/local/lib/python3.11/site-packages/xoscar/backends/context.py:77: in _call                                                                                                         
    return await self._caller.call(                                                                                                                                                                                
/usr/local/lib/python3.11/site-packages/xoscar/backends/core.py:181: in call                                                                                                            
    client = await self.get_client(router, dest_address)                                                                                                                                                           
/usr/local/lib/python3.11/site-packages/xoscar/backends/core.py:68: in get_client                                                                                                       
    client = await router.get_client(dest_address, from_who=self)                                                                                                                                                  
/usr/local/lib/python3.11/site-packages/xoscar/backends/router.py:143: in get_client                                                                                                    
    client = await self._create_client(client_type, address, **kw)                                                                                                                                                 
/usr/local/lib/python3.11/site-packages/xoscar/backends/router.py:157: in _create_client                                                                                                
    return await client_type.connect(address, local_address=local_address, **kw)                                                                                                                                   
/usr/local/python3.11/site-packages/xoscar/backends/communication/ucx.py:519: in connect
    ucp_endpoint = await ucp.create_endpoint(host, port)                                                                                                                                                           
/usr/local/lib/python3.11/site-packages/ucp/core.py:1016: in create_endpoint  
    return await _get_ctx().create_endpoint(                                                                                                                                                                       
/usr/local/python3.11/site-packages/ucp/core.py:316: in create_endpoint   
    ucx_ep = ucx_api.UCXEndpoint.create(                                                                  

E   socket.gaierror: [Errno -9] Address family for hostname not supported
  5. Minimized code to reproduce the error.
from xoscar.utils import get_next_port
from xoscar.backends.pool import create_actor_pool
from xoscar import Actor
from xoscar.context import get_context
from xoscar.backends.indigen.pool import MainActorPool
from xoscar.backends.router import Router
import pytest
import sys
import os
import ucp
import asyncio
from xoscar import actor_ref

class TestUCXActor(Actor):
    """Small actor used to exercise the UCX channel in the reproduction.

    It stores an integer at construction time and exposes helpers that add to
    it, either directly or by delegating to another actor reference.
    """

    # Despite the ``Test`` prefix this is a helper, not a test case; pytest
    # skips collection of classes whose ``__test__`` attribute is false.
    __test__ = False

    def __init__(self, init_val: int):
        self._init_val = init_val

    def verify(self, enabled_internal_addr: bool):
        """Assert that the process-wide router looks as expected.

        The router's external address must start with ``"ucx"`` and its
        mapping must be non-empty. Every mapping value must be non-``None``
        when ``enabled_internal_addr`` is true, and ``None`` otherwise.
        """
        router = Router.get_instance()
        assert router.external_address.startswith("ucx")  # type: ignore
        assert len(router._mapping) > 0  # type: ignore
        if enabled_internal_addr:
            assert all(v is not None for v in router._mapping.values())  # type: ignore
        else:
            # no internal address
            assert all(v is None for v in router._mapping.values())  # type: ignore

    def add(self, n: int):
        """Return the stored initial value plus ``n``."""
        return self._init_val + n

    async def foo(self, ref, n: int):
        """Return the stored value plus the result of ``ref.add(n)``.

        ``ref`` must point at an actor living at a different address.
        """
        assert self.address != ref.address
        base = self._init_val
        remote_sum = await ref.add(n)
        return base + remote_sum

async def test_ucx_ipv6():
    """Reproduce the UCX IPv6 failure.

    Creates a main actor pool bound to the IPv6 wildcard address with ``ucx``
    as the external address scheme, creates an actor in it, and then resolves
    the same actor through the IPv6 loopback address ``::1``. According to the
    report, the ``actor_ref`` call over ``ucx://::1:<port>`` raises
    ``socket.gaierror`` because the pool is actually listening on 0.0.0.0.
    """
    # Sub-process start methods are not configurable on Windows, hence None.
    start_method = (
        os.environ.get("POOL_START_METHOD", "forkserver")
        if sys.platform != "win32"
        else None
    )
    port = get_next_port()
    # "::" is the IPv6 wildcard (unspecified) address, so the resulting
    # address string is ":::<port>".
    addr = f":::{port}"
    pool = await create_actor_pool(  # type: ignore
        addr,
        pool_cls=MainActorPool,
        n_process=0,
        subprocess_start_method=start_method,
        external_address_schemes=["ucx"],
    )
    print("addr", addr)
    async with pool:
        ctx = get_context()
        ref1 = await ctx.create_actor(
            TestUCXActor, init_val=0, address=pool.external_address, uid="test-ucx"
        )
        assert ref1.address == "ucx://" + addr
        # NOTE(review): presumably this long pause keeps the pool alive so the
        # listening socket can be inspected externally (e.g. with netstat).
        await asyncio.sleep(100)
        # Connecting through plain asyncio instead fails with
        # ConnectionRefusedError, because nothing listens on the IPv6 address:
        # ref2 = await ctx.actor_ref(address=f"::1:{port}", uid="test-ucx")
        ref2 = await ctx.actor_ref(address=f"ucx://::1:{port}", uid="test-ucx")
        assert await ref2.add(1) == 1
        assert await ref1.add(2) == 2

Expected behavior

ucp should be able to listen on and connect to the IPv6 loopback address.

Additional context

Output of the test case: `addr :::49294`

netstat -lupnv | grep 49294

tcp 0 0 0.0.0.0:49294 0.0.0.0:* LISTEN 2197743/python3.11

luweizheng commented 4 weeks ago

Fixed by https://github.com/xorbitsai/xoscar/pull/95