guidance-ai / guidance

A guidance language for controlling large language models.
MIT License

Unable to connect to Hugging Face Dedicated Endpoints? #945

michael-newsrx opened this issue 1 month ago (Open)

michael-newsrx commented 1 month ago

Is your feature request related to a problem? Please describe. I want to use guidance with a large model hosted on a Hugging Face Inference Endpoint. I don't see any classes for Hugging Face specifically, and setting the api_key does not appear to set the required Authorization: Bearer header.

Describe the solution you'd like A handler for Hugging Face hosted Inference Endpoints.

Describe alternatives you've considered I don't know of a way to pass a Hugging Face endpoint directly into the guidance code.

Ideally, a Hugging Face inference endpoint object could be passed directly into an appropriate guidance model wrapper.
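For reference, a protected dedicated endpoint only answers requests that carry the token in an Authorization: Bearer header, which is what setting api_key does not seem to produce. A rough sketch of the raw call against TGI's /generate route (the endpoint URL here is a placeholder):

import huggingface_hub
import requests

endpoint_url = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
token = huggingface_hub.get_token()

# Without the Authorization header a protected endpoint responds with 401.
response = requests.post(
        f"{endpoint_url}/generate",
        headers={"Authorization": f"Bearer {token}"},
        json={"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 64}},
)
response.raise_for_status()
print(response.json()["generated_text"])

The endpoint itself is created with the helper below.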

import huggingface_hub
from huggingface_hub import (InferenceEndpointType, create_inference_endpoint,
                             list_inference_endpoints)
from requests.exceptions import RequestException

def inference_endpoint1():
    hf_login_check()
    ep_name = "llama-3-70b-instruct-aws1"
    for ep in list_inference_endpoints(namespace="my company"):
        if ep.name == ep_name:
            ep.wait()
            return ep
    ep = create_inference_endpoint(  #
            ep_name,  #
            repository="meta-llama/Meta-Llama-3-70B-Instruct",  #
            framework="pytorch",  #
            accelerator="gpu",  #
            instance_size="x4",  #
            instance_type="nvidia-a100",  #
            region="us-east-1",  #
            vendor="aws",  #
            min_replica=0,  #
            max_replica=1,  #
            task="text-generation",  #
            type=InferenceEndpointType.PROTECTED,  #
            namespace="newsrx",  #
            custom_image={  #
                "health_route": "/health",  #
                "url": "ghcr.io/huggingface/text-generation-inference:2.1.1",  #
                "env": {  #
                    "MAX_BATCH_PREFILL_TOKENS": "32768",  #
                    "MAX_BATCH_TOTAL_TOKENS": "32768",  #
                    "MAX_INPUT_LENGTH": "16384",  #
                    "MAX_TOTAL_TOKENS": "32768",  #
                    "MODEL_ID": "/repository"},  #
            })
    ep.scale_to_zero()
    ep.wait()
    return ep

def hf_login_check():
    try:
        huggingface_hub.whoami()
    except RequestException as e:
        print("Not logged in or not connected to internet")
        huggingface_hub.login()

from huggingface_hub import InferenceEndpoint
from pprint import pprint
from guidance.models import Model
from local_models.mixtral_guidelines import load_llama_3_70b_instruct_chat_q2 as test_model
import guidance

api_key = huggingface_hub.get_token()

with BlockTimer() as timer:
    end_point: InferenceEndpoint = local_utils.hf.inference.inference_endpoint1()
    n_ctx: int = 16384  # Context window size
    llm: Model = guidance.models.LlamaCpp(end_point.url, api_key=api_key, compute_log_probs=True)
    print(f"Model load elapsed: {timer.formatted}")
michael-newsrx commented 1 month ago

Additional attempts:

Using OpenAI client

Works

    ep1 = inference_endpoint1()
    while ep1.status != "running":
        if ep1.status == "failed":
            raise RuntimeError(f"Failed to create inference endpoint: {ep1.name}")
        ep1.wait()

    import openai
    client = openai.OpenAI(  #
            base_url=ep1.url + "/v1/",  #
            api_key=hf_bearer_token(),  #
    )

    role_system = {"role": "system", "content": "You are a helpful assistant."}
    role_user = {"role": "user", "content": "What is deep learning?"}
    chat_completion = client.chat.completions.create(model="gpt-4o",  #
                                                     messages=[role_system, role_user],  #
                                                     stream=True, max_tokens=1024,
                                                     temperature=0.0)
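Since stream=True, chat_completion is an iterator of chunks; printing the streamed output looks roughly like this:

    for chunk in chat_completion:
        content = chunk.choices[0].delta.content
        if content is not None:
            print(content, end="")
    print()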

Using @guidance

Fails with streaming error

    llm = guidance.models.OpenAI(model="gpt-4o",  #
                                 base_url=ep1.url + "/v1/",  #
                                 api_key=hf_bearer_token(),  #
                                 echo=False)

    # llm = Transformers(ep1.url, echo=False, api_key=ep1.client.token)
    with system():
        llm += "I am an evil robot overlord."
    with user():
        llm += "What is your command?"
    with assistant():
        llm += gen()
    print(str(llm))

Stacktrace

Traceback (most recent call last):
  File "/home/michael/git/ai_newsletters/local_utils/hf/inference.py", line 290, in <module>
    main()
  File "/home/michael/git/ai_newsletters/local_utils/hf/inference.py", line 266, in main
    llm += gen()
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_model.py", line 1159, in __add__
    out = lm._run_stateless(value)
          ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_model.py", line 1364, in _run_stateless
    for chunk in gen_obj:
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_model.py", line 760, in __call__
    logits = self.get_logits(token_ids, forced_bytes, current_temp)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_grammarless.py", line 360, in get_logits
    raise new_bytes
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_grammarless.py", line 165, in _start_generator_stream
    for chunk in generator:
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_openai.py", line 156, in _generator_chat
    raise e
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/guidance/models/_openai.py", line 145, in _generator_chat
    for part in generator:
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/openai/_streaming.py", line 46, in __iter__
    for item in self._iterator:
  File "/home/michael/miniconda3/envs/ai_newsletters/lib/python3.12/site-packages/openai/_streaming.py", line 72, in __stream__
    raise APIError(
openai.APIError: An error occurred during streaming
michael-newsrx commented 1 month ago

top_p must be > 0.0 and < 1.0

Container logs show:

{
    "timestamp":"2024-07-11T17:32:24.361684Z",
    "level":"ERROR",
    "message":"`top_p` must be > 0.0 and < 1.0",
    "target":"text_generation_router::infer",
    "filename":"router/src/infer.rs",
    "line_number":137,
    "span":{
        "name":"generate_stream"
    },
    "spans":[
        {
            "name":"chat_completions"
        },{
            "name":"async_stream"
        },{
            "name":"generate_stream"
        }
    ]
}
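For what it's worth, the same validation error can be reproduced outside guidance by passing top_p=1.0 explicitly to the OpenAI client pointed at the endpoint (which appears to be what the guidance OpenAI wrapper ends up sending); omitting top_p, or using a value strictly between 0 and 1, avoids it. A rough sketch, reusing the client and messages from the earlier snippet:

# Rejected by text-generation-inference with "`top_p` must be > 0.0 and < 1.0".
chat_completion = client.chat.completions.create(model="tgi",  #
                                                 messages=[role_system, role_user],  #
                                                 max_tokens=64,  #
                                                 top_p=1.0)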
michael-conrad commented 1 month ago

Workaround

1) Use a custom httpx client that alters the JSON of the request.
2) Supply the custom httpx client to the OpenAI constructor.

Create a custom class extending DefaultHttpxClient

This class drops top_p from the JSON payload when it is set to 1.0, so the request falls back to the server's default instead of being rejected by TGI.

import typing

from httpx import Request
from httpx._client import USE_CLIENT_DEFAULT, UseClientDefault
# Note: these type aliases live in httpx's private _types module in current httpx releases.
from httpx._types import (CookieTypes, HeaderTypes, QueryParamTypes, RequestContent,
                          RequestData, RequestExtensions, RequestFiles, TimeoutTypes, URLTypes)
from openai import DefaultHttpxClient


class LocalHttpxClient(DefaultHttpxClient):
    def build_request(self,  #
                      method: str,  #
                      url: URLTypes,  #
                      *, content: RequestContent | None = None,  #
                      data: RequestData | None = None,  #
                      files: RequestFiles | None = None,  #
                      json: typing.Any | None = None,  #
                      params: QueryParamTypes | None = None,  #
                      headers: HeaderTypes | None = None,  #
                      cookies: CookieTypes | None = None,  #
                      timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,  #
                      extensions: RequestExtensions | None = None,  #
                      ) -> Request:
        # TGI rejects top_p == 1.0, so drop it and let the server use its default.
        if json is not None and isinstance(json, dict):
            if "top_p" in json and json["top_p"] == 1.0:
                del json["top_p"]
        # Delegate to the parent client so base_url, default headers, and timeouts
        # are still merged as usual.
        return super().build_request(method, url, content=content, data=data, files=files,
                                     json=json, params=params, headers=headers, cookies=cookies,
                                     timeout=timeout, extensions=extensions)

def test() -> None:
    test_ep: InferenceEndpoint = inference_endpoint1()
    while test_ep.status != "running":
        if test_ep.status == "failed":
            raise RuntimeError(f"Failed to create inference endpoint: {test_ep.name}")
        try:
            test_ep.wait(timeout=1)
        except InferenceEndpointTimeoutError:
            pass

    import openai
    httpx_client = LocalHttpxClient()
    client = openai.OpenAI(  #
            base_url=test_ep.url + "/v1",  #
            api_key=hf_bearer_token(),  #
            organization="NewsRx",  #
            http_client=httpx_client,  #
    )

    # print(f"Available models: {client.models.list()}")
    role_system = {"role": "system", "content": "I am an evil robot overlord."}
    role_user = {"role": "user", "content": "What is your command? Be very succinct."}
    chat_completion = client.chat.completions.create(model="tgi",  #
                                                     messages=[role_system, role_user],  #
                                                     stream=True,  #
                                                     max_tokens=1024,  #
                                                     temperature=0.0,  #
                                                     )

    print("=" * 40)
    for chunk in chat_completion:
        content = chunk.choices[0].delta.content
        if content is not None:
            print(content, end="")
    print()
    print()

    xllm = guidance.models.OpenAI(model="gpt-3.5-turbo",  #
                                  base_url=test_ep.url + "/v1",  #
                                  api_key=hf_bearer_token(),  #
                                  echo=False,  #
                                  organization="NewsRx",  #
                                  http_client=httpx_client,  #
                                  )
    llm = xllm
    with system():
        llm += "I am an evil robot overlord."
    with user():
        llm += "What is your command? Be very succinct."
    with assistant():
        llm += gen()
    print(str(llm))