ggerganov / llama.cpp

LLM inference in C/C++
MIT License

[Vulkan] Extreme slowdown when chatting! #8745

Open jarroddavis68 opened 1 month ago

jarroddavis68 commented 1 month ago
  1. When chatting with the Hermes-2-Pro-Llama-3-8B-GGUF model, I get about four questions in and then token generation becomes extremely slow.
  2. I pass the model's response to the previous question back in as an assistant message to keep context.
  3. I'll assume it's an issue with the way I'm doing inference. The slowdown sets in quickly and is unexpected. Using the Vulkan backend on an RTX 3060 (12 GB VRAM), 32 GB of RAM.
  4. Here is my inference code; maybe someone can point out what I could be doing wrong (see the sketch after this list):
    
function TGenAI.RunInference(const AModelName: string; const AMaxTokens: UInt32): Boolean;
var
  LPast: UInt32;
  LRemain: UInt32;
  LConsumed: UInt32;
  LSamplingContext: Pointer;
  I: UInt32;
  LPredict: UInt32;
  LBatch: UInt32;
  LEval: UInt32;
  LId: llama_token;
  LMaxEmbedSize: UInt32;
  LSkippedTokens: UInt32;
  LEmbedInput: TVector<llama_token>;
  LEmbed: TVector<llama_token>;
  LTimings: llama_timings;
  LTokenStr: string;
  LFirstToken: Boolean;

begin
  Result := False;

  try
    // check if inference is already running
    if FInference.Active then
    begin
      SetError('[%s] Inference already active', ['RunInference']);
      Exit;
    end;

// start new inference
FInference := Default(TInference);

// check if model not loaded
if not LoadModel(AModelName) then
begin
  Exit;
end;

// build prompt message
FInference.Prompt := BuildMessageInferencePrompt(AModelName);
if FInference.Prompt.IsEmpty then
begin
  SetError('[%s] Inference prompt was empty', ['RunInference']);
  Exit;
end;

FInference.Active := True;
FInference.Response := '';

OnInferenceStart();
try
  LEmbedInput := tokenize(FContext, FInference.Prompt, true, true);
  try
    if LEmbedInput.empty() then
      LEmbedInput.Add(llama_token_bos(FModel));

    LMaxEmbedSize := llama_n_ctx(FContext) - 4;
    if LEmbedInput.Count() > LMaxEmbedSize then
    begin
      LSkippedTokens := LEmbedInput.count() - LMaxEmbedSize;
      SetError('[%s] Input too long: %d tokens over max context of %d',
        ['RunInference', LSkippedTokens, LMaxEmbedSize]);
      Exit;
    end;

    LEmbed := TVector<llama_token>.Create();
    try
      LSamplingContext := llama_sampling_init();
      try
        LPredict := AMaxTokens;
        LBatch := FContextParams.n_ubatch;

        LPast := 0;
        LRemain := LPredict;
        LConsumed := 0;
        LFirstToken := True;

        llama_reset_timings(FContext);
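        // main generation loop: decode whatever is pending in LEmbed (prompt
        // chunks first, then each newly sampled token) and sample until
        // AMaxTokens have been produced, an end-of-generation token appears,
        // or the cancel callback fires. Note that LPast starts at 0 on every
        // call, so the full rebuilt conversation prompt is re-decoded each turn.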
        while LRemain <> 0 do
        begin
          if OnInferenceCancel() then
          begin
            Break;
          end;

          if LEmbed.Count <> 0 then
          begin
            I := 0;
            while I < LEmbed.Count do
            begin
              LEval := LEmbed.Count - I;
              if LEval > LBatch then
                LEval := LBatch;

              if llama_decode(FContext,
                llama_batch_get_one(@LEmbed.FItems[I], LEval, LPast, 0)) <> 0 then
              begin
                SetError('Error in llama_decode with Vulkan backend');
                Break;
              end;

              Inc(LPast, LEval);
              Inc(I, LBatch);
            end;
            LEmbed.Clear;
          end;

          if LEmbedInput.Count <= LConsumed then
            begin
              LId := llama_sampling_sample(LSamplingContext, FContext, nil);
              if llama_token_is_eog(FModel, LId) then
              begin
                Break;
              end;

              llama_sampling_accept(LSamplingContext, FContext, LId, True);
              LEmbed.Add(LId);
              Dec(LRemain);

              LTokenStr := TokenToPiece(FContext, LId, False);
              if LFirstToken then
              begin
                LFirstToken := False;
                LTokenStr := LTokenStr.TrimLeft();
              end;

              FInference.Response := FInference.Response + LTokenStr;
              OnInferenceToken(LTokenStr);

            end
          else
            begin
              while LEmbedInput.Count > LConsumed do
              begin
                LEmbed.Add(LEmbedInput[LConsumed]);
                llama_sampling_accept(LSamplingContext, FContext, LEmbedInput[LConsumed], False);
                Inc(LConsumed);
                if LEmbed.Count >= LBatch then
                begin
                  Break;
                end;
              end;
            end;
        end;

        // get usage
        LTimings := llama_get_timings(FContext);
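        // n_p_eval / t_p_eval_ms cover prompt (input) evaluation and
        // n_eval / t_eval_ms cover generated tokens; the speeds below are tokens per second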
        FStats.InputTokens := LTimings.n_p_eval;
        FStats.OutputTokens := LTimings.n_eval;
        FStats.TokenInputSpeed := 1e3 / LTimings.t_p_eval_ms * LTimings.n_p_eval;
        FStats.TokenOutputSpeed := 1e3 / LTimings.t_eval_ms * LTimings.n_eval;
        FStats.TotalTokens := FStats.InputTokens + FStats.OutputTokens;
        Result := True;
      finally
        llama_sampling_free(LSamplingContext);
      end;
    finally
      LEmbed.Free();
    end;
  finally
    LEmbedInput.Free();
  end;
finally
  FInference.Active := False;
  OnInferenceEnd();
end;

  except
    on E: Exception do
    begin
      SetError(E.Message);
      Exit;
    end;
  end;
end;


5. Also, side note: I can no longer load phi3-mini models. Both the 4k and 128k variants fail to load now. **Update**: Hmm, I see phi-3.1 GGUF versions on HF which seem to work. OK then.
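Since the prompt above is rebuilt with the full conversation and decoded from position 0 on every call, prompt-evaluation work grows with every turn. For reference, here is a minimal sketch of one way to avoid that: keep the llama context (and its KV cache) alive between turns and only decode the tokens appended since the previous turn. The `FPrevTokens` field, the `DecodeNewTokensOnly` name, and the exact binding signatures are assumptions layered on top of the code above, not part of the original project.

```pascal
// Sketch only: reuse the KV cache across turns so each call decodes just the
// newly appended tokens instead of the whole conversation.
// FPrevTokens (the tokens already sitting in the cache) is a hypothetical field.
procedure TGenAI.DecodeNewTokensOnly(const ATokens: TVector<llama_token>);
var
  LCommon: UInt32;
  I: UInt32;
  LEval: UInt32;
begin
  // length of the prefix that is already in the KV cache
  LCommon := 0;
  while (LCommon < ATokens.Count) and (LCommon < FPrevTokens.Count) and
        (ATokens[LCommon] = FPrevTokens[LCommon]) do
    Inc(LCommon);

  // drop stale cache entries past the shared prefix (sequence 0, to the end)
  llama_kv_cache_seq_rm(FContext, 0, LCommon, -1);

  // decode only the new suffix, continuing from position LCommon
  I := LCommon;
  while I < ATokens.Count do
  begin
    LEval := ATokens.Count - I;
    if LEval > FContextParams.n_ubatch then
      LEval := FContextParams.n_ubatch;
    if llama_decode(FContext,
      llama_batch_get_one(@ATokens.FItems[I], LEval, I, 0)) <> 0 then
      Exit;
    Inc(I, LEval);
  end;

  // remember what the cache now holds; sampling then continues from
  // position ATokens.Count instead of re-feeding the whole prompt
  FPrevTokens := ATokens;
end;
```

With something like this, each turn only decodes the latest user message (plus the reply generated on the previous turn), so per-turn prompt-evaluation time stays roughly flat instead of growing with the length of the chat.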
foldl commented 1 month ago

Off topic: Love your Pascal code. I am just wondering why you are using it. It would be a good choice if there were a Markdown component.

jarroddavis68 commented 1 month ago

> Off topic: Love your Pascal code. I am just wondering why you are using it. It would be a good choice if there were a Markdown component.

Thx. I've been using Pascal since 1986; it's my default language. So I have the extra pain of doing the header conversion and trying to keep everything updated, but I have scripts to automate most things, so it's not too bad now.

Here is a quick video. The delay between the first click and the response is just the model loading. But with each query it gets slower and more sluggish. If I had continued (I need to keep the vid under 10 MB), there would have been a very, very long delay between responses.

BTW, there is a markdown viewer component that I plan to use eventually.

https://github.com/user-attachments/assets/24075b7b-d928-4484-b6ef-cbd22d2a72de