HemulGM / DelphiOpenAI

OpenAI API wrapper for Delphi. Use ChatGPT, DALL-E, Whisper and other products.
MIT License
233 stars 57 forks

Add-on to use Vision in this version #27

Closed MaxiDonkey closed 10 months ago

MaxiDonkey commented 10 months ago

This version does not take Vision into account. Here is some sample code to support this.

Add the OpenAI.Vision.Images unit to manage the images sent to the model for analysis.

unit OpenAI.Vision.Images;

interface

uses
  System.SysUtils, System.Classes;

type
  /// <summary>
  /// How the model accesses the image.
  /// Warning: images can be passed in the user and assistant messages. Images are currently not
  /// supported in the first system message, but this may change in the future.
  /// </summary>
  TImageSourceType = (
    /// <summary>
    /// The model accesses the image through a link (URL) to the image.
    /// </summary>
    FromUrl,
    /// <summary>
    /// The model accesses the image through the base64-encoded image passed directly in the request.
    /// </summary>
    FromBase64);

  TImageFormatType = (jpeg, png);

  TImageFormatTypeHelper = record helper for TImageFormatType
    function ToString: string;
  end;

  TImageDetail = (
    /// <summary>
    /// By default, the model will use the auto setting which will look at the image input size and
    /// decide if it should use the low or high setting.
    /// </summary>
    auto,
    /// <summary>
    /// low will disable the “high res” model. The model will receive a low-res 512px x 512px version
    /// of the image, and represent the image with a budget of 65 tokens. This allows the API to return
    /// faster responses and consume fewer input tokens for use cases that do not require high detail.
    /// </summary>
    low,
    /// <summary>
    /// high will enable “high res” mode, which first allows the model to see the low res image and then
    /// creates detailed crops of input images as 512px squares based on the input image size. Each of
    /// the detailed crops uses twice the token budget (65 tokens) for a total of 129 tokens.
    /// </summary>
    high);

  TImageDetailHelper = record helper for TImageDetail
    function ToString: string;
  end;

  TImageSource = record
  private
    FType: TImageSourceType;
    FFormat: TImageFormatType;
    FDetail: TImageDetail;
    FValue: string;
  public
    /// <summary>
    /// By controlling the detail parameter, which has three options, low, high, or auto, you have control over
    /// how the model processes the image and generates its textual understanding.
    /// </summary>
    property Detail: TImageDetail read FDetail write FDetail;
    /// <summary>
    /// The Value field is either the URL of the image to query, or the base64-encoded image (as a
    /// data URL) that will be transmitted with the request.
    /// </summary>
    property Value: string read FValue write FValue;
    class function Create(const Url: string; const DetailValue: TImageDetail = auto): TImageSource; overload; static;
    class function Create(const Format: TImageFormatType; const Base64Img: string;
      const DetailValue: TImageDetail = auto): TImageSource; overload; static;
  end;

  TChatVisionBuild = record
  private
    FContent: string;
    FImageSources: TArray<TImageSource>;
  public
    property Content: string read FContent write FContent;
    property ImageSources: TArray<TImageSource> read FImageSources write FImageSources;
    class function Create(const ContextText: string; Images: TArray<TImageSource>): TChatVisionBuild; static;
  end;

implementation

{ TImageFormatTypeHelper }

function TImageFormatTypeHelper.ToString: string;
begin
  case Self of
    TImageFormatType.jpeg:
      Exit('data:image/jpeg;base64,%s');
    TImageFormatType.png:
      Exit('data:image/png;base64,%s');
  end;
end;

{ TImageSource }

class function TImageSource.Create(const Url: string;
  const DetailValue: TImageDetail): TImageSource;
begin
  Result.FType := TImageSourceType.FromUrl;
  Result.FDetail := DetailValue;
  Result.FValue := Url;
end;

class function TImageSource.Create(const Format: TImageFormatType;
  const Base64Img: string; const DetailValue: TImageDetail): TImageSource;
begin
  Result.FType := TImageSourceType.FromBase64;
  Result.FFormat := Format;
  Result.FDetail := DetailValue;
  Result.FValue := System.SysUtils.Format(Format.ToString, [Base64Img]);
end;

{ TChatVisionBuild }

class function TChatVisionBuild.Create(const ContextText: string;
  Images: TArray<TImageSource>): TChatVisionBuild;
begin
  Result.FContent := ContextText;
  Result.FImageSources := Images;
end;

{ TImageDetailHelper }

function TImageDetailHelper.ToString: string;
begin
  case Self of
    TImageDetail.auto:
      Exit('auto');
    TImageDetail.low:
      Exit('low');
    TImageDetail.high:
      Exit('high');
  end;
end;

end.
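
As a quick check of the two constructors (a minimal sketch; the identifiers come from the unit above, while the URL and the base64 value are placeholders):

  var FromLink := TImageSource.Create('https://example.com/photo.jpg', TImageDetail.high);
  var FromData := TImageSource.Create(TImageFormatType.png, '<base64-encoded bytes>', TImageDetail.low);
  // FromLink.Value holds the URL unchanged; FromData.Value now starts with
  // 'data:image/png;base64,' as expected by the image_url field of the API.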

Then modify the OpenAI.Chat unit to support Vision.

unit OpenAI.Chat;

interface

uses
  System.SysUtils, OpenAI.API.Params, OpenAI.API, OpenAI.Chat.Functions,
  System.Classes, REST.JsonReflect, System.JSON,
  OpenAI.Vision.Images;

......

  TChatParams = class(TJSONParam)
    /// <summary>
    /// ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.
    /// </summary>
    /// <seealso>https://platform.openai.com/docs/models/model-endpoint-compatibility</seealso>
    function Model(const Value: string): TChatParams;
.........
    /// <summary>
    /// GPT-4 with Vision, sometimes referred to as GPT-4V or gpt-4-vision-preview in the API, allows the model to take in images
    /// and answer questions about them.
    /// </summary>
    function Vision(const Context: string; Images: TArray<TImageSource>;
      const Role: TMessageRole = TMessageRole.User): TChatParams;
    constructor Create; override;
  end;

........ 

function TChatParams.Vision(const Context: string;
  Images: TArray<TImageSource>; const Role: TMessageRole): TChatParams;
var
  Item: TImageSource;
  JSON: TJSONObject;
  JSONImgObj: TJSONObject;
  Items: TJSONArray;
  ArrayMessage: TJSONArray;
begin
  case Role of
    TMessageRole.User, TMessageRole.Assistant: ;
    else raise Exception.CreateFmt('Inappropriate role (%s)', [Role.ToString]);
  end;
  Items := TJSONArray.Create;
  try
    JSON := TJSONObject.Create;
    {"type": "text", "text": "What’s in this image?"}
    JSON.AddPair('type', 'text');
    JSON.AddPair('text', Context);
    Items.Add(JSON);
    for Item in Images do
    begin
      JSON := TJSONObject.Create;
      {"type": "image_url",
       "image_url": {
          "url": "Url or Image content to base64 string",
          "detail": "auto/low/high"}
      JSON.AddPair('type', 'image_url');
      JSONImgObj := TJSONObject.Create;
      JSONImgObj.AddPair('url', Item.Value);
      JSONImgObj.AddPair('detail', Item.Detail.ToString);
      JSON.AddPair('image_url', JSONImgObj);
      Items.Add(JSON);
    end;
    JSON := TJSONObject.Create;
    {"role": "user/assistant"}
    JSON.AddPair('role', Role.ToString);
    {"content": "content_value"}
    JSON.AddPair('content', Items);
    ArrayMessage := TJSONArray.Create;
    ArrayMessage.Add(JSON);
  except
    Items.Free;
    raise;
  end;

  Result := TChatParams(Add('messages', ArrayMessage));
end;
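
For reference, the "messages" entry that Vision builds should serialize roughly like the examples at https://platform.openai.com/docs/guides/vision (the values below are placeholders):

  "messages": [
    {"role": "user",
     "content": [
       {"type": "text", "text": "What’s in this image?"},
       {"type": "image_url",
        "image_url": {"url": "https://... or data:image/png;base64,...", "detail": "low"}}
     ]}
  ]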

To use Vision. Example 1: passing a link to the image to be processed.

var Chat := OpenAI.Chat.Create(
    procedure(Params: TChatParams)
    begin
      Params.Model('gpt-4-vision-preview');
      Params.Vision(MemoQuery.Text, [TImageSource.Create('https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg', TImageDetail.low)], TMessageRole.User);

      Params.MaxTokens(300);
    end);
  try
    for var Choice in Chat.Choices do
      MemoResult.Lines.Add(Choice.Message.Content);
  finally
    Chat.Free;
  end;

Example 2: uploading the image (base64-encoded) with the query.

var Chat := OpenAI.Chat.Create(
    procedure(Params: TChatParams)
    begin
      Params.Model('gpt-4-vision-preview');
      Params.Vision(MemoQuery.Text, [TImageSource.Create(TImageFormatType.png, Image1.Picture.ToBase64, TImageDetail.low)], TMessageRole.User);
      Params.MaxTokens(300);
    end);
  try
    for var Choice in Chat.Choices do
      MemoResult.Lines.Add(Choice.Message.Content);
  finally
    Chat.Free;
  end;

If Image1 is a VCL TImage component, the following class helper converts its picture to a base64 string:

// Requires System.Classes, System.SysUtils, System.NetEncoding and Vcl.Graphics in the uses clause.
type
  TPictureHelper = class helper for TPicture
    function ToBase64: string;
  end;

{ TPictureHelper }

function TPictureHelper.ToBase64: string;
begin
  var Input := TMemoryStream.Create;
  var Output := TStringStream.Create(EmptyStr, TEncoding.UTF8);
  try
    Self.SaveToStream(Input);
    Input.Position := 0;
    TNetEncoding.Base64.Encode(Input, Output);
    Result := Output.DataString;
  finally
    Input.Free;
    Output.Free;
  end;
end;
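
Note that TNetEncoding.Base64 wraps its output with line breaks every 76 characters by default. If the API rejects the wrapped payload, a possible workaround (a sketch, not part of the original helper) is to strip the breaks before returning:

    // Hypothetical variant of the assignment above: remove the line breaks
    // inserted by TNetEncoding.Base64 so the base64 payload stays on one line.
    Result := StringReplace(Output.DataString, sLineBreak, '', [rfReplaceAll]);
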
HemulGM commented 10 months ago

Hi. Thanks, but I have already implemented this)) Check it)

(screenshot attached)

MaxiDonkey commented 10 months ago

I'm sorry, but I can't find any TMessageContent class after searching all the units, so I can't reproduce your example. Am I missing something? My approach was to reproduce the examples given here: https://platform.openai.com/docs/guides/vision

HemulGM commented 10 months ago

You need to update the library from the repository. I updated it literally the moment I responded to the issue.

HemulGM commented 10 months ago

https://github.com/HemulGM/DelphiOpenAI/blob/main/OpenAI.Chat.pas#L156