Cainier / gpt-tokens

Calculate the token consumption and cost of OpenAI GPT messages
MIT License

[Idea] self-discipline plugin #36

Open gen4sp opened 11 months ago

gen4sp commented 11 months ago

To manage API usage across multiple projects without exceeding limits, consider implementing a function that calculates a delay in milliseconds for each request. This function should factor in userPriority and botPriority, ensuring quicker responses for paid users and higher-priority projects, while lower-priority requests can tolerate longer delays. Here's a prototype from my current project for your review.

// RPM (requests per minute), RPD (requests per day), TPM (tokens per minute), TPD (tokens per day), and IPM (images per minute)

import { GPTTokens, supportModelType } from 'gpt-tokens'

import connector, { RatesResults } from './connector'
import logger from '../../lib/logger'
import Stats from '../../models/Stats'
import { StatKeys } from '../../models/Stats.types'
const log = logger({ module: 'RateLimiter.model.ts' })

export enum OpenAIModels {
  'gpt-4' = 'gpt-4',
  'gpt-4-1106-preview' = 'gpt-4-1106-preview',
  'gpt-4-vision-preview' = 'gpt-4-vision-preview',
  'gpt-3.5-turbo' = 'gpt-3.5-turbo',
  'text-embedding-ada-002' = 'text-embedding-ada-002',
  'whisper-1' = 'whisper-1',
  'tts-1' = 'tts-1',
  'dall-e-2' = 'dall-e-2',
  'dall-e-3' = 'dall-e-3',
}
export interface MessageItem {
  name?: string
  role: 'system' | 'user' | 'assistant'
  content: string
}
type RateSettings = {
  RPM: number //  requests per minute
  RPD: number //  requests per day
  TPM: number //  tokens per minute
  TPD: number // tokens per day
  CS: number // context size
}
const MSDAY = 1000 * 60 * 60 * 24
const MSMINUTE = 1000 * 60

const limits: { [model in OpenAIModels]?: RateSettings } = {
  [OpenAIModels['gpt-4']]: { RPM: 10000, RPD: -1, TPM: 300000, TPD: -1, CS: 8190 },
  [OpenAIModels['gpt-4-1106-preview']]: { RPM: 500, RPD: 10000, TPM: 300000, TPD: -1, CS: 128000 },
  [OpenAIModels['gpt-4-vision-preview']]: { RPM: 20, RPD: 100, TPM: 300000, TPD: -1, CS: 128000 },
  [OpenAIModels['gpt-3.5-turbo']]: { RPM: 10000, RPD: -1, TPM: 1000000, TPD: -1, CS: 4000 },
  [OpenAIModels['text-embedding-ada-002']]: { RPM: 10000, RPD: -1, TPM: 5000000, TPD: -1, CS: -1 },
  [OpenAIModels['whisper-1']]: { RPM: 100, RPD: -1, TPM: -1, TPD: -1, CS: -1 },
  [OpenAIModels['tts-1']]: { RPM: 100, RPD: -1, TPM: -1, TPD: -1, CS: -1 },
  [OpenAIModels['dall-e-2']]: { RPM: 100, RPD: -1, TPM: -1, TPD: -1, CS: -1 },
  [OpenAIModels['dall-e-3']]: { RPM: 15, RPD: -1, TPM: -1, TPD: -1, CS: -1 },
}
const lowestPriorityDelayMultiplier = 4
const highestPriorityDelayMultiplier = 1.1
class RateLimiter {
  constructor() {
    //
  }
  public async registerChatRequest(
    model: OpenAIModels,
    messages: MessageItem[],
    tools: any[] = [],
    userPriority = 1, // from 0 (lowest) to 1 (highest)
    botPriority = 1 // from 0 (lowest) to 1 (highest)
  ): Promise<boolean> {
    //
    try {
      const usageInfo = new GPTTokens({
        model: model as supportModelType,
        messages,
      })
      const rateSetting: RateSettings | undefined = limits[model]
      if (!rateSetting) {
        throw new Error(`Invalid model [${model}]`)
      }

      if (rateSetting.CS > 0 && usageInfo.usedTokens > rateSetting.CS) {
        throw new Error('Chat exceeds token limit')
      }

      const currentConsumeRates: RatesResults = await connector.getRates(model)
      const delayMs: number = this.getDelayForRate(rateSetting, currentConsumeRates, userPriority, botPriority)

      // pause for a bit to stay within the rate limits
      if (delayMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, delayMs))
      }

      await connector.register(model, usageInfo.usedTokens)
      await Stats.addStat({
        [StatKeys.consumeRequests]: 1,
        [StatKeys.consumeTokens]: usageInfo.usedTokens,
        [StatKeys.consumeCash]: usageInfo.usedUSD,
      })
      return true
    } catch (e) {
      log.error(e, 'Error registering chat request')
      return false
    }
  }

  getDelayForRate(
    rateSettings: RateSettings,
    currentConsumeRates: RatesResults,
    userPriority: number,
    botPriority: number
  ): number {
    // find the delay implied by each rate limit (a limit of -1 means the dimension is unlimited)
    const rpmDelay =
      rateSettings.RPM > 0
        ? ((currentConsumeRates.RPM / rateSettings.RPM) * MSMINUTE) / (rateSettings.RPM - currentConsumeRates.RPM)
        : 0
    const rpdDelay =
      rateSettings.RPD > 0
        ? ((currentConsumeRates.RPD / rateSettings.RPD) * MSDAY) / (rateSettings.RPD - currentConsumeRates.RPD)
        : 0
    const tpmDelay =
      rateSettings.TPM > 0
        ? ((currentConsumeRates.TPM / rateSettings.TPM) * MSMINUTE) / (rateSettings.TPM - currentConsumeRates.TPM)
        : 0
    const tpdDelay =
      rateSettings.TPD > 0
        ? ((currentConsumeRates.TPD / rateSettings.TPD) * MSDAY) / (rateSettings.TPD - currentConsumeRates.TPD)
        : 0
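
    // Worked example (hypothetical numbers): with an RPM limit of 500 and a current rate of
    // 400 requests per minute, rpmDelay = (400 / 500 * 60000) / (500 - 400) = 480 ms;
    // the delay grows as the current rate approaches the limit.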

    // take the highest of the per-limit delays and scale it for low-priority requests
    const delay = Math.max(rpmDelay, rpdDelay, tpmDelay, tpdDelay)
    // priority multiplier: highestPriorityDelayMultiplier when both priorities are 1,
    // rising to lowestPriorityDelayMultiplier + highestPriorityDelayMultiplier when both are 0
    const totalPriorityK =
      (1 - userPriority * botPriority) * lowestPriorityDelayMultiplier + highestPriorityDelayMultiplier
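    // e.g. (hypothetical values) userPriority = 0.5, botPriority = 1 → (1 - 0.5) * 4 + 1.1 = 3.1× the base delay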
    // rounded delay in ms needed to stay within the rate limits
    return Math.round(delay * totalPriorityK)
  }
}

export default new RateLimiter()
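
For context, here's a minimal sketch of how the exported singleton could be used from another module; the import path, message contents, and priority values are hypothetical placeholders, not part of the prototype above.

import rateLimiter, { OpenAIModels, MessageItem } from './RateLimiter.model'

const messages: MessageItem[] = [
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'user', content: 'Summarize the latest report.' },
]

// Delays the call if current consumption is close to the model's limits,
// then records request/token/cost stats; returns false if the request was rejected.
const accepted = await rateLimiter.registerChatRequest(
  OpenAIModels['gpt-4-1106-preview'],
  messages,
  [], // no tools
  0.5, // userPriority: mid-tier user
  1 // botPriority: high-priority project
)
if (!accepted) {
  // invalid model, context-size overflow, or an internal error was logged
}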