To manage API usage across multiple projects without exceeding limits, consider implementing a function that calculates a delay in milliseconds for each request. The function factors in userPriority and botPriority, so paid users and higher-priority projects get faster responses while lower-priority requests tolerate longer delays. Here's a prototype from my current project for your review.
// Rate dimensions: RPM (requests per minute), RPD (requests per day), TPM (tokens per minute), TPD (tokens per day); IPM (images per minute) is not tracked yet
import { GPTTokens, supportModelType } from 'gpt-tokens'
import connector, { RatesResults } from './connector'
import logger from '../../lib/logger'
import Stats from '../../models/Stats'
import { StatKeys } from '../../models/Stats.types'
const log = logger({ module: 'RateLimiter.model.ts' })
export enum OpenAIModels {
'gpt-4' = 'gpt-4',
'gpt-4-1106-preview' = 'gpt-4-1106-preview',
'gpt-4-vision-preview' = 'gpt-4-vision-preview',
'gpt-3.5-turbo' = 'gpt-3.5-turbo',
'text-embedding-ada-002' = 'text-embedding-ada-002',
'whisper-1' = 'whisper-1',
'tts-1' = 'tts-1',
'dall-e-2' = 'dall-e-2',
'dall-e-3' = 'dall-e-3',
}
export interface MessageItem {
name?: string
role: 'system' | 'user' | 'assistant'
content: string
}
type RateSettings = {
RPM: number // requests per minute
RPD: number // requests per day
TPM: number // tokens per minute
TPD: number // tokens per day
CS: number // context size
}
const MSDAY = 1000 * 60 * 60 * 24
const MSMINUTE = 1000 * 60
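// In the table below, -1 means the dimension is not limited for that model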
const limits: { [model in OpenAIModels]?: RateSettings } = {
[OpenAIModels['gpt-4']]: { RPM: 10000, RPD: -1, TPM: 300000, TPD: -1, CS: 8190 },
[OpenAIModels['gpt-4-1106-preview']]: { RPM: 500, RPD: 10000, TPM: 300000, TPD: -1, CS: 128000 },
[OpenAIModels['gpt-4-vision-preview']]: { RPM: 20, RPD: 100, TPM: 300000, TPD: -1, CS: 128000 },
[OpenAIModels['gpt-3.5-turbo']]: { RPM: 10000, RPD: -1, TPM: 1000000, TPD: -1, CS: 4000 },
[OpenAIModels['text-embedding-ada-002']]: { RPM: 10000, RPD: -1, TPM: 5000000, TPD: -1, CS: -1 },
[OpenAIModels['whisper-1']]: { RPM: 100, RPD: -1, TPM: -1, TPD: -1, CS: -1 },
[OpenAIModels['tts-1']]: { RPM: 100, RPD: -1, TPM: -1, TPD: -1, CS: -1 },
[OpenAIModels['dall-e-2']]: { RPM: 100, RPD: -1, TPM: -1, TPD: -1, CS: -1 },
[OpenAIModels['dall-e-3']]: { RPM: 15, RPD: -1, TPM: -1, TPD: -1, CS: -1 },
}
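// Delay scaling by priority: a priority-1 request gets delay ×1.1, a priority-0 request ×5.1 (4 + 1.1); see getDelayForRate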
const lowestPriorityDelayMultiplier = 4
const highestPriorityDelayMultiplier = 1.1
class RateLimiter {
public async registerChatRequest(
model: OpenAIModels,
messages: MessageItem[],
tools: any[] = [], // tool definitions; their token cost is not counted yet
userPriority = 1, // from 0 (lowest) to 1 (highest)
botPriority = 1 // from 0 (lowest) to 1 (highest)
): Promise<boolean> {
try {
const usageInfo = new GPTTokens({
model: model as supportModelType,
messages,
})
const rateSetting: RateSettings | undefined = limits[model]
if (!rateSetting) {
throw new Error(`Invalid model [${model}]`)
}
if (rateSetting.CS > 0 && usageInfo.usedTokens > rateSetting.CS) {
throw new Error('Chat exceeds token limit')
}
const currentConsumeRates: RatesResults = await connector.getRates(model)
const delayMs: number = this.getDelayForRate(rateSetting, currentConsumeRates, userPriority, botPriority)
// pause until the request fits within the limits
if (delayMs > 0) {
await new Promise((resolve) => setTimeout(resolve, delayMs))
}
await connector.register(model, usageInfo.usedTokens)
await Stats.addStat({
[StatKeys.consumeRequests]: 1,
[StatKeys.consumeTokens]: usageInfo.usedTokens,
[StatKeys.consumeCash]: usageInfo.usedUSD,
})
return true
} catch (e) {
log.error(e, 'Error registering chat request')
return false
}
}
getDelayForRate(
rateSettings: RateSettings,
currentConsumeRates: RatesResults,
userPriority: number,
botPriority: number
): number {
// Delay needed to stay inside a single limit within its window.
// Guards the denominator: at or above the limit the headroom is <= 0,
// which would otherwise divide by zero, so we wait out a full window instead.
const delayForLimit = (used: number, limit: number, windowMs: number): number => {
if (limit <= 0) return 0 // -1 marks an unlimited dimension
const remaining = limit - used
if (remaining <= 0) return windowMs
return ((used / limit) * windowMs) / remaining
}
const rpmDelay = delayForLimit(currentConsumeRates.RPM, rateSettings.RPM, MSMINUTE)
const rpdDelay = delayForLimit(currentConsumeRates.RPD, rateSettings.RPD, MSDAY)
const tpmDelay = delayForLimit(currentConsumeRates.TPM, rateSettings.TPM, MSMINUTE)
const tpdDelay = delayForLimit(currentConsumeRates.TPD, rateSettings.TPD, MSDAY)
// take the most restrictive delay across all tracked limits
const delay = Math.max(rpmDelay, rpdDelay, tpmDelay, tpdDelay)
// priority multiplier: from highestPriorityDelayMultiplier (priority 1)
// up to lowestPriorityDelayMultiplier + highestPriorityDelayMultiplier (priority 0)
const totalPriorityK =
(1 - userPriority * botPriority) * lowestPriorityDelayMultiplier + highestPriorityDelayMultiplier
// rounded delay in ms required to stay within the rate limits
return Math.round(delay * totalPriorityK)
}
}
export default new RateLimiter()
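For context, here's roughly how I call it from a chat route. This is a minimal sketch: handleChat, the hard-coded model choice, and the import path are placeholders from my setup, not part of the limiter itself.

import RateLimiter, { OpenAIModels, MessageItem } from './RateLimiter.model'

// Hypothetical route handler: waits out the computed delay, then either
// proceeds or rejects if the limiter failed to register the request
async function handleChat(messages: MessageItem[], userPriority: number, botPriority: number) {
  const allowed = await RateLimiter.registerChatRequest(
    OpenAIModels['gpt-4-1106-preview'],
    messages,
    [], // no tools for this call
    userPriority,
    botPriority
  )
  if (!allowed) {
    throw new Error('Request rejected by rate limiter')
  }
  // ...the actual OpenAI call goes here
}

To sanity-check the delay math: with the gpt-4-1106-preview RPM limit of 500 and 400 requests already counted in the current minute, the base delay is ((400 / 500) × 60000) / (500 − 400) = 480 ms; a priority-1 request waits ~528 ms (×1.1), while a priority-0 request waits ≈2448 ms (×5.1).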