hiro-v commented 1 month ago

Motivation

Standardized way to get performance metrics as defined in Databricks LLM inference.
Standardized way to get hardware data as defined in Notion for hardware data related to motherboard, CPU, GPU, memory, disk, psu, chassis.
Standardized way to get model runtime and kernel data during inference.

Discussion

Start cortex


# 1. Install the NPM package
npm i -g @janhq/cortex

2. Initialize a compatible engine

cortex init

3. Download a GGUF model from Hugging Face

Models save to $(npm list -g)/node_modules

cortex models pull janhq/TinyLlama-1.1B-Chat-v1.0-GGUF

4. Load the model

cortex models start janhq/TinyLlama-1.1B-Chat-v1.0-GGUF

- Start the benchmark CLI (not bundled with cortex), which is written in Node.js
```yaml
api:
    url: <cortex_url>
    api_key: <>
prompts:
    min: 1024
    max: 2048
    samples: 10
output: json
hardware: [cpu, gpu, psu, chassis, ram]
```

cortex-benchmark shoot

sequenceDiagram
    participant User
    participant Program
    participant SystemInfo
    participant OpenAI_API
    participant Monitor

    User->>+Program: Start (api_url, api_key, monitor_interval)
    Program->>+SystemInfo: Retrieve Hardware Metadata
    SystemInfo-->>-Program: Metadata

    loop Benchmarking Loop
        Program->>+OpenAI_API: Send Prompt (1024-2048 tokens)
        OpenAI_API-->>-Program: Response (latency, tokens)
    end

    loop Monitoring Loop
        Program->>+Monitor: Check CPU & Memory
        Monitor-->>-Program: Usage Data
    end

    Program->>+Program: Calculate Percentiles
    Program-->>-User: Display Metrics and Usage

Resource Metadata Retrieval: At the start of the program, it gathers detailed metadata about the CPU, memory, disks, and graphics systems.
OpenAI API compatible Benchmarking: The program tests the OpenAI API by submitting prompts of varying token lengths (from 1024 to 2048 tokens) to measure latency and time per output token. It iterates over 10 evenly spaced lengths between the configured minimum and maximum, which for the 1024–2048 range is a step of roughly 114 tokens per iteration.
Resource Monitoring: System resources (CPU and memory usage) are monitored at user-specified intervals, allowing for ongoing observation without impacting the benchmark tests.
Metrics Calculation: At the end of the monitoring period, the program calculates and displays the 50th, 75th, and 99th percentiles for CPU and memory usage, providing insights into the system's performance under load.

Resources

hiro-v commented 1 month ago

My current implementation, gonna refactor

const si = require('systeminformation');
const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));
const fs = require('fs');
const yaml = require('js-yaml');
const { Command } = require('commander');

// Command-line handling: the script accepts a single --config <path> option.
const program = new Command();
program
  .option('--config <path>', 'Path to the configuration file')
  .parse(process.argv);

const options = program.opts();
const hasConfigPath = Boolean(options.config);
if (!hasConfigPath) {
  console.error('Configuration file path is required.');
  process.exit(1);
}

// Read and parse the YAML configuration. Any read or parse failure is fatal
// because every later step reads from `config`.
let config;
try {
  const rawConfig = fs.readFileSync(options.config, 'utf8');
  config = yaml.load(rawConfig);
  console.log('Configuration loaded successfully:', config);
} catch (error) {
  console.error('Failed to read or parse the configuration file:', error);
  process.exit(1);
}

/**
 * Takes one sample of current CPU load, memory usage and GPU controllers.
 *
 * The three systeminformation queries run one after another in the order
 * cpu -> mem -> gpu. The sample is logged before it is returned.
 *
 * @returns {Promise<{cpu: object, mem: object, gpu: Array<object>}>}
 *   `cpu` is the result of `si.currentLoad()`, `mem` the result of `si.mem()`
 *   and `gpu` the `controllers` list from `si.graphics()`.
 */
async function getSystemResources() {
  const cpu = await si.currentLoad();
  const mem = await si.mem();
  const { controllers: gpu } = await si.graphics();

  const data = { cpu, mem, gpu };
  console.log('Current system resources:', data);
  return data;
}

/**
 * Collects static hardware metadata for the components listed in
 * `config.hardware`.
 *
 * Supported component names are: cpu, memory, gpu, disk, osInfo, memLayout,
 * mem, chassis and baseboard. Components are queried one at a time, in that
 * fixed order, regardless of the order used in the configuration. Any other
 * requested name (for example `psu` or `ram`) has no collector; such names
 * are reported with a warning instead of being dropped silently.
 *
 * A missing `config.hardware` is treated as an empty list, and a single
 * string is treated as a one-element list.
 *
 * @returns {Promise<Object<string, *>>} Map of component name to the data
 *   returned by the matching systeminformation call. For `gpu` the
 *   `displays` property is removed.
 */
async function getResourceMetadata() {
  // Component name -> collector. Map insertion order defines output order.
  const collectors = new Map([
    ['cpu', () => si.cpu()],
    ['memory', () => si.mem()],
    ['gpu', async () => {
      const graphics = await si.graphics();
      delete graphics.displays; // Assuming displays are not needed in hardware logs
      return graphics;
    }],
    ['disk', () => si.diskLayout()],
    ['osInfo', async () => {
      const osInfo = await si.osInfo();
      console.log('Operating System Info:', osInfo);
      return osInfo;
    }],
    ['memLayout', () => si.memLayout()],
    ['mem', () => si.mem()],
    ['chassis', () => si.chassis()],
    ['baseboard', () => si.baseboard()],
  ]);

  // Normalize so that an absent key or a bare string does not throw below.
  const requested = [].concat(config.hardware ?? []);

  const unsupported = requested.filter((name) => !collectors.has(name));
  if (unsupported.length > 0) {
    console.warn(`Ignoring unsupported hardware component(s): ${unsupported.join(', ')}`);
  }

  const wanted = new Set(requested);
  const hardware = {};
  for (const [component, collect] of collectors) {
    if (!wanted.has(component)) {
      continue;
    }
    hardware[component] = await collect();
    console.log(`Hardware component ${component} data:`, hardware[component]);
  }
  return hardware;
}

/**
 * Builds `samples` evenly spaced, rounded values from `min` to `max`
 * inclusive.
 *
 * @param {number} min - First value of the range.
 * @param {number} max - Last value of the range.
 * @param {number} samples - How many values to produce.
 * @returns {number[]} The generated values; `[min]` when `samples` is 1 and
 *   an empty array when `samples` is 0.
 */
function generateTokenLengths(min, max, samples) {
    // With a single sample there is no spacing; dividing by (samples - 1)
    // would give Infinity/NaN and produce [NaN] instead of [min].
    const step = samples > 1 ? (max - min) / (samples - 1) : 0;
    const lengths = Array.from({length: samples}, (_, index) => Math.round(min + step * index));
    console.log('Generated token lengths:', lengths);
    return lengths;
}

/**
 * Computes how resource usage moved between two samples produced by
 * `getSystemResources`.
 *
 * @param {{cpu: object, mem: object, gpu: Array<object>}} startData - Sample
 *   taken before the request.
 * @param {{cpu: object, mem: object, gpu: Array<object>}} endData - Sample
 *   taken after the request.
 * @returns {Promise<{cpu: ?number, mem: ?number, gpu: Array<{gpuUsageChange: ?number}>}>}
 *   - `cpu`: change in CPU load relative to the starting load, in percent.
 *   - `mem`: change in used memory as a percentage of total memory.
 *   - `gpu`: per-controller difference in `utilizationGpu`.
 *   Any value that cannot be computed (missing field, zero starting load) is
 *   `null` rather than NaN/Infinity.
 */
async function getResourceChange(startData, endData) {
  // Newer systeminformation releases expose `currentLoad` (camelCase); the
  // all-lowercase `currentload` is kept as a fallback. Reading only the
  // lowercase name yields undefined and therefore NaN.
  const loadOf = (cpu) => cpu?.currentLoad ?? cpu?.currentload;
  const finiteOrNull = (value) => (Number.isFinite(value) ? value : null);

  const startLoad = loadOf(startData.cpu);
  const endLoad = loadOf(endData.cpu);

  const change = {
    cpu: finiteOrNull((endLoad - startLoad) / startLoad * 100),
    mem: finiteOrNull((endData.mem.used - startData.mem.used) / startData.mem.total * 100),
    gpu: endData.gpu.map((gpu, index) => ({
      // The start sample may have fewer controllers than the end sample.
      gpuUsageChange: finiteOrNull(gpu.utilizationGpu - startData.gpu[index]?.utilizationGpu)
    }))
  };
  console.log('Resource change calculated:', change);
  return change;
}

/**
 * Runs one simulated user: for each generated token length, sends a single
 * POST to `config.api.url` and records timing plus resource-usage change.
 *
 * The request body is `config.api.parameters` with `max_tokens` overridden
 * by the current length. Requests are issued sequentially.
 *
 * @returns {Promise<Array<{tokens: number, latency: number, timeToHeaders: number, resourceChange: object}>>}
 *   One entry per token length. `latency` is the wall-clock milliseconds
 *   until the full response body has been received; `timeToHeaders` is the
 *   milliseconds until the response headers arrived.
 * @throws {Error} When the server answers with a non-2xx status.
 */
async function benchmarkUser() {
  const results = [];
  const tokenLengths = generateTokenLengths(config.prompts.min, config.prompts.max, config.prompts.samples);
  for (const length of tokenLengths) {
    const startResources = await getSystemResources();
    const start = Date.now();

    const response = await fetch(config.api.url, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${config.api.api_key}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({...config.api.parameters, max_tokens: length})
    });
    const timeToHeaders = Date.now() - start;

    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }

    // fetch() resolves as soon as the headers arrive. Without draining the
    // body (especially with `stream: true`) the timer stops before the model
    // has produced its output, so the measured latency is meaningless.
    await response.text();
    const latency = Date.now() - start;

    const endResources = await getSystemResources();
    const resourceChange = await getResourceChange(startResources, endResources);

    results.push({
      tokens: length,
      latency,
      timeToHeaders,
      resourceChange
    });
    console.log(`Result for token length ${length}:`, results[results.length - 1]);
  }
  return results;
}

/**
 * Orchestrates a full benchmark run: gathers hardware metadata, runs
 * `config.concurrency` simulated users in parallel, then writes the combined
 * result to `output.json` in the current working directory and logs it.
 *
 * @returns {Promise<void>} Resolves once the results file has been written.
 *   Rejects if metadata collection or any user run fails.
 */
async function runBenchmarks() {
  const hardware = await getResourceMetadata();

  // A config without `concurrency` would otherwise yield an array of length
  // 0 and an empty result set with no error, so default to a single user.
  const concurrency = config.concurrency ?? 1;
  // Wrap the call so Array.from's (value, index) arguments are not forwarded.
  const userPromises = Array.from({ length: concurrency }, () => benchmarkUser());
  const allResults = await Promise.all(userPromises);

  const output = {
    hardware,
    results: allResults
  };

  fs.writeFileSync('output.json', JSON.stringify(output, null, 2));
  console.log('Benchmark results saved to output.json');
  console.log('Detailed Results:', JSON.stringify(allResults, null, 2));
}

// Entry point. Attach a rejection handler so a failed request or hardware
// query is reported clearly and the process exits with a non-zero status
// instead of leaving a floating promise.
runBenchmarks().catch((error) => {
  console.error('Benchmark run failed:', error);
  process.exitCode = 1;
});

# Example configuration for the benchmark script, passed via --config <path>.
api:
  # Endpoint that receives one POST request per generated token length.
  url: http://127.0.0.1:1337/v1/chat/completions
  # Sent as "Authorization: Bearer <api_key>".
  api_key: <api_key>
  # Used as the JSON request body; max_tokens is overridden on every request
  # with the current generated length.
  parameters: 
    messages:
    - content: You are a helpful assistant.
      role: system
    - content: Hello!
      role: user
    model: tinyllama-1.1b
    stream: true
    max_tokens: 2048
    stop:
    - hello
    frequency_penalty: 0
    presence_penalty: 0
    temperature: 0.7
    top_p: 0.95
# `samples` evenly spaced values are generated from `min` to `max` inclusive.
prompts:
  min: 1024
  max: 2048
  samples: 10
# NOTE(review): `output` is not referenced by the script in this thread;
# results are always written to output.json.
output: json
# Names are matched against the script's supported list: cpu, memory, gpu,
# disk, osInfo, memLayout, mem, chassis, baseboard. `psu` and `ram` are not
# in that list, so no data is collected for them.
hardware:
  - cpu
  - gpu
  - psu
  - chassis
  - ram
# Number of benchmark users started in parallel.
concurrency: 1

hiro-v commented 1 month ago

Hardware: Mac M3
Model: tiny llama 1.1B

{
  "hardware": {
    "cpu": {
      "manufacturer": "Apple",
      "brand": "M3 Pro",
      "vendor": "Apple",
      "family": "1598941843",
      "model": "",
      "stepping": "4",
      "revision": "",
      "voltage": "",
      "speed": 2.4,
      "speedMin": 2.4,
      "speedMax": 2.4,
      "governor": "",
      "cores": 11,
      "physicalCores": 11,
      "performanceCores": 5,
      "efficiencyCores": 6,
      "processors": 1,
      "socket": "SOC",
      "flags": "",
      "virtualization": true,
      "cache": {
        "l1d": 131072,
        "l1i": 65536,
        "l2": 4194304,
        "l3": null
      }
    },
    "gpu": {
      "controllers": [
        {
          "vendor": "Apple",
          "model": "Apple M3 Pro",
          "bus": "Built-In",
          "vramDynamic": true,
          "vram": null,
          "deviceId": "",
          "vendorId": "0x05ac",
          "external": false,
          "cores": "14",
          "metalVersion": ""
        }
      ]
    },
    "chassis": {
      "manufacturer": "Apple Inc.",
      "model": "Mac",
      "type": "Other",
      "version": "Mac15,6",
      "serial": "F2TGP7C125",
      "assetTag": "J514s",
      "sku": "J514sAP"
    }
  },
  "results": [
    [
      {
        "tokens": 1024,
        "latency": 133,
        "resourceChange": {
          "cpu": null,
          "mem": 0.009706285264756944,
          "gpu": [
            {
              "gpuUsageChange": null
            }
          ]
        }
      },
      {
        "tokens": 1138,
        "latency": 79,
        "resourceChange": {
          "cpu": null,
          "mem": 0.02288818359375,
          "gpu": [
            {
              "gpuUsageChange": null
            }
          ]
        }
      },
      {
        "tokens": 1252,
        "latency": 48,
        "resourceChange": {
          "cpu": null,
          "mem": 0,
          "gpu": [
            {
              "gpuUsageChange": null
            }
          ]
        }
      },
      {
        "tokens": 1365,
        "latency": 63,
        "resourceChange": {
          "cpu": null,
          "mem": 0.013266669379340278,
          "gpu": [
            {
              "gpuUsageChange": null
            }
          ]
        }
      },
      {
        "tokens": 1479,
        "latency": 1896,
        "resourceChange": {
          "cpu": null,
          "mem": -0.17242431640625,
          "gpu": [
            {
              "gpuUsageChange": null
            }
          ]
        }
      },
      {
        "tokens": 1593,
        "latency": 749,
        "resourceChange": {
          "cpu": null,
          "mem": 0.096893310546875,
          "gpu": [
            {
              "gpuUsageChange": null
            }
          ]
        }
      },
      {
        "tokens": 1707,
        "latency": 528,
        "resourceChange": {
          "cpu": null,
          "mem": 0.3261990017361111,
          "gpu": [
            {
              "gpuUsageChange": null
            }
          ]
        }
      },
      {
        "tokens": 1820,
        "latency": 80,
        "resourceChange": {
          "cpu": null,
          "mem": 0,
          "gpu": [
            {
              "gpuUsageChange": null
            }
          ]
        }
      },
      {
        "tokens": 1934,
        "latency": 1315,
        "resourceChange": {
          "cpu": null,
          "mem": -0.3627777099609375,
          "gpu": [
            {
              "gpuUsageChange": null
            }
          ]
        }
      },
      {
        "tokens": 2048,
        "latency": 1253,
        "resourceChange": {
          "cpu": null,
          "mem": 0.008138020833333332,
          "gpu": [
            {
              "gpuUsageChange": null
            }
          ]
        }
      }
    ]
  ]
}

hiro-v commented 1 month ago

Hardware: CPU Ryzen, GPU NVIDIA 4090
Model: TinyLlama

{
  "hardware": {
    "cpu": {
      "manufacturer": "AMD",
      "brand": "Ryzen Threadripper PRO 5965WX 24-Cores",
      "vendor": "AuthenticAMD",
      "family": "25",
      "model": "8",
      "stepping": "2",
      "revision": "2050",
      "voltage": "",
      "speed": 3.8,
      "speedMin": 3.8,
      "speedMax": 3.79,
      "governor": "",
      "cores": 16,
      "physicalCores": 16,
      "performanceCores": 16,
      "efficiencyCores": 0,
      "processors": 1,
      "socket": "Other",
      "flags": "de pse tsc sep mtrr mca cmov psn clfsh ds mmx fxsr sse sse2 ss htt tm ia64 pbe",
      "virtualization": true,
      "cache": {
        "l1d": 0,
        "l1i": 0,
        "l2": 0,
        "l3": 0
      }
    },
    "gpu": {
      "controllers": [
        {
          "vendor": "Microsoft",
          "model": "Microsoft Remote Display Adapter",
          "bus": "",
          "vram": 0,
          "vramDynamic": true,
          "subDeviceId": null
        },
        {
          "vendor": "(Standard display types)",
          "model": "Microsoft Basic Display Adapter",
          "bus": "PCI",
          "vram": 0,
          "vramDynamic": true,
          "subDeviceId": "11001AF4"
        },
        {
          "vendor": "NVIDIA",
          "model": "NVIDIA GeForce RTX 4090",
          "bus": "PCI",
          "vram": 24564,
          "vramDynamic": false,
          "subDeviceId": "0x889C1043",
          "driverVersion": "555.85",
          "name": "NVIDIA GeForce RTX 4090",
          "pciBus": "00000000:01:00.0",
          "memoryTotal": 24564,
          "memoryUsed": 2188,
          "memoryFree": 21955,
          "utilizationGpu": 7,
          "utilizationMemory": 1,
          "temperatureGpu": 24,
          "powerDraw": 38,
          "powerLimit": 500,
          "clockCore": 615,
          "clockMemory": 810
        }
      ]
    },
    "chassis": {
      "manufacturer": "QEMU",
      "model": "",
      "type": "Other",
      "version": "pc-q35-7.1",
      "serial": "",
      "assetTag": "",
      "sku": ""
    }
  },
  "results": [
    [
      {
        "tokens": 1024,
        "latency": 88,
        "resourceChange": {
          "cpu": null,
          "mem": 0.026111026563768642,
          "gpu": [
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": null
            }
          ]
        }
      },
      {
        "tokens": 1138,
        "latency": 28,
        "resourceChange": {
          "cpu": null,
          "mem": 0.05019609488206003,
          "gpu": [
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": -7
            }
          ]
        }
      },
      {
        "tokens": 1252,
        "latency": 32,
        "resourceChange": {
          "cpu": null,
          "mem": -0.05215054871887338,
          "gpu": [
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": 0
            }
          ]
        }
      },
      {
        "tokens": 1365,
        "latency": 3,
        "resourceChange": {
          "cpu": null,
          "mem": 0.04956447260552889,
          "gpu": [
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": 26
            }
          ]
        }
      },
      {
        "tokens": 1479,
        "latency": 28,
        "resourceChange": {
          "cpu": null,
          "mem": -0.024609433981826676,
          "gpu": [
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": -4
            }
          ]
        }
      },
      {
        "tokens": 1593,
        "latency": 4,
        "resourceChange": {
          "cpu": null,
          "mem": 0.12588351145280155,
          "gpu": [
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": -1
            }
          ]
        }
      },
      {
        "tokens": 1707,
        "latency": 3,
        "resourceChange": {
          "cpu": null,
          "mem": -0.03436978576444946,
          "gpu": [
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": -24
            }
          ]
        }
      },
      {
        "tokens": 1820,
        "latency": 3,
        "resourceChange": {
          "cpu": null,
          "mem": -0.047407422944167804,
          "gpu": [
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": -18
            }
          ]
        }
      },
      {
        "tokens": 1934,
        "latency": 28,
        "resourceChange": {
          "cpu": null,
          "mem": -0.3049424681486591,
          "gpu": [
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": -13
            }
          ]
        }
      },
      {
        "tokens": 2048,
        "latency": 30,
        "resourceChange": {
          "cpu": null,
          "mem": 0.0863296560602191,
          "gpu": [
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": null
            },
            {
              "gpuUsageChange": 0
            }
          ]
        }
      }
    ]
  ]
}

janhq / cortex

Discussion: cortex benchmark cli #610

2. Initialize a compatible engine

3. Download a GGUF model from Hugging Face

Models save to $(npm list -g)/node_modules

4. Load the model