Open hiro-v opened 1 month ago
My current implementation is below; I'm going to refactor it.
const si = require('systeminformation');
const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));
const fs = require('fs');
const yaml = require('js-yaml');
const { Command } = require('commander');
// Command-line interface: a single required option, --config, that points at
// the YAML configuration file.
const program = new Command();
program.option('--config <path>', 'Path to the configuration file');
program.parse(process.argv);
const options = program.opts();
if (!options.config) {
  console.error('Configuration file path is required.');
  process.exit(1);
}

// Parsed configuration object read by the benchmark functions in this file.
let config;
try {
  config = yaml.load(fs.readFileSync(options.config, 'utf8'));
  // An empty file or a scalar/list document parses without error but is not a
  // usable configuration; reject it here instead of failing later on a
  // property access such as config.hardware.
  if (config === null || typeof config !== 'object' || Array.isArray(config)) {
    throw new Error('The configuration file must contain a YAML mapping.');
  }
  // Do not echo the bearer token to the console: log a copy with it masked.
  const printableConfig = config.api?.api_key === undefined
    ? config
    : { ...config, api: { ...config.api, api_key: '<redacted>' } };
  console.log('Configuration loaded successfully:', printableConfig);
} catch (error) {
  console.error('Failed to read or parse the configuration file:', error);
  process.exit(1);
}
/**
 * Samples the current CPU load, memory usage and graphics controllers.
 *
 * The three probes are awaited one after another, in that order, and the
 * resulting snapshot is logged before it is returned.
 *
 * @returns {Promise<{cpu: object, mem: object, gpu: Array<object>}>} Snapshot
 *   with the `si.currentLoad()` result, the `si.mem()` result and the
 *   `controllers` list taken from `si.graphics()`.
 */
async function getSystemResources() {
  const cpu = await si.currentLoad();
  const mem = await si.mem();
  const graphics = await si.graphics();
  const snapshot = { cpu, mem, gpu: graphics.controllers };
  console.log('Current system resources:', snapshot);
  return snapshot;
}
/**
 * Collects static hardware descriptions for the components enabled in
 * `config.hardware`.
 *
 * Only names from the fixed list below are considered; each enabled one is
 * probed in list order, stored under its own name and logged.
 *
 * @returns {Promise<object>} Map from component name to the data returned by
 *   the matching systeminformation call.
 */
async function getResourceMetadata() {
  // Components that need something other than a plain `si[name]()` call.
  const specialProbes = {
    memory: () => si.mem(),
    memLayout: () => si.memLayout(),
    gpu: async () => {
      const graphics = await si.graphics();
      delete graphics.displays; // Assuming displays are not needed in hardware logs
      return graphics;
    },
    disk: () => si.diskLayout(),
    osInfo: async () => {
      const info = await si.osInfo();
      console.log('Operating System Info:', info);
      return info;
    },
    chassis: () => si.chassis(),
    baseboard: () => si.baseboard()
  };
  const supported = ['cpu', 'memory', 'gpu', 'disk', 'osInfo', 'memLayout', 'mem', 'chassis', 'baseboard'];

  const hardware = {};
  for (const name of supported) {
    if (!config.hardware.includes(name)) {
      continue;
    }
    // Names without a special probe ('cpu', 'mem') map directly onto the
    // systeminformation function of the same name.
    const probe = specialProbes[name] ?? (() => si[name]());
    hardware[name] = await probe();
    console.log(`Hardware component ${name} data:`, hardware[name]);
  }
  return hardware;
}
/**
 * Builds a list of evenly spaced, rounded token counts from `min` to `max`.
 *
 * @param {number} min - First value of the range.
 * @param {number} max - Last value of the range.
 * @param {number} samples - How many values to generate.
 * @returns {number[]} `samples` rounded values; `[min]` (rounded) when
 *   `samples` is 1, and an empty array when `samples` is 0.
 */
function generateTokenLengths(min, max, samples) {
  let lengths;
  if (samples === 1) {
    // A single sample has no spacing: (samples - 1) would be 0 and the step
    // computation below would yield NaN.
    lengths = [Math.round(min)];
  } else {
    const step = (max - min) / (samples - 1);
    lengths = Array.from({ length: samples }, (_, index) => Math.round(min + step * index));
  }
  console.log('Generated token lengths:', lengths);
  return lengths;
}
/**
 * Computes how resource usage changed between two snapshots shaped like the
 * return value of getSystemResources().
 *
 * @param {{cpu: object, mem: object, gpu: Array<object>}} startData - Snapshot
 *   taken before the request.
 * @param {{cpu: object, mem: object, gpu: Array<object>}} endData - Snapshot
 *   taken after the request.
 * @returns {Promise<{cpu: (number|null), mem: (number|null),
 *   gpu: Array<{gpuUsageChange: (number|null)}>}>}
 *   `cpu` is the load change relative to the starting load, in percent;
 *   `mem` is the change in used memory as a percentage of total memory;
 *   `gpu` holds the per-controller difference in `utilizationGpu`.
 *   Any value that cannot be computed (missing field, zero baseline) is
 *   `null` rather than NaN/Infinity.
 */
async function getResourceChange(startData, endData) {
  // The load object spells the field `currentLoad`; the all-lowercase
  // spelling is kept only as a fallback for snapshots that use it.
  const readCpuLoad = (snapshot) => snapshot.cpu.currentLoad ?? snapshot.cpu.currentload;
  const finiteOrNull = (value) => (Number.isFinite(value) ? value : null);

  const startLoad = readCpuLoad(startData);
  const endLoad = readCpuLoad(endData);

  const change = {
    cpu: finiteOrNull((endLoad - startLoad) / startLoad * 100),
    mem: finiteOrNull((endData.mem.used - startData.mem.used) / startData.mem.total * 100),
    gpu: endData.gpu.map((gpu, index) => ({
      // Not every controller reports utilization, and the start snapshot may
      // list fewer controllers than the end snapshot.
      gpuUsageChange: finiteOrNull(gpu.utilizationGpu - startData.gpu[index]?.utilizationGpu)
    }))
  };
  console.log('Resource change calculated:', change);
  return change;
}
/**
 * Runs one simulated user: sends one completion request per generated
 * `max_tokens` value, sequentially, and records timing and resource deltas.
 *
 * @returns {Promise<Array<{tokens: number, latency: number,
 *   totalLatency: number, resourceChange: object}>>}
 *   One entry per request. `latency` is the time in milliseconds until the
 *   response headers arrived; `totalLatency` is the time until the whole
 *   response body had been received.
 * @throws {Error} When the API answers with a non-2xx status.
 */
async function benchmarkUser() {
  const results = [];
  const tokenLengths = generateTokenLengths(config.prompts.min, config.prompts.max, config.prompts.samples);
  for (const length of tokenLengths) {
    const startResources = await getSystemResources();
    const start = Date.now();
    const response = await fetch(config.api.url, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${config.api.api_key}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ ...config.api.parameters, max_tokens: length })
    });
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    // fetch() resolves as soon as the headers arrive, so this is only the
    // time to the start of the response, not to its end.
    const latency = Date.now() - start;
    // Read the body to completion: this releases the connection and yields
    // the end-to-end duration. The read runs while the end-of-request
    // resource snapshot is taken, so the snapshot is still sampled right
    // after the headers arrive, as before.
    const bodyDrained = response.arrayBuffer().then(() => Date.now() - start);
    const [endResources, totalLatency] = await Promise.all([getSystemResources(), bodyDrained]);
    const resourceChange = await getResourceChange(startResources, endResources);
    results.push({
      tokens: length,
      latency,
      totalLatency,
      resourceChange
    });
    console.log(`Result for token length ${length}:`, results[results.length - 1]);
  }
  return results;
}
/**
 * Entry point: gathers hardware metadata, runs `config.concurrency`
 * simulated users in parallel, then writes everything to output.json and
 * logs the results.
 *
 * @returns {Promise<void>} Resolves once the results have been written.
 */
async function runBenchmarks() {
  const hardware = await getResourceMetadata();
  // One benchmarkUser() run per simulated user, all started at once.
  const userPromises = Array.from({ length: config.concurrency }, () => benchmarkUser());
  const allResults = await Promise.all(userPromises);
  const output = {
    hardware,
    results: allResults
  };
  fs.writeFileSync('output.json', JSON.stringify(output, null, 2));
  console.log('Benchmark results saved to output.json');
  console.log('Detailed Results:', JSON.stringify(allResults, null, 2));
}

// Do not leave the top-level promise floating: report any failure and exit
// with a non-zero status, like the other fatal paths in this script.
runBenchmarks().catch((error) => {
  console.error('Benchmark run failed:', error);
  process.exit(1);
});
api:
  url: http://127.0.0.1:1337/v1/chat/completions
  api_key: <api_key>
  parameters:
    messages:
      - content: You are a helpful assistant.
        role: system
      - content: Hello!
        role: user
    model: tinyllama-1.1b
    stream: true
    max_tokens: 2048
    stop:
      - hello
    frequency_penalty: 0
    presence_penalty: 0
    temperature: 0.7
    top_p: 0.95
prompts:
  min: 1024
  max: 2048
  samples: 10
output: json
hardware:
  - cpu
  - gpu
  - psu
  - chassis
  - ram
concurrency: 1
{
"hardware": {
"cpu": {
"manufacturer": "Apple",
"brand": "M3 Pro",
"vendor": "Apple",
"family": "1598941843",
"model": "",
"stepping": "4",
"revision": "",
"voltage": "",
"speed": 2.4,
"speedMin": 2.4,
"speedMax": 2.4,
"governor": "",
"cores": 11,
"physicalCores": 11,
"performanceCores": 5,
"efficiencyCores": 6,
"processors": 1,
"socket": "SOC",
"flags": "",
"virtualization": true,
"cache": {
"l1d": 131072,
"l1i": 65536,
"l2": 4194304,
"l3": null
}
},
"gpu": {
"controllers": [
{
"vendor": "Apple",
"model": "Apple M3 Pro",
"bus": "Built-In",
"vramDynamic": true,
"vram": null,
"deviceId": "",
"vendorId": "0x05ac",
"external": false,
"cores": "14",
"metalVersion": ""
}
]
},
"chassis": {
"manufacturer": "Apple Inc.",
"model": "Mac",
"type": "Other",
"version": "Mac15,6",
"serial": "F2TGP7C125",
"assetTag": "J514s",
"sku": "J514sAP"
}
},
"results": [
[
{
"tokens": 1024,
"latency": 133,
"resourceChange": {
"cpu": null,
"mem": 0.009706285264756944,
"gpu": [
{
"gpuUsageChange": null
}
]
}
},
{
"tokens": 1138,
"latency": 79,
"resourceChange": {
"cpu": null,
"mem": 0.02288818359375,
"gpu": [
{
"gpuUsageChange": null
}
]
}
},
{
"tokens": 1252,
"latency": 48,
"resourceChange": {
"cpu": null,
"mem": 0,
"gpu": [
{
"gpuUsageChange": null
}
]
}
},
{
"tokens": 1365,
"latency": 63,
"resourceChange": {
"cpu": null,
"mem": 0.013266669379340278,
"gpu": [
{
"gpuUsageChange": null
}
]
}
},
{
"tokens": 1479,
"latency": 1896,
"resourceChange": {
"cpu": null,
"mem": -0.17242431640625,
"gpu": [
{
"gpuUsageChange": null
}
]
}
},
{
"tokens": 1593,
"latency": 749,
"resourceChange": {
"cpu": null,
"mem": 0.096893310546875,
"gpu": [
{
"gpuUsageChange": null
}
]
}
},
{
"tokens": 1707,
"latency": 528,
"resourceChange": {
"cpu": null,
"mem": 0.3261990017361111,
"gpu": [
{
"gpuUsageChange": null
}
]
}
},
{
"tokens": 1820,
"latency": 80,
"resourceChange": {
"cpu": null,
"mem": 0,
"gpu": [
{
"gpuUsageChange": null
}
]
}
},
{
"tokens": 1934,
"latency": 1315,
"resourceChange": {
"cpu": null,
"mem": -0.3627777099609375,
"gpu": [
{
"gpuUsageChange": null
}
]
}
},
{
"tokens": 2048,
"latency": 1253,
"resourceChange": {
"cpu": null,
"mem": 0.008138020833333332,
"gpu": [
{
"gpuUsageChange": null
}
]
}
}
]
]
}
{
"hardware": {
"cpu": {
"manufacturer": "AMD",
"brand": "Ryzen Threadripper PRO 5965WX 24-Cores",
"vendor": "AuthenticAMD",
"family": "25",
"model": "8",
"stepping": "2",
"revision": "2050",
"voltage": "",
"speed": 3.8,
"speedMin": 3.8,
"speedMax": 3.79,
"governor": "",
"cores": 16,
"physicalCores": 16,
"performanceCores": 16,
"efficiencyCores": 0,
"processors": 1,
"socket": "Other",
"flags": "de pse tsc sep mtrr mca cmov psn clfsh ds mmx fxsr sse sse2 ss htt tm ia64 pbe",
"virtualization": true,
"cache": {
"l1d": 0,
"l1i": 0,
"l2": 0,
"l3": 0
}
},
"gpu": {
"controllers": [
{
"vendor": "Microsoft",
"model": "Microsoft Remote Display Adapter",
"bus": "",
"vram": 0,
"vramDynamic": true,
"subDeviceId": null
},
{
"vendor": "(Standard display types)",
"model": "Microsoft Basic Display Adapter",
"bus": "PCI",
"vram": 0,
"vramDynamic": true,
"subDeviceId": "11001AF4"
},
{
"vendor": "NVIDIA",
"model": "NVIDIA GeForce RTX 4090",
"bus": "PCI",
"vram": 24564,
"vramDynamic": false,
"subDeviceId": "0x889C1043",
"driverVersion": "555.85",
"name": "NVIDIA GeForce RTX 4090",
"pciBus": "00000000:01:00.0",
"memoryTotal": 24564,
"memoryUsed": 2188,
"memoryFree": 21955,
"utilizationGpu": 7,
"utilizationMemory": 1,
"temperatureGpu": 24,
"powerDraw": 38,
"powerLimit": 500,
"clockCore": 615,
"clockMemory": 810
}
]
},
"chassis": {
"manufacturer": "QEMU",
"model": "",
"type": "Other",
"version": "pc-q35-7.1",
"serial": "",
"assetTag": "",
"sku": ""
}
},
"results": [
[
{
"tokens": 1024,
"latency": 88,
"resourceChange": {
"cpu": null,
"mem": 0.026111026563768642,
"gpu": [
{
"gpuUsageChange": null
},
{
"gpuUsageChange": null
},
{
"gpuUsageChange": null
}
]
}
},
{
"tokens": 1138,
"latency": 28,
"resourceChange": {
"cpu": null,
"mem": 0.05019609488206003,
"gpu": [
{
"gpuUsageChange": null
},
{
"gpuUsageChange": null
},
{
"gpuUsageChange": -7
}
]
}
},
{
"tokens": 1252,
"latency": 32,
"resourceChange": {
"cpu": null,
"mem": -0.05215054871887338,
"gpu": [
{
"gpuUsageChange": null
},
{
"gpuUsageChange": null
},
{
"gpuUsageChange": 0
}
]
}
},
{
"tokens": 1365,
"latency": 3,
"resourceChange": {
"cpu": null,
"mem": 0.04956447260552889,
"gpu": [
{
"gpuUsageChange": null
},
{
"gpuUsageChange": null
},
{
"gpuUsageChange": 26
}
]
}
},
{
"tokens": 1479,
"latency": 28,
"resourceChange": {
"cpu": null,
"mem": -0.024609433981826676,
"gpu": [
{
"gpuUsageChange": null
},
{
"gpuUsageChange": null
},
{
"gpuUsageChange": -4
}
]
}
},
{
"tokens": 1593,
"latency": 4,
"resourceChange": {
"cpu": null,
"mem": 0.12588351145280155,
"gpu": [
{
"gpuUsageChange": null
},
{
"gpuUsageChange": null
},
{
"gpuUsageChange": -1
}
]
}
},
{
"tokens": 1707,
"latency": 3,
"resourceChange": {
"cpu": null,
"mem": -0.03436978576444946,
"gpu": [
{
"gpuUsageChange": null
},
{
"gpuUsageChange": null
},
{
"gpuUsageChange": -24
}
]
}
},
{
"tokens": 1820,
"latency": 3,
"resourceChange": {
"cpu": null,
"mem": -0.047407422944167804,
"gpu": [
{
"gpuUsageChange": null
},
{
"gpuUsageChange": null
},
{
"gpuUsageChange": -18
}
]
}
},
{
"tokens": 1934,
"latency": 28,
"resourceChange": {
"cpu": null,
"mem": -0.3049424681486591,
"gpu": [
{
"gpuUsageChange": null
},
{
"gpuUsageChange": null
},
{
"gpuUsageChange": -13
}
]
}
},
{
"tokens": 2048,
"latency": 30,
"resourceChange": {
"cpu": null,
"mem": 0.0863296560602191,
"gpu": [
{
"gpuUsageChange": null
},
{
"gpuUsageChange": null
},
{
"gpuUsageChange": 0
}
]
}
}
]
]
}
Motivation
Discussion
2. Initialize a compatible engine
cortex init
3. Download a GGUF model from Hugging Face
Models are saved to $(npm list -g)/node_modules
cortex models pull janhq/TinyLlama-1.1B-Chat-v1.0-GGUF
4. Load the model
cortex models start janhq/TinyLlama-1.1B-Chat-v1.0-GGUF
Resources