{"rows":[{"id":"cmrio3ys401uvmj01lcom115h","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":4980.696149,"tokSPrefill":7664.915099,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T03:33:40.181Z","notes":"CPU-only build (Vulkan/CUDA/HIP disabled). 8 physical-core threads. One explicit throwaway plus llama-bench internal warmup; two measured repetitions. Headless Ubuntu, BIOS Silent mode, Core Turbo Boost disabled. Decode samples tok/s: 4930.78, 5030.61. GGUF-reported parameters: 6963968.","engineFlags":{"commandSnippet":"llama-bench -m TinyStories-3M-Q4_K_M.gguf -t 8 -p 512 -n 128 -r 2 -o json","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"segestic/Tinystories-gpt-0.1-3m","displayName":"Tinystories-gpt-0.1-3m","family":"Gpt","params":0,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21","quantization":"Q4_K_M","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":1,"reactionCounts":{},"myEmoji":null},{"id":"cmrio8nyr01vumj010r04d1cl","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":4979.352396,"tokSPrefill":7767.919137,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T03:37:19.443Z","notes":"CPU quantization/thread sweep. CPU-only build; 8 physical-core threads; llama-bench warmup and two measured repetitions. Headless; BIOS Silent mode; Core Turbo Boost disabled. Decode samples tok/s: 4972.35, 4986.36. GGUF-reported parameters: 6963968.","engineFlags":{"commandSnippet":"llama-bench -m TinyStories-3M-Q2_K.gguf -t 8 -p 512 -n 128 -r 2 -o json","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"segestic/Tinystories-gpt-0.1-3m","displayName":"Tinystories-gpt-0.1-3m","family":"Gpt","params":0,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21","quantization":"Q2_K","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":2,"reactionCounts":{},"myEmoji":null},{"id":"cmrio8ond01vzmj01rlxlvcc0","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":4911.930039,"tokSPrefill":7848.04374,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T03:37:20.329Z","notes":"CPU quantization/thread sweep. CPU-only build; 8 physical-core threads; llama-bench warmup and two measured repetitions. Headless; BIOS Silent mode; Core Turbo Boost disabled. Decode samples tok/s: 4925.10, 4898.76. GGUF-reported parameters: 6963968.","engineFlags":{"commandSnippet":"llama-bench -m TinyStories-3M-Q8_0.gguf -t 8 -p 512 -n 128 -r 2 -o json","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"segestic/Tinystories-gpt-0.1-3m","displayName":"Tinystories-gpt-0.1-3m","family":"Gpt","params":0,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21","quantization":"Q8_0","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":3,"reactionCounts":{},"myEmoji":null},{"id":"cmrio8ozf01w5mj01054z2ynf","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":4747.562394,"tokSPrefill":7728.936959,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T03:37:20.763Z","notes":"CPU quantization/thread sweep. CPU-only build; 8 physical-core threads; llama-bench warmup and two measured repetitions. Headless; BIOS Silent mode; Core Turbo Boost disabled. Decode samples tok/s: 4747.24, 4747.88. GGUF-reported parameters: 6963968.","engineFlags":{"commandSnippet":"llama-bench -m TinyStories-3M-F16.gguf -t 8 -p 512 -n 128 -r 2 -o json","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"segestic/Tinystories-gpt-0.1-3m","displayName":"Tinystories-gpt-0.1-3m","family":"Gpt","params":0,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21","quantization":"F16","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":4,"reactionCounts":{},"myEmoji":null},{"id":"cmrio408801vhmj01zdwdqzob","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":3569.083997,"tokSPrefill":74947.634851,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T03:33:42.057Z","notes":"CPU-only build (Vulkan/CUDA/HIP disabled). 8 physical-core threads. One explicit throwaway plus llama-bench internal warmup; two measured repetitions. Headless Ubuntu, BIOS Silent mode, Core Turbo Boost disabled. Decode samples tok/s: 3570.080, 3568.090. GGUF-reported parameters: 33883264.","engineFlags":{"commandSnippet":"llama-bench -m TinyStories-1M-Q4_K_M.gguf -t 8 -p 512 -n 128 -r 2 -o json","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"ivnle/tinystories-lay4-hs128-hd2-1M","displayName":"tinystories-lay4-hs128-hd2-1M","family":"Llama","params":0,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21","quantization":"Q4_K_M","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":5,"reactionCounts":{},"myEmoji":null},{"id":"cmokhzrb40005ld0493vy3ltd","contextLength":262144,"prefillTokens":null,"batchSize":10,"ttftMs":143,"tokSOut":2665.14,"tokSPrefill":null,"tokSTotal":55846.85,"peakVramGb":133,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-04-29T20:18:51.521Z","notes":"Requires DeepGEMM installation after uv pip install vllm:\ngit clone --recursive https://github.com/deepseek-ai/DeepGEMM.git\ncd DeepGEMM\nDG_FORCE_BUILD=1 uv pip install --force-reinstall --no-build-isolation .\n\nBenchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 143ms, Mean: 340ms","engineFlags":{"commandSnippet":"vllm serve Qwen/Qwen3.5-0.8B-Base --served-model-name qwen3 --trust-remote-code --tensor-parallel-size 1 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --gpu-memory-utilization 0.95 --speculative-config '{\"method\":\"qwen3_next_mtp\",\"num_speculative_tokens\":2}'  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":1,"gpuLayers":4,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-0.8B-Base","displayName":"Qwen3.5-0.8B-Base","family":"Qwen","params":1,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 NVL","gpuCount":1,"vramGb":141,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9175F","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev74+gfaab18955","quantization":"BF16","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"H200 NVL","hardwareGroupKey":"DISCRETE_GPU:h200 nvl","rank":6,"reactionCounts":{},"myEmoji":null},{"id":"cmrk3vsz102zzmj01p4vu2ouo","contextLength":32768,"prefillTokens":null,"batchSize":96,"ttftMs":38.65,"tokSOut":2527.68,"tokSPrefill":null,"tokSTotal":7731.146,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-14T03:42:59.437Z","notes":"Rented JarvisLabs RTX PRO 6000 Blackwell 96GB (cloud), CUDA 13 build sm_120. Median of 3 runs, temp 0, cold prefill, prompt ~50592 tok. Head-to-head vs my 3x R9700 rig, same GGUF quants.","engineFlags":{"commandSnippet":"vllm serve cyankiwi/GLM-4.5-Air-AWQ-4bit --max-model-len 32768 --max-num-seqs 96 --enable-chunked-prefill --enable-prefix-caching --gpu-memory-utilization 0.90","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"zai-org/GLM-4.5-Air","displayName":"GLM-4.5-Air","family":null,"params":110,"isMoE":true,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX PRO 6000 Blackwell","gpuCount":1,"vramGb":96,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"vllm-0.25.0","quantization":"AWQ-4bit","backend":"cuda"},"user":{"id":"cmorcd2jg0008l804hfkput08","username":"1337Hero","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RTX PRO 6000 Blackwell","hardwareGroupKey":"DISCRETE_GPU:rtx pro 6000 blackwell","rank":7,"reactionCounts":{},"myEmoji":null},{"id":"cmrk3vsnw02zumj01rfnmu5x4","contextLength":32768,"prefillTokens":null,"batchSize":64,"ttftMs":40.37,"tokSOut":2037.835,"tokSPrefill":null,"tokSTotal":6232.908,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-14T03:42:59.036Z","notes":"Rented JarvisLabs RTX PRO 6000 Blackwell 96GB (cloud), CUDA 13 build sm_120. Median of 3 runs, temp 0, cold prefill, prompt ~33728 tok. Head-to-head vs my 3x R9700 rig, same GGUF quants.","engineFlags":{"commandSnippet":"vllm serve cyankiwi/GLM-4.5-Air-AWQ-4bit --max-model-len 32768 --max-num-seqs 96 --enable-chunked-prefill --enable-prefix-caching --gpu-memory-utilization 0.90","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"zai-org/GLM-4.5-Air","displayName":"GLM-4.5-Air","family":null,"params":110,"isMoE":true,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX PRO 6000 Blackwell","gpuCount":1,"vramGb":96,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"vllm-0.25.0","quantization":"AWQ-4bit","backend":"cuda"},"user":{"id":"cmorcd2jg0008l804hfkput08","username":"1337Hero","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RTX PRO 6000 Blackwell","hardwareGroupKey":"DISCRETE_GPU:rtx pro 6000 blackwell","rank":8,"reactionCounts":{},"myEmoji":null},{"id":"cmp7fqvp7006jo401h82kkyfb","contextLength":32768,"prefillTokens":null,"batchSize":64,"ttftMs":270.64,"tokSOut":1405.394,"tokSPrefill":null,"tokSTotal":4407.469,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-05-15T21:34:40.123Z","notes":"Qwen2.5-7B BF16 vLLM ROCm source build. TP=1, max-seqs=128, batch-tokens=32768, client concurrency=64, chunked prefill + prefix caching. 1× R9700.","engineFlags":{"commandSnippet":"/home/mikekey/.venvs/vllm/bin/vllm serve /home/mikekey/models/hf/Qwen2.5-7B   --tensor-parallel-size 1   --dtype bfloat16   --max-model-len 32768   --max-num-seqs 128   --max-num-batched-tokens 32768   --enable-chunked-prefill   --enable-prefix-caching   --gpu-memory-utilization 0.90   --port 8000   --served-model-name qwen2.5-7b   --trust-remote-code","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen2.5-7B","displayName":"Qwen2.5-7B","family":"Qwen","params":8,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Radeon AI Pro R9700","gpuCount":1,"vramGb":32,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":"Arch Linux","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.2+rocm723","quantization":"bf16","backend":"rocm"},"user":{"id":"cmorcd2jg0008l804hfkput08","username":"1337Hero","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"Radeon AI Pro R9700","hardwareGroupKey":"DISCRETE_GPU:radeon ai pro r9700","rank":9,"reactionCounts":{},"myEmoji":null},{"id":"cmp7f7rgi006co4019h144d9a","contextLength":32768,"prefillTokens":null,"batchSize":64,"ttftMs":275.98,"tokSOut":1399.209,"tokSPrefill":null,"tokSTotal":4388.071,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-05-15T21:19:48.162Z","notes":"Qwen2.5-7B BF16 vLLM ROCm source build. TP=1, max-seqs=128, batch-tokens=32768, client concurrency=64, chunked prefill + prefix caching. 1× R9700.","engineFlags":{"commandSnippet":"/home/mikekey/.venvs/vllm/bin/vllm serve /home/mikekey/models/hf/Qwen2.5-7B   --tensor-parallel-size 1   --dtype bfloat16   --max-model-len 32768   --max-num-seqs 128   --max-num-batched-tokens 32768   --enable-chunked-prefill   --enable-prefix-caching   --gpu-memory-utilization 0.90   --port 8000   --served-model-name qwen2.5-7b   --trust-remote-code","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen2.5-7B","displayName":"Qwen2.5-7B","family":"Qwen","params":8,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Radeon AI Pro R9700","gpuCount":1,"vramGb":32,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":"Arch Linux","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.2+rocm723","quantization":"bf16","backend":"rocm"},"user":{"id":"cmorcd2jg0008l804hfkput08","username":"1337Hero","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"Radeon AI Pro R9700","hardwareGroupKey":"DISCRETE_GPU:radeon ai pro r9700","rank":10,"reactionCounts":{},"myEmoji":null},{"id":"cmqz6p4gl025toe01l70kn28a","contextLength":8192,"prefillTokens":null,"batchSize":1,"ttftMs":260,"tokSOut":1369.3,"tokSPrefill":130.8,"tokSTotal":983.3,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-06-29T12:18:36.885Z","notes":"`lmx benchmark run vllm   --mode remote   --base-url http://localhost:8000   --hf-id RedHatAI/diffusiongemma-26B-A4B-it-FP8-dynamic   --served-model RedHatAI/diffusiongemma-26B-A4B-it-FP8-dynamic   --quantization fp8   --max-tokens 4096`","engineFlags":{"commandSnippet":"sudo docker run --gpus all   --privileged --ipc=host -p 8000:8000   -v ~/.cache/huggingface:/root/.cache/huggingface   vllm/vllm-openai:gemma RedHatAI/diffusiongemma-26B-A4B-it-FP8-dynamic   --max-num-seqs 4   --tensor-parallel-size 1","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":"FA4","flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"RedHatAI/diffusiongemma-26B-A4B-it-FP8-dynamic","displayName":"diffusiongemma-26B-A4B-it-FP8-dynamic","family":"Gemma","params":26,"isMoE":true,"baseModel":{"hfId":"google/diffusiongemma-26B-A4B-it","displayName":"diffusiongemma-26B-A4B-it"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H100","gpuCount":1,"vramGb":80,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"Intel(R) Xeon(R) Platinum 8481C","os":"Ubuntu 22.04.5 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.24.0 (dev)","quantization":"FP8","backend":"cuda"},"user":{"id":"cmqz33l6e0231oe010lk3smio","username":"joaogante","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"H100","hardwareGroupKey":"DISCRETE_GPU:h100","rank":11,"reactionCounts":{},"myEmoji":null},{"id":"cmrk3vsd802zpmj01go7y8e39","contextLength":32768,"prefillTokens":null,"batchSize":32,"ttftMs":34.02,"tokSOut":1288.275,"tokSPrefill":null,"tokSTotal":3940.309,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-14T03:42:58.652Z","notes":"Rented JarvisLabs RTX PRO 6000 Blackwell 96GB (cloud), CUDA 13 build sm_120. Median of 3 runs, temp 0, cold prefill, prompt ~16864 tok. Head-to-head vs my 3x R9700 rig, same GGUF quants.","engineFlags":{"commandSnippet":"vllm serve cyankiwi/GLM-4.5-Air-AWQ-4bit --max-model-len 32768 --max-num-seqs 96 --enable-chunked-prefill --enable-prefix-caching --gpu-memory-utilization 0.90","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"zai-org/GLM-4.5-Air","displayName":"GLM-4.5-Air","family":null,"params":110,"isMoE":true,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX PRO 6000 Blackwell","gpuCount":1,"vramGb":96,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"vllm-0.25.0","quantization":"AWQ-4bit","backend":"cuda"},"user":{"id":"cmorcd2jg0008l804hfkput08","username":"1337Hero","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RTX PRO 6000 Blackwell","hardwareGroupKey":"DISCRETE_GPU:rtx pro 6000 blackwell","rank":12,"reactionCounts":{},"myEmoji":null},{"id":"cmrkrj5fl04twmj01a5s9onpv","contextLength":512,"prefillTokens":null,"batchSize":64,"ttftMs":null,"tokSOut":1090.49,"tokSPrefill":1953.37,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-14T14:44:59.841Z","notes":"Warm steady-state CPU benchmark on SmolLM2-135M-Instruct Q2_K: 64 simultaneous HTTP requests, 64 server slots sharing eight physical CPU threads (CPUs 0-7), ubatch 1024, strict CPU and batch affinity. Two 64-request decode batches measured 1075.73 and 1105.25 aggregate tok/s; reported midpoint 1090.49 tok/s. Best prefill companion setting remains 32 slots/ubatch 1024/F16 K/V at 1953.37 aggregate prompt tok/s over 7360 prompt tokens. One warm-up batch was discarded. q8_0 K/V cache for decode; no GPU offload or SMT.","engineFlags":{"commandSnippet":"taskset -c 0-7 llama-server -m SmolLM2-135M-Instruct-Q2_K.gguf -c 512 -np 64 -t 8 -tb 8 -b 8192 -ub 1024 --poll 100 --cpu-range 0-7 --cpu-strict 1 --cpu-range-batch 0-7 --cpu-strict-batch 1 -ctk q8_0 -ctv q8_0 -fa auto","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":"q8_0","attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"HuggingFaceTB/SmolLM2-135M-Instruct","displayName":"SmolLM2-135M-Instruct","family":"Llama","params":0,"isMoE":false,"baseModel":{"hfId":"HuggingFaceTB/SmolLM2-135M","displayName":"SmolLM2-135M"}},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21, AVX2 -Ofast Zen4-tuned no-repack server","quantization":"Q2_K","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":13,"reactionCounts":{},"myEmoji":null},{"id":"cmozt9exg0085lo01c1g3q2i7","contextLength":65536,"prefillTokens":null,"batchSize":16,"ttftMs":null,"tokSOut":1048,"tokSPrefill":null,"tokSTotal":1048,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-05-10T13:30:50.452Z","notes":"Concurrency=16 aggregate throughput, sweet-spot config. 16 parallel chat completions, 8 distinct prompts cycled with per-request seeding to defeat prefix caching, temp=0.7, max_tokens=600 (avg actual=564). Wall 8.62s, total 9,029 tokens, no queueing (max-num-seqs=16), p95 ≈ p50, 0 KV preemptions. Continuous batching only — Eagle3 spec-decode tested and dropped (27% accept on NVFP4 target → -29% throughput). Per-stream ~69 t/s under load vs 128 t/s single-stream. KV cache FP8. EPYC 32c/64t host, 160 GB DDR5. Run 2026-05-09.","engineFlags":{"commandSnippet":"vllm serve /opt/models/gemma-4-26B-A4B-it-NVFP4 --served-model-name gemma-4-26b-a4b-it --host 0.0.0.0 --port 8000 --max-model-len 65536 --gpu-memory-utilization 0.90 --max-num-seqs 16","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"nvidia/Gemma-4-26B-A4B-NVFP4","displayName":"Gemma-4-26B-A4B-NVFP4","family":"Gemma","params":14,"isMoE":false,"baseModel":{"hfId":"google/gemma-4-26B-A4B-it","displayName":"gemma-4-26B-A4B-it"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX PRO 6000 Blackwell","gpuCount":1,"vramGb":96,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1","quantization":"NVFP4 (modelopt_fp4 W4A4)","backend":null},"user":{"id":"cmowc9fer00qdp101b007eug5","username":"murdarch","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RTX PRO 6000 Blackwell","hardwareGroupKey":"DISCRETE_GPU:rtx pro 6000 blackwell","rank":14,"reactionCounts":{},"myEmoji":null},{"id":"cmot9gh4r000bib04esjf3r77","contextLength":32768,"prefillTokens":null,"batchSize":16,"ttftMs":175.4,"tokSOut":991.1,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-05-05T23:29:50.523Z","notes":"gpt-oss-20b MXFP4 concurrent throughput at batch=16 (matches --max-num-seqs 16). Aggregate output tok/s, best of 3 runs (991.1 / 986.8 / 990.6 -- variance under 0.5%). Per-request decode 64.8 tok/s. TTFT 175ms. This is vLLM's strength zone vs llama.cpp single-stream: ~6x throughput when serving multiple concurrent users. batch=1 single-stream was 48 tok/s (separate submission) -- same hardware, llama.cpp Q8_0 single-stream was 160 tok/s.","engineFlags":{"commandSnippet":"LD_LIBRARY_PATH=~/.local/lib/rccl-7.1.1:$LD_LIBRARY_PATH HIP_VISIBLE_DEVICES=0,1 VLLM_TARGET_DEVICE=rocm VLLM_ROCM_USE_AITER=0 FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE vllm serve openai/gpt-oss-20b --tensor-parallel-size 2 --dtype bfloat16 --max-model-len 32768 --max-num-seqs 16 --max-num-batched-tokens 4096 --enable-chunked-prefill --enable-prefix-caching --gpu-memory-utilization 0.90 --moe-backend triton --reasoning-parser openai_gptoss --tool-call-parser openai","tensorParallel":2,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"openai/gpt-oss-20b","displayName":"gpt-oss-20b","family":"Gpt","params":22,"isMoE":true,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Radeon AI Pro R9700","gpuCount":2,"vramGb":64,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 9 5950X","os":"Arch Linux","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1+rocm721","quantization":"MXFP4_MOE","backend":"rocm"},"user":{"id":"cmorcd2jg0008l804hfkput08","username":"1337Hero","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"Radeon AI Pro R9700","hardwareGroupKey":"DISCRETE_GPU:radeon ai pro r9700","rank":15,"reactionCounts":{},"myEmoji":null},{"id":"cmojycegm0005jo04bvjn0bs6","contextLength":262144,"prefillTokens":null,"batchSize":10,"ttftMs":491,"tokSOut":878.38,"tokSPrefill":null,"tokSTotal":18412.81,"peakVramGb":535.8,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-04-29T11:08:49.078Z","notes":"Requires DeepGEMM installation after uv pip install vllm:\ngit clone --recursive https://github.com/deepseek-ai/DeepGEMM.git\ncd DeepGEMM\nDG_FORCE_BUILD=1 uv pip install --force-reinstall --no-build-isolation .\n\nBenchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 491ms, Mean: 855ms","engineFlags":{"commandSnippet":"vllm serve Qwen/Qwen3.5-122B-A10B-FP8 --served-model-name qwen3 --trust-remote-code --tensor-parallel-size 4 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --gpu-memory-utilization 0.95 --speculative-config {\"method\":\"qwen3_next_mtp\",\"num_speculative_tokens\":2}  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-122B-A10B-FP8","displayName":"Qwen3.5-122B-A10B-FP8","family":"Qwen","params":125,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-122B-A10B","displayName":"Qwen3.5-122B-A10B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 SXM","gpuCount":4,"vramGb":564,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev30+gf05f3664c","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"H200 SXM","hardwareGroupKey":"DISCRETE_GPU:h200 sxm","rank":16,"reactionCounts":{},"myEmoji":null},{"id":"cmr82nlmd005kqr01o87h22b9","contextLength":2048,"prefillTokens":null,"batchSize":64,"ttftMs":5520,"tokSOut":807.3,"tokSPrefill":6449.4,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-05T17:35:22.934Z","notes":"Full DGX/GB10 Agentic Arcade throughput queue. This queue includes the complete missing curve rows rather than silently culling single-stream, warmup-candidate, startup-outlier, or lowest-curve cells. qualityFlags=none. Imported from Agentic Arcade /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv row agents=64. Hardware/model fields inferred from build_site.py where not explicit; run API dry-run before public submit.","engineFlags":{"commandSnippet":"Agentic Arcade recorded throughput row from /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv at agents=64","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"nvidia/Gemma-4-26B-A4B-NVFP4","displayName":"Gemma-4-26B-A4B-NVFP4","family":"Gemma","params":14,"isMoE":false,"baseModel":{"hfId":"google/gemma-4-26B-A4B-it","displayName":"gemma-4-26B-A4B-it"}},"hardware":{"hwClass":"UNIFIED","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":"NVIDIA","chipFamily":"GB10","chipVariant":"GB10 Grace Blackwell","unifiedMemoryGb":128,"cpu":null,"os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"see Agentic Arcade source metadata","quantization":"NVFP4","backend":"cuda"},"user":{"id":"cmr5toaxx00fgq901xfezberz","username":"Frosty40","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"GB10 Grace Blackwell","hardwareGroupKey":"UNIFIED:gb10 grace blackwell","rank":17,"reactionCounts":{},"myEmoji":null},{"id":"cmq3jm75g000tlj01bx4frdf0","contextLength":32768,"prefillTokens":null,"batchSize":8,"ttftMs":2529.805063267122,"tokSOut":796.1763558639424,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-06-07T08:51:37.781Z","notes":"4x Intel Arc Pro B70 production Gemma 4 c8 XPU graph profile. Mean of five c8 runs, 119 prompt tokens and 256 generated tokens per request. No speculative decode; KV cache dtype auto; INT4 AutoRound W4A16 weights; bfloat16 runtime activation dtype. Raw: /mnt/fast-ai/bench-results/gemma4-12b-it-int4-autoround/prod-c8-xpugraph-256o-repeat-20260607T084540Z","engineFlags":{"commandSnippet":"vllm serve /mnt/fast-ai/llm-models/gemma4-12b-it-int4-autoround-intel --host 127.0.0.1 --port 18080 --trust-remote-code --served-model-name gemma4-12b-it-int4-autoround --dtype bfloat16 --tensor-parallel-size 4 --distributed-executor-backend mp --max-model-len 32768 --max-num-batched-tokens 4096 --max-num-seqs 8 --gpu-memory-utilization 0.95 --kv-cache-dtype auto --enable-prefix-caching --compilation-config '{\"use_inductor_graph_partition\":true,\"compile_sizes\":[1],\"cudagraph_mode\":\"PIECEWISE\"}' --limit-mm-per-prompt '{\"image\":4}'","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":"auto","attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Intel/gemma-4-12B-it-int4-AutoRound","displayName":"gemma-4-12B-it-int4-AutoRound","family":"Gemma","params":3,"isMoE":false,"baseModel":{"hfId":"google/gemma-4-12B-it","displayName":"gemma-4-12B-it"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Intel Arc Pro B70","gpuCount":4,"vramGb":128,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen Threadripper PRO 5955WX 16-Cores","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.2rc1.dev2+gc51df4300.d20260523","quantization":"INT4 AutoRound W4A16","backend":null},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"Intel Arc Pro B70","hardwareGroupKey":"DISCRETE_GPU:intel arc pro b70","rank":18,"reactionCounts":{},"myEmoji":null},{"id":"cmq3jm7cx000wlj01wm75wqmk","contextLength":32768,"prefillTokens":null,"batchSize":8,"ttftMs":2541.752395831281,"tokSOut":780.9684204492766,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-06-07T08:51:38.049Z","notes":"4x Intel Arc Pro B70 production Gemma 4 c8 XPU graph profile. Mean of three c8 runs, 119 prompt tokens and 512 generated tokens per request. No speculative decode; KV cache dtype auto; INT4 AutoRound W4A16 weights; bfloat16 runtime activation dtype. Raw: /mnt/fast-ai/bench-results/gemma4-12b-it-int4-autoround/prod-c8-xpugraph-512o-repeat-20260607T084633Z","engineFlags":{"commandSnippet":"vllm serve /mnt/fast-ai/llm-models/gemma4-12b-it-int4-autoround-intel --host 127.0.0.1 --port 18080 --trust-remote-code --served-model-name gemma4-12b-it-int4-autoround --dtype bfloat16 --tensor-parallel-size 4 --distributed-executor-backend mp --max-model-len 32768 --max-num-batched-tokens 4096 --max-num-seqs 8 --gpu-memory-utilization 0.95 --kv-cache-dtype auto --enable-prefix-caching --compilation-config '{\"use_inductor_graph_partition\":true,\"compile_sizes\":[1],\"cudagraph_mode\":\"PIECEWISE\"}' --limit-mm-per-prompt '{\"image\":4}'","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":"auto","attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Intel/gemma-4-12B-it-int4-AutoRound","displayName":"gemma-4-12B-it-int4-AutoRound","family":"Gemma","params":3,"isMoE":false,"baseModel":{"hfId":"google/gemma-4-12B-it","displayName":"gemma-4-12B-it"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Intel Arc Pro B70","gpuCount":4,"vramGb":128,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen Threadripper PRO 5955WX 16-Cores","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.2rc1.dev2+gc51df4300.d20260523","quantization":"INT4 AutoRound W4A16","backend":null},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"Intel Arc Pro B70","hardwareGroupKey":"DISCRETE_GPU:intel arc pro b70","rank":19,"reactionCounts":{},"myEmoji":null},{"id":"cmr0jekxh00gyld01diqiz8un","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":37.27733500272734,"tokSOut":778.3687967765543,"tokSPrefill":86759.55016328595,"tokSTotal":781.8062109438122,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-06-30T11:02:06.197Z","notes":null,"engineFlags":{"commandSnippet":"# Remote endpoint: http://localhost:8000  servedModel: Qwen3.5-0.8B","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-0.8B","displayName":"Qwen3.5-0.8B","family":"Qwen","params":0.8,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-0.8B-Base","displayName":"Qwen3.5-0.8B-Base"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX PRO 6000 Blackwell","gpuCount":1,"vramGb":96,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":null,"quantization":"BF16","backend":"cuda"},"user":{"id":"cmqa02fl0005uro01l7wm8up2","username":"fraserprice","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RTX PRO 6000 Blackwell","hardwareGroupKey":"DISCRETE_GPU:rtx pro 6000 blackwell","rank":20,"reactionCounts":{},"myEmoji":null},{"id":"cmr82nl6i005eqr01qfbmkwjc","contextLength":2048,"prefillTokens":null,"batchSize":32,"ttftMs":3350,"tokSOut":778.2,"tokSPrefill":6803,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-05T17:35:22.362Z","notes":"Full DGX/GB10 Agentic Arcade throughput queue. This queue includes the complete missing curve rows rather than silently culling single-stream, warmup-candidate, startup-outlier, or lowest-curve cells. qualityFlags=none. Imported from Agentic Arcade /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv row agents=32. Hardware/model fields inferred from build_site.py where not explicit; run API dry-run before public submit.","engineFlags":{"commandSnippet":"Agentic Arcade recorded throughput row from /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv at agents=32","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"nvidia/Gemma-4-26B-A4B-NVFP4","displayName":"Gemma-4-26B-A4B-NVFP4","family":"Gemma","params":14,"isMoE":false,"baseModel":{"hfId":"google/gemma-4-26B-A4B-it","displayName":"gemma-4-26B-A4B-it"}},"hardware":{"hwClass":"UNIFIED","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":"NVIDIA","chipFamily":"GB10","chipVariant":"GB10 Grace Blackwell","unifiedMemoryGb":128,"cpu":null,"os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"see Agentic Arcade source metadata","quantization":"NVFP4","backend":"cuda"},"user":{"id":"cmr5toaxx00fgq901xfezberz","username":"Frosty40","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"GB10 Grace Blackwell","hardwareGroupKey":"UNIFIED:gb10 grace blackwell","rank":21,"reactionCounts":{},"myEmoji":null},{"id":"cmr82nkz0005bqr01mrdinq3e","contextLength":2048,"prefillTokens":null,"batchSize":24,"ttftMs":2630,"tokSOut":743.1,"tokSPrefill":6507.1,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-05T17:35:22.092Z","notes":"Full DGX/GB10 Agentic Arcade throughput queue. This queue includes the complete missing curve rows rather than silently culling single-stream, warmup-candidate, startup-outlier, or lowest-curve cells. qualityFlags=none. Imported from Agentic Arcade /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv row agents=24. Hardware/model fields inferred from build_site.py where not explicit; run API dry-run before public submit.","engineFlags":{"commandSnippet":"Agentic Arcade recorded throughput row from /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv at agents=24","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"nvidia/Gemma-4-26B-A4B-NVFP4","displayName":"Gemma-4-26B-A4B-NVFP4","family":"Gemma","params":14,"isMoE":false,"baseModel":{"hfId":"google/gemma-4-26B-A4B-it","displayName":"gemma-4-26B-A4B-it"}},"hardware":{"hwClass":"UNIFIED","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":"NVIDIA","chipFamily":"GB10","chipVariant":"GB10 Grace Blackwell","unifiedMemoryGb":128,"cpu":null,"os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"see Agentic Arcade source metadata","quantization":"NVFP4","backend":"cuda"},"user":{"id":"cmr5toaxx00fgq901xfezberz","username":"Frosty40","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"GB10 Grace Blackwell","hardwareGroupKey":"UNIFIED:gb10 grace blackwell","rank":22,"reactionCounts":{},"myEmoji":null},{"id":"cmr82nlf3005hqr010prdkoa9","contextLength":2048,"prefillTokens":null,"batchSize":48,"ttftMs":5480,"tokSOut":741.2,"tokSPrefill":5903.3,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-05T17:35:22.671Z","notes":"Full DGX/GB10 Agentic Arcade throughput queue. This queue includes the complete missing curve rows rather than silently culling single-stream, warmup-candidate, startup-outlier, or lowest-curve cells. qualityFlags=none. Imported from Agentic Arcade /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv row agents=48. Hardware/model fields inferred from build_site.py where not explicit; run API dry-run before public submit.","engineFlags":{"commandSnippet":"Agentic Arcade recorded throughput row from /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv at agents=48","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"nvidia/Gemma-4-26B-A4B-NVFP4","displayName":"Gemma-4-26B-A4B-NVFP4","family":"Gemma","params":14,"isMoE":false,"baseModel":{"hfId":"google/gemma-4-26B-A4B-it","displayName":"gemma-4-26B-A4B-it"}},"hardware":{"hwClass":"UNIFIED","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":"NVIDIA","chipFamily":"GB10","chipVariant":"GB10 Grace Blackwell","unifiedMemoryGb":128,"cpu":null,"os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"see Agentic Arcade source metadata","quantization":"NVFP4","backend":"cuda"},"user":{"id":"cmr5toaxx00fgq901xfezberz","username":"Frosty40","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"GB10 Grace Blackwell","hardwareGroupKey":"UNIFIED:gb10 grace blackwell","rank":23,"reactionCounts":{},"myEmoji":null},{"id":"cmr1w30ik01lald01jc64n2to","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":666.67,"tokSPrefill":26185.13,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-01T09:44:47.708Z","notes":null,"engineFlags":{"commandSnippet":"","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"LiquidAI/LFM2.5-1.2B-Instruct-GGUF","displayName":"LFM2.5-1.2B-Instruct-GGUF","family":"Llama","params":1,"isMoE":false,"baseModel":{"hfId":"LiquidAI/LFM2.5-1.2B-Instruct","displayName":"LFM2.5-1.2B-Instruct"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 3090","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 7543","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":null,"quantization":"Q4_K_M","backend":null},"user":{"id":"cmr1thkb201jnld015tybcisv","username":"KruxOS","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RTX 3090","hardwareGroupKey":"DISCRETE_GPU:rtx 3090","rank":24,"reactionCounts":{"fire":1},"myEmoji":null},{"id":"cmrio40p201vmmj01h0qe8763","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":641.020677,"tokSPrefill":10291.352098,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T03:33:42.662Z","notes":"CPU-only build (Vulkan/CUDA/HIP disabled). 8 physical-core threads. One explicit throwaway plus llama-bench internal warmup; two measured repetitions. Headless Ubuntu, BIOS Silent mode, Core Turbo Boost disabled. Decode samples tok/s: 639.697, 642.344. GGUF-reported parameters: 164897280.","engineFlags":{"commandSnippet":"llama-bench -m TinyStories-33M-Q4_K_M.gguf -t 8 -p 512 -n 128 -r 2 -o json","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"roneneldan/TinyStories-33M","displayName":"TinyStories-33M","family":"Gpt","params":null,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21","quantization":"Q4_K_M","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":25,"reactionCounts":{},"myEmoji":null},{"id":"cmrkr69qs04qhmj01resc0j51","contextLength":512,"prefillTokens":null,"batchSize":24,"ttftMs":null,"tokSOut":637.43,"tokSPrefill":495.43,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-14T14:34:58.901Z","notes":"Warm steady-state CPU benchmark: 24 simultaneous HTTP requests, 24 server slots sharing eight physical CPU threads (CPUs 0-7), ubatch 1024, strict CPU and batch affinity enabled. Two 24-request decode batches measured 640.03 and 634.83 aggregate tok/s; reported midpoint 637.43 tok/s. Matching short-prompt prefill was measured separately at 495.43 tok/s over 232 prompt tokens. One warm-up batch was discarded. q8_0 K/V cache; no GPU offload or SMT.","engineFlags":{"commandSnippet":"taskset -c 0-7 llama-server -m LFM2-350M-Q2_K_S-imatrix.gguf -c 512 -np 24 -t 8 -tb 8 -b 4096 -ub 1024 --poll 100 --cpu-range 0-7 --cpu-strict 1 --cpu-range-batch 0-7 --cpu-strict-batch 1 -ctk q8_0 -ctv q8_0 -fa auto","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":"q8_0","attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"LiquidAI/LFM2-350M","displayName":"LFM2-350M","family":null,"params":0,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21, AVX2 -Ofast Zen4-tuned no-repack server","quantization":"Q2_K_S","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":26,"reactionCounts":{},"myEmoji":null},{"id":"cmrkqte6f04n7mj01vpjcpk3j","contextLength":512,"prefillTokens":null,"batchSize":24,"ttftMs":null,"tokSOut":629.54,"tokSPrefill":495.43,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-14T14:24:58.120Z","notes":"Warm steady-state CPU benchmark: 24 simultaneous HTTP requests, 24 server slots sharing eight physical CPU threads (CPUs 0-7), ubatch 1024. Two 24-request decode batches measured 637.32 and 621.76 aggregate tok/s; reported midpoint 629.54 tok/s. Matching short-prompt prefill was measured separately at 495.43 tok/s over 232 prompt tokens. One warm-up batch was discarded. q8_0 K/V cache; no GPU offload or SMT. Throughput plateaued at 32 slots (636.64 tok/s), so 24 slots is the lower-memory representative.","engineFlags":{"commandSnippet":"taskset -c 0-7 llama-server -m LFM2-350M-Q2_K_S-imatrix.gguf -c 512 -np 24 -t 8 -tb 8 -b 4096 -ub 1024 --poll 100 -ctk q8_0 -ctv q8_0 -fa auto","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":"q8_0","attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"LiquidAI/LFM2-350M","displayName":"LFM2-350M","family":null,"params":0,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21, AVX2 -Ofast Zen4-tuned no-repack server","quantization":"Q2_K_S","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":27,"reactionCounts":{},"myEmoji":null},{"id":"cmqdzleqx00scmp011atncrcw","contextLength":10000,"prefillTokens":null,"batchSize":32,"ttftMs":4557.7,"tokSOut":609.39,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":86.5,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-06-14T16:16:36.585Z","notes":"Aggregate end-to-end throughput across N concurrent streams (256 output tokens each, ignore_eos, temp 0, fixed ~44-tok prompt). tokSOut = total output tokens / wall-clock (includes prefill); ttftMs = median first-token latency. vLLM gpu-mem-util 0.75 on GB10 unified memory. Part of a concurrency sweep.","engineFlags":{"commandSnippet":"","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4","displayName":"Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4","family":"Opt","params":33,"isMoE":true,"baseModel":{"hfId":"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16","displayName":"Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16"}},"hardware":{"hwClass":"UNIFIED","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":"NVIDIA","chipFamily":"GB10","chipVariant":"GB10 Grace Blackwell","unifiedMemoryGb":128,"cpu":null,"os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.22.1rc1.dev504+g54bbf5166","quantization":"NVFP4","backend":"cuda"},"user":{"id":"cmofur0mv0004l404pqpmktma","username":"briancaffey","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"GB10 Grace Blackwell","hardwareGroupKey":"UNIFIED:gb10 grace blackwell","rank":28,"reactionCounts":{},"myEmoji":null},{"id":"cmrkqgiwy04momj01d0o062lw","contextLength":512,"prefillTokens":null,"batchSize":8,"ttftMs":null,"tokSOut":594.48,"tokSPrefill":495.43,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-14T14:14:57.730Z","notes":"Warm steady-state CPU benchmark: eight simultaneous HTTP requests, eight server slots sharing eight physical CPU threads (CPUs 0-7), ubatch 1024. Two 16-request decode batches measured 599.07 and 584.46 aggregate tok/s; two 8-request batches measured 591.41 and 589.89, reported midpoint 594.48 tok/s. Matching eight-request short-prompt prefill measured 495.43 tok/s over 232 prompt tokens. One warm-up batch was discarded. q8_0 K/V cache; no GPU offload or SMT.","engineFlags":{"commandSnippet":"taskset -c 0-7 llama-server -m LFM2-350M-Q2_K_S-imatrix.gguf -c 512 -np 8 -t 8 -tb 8 -b 4096 -ub 1024 --poll 100 -ctk q8_0 -ctv q8_0 -fa auto","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":"q8_0","attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"LiquidAI/LFM2-350M","displayName":"LFM2-350M","family":null,"params":0,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21, AVX2 -Ofast Zen4-tuned no-repack server","quantization":"Q2_K_S","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":29,"reactionCounts":{},"myEmoji":null},{"id":"cmqfm5f8300cimt01khpaapre","contextLength":2048,"prefillTokens":512,"batchSize":2048,"ttftMs":null,"tokSOut":590.09,"tokSPrefill":29161.42,"tokSTotal":2729.52,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-06-15T19:35:48.052Z","notes":"llama.cpp b9653 CUDA backend on Windows 10, RTX 3060 Ti 8GB. User requested LiquidAI/LFM2.5-350M; llama.cpp requires GGUF, so this run used LiquidAI/LFM2.5-350M-GGUF with Q4_K_M quantization. All layers offloaded with -ngl 99. Benchmark used -p 512 -n 128 -r 10 and llama-bench defaults unless specified: context 2048, batch 2048, ubatch 512, flash-attn auto, mmap enabled. TTFT and peak VRAM were not measured. Initial Hugging Face cache symlink warning on Windows was harmless; inference completed normally with CUDA.","engineFlags":{"commandSnippet":"llama-bench -hf LiquidAI/LFM2.5-350M-GGUF:Q4_K_M -ngl 99 -p 512 -n 128 -r 10 -o md","tensorParallel":null,"gpuLayers":99,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"LiquidAI/LFM2.5-350M-GGUF","displayName":"LFM2.5-350M-GGUF","family":"Llama","params":null,"isMoE":false,"baseModel":{"hfId":"LiquidAI/LFM2.5-350M","displayName":"LFM2.5-350M"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 3060 Ti","gpuCount":1,"vramGb":8,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"13th Gen Intel(R) Core(TM) i5-13600KF","os":"Windows 10 Pro 10.0.19045","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"b9653 / 9dbc6621a","quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmqff2flx007kmt01s0p5nkmn","username":"PinoCookie","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RTX 3060 Ti","hardwareGroupKey":"DISCRETE_GPU:rtx 3060 ti","rank":30,"reactionCounts":{},"myEmoji":null},{"id":"cmpdzs3fq00s8pd01aobvkld4","contextLength":4096,"prefillTokens":null,"batchSize":1,"ttftMs":82.93,"tokSOut":576.85,"tokSPrefill":754.54,"tokSTotal":null,"peakVramGb":7.43,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-05-20T11:42:06.183Z","notes":"hipfire @ 4840f0b6 (master, post-sync 2026-05-20)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\ndecode mode: DFlash speculative\n  drafter: qwen35-9b-dflash-mq4.hf4 (z-lab Qwen3.5-DFlash native head, MQ4 weights)\n  target: qwen3.5-9b.mq4\nkv_cache: q8\nprompt_normalize: true (default)\nτ: 13.1818   accept_rate: 0.8788  (byte-identical to canonical bench)\noutput: 256 tokens emitted, coherent merge_sort code\nruns: 3 (median reported); per-run tok/s: [577.48, 572.66, 576.85], spread 0.84%\nAR baseline (same binary same hardware, --ar-baseline): 122.88 tok/s — DFlash speedup 4.70x\nHardware: Sapphire Nitro+ RX 7900 XTX, ROCm 7.2.x, k9lin host\nRefresh of existing leaderboard row 2 (575.24 tok/s on 0.1.20-alpha+71896daa) — parity with current master.","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo --target ~/.hipfire/models/qwen3.5-9b.mq4 --draft ~/.hipfire/models/qwen35-9b-dflash-mq4.hf4 --prompt-file benchmarks/prompts/merge_sort_thinking_off.txt --max 256 --temp 0.0 --no-chatml --kv-mode q8 --ctx 4096","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":"q8","attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-9B","displayName":"Qwen3.5-9B","family":"Qwen","params":9,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-9B-Base","displayName":"Qwen3.5-9B-Base"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RX 7900 XTX","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.2.0+4840f0b6","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RX 7900 XTX","hardwareGroupKey":"DISCRETE_GPU:rx 7900 xtx","rank":31,"reactionCounts":{},"myEmoji":null},{"id":"cmofzk74w0002ij042txvvkx5","contextLength":4096,"prefillTokens":null,"batchSize":1,"ttftMs":75.53,"tokSOut":575.25,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":8.07421875,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-04-26T16:31:47.744Z","notes":"hipfire @ e659452 (master post-PR #51 + #52 series)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\n  chatml-wrapped + explicit empty <think></think> for thinking-off\n  prompt file: benchmarks/prompts/merge_sort_thinking_off.txt\nruns: 3 (median reported); range 571.2–576.0\n  per-run tok/s: [571.23, 575.25, 576.04]\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\nτ (median): 13.077\naccept_rate (median): 0.872\nprefill: 31.1ms (868.3 tok/s)\nttft (excl warmup): 75.5ms = prefill + first cycle\nvram: 8268 MB used / 24560 MB total\nnatural EOS at 184 tokens — production-shape bounded code (no loop)","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo --target qwen3.5-9b.mq4 --draft qwen35-9b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/merge_sort_thinking_off.txt) --max 256 --no-chatml --kv-mode asym3","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-9B","displayName":"Qwen3.5-9B","family":"Qwen","params":9,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-9B-Base","displayName":"Qwen3.5-9B-Base"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RX 7900 XTX","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.1.8-alpha+e659452","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RX 7900 XTX","hardwareGroupKey":"DISCRETE_GPU:rx 7900 xtx","rank":32,"reactionCounts":{},"myEmoji":null},{"id":"cmp8fqdz200ypo4014i9empwm","contextLength":4096,"prefillTokens":null,"batchSize":1,"ttftMs":78.93,"tokSOut":575.24,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":7.95,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-05-16T14:22:03.326Z","notes":"hipfire @ 71896daa (feat/tbq, sFWHT KV family + multi-GPU)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\n  benchmarks/prompts/merge_sort_thinking_off.txt (27 input tokens)\ndecode mode: DFlash speculative (adaptive block_size 8..16, mean B=16)\n  drafter: qwen35-9b-dflash-mq4.hf4 (z-lab Qwen3.5-DFlash native head, MQ4 weights)\n  target: qwen3.5-9b.mq4 (MQ4 weights, KV q8 filtered: 16/64 layers carry KV)\nkv_cache: q8 — chosen as winner from {q8,asym3,fwht3,fwht4} sweep (τ identical across all modes at this prompt; q8 has lowest KV-write overhead)\nprompt_normalize: true (default since 2026-04-26)\nτ: 13.1818   accept_rate: 0.8788\noutput: 256 tokens emitted\nAR baseline (same binary same hardware, --ar-baseline): 123.11 tok/s — DFlash speedup 4.67x\nHardware: Sapphire Nitro+ RX 7900 XTX, ROCm 7.2.x, k9lin host\nGPU coordination: gpu-tcas exclusive lease per cell","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo --target ~/.hipfire/models/qwen3.5-9b.mq4 --draft ~/.hipfire/models/qwen35-9b-dflash-mq4.hf4 --prompt-file benchmarks/prompts/merge_sort_thinking_off.txt --max 256 --temp 0.0 --no-chatml --kv-mode q8 --ctx 4096","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-9B","displayName":"Qwen3.5-9B","family":"Qwen","params":9,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-9B-Base","displayName":"Qwen3.5-9B-Base"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RX 7900 XTX","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.1.20-alpha+71896daa","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RX 7900 XTX","hardwareGroupKey":"DISCRETE_GPU:rx 7900 xtx","rank":33,"reactionCounts":{},"myEmoji":null},{"id":"cmr2uuv7706mypi01o7epgvid","contextLength":16384,"prefillTokens":null,"batchSize":128,"ttftMs":1958.56,"tokSOut":540.872,"tokSPrefill":null,"tokSTotal":1664.871,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-02T01:58:14.132Z","notes":"GLM-4.5-Air AWQ-4bit compressed-tensors; sweep C_seqs128 (pp3 seqs=128 conc=128 ctx=16384)","engineFlags":{"commandSnippet":"","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"cyankiwi/GLM-4.5-Air-AWQ-4bit","displayName":"GLM-4.5-Air-AWQ-4bit","family":null,"params":110,"isMoE":true,"baseModel":{"hfId":"zai-org/GLM-4.5-Air","displayName":"GLM-4.5-Air"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Radeon AI Pro R9700","gpuCount":3,"vramGb":96,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 9 5950X","os":"Arch Linux","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.2+rocm723","quantization":"AWQ-4bit","backend":"rocm"},"user":{"id":"cmorcd2jg0008l804hfkput08","username":"1337Hero","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"Radeon AI Pro R9700","hardwareGroupKey":"DISCRETE_GPU:radeon ai pro r9700","rank":34,"reactionCounts":{},"myEmoji":null},{"id":"cmr37xws306stpi01f5g70cvn","contextLength":8192,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":528.7,"tokSPrefill":null,"tokSTotal":599,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-02T08:04:31.155Z","notes":"Remote vLLM DiffusionGemma benchmark against localhost endpoint using served alias diffusiongemma-page-vlm. vLLM process was observed using the NVIDIA RTX PRO 6000 Blackwell. No-stream request-window throughput is the meaningful figure for this block-diffusion endpoint; streamed decode timing is not comparable because output arrives in canvas-sized chunks.","engineFlags":{"commandSnippet":"vllm serve ../models/diffusiongemma/26b-a4b-it --served-model-name diffusiongemma-page-vlm --host 127.0.0.1 --port 8081 --dtype auto --max-model-len 8192 --max-num-seqs 4 --gpu-memory-utilization 0.70 --generation-config vllm --hf-overrides {\"diffusion_sampler\":\"entropy_bound\",\"diffusion_entropy_bound\":0.1} --diffusion-config {\"canvas_length\":256} --limit-mm-per-prompt {\"image\":1,\"audio\":0} --mm-processor-kwargs {\"max_soft_tokens\":560} --enable-chunked-prefill --max-num-batched-tokens 32768 --kv-cache-dtype fp8 --kv-cache-memory-bytes 10G --attention-backend TRITON_ATTN --moe-backend triton","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":"fp8","attentionBackend":"TRITON_ATTN","flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"google/diffusiongemma-26B-A4B-it","displayName":"diffusiongemma-26B-A4B-it","family":"Gemma","params":26,"isMoE":true,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX PRO 6000 Blackwell","gpuCount":1,"vramGb":95.6,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 9 9950X 16-Core Processor","os":"Linux","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.1.dev1+ga275a28e0","quantization":"bf16","backend":"cuda"},"user":{"id":"cmqmzrvvi005yqo010lpqildo","username":"nck","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RTX PRO 6000 Blackwell","hardwareGroupKey":"DISCRETE_GPU:rtx pro 6000 blackwell","rank":35,"reactionCounts":{},"myEmoji":null},{"id":"cmopmgspw0002jv041yjbb6sq","contextLength":8192,"prefillTokens":null,"batchSize":1,"ttftMs":113.4,"tokSOut":506.2,"tokSPrefill":null,"tokSTotal":504,"peakVramGb":33.6,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-05-03T10:22:55.845Z","notes":"Campaign champion — graph_partition + FAP compile (no max-autotune) on SM120 Blackwell. 2.14x over pre-compile baseline. BFCL 73.33% (22/30). DFlash speculative decoding N=4 + CUDA graphs + greedy (temp=0). 20/20 tool-call chain completed, 100% tool_call_valid_rate. Warmup=10 discarded runs before measurement.","engineFlags":{"commandSnippet":"vllm serve mmangkad/Qwen3.6-35B-A3B-NVFP4 --quantization nvfp4 --max-model-len 8192 --max-num-seqs 1 --gpu-memory-utilization 0.35 --kv-cache-dtype bfloat16 --tool-call-parser qwen3_coder --enable-auto-tool-choice --speculative-config {\"method\":\"dflash\",\"num_speculative_tokens\":4} --compilation-config {\"level\":2,\"inductor_compile_config\":{\"use_inductor_graph_partition\":true}}","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"bfloat16","attentionBackend":null,"flashAttn":true,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"mmangkad/Qwen3.6-35B-A3B-NVFP4","displayName":"Qwen3.6-35B-A3B-NVFP4","family":"Qwen","params":35,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.6-35B-A3B","displayName":"Qwen3.6-35B-A3B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX PRO 6000 Blackwell","gpuCount":1,"vramGb":96,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.19.x-pr40898-cu130","quantization":"NVFP4","backend":null},"user":{"id":"cmonzzbj40000jx04er7559mm","username":"zhu","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RTX PRO 6000 Blackwell","hardwareGroupKey":"DISCRETE_GPU:rtx pro 6000 blackwell","rank":36,"reactionCounts":{},"myEmoji":null},{"id":"cmojz7fb9000ljo04ia5qjriu","contextLength":200000,"prefillTokens":null,"batchSize":10,"ttftMs":368,"tokSOut":503.74,"tokSPrefill":null,"tokSTotal":9780.83,"peakVramGb":535.8,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-04-29T11:32:56.517Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 368ms, Mean: 626ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2.5 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 4 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.95  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2.5","displayName":"MiniMax-M2.5","family":"Minimax","params":229,"isMoE":true,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 SXM","gpuCount":4,"vramGb":564,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev30+gf05f3664c","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"H200 SXM","hardwareGroupKey":"DISCRETE_GPU:h200 sxm","rank":37,"reactionCounts":{},"myEmoji":null},{"id":"cmojyvep9000ejo0413hg8qg3","contextLength":200000,"prefillTokens":null,"batchSize":7,"ttftMs":417,"tokSOut":499.2,"tokSPrefill":null,"tokSTotal":9398.1,"peakVramGb":535.8,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-04-29T11:23:35.853Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 417ms, Mean: 587ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2.1 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 4 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.95  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2.1","displayName":"MiniMax-M2.1","family":"Minimax","params":229,"isMoE":true,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 SXM","gpuCount":4,"vramGb":564,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev30+gf05f3664c","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"H200 SXM","hardwareGroupKey":"DISCRETE_GPU:h200 sxm","rank":38,"reactionCounts":{},"myEmoji":null},{"id":"cmojzdzz3000jkw049rbli91e","contextLength":200000,"prefillTokens":null,"batchSize":10,"ttftMs":349,"tokSOut":495.88,"tokSPrefill":null,"tokSTotal":8389.38,"peakVramGb":535.8,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-04-29T11:38:03.232Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 349ms, Mean: 933ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2.7 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 4 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.95  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2.7","displayName":"MiniMax-M2.7","family":"Minimax","params":229,"isMoE":true,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 SXM","gpuCount":4,"vramGb":564,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev30+gf05f3664c","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"H200 SXM","hardwareGroupKey":"DISCRETE_GPU:h200 sxm","rank":39,"reactionCounts":{},"myEmoji":null},{"id":"cmr82nks40058qr01ywdd05w6","contextLength":2048,"prefillTokens":null,"batchSize":16,"ttftMs":1780,"tokSOut":495.8,"tokSPrefill":6383.4,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-05T17:35:21.844Z","notes":"Full DGX/GB10 Agentic Arcade throughput queue. This queue includes the complete missing curve rows rather than silently culling single-stream, warmup-candidate, startup-outlier, or lowest-curve cells. qualityFlags=none. Imported from Agentic Arcade /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv row agents=16. Hardware/model fields inferred from build_site.py where not explicit; run API dry-run before public submit.","engineFlags":{"commandSnippet":"Agentic Arcade recorded throughput row from /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv at agents=16","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"nvidia/Gemma-4-26B-A4B-NVFP4","displayName":"Gemma-4-26B-A4B-NVFP4","family":"Gemma","params":14,"isMoE":false,"baseModel":{"hfId":"google/gemma-4-26B-A4B-it","displayName":"gemma-4-26B-A4B-it"}},"hardware":{"hwClass":"UNIFIED","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":"NVIDIA","chipFamily":"GB10","chipVariant":"GB10 Grace Blackwell","unifiedMemoryGb":128,"cpu":null,"os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"see Agentic Arcade source metadata","quantization":"NVFP4","backend":"cuda"},"user":{"id":"cmr5toaxx00fgq901xfezberz","username":"Frosty40","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"GB10 Grace Blackwell","hardwareGroupKey":"UNIFIED:gb10 grace blackwell","rank":40,"reactionCounts":{},"myEmoji":null},{"id":"cmriompce01y4mj018fva83y9","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":495.020584,"tokSPrefill":2513.730917,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T03:48:14.414Z","notes":"CPU-only llama.cpp build (GPU backends disabled), 8 physical-core threads. One throwaway run plus llama-bench internal warmup; two measured repetitions. Headless Ubuntu; BIOS Silent mode; Core Turbo Boost disabled. Decode samples tok/s: 496.331, 493.71. GGUF-reported parameters: 91131072. GGUF source: tiiuae/Falcon-H1-Tiny-90M-Instruct-GGUF.","engineFlags":{"commandSnippet":"llama-bench -m Falcon-H1-Tiny-90M-Instruct-Q2_K.gguf -t 8 -p 512 -n 128 -r 2 -o json","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"tiiuae/Falcon-H1-Tiny-90M-Instruct","displayName":"Falcon-H1-Tiny-90M-Instruct","family":"Falcon","params":0,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21","quantization":"Q2_K","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":41,"reactionCounts":{},"myEmoji":null},{"id":"cmojy2n700004ju04dvkn02xg","contextLength":200000,"prefillTokens":null,"batchSize":10,"ttftMs":390,"tokSOut":492.6,"tokSPrefill":null,"tokSTotal":9496.63,"peakVramGb":535.8,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-04-29T11:01:13.837Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 390ms, Mean: 718ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 4 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.95  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2","displayName":"MiniMax-M2","family":"Minimax","params":229,"isMoE":true,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 SXM","gpuCount":4,"vramGb":564,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev30+gf05f3664c","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"H200 SXM","hardwareGroupKey":"DISCRETE_GPU:h200 sxm","rank":42,"reactionCounts":{},"myEmoji":null},{"id":"cmr82nkkv0055qr011ulhk4vd","contextLength":2048,"prefillTokens":null,"batchSize":8,"ttftMs":880,"tokSOut":482.3,"tokSPrefill":6432.1,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-05T17:35:21.584Z","notes":"Full DGX/GB10 Agentic Arcade throughput queue. This queue includes the complete missing curve rows rather than silently culling single-stream, warmup-candidate, startup-outlier, or lowest-curve cells. qualityFlags=none. Imported from Agentic Arcade /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv row agents=8. Hardware/model fields inferred from build_site.py where not explicit; run API dry-run before public submit.","engineFlags":{"commandSnippet":"Agentic Arcade recorded throughput row from /home/frosty40/agentic-arcade/data/results_throughput_gemma.csv at agents=8","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"nvidia/Gemma-4-26B-A4B-NVFP4","displayName":"Gemma-4-26B-A4B-NVFP4","family":"Gemma","params":14,"isMoE":false,"baseModel":{"hfId":"google/gemma-4-26B-A4B-it","displayName":"gemma-4-26B-A4B-it"}},"hardware":{"hwClass":"UNIFIED","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":"NVIDIA","chipFamily":"GB10","chipVariant":"GB10 Grace Blackwell","unifiedMemoryGb":128,"cpu":null,"os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"see Agentic Arcade source metadata","quantization":"NVFP4","backend":"cuda"},"user":{"id":"cmr5toaxx00fgq901xfezberz","username":"Frosty40","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"GB10 Grace Blackwell","hardwareGroupKey":"UNIFIED:gb10 grace blackwell","rank":43,"reactionCounts":{},"myEmoji":null},{"id":"cmr809zum002xqr01offbugjd","contextLength":2048,"prefillTokens":null,"batchSize":16,"ttftMs":null,"tokSOut":466.1,"tokSPrefill":122.7,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-05T16:28:48.958Z","notes":"Full DGX/GB10 Agentic Arcade throughput queue. This queue includes the complete missing curve rows rather than silently culling single-stream, warmup-candidate, startup-outlier, or lowest-curve cells. qualityFlags=none. Imported from Agentic Arcade /home/frosty40/agentic-arcade/data/results_throughput_aeon_ornith.csv row agents=16. Hardware/model fields inferred from build_site.py where not explicit; run API dry-run before public submit.","engineFlags":{"commandSnippet":"Agentic Arcade recorded throughput row from /home/frosty40/agentic-arcade/data/results_throughput_aeon_ornith.csv at agents=16","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"AEON-7/Ornith-1.0-35B-AEON-Ultimate-Uncensored-NVFP4","displayName":"Ornith-1.0-35B-AEON-Ultimate-Uncensored-NVFP4","family":"Qwen","params":0,"isMoE":true,"baseModel":{"hfId":"AEON-7/Ornith-1.0-35B-AEON-Ultimate-Uncensored-BF16","displayName":"Ornith-1.0-35B-AEON-Ultimate-Uncensored-BF16"}},"hardware":{"hwClass":"UNIFIED","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":"NVIDIA","chipFamily":"GB10","chipVariant":"GB10 Grace Blackwell","unifiedMemoryGb":128,"cpu":null,"os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"see Agentic Arcade source metadata","quantization":"NVFP4","backend":"cuda"},"user":{"id":"cmr5toaxx00fgq901xfezberz","username":"Frosty40","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"GB10 Grace Blackwell","hardwareGroupKey":"UNIFIED:gb10 grace blackwell","rank":44,"reactionCounts":{},"myEmoji":null},{"id":"cmqzbtn8902hsoe012ipenqyw","contextLength":4096,"prefillTokens":null,"batchSize":1,"ttftMs":14.3,"tokSOut":448.11,"tokSPrefill":35896.7,"tokSTotal":2134,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-06-29T14:42:05.913Z","notes":null,"engineFlags":{"commandSnippet":"/home/maxious/llama-master/build-cuda/bin/llama-bench -m /run/media/maxious/6C4A1C714A1C39F2/koboldcpp/Llama-3.2-1B.Q8_0.gguf -p 512 -n 128 -fa 1","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"meta-llama/Llama-3.2-1B-Instruct","displayName":"Llama-3.2-1B-Instruct","family":"Llama","params":1,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 5080","gpuCount":1,"vramGb":16,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 9800X3D 8-Core Processor","os":"Ubuntu 26.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":null,"quantization":"Q8_0","backend":"cuda"},"user":{"id":"cmqz5ajcj024roe01nncjtqug","username":"maxiosu","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RTX 5080","hardwareGroupKey":"DISCRETE_GPU:rtx 5080","rank":45,"reactionCounts":{},"myEmoji":null},{"id":"cmrio8pcf01wamj0114vo2fz6","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":440.214221,"tokSPrefill":2306.808034,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T03:37:21.231Z","notes":"CPU quantization/thread sweep. CPU-only build; 8 physical-core threads; llama-bench warmup and two measured repetitions. Headless; BIOS Silent mode; Core Turbo Boost disabled. Decode samples tok/s: 439.206, 441.222. GGUF-reported parameters: 134515008.","engineFlags":{"commandSnippet":"llama-bench -m SmolLM2-135M-Instruct-Q2_K.gguf -t 8 -p 512 -n 128 -r 2 -o json","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"HuggingFaceTB/SmolLM2-135M-Instruct","displayName":"SmolLM2-135M-Instruct","family":"Llama","params":0,"isMoE":false,"baseModel":{"hfId":"HuggingFaceTB/SmolLM2-135M","displayName":"SmolLM2-135M"}},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21","quantization":"Q2_K","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":46,"reactionCounts":{},"myEmoji":null},{"id":"cmrip121g01zwmj01b5dqib2b","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":433.850605,"tokSPrefill":3374.909726,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T03:59:24.052Z","notes":"CPU-only llama.cpp build (GPU backends disabled), 8 physical-core threads. One throwaway run plus llama-bench internal warmup; two measured repetitions. Headless Ubuntu; BIOS Silent mode; Core Turbo Boost disabled. Decode samples tok/s: 432.884, 434.817. GGUF-reported parameters: 107908288. GGUF source: tiiuae/Falcon-H1-Tiny-Tool-Calling-90M-GGUF.","engineFlags":{"commandSnippet":"llama-bench -m Falcon-H1-Tiny-90M-Tool-Calling-Q4_K_M.gguf -t 8 -p 512 -n 128 -r 2 -o json","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"tiiuae/Falcon-H1-Tiny-Tool-Calling-90M","displayName":"Falcon-H1-Tiny-Tool-Calling-90M","family":"Falcon","params":null,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21","quantization":"Q4_K_M","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":47,"reactionCounts":{},"myEmoji":null},{"id":"cmrj7jhha029lmj01wqw9b6t8","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":26.4,"tokSOut":427.87,"tokSPrefill":19387.67,"tokSTotal":1965.8,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T12:37:36.958Z","notes":null,"engineFlags":{"commandSnippet":"/usr/local/bin/llama-bench -m /hf_models/hub/models--bartowski--Qwen_Qwen3.5-0.8B-GGUF/snapshots/f36b1ea49a332ede8fe5f389bbf5b3575ef71f48/Qwen_Qwen3.5-0.8B-Q4_K_M.gguf -p 512 -n 128","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"bartowski/Qwen_Qwen3.5-0.8B-GGUF","displayName":"Qwen_Qwen3.5-0.8B-GGUF","family":"Qwen","params":0.8,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-0.8B","displayName":"Qwen3.5-0.8B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Multi GPU","gpuCount":2,"vramGb":28,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 9 7900X3D 12-Core Processor","os":"linux","isHeterogeneousGpu":true,"gpuSlots":[{"gpuName":"RTX 4070 Ti","count":1,"vramGb":12},{"gpuName":"RTX 5080","count":1,"vramGb":16}]},"engine":{"engineName":"llama.cpp","engineVersion":null,"quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmrj4k2t7027zmj01skta8ey0","username":"DevMozg","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"RTX 4070 Ti + RTX 5080","hardwareGroupKey":"DISCRETE_GPU:rtx 4070 ti + rtx 5080","rank":48,"reactionCounts":{},"myEmoji":null},{"id":"cmrip0zxz01zrmj01ssd39obw","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":425.679366,"tokSPrefill":3277.468737,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T03:59:21.335Z","notes":"CPU-only llama.cpp build (GPU backends disabled), 8 physical-core threads. One throwaway run plus llama-bench internal warmup; two measured repetitions. Headless Ubuntu; BIOS Silent mode; Core Turbo Boost disabled. Decode samples tok/s: 421.693, 429.666. GGUF-reported parameters: 107908288. GGUF source: tiiuae/Falcon-H1-Tiny-Coder-90M-GGUF.","engineFlags":{"commandSnippet":"llama-bench -m Falcon-H1-Tiny-90M-Coder-Q4_K_M.gguf -t 8 -p 512 -n 128 -r 2 -o json","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"tiiuae/Falcon-H1-Tiny-Coder-90M","displayName":"Falcon-H1-Tiny-Coder-90M","family":"Falcon","params":0,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21","quantization":"Q4_K_M","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":49,"reactionCounts":{},"myEmoji":null},{"id":"cmriomrc701y7mj01omh1mc6n","contextLength":2048,"prefillTokens":null,"batchSize":1,"ttftMs":null,"tokSOut":424.506271,"tokSPrefill":3495.076941,"tokSTotal":null,"peakVramGb":null,"gpuPowerWatts":[],"totalPowerWatts":null,"hardwareCost":null,"createdAt":"2026-07-13T03:48:16.999Z","notes":"CPU-only llama.cpp build (GPU backends disabled), 8 physical-core threads. One throwaway run plus llama-bench internal warmup; two measured repetitions. Headless Ubuntu; BIOS Silent mode; Core Turbo Boost disabled. Decode samples tok/s: 424.399, 424.614. GGUF-reported parameters: 91131072. GGUF source: tiiuae/Falcon-H1-Tiny-90M-Instruct-GGUF.","engineFlags":{"commandSnippet":"llama-bench -m Falcon-H1-Tiny-90M-Instruct-Q4_K_M.gguf -t 8 -p 512 -n 128 -r 2 -o json","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"tiiuae/Falcon-H1-Tiny-90M-Instruct","displayName":"Falcon-H1-Tiny-90M-Instruct","family":"Falcon","params":0,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"CPU_ONLY","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 7 7840HS","os":"Ubuntu 24.04.4 LTS","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"commit 6b4dc21","quantization":"Q4_K_M","backend":"cpu"},"user":{"id":"cmoext1fr0000jl04skh3hjqf","username":"steveseguin","verified":false,"verifiedAt":null,"pro":false},"hardwareGroupLabel":"AMD Ryzen 7 7840HS","hardwareGroupKey":"CPU_ONLY:amd ryzen 7 7840hs","rank":50,"reactionCounts":{},"myEmoji":null}],"total":2992,"limit":50,"offset":0}