{"rows":[{"id":"cmokhzrb40005ld0493vy3ltd","contextLength":262144,"batchSize":10,"ttftMs":143,"tokSOut":2665.14,"tokSPrefill":null,"tokSTotal":55846.85,"peakVramGb":133,"createdAt":"2026-04-29T20:18:51.521Z","notes":"Requires DeepGEMM installation after uv pip install vllm:\ngit clone --recursive https://github.com/deepseek-ai/DeepGEMM.git\ncd DeepGEMM\nDG_FORCE_BUILD=1 uv pip install --force-reinstall --no-build-isolation .\n\nBenchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 143ms, Mean: 340ms","engineFlags":{"commandSnippet":"vllm serve Qwen/Qwen3.5-0.8B-Base --served-model-name qwen3 --trust-remote-code --tensor-parallel-size 1 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --gpu-memory-utilization 0.95 --speculative-config '{\"method\":\"qwen3_next_mtp\",\"num_speculative_tokens\":2}'  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":1,"gpuLayers":4,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-0.8B-Base","displayName":"Qwen3.5-0.8B-Base","family":"Qwen","params":1,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 NVL","gpuCount":1,"vramGb":141,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9175F","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev74+gfaab18955","quantization":"BF16","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null},"hardwareGroupLabel":"H200 NVL","hardwareGroupKey":"DISCRETE_GPU:h200 nvl","rank":1,"reactionCounts":{},"myEmoji":null},{"id":"cmp7fqvp7006jo401h82kkyfb","contextLength":32768,"batchSize":64,"ttftMs":270.64,"tokSOut":1405.394,"tokSPrefill":null,"tokSTotal":4407.469,"peakVramGb":null,"createdAt":"2026-05-15T21:34:40.123Z","notes":"Qwen2.5-7B BF16 vLLM ROCm source build. TP=1, max-seqs=128, batch-tokens=32768, client concurrency=64, chunked prefill + prefix caching. 1× R9700.","engineFlags":{"commandSnippet":"/home/mikekey/.venvs/vllm/bin/vllm serve /home/mikekey/models/hf/Qwen2.5-7B   --tensor-parallel-size 1   --dtype bfloat16   --max-model-len 32768   --max-num-seqs 128   --max-num-batched-tokens 32768   --enable-chunked-prefill   --enable-prefix-caching   --gpu-memory-utilization 0.90   --port 8000   --served-model-name qwen2.5-7b   --trust-remote-code","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen2.5-7B","displayName":"Qwen2.5-7B","family":"Qwen","params":8,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Radeon AI Pro R9700","gpuCount":1,"vramGb":32,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":"Arch Linux","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.2+rocm723","quantization":"bf16","backend":"rocm"},"user":{"id":"cmorcd2jg0008l804hfkput08","username":"1337Hero","verified":false,"verifiedAt":null},"hardwareGroupLabel":"Radeon AI Pro R9700","hardwareGroupKey":"DISCRETE_GPU:radeon ai pro r9700","rank":2,"reactionCounts":{},"myEmoji":null},{"id":"cmp7f7rgi006co4019h144d9a","contextLength":32768,"batchSize":64,"ttftMs":275.98,"tokSOut":1399.209,"tokSPrefill":null,"tokSTotal":4388.071,"peakVramGb":null,"createdAt":"2026-05-15T21:19:48.162Z","notes":"Qwen2.5-7B BF16 vLLM ROCm source build. TP=1, max-seqs=128, batch-tokens=32768, client concurrency=64, chunked prefill + prefix caching. 1× R9700.","engineFlags":{"commandSnippet":"/home/mikekey/.venvs/vllm/bin/vllm serve /home/mikekey/models/hf/Qwen2.5-7B   --tensor-parallel-size 1   --dtype bfloat16   --max-model-len 32768   --max-num-seqs 128   --max-num-batched-tokens 32768   --enable-chunked-prefill   --enable-prefix-caching   --gpu-memory-utilization 0.90   --port 8000   --served-model-name qwen2.5-7b   --trust-remote-code","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen2.5-7B","displayName":"Qwen2.5-7B","family":"Qwen","params":8,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Radeon AI Pro R9700","gpuCount":1,"vramGb":32,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":"Arch Linux","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.2+rocm723","quantization":"bf16","backend":"rocm"},"user":{"id":"cmorcd2jg0008l804hfkput08","username":"1337Hero","verified":false,"verifiedAt":null},"hardwareGroupLabel":"Radeon AI Pro R9700","hardwareGroupKey":"DISCRETE_GPU:radeon ai pro r9700","rank":3,"reactionCounts":{},"myEmoji":null},{"id":"cmozt9exg0085lo01c1g3q2i7","contextLength":65536,"batchSize":16,"ttftMs":null,"tokSOut":1048,"tokSPrefill":null,"tokSTotal":1048,"peakVramGb":null,"createdAt":"2026-05-10T13:30:50.452Z","notes":"Concurrency=16 aggregate throughput, sweet-spot config. 16 parallel chat completions, 8 distinct prompts cycled with per-request seeding to defeat prefix caching, temp=0.7, max_tokens=600 (avg actual=564). Wall 8.62s, total 9,029 tokens, no queueing (max-num-seqs=16), p95 ≈ p50, 0 KV preemptions. Continuous batching only — Eagle3 spec-decode tested and dropped (27% accept on NVFP4 target → -29% throughput). Per-stream ~69 t/s under load vs 128 t/s single-stream. KV cache FP8. EPYC 32c/64t host, 160 GB DDR5. Run 2026-05-09.","engineFlags":{"commandSnippet":"vllm serve /opt/models/gemma-4-26B-A4B-it-NVFP4 --served-model-name gemma-4-26b-a4b-it --host 0.0.0.0 --port 8000 --max-model-len 65536 --gpu-memory-utilization 0.90 --max-num-seqs 16","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"nvidia/Gemma-4-26B-A4B-NVFP4","displayName":"Gemma-4-26B-A4B-NVFP4","family":"Gemma","params":14,"isMoE":false,"baseModel":{"hfId":"google/gemma-4-26B-A4B-it","displayName":"gemma-4-26B-A4B-it"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX PRO 6000 Blackwell","gpuCount":1,"vramGb":96,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1","quantization":"NVFP4 (modelopt_fp4 W4A4)","backend":null},"user":{"id":"cmowc9fer00qdp101b007eug5","username":"murdarch","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX PRO 6000 Blackwell","hardwareGroupKey":"DISCRETE_GPU:rtx pro 6000 blackwell","rank":4,"reactionCounts":{},"myEmoji":null},{"id":"cmot9gh4r000bib04esjf3r77","contextLength":32768,"batchSize":16,"ttftMs":175.4,"tokSOut":991.1,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":null,"createdAt":"2026-05-05T23:29:50.523Z","notes":"gpt-oss-20b MXFP4 concurrent throughput at batch=16 (matches --max-num-seqs 16). Aggregate output tok/s, best of 3 runs (991.1 / 986.8 / 990.6 -- variance under 0.5%). Per-request decode 64.8 tok/s. TTFT 175ms. This is vLLM's strength zone vs llama.cpp single-stream: ~6x throughput when serving multiple concurrent users. batch=1 single-stream was 48 tok/s (separate submission) -- same hardware, llama.cpp Q8_0 single-stream was 160 tok/s.","engineFlags":{"commandSnippet":"LD_LIBRARY_PATH=~/.local/lib/rccl-7.1.1:$LD_LIBRARY_PATH HIP_VISIBLE_DEVICES=0,1 VLLM_TARGET_DEVICE=rocm VLLM_ROCM_USE_AITER=0 FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE vllm serve openai/gpt-oss-20b --tensor-parallel-size 2 --dtype bfloat16 --max-model-len 32768 --max-num-seqs 16 --max-num-batched-tokens 4096 --enable-chunked-prefill --enable-prefix-caching --gpu-memory-utilization 0.90 --moe-backend triton --reasoning-parser openai_gptoss --tool-call-parser openai","tensorParallel":2,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"openai/gpt-oss-20b","displayName":"gpt-oss-20b","family":"Gpt","params":22,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Radeon AI Pro R9700","gpuCount":2,"vramGb":64,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen 9 5950X","os":"Arch Linux","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1+rocm721","quantization":"MXFP4_MOE","backend":"rocm"},"user":{"id":"cmorcd2jg0008l804hfkput08","username":"1337Hero","verified":false,"verifiedAt":null},"hardwareGroupLabel":"Radeon AI Pro R9700","hardwareGroupKey":"DISCRETE_GPU:radeon ai pro r9700","rank":5,"reactionCounts":{},"myEmoji":null},{"id":"cmojycegm0005jo04bvjn0bs6","contextLength":262144,"batchSize":10,"ttftMs":491,"tokSOut":878.38,"tokSPrefill":null,"tokSTotal":18412.81,"peakVramGb":535.8,"createdAt":"2026-04-29T11:08:49.078Z","notes":"Requires DeepGEMM installation after uv pip install vllm:\ngit clone --recursive https://github.com/deepseek-ai/DeepGEMM.git\ncd DeepGEMM\nDG_FORCE_BUILD=1 uv pip install --force-reinstall --no-build-isolation .\n\nBenchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 491ms, Mean: 855ms","engineFlags":{"commandSnippet":"vllm serve Qwen/Qwen3.5-122B-A10B-FP8 --served-model-name qwen3 --trust-remote-code --tensor-parallel-size 4 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --gpu-memory-utilization 0.95 --speculative-config {\"method\":\"qwen3_next_mtp\",\"num_speculative_tokens\":2}  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-122B-A10B-FP8","displayName":"Qwen3.5-122B-A10B-FP8","family":"Qwen","params":125,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-122B-A10B","displayName":"Qwen3.5-122B-A10B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 SXM","gpuCount":4,"vramGb":564,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev30+gf05f3664c","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null},"hardwareGroupLabel":"H200 SXM","hardwareGroupKey":"DISCRETE_GPU:h200 sxm","rank":6,"reactionCounts":{},"myEmoji":null},{"id":"cmpdzs3fq00s8pd01aobvkld4","contextLength":4096,"batchSize":1,"ttftMs":82.93,"tokSOut":576.85,"tokSPrefill":754.54,"tokSTotal":null,"peakVramGb":7.43,"createdAt":"2026-05-20T11:42:06.183Z","notes":"hipfire @ 4840f0b6 (master, post-sync 2026-05-20)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\ndecode mode: DFlash speculative\n  drafter: qwen35-9b-dflash-mq4.hf4 (z-lab Qwen3.5-DFlash native head, MQ4 weights)\n  target: qwen3.5-9b.mq4\nkv_cache: q8\nprompt_normalize: true (default)\nτ: 13.1818   accept_rate: 0.8788  (byte-identical to canonical bench)\noutput: 256 tokens emitted, coherent merge_sort code\nruns: 3 (median reported); per-run tok/s: [577.48, 572.66, 576.85], spread 0.84%\nAR baseline (same binary same hardware, --ar-baseline): 122.88 tok/s — DFlash speedup 4.70x\nHardware: Sapphire Nitro+ RX 7900 XTX, ROCm 7.2.x, k9lin host\nRefresh of existing leaderboard row 2 (575.24 tok/s on 0.1.20-alpha+71896daa) — parity with current master.","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo --target ~/.hipfire/models/qwen3.5-9b.mq4 --draft ~/.hipfire/models/qwen35-9b-dflash-mq4.hf4 --prompt-file benchmarks/prompts/merge_sort_thinking_off.txt --max 256 --temp 0.0 --no-chatml --kv-mode q8 --ctx 4096","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":"q8","attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-9B","displayName":"Qwen3.5-9B","family":"Qwen","params":9,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-9B-Base","displayName":"Qwen3.5-9B-Base"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RX 7900 XTX","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.2.0+4840f0b6","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RX 7900 XTX","hardwareGroupKey":"DISCRETE_GPU:rx 7900 xtx","rank":7,"reactionCounts":{},"myEmoji":null},{"id":"cmofzk74w0002ij042txvvkx5","contextLength":4096,"batchSize":1,"ttftMs":75.53,"tokSOut":575.25,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":8.07421875,"createdAt":"2026-04-26T16:31:47.744Z","notes":"hipfire @ e659452 (master post-PR #51 + #52 series)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\n  chatml-wrapped + explicit empty <think></think> for thinking-off\n  prompt file: benchmarks/prompts/merge_sort_thinking_off.txt\nruns: 3 (median reported); range 571.2–576.0\n  per-run tok/s: [571.23, 575.25, 576.04]\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\nτ (median): 13.077\naccept_rate (median): 0.872\nprefill: 31.1ms (868.3 tok/s)\nttft (excl warmup): 75.5ms = prefill + first cycle\nvram: 8268 MB used / 24560 MB total\nnatural EOS at 184 tokens — production-shape bounded code (no loop)","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo --target qwen3.5-9b.mq4 --draft qwen35-9b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/merge_sort_thinking_off.txt) --max 256 --no-chatml --kv-mode asym3","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-9B","displayName":"Qwen3.5-9B","family":"Qwen","params":9,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-9B-Base","displayName":"Qwen3.5-9B-Base"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RX 7900 XTX","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.1.8-alpha+e659452","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RX 7900 XTX","hardwareGroupKey":"DISCRETE_GPU:rx 7900 xtx","rank":8,"reactionCounts":{},"myEmoji":null},{"id":"cmp8fqdz200ypo4014i9empwm","contextLength":4096,"batchSize":1,"ttftMs":78.93,"tokSOut":575.24,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":7.95,"createdAt":"2026-05-16T14:22:03.326Z","notes":"hipfire @ 71896daa (feat/tbq, sFWHT KV family + multi-GPU)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\n  benchmarks/prompts/merge_sort_thinking_off.txt (27 input tokens)\ndecode mode: DFlash speculative (adaptive block_size 8..16, mean B=16)\n  drafter: qwen35-9b-dflash-mq4.hf4 (z-lab Qwen3.5-DFlash native head, MQ4 weights)\n  target: qwen3.5-9b.mq4 (MQ4 weights, KV q8 filtered: 16/64 layers carry KV)\nkv_cache: q8 — chosen as winner from {q8,asym3,fwht3,fwht4} sweep (τ identical across all modes at this prompt; q8 has lowest KV-write overhead)\nprompt_normalize: true (default since 2026-04-26)\nτ: 13.1818   accept_rate: 0.8788\noutput: 256 tokens emitted\nAR baseline (same binary same hardware, --ar-baseline): 123.11 tok/s — DFlash speedup 4.67x\nHardware: Sapphire Nitro+ RX 7900 XTX, ROCm 7.2.x, k9lin host\nGPU coordination: gpu-tcas exclusive lease per cell","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo --target ~/.hipfire/models/qwen3.5-9b.mq4 --draft ~/.hipfire/models/qwen35-9b-dflash-mq4.hf4 --prompt-file benchmarks/prompts/merge_sort_thinking_off.txt --max 256 --temp 0.0 --no-chatml --kv-mode q8 --ctx 4096","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-9B","displayName":"Qwen3.5-9B","family":"Qwen","params":9,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-9B-Base","displayName":"Qwen3.5-9B-Base"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RX 7900 XTX","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.1.20-alpha+71896daa","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RX 7900 XTX","hardwareGroupKey":"DISCRETE_GPU:rx 7900 xtx","rank":9,"reactionCounts":{},"myEmoji":null},{"id":"cmopmgspw0002jv041yjbb6sq","contextLength":8192,"batchSize":1,"ttftMs":113.4,"tokSOut":506.2,"tokSPrefill":null,"tokSTotal":504,"peakVramGb":33.6,"createdAt":"2026-05-03T10:22:55.845Z","notes":"Campaign champion — graph_partition + FAP compile (no max-autotune) on SM120 Blackwell. 2.14x over pre-compile baseline. BFCL 73.33% (22/30). DFlash speculative decoding N=4 + CUDA graphs + greedy (temp=0). 20/20 tool-call chain completed, 100% tool_call_valid_rate. Warmup=10 discarded runs before measurement.","engineFlags":{"commandSnippet":"vllm serve mmangkad/Qwen3.6-35B-A3B-NVFP4 --quantization nvfp4 --max-model-len 8192 --max-num-seqs 1 --gpu-memory-utilization 0.35 --kv-cache-dtype bfloat16 --tool-call-parser qwen3_coder --enable-auto-tool-choice --speculative-config {\"method\":\"dflash\",\"num_speculative_tokens\":4} --compilation-config {\"level\":2,\"inductor_compile_config\":{\"use_inductor_graph_partition\":true}}","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"bfloat16","attentionBackend":null,"flashAttn":true,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"mmangkad/Qwen3.6-35B-A3B-NVFP4","displayName":"Qwen3.6-35B-A3B-NVFP4","family":"Qwen","params":35,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.6-35B-A3B","displayName":"Qwen3.6-35B-A3B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX PRO 6000 Blackwell","gpuCount":1,"vramGb":96,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.19.x-pr40898-cu130","quantization":"NVFP4","backend":null},"user":{"id":"cmonzzbj40000jx04er7559mm","username":"zhu","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX PRO 6000 Blackwell","hardwareGroupKey":"DISCRETE_GPU:rtx pro 6000 blackwell","rank":10,"reactionCounts":{},"myEmoji":null},{"id":"cmojz7fb9000ljo04ia5qjriu","contextLength":200000,"batchSize":10,"ttftMs":368,"tokSOut":503.74,"tokSPrefill":null,"tokSTotal":9780.83,"peakVramGb":535.8,"createdAt":"2026-04-29T11:32:56.517Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 368ms, Mean: 626ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2.5 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 4 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.95  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2.5","displayName":"MiniMax-M2.5","family":"Minimax","params":229,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 SXM","gpuCount":4,"vramGb":564,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev30+gf05f3664c","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null},"hardwareGroupLabel":"H200 SXM","hardwareGroupKey":"DISCRETE_GPU:h200 sxm","rank":11,"reactionCounts":{},"myEmoji":null},{"id":"cmojyvep9000ejo0413hg8qg3","contextLength":200000,"batchSize":7,"ttftMs":417,"tokSOut":499.2,"tokSPrefill":null,"tokSTotal":9398.1,"peakVramGb":535.8,"createdAt":"2026-04-29T11:23:35.853Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 417ms, Mean: 587ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2.1 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 4 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.95  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2.1","displayName":"MiniMax-M2.1","family":"Minimax","params":229,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 SXM","gpuCount":4,"vramGb":564,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev30+gf05f3664c","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null},"hardwareGroupLabel":"H200 SXM","hardwareGroupKey":"DISCRETE_GPU:h200 sxm","rank":12,"reactionCounts":{},"myEmoji":null},{"id":"cmojzdzz3000jkw049rbli91e","contextLength":200000,"batchSize":10,"ttftMs":349,"tokSOut":495.88,"tokSPrefill":null,"tokSTotal":8389.38,"peakVramGb":535.8,"createdAt":"2026-04-29T11:38:03.232Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 349ms, Mean: 933ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2.7 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 4 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.95  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2.7","displayName":"MiniMax-M2.7","family":"Minimax","params":229,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 SXM","gpuCount":4,"vramGb":564,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev30+gf05f3664c","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null},"hardwareGroupLabel":"H200 SXM","hardwareGroupKey":"DISCRETE_GPU:h200 sxm","rank":13,"reactionCounts":{},"myEmoji":null},{"id":"cmojy2n700004ju04dvkn02xg","contextLength":200000,"batchSize":10,"ttftMs":390,"tokSOut":492.6,"tokSPrefill":null,"tokSTotal":9496.63,"peakVramGb":535.8,"createdAt":"2026-04-29T11:01:13.837Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 390ms, Mean: 718ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 4 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.95  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":4,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2","displayName":"MiniMax-M2","family":"Minimax","params":229,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 SXM","gpuCount":4,"vramGb":564,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev30+gf05f3664c","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null},"hardwareGroupLabel":"H200 SXM","hardwareGroupKey":"DISCRETE_GPU:h200 sxm","rank":14,"reactionCounts":{},"myEmoji":null},{"id":"cmpdzu8ig00sbpd01dsdrf3zr","contextLength":4096,"batchSize":1,"ttftMs":63.2,"tokSOut":371.8,"tokSPrefill":1160.07,"tokSTotal":null,"peakVramGb":7.65,"createdAt":"2026-05-20T11:43:46.072Z","notes":"hipfire @ 4840f0b6 (master post-sync 2026-05-20)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\n  benchmarks/prompts/merge_sort_thinking_off.txt (27 input tokens)\ndecode mode: DFlash speculative (adaptive block_size, mean B=16)\n  drafter: qwen35-9b-dflash-mq4.hfq (z-lab Qwen3.5-DFlash native head, MQ4 weights)\n  target: qwen3.5-9b.mq4 (MQ4 weights)\nkv_cache: q8\nprompt_normalize: true (default since 2026-04-26)\nτ: 13.1818   accept_rate: 0.8788\noutput: 256 tokens emitted\nAR baseline (same binary same hardware, --ar-baseline): 99.39 tok/s — DFlash speedup 3.74x\nHardware: single R9700, ROCm 7.x, hiptrx host (HIP_VISIBLE_DEVICES=0)\nGPU coordination: dedicated host (hiptrx), no concurrent GPU load","engineFlags":{"commandSnippet":"HIP_VISIBLE_DEVICES=0 ./target/release/examples/dflash_spec_demo --target ~/.hipfire/models/qwen3.5-9b.mq4 --draft ~/.hipfire/models/qwen35-9b-dflash-mq4.hfq --prompt-file benchmarks/prompts/merge_sort_thinking_off.txt --max 256 --temp 0.0 --no-chatml --kv-mode q8 --ctx 4096","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":"q8","attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-9B","displayName":"Qwen3.5-9B","family":"Qwen","params":9,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-9B-Base","displayName":"Qwen3.5-9B-Base"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Radeon AI Pro R9700","gpuCount":1,"vramGb":32,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"i5 8500","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.2.0+4840f0b6","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"Radeon AI Pro R9700","hardwareGroupKey":"DISCRETE_GPU:radeon ai pro r9700","rank":15,"reactionCounts":{},"myEmoji":null},{"id":"cmokh77p30001ju04nkvye3sd","contextLength":200000,"batchSize":10,"ttftMs":570,"tokSOut":338.89,"tokSPrefill":null,"tokSTotal":6002.7,"peakVramGb":278,"createdAt":"2026-04-29T19:56:39.736Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 570ms, Mean: 974ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2.7 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 2 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.98  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":2,"gpuLayers":4,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2.7","displayName":"MiniMax-M2.7","family":"Minimax","params":229,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 NVL","gpuCount":2,"vramGb":242,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9157F","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev73+g6f20f81cb","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null},"hardwareGroupLabel":"H200 NVL","hardwareGroupKey":"DISCRETE_GPU:h200 nvl","rank":16,"reactionCounts":{},"myEmoji":null},{"id":"cmofdp1ve0003i904tw2yf6bs","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":336.98,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":null,"createdAt":"2026-04-26T06:19:42.650Z","notes":"hipfire @ f16eceb (master post-perf-recovery, PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported)\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\nτ (median): 7.5\naccept_rate (median): 0.5\nruns all: [336.98, 343.32, 314.93]","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo --target qwen3.5-9b.mq4 --draft qwen35-9b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-9B","displayName":"Qwen3.5-9B","family":"Qwen","params":9,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-9B-Base","displayName":"Qwen3.5-9B-Base"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RX 7900 XTX","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.1.8-alpha+f16eceb","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RX 7900 XTX","hardwareGroupKey":"DISCRETE_GPU:rx 7900 xtx","rank":17,"reactionCounts":{},"myEmoji":null},{"id":"cmokgqlo0000ll504nndlpsel","contextLength":200000,"batchSize":10,"ttftMs":375,"tokSOut":333.93,"tokSPrefill":null,"tokSTotal":5521.66,"peakVramGb":278,"createdAt":"2026-04-29T19:43:44.689Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 375ms, Mean: 1038ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2.5 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 2 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.98  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":2,"gpuLayers":4,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2.5","displayName":"MiniMax-M2.5","family":"Minimax","params":229,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 NVL","gpuCount":2,"vramGb":242,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9157F","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev73+g6f20f81cb","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null},"hardwareGroupLabel":"H200 NVL","hardwareGroupKey":"DISCRETE_GPU:h200 nvl","rank":18,"reactionCounts":{},"myEmoji":null},{"id":"cmokggtts0001l504wy0im2sm","contextLength":200000,"batchSize":10,"ttftMs":642,"tokSOut":332.9,"tokSPrefill":null,"tokSTotal":5599.88,"peakVramGb":278,"createdAt":"2026-04-29T19:36:08.705Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 642ms, Mean: 1026ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2.1 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 2 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.98  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":2,"gpuLayers":4,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2.1","displayName":"MiniMax-M2.1","family":"Minimax","params":229,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 NVL","gpuCount":2,"vramGb":282,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9157F","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev73+g6f20f81cb","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null},"hardwareGroupLabel":"H200 NVL","hardwareGroupKey":"DISCRETE_GPU:h200 nvl","rank":19,"reactionCounts":{},"myEmoji":null},{"id":"cmp54e23i003jo301urnzxs61","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":325.190709,"tokSPrefill":17334.699786,"tokSTotal":932.178389,"peakVramGb":1.534,"createdAt":"2026-05-14T06:41:13.758Z","notes":"Corrected resubmission: actual llama.cpp commit is 320a6a44a5b1de6a074ba781e65f5fd79fb4051a (short 320a6a44a); earlier accepted record from this run used engineVersion=ce735c9. Automated llmfit run on clean upstream llama.cpp 320a6a44a with CUDA backend on NVIDIA RTX A5000. Mean of 2 measured /completion runs after one warmup: output tok/s=325.19, prefill tok/s=17334.70. Request used fixed prompt, n_predict=512, temperature=0, top_k=1, top_p=1, cache_prompt=false, one server slot, full GPU offload requested. Per-run output tok/s: 325.73, 324.65. ignore_eos=true. Local GGUF: gemma-3-1b-it-Q4_K_M.gguf.","engineFlags":{"commandSnippet":"/data/bt/os/llama.cpp/llama.cpp-bench-upstream/build-bench-cuda-gcc15/bin/llama-server -m /data/bt/models/hub/models--ggml-org--gemma-3-1b-it-GGUF/snapshots/f9c28bcd85737ffc5aef028638d3341d49869c27/gemma-3-1b-it-Q4_K_M.gguf -a gemma-3-1b-it-q4_k_m -ngl 999 -fa on -c 4096 -b 2048 -ub 512 -np 1 --no-context-shift --metrics --no-webui --host 127.0.0.1 --port 18103","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"fp16","attentionBackend":"flash_attn","flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"ggml-org/gemma-3-1b-it-GGUF","displayName":"gemma-3-1b-it-GGUF","family":"Gemma","params":1,"isMoE":false,"baseModel":{"hfId":"google/gemma-3-1b-it","displayName":"gemma-3-1b-it"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX A5000","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"12th Gen Intel(R) Core(TM) i9-12900K","os":"Linux-6.19.14-300.fc44.x86_64-x86_64-with-glibc2.43","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"320a6a44a","quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmp52z7op0011o301d6xkyyyj","username":"snnn","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX A5000","hardwareGroupKey":"DISCRETE_GPU:rtx a5000","rank":20,"reactionCounts":{},"myEmoji":null},{"id":"cmp5419yh002no30141vcrtfp","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":325.190709,"tokSPrefill":17334.699786,"tokSTotal":932.178389,"peakVramGb":1.534,"createdAt":"2026-05-14T06:31:17.417Z","notes":"Automated llmfit run on clean upstream llama.cpp ce735c9 with CUDA backend on NVIDIA RTX A5000. Mean of 2 measured /completion runs after one warmup: output tok/s=325.19, prefill tok/s=17334.70. Request used fixed prompt, n_predict=512, temperature=0, top_k=1, top_p=1, cache_prompt=false, one server slot, full GPU offload requested. Per-run output tok/s: 325.73, 324.65. ignore_eos=true. Local GGUF: gemma-3-1b-it-Q4_K_M.gguf.","engineFlags":{"commandSnippet":"/data/bt/os/llama.cpp/llama.cpp-bench-upstream/build-bench-cuda-gcc15/bin/llama-server -m /data/bt/models/hub/models--ggml-org--gemma-3-1b-it-GGUF/snapshots/f9c28bcd85737ffc5aef028638d3341d49869c27/gemma-3-1b-it-Q4_K_M.gguf -a gemma-3-1b-it-q4_k_m -ngl 999 -fa on -c 4096 -b 2048 -ub 512 -np 1 --no-context-shift --metrics --no-webui --host 127.0.0.1 --port 18103","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"fp16","attentionBackend":"flash_attn","flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"ggml-org/gemma-3-1b-it-GGUF","displayName":"gemma-3-1b-it-GGUF","family":"Gemma","params":1,"isMoE":false,"baseModel":{"hfId":"google/gemma-3-1b-it","displayName":"gemma-3-1b-it"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX A5000","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"12th Gen Intel(R) Core(TM) i9-12900K","os":"Linux-6.19.14-300.fc44.x86_64-x86_64-with-glibc2.43","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"ce735c9","quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmp52z7op0011o301d6xkyyyj","username":"snnn","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX A5000","hardwareGroupKey":"DISCRETE_GPU:rtx a5000","rank":21,"reactionCounts":{},"myEmoji":null},{"id":"cmofkl1gm0006l4045txzjtoc","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":322.6,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":null,"createdAt":"2026-04-26T09:32:32.806Z","notes":"hipfire @ f16eceb (master post-perf-recovery PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported); range 322.5–346.0\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\nτ (median): 7.067\naccept_rate (median): 0.471\nall runs tok/s: [346.0, 322.47, 322.6]","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo --target qwen3.5-9b.mq4 --draft qwen35-9b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-9B","displayName":"Qwen3.5-9B","family":"Qwen","params":9,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-9B-Base","displayName":"Qwen3.5-9B-Base"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RX 7900 XTX","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.1.8-alpha+f16eceb","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RX 7900 XTX","hardwareGroupKey":"DISCRETE_GPU:rx 7900 xtx","rank":22,"reactionCounts":{},"myEmoji":null},{"id":"cmp54a0p10039o301inqkqaa7","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":320.551451,"tokSPrefill":14121.748134,"tokSTotal":941.691716,"peakVramGb":1.567,"createdAt":"2026-05-14T06:38:05.317Z","notes":"Corrected resubmission: actual llama.cpp commit is 320a6a44a5b1de6a074ba781e65f5fd79fb4051a (short 320a6a44a); earlier accepted record from this run used engineVersion=ce735c9. Automated llmfit run on clean upstream llama.cpp 320a6a44a with CUDA backend on NVIDIA RTX A5000. Mean of 2 measured /completion runs after one warmup: output tok/s=320.55, prefill tok/s=14121.75. Request used fixed prompt, n_predict=512, temperature=0, top_k=1, top_p=1, cache_prompt=false, one server slot, full GPU offload requested. Per-run output tok/s: 321.12, 319.98. ignore_eos=true. Local GGUF: qwen2.5-1.5b-instruct-q4_k_m.gguf.","engineFlags":{"commandSnippet":"/data/bt/os/llama.cpp/llama.cpp-bench-upstream/build-bench-cuda-gcc15/bin/llama-server -m /data/bt/models/hub/models--Qwen--Qwen2.5-1.5B-Instruct-GGUF/snapshots/91cad51170dc346986eccefdc2dd33a9da36ead9/qwen2.5-1.5b-instruct-q4_k_m.gguf -a qwen2.5-1.5b-instruct-q4_k_m -ngl 999 -fa on -c 4096 -b 2048 -ub 512 -np 1 --no-context-shift --metrics --no-webui --host 127.0.0.1 --port 18100","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"fp16","attentionBackend":"flash_attn","flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen2.5-1.5B-Instruct-GGUF","displayName":"Qwen2.5-1.5B-Instruct-GGUF","family":"Qwen","params":2,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen2.5-1.5B-Instruct","displayName":"Qwen2.5-1.5B-Instruct"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX A5000","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"12th Gen Intel(R) Core(TM) i9-12900K","os":"Linux-6.19.14-300.fc44.x86_64-x86_64-with-glibc2.43","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"320a6a44a","quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmp52z7op0011o301d6xkyyyj","username":"snnn","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX A5000","hardwareGroupKey":"DISCRETE_GPU:rtx a5000","rank":23,"reactionCounts":{},"myEmoji":null},{"id":"cmp53x80o001zo301tn5kutld","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":320.551451,"tokSPrefill":14121.748134,"tokSTotal":941.691716,"peakVramGb":1.567,"createdAt":"2026-05-14T06:28:08.280Z","notes":"Automated llmfit run on clean upstream llama.cpp ce735c9 with CUDA backend on NVIDIA RTX A5000. Mean of 2 measured /completion runs after one warmup: output tok/s=320.55, prefill tok/s=14121.75. Request used fixed prompt, n_predict=512, temperature=0, top_k=1, top_p=1, cache_prompt=false, one server slot, full GPU offload requested. Per-run output tok/s: 321.12, 319.98. ignore_eos=true. Local GGUF: qwen2.5-1.5b-instruct-q4_k_m.gguf.","engineFlags":{"commandSnippet":"/data/bt/os/llama.cpp/llama.cpp-bench-upstream/build-bench-cuda-gcc15/bin/llama-server -m /data/bt/models/hub/models--Qwen--Qwen2.5-1.5B-Instruct-GGUF/snapshots/91cad51170dc346986eccefdc2dd33a9da36ead9/qwen2.5-1.5b-instruct-q4_k_m.gguf -a qwen2.5-1.5b-instruct-q4_k_m -ngl 999 -fa on -c 4096 -b 2048 -ub 512 -np 1 --no-context-shift --metrics --no-webui --host 127.0.0.1 --port 18100","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"fp16","attentionBackend":"flash_attn","flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen2.5-1.5B-Instruct-GGUF","displayName":"Qwen2.5-1.5B-Instruct-GGUF","family":"Qwen","params":2,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen2.5-1.5B-Instruct","displayName":"Qwen2.5-1.5B-Instruct"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX A5000","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"12th Gen Intel(R) Core(TM) i9-12900K","os":"Linux-6.19.14-300.fc44.x86_64-x86_64-with-glibc2.43","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"ce735c9","quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmp52z7op0011o301d6xkyyyj","username":"snnn","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX A5000","hardwareGroupKey":"DISCRETE_GPU:rtx a5000","rank":24,"reactionCounts":{},"myEmoji":null},{"id":"cmpqj1mlh00fwpl01xbp7uqfh","contextLength":128000,"batchSize":1,"ttftMs":144.2,"tokSOut":302.9,"tokSPrefill":null,"tokSTotal":279.09,"peakVramGb":null,"createdAt":"2026-05-29T06:14:37.734Z","notes":"Cold start (no KV cache). 512 output tokens.","engineFlags":{"commandSnippet":"llama-server -m LFM2.5-8B-A1B-Q8_0.gguf -ngl 999 -fa1 -c 0","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"LiquidAI/LFM2.5-8B-A1B-GGUF","displayName":"LFM2.5-8B-A1B-GGUF","family":"Llama","params":8,"isMoE":false,"baseModel":{"hfId":"LiquidAI/LFM2.5-8B-A1B","displayName":"LFM2.5-8B-A1B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 3090","gpuCount":2,"vramGb":48,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen","os":"Ubuntu","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":null,"quantization":"Q8_0","backend":null},"user":{"id":"cmo3v1l6p0000jk04cplvn9g5","username":"Lottolabs","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX 3090","hardwareGroupKey":"DISCRETE_GPU:rtx 3090","rank":25,"reactionCounts":{},"myEmoji":null},{"id":"cmokg0wad0007l5040k9hch2p","contextLength":200000,"batchSize":10,"ttftMs":727,"tokSOut":302.51,"tokSPrefill":null,"tokSTotal":6492.21,"peakVramGb":278,"createdAt":"2026-04-29T19:23:45.398Z","notes":"Benchmark script:\nhttps://github.com/keennay/gpu-cluster-setup/blob/master/benchmark.py\n\nBenchmark Script Command:\npython benchmark.py --concurrency 10 --num-prompts 100 --label <label> --model <model>\n\nTTFT Results:\nMedian: 727ms, Mean: 1621ms","engineFlags":{"commandSnippet":"env SAFETENSORS_FAST_GPU=1 vllm serve MiniMaxAI/MiniMax-M2 --served-model-name minimax_m2 --trust-remote-code --tensor-parallel-size 2 --reasoning-parser minimax_m2_append_think --enable-auto-tool-choice --tool-call-parser minimax_m2 --gpu-memory-utilization 0.98  --no-enable-prefix-caching --host 0.0.0.0 --port 8000","tensorParallel":2,"gpuLayers":4,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"MiniMaxAI/MiniMax-M2","displayName":"MiniMax-M2","family":"Minimax","params":229,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 NVL","gpuCount":2,"vramGb":282,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9157F","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.1rc1.dev73+g6f20f81cb","quantization":"FP8","backend":"cuda"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null},"hardwareGroupLabel":"H200 NVL","hardwareGroupKey":"DISCRETE_GPU:h200 nvl","rank":26,"reactionCounts":{},"myEmoji":null},{"id":"cmpqj1qrv00fzpl01z9ztgyuf","contextLength":128000,"batchSize":1,"ttftMs":251.4,"tokSOut":293.15,"tokSPrefill":null,"tokSTotal":256.26,"peakVramGb":null,"createdAt":"2026-05-29T06:14:43.147Z","notes":"10,000 tokens in KV cache. 512 output tokens.","engineFlags":{"commandSnippet":"llama-server -m LFM2.5-8B-A1B-Q8_0.gguf -ngl 999 -fa1 -c 0","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"LiquidAI/LFM2.5-8B-A1B-GGUF","displayName":"LFM2.5-8B-A1B-GGUF","family":"Llama","params":8,"isMoE":false,"baseModel":{"hfId":"LiquidAI/LFM2.5-8B-A1B","displayName":"LFM2.5-8B-A1B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 3090","gpuCount":2,"vramGb":48,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen","os":"Ubuntu","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":null,"quantization":"Q8_0","backend":null},"user":{"id":"cmo3v1l6p0000jk04cplvn9g5","username":"Lottolabs","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX 3090","hardwareGroupKey":"DISCRETE_GPU:rtx 3090","rank":27,"reactionCounts":{},"myEmoji":null},{"id":"cmppftgt8003ipl016cm2gqpb","contextLength":4096,"batchSize":1,"ttftMs":110.63,"tokSOut":286.64,"tokSPrefill":435.71,"tokSTotal":null,"peakVramGb":17.505859375,"createdAt":"2026-05-28T11:56:31.964Z","notes":"Qwen3.5-27B dense AWQ DFlash merge-sort bench on hiptrx.\nTarget path: /home/kaden/.hipfire/models/qwen3-27b-3.5.mq4-awq\nTarget md5: e1c9480a3fa54ad0a0f25ca18105510a\nTarget sha256: ea615949ddf6a180eee03ff6fde39f7e51148f153b1b05f82258b9953088576e\nDraft path: /home/kaden/.hipfire/models/qwen35-27b-dflash-mq4.hfq\nDraft md5: 7b6df2a4ee1c8d933f0a52e187d1860b\nDraft sha256: 3d428b97c1911a9ad815cc52fbee080306852c1dafad6b1b17bb70bd68010301\nPrompt: benchmarks/prompts/merge_sort_thinking_off.txt\nPrompt md5: 253c7ac50857fe6d0e10fb0d2c5e35c0\nBinary md5: a86f47db76bf9c25360e95422768e49f\nBinary sha256: a6047b9e23e3d37face8492edd0a7833383708f63b308425175ded325661d2a2\nGit: feat/dense-dflash-perfmaxx 3730b58bd3b5380eb1de672ec032b24016905458\n3 fresh-process runs tok/s: 286.64, 286.98, 286.61; median 286.64\nPer-run metrics: all runs tau=13.0000, accept_rate=0.8667, cycles=11, committed=165, accepted=143, output_tokens=155, eos=y, output_md5=f8ae27b7925cfde51ba446eaab547b2e\nMedian auxiliary metrics: ttft_ms=110.63, prefill_tok_s=435.71, peak_vram_gb=17.51\nHardware: hiptrx, AMD Radeon AI PRO R9700, gfx1201, GPU dev 0, 34.2 GB VRAM, HIP 7.2, ROCm driver 7.0.0-15-generic; host reports 4x R9700 gfx1201 but benchmark uses one GPU\nCoherence/eyeball: decoded output is fluent Python merge_sort code and terminates with <|im_end|>. No AR baseline rerun for this publish.","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo \\\n  --target /home/kaden/.hipfire/models/qwen3-27b-3.5.mq4-awq \\\n  --draft /home/kaden/.hipfire/models/qwen35-27b-dflash-mq4.hfq \\\n  --prompt-file benchmarks/prompts/merge_sort_thinking_off.txt \\\n  --max 256 --temp 0.0 --no-chatml --kv-mode q8 --ctx 4096","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":"q8","attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-27B","displayName":"Qwen3.5-27B","family":"Qwen","params":28,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Radeon AI Pro R9700","gpuCount":1,"vramGb":34.2,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":"Ubuntu Linux 7.0.0-15-generic on hiptrx","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.2.0+3730b58","quantization":"MQ4-AWQ","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"Radeon AI Pro R9700","hardwareGroupKey":"DISCRETE_GPU:radeon ai pro r9700","rank":28,"reactionCounts":{},"myEmoji":null},{"id":"cmoqakz2k0004kw04w7793b7v","contextLength":800,"batchSize":1,"ttftMs":200,"tokSOut":285.6,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":null,"createdAt":"2026-05-03T21:38:01.484Z","notes":null,"engineFlags":{"commandSnippet":"ollama run nemotron3:33b --verbose","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16","displayName":"NVIDIA-Nemotron-3-Nano-30B-A3B-BF16","family":null,"params":32,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 5090","gpuCount":1,"vramGb":32,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"Ryzen","os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"ollama","engineVersion":null,"quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmoomjrty001fl504f570ylcv","username":"Kylej572","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX 5090","hardwareGroupKey":"DISCRETE_GPU:rtx 5090","rank":29,"reactionCounts":{},"myEmoji":null},{"id":"cmp54fejg003mo301r2zpuyam","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":281.895677,"tokSPrefill":17924.005145,"tokSTotal":812.76154,"peakVramGb":1.778,"createdAt":"2026-05-14T06:42:16.540Z","notes":"Corrected resubmission: actual llama.cpp commit is 320a6a44a5b1de6a074ba781e65f5fd79fb4051a (short 320a6a44a); earlier accepted record from this run used engineVersion=ce735c9. Automated llmfit run on clean upstream llama.cpp 320a6a44a with CUDA backend on NVIDIA RTX A5000. Mean of 2 measured /completion runs after one warmup: output tok/s=281.90, prefill tok/s=17924.01. Request used fixed prompt, n_predict=512, temperature=0, top_k=1, top_p=1, cache_prompt=false, one server slot, full GPU offload requested. Per-run output tok/s: 282.00, 281.79. ignore_eos=true. Local GGUF: gemma-3-1b-it-Q8_0.gguf.","engineFlags":{"commandSnippet":"/data/bt/os/llama.cpp/llama.cpp-bench-upstream/build-bench-cuda-gcc15/bin/llama-server -m /data/bt/models/hub/models--ggml-org--gemma-3-1b-it-GGUF/snapshots/f9c28bcd85737ffc5aef028638d3341d49869c27/gemma-3-1b-it-Q8_0.gguf -a gemma-3-1b-it-q8_0 -ngl 999 -fa on -c 4096 -b 2048 -ub 512 -np 1 --no-context-shift --metrics --no-webui --host 127.0.0.1 --port 18104","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"fp16","attentionBackend":"flash_attn","flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"ggml-org/gemma-3-1b-it-GGUF","displayName":"gemma-3-1b-it-GGUF","family":"Gemma","params":1,"isMoE":false,"baseModel":{"hfId":"google/gemma-3-1b-it","displayName":"gemma-3-1b-it"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX A5000","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"12th Gen Intel(R) Core(TM) i9-12900K","os":"Linux-6.19.14-300.fc44.x86_64-x86_64-with-glibc2.43","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"320a6a44a","quantization":"Q8_0","backend":"cuda"},"user":{"id":"cmp52z7op0011o301d6xkyyyj","username":"snnn","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX A5000","hardwareGroupKey":"DISCRETE_GPU:rtx a5000","rank":30,"reactionCounts":{},"myEmoji":null},{"id":"cmp542mft002so301jxkarssw","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":281.895677,"tokSPrefill":17924.005145,"tokSTotal":812.76154,"peakVramGb":1.778,"createdAt":"2026-05-14T06:32:20.250Z","notes":"Automated llmfit run on clean upstream llama.cpp ce735c9 with CUDA backend on NVIDIA RTX A5000. Mean of 2 measured /completion runs after one warmup: output tok/s=281.90, prefill tok/s=17924.01. Request used fixed prompt, n_predict=512, temperature=0, top_k=1, top_p=1, cache_prompt=false, one server slot, full GPU offload requested. Per-run output tok/s: 282.00, 281.79. ignore_eos=true. Local GGUF: gemma-3-1b-it-Q8_0.gguf.","engineFlags":{"commandSnippet":"/data/bt/os/llama.cpp/llama.cpp-bench-upstream/build-bench-cuda-gcc15/bin/llama-server -m /data/bt/models/hub/models--ggml-org--gemma-3-1b-it-GGUF/snapshots/f9c28bcd85737ffc5aef028638d3341d49869c27/gemma-3-1b-it-Q8_0.gguf -a gemma-3-1b-it-q8_0 -ngl 999 -fa on -c 4096 -b 2048 -ub 512 -np 1 --no-context-shift --metrics --no-webui --host 127.0.0.1 --port 18104","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"fp16","attentionBackend":"flash_attn","flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"ggml-org/gemma-3-1b-it-GGUF","displayName":"gemma-3-1b-it-GGUF","family":"Gemma","params":1,"isMoE":false,"baseModel":{"hfId":"google/gemma-3-1b-it","displayName":"gemma-3-1b-it"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX A5000","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"12th Gen Intel(R) Core(TM) i9-12900K","os":"Linux-6.19.14-300.fc44.x86_64-x86_64-with-glibc2.43","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"ce735c9","quantization":"Q8_0","backend":"cuda"},"user":{"id":"cmp52z7op0011o301d6xkyyyj","username":"snnn","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX A5000","hardwareGroupKey":"DISCRETE_GPU:rtx a5000","rank":31,"reactionCounts":{},"myEmoji":null},{"id":"cmp54bd4h003do301idc0252y","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":267.990554,"tokSPrefill":14588.058091,"tokSTotal":794.097229,"peakVramGb":2.183,"createdAt":"2026-05-14T06:39:08.081Z","notes":"Corrected resubmission: actual llama.cpp commit is 320a6a44a5b1de6a074ba781e65f5fd79fb4051a (short 320a6a44a); earlier accepted record from this run used engineVersion=ce735c9. Automated llmfit run on clean upstream llama.cpp 320a6a44a with CUDA backend on NVIDIA RTX A5000. Mean of 2 measured /completion runs after one warmup: output tok/s=267.99, prefill tok/s=14588.06. Request used fixed prompt, n_predict=512, temperature=0, top_k=1, top_p=1, cache_prompt=false, one server slot, full GPU offload requested. Per-run output tok/s: 268.37, 267.61. ignore_eos=true. Local GGUF: qwen2.5-1.5b-instruct-q8_0.gguf.","engineFlags":{"commandSnippet":"/data/bt/os/llama.cpp/llama.cpp-bench-upstream/build-bench-cuda-gcc15/bin/llama-server -m /data/bt/models/hub/models--Qwen--Qwen2.5-1.5B-Instruct-GGUF/snapshots/91cad51170dc346986eccefdc2dd33a9da36ead9/qwen2.5-1.5b-instruct-q8_0.gguf -a qwen2.5-1.5b-instruct-q8_0 -ngl 999 -fa on -c 4096 -b 2048 -ub 512 -np 1 --no-context-shift --metrics --no-webui --host 127.0.0.1 --port 18101","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"fp16","attentionBackend":"flash_attn","flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen2.5-1.5B-Instruct-GGUF","displayName":"Qwen2.5-1.5B-Instruct-GGUF","family":"Qwen","params":2,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen2.5-1.5B-Instruct","displayName":"Qwen2.5-1.5B-Instruct"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX A5000","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"12th Gen Intel(R) Core(TM) i9-12900K","os":"Linux-6.19.14-300.fc44.x86_64-x86_64-with-glibc2.43","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"320a6a44a","quantization":"Q8_0","backend":"cuda"},"user":{"id":"cmp52z7op0011o301d6xkyyyj","username":"snnn","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX A5000","hardwareGroupKey":"DISCRETE_GPU:rtx a5000","rank":32,"reactionCounts":{},"myEmoji":null},{"id":"cmp53yket0025o301xddeudvl","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":267.990554,"tokSPrefill":14588.058091,"tokSTotal":794.097229,"peakVramGb":2.183,"createdAt":"2026-05-14T06:29:10.997Z","notes":"Automated llmfit run on clean upstream llama.cpp ce735c9 with CUDA backend on NVIDIA RTX A5000. Mean of 2 measured /completion runs after one warmup: output tok/s=267.99, prefill tok/s=14588.06. Request used fixed prompt, n_predict=512, temperature=0, top_k=1, top_p=1, cache_prompt=false, one server slot, full GPU offload requested. Per-run output tok/s: 268.37, 267.61. ignore_eos=true. Local GGUF: qwen2.5-1.5b-instruct-q8_0.gguf.","engineFlags":{"commandSnippet":"/data/bt/os/llama.cpp/llama.cpp-bench-upstream/build-bench-cuda-gcc15/bin/llama-server -m /data/bt/models/hub/models--Qwen--Qwen2.5-1.5B-Instruct-GGUF/snapshots/91cad51170dc346986eccefdc2dd33a9da36ead9/qwen2.5-1.5b-instruct-q8_0.gguf -a qwen2.5-1.5b-instruct-q8_0 -ngl 999 -fa on -c 4096 -b 2048 -ub 512 -np 1 --no-context-shift --metrics --no-webui --host 127.0.0.1 --port 18101","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"fp16","attentionBackend":"flash_attn","flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen2.5-1.5B-Instruct-GGUF","displayName":"Qwen2.5-1.5B-Instruct-GGUF","family":"Qwen","params":2,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen2.5-1.5B-Instruct","displayName":"Qwen2.5-1.5B-Instruct"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX A5000","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"12th Gen Intel(R) Core(TM) i9-12900K","os":"Linux-6.19.14-300.fc44.x86_64-x86_64-with-glibc2.43","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"ce735c9","quantization":"Q8_0","backend":"cuda"},"user":{"id":"cmp52z7op0011o301d6xkyyyj","username":"snnn","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX A5000","hardwareGroupKey":"DISCRETE_GPU:rtx a5000","rank":33,"reactionCounts":{},"myEmoji":null},{"id":"cmoerk01e0008l4045wzrvvyu","contextLength":256000,"batchSize":10,"ttftMs":1091,"tokSOut":261.57,"tokSPrefill":null,"tokSTotal":4720.17,"peakVramGb":129.7,"createdAt":"2026-04-25T19:59:55.442Z","notes":"Median TTFT: 2281ms, Mean TTFT: 1091ms, NVFP4 GEMM=Marlin","engineFlags":{"commandSnippet":"vllm serve <NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4-Path> \\\n--async-scheduling \\\n--served-model-name nemotron-3-super-nvfp4 \\\n--dtype auto \\\n--kv-cache-dtype fp8 \\\n--tensor-parallel-size 1 \\\n--pipeline-parallel-size 1 \\\n--data-parallel-size 1 \\\n--trust-remote-code \\\n--attention-backend TRITON_ATTN \\\n--gpu-memory-utilization 0.9 \\\n--enable-chunked-prefill \\\n--max-num-seqs 512 \\\n--host 0.0.0.0 \\\n--port 8000 \\\n--enable-auto-tool-choice \\\n--tool-call-parser qwen3_coder \\\n--reasoning-parser-plugin <NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4-Path>/super_v3_reasoning_parser.py \\\n--reasoning-parser super_v3","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":"fp8","attentionBackend":"TRITON_ATTN","flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4","displayName":"NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4","family":"Opt","params":67,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"H200 NVL","gpuCount":1,"vramGb":141,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9175F","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vLLM","engineVersion":"0.18.1","quantization":"NVFP4","backend":"CUDA"},"user":{"id":"cmoeptucv000ajr042kk6r3i3","username":"keennay","verified":false,"verifiedAt":null},"hardwareGroupLabel":"H200 NVL","hardwareGroupKey":"DISCRETE_GPU:h200 nvl","rank":34,"reactionCounts":{},"myEmoji":null},{"id":"cmppmb47l0051pl01lyaup9mc","contextLength":4096,"batchSize":1,"ttftMs":77,"tokSOut":259.53,"tokSPrefill":1093.7,"tokSTotal":null,"peakVramGb":27.8179,"createdAt":"2026-05-28T14:58:13.137Z","notes":"A3B MoE MTP crown dry run on hiptrx. Prompt is HumanEval/26 remove_duplicates from /home/kaden/lucebox-hub/dflash/eval/humaneval_plus/humanevalplus.jsonl.\nTarget path: /home/kaden/.hipfire/models/qwen3.6-35b-a3b.mq4-awq-mi300x\nTarget md5: edde51ec1dac0f2bd42cff5ef1cb8944\nTarget sha256: 1dc1c7964de415e0040a540a4300b9518e11b00c13d99c23f576f2b9fe1e8bca\nMTP head path: /home/kaden/.hipfire/models/qwen3.6-35b-a3b.moe-mtp-mq4-cvs16384.mtp\nMTP head md5: 51076bfb5b489832f2f6c4191b9799b0\nMTP head sha256: 1e11a06d1946e1e5711d6692894e917a61d5f360d4f3508c8372d49e97c912c1\nPrompt file: .codeinsight+research/dense-dflash-perfmaxx/a3b-mtp-humaneval26-crown-validate-20260528-144250/prompt.txt\nPrompt raw file md5: 90a10f2b41763aacc80a3e4db5e0bb62\nPrompt normalized md5 reported by mtp_only_demo: e04fc67ed919bec41bfd7426a0b75d58\nPrompt sha256: 7823eea9be9599563c786fa16e792f3da2482016607d75ee06ca40b2d33c7dca\nBinary md5: 8c148158f2edd7e03bc78b2b76b73242\nBinary sha256: 2097a2b651af12e20ef7a30a932051ad4f5a30f0fc0fc5e33a975cdf48e5ea57\nGit: feat/dense-dflash-perfmaxx 3730b58bd3b5380eb1de672ec032b24016905458\n5 fresh-process runs tok/s: 258.64, 259.77, 259.34, 260.06, 259.53; median 259.53\nPer-run metrics: all runs tau=5.1250, cycles=8, committed=42, accepted_mtp=33, bonus=7, replay_skipped=5/8 cycles, eos=y, decoded_output_md5=5a1ba0994ef05da65ecfb9025f51743a\nMedian auxiliary metrics: ttft_ms=77.0, prefill_tok_s=1093.70, decode_secs=0.162, peak_vram_gb=27.8179\nHardware: hiptrx, AMD Radeon AI PRO R9700, gfx1201, GPU dev 0, 34.2 GB VRAM, HIP 7.2, ROCm driver 7.0.0-15-generic; host reports 4x R9700 gfx1201 but benchmark uses one GPU\nCoherence/eyeball: decoded output is a coherent remove_duplicates Python body and terminates with <|endoftext|>. No AR baseline rerun for this dry run.\nPositioning: 259.53 tok/s exceeds the current noted 254 tok/s first-place threshold for Qwen3.6-35B-A3B.","engineFlags":{"commandSnippet":"./target/release/examples/mtp_only_demo \\\n  --target /home/kaden/.hipfire/models/qwen3.6-35b-a3b.mq4-awq-mi300x \\\n  --mtp-head /home/kaden/.hipfire/models/qwen3.6-35b-a3b.moe-mtp-mq4-cvs16384.mtp \\\n  --prompt-file .codeinsight+research/dense-dflash-perfmaxx/a3b-mtp-humaneval26-crown-validate-20260528-144250/prompt.txt \\\n  --max-n 5 --trunk-spine --kv-mode q8 --temp 0.0 --max 256 --no-chatml","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":"q8","attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":true},"model":{"hfId":"Qwen/Qwen3.6-35B-A3B","displayName":"Qwen3.6-35B-A3B","family":"Qwen","params":36,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Radeon AI Pro R9700","gpuCount":1,"vramGb":34.2,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":"Ubuntu Linux 7.0.0-15-generic on hiptrx","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.2.0+3730b58","quantization":"MQ4-AWQ","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"Radeon AI Pro R9700","hardwareGroupKey":"DISCRETE_GPU:radeon ai pro r9700","rank":35,"reactionCounts":{},"myEmoji":null},{"id":"cmpqj1v7400g2pl01n5xovbt7","contextLength":128000,"batchSize":1,"ttftMs":382,"tokSOut":259.45,"tokSPrefill":null,"tokSTotal":189.59,"peakVramGb":null,"createdAt":"2026-05-29T06:14:48.880Z","notes":"50,000 tokens in KV cache. 269 output tokens.","engineFlags":{"commandSnippet":"llama-server -m LFM2.5-8B-A1B-Q8_0.gguf -ngl 999 -fa1 -c 0","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"LiquidAI/LFM2.5-8B-A1B-GGUF","displayName":"LFM2.5-8B-A1B-GGUF","family":"Llama","params":8,"isMoE":false,"baseModel":{"hfId":"LiquidAI/LFM2.5-8B-A1B","displayName":"LFM2.5-8B-A1B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 3090","gpuCount":2,"vramGb":48,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen","os":"Ubuntu","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":null,"quantization":"Q8_0","backend":null},"user":{"id":"cmo3v1l6p0000jk04cplvn9g5","username":"Lottolabs","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX 3090","hardwareGroupKey":"DISCRETE_GPU:rtx 3090","rank":36,"reactionCounts":{},"myEmoji":null},{"id":"cmpdzx93g00sppd01d7e6lzvm","contextLength":4096,"batchSize":1,"ttftMs":123.68,"tokSOut":255.83,"tokSPrefill":405.91,"tokSTotal":null,"peakVramGb":7.73,"createdAt":"2026-05-20T11:46:06.796Z","notes":"hipfire @ 1a378379 (master 4840f0b6 + 3 hipx-local ROCm 7.2.x compiler shims rebased)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\ndecode mode: DFlash speculative\n  drafter: qwen35-9b-dflash-mq4.hfq\n  target: qwen3.5-9b.mq4\nkv_cache: q8\nprompt_normalize: true (default)\nτ: 13.1818   accept_rate: 0.8788\noutput: 256 tokens emitted, coherent merge_sort code\nruns: 3 (median reported); per-run tok/s: [255.83, 255.13, 256.32], spread 0.47%\nAR baseline (same binary same hardware, --ar-baseline): 45.65 tok/s — DFlash speedup 5.60x\nHardware: AMD RYZEN AI MAX+ 395 APU (Strix Halo), ROCm 7.2.x, hipx host (HIP_VISIBLE_DEVICES=1)\nFirst DFlash row for gfx1151 on localmaxxing.","engineFlags":{"commandSnippet":"HIP_VISIBLE_DEVICES=1 ./target/release/examples/dflash_spec_demo --target ~/.hipfire/models/qwen3.5-9b.mq4 --draft ~/.hipfire/models/qwen35-9b-dflash-mq4.hfq --prompt-file benchmarks/prompts/merge_sort_thinking_off.txt --max 256 --temp 0.0 --no-chatml --kv-mode q8 --ctx 4096","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":"q8","attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-9B","displayName":"Qwen3.5-9B","family":"Qwen","params":9,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-9B-Base","displayName":"Qwen3.5-9B-Base"}},"hardware":{"hwClass":"UNIFIED","gpuName":null,"gpuCount":1,"vramGb":null,"chipVendor":"AMD","chipFamily":"Strix Halo","chipVariant":"Radeon 8060S Graphics","unifiedMemoryGb":103,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.2.0+1a378379","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"Strix Halo Radeon 8060S","hardwareGroupKey":"UNIFIED:strix halo radeon 8060s","rank":37,"reactionCounts":{},"myEmoji":null},{"id":"cmppfddi3002xpl01z00xj2yk","contextLength":4096,"batchSize":1,"ttftMs":110.87,"tokSOut":254.83,"tokSPrefill":435.19,"tokSTotal":null,"peakVramGb":17.505859375,"createdAt":"2026-05-28T11:44:01.179Z","notes":"Corrected AWQ trunk; not the dev lmhead-a100 .hfq.\nTarget path used for command: /home/kaden/.hipfire/models/qwen3.6-27b-awq.mq4\nTarget md5: e42a489edf0be1b144307f2869b73b93\nTarget sha256 / HF x-linked-etag: 86a5f80fd29d545abb1093dead242725ced6d68b8607c6d566d897b1a82442dc\nHF source: schuttdev/hipfire-qwen3.6-27b qwen3.6-27b.mq4 @ f9b326a657f14cbc400e384ff84a4b9b4b726ba2\nDraft path: /home/kaden/.hipfire/models/qwen36-27b-dflash-mq4.hf4\nDraft md5: 204c4c4ceab30cb9ebc118fa9d59a446\nDraft sha256: bd8c4f07ae80fe1385bf2606af9a7ba0daa18ca8daec50916f2a489054c44e70\nPrompt: benchmarks/prompts/merge_sort_thinking_off.txt\nPrompt md5: 253c7ac50857fe6d0e10fb0d2c5e35c0\nBinary md5: a86f47db76bf9c25360e95422768e49f\nBinary sha256: a6047b9e23e3d37face8492edd0a7833383708f63b308425175ded325661d2a2\nGit: feat/dense-dflash-perfmaxx 3730b58bd3b5380eb1de672ec032b24016905458\n3 fresh-process runs tok/s: 254.83, 255.10, 253.97; median 254.83\nPer-run metrics: all runs tau=11.3846, accept_rate=0.7590, cycles=13, committed=174, accepted=148, output_tokens=162, eos=y, output_md5=4d3a55e4d1daff05ee0265b3055325bd\nMedian auxiliary metrics: ttft_ms=110.87, prefill_tok_s=435.19, peak_vram_gb=17.51\nHardware: hiptrx, AMD Radeon AI PRO R9700, gfx1201, GPU dev 0, 34.2 GB VRAM, HIP 7.2, ROCm driver 7.0.0-15-generic; host reports 4x R9700 gfx1201 but benchmark uses one GPU\nCoherence/eyeball: decoded output is fluent Python merge_sort code and terminates with <|im_end|> then <|endoftext|>. No AR baseline rerun for this publish.","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo \\\n  --target /home/kaden/.hipfire/models/qwen3.6-27b-awq.mq4 \\\n  --draft /home/kaden/.hipfire/models/qwen36-27b-dflash-mq4.hf4 \\\n  --prompt-file benchmarks/prompts/merge_sort_thinking_off.txt \\\n  --max 256 --temp 0.0 --no-chatml --kv-mode q8 --ctx 4096","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":"q8","attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.6-27B","displayName":"Qwen3.6-27B","family":"Qwen","params":28,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"Radeon AI Pro R9700","gpuCount":1,"vramGb":34.2,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":"Ubuntu Linux 7.0.0-15-generic on hiptrx","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.2.0+3730b58","quantization":"MQ4-AWQ","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"Radeon AI Pro R9700","hardwareGroupKey":"DISCRETE_GPU:radeon ai pro r9700","rank":38,"reactionCounts":{},"myEmoji":null},{"id":"cmp54cply003go3010534e7vo","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":254.404549,"tokSPrefill":10008.819467,"tokSTotal":735.459411,"peakVramGb":2.036,"createdAt":"2026-05-14T06:40:10.918Z","notes":"Corrected resubmission: actual llama.cpp commit is 320a6a44a5b1de6a074ba781e65f5fd79fb4051a (short 320a6a44a); earlier accepted record from this run used engineVersion=ce735c9. Automated llmfit run on clean upstream llama.cpp 320a6a44a with CUDA backend on NVIDIA RTX A5000. Mean of 2 measured /completion runs after one warmup: output tok/s=254.40, prefill tok/s=10008.82. Request used fixed prompt, n_predict=512, temperature=0, top_k=1, top_p=1, cache_prompt=false, one server slot, full GPU offload requested. Per-run output tok/s: 254.78, 254.03. ignore_eos=true. Local GGUF: Qwen_Qwen3.5-2B-Q4_K_M.gguf.","engineFlags":{"commandSnippet":"/data/bt/os/llama.cpp/llama.cpp-bench-upstream/build-bench-cuda-gcc15/bin/llama-server -m /data/bt/models/hub/models--bartowski--Qwen_Qwen3.5-2B-GGUF/snapshots/6521bcb22761828aa55639d1c814a207234c3e70/Qwen_Qwen3.5-2B-Q4_K_M.gguf -a qwen3.5-2b-q4_k_m -ngl 999 -fa on -c 4096 -b 2048 -ub 512 -np 1 --no-context-shift --metrics --no-webui --host 127.0.0.1 --port 18102","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"fp16","attentionBackend":"flash_attn","flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"bartowski/Qwen_Qwen3.5-2B-GGUF","displayName":"Qwen_Qwen3.5-2B-GGUF","family":"Qwen","params":2,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-2B","displayName":"Qwen3.5-2B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX A5000","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"12th Gen Intel(R) Core(TM) i9-12900K","os":"Linux-6.19.14-300.fc44.x86_64-x86_64-with-glibc2.43","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"320a6a44a","quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmp52z7op0011o301d6xkyyyj","username":"snnn","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX A5000","hardwareGroupKey":"DISCRETE_GPU:rtx a5000","rank":39,"reactionCounts":{},"myEmoji":null},{"id":"cmp53zx5e002eo301f8dvif56","contextLength":4096,"batchSize":1,"ttftMs":null,"tokSOut":254.404549,"tokSPrefill":10008.819467,"tokSTotal":735.459411,"peakVramGb":2.036,"createdAt":"2026-05-14T06:30:14.163Z","notes":"Automated llmfit run on clean upstream llama.cpp ce735c9 with CUDA backend on NVIDIA RTX A5000. Mean of 2 measured /completion runs after one warmup: output tok/s=254.40, prefill tok/s=10008.82. Request used fixed prompt, n_predict=512, temperature=0, top_k=1, top_p=1, cache_prompt=false, one server slot, full GPU offload requested. Per-run output tok/s: 254.78, 254.03. ignore_eos=true. Local GGUF: Qwen_Qwen3.5-2B-Q4_K_M.gguf.","engineFlags":{"commandSnippet":"/data/bt/os/llama.cpp/llama.cpp-bench-upstream/build-bench-cuda-gcc15/bin/llama-server -m /data/bt/models/hub/models--bartowski--Qwen_Qwen3.5-2B-GGUF/snapshots/6521bcb22761828aa55639d1c814a207234c3e70/Qwen_Qwen3.5-2B-Q4_K_M.gguf -a qwen3.5-2b-q4_k_m -ngl 999 -fa on -c 4096 -b 2048 -ub 512 -np 1 --no-context-shift --metrics --no-webui --host 127.0.0.1 --port 18102","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"fp16","attentionBackend":"flash_attn","flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"bartowski/Qwen_Qwen3.5-2B-GGUF","displayName":"Qwen_Qwen3.5-2B-GGUF","family":"Qwen","params":2,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.5-2B","displayName":"Qwen3.5-2B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX A5000","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"12th Gen Intel(R) Core(TM) i9-12900K","os":"Linux-6.19.14-300.fc44.x86_64-x86_64-with-glibc2.43","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"ce735c9","quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmp52z7op0011o301d6xkyyyj","username":"snnn","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX A5000","hardwareGroupKey":"DISCRETE_GPU:rtx a5000","rank":40,"reactionCounts":{},"myEmoji":null},{"id":"cmp8fozao00xqo401nzsa1w68","contextLength":4096,"batchSize":1,"ttftMs":153.93,"tokSOut":254.4,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":18.72,"createdAt":"2026-05-16T14:20:57.648Z","notes":"hipfire @ 71896daa (feat/tbq, sFWHT KV family + multi-GPU)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\n  benchmarks/prompts/merge_sort_thinking_off.txt (27 input tokens)\ndecode mode: DFlash speculative (adaptive block_size 8..16, mean B=16)\n  drafter: qwen35-27b-dflash.mq4 (z-lab Qwen3.5-DFlash native head, MQ4 weights)\n  target: qwen3.5-27b.mq4 (MQ4 weights, KV q8 filtered: 16/64 layers carry KV)\nkv_cache: q8 — chosen as winner from {q8,asym3,fwht3,fwht4} sweep (τ identical across all modes at this prompt; q8 has lowest KV-write overhead)\nprompt_normalize: true (default since 2026-04-26)\nτ: 13.1818   accept_rate: 0.8788\noutput: 256 tokens emitted\nAR baseline (same binary same hardware, --ar-baseline): 44.60 tok/s — DFlash speedup 5.70x\nHardware: Sapphire Nitro+ RX 7900 XTX, ROCm 7.2.x, k9lin host\nGPU coordination: gpu-tcas exclusive lease per cell","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo --target ~/.hipfire/models/qwen3.5-27b.mq4 --draft ~/.hipfire/models/qwen35-27b-dflash.mq4 --prompt-file benchmarks/prompts/merge_sort_thinking_off.txt --max 256 --temp 0.0 --no-chatml --kv-mode q8 --ctx 4096","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-27B","displayName":"Qwen3.5-27B","family":"Qwen","params":28,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RX 7900 XTX","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.1.20-alpha+71896daa","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RX 7900 XTX","hardwareGroupKey":"DISCRETE_GPU:rx 7900 xtx","rank":41,"reactionCounts":{"fire":1},"myEmoji":null},{"id":"cmopmuqxr000ejv044fbclmq9","contextLength":4096,"batchSize":1,"ttftMs":229.2,"tokSOut":253.7,"tokSPrefill":null,"tokSTotal":249,"peakVramGb":38.4,"createdAt":"2026-05-03T10:33:46.719Z","notes":"FP8 quantization probe — graph_partition + FAP compile on SM120 Blackwell. BFCL 73.33% matches NVFP4 exactly (compile is quantization-neutral). Context limited to 4096 due to FP8 VRAM pressure at gpu-mem-util=0.40. NVFP4 achieves 2x TPS over FP8 on SM120 (506 vs 254 TPS) due to native 4-bit acceleration. DFlash N=4 + CUDA graphs + greedy.","engineFlags":{"commandSnippet":"vllm serve Qwen/Qwen3.6-35B-A3B-FP8 --dtype auto --max-model-len 4096 --max-num-seqs 1 --gpu-memory-utilization 0.40 --kv-cache-dtype bfloat16 --tool-call-parser qwen3_coder --enable-auto-tool-choice --speculative-config {\"method\":\"dflash\",\"num_speculative_tokens\":4} --compilation-config {\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"use_inductor_graph_partition\":true}","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"bfloat16","attentionBackend":null,"flashAttn":true,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.6-35B-A3B-FP8","displayName":"Qwen3.6-35B-A3B-FP8","family":"Qwen","params":35,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.6-35B-A3B","displayName":"Qwen3.6-35B-A3B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX PRO 6000 Blackwell","gpuCount":1,"vramGb":96,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD EPYC 9555","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.19.x-pr40898-cu130","quantization":"FP8","backend":null},"user":{"id":"cmonzzbj40000jx04er7559mm","username":"zhu","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX PRO 6000 Blackwell","hardwareGroupKey":"DISCRETE_GPU:rtx pro 6000 blackwell","rank":42,"reactionCounts":{},"myEmoji":null},{"id":"cmpq1u0sh009hpl01pq2jp7ta","contextLength":128000,"batchSize":1,"ttftMs":null,"tokSOut":251.2,"tokSPrefill":14563.6,"tokSTotal":null,"peakVramGb":3.6,"createdAt":"2026-05-28T22:12:49.409Z","notes":null,"engineFlags":{"commandSnippet":"","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"LiquidAI/LFM2.5-1.2B-Thinking-GGUF","displayName":"LFM2.5-1.2B-Thinking-GGUF","family":"Llama","params":1,"isMoE":false,"baseModel":{"hfId":"LiquidAI/LFM2.5-1.2B-Thinking","displayName":"LFM2.5-1.2B-Thinking"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 3060","gpuCount":2,"vramGb":12,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"Intel Xeon Gold 6138","os":"Ubuntu 26.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"2cbfdc6","quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmpq0h5n50081pl010wc0lw1q","username":"franktheglock","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX 3060","hardwareGroupKey":"DISCRETE_GPU:rtx 3060","rank":43,"reactionCounts":{},"myEmoji":null},{"id":"cmofyatrj0002jv04r8064iqg","contextLength":4096,"batchSize":1,"ttftMs":154.7,"tokSOut":250.25,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":18.2890625,"createdAt":"2026-04-26T15:56:30.896Z","notes":"hipfire @ 3945bb2 (master post-PR #51 loop-break + ngram_block series)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\n  chatml-wrapped + explicit empty <think></think> for thinking-off\n  prompt file: benchmarks/prompts/merge_sort_thinking_off.txt\nruns: 3 (median reported); range 249.6–251.0\n  per-run tok/s: [249.56, 250.25, 251.02]\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\nτ (median): 13.182\naccept_rate (median): 0.879\nprefill: 81.3ms (332.0 tok/s)\nttft (excl warmup): 154.7ms = prefill + first cycle\nvram: 18728 MB used / 24560 MB total\nnatural EOS at 157 tokens — production-shape bounded code (no loop)","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo --target qwen3.5-27b.mq4 --draft qwen35-27b-dflash.mq4 --prompt $(cat benchmarks/prompts/merge_sort_thinking_off.txt) --max 256 --no-chatml --kv-mode asym3","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.5-27B","displayName":"Qwen3.5-27B","family":"Qwen","params":28,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RX 7900 XTX","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.1.8-alpha+3945bb2","quantization":"MQ4","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RX 7900 XTX","hardwareGroupKey":"DISCRETE_GPU:rx 7900 xtx","rank":44,"reactionCounts":{},"myEmoji":null},{"id":"cmp4l1szd0016ql01ya8wbyv9","contextLength":262144,"batchSize":1,"ttftMs":61.3,"tokSOut":240.9,"tokSPrefill":8385,"tokSTotal":361.9,"peakVramGb":31,"createdAt":"2026-05-13T21:39:49.370Z","notes":"MTP speculative decoding (4 tokens). FP4/FP8 compressed-tensors. Performance mode: interactivity. Language model only.","engineFlags":{"commandSnippet":"vllm serve /home/goose/models/Qwen3.6-35B-A3B-NVFP4-FP8 --served-model-name Qwen3.6-35B-A3B --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --max-model-len auto --max-cudagraph-capture-size 8 --max-num-seqs 8 --max-num-batched-tokens 8192 --gpu-memory-utilization 0.88 --trust-remote-code --quantization compressed-tensors --safetensors-load-strategy prefetch --enable-prefix-caching --kv-cache-dtype fp8 --scheduling-policy priority --performance-mode interactivity --language-model-only --speculative-config '{\"method\":\"qwen3_next_mtp\",\"num_speculative_tokens\":4}'","tensorParallel":null,"gpuLayers":null,"kvCacheDtype":"fp8","attentionBackend":null,"flashAttn":null,"specDecoding":false,"mtpEnabled":true},"model":{"hfId":"Qwen/Qwen3.6-35B-A3B","displayName":"Qwen3.6-35B-A3B","family":"Qwen","params":36,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 5090","gpuCount":1,"vramGb":32,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"Ryzen 7 3700X","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.20.2rc1","quantization":"compressed-tensors (NVFP4+FP8)","backend":"cuda"},"user":{"id":"cmoz6x7gd006gtl01imbc8lp5","username":"slow4cyl","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX 5090","hardwareGroupKey":"DISCRETE_GPU:rtx 5090","rank":45,"reactionCounts":{"fire":1},"myEmoji":null},{"id":"cmozt50bb007wlo01sfaws32u","contextLength":200000,"batchSize":3,"ttftMs":null,"tokSOut":240.8,"tokSPrefill":null,"tokSTotal":240.8,"peakVramGb":null,"createdAt":"2026-05-10T13:27:24.887Z","notes":"Post-NVLink concurrency=3 aggregate: NVIDIA P3651 2-slot bridge, NV4 (~56 GB/s aggregate). 3 parallel chat completions, ignore_eos + min_tokens=400, temp=0.7. Per-request: ~80-85 t/s each. Pre-NVLink baseline was 199 t/s aggregate; bridge yields +21% here because the all-reduce tensor at this batch size becomes interconnect-bound on PHB. MTP n=3, KV FP8, TP=2 across 2x RTX 3090. Re-run 2026-05-10.","engineFlags":{"commandSnippet":"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,1 vllm serve /opt/models/Qwen3.6-27B-int4-AutoRound --served-model-name qwen3.6-27b --tensor-parallel-size 2 --max-model-len 200000 --kv-cache-dtype fp8 --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}' --disable-custom-all-reduce --host 0.0.0.0 --port 8000","tensorParallel":2,"gpuLayers":null,"kvCacheDtype":"fp8","attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Lorbus/Qwen3.6-27B-int4-AutoRound","displayName":"Qwen3.6-27B-int4-AutoRound","family":"Qwen","params":27,"isMoE":false,"baseModel":{"hfId":"Qwen/Qwen3.6-27B","displayName":"Qwen3.6-27B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 3090","gpuCount":2,"vramGb":48,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen","os":"Ubuntu","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"vllm","engineVersion":"0.19.1","quantization":"AutoRound INT4","backend":null},"user":{"id":"cmowc9fer00qdp101b007eug5","username":"murdarch","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX 3090","hardwareGroupKey":"DISCRETE_GPU:rtx 3090","rank":46,"reactionCounts":{},"myEmoji":null},{"id":"cmppe779e002mpl014npvq9st","contextLength":4096,"batchSize":1,"ttftMs":154.23,"tokSOut":232.14,"tokSPrefill":334.59,"tokSTotal":null,"peakVramGb":17.46875,"createdAt":"2026-05-28T11:11:13.539Z","notes":"hipfire @ ca30ca21 (dense-dflash-gfx11-perfmaxx, origin/feat/dense-dflash-perfmaxx base)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\ndecode mode: DFlash speculative, adaptive B stayed at B=16\ntarget: qwen3.6-27b-awq.mq4 (md5=e42a489edf0be1b144307f2869b73b93) -- corrected AWQ trunk\ndraft: qwen36-27b-dflash-mq4.hf4 (md5=204c4c4ceab30cb9ebc118fa9d59a446)\nbinary: dflash_spec_demo md5=5c7585b5523a6abc0ce9829413a81bec\nkv_cache: q8; ctx=4096; temp=0.0; no-chatml; prompt_normalize: true/default\nruns: 3 fresh processes; per-run tok/s: [232.28, 231.81, 232.14]; median=232.14; spread=0.20%\nmedian details: cycles=13 committed=174 accepted=148 tau=11.3846 accept_rate=0.7590\nprefill median: 80.7ms / 334.59 tok/s; ttft median: 154.23ms; peak VRAM: 17888 MB\nnatural EOS at 162 emitted tokens; decoded output eyeballed as fluent merge_sort code\nThis row supersedes invalid scratch runs that used qwen3.6-27b-dev/qwen3.6-27b.mq4-awq-gptq-f2-lmhead-a100.hfq as trunk.\nNo AR baseline rerun in this correction pass.\nHardware: Sapphire Nitro+ RX 7900 XTX / gfx1100 on k9lin, HIP 7.2, Ubuntu 24.04.","engineFlags":{"commandSnippet":"./target/release/examples/dflash_spec_demo --target /home/kaden/.hipfire/models/qwen3.6-27b-awq.mq4 --draft /home/kaden/.hipfire/models/qwen36-27b-dflash-mq4.hf4 --prompt-file benchmarks/prompts/merge_sort_thinking_off.txt --max 256 --temp 0.0 --no-chatml --kv-mode q8 --ctx 4096","tensorParallel":1,"gpuLayers":null,"kvCacheDtype":"q8","attentionBackend":null,"flashAttn":null,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.6-27B","displayName":"Qwen3.6-27B","family":"Qwen","params":28,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RX 7900 XTX","gpuCount":1,"vramGb":24,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":null,"os":null,"isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"hipfire","engineVersion":"0.2.0+ca30ca21","quantization":"MQ4-AWQ","backend":"rocm"},"user":{"id":"cmoeye1gq0000le04ie2kqs58","username":"schuttdev","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RX 7900 XTX","hardwareGroupKey":"DISCRETE_GPU:rx 7900 xtx","rank":47,"reactionCounts":{},"myEmoji":null},{"id":"cmp4xieqn0002pg01fz0ir9ry","contextLength":4096,"batchSize":1,"ttftMs":115.21,"tokSOut":230.73,"tokSPrefill":null,"tokSTotal":462.11,"peakVramGb":24,"createdAt":"2026-05-14T03:28:39.455Z","notes":"Single RTX 5090 ASUS ROG Astral OC (Blackwell GB202, sm_120a, 32 GB GDDR7), Ryzen 9 9950X + 96 GB DDR5-6000 CL28, Ubuntu 24.04 kernel 6.17, driver 580.159.03 / CUDA 13.0. Q4_K_M weights fully resident on GPU (-ngl 999), Q8 KV cache, 4K context, flash attention. Methodology: vllm bench serve random 512in/512out, 20 prompts, max-concurrency 1. Peak output token throughput 249.00 t/s during sustained decode; mean TPOT 4.12 ms.","engineFlags":{"commandSnippet":"/home/steven/dev/llama.cpp/build/bin/llama-server -m Qwen3.6-35B-A3B-Q4_K_M.gguf --alias qwen3.6-35b-a3b-q4km -fa 1 -ngl 999 -c 4096 -ctk q8_0 -ctv q8_0 --no-context-shift --parallel 1 -b 2048 -ub 1024 --metrics --jinja --reasoning off --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0.0 --host 0.0.0.0 --port 8000","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"q8_0","attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.6-35B-A3B","displayName":"Qwen3.6-35B-A3B","family":"Qwen","params":36,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 5090","gpuCount":1,"vramGb":32,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"Ryzen 7 3700X","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"b6217 (6217b49)","quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmof7gocp0015gv041vcnngqj","username":"Skiipy","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX 5090","hardwareGroupKey":"DISCRETE_GPU:rtx 5090","rank":48,"reactionCounts":{},"myEmoji":null},{"id":"cmph5zm7j00gwpc0104tq6x4m","contextLength":1024,"batchSize":1,"ttftMs":null,"tokSOut":228.56,"tokSPrefill":null,"tokSTotal":null,"peakVramGb":22.89,"createdAt":"2026-05-22T16:59:13.327Z","notes":"Lucebox-Hub test_dflash on RTX 5090 (Blackwell sm_120). 10-run validation: mean 222.64, best 228.56, min 214.33, std 4.06 tok/s. Same merge_sort thinking-off prompt class (25 input tokens, 256 forced output tokens, temp=0) as the hipfire reference. DFlash speculative decoding + DDTree (budget=16) + fast-rollback. Block-diffusion draft from z-lab/Qwen3.6-27B-DFlash (3.46GB BF16). Mean acceptance length 8.83 tokens/step, 55.2% accept rate, 29 draft steps for 256 tokens. AR baseline same hardware: 72.7 tok/s (llama.cpp mainline UD-Q4_K_XL) -> 3.14x speedup.","engineFlags":{"commandSnippet":"DFLASH27B_TOKENIZER=Qwen/Qwen3.6-27B python3 scripts/run.py --raw --prompt \"$(cat merge_sort_thinking_off.txt)\" --n-gen 256 --target models/Qwen3.6-27B-Q4_K_M.gguf --draft models/draft-3.6 --budget 16 --max-ctx 1024 --fa-window 0","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":"q8_0","attentionBackend":null,"flashAttn":true,"specDecoding":true,"mtpEnabled":false},"model":{"hfId":"Qwen/Qwen3.6-27B","displayName":"Qwen3.6-27B","family":"Qwen","params":28,"isMoE":false,"baseModel":null},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 5090","gpuCount":1,"vramGb":32,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"Ryzen 7 3700X","os":"Ubuntu 24.04","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":"Lucebox-Hub dflash 79fe738 (Luce-Org/llama.cpp@luce-dflash fork)","quantization":"Q4_K_M","backend":"cuda"},"user":{"id":"cmof7gocp0015gv041vcnngqj","username":"Skiipy","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX 5090","hardwareGroupKey":"DISCRETE_GPU:rtx 5090","rank":49,"reactionCounts":{},"myEmoji":null},{"id":"cmpqj1zce00g5pl01u99dc6ej","contextLength":128000,"batchSize":1,"ttftMs":457.6,"tokSOut":227.38,"tokSPrefill":null,"tokSTotal":194.75,"peakVramGb":null,"createdAt":"2026-05-29T06:14:54.254Z","notes":"100,000 tokens in KV cache (near max 128K context). 351 output tokens.","engineFlags":{"commandSnippet":"llama-server -m LFM2.5-8B-A1B-Q8_0.gguf -ngl 999 -fa1 -c 0","tensorParallel":null,"gpuLayers":999,"kvCacheDtype":null,"attentionBackend":null,"flashAttn":true,"specDecoding":false,"mtpEnabled":false},"model":{"hfId":"LiquidAI/LFM2.5-8B-A1B-GGUF","displayName":"LFM2.5-8B-A1B-GGUF","family":"Llama","params":8,"isMoE":false,"baseModel":{"hfId":"LiquidAI/LFM2.5-8B-A1B","displayName":"LFM2.5-8B-A1B"}},"hardware":{"hwClass":"DISCRETE_GPU","gpuName":"RTX 3090","gpuCount":2,"vramGb":48,"chipVendor":null,"chipFamily":null,"chipVariant":null,"unifiedMemoryGb":null,"cpu":"AMD Ryzen","os":"Ubuntu","isHeterogeneousGpu":false,"gpuSlots":[]},"engine":{"engineName":"llama.cpp","engineVersion":null,"quantization":"Q8_0","backend":null},"user":{"id":"cmo3v1l6p0000jk04cplvn9g5","username":"Lottolabs","verified":false,"verifiedAt":null},"hardwareGroupLabel":"RTX 3090","hardwareGroupKey":"DISCRETE_GPU:rtx 3090","rank":50,"reactionCounts":{},"myEmoji":null}],"total":1086,"limit":50,"offset":0}