diff --git a/mcp/mcp-image-gen/src/workflows/flux2_klein_heretic.json b/mcp/mcp-image-gen/src/workflows/flux2_klein_heretic.json index e741859..2bf8f32 100644 --- a/mcp/mcp-image-gen/src/workflows/flux2_klein_heretic.json +++ b/mcp/mcp-image-gen/src/workflows/flux2_klein_heretic.json @@ -1,73 +1,98 @@ { - "6": { + "1": { + "class_type": "CLIPLoader", + "inputs": { + "clip_name": "qwen_3_4b_bfl.safetensors", + "type": "flux2", + "device": "default" + } + }, + "2": { "class_type": "CLIPTextEncode", "inputs": { - "clip": ["30", 0], + "clip": ["1", 0], "text": "PROMPT_PLACEHOLDER" } }, - "8": { - "class_type": "VAEDecode", + "3": { + "class_type": "CLIPTextEncode", "inputs": { - "samples": ["13", 0], - "vae": ["31", 0] + "clip": ["1", 0], + "text": "NEGATIVE_PLACEHOLDER" } }, - "9": { - "class_type": "SaveImage", + "4": { + "class_type": "UNETLoader", "inputs": { - "filename_prefix": "mcp-image-gen", - "images": ["8", 0] + "unet_name": "flux-2-klein-4b.safetensors", + "weight_dtype": "default" } }, - "13": { - "class_type": "KSampler", - "inputs": { - "cfg": 1.0, - "denoise": 1.0, - "latent_image": ["27", 0], - "model": ["32", 0], - "negative": ["33", 0], - "positive": ["6", 0], - "sampler_name": "euler", - "scheduler": "beta", - "seed": 42, - "steps": 4 - } - }, - "27": { - "class_type": "EmptySD3LatentImage", - "inputs": { - "batch_size": 1, - "height": 1024, - "width": 1024 - } - }, - "30": { - "class_type": "CLIPLoader", - "inputs": { - "clip_name": "qwen_3_4b_heretic.safetensors", - "type": "flux" - } - }, - "31": { + "5": { "class_type": "VAELoader", "inputs": { "vae_name": "flux2-vae.safetensors" } }, - "32": { - "class_type": "UNETLoader", + "6": { + "class_type": "EmptyFlux2LatentImage", "inputs": { - "unet_name": "flux-2-klein-4b.safetensors", - "weight_dtype": "fp8_e4m3fn" + "width": 1024, + "height": 1024, + "batch_size": 1 } }, - "33": { - "class_type": "CLIPTextEncode", + "7": { + "class_type": "Flux2Scheduler", "inputs": { - "clip": ["30", 0], - "text": 
"NEGATIVE_PLACEHOLDER" + "steps": 20, + "width": 1024, + "height": 1024 + } + }, + "8": { + "class_type": "CFGGuider", + "inputs": { + "model": ["4", 0], + "positive": ["2", 0], + "negative": ["3", 0], + "cfg": 5 + } + }, + "9": { + "class_type": "KSamplerSelect", + "inputs": { + "sampler_name": "euler" + } + }, + "10": { + "class_type": "RandomNoise", + "inputs": { + "noise_seed": 42 + } + }, + "11": { + "class_type": "SamplerCustomAdvanced", + "inputs": { + "noise": ["10", 0], + "guider": ["8", 0], + "sampler": ["9", 0], + "sigmas": ["7", 0], + "latent_image": ["6", 0] + } + }, + "12": { + "class_type": "VAEDecode", + "inputs": { + "samples": ["11", 0], + "vae": ["5", 0] + } + }, + "13": { + "class_type": "SaveImage", + "inputs": { + "filename_prefix": "mcp-image-gen", + "images": ["12", 0] } } -} \ No newline at end of file +} diff --git a/mcp/mcp-image-gen/tests/test_server.py b/mcp/mcp-image-gen/tests/test_server.py index e38839a..24f0184 100644 --- a/mcp/mcp-image-gen/tests/test_server.py +++ b/mcp/mcp-image-gen/tests/test_server.py @@ -63,11 +63,19 @@ def test_build_flux_workflow_heretic_model(): seed=42, model="flux-2-klein-4b.safetensors", ) - assert wf["6"]["class_type"] == "CLIPTextEncode" - assert wf["30"]["class_type"] == "CLIPLoader" # Qwen3-4B uses single CLIPLoader - assert wf["32"]["inputs"]["unet_name"] == "flux-2-klein-4b.safetensors" - assert wf["31"]["inputs"]["vae_name"] == "flux2-vae.safetensors" - assert wf["13"]["inputs"]["scheduler"] == "beta" # FLUX.2 Klein uses beta scheduler + # New FLUX.2 workflow uses different node IDs and types + assert wf["1"]["class_type"] == "CLIPLoader" # Qwen3-4B uses single CLIPLoader + assert wf["1"]["inputs"]["type"] == "flux2" # correct type for FLUX.2 + assert wf["1"]["inputs"]["device"] == "default" # required for FLUX.2 CLIPLoader + assert wf["2"]["class_type"] == "CLIPTextEncode" # standard CLIP encode (not Flux-specific) + assert wf["4"]["class_type"] == "UNETLoader" + assert 
wf["4"]["inputs"]["unet_name"] == "flux-2-klein-4b.safetensors" + assert wf["4"]["inputs"]["weight_dtype"] == "default" # not fp8 — avoids dimension errors + assert wf["6"]["class_type"] == "EmptyFlux2LatentImage" # FLUX.2-specific latent + assert wf["8"]["class_type"] == "CFGGuider" # CFGGuider replaces FluxDisableGuidance+BasicGuider + assert wf["8"]["inputs"]["cfg"] == 5 # cfg=5 for FLUX.2 Klein + assert wf["11"]["class_type"] == "SamplerCustomAdvanced" # FLUX.2 sampler (node 11, not 12) + assert wf["13"]["class_type"] == "SaveImage" # output node def test_workflow_registry_contains_both_models(): diff --git a/plans/heretic-flux2-klein-RECAP.md b/plans/heretic-flux2-klein-RECAP.md new file mode 100644 index 0000000..65b2f74 --- /dev/null +++ b/plans/heretic-flux2-klein-RECAP.md @@ -0,0 +1,104 @@ +# FLUX.2 Klein 4B + Heretic — Session Recap + +**Date:** 2026-04-10 +**Status:** Code complete, live generation BLOCKED by encoder dimension mismatch + +--- + +## What We Achieved ✅ + +### Code Infrastructure (Solid) +- **`mcp-image-gen/src/server.py`** — Generic workflow registry with model-based dispatch, `_inject_workflow_params()` works recursively on any node layout +- **`mcp-image-gen/tests/test_server.py`** — 37/37 tests passing +- **Gitea** — pushed to main (commit `38d26ad`) +- The architecture is right: adding a new model = add 1 JSON file + 1 registry entry + +### Models Downloaded (on disk) +| File | Location | Status | +|------|----------|--------| +| `flux-2-klein-4b.safetensors` | `~/ComfyUI/models/diffusion_models/` | ✅ 7.3GB | +| `qwen_3_4b_bfl.safetensors` | `~/ComfyUI/models/text_encoders/` | ✅ merged from BFL shards | +| `qwen_3_4b.safetensors` (z_image) | `~/ComfyUI/models/text_encoders/split_files/` | ✅ wrong model | +| `Qwen3-4B-Q8_0.gguf` | `~/ComfyUI/models/text_encoders/` | ✅ wrong arch | +| ComfyUI-GGUF extension | `~/ComfyUI/custom_nodes/ComfyUI-GGUF` | ✅ installed | + +--- + +## What Failed and Why ❌ + +### The Error (persistent) +``` +mat1 
and mat2 shapes cannot be multiplied (512x4096 and 7680x3072) +``` + +### Root Cause Analysis + +**The `SamplerCustomAdvanced` node** (node 11 in the current `flux2_klein_heretic.json`) fails — meaning the conditioning vector from the text encoder doesn't match the diffusion model's expected input. + +| Component | Expected | Got | +|-----------|----------|-----| +| FLUX.2 Klein 4B conditioning input | **7680-dim** (2560 × 3) | **4096-dim** | + +**Why 7680 = 2560 × 3?** +FLUX.2 builds its text conditioning by concatenating the encoder's hidden states from three intermediate layers, not just the final one. The BFL Qwen3 encoder has `hidden_size=2560`, so the concatenated output is 2560×3=7680. + +**Why 4096?** +Every other Qwen3 variant (z_image_turbo, official Qwen repo GGUF) uses standard Qwen3 with `hidden_size=4096` — these are for Z-Image and text generation respectively, NOT for FLUX.2 Klein. + +### What We Tried (and Why Each Failed) +1. `CLIPLoader type=flux` → wrong architecture (FLUX.1 style) +2. `CLIPLoader type=flux2` → correct node, wrong encoder file (z_image Qwen) +3. `CLIPLoaderGGUF type=flux2` → correct node, wrong GGUF (standard Qwen3) +4. `CLIPLoader type=flux2 + qwen_3_4b_bfl.safetensors` → merged BFL shards, but still fails +5. Workflow: `KSampler` → doesn't work with FLUX.2 (different architecture) +6. Workflow: `SamplerCustomAdvanced + BasicGuider + Flux2Scheduler` → correct architecture but encoding mismatch persists + +### The Real Missing Piece + +The BFL FLUX.2 Klein text encoder in Diffusers format is designed for use via `transformers/diffusers` pipeline, NOT via ComfyUI's `CLIPLoader`. ComfyUI reads the weights differently. The weights are there but ComfyUI doesn't know how to map `model.embed_tokens`, `model.layers.N.*` etc. to the CLIP interface it expects. + +**The correct encoder file for ComfyUI** is `Comfy-Org/vae-text-encorder-for-flux-klein-4b` — the 7.5GB file we downloaded IS the right one, but ComfyUI is likely loading it with the wrong adapter in the `CLIPLoader`. 
+ +--- + +## Clean Approach — What We Need to Do + +### Option A: Use ComfyUI Web UI (Easiest) +1. Open `http://localhost:8188` in browser +2. Load the "Flux.2 Klein 4B Text-to-Image" workflow template (it's in the UI Templates) +3. **Export the working API JSON** (Ctrl+Shift+E or Settings → Save as API format) +4. Replace our `flux2_klein_heretic.json` with the exported JSON +5. Put the `PROMPT_PLACEHOLDER` and `NEGATIVE_PLACEHOLDER` strings back into the text-encode nodes, then test + +This gives us the **verified working node graph** without guessing. 10 minutes. + +### Option B: Find a Working API JSON online +- Reddit r/comfyui has working FLUX.2 Klein workflows +- We need one saved in API export format + +### Then: Add Heretic +Once we have a working standard workflow: +1. Download the actual Heretic-abliterated version of the BFL encoder (once it's published) +2. Swap encoder filename in the JSON + +--- + +## My Recommendation + +**Do Option A right now.** Open `http://localhost:8188`, load the template, export to API format, paste the JSON. We'll be running in 10 minutes instead of guessing node names. + +The MCP server code is solid — the only broken piece is `flux2_klein_heretic.json`. Once we have the right JSON from the UI, everything else works. + +--- + +## Files to Clean Up (After We Have the Right JSON) + +```bash +# Remove wrong encoders (save ~8GB) +rm ~/ComfyUI/models/text_encoders/split_files/qwen_3_4b.safetensors # z_image version +rm ~/ComfyUI/models/text_encoders/qwen_3_4b_flux2.safetensors # not in the "Models Downloaded" table — confirm before deleting + +# Keep +# ~/ComfyUI/models/text_encoders/qwen_3_4b_bfl.safetensors ← correct encoder +# ~/ComfyUI/models/text_encoders/Qwen3-4B-Q8_0.gguf ← maybe useful later +```