0.34.0 (#239)
Visual Agent Refactor + Visual Library, Character Card Import Refactor, bug fixes and other improvements
19  .github/workflows/ci.yml (vendored)
@@ -19,6 +19,25 @@ jobs:
    steps:
      - uses: actions/checkout@v4

      - name: Remove unnecessary files to release disk space
        run: |
          sudo rm -rf \
            "$AGENT_TOOLSDIRECTORY" \
            /opt/ghc \
            /opt/google/chrome \
            /opt/microsoft/msedge \
            /opt/microsoft/powershell \
            /opt/pipx \
            /usr/lib/mono \
            /usr/local/julia* \
            /usr/local/lib/android \
            /usr/local/lib/node_modules \
            /usr/local/share/chromium \
            /usr/local/share/powershell \
            /usr/local/share/powershell \
            /usr/share/dotnet \
            /usr/share/swift

      - name: Log in to GHCR
        uses: docker/login-action@v3
        with:
19  .github/workflows/test-container-build.yml (vendored)
@@ -14,6 +14,25 @@ jobs:
    steps:
      - uses: actions/checkout@v4

      - name: Remove unnecessary files to release disk space
        run: |
          sudo rm -rf \
            "$AGENT_TOOLSDIRECTORY" \
            /opt/ghc \
            /opt/google/chrome \
            /opt/microsoft/msedge \
            /opt/microsoft/powershell \
            /opt/pipx \
            /usr/lib/mono \
            /usr/local/julia* \
            /usr/local/lib/android \
            /usr/local/lib/node_modules \
            /usr/local/share/chromium \
            /usr/local/share/powershell \
            /usr/local/share/powershell \
            /usr/share/dotnet \
            /usr/share/swift

      - name: Log in to GHCR
        uses: docker/login-action@v3
        with:
18  Dockerfile
@@ -45,6 +45,9 @@ WORKDIR /app

RUN apt-get update && apt-get install -y \
    bash \
    wget \
    tar \
    xz-utils \
    && rm -rf /var/lib/apt/lists/*

# Install uv in the final stage

@@ -53,6 +56,21 @@ RUN pip install uv
# Copy virtual environment from backend-build stage
COPY --from=backend-build /app/.venv /app/.venv

# Download and install FFmpeg 8.0 with shared libraries into .venv (matching Windows installer approach)
# Using BtbN FFmpeg builds which provide shared libraries - verified to work
# Note: We tried using jrottenberg/ffmpeg:8.0-ubuntu image but copying libraries from it didn't work properly,
# so we use the direct download approach which is more reliable and matches the Windows installer
RUN cd /tmp && \
    wget -q https://github.com/BtbN/FFmpeg-Builds/releases/download/latest/ffmpeg-master-latest-linux64-gpl-shared.tar.xz -O ffmpeg.tar.xz && \
    tar -xf ffmpeg.tar.xz && \
    cp -a ffmpeg-master-latest-linux64-gpl-shared/bin/* /app/.venv/bin/ && \
    cp -a ffmpeg-master-latest-linux64-gpl-shared/lib/* /app/.venv/lib/ && \
    rm -rf ffmpeg-master-latest-linux64-gpl-shared ffmpeg.tar.xz && \
    LD_LIBRARY_PATH=/app/.venv/lib /app/.venv/bin/ffmpeg -version | head -n 1

# Set LD_LIBRARY_PATH so torchcodec can find ffmpeg libraries at runtime
ENV LD_LIBRARY_PATH=/app/.venv/lib:${LD_LIBRARY_PATH}

# Copy Python source code
COPY --from=backend-build /app/src /app/src
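Since the only runtime requirement is that torchcodec can resolve the FFmpeg shared libraries from `/app/.venv/lib`, a quick sanity check inside the built image can confirm the layout. This is a minimal sketch (the `check_ffmpeg.py` helper is not part of the repository) that mirrors the `ENV LD_LIBRARY_PATH` line above:

```python
# check_ffmpeg.py - hypothetical helper, run inside the built container.
# Confirms the BtbN FFmpeg binaries copied into the virtualenv are usable.
import os
import subprocess

VENV = "/app/.venv"


def main() -> None:
    env = dict(os.environ)
    # Mirror the Dockerfile's ENV so the shared libraries in .venv/lib resolve.
    env["LD_LIBRARY_PATH"] = f"{VENV}/lib:" + env.get("LD_LIBRARY_PATH", "")
    result = subprocess.run(
        [f"{VENV}/bin/ffmpeg", "-version"],
        env=env,
        capture_output=True,
        text=True,
        check=True,
    )
    # First line looks like "ffmpeg version N-xxxxx ..." for the BtbN builds.
    print(result.stdout.splitlines()[0])


if __name__ == "__main__":
    main()
```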
@@ -1,134 +0,0 @@
"""
An attempt to write a client against the runpod serverless vllm worker.

This is close to functional, but since runpod serverless gpu availability is currently terrible, i have
been unable to properly test it.

Putting it here for now since i think it makes a decent example of how to write a client against a new service.
"""

import pydantic
import structlog
import runpod
import asyncio
import aiohttp

from talemate.client.base import ClientBase, ExtraField
from talemate.client.registry import register
from talemate.emit import emit
from talemate.config import Client as BaseClientConfig

log = structlog.get_logger("talemate.client.runpod_vllm")


class Defaults(pydantic.BaseModel):
    max_token_length: int = 4096
    model: str = ""
    runpod_id: str = ""


class ClientConfig(BaseClientConfig):
    runpod_id: str = ""


@register()
class RunPodVLLMClient(ClientBase):
    client_type = "runpod_vllm"
    conversation_retries = 5
    config_cls = ClientConfig

    class Meta(ClientBase.Meta):
        title: str = "Runpod VLLM"
        name_prefix: str = "Runpod VLLM"
        enable_api_auth: bool = True
        manual_model: bool = True
        defaults: Defaults = Defaults()
        extra_fields: dict[str, ExtraField] = {
            "runpod_id": ExtraField(
                name="runpod_id",
                type="text",
                label="Runpod ID",
                required=True,
                description="The Runpod ID to connect to.",
            )
        }

    def __init__(self, model=None, runpod_id=None, **kwargs):
        self.model_name = model
        self.runpod_id = runpod_id
        super().__init__(**kwargs)

    @property
    def experimental(self):
        return False

    def set_client(self, **kwargs):
        log.debug("set_client", kwargs=kwargs, runpod_id=self.runpod_id)
        self.runpod_id = kwargs.get("runpod_id", self.runpod_id)

    def tune_prompt_parameters(self, parameters: dict, kind: str):
        super().tune_prompt_parameters(parameters, kind)

        keys = list(parameters.keys())

        valid_keys = ["temperature", "top_p", "max_tokens"]

        for key in keys:
            if key not in valid_keys:
                del parameters[key]

    async def get_model_name(self):
        return self.model_name

    async def generate(self, prompt: str, parameters: dict, kind: str):
        """
        Generates text from the given prompt and parameters.
        """
        prompt = prompt.strip()

        self.log.debug("generate", prompt=prompt[:128] + " ...", parameters=parameters)

        try:
            async with aiohttp.ClientSession() as session:
                endpoint = runpod.AsyncioEndpoint(self.runpod_id, session)

                run_request = await endpoint.run(
                    {
                        "input": {
                            "prompt": prompt,
                        }
                        # "parameters": parameters
                    }
                )

                while (await run_request.status()) not in [
                    "COMPLETED",
                    "FAILED",
                    "CANCELLED",
                ]:
                    status = await run_request.status()
                    log.debug("generate", status=status)
                    await asyncio.sleep(0.1)

                status = await run_request.status()

                log.debug("generate", status=status)

                response = await run_request.output()

                log.debug("generate", response=response)

                return response["choices"][0]["tokens"][0]

        except Exception as e:
            self.log.error("generate error", e=e)
            emit(
                "status", message="Error during generation (check logs)", status="error"
            )
            return ""

    def reconfigure(self, **kwargs):
        if kwargs.get("model"):
            self.model_name = kwargs["model"]
        if "runpod_id" in kwargs:
            self.api_auth = kwargs["runpod_id"]
        self.set_client(**kwargs)
BIN  docs/img/0.34.0/character-card-1.png (new file, 346 KiB)
BIN  docs/img/0.34.0/character-card-2.png (new file, 702 KiB)
BIN  docs/img/0.34.0/character-card-3.png (new file, 12 KiB)
BIN  docs/img/0.34.0/character-card-4.png (new file, 5.5 KiB)
BIN  docs/img/0.34.0/comfyui.workflow.setup.agent_config.png (new file, 54 KiB)
BIN  docs/img/0.34.0/comfyui.workflow.setup.browse-templates.png (new file, 9.4 KiB)
BIN  (new file, 3.0 KiB)
BIN  docs/img/0.34.0/comfyui.workflow.setup.lighting-lora.png (new file, 42 KiB)
BIN  docs/img/0.34.0/comfyui.workflow.setup.qwen-export.png (new file, 19 KiB)
BIN  docs/img/0.34.0/comfyui.workflow.setup.qwen-save.png (new file, 18 KiB)
BIN  docs/img/0.34.0/comfyui.workflow.setup.qwen-start.png (new file, 471 KiB)
BIN  docs/img/0.34.0/comfyui.workflow.setup.qwen-template.png (new file, 180 KiB)
BIN  docs/img/0.34.0/comfyui.workflow.setup.talemate-empty-prompt.png (new file, 12 KiB)
BIN  docs/img/0.34.0/comfyui.workflow.setup.talemate-prompts.png (new file, 82 KiB)
BIN  docs/img/0.34.0/comfyui.workflow.setup.talemate-references.png (new file, 48 KiB)
BIN  docs/img/0.34.0/comfyui.workflow.setup.talemate-resulotion.png (new file, 30 KiB)
BIN  docs/img/0.34.0/shared-world-1.png (new file, 70 KiB)
BIN  docs/img/0.34.0/shared-world-10.png (new file, 46 KiB)
BIN  docs/img/0.34.0/shared-world-11.png (new file, 45 KiB)
BIN  docs/img/0.34.0/shared-world-12.png (new file, 4.9 KiB)
BIN  docs/img/0.34.0/shared-world-13.png (new file, 24 KiB)
BIN  docs/img/0.34.0/shared-world-14.png (new file, 411 KiB)
BIN  docs/img/0.34.0/shared-world-2.png (new file, 18 KiB)
BIN  docs/img/0.34.0/shared-world-3.png (new file, 9.5 KiB)
BIN  docs/img/0.34.0/shared-world-4.png (new file, 21 KiB)
BIN  docs/img/0.34.0/shared-world-5.png (new file, 16 KiB)
BIN  docs/img/0.34.0/shared-world-6.png (new file, 27 KiB)
BIN  docs/img/0.34.0/shared-world-7.png (new file, 28 KiB)
BIN  docs/img/0.34.0/shared-world-8.png (new file, 371 KiB)
BIN  docs/img/0.34.0/shared-world-9.png (new file, 24 KiB)
BIN  docs/img/0.34.0/visual-agent-a1111-1.png (new file, 46 KiB)
BIN  docs/img/0.34.0/visual-agent-a1111-2.png (new file, 45 KiB)
BIN  docs/img/0.34.0/visual-agent-a1111-3.png (new file, 2.8 KiB)
BIN  docs/img/0.34.0/visual-agent-comfyui-1.png (new file, 50 KiB)
BIN  docs/img/0.34.0/visual-agent-comfyui-2.png (new file, 42 KiB)
BIN  docs/img/0.34.0/visual-agent-comfyui-3.png (new file, 44 KiB)
BIN  docs/img/0.34.0/visual-agent-comfyui-4.png (new file, 41 KiB)
BIN  docs/img/0.34.0/visual-agent-comfyui-5.png (new file, 3.9 KiB)
BIN  docs/img/0.34.0/visual-agent-general-1.png (new file, 3.0 KiB)
BIN  docs/img/0.34.0/visual-agent-general-2.png (new file, 44 KiB)
BIN  docs/img/0.34.0/visual-agent-general-3.png (new file, 35 KiB)
BIN  docs/img/0.34.0/visual-agent-google-4.png (new file, 50 KiB)
BIN  docs/img/0.34.0/visual-agent-google-5.png (new file, 30 KiB)
BIN  docs/img/0.34.0/visual-agent-google-6.png (new file, 25 KiB)
BIN  docs/img/0.34.0/visual-agent-google-7.png (new file, 22 KiB)
BIN  docs/img/0.34.0/visual-agent-google-8.png (new file, 4.4 KiB)
BIN  docs/img/0.34.0/visual-agent-openai-1.png (new file, 50 KiB)
BIN  docs/img/0.34.0/visual-agent-openai-2.png (new file, 27 KiB)
BIN  docs/img/0.34.0/visual-agent-openai-3.png (new file, 25 KiB)
BIN  docs/img/0.34.0/visual-agent-openai-4.png (new file, 21 KiB)
BIN  docs/img/0.34.0/visual-agent-openai-5.png (new file, 4.7 KiB)
BIN  docs/img/0.34.0/visual-agent-openrouter-1.png (new file, 52 KiB)
BIN  docs/img/0.34.0/visual-agent-openrouter-2.png (new file, 48 KiB)
BIN  docs/img/0.34.0/visual-agent-openrouter-3.png (new file, 47 KiB)
BIN  docs/img/0.34.0/visual-agent-openrouter-4.png (new file, 45 KiB)
BIN  docs/img/0.34.0/visual-agent-openrouter-5.png (new file, 4.5 KiB)
BIN  docs/img/0.34.0/visual-agent-sdnext-1.png (new file, 48 KiB)
BIN  docs/img/0.34.0/visual-agent-sdnext-2.png (new file, 51 KiB)
BIN  docs/img/0.34.0/visual-agent-sdnext-3.png (new file, 12 KiB)
BIN  docs/img/0.34.0/visual-agent-sdnext-4.png (new file, 6.6 KiB)
BIN  docs/img/0.34.0/visual-agent-sdnext-5.png (new file, 54 KiB)
BIN  docs/img/0.34.0/visual-agent-sdnext-6.png (new file, 3.9 KiB)
BIN  docs/img/0.34.0/visual-library-0.png (new file, 6.9 KiB)
BIN  docs/img/0.34.0/visual-library-1.png (new file, 20 KiB)
BIN  docs/img/0.34.0/visual-library-10.png (new file, 20 KiB)
BIN  docs/img/0.34.0/visual-library-11.png (new file, 45 KiB)
BIN  docs/img/0.34.0/visual-library-12.png (new file, 53 KiB)
BIN  docs/img/0.34.0/visual-library-13.png (new file, 60 KiB)
BIN  docs/img/0.34.0/visual-library-14.png (new file, 11 KiB)
BIN  docs/img/0.34.0/visual-library-15.png (new file, 1.1 MiB)
BIN  docs/img/0.34.0/visual-library-16.png (new file, 15 KiB)
BIN  docs/img/0.34.0/visual-library-17.png (new file, 12 KiB)
BIN  docs/img/0.34.0/visual-library-18.png (new file, 29 KiB)
BIN  docs/img/0.34.0/visual-library-19.png (new file, 1.0 MiB)
BIN  docs/img/0.34.0/visual-library-2.png (new file, 24 KiB)
BIN  docs/img/0.34.0/visual-library-20.png (new file, 15 KiB)
BIN  docs/img/0.34.0/visual-library-21.png (new file, 5.9 KiB)
BIN  docs/img/0.34.0/visual-library-22.png (new file, 91 KiB)
BIN  docs/img/0.34.0/visual-library-23.png (new file, 39 KiB)
BIN  docs/img/0.34.0/visual-library-24.png (new file, 2.5 KiB)
BIN  docs/img/0.34.0/visual-library-3.png (new file, 24 KiB)
BIN  docs/img/0.34.0/visual-library-4.png (new file, 23 KiB)
BIN  docs/img/0.34.0/visual-library-5.png (new file, 1.9 MiB)
BIN  docs/img/0.34.0/visual-library-6.png (new file, 16 KiB)
BIN  docs/img/0.34.0/visual-library-7.png (new file, 22 KiB)
BIN  docs/img/0.34.0/visual-library-8.png (new file, 1.0 MiB)
BIN  docs/img/0.34.0/visual-library-9.png (new file, 32 KiB)
BIN  docs/img/0.34.0/visual-library-backend-status-1.png (new file, 8.0 KiB)
BIN  docs/img/0.34.0/visual-library-backend-status-2.png (new file, 4.6 KiB)
@@ -1,42 +0,0 @@
# AUTOMATIC1111

!!! info
    This requires you to set up a local instance of the AUTOMATIC1111 API. Follow the instructions from [their GitHub](https://github.com/AUTOMATIC1111/stable-diffusion-webui) to get it running.

Once you have it running, you will want to adjust the `webui-user.bat` in the AUTOMATIC1111 directory to include the following command arguments:

```bat
set COMMANDLINE_ARGS=--api --listen --port 7861
```

Then run the `webui-user.bat` to start the API.

Once your AUTOMATIC1111 API is running (check with your browser) you can set the Visualizer config to use the `AUTOMATIC1111` backend.

## Settings

![automatic1111 settings](/talemate/img/0.26.0/automatic1111-settings.png)

##### API URL

The URL of the API, if following this example, should be `http://localhost:7861`.

##### Steps

The number of steps to use for image generation. More steps will result in higher quality images but will take longer to generate.

##### Sampling Method

Which sampling method to use for image generation.

##### Schedule Type

Which scheduler to use for image generation.

##### CFG Scale

CFG scale for image generation.

##### Model type

Differentiates between `SD1.5` and `SDXL` models. This will dictate the resolution of the image generation and actually matters for the quality, so make sure this is set to the correct model type for the model you are using.
80  docs/user-guide/agents/visualizer/backends/a1111.md (new file)
@@ -0,0 +1,80 @@
# AUTOMATIC1111

!!! warning "Deprecated Backend"
    **AUTOMATIC1111 (A1111) is essentially dead at this point** - development has largely stopped and the project is no longer actively maintained. Support for AUTOMATIC1111 has only been carried forward in Talemate because it was easy to maintain compatibility.

    **We strongly recommend using [SD.Next](sdnext.md) instead**, which is an actively maintained fork of AUTOMATIC1111 with improved performance, better features, and ongoing development. SD.Next maintains API compatibility with AUTOMATIC1111, so migration is straightforward.

The AUTOMATIC1111 backend provides basic text-to-image generation capabilities using the AUTOMATIC1111 Stable Diffusion WebUI API. This backend only supports text-to-image generation - it does not support image editing or image analysis.

![visual agent a1111 backend selection](/talemate/img/0.34.0/visual-agent-a1111-1.png)

## Prerequisites

Before configuring the AUTOMATIC1111 backend, you need to have AUTOMATIC1111 installed and running:

1. Install and start AUTOMATIC1111 Stable Diffusion WebUI on your system
2. Ensure the API is enabled and accessible
3. Note the API URL (default is `http://localhost:7860`)

!!! note "Migration to SD.Next"
    If you're setting up a new installation, please use [SD.Next](sdnext.md) instead. If you have an existing AUTOMATIC1111 installation, consider migrating to SD.Next for better performance and ongoing support.

## Configuration

In the Visualizer agent settings, select AUTOMATIC1111 as your backend for text-to-image generation.

### Text-to-Image Configuration

For text-to-image generation, configure the following settings (a request sketch follows the screenshot below):

- **API URL**: The URL where your AUTOMATIC1111 instance is running (e.g., `http://localhost:7860`)
- **Steps**: Number of sampling steps (default: 40, range: 5-150)
- **Sampling Method**: The sampling algorithm to use (e.g., "DPM++ 2M", "Euler a")
- **Schedule Type**: The sampling schedule to use (e.g., "Automatic", "Karras", "Uniform")
- **CFG Scale**: Classifier-free guidance scale (default: 7.0, range: 1-30)
- **Prompt Type**: Choose between "Keywords" or "Descriptive" prompt formatting
- **Resolutions**: Configure the pixel dimensions for Square, Portrait, and Landscape formats

![visual agent a1111 text to image settings](/talemate/img/0.34.0/visual-agent-a1111-2.png)
!!! note "No Authentication"
    The AUTOMATIC1111 backend does not support authentication. If your AUTOMATIC1111 instance requires authentication, you'll need to either disable it or use SD.Next instead, which supports authentication.

!!! note "Model Selection"
    AUTOMATIC1111 does not support model selection through the API. The backend will use whatever model is currently loaded in your AUTOMATIC1111 instance. You need to change models manually in the AUTOMATIC1111 WebUI interface.

## Usage

Once configured, the AUTOMATIC1111 backend will appear in the Visualizer agent status with a green indicator showing text-to-image capability is available.

![visual agent a1111 status](/talemate/img/0.34.0/visual-agent-a1111-3.png)

## Limitations

The AUTOMATIC1111 backend has several limitations compared to SD.Next:

- **No image editing**: Only supports text-to-image generation
- **No authentication**: Cannot connect to instances that require authentication
- **No model selection**: Uses whatever model is loaded in AUTOMATIC1111
- **No active development**: The AUTOMATIC1111 project is no longer actively maintained

## Sampler Settings

AUTOMATIC1111 provides control over the generation process:

- **Steps**: More steps generally produce higher quality images but take longer. Typical values range from 20-50 steps, with 40 being a good default.
- **Sampling Method**: Different samplers produce different results. Popular options include:
    - **DPM++ 2M**: Fast and high quality (default)
    - **Euler a**: Fast, good for quick iterations
    - **DPM++ SDE**: Variant with different characteristics
- **Schedule Type**: Controls the noise schedule used during sampling. "Automatic" is typically the best choice.
- **CFG Scale**: Controls how closely the model follows your prompt. Lower values (1-7) allow more creative freedom, higher values (7-15) stick closer to the prompt.

## Prompt Formatting

AUTOMATIC1111 uses **Keywords** prompt formatting by default. This means prompts are formatted as keyword lists optimized for Stable Diffusion models. You can switch to **Descriptive** formatting if you prefer natural language descriptions, though Keywords typically work better with SD models.

## Automatic Setup with KoboldCpp

If you're using KoboldCpp with AUTOMATIC1111 support, Talemate can automatically detect and configure the AUTOMATIC1111 backend when "Automatic Setup" is enabled in the Visualizer settings. This will automatically set the API URL to match your KoboldCpp instance URL.
166  docs/user-guide/agents/visualizer/backends/comfyui.md (new file)
@@ -0,0 +1,166 @@
# ComfyUI

## Prepare ComfyUI

This document assumes you have installed ComfyUI (either the portable or the desktop version).

Copy the .bat file you use to start ComfyUI and add the `--port` parameter.

```
--port 8188
```

You can put any port you want, but this example will use 8188.

!!! note "If you are using a remote ComfyUI instance"
    If you are using a remote ComfyUI instance, you may want to add the `--listen` parameter as well.

    ```
    --listen 0.0.0.0
    ```

    You will then also need to obtain the IP address of the computer running ComfyUI and use it in the Talemate configuration (instead of localhost).

Confirm ComfyUI is running in your browser by visiting `http://localhost:8188` or `http://<ip-address>:8188` before proceeding to Talemate.

## Talemate configuration

In the Visualizer agent settings, select ComfyUI as your backend for text-to-image generation, image editing, or both. You'll need to configure each backend separately if you want to use ComfyUI for different operations.

![visual agent comfyui backend selection](/talemate/img/0.34.0/visual-agent-comfyui-1.png)

### Text-to-Image Configuration

For text-to-image generation, configure the following settings:

- **API URL**: The URL where your ComfyUI instance is running (e.g., `http://localhost:8188`)
- **Workflow**: Select the workflow file to use for generation. Talemate includes several pre-configured workflows including `qwen_image.json` and `z_image_turbo.json`
- **Model**: Select the model to use from your ComfyUI models directory. If your workflow doesn't include a "Talemate Load Model" or "Talemate Load Checkpoint" node, this will be set to "- Workflow default -" and the model specified in the workflow file will be used.
- **Prompt Type**: Choose between "Keywords" or "Descriptive" prompt formatting

!!! tip "Choosing Prompt Type"
    As a general rule: **SDXL models** typically work best with **Keywords** formatting, while most other models (including Qwen Image, Flux, etc.) work better with **Descriptive** formatting. If you're unsure, start with Descriptive and switch to Keywords if you're using an SDXL-based workflow.

- **Resolutions**: Configure the pixel dimensions for Square, Portrait, and Landscape formats

![visual agent comfyui text to image settings](/talemate/img/0.34.0/visual-agent-comfyui-2.png)
![visual agent comfyui text to image settings](/talemate/img/0.34.0/visual-agent-comfyui-3.png)

### Image Editing Configuration

For image editing, configure similar settings but select an image editing workflow such as `qwen_image_edit.json`. The number of reference images supported depends on your model - for example, Qwen Image Edit can handle up to 3 reference images that can be used to guide the editing process.

!!! note "Prompt Type for Image Editing"
    Image editing workflows typically use **Descriptive** prompt formatting by default, as most image editing models (like Qwen Image Edit) work better with descriptive instructions rather than keyword-based prompts.

![visual agent comfyui image edit settings](/talemate/img/0.34.0/visual-agent-comfyui-4.png)
![visual agent comfyui image edit settings](/talemate/img/0.34.0/visual-agent-comfyui-5.png)

## Custom workflow creation

Talemate comes with pre-configured workflows for Qwen Image models (`qwen_image.json` for text-to-image and `qwen_image_edit.json` for image editing). However, since there are many variables in ComfyUI setups (different model formats like GGUF vs safetensors, custom LoRAs, different hardware configurations, etc.), you may want to customize these workflows to match your specific setup.

### Starting from a Template

Open ComfyUI in your browser and navigate to the templates menu. ComfyUI includes workflow templates that you can use as a starting point:

- **Qwen Image**: For text-to-image generation
- **Qwen Image Edit**: For image editing workflows

These templates provide a good foundation for creating custom workflows.

![comfyui browse templates](/talemate/img/0.34.0/comfyui.workflow.setup.browse-templates.png)

![comfyui qwen template](/talemate/img/0.34.0/comfyui.workflow.setup.qwen-template.png)

Load the Qwen Image template to see the base workflow structure.

![comfyui qwen start](/talemate/img/0.34.0/comfyui.workflow.setup.qwen-start.png)

### Naming Nodes for Talemate

For Talemate to properly interact with your workflow, you need to rename specific nodes with exact titles. These titles allow Talemate to inject prompts, set resolutions, and handle reference images automatically.

**Required Node Titles:**

1. **Talemate Positive Prompt**: The node that encodes the positive prompt (typically a `CLIPTextEncode` or `TextEncodeQwenImageEditPlus` node). This is required - workflows without this node will fail validation.
2. **Talemate Negative Prompt**: The node that encodes the negative prompt (same node types as above)
3. **Talemate Resolution**: The node that sets the image dimensions (typically an `EmptySD3LatentImage` or similar latent image node)

**Optional Node Titles:**

- **Talemate Load Model** or **Talemate Load Checkpoint**: If you want to allow model selection from Talemate's settings, rename your model loader node (typically `CheckpointLoaderSimple`, `UNETLoader`, or `UnetLoaderGGUF`) to one of these titles. If this node is not present, Talemate will use the model specified in the workflow file itself, and the model dropdown will show "- Workflow default -" as the only option.

To rename a node, right-click on it and select "Rename" or double-click the node title, then enter the exact title name.
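For orientation, here is a hypothetical fragment of an API-format workflow export, shown as a Python dict, illustrating how these titles are carried in each node's `_meta.title` field and how a caller could locate a node by title to inject prompt text. The node IDs, inputs, and helper function are illustrative only; Talemate's actual injection logic may differ.

```python
# Hypothetical excerpt of an "Export (API)" ComfyUI workflow, as a Python dict.
# Only the _meta titles matter for Talemate; node IDs and wiring depend on your workflow.
workflow = {
    "6": {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": "", "clip": ["4", 1]},
        "_meta": {"title": "Talemate Positive Prompt"},  # required
    },
    "7": {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": "", "clip": ["4", 1]},
        "_meta": {"title": "Talemate Negative Prompt"},
    },
    "5": {
        "class_type": "EmptySD3LatentImage",
        "inputs": {"width": 1024, "height": 1024, "batch_size": 1},
        "_meta": {"title": "Talemate Resolution"},
    },
}


def set_prompt_by_title(wf: dict, title: str, text: str) -> None:
    """Illustrative helper: find the node carrying `title` and set its text input."""
    for node in wf.values():
        if node.get("_meta", {}).get("title") == title:
            node["inputs"]["text"] = text
            return
    raise ValueError(f"workflow has no node titled {title!r}")


set_prompt_by_title(workflow, "Talemate Positive Prompt", "a cozy tavern interior, warm light")
```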
![comfyui talemate prompts](/talemate/img/0.34.0/comfyui.workflow.setup.talemate-prompts.png)

![comfyui talemate resolution](/talemate/img/0.34.0/comfyui.workflow.setup.talemate-resulotion.png)

![comfyui lighting lora](/talemate/img/0.34.0/comfyui.workflow.setup.lighting-lora.png)

### Activating the Lightning LoRA (Optional)

The Qwen Image template includes a Lightning LoRA node that is deactivated by default. You can optionally activate it to speed up generation with fewer steps. Note that this is a trade-off: the Lightning LoRA reduces generation time but may degrade image quality compared to using more steps without the LoRA.

To activate the Lightning LoRA:

1. Find the `LoraLoaderModelOnly` node in your workflow (it should already be present in the Qwen template)
2. Connect it between your model loader and sampler if it's not already connected
3. Load the appropriate Lightning LoRA file (e.g., `Qwen-Image-Lightning-8steps-V1.0.safetensors` for 8-step generation)
4. Adjust your sampler settings:
    - **Steps**: Reduce to 8 steps (or 4 steps for the 4-step variant)
    - **CFG Scale**: Set to 1.0 (lower than typical values)

![comfyui lighting lora](/talemate/img/0.34.0/comfyui.workflow.setup.lighting-lora.png)

![comfyui agent config](/talemate/img/0.34.0/comfyui.workflow.setup.agent_config.png)

### Image Editing Workflows: Reference Nodes

For image editing workflows (like `qwen_image_edit.json`), you need to add reference image nodes. Note that ComfyUI includes a Qwen Image Edit template similar to the Qwen Image template, which you can use as a starting point.

!!! warning "Reference Nodes Required"
    Image editing workflows **must** define at least one reference node. If your workflow doesn't include any nodes titled "Talemate Reference 1" (or higher), the backend status will show an error and image editing will not work.

These are `LoadImage` nodes that Talemate will use to inject reference images for editing.

The number of reference nodes you can add depends on your model's capabilities. For example, Qwen Image Edit supports up to 3 reference images. Add `LoadImage` nodes and rename them with these exact titles:

- **Talemate Reference 1**
- **Talemate Reference 2**
- **Talemate Reference 3** (if your model supports it)

These nodes should be connected to your prompt encoding nodes (for Qwen Image Edit, use `TextEncodeQwenImageEditPlus` nodes that accept image inputs).

![comfyui talemate references](/talemate/img/0.34.0/comfyui.workflow.setup.talemate-references.png)

### Saving and Exporting the Workflow

Once your workflow is configured, you need to save it and export it in the API format for Talemate to use it.

1. **Save the workflow**: Use File → Save As to save your workflow as a `.json` file in your ComfyUI workflows directory
2. **Export for API**: Use File → Export (API) to create the API-compatible version

!!! warning "Export vs Export (API)"
    It's critical to use **"Export (API)"** and not just "Export". The regular export format is not compatible with Talemate's API integration. The API export format includes the necessary metadata and structure that Talemate expects.

![comfyui qwen save](/talemate/img/0.34.0/comfyui.workflow.setup.qwen-save.png)

![comfyui qwen export](/talemate/img/0.34.0/comfyui.workflow.setup.qwen-export.png)

After exporting, place the workflow JSON file in Talemate's `templates/comfyui-workflows` directory. Once placed there, it will automatically appear in the workflow dropdown in Talemate's ComfyUI settings.

!!! note "Workflow File Location"
    Workflow files must be placed in Talemate's `templates/comfyui-workflows` directory, not ComfyUI's workflows directory. Talemate loads workflows from its own templates directory to ensure compatibility and proper integration.

!!! tip "Workflow Not Appearing?"
    If your workflow file doesn't appear in the agent's settings dropdown after placing it in the correct directory, try reloading the Talemate browser window. The workflow list is refreshed when the page loads.

!!! info "Hot-Reloading Workflows"
    Changes to workflow files are automatically detected and reloaded by the agent. After modifying a workflow file, your changes will be applied to the next image generation without needing to restart Talemate or reload the browser window.
101  docs/user-guide/agents/visualizer/backends/google.md (new file)
@@ -0,0 +1,101 @@
# Google

The Google backend provides image generation, editing, and analysis capabilities using Google's Gemini image models. It supports text-to-image generation, image editing with reference images, and AI-powered image analysis.

![visual agent google backend selection](/talemate/img/0.34.0/visual-agent-google-4.png)

## Prerequisites

Before configuring the Google backend, you need to obtain a Google API key:

1. Go to [Google AI Studio](https://aistudio.google.com/app/apikey)
2. Sign in with your Google account
3. Create a new API key or use an existing one
4. Copy the API key

Then configure it in Talemate:

1. Open Talemate Settings → Application → Google
2. Paste your Google API key in the "Google API Key" field
3. Save your changes

!!! note "API Key vs Vertex AI Credentials"
    The Visualizer agent uses the Google API key (not Vertex AI service account credentials). Make sure you're using the API key from Google AI Studio, not the service account JSON file used for Vertex AI.

## Configuration

In the Visualizer agent settings, select Google as your backend for text-to-image generation, image editing, image analysis, or any combination of these. Each operation can be configured separately.

### Text-to-Image Configuration

For text-to-image generation, configure the following settings:

- **Google API Key**: Your Google API key (configured globally in Talemate Settings)
- **Model**: Select the image generation model to use:
    - **gemini-2.5-flash-image**: Faster generation, good quality
    - **gemini-3-pro-image-preview**: Higher quality, slower generation

![visual agent google text to image settings](/talemate/img/0.34.0/visual-agent-google-5.png)

The Google backend automatically handles aspect ratios based on the format you select (see the sketch after this list):

- **Landscape**: 16:9 aspect ratio
- **Portrait**: 9:16 aspect ratio
- **Square**: 1:1 aspect ratio
### Image Editing Configuration

For image editing, configure similar settings but with an additional option:

- **Google API Key**: Your Google API key
- **Model**: Select the image generation model (same options as text-to-image)
- **Max References**: Configure the maximum number of reference images (1-3). This determines how many reference images you can provide when editing an image.

![visual agent google image edit settings](/talemate/img/0.34.0/visual-agent-google-6.png)

!!! note "Reference Images"
    Google's image editing models can use up to 3 reference images to guide the editing process. The "Max References" setting controls how many reference images Talemate will send to the API. You can adjust this based on your needs, but keep in mind that more references may provide better context for complex edits.

### Image Analysis Configuration

For image analysis, configure the following:

- **Google API Key**: Your Google API key
- **Model**: Select a vision-capable text model:
    - **gemini-2.5-flash**: Fast analysis, good for general use
    - **gemini-2.5-pro**: Higher quality analysis
    - **gemini-3-pro-preview**: Latest model with improved capabilities

!!! note "Analysis Models"
    Image analysis uses text models that support vision capabilities, not the image generation models. These models can analyze images and provide detailed descriptions, answer questions about image content, and extract information from visual content.

## Usage

Once configured, the Google backend will appear in the Visualizer agent status with green indicators showing which capabilities are available.

![visual agent google status](/talemate/img/0.34.0/visual-agent-google-8.png)

The status indicators show:

- **Text to Image**: Available when text-to-image backend is configured
- **Image Edit**: Available when image editing backend is configured (shows max references if configured)
- **Image Analysis**: Available when image analysis backend is configured

## Model Recommendations

### Text-to-Image and Image Editing

- **gemini-2.5-flash-image**: Best for faster generation and general use. Good balance of speed and quality.
- **gemini-3-pro-image-preview**: Best for higher quality results when speed is less important. Use when you need the best possible image quality.

### Image Analysis

- **gemini-2.5-flash**: Best for quick analysis and general use cases. Fast responses with good accuracy.
- **gemini-2.5-pro**: Best for detailed analysis requiring higher accuracy and more nuanced understanding.
- **gemini-3-pro-preview**: Best for the latest capabilities and most advanced analysis features.

## Prompt Formatting

The Google backend uses **Descriptive** prompt formatting by default. This means prompts are formatted as natural language descriptions rather than keyword lists. This works well with Google's Gemini models, which are designed to understand natural language instructions.

When generating images, provide detailed descriptions of what you want to create. For image editing, describe the changes you want to make in natural language.
121  docs/user-guide/agents/visualizer/backends/openai.md (new file)
@@ -0,0 +1,121 @@
# OpenAI

The OpenAI backend provides image generation, editing, and analysis capabilities using OpenAI's image models. It supports text-to-image generation with DALL·E 3 and GPT-Image models, image editing with GPT-Image models, and AI-powered image analysis using vision-capable GPT models.

![visual agent openai backend selection](/talemate/img/0.34.0/visual-agent-openai-1.png)

## Prerequisites

Before configuring the OpenAI backend, you need to obtain an OpenAI API key:

1. Go to [OpenAI Platform](https://platform.openai.com/api-keys)
2. Sign in with your OpenAI account
3. Create a new API key or use an existing one
4. Copy the API key

Then configure it in Talemate:

1. Open Talemate Settings → Application → OpenAI API
2. Paste your OpenAI API key in the "OpenAI API Key" field
3. Save your changes

For additional instructions, see the [OpenAI API setup guide](/talemate/user-guide/apis/openai/).

## Configuration

In the Visualizer agent settings, select OpenAI as your backend for text-to-image generation, image editing, image analysis, or any combination of these. Each operation can be configured separately.

### Text-to-Image Configuration

For text-to-image generation, configure the following settings:

- **OpenAI API Key**: Your OpenAI API key (configured globally in Talemate Settings)
- **Model**: Select the image generation model to use:
    - **dall-e-3**: OpenAI's DALL·E 3 model (widely available)
    - **gpt-image-1**: OpenAI's GPT-Image model (may require organization verification)
    - **gpt-image-1-mini**: Smaller version of GPT-Image (may require organization verification)

![visual agent openai text to image settings](/talemate/img/0.34.0/visual-agent-openai-2.png)

!!! warning "Organization Verification"
    The **gpt-image-1** and **gpt-image-1-mini** models may require your OpenAI organization to be verified before you can use them. If you encounter errors with these models, you may need to complete OpenAI's organization verification process.

!!! note "Model Testing Status"
    Talemate's organization is not verified with OpenAI, and we have not tested the **gpt-image-1** and **gpt-image-1-mini** models. We have confirmed that **dall-e-3** works correctly. If you have access to the GPT-Image models and encounter issues, please report them so we can improve support for these models.

The OpenAI backend automatically sets resolution based on the format and model you select (see the sketch after this list):

- **gpt-image-1** and **gpt-image-1-mini**:
    - Landscape: 1536x1024
    - Portrait: 1024x1536
    - Square: 1024x1024

- **dall-e-3**:
    - Landscape: 1792x1024
    - Portrait: 1024x1792
    - Square: 1024x1024
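As a rough illustration, this is what a DALL·E 3 text-to-image request looks like with the official `openai` Python package. It sketches the underlying API rather than Talemate's code; the prompt and output path are placeholders.

```python
# Minimal DALL-E 3 generation sketch using the official openai package.
# The key is the one configured under Talemate Settings -> Application -> OpenAI API.
import base64
from openai import OpenAI

client = OpenAI(api_key="YOUR_OPENAI_API_KEY")

result = client.images.generate(
    model="dall-e-3",
    prompt="A foggy harbor town at dawn, painterly style",
    size="1792x1024",            # landscape resolution from the table above
    response_format="b64_json",  # dall-e-3 can also return URLs instead
    n=1,
)

with open("generated.png", "wb") as fh:
    fh.write(base64.b64decode(result.data[0].b64_json))
```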
### Image Editing Configuration

For image editing, configure similar settings but note that DALL·E 3 does not support image editing:

- **OpenAI API Key**: Your OpenAI API key
- **Model**: Select an image editing model:
    - **gpt-image-1**: Full-featured image editing model (may require organization verification)
    - **gpt-image-1-mini**: Smaller image editing model (may require organization verification)

![visual agent openai image edit settings](/talemate/img/0.34.0/visual-agent-openai-3.png)

!!! warning "DALL·E 3 Limitations"
    DALL·E 3 does not support image editing. If you select DALL·E 3 for image editing, you will receive an error. Use **gpt-image-1** or **gpt-image-1-mini** for image editing instead.

!!! note "Reference Images"
    OpenAI's image editing models support a single reference image. When editing an image, provide one reference image that will be used as the base for the edit.

### Image Analysis Configuration

For image analysis, configure the following:

- **OpenAI API Key**: Your OpenAI API key
- **Model**: Select a vision-capable text model:
    - **gpt-4.1-mini**: Fast analysis model with vision capabilities
    - **gpt-4o-mini**: Alternative vision model option

![visual agent openai image analysis settings](/talemate/img/0.34.0/visual-agent-openai-4.png)

!!! note "Analysis Models"
    Image analysis uses text models that support vision capabilities, not the image generation models. These models can analyze images and provide detailed descriptions, answer questions about image content, and extract information from visual content.

## Usage

Once configured, the OpenAI backend will appear in the Visualizer agent status with green indicators showing which capabilities are available.

![visual agent openai status](/talemate/img/0.34.0/visual-agent-openai-5.png)

The status indicators show:

- **Text to Image**: Available when text-to-image backend is configured
- **Image Edit**: Available when image editing backend is configured (shows "References 1" indicating single reference support)
- **Image Analysis**: Available when image analysis backend is configured

## Model Recommendations

### Text-to-Image

- **dall-e-3**: Most widely available option. Good for general use, though quality may vary.
- **gpt-image-1**: Higher quality option, but requires organization verification. Use if you have access and need better results.
- **gpt-image-1-mini**: Smaller version of GPT-Image, faster generation. Requires organization verification.

### Image Editing

- **gpt-image-1**: Best quality for image editing. Requires organization verification.
- **gpt-image-1-mini**: Faster editing option. Requires organization verification.

### Image Analysis

- **gpt-4.1-mini**: Recommended default for image analysis. Fast and accurate.
- **gpt-4o-mini**: Alternative option if you prefer this model.

## Prompt Formatting

The OpenAI backend uses **Descriptive** prompt formatting by default. This means prompts are formatted as natural language descriptions rather than keyword lists. Provide detailed, natural language descriptions of what you want to create or edit.
119  docs/user-guide/agents/visualizer/backends/openrouter.md (new file)
@@ -0,0 +1,119 @@
# OpenRouter

The OpenRouter backend provides access to image generation, editing, and analysis capabilities through OpenRouter's unified API. OpenRouter allows you to access multiple AI providers through a single API, giving you flexibility to choose from various models and providers.

![visual agent openrouter backend selection](/talemate/img/0.34.0/visual-agent-openrouter-1.png)

## Prerequisites

Before configuring the OpenRouter backend, you need to obtain an OpenRouter API key:

1. Go to [OpenRouter Keys](https://openrouter.ai/settings/keys)
2. Sign in with your account
3. Create a new API key or use an existing one
4. Copy the API key

Then configure it in Talemate:

1. Open Talemate Settings → Application → OpenRouter API
2. Paste your OpenRouter API key in the "OpenRouter API Key" field
3. Save your changes

For additional instructions, see the [OpenRouter API setup guide](/talemate/user-guide/apis/openrouter/).

## Configuration

In the Visualizer agent settings, select OpenRouter as your backend for text-to-image generation, image editing, image analysis, or any combination of these. Each operation can be configured separately.

### Text-to-Image Configuration

For text-to-image generation, configure the following settings:

- **OpenRouter API Key**: Your OpenRouter API key (configured globally in Talemate Settings)
- **Model**: Select an image generation model from OpenRouter. The model list is dynamically populated based on models available through your OpenRouter account.
- **Only use these providers**: Optionally filter to specific providers (e.g., only use Google or OpenAI)
- **Ignore these providers**: Optionally exclude specific providers from consideration

![visual agent openrouter text to image settings](/talemate/img/0.34.0/visual-agent-openrouter-2.png)

!!! warning "Model Selection"
    There is no reliable way for Talemate to determine which models support text-to-image generation, so the model list is unfiltered. Please consult the [OpenRouter documentation](https://openrouter.ai/docs) to verify that your selected model supports image generation before using it.

The OpenRouter backend automatically handles aspect ratios based on the format you select:

- **Landscape**: 16:9 aspect ratio
- **Portrait**: 9:16 aspect ratio
- **Square**: 1:1 aspect ratio

### Image Editing Configuration

For image editing, configure similar settings with an additional option:

- **OpenRouter API Key**: Your OpenRouter API key
- **Model**: Select an image editing model from OpenRouter
- **Max References**: Configure the maximum number of reference images (1-3). This determines how many reference images you can provide when editing an image.
- **Provider filtering**: Optionally filter providers (same as text-to-image)

![visual agent openrouter image edit settings](/talemate/img/0.34.0/visual-agent-openrouter-3.png)

!!! warning "Model Selection"
    There is no reliable way for Talemate to determine which models support image editing, so the model list is unfiltered. Image editing refers to image generation with support for 1 or more contextual reference images. Please consult the [OpenRouter documentation](https://openrouter.ai/docs) to verify that your selected model supports image editing before using it.

### Image Analysis Configuration

For image analysis, configure the following:

- **OpenRouter API Key**: Your OpenRouter API key
- **Model**: Select a vision-capable text model from OpenRouter
- **Provider filtering**: Optionally filter providers

![visual agent openrouter image analysis settings](/talemate/img/0.34.0/visual-agent-openrouter-4.png)

!!! warning "Model Selection"
    There is no reliable way for Talemate to determine which models support image analysis, so the model list is unfiltered. Image analysis requires a text generation model that is multi-modal and supports vision capabilities. Please consult the [OpenRouter documentation](https://openrouter.ai/docs) to verify that your selected model supports vision before using it.
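For orientation, a vision request against OpenRouter's OpenAI-compatible chat completions endpoint looks roughly like the sketch below: a chat message that includes an image part alongside the text prompt. The model name, prompt, and file are placeholders, and Talemate's internal request construction may differ.

```python
# Minimal vision (image analysis) request against OpenRouter's chat completions endpoint.
# Assumes the `requests` package, an image on disk, and a vision-capable model.
import base64
import requests

API_KEY = "YOUR_OPENROUTER_API_KEY"

with open("scene.png", "rb") as fh:
    image_b64 = base64.b64encode(fh.read()).decode()

resp = requests.post(
    "https://openrouter.ai/api/v1/chat/completions",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={
        "model": "google/gemini-2.5-flash",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image in two sentences."},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                ],
            }
        ],
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```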
## Usage

Once configured, the OpenRouter backend will appear in the Visualizer agent status with green indicators showing which capabilities are available.

![visual agent openrouter status](/talemate/img/0.34.0/visual-agent-openrouter-5.png)

The status indicators show:

- **Text to Image**: Available when text-to-image backend is configured
- **Image Edit**: Available when image editing backend is configured (shows max references if configured)
- **Image Analysis**: Available when image analysis backend is configured

## Model Recommendations

OpenRouter provides access to many models from different providers. Here are some general recommendations:

### Text-to-Image and Image Editing

- **google/gemini-2.5-flash-image**: Fast image generation with good quality
- **google/gemini-3-pro-image-preview**: Higher quality option (if available)

### Image Analysis

- **google/gemini-2.5-flash**: Fast analysis with good accuracy
- **google/gemini-2.5-pro**: Higher quality analysis
- **google/gemini-3-pro-preview**: Latest capabilities (if available)

## Provider Filtering

OpenRouter allows you to filter which providers are used for a specific model. This can be useful if:

- You want to use a specific provider for cost or quality reasons
- You want to avoid certain providers
- You want to test different providers for the same model

You can configure provider filtering in each backend's settings:

- **Only use these providers**: Limits requests to only the selected providers
- **Ignore these providers**: Excludes the selected providers from consideration

If both are configured, "Only use these providers" takes precedence.

## Prompt Formatting

The OpenRouter backend uses **Descriptive** prompt formatting by default. This means prompts are formatted as natural language descriptions rather than keyword lists. Provide detailed, natural language descriptions of what you want to create or edit.
104  docs/user-guide/agents/visualizer/backends/sdnext.md (new file)
@@ -0,0 +1,104 @@
# SD.Next

The SD.Next backend provides image generation and editing capabilities using Stable Diffusion Next (SD.Next), a fork of AUTOMATIC1111's Stable Diffusion WebUI. SD.Next offers improved performance and additional features while maintaining compatibility with the AUTOMATIC1111 API.

![visual agent sdnext backend selection](/talemate/img/0.34.0/visual-agent-sdnext-1.png)

## Prerequisites

Before configuring the SD.Next backend, you need to have SD.Next installed and running. SD.Next can be run locally or accessed remotely via its API.

1. Install and start SD.Next on your system
2. Ensure the API is enabled and accessible
3. Note the API URL (default is `http://localhost:7860`)

## Configuration

In the Visualizer agent settings, select SD.Next as your backend for text-to-image generation, image editing, or both. You'll need to configure each backend separately if you want to use SD.Next for different operations.

### Text-to-Image Configuration

For text-to-image generation, configure the following settings (an authentication sketch follows the screenshots below):

- **API URL**: The URL where your SD.Next instance is running (e.g., `http://localhost:7860`)
- **Authentication Method**: Choose the authentication method:
    - **None**: No authentication required
    - **Basic (username/password)**: Use username and password authentication
    - **Bearer (API Key)**: Use API key authentication

!!! note "ArliAI SD.Next Endpoints"
    If you're connecting to ArliAI's SD.Next endpoints, you should use the **Bearer (API Key)** authentication method. Configure your API key in the authentication settings.

- **Username/Password** (if using Basic auth): Your SD.Next credentials
- **API Key** (if using Bearer auth): Your API key for SD.Next
- **Steps**: Number of sampling steps (default: 40, range: 5-150)
- **Sampling Method**: The sampling algorithm to use (dynamically populated from your SD.Next instance)
- **CFG Scale**: Classifier-free guidance scale (default: 7.0, range: 1-30)
- **Model**: Select the model to use from your SD.Next models directory (dynamically populated)
- **Prompt Type**: Choose between "Keywords" or "Descriptive" prompt formatting
- **Resolutions**: Configure the pixel dimensions for Square, Portrait, and Landscape formats

![visual agent sdnext text to image settings](/talemate/img/0.34.0/visual-agent-sdnext-2.png)

![visual agent sdnext text to image settings](/talemate/img/0.34.0/visual-agent-sdnext-3.png)

![visual agent sdnext text to image settings](/talemate/img/0.34.0/visual-agent-sdnext-4.png)
### Image Editing Configuration

For image editing, configure similar settings. SD.Next supports image editing through its img2img API, which uses a single reference image.

![visual agent sdnext image edit settings](/talemate/img/0.34.0/visual-agent-sdnext-5.png)

!!! note "Reference Images"
    SD.Next image editing supports a single reference image. When editing an image, provide one reference image that will be used as the base for the edit.

## Usage

Once configured, the SD.Next backend will appear in the Visualizer agent status with green indicators showing which capabilities are available.

![visual agent sdnext status](/talemate/img/0.34.0/visual-agent-sdnext-6.png)

The status indicators show:

- **Text to Image**: Available when text-to-image backend is configured
- **Image Edit**: Available when image editing backend is configured (shows "References 1" indicating single reference support)

## Model and Sampler Selection

SD.Next dynamically fetches the list of available models and samplers from your instance when you configure the backend. This means:

- **Models**: The model dropdown is automatically populated with models available in your SD.Next installation
- **Samplers**: The sampling method dropdown is automatically populated with samplers available in your SD.Next instance

If you change the API URL or authentication settings, Talemate will automatically refresh the model and sampler lists from the new instance.

!!! tip "Model Selection"
    If you don't select a specific model, SD.Next will use its default model. You can select "- Default Model -" from the dropdown to explicitly use the default, or leave the field empty.

## Sampler Settings

SD.Next provides extensive control over the generation process:

- **Steps**: More steps generally produce higher quality images but take longer. Typical values range from 20-50 steps, with 40 being a good default.
- **Sampling Method**: Different samplers produce different results. Popular options include:
    - **DPM++ 2M**: Fast and high quality (default)
    - **Euler a**: Fast, good for quick iterations
    - **DPM++ 2M Karras**: Variant with different characteristics
- **CFG Scale**: Controls how closely the model follows your prompt. Lower values (1-7) allow more creative freedom, higher values (7-15) stick closer to the prompt.

## Prompt Formatting

SD.Next uses **Keywords** prompt formatting by default. This means prompts are formatted as keyword lists optimized for Stable Diffusion models. You can switch to **Descriptive** formatting if you prefer natural language descriptions, though Keywords typically work better with SD models.

## Remote Access

If you're running SD.Next on a remote server:

1. Configure SD.Next to listen on the appropriate network interface
2. Use the server's IP address or hostname in the API URL (e.g., `http://192.168.1.100:7860`)
3. Configure appropriate authentication if your SD.Next instance requires it
4. Ensure your firewall allows connections to the SD.Next port

!!! warning "Security Considerations"
    If exposing SD.Next over a network, always use authentication. Unauthenticated SD.Next instances can be accessed by anyone on your network, which may pose security risks.