Skip to content

~~Qwen3 TTS~~ Deprecated

Clone the groxaxo Qwen TTS repository, then edit the Docker Compose file inside it as follows:

# Compose file format version.
# NOTE(review): recent Docker Compose releases reportedly treat this key as
# obsolete; confirm against the Compose version in use before removing it.
version: '3.8'

# Service definitions: one default GPU service, plus vLLM and CPU variants
# that are only started when their profile is selected.
services:
  # Default service: official backend running on a single NVIDIA GPU
  qwen3-tts-gpu:
    container_name: qwen3-tts-api
    build:
      context: .
      dockerfile: Dockerfile
      target: production
    restart: always
    # The API port is exposed to other containers only; nothing is published on the host
    expose:
      - "8880"
    networks:
      - litellm_network
    environment:
      HOST: "0.0.0.0"
      PORT: "8880"
      WORKERS: "1"
      CORS_ORIGINS: "*"
      TTS_BACKEND: "official"
      TTS_WARMUP_ON_START: "false"
      # TTS_MODEL_NAME: "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
      # With the device_ids filter below, the selected GPU shows up as device 0 inside the container
      CUDA_VISIBLE_DEVICES: "0"
      NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
    volumes:
      # Keep the Hugging Face model cache on the host so downloads persist
      - ~/.cache/huggingface:/home/appuser/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids:
                - "0"
              capabilities:
                - gpu
    healthcheck:
      # Probe the service's own /health endpoint from inside the container
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8880/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s

  # GPU-enabled service with vLLM-Omni backend (only started with the "vllm" profile)
  qwen3-tts-vllm:
    build:
      context: .
      dockerfile: Dockerfile.vllm
    container_name: qwen3-tts-api-vllm
    # Host networking: the API binds directly to port 8880 on the host (see PORT).
    # No "ports" mapping is declared, because Compose does not allow port
    # mappings together with network_mode: host.
    network_mode: host
    environment:
      - HOST=0.0.0.0
      - PORT=8880
      - WORKERS=1
      - CORS_ORIGINS=*
      - TTS_BACKEND=vllm_omni
      - TTS_WARMUP_ON_START=true
      - TTS_MODEL_NAME=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
      - VLLM_WORKER_MULTIPROC_METHOD=spawn
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      # Must match the container-side path of the cache volume below
      - HF_HOME=/root/.cache/huggingface
    volumes:
      # Mount model cache for persistence
      - ~/.cache/huggingface:/root/.cache/huggingface
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            # Reserves the host GPU with device ID 2 for this service
            - driver: nvidia
              device_ids: ['2']
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8880/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 180s
    profiles:
      - vllm

  # CPU-only service (only started with the "cpu" profile)
  qwen3-tts-cpu:
    build:
      context: .
      dockerfile: Dockerfile
      target: cpu-base
    container_name: qwen3-tts-api-cpu
    # Host networking: the API binds directly to port 8880 on the host (see PORT).
    # No "ports" mapping is declared, because Compose does not allow port
    # mappings together with network_mode: host.
    network_mode: host
    environment:
      - HOST=0.0.0.0
      - PORT=8880
      - WORKERS=1
      - CORS_ORIGINS=*
      - TTS_BACKEND=official
      - TTS_MODEL_NAME=Qwen/Qwen3-TTS-12Hz-1.7B-Base
    volumes:
      # Mount model cache for persistence
      - ~/.cache/huggingface:/home/appuser/.cache/huggingface
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8880/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 180s
    profiles:
      - cpu
# External network named litellm-stack_default; it must already exist,
# since Compose does not create networks marked as external.
networks:
  litellm_network:
    name: litellm-stack_default
    external: true

# Usage:
#   GPU version with official backend: docker-compose up qwen3-tts-gpu
#   GPU version with vLLM backend: docker-compose --profile vllm up qwen3-tts-vllm
#   CPU version: docker-compose --profile cpu up qwen3-tts-cpu

Add the following entry to the model_list section of the LiteLLM config file:

  # ---------------------------------------------
  # Qwen-TTS, reached through its OpenAI-style /v1 API
  # ---------------------------------------------
  - model_name: "qwen3-tts"
    litellm_params:
      api_base: "http://qwen3-tts-api:8880/v1"  # hostname is the TTS container name
      api_key: "none"
      model: "openai/qwen3-tts"
      openai_compatible: true

Now, in OWUI > Admin Panel > Settings > Audio, configure OWUI to use the Qwen TTS model exposed via LiteLLM for audio generation requests:

alt text

  1. Select OpenAI from the dropdown menu
  2. Insert LiteLLM's API base URL: https://tracking.mlmp.ti.bfh.ch/v1
  3. Insert API Key
  4. Insert the model's name (the same value as model_name in model_list, here qwen3-tts)
  5. Save

Finally, start the Qwen TTS container and restart OWUI and LiteLLM.