Get the FREE Ultimate OpenClaw Setup Guide →

Image Gen

Scanned
npx machina-cli add skill samarth777/modal-skills/image-gen --openclaw
Files (1)
SKILL.md
5.2 KB

Image Generation Service Example

A complete example of deploying a Stable Diffusion image generation service.

import modal
import io
import base64

# --- Configuration ---
# Hugging Face model repo to serve; also used as the cache subdirectory name.
MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0"
# Modal GPU type requested for the inference containers.
GPU_TYPE = "A10"

# --- Image Definition ---
# Container image: Debian slim base + pinned torch/diffusers stack.
# HF_HUB_ENABLE_HF_TRANSFER=1 enables the accelerated downloader shipped
# with huggingface_hub[hf_transfer] for faster weight fetches.
image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "torch==2.1.0",
        "diffusers==0.25.0",
        "transformers",
        "accelerate",
        "safetensors",
        "huggingface_hub[hf_transfer]",
        "Pillow",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)

app = modal.App("image-generation", image=image)

# --- Model Cache ---
# Persistent volume caching the downloaded weights across runs; mounted at
# MODEL_PATH inside every container that requests it.
model_volume = modal.Volume.from_name("sdxl-cache", create_if_missing=True)
MODEL_PATH = "/models"

# --- Download Model ---
@app.function(
    volumes={MODEL_PATH: model_volume},
    secrets=[modal.Secret.from_name("huggingface-secret")],
    timeout=3600,
    cpu=4,
    memory=16384,
)
def download_model(force: bool = False):
    """Fetch the pretrained SDXL pipeline and persist it to the cache volume.

    Idempotent: if the model already exists on the volume, the (multi-GB)
    download is skipped unless *force* is True.

    Args:
        force: Re-download and overwrite the cached copy even if present.
    """
    import os

    from diffusers import DiffusionPipeline
    import torch

    target = f"{MODEL_PATH}/{MODEL_ID}"
    # save_pretrained writes model_index.json for pipelines; its presence is
    # used as the "already cached" marker.
    if not force and os.path.isfile(os.path.join(target, "model_index.json")):
        print(f"Model already cached at {target}; skipping download.")
        return

    pipe = DiffusionPipeline.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        variant="fp16",  # fetch only the fp16 weight files
        use_safetensors=True,
        token=os.environ.get("HF_TOKEN"),  # injected by huggingface-secret
    )
    pipe.save_pretrained(target)
    # Persist the writes so other containers see the cached model.
    model_volume.commit()

# --- Generation Service ---
@app.cls(
    gpu=GPU_TYPE,
    volumes={MODEL_PATH: model_volume},
    # NOTE(review): container_idle_timeout was renamed in newer Modal
    # releases — confirm against the pinned modal client version.
    container_idle_timeout=120,
)
class ImageGenerator:
    """Serves SDXL text-to-image generation from the cached model weights."""

    @modal.enter()
    def load_pipeline(self):
        """Load the fp16 pipeline from the cache volume once per container."""
        from diffusers import DiffusionPipeline
        import torch

        self.pipe = DiffusionPipeline.from_pretrained(
            f"{MODEL_PATH}/{MODEL_ID}",
            torch_dtype=torch.float16,
            variant="fp16",
        )
        # Keep the whole pipeline resident on the GPU for fastest inference.
        # BUGFIX: the original also called enable_model_cpu_offload() after
        # .to("cuda"). The two are mutually exclusive — per the diffusers
        # memory-optimization docs, offloading manages device placement
        # itself and must not follow a manual .to("cuda"). If VRAM is ever
        # tight, call self.pipe.enable_model_cpu_offload() INSTEAD of
        # .to("cuda"), not in addition to it.
        self.pipe.to("cuda")

    @modal.method()
    def generate(
        self,
        prompt: str,
        negative_prompt: str = "",
        width: int = 1024,
        height: int = 1024,
        num_inference_steps: int = 30,
        guidance_scale: float = 7.5,
        seed: int | None = None,
    ) -> bytes:
        """Generate one image and return it as PNG-encoded bytes.

        Args:
            prompt: Text description of the desired image.
            negative_prompt: Concepts to steer the sampler away from.
            width: Output width in pixels.
            height: Output height in pixels.
            num_inference_steps: Denoising steps (more = slower, finer).
            guidance_scale: Classifier-free guidance strength.
            seed: Optional seed for reproducible output; None = random.
        """
        import torch

        # A seeded generator makes output reproducible for a given prompt.
        generator = None
        if seed is not None:
            generator = torch.Generator(device="cuda").manual_seed(seed)

        image = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            generator=generator,
        ).images[0]

        # Encode the PIL image to PNG bytes for transport over Modal.
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return buffer.getvalue()

# --- Web API ---
@app.function()
@modal.fastapi_endpoint(method="POST", docs=True)
def generate_image(body: dict) -> dict:
    """POST endpoint: generate an image from a JSON body, return base64 PNG."""
    # Pull optional parameters out of the request body with their defaults.
    defaults = {
        "negative_prompt": "",
        "width": 1024,
        "height": 1024,
        "steps": 30,
        "guidance_scale": 7.5,
        "seed": None,
    }
    params = {key: body.get(key, value) for key, value in defaults.items()}

    png_bytes = ImageGenerator().generate.remote(
        prompt=body["prompt"],
        negative_prompt=params["negative_prompt"],
        width=params["width"],
        height=params["height"],
        num_inference_steps=params["steps"],
        guidance_scale=params["guidance_scale"],
        seed=params["seed"],
    )

    # Base64-encode so the PNG can travel inside a JSON response.
    return {"image": base64.b64encode(png_bytes).decode()}

# --- Direct Image Response ---
@app.function()
@modal.fastapi_endpoint(method="GET")
def generate_image_direct(
    prompt: str,
    width: int = 1024,
    height: int = 1024,
):
    """GET endpoint: generate an image and return it as a raw PNG body."""
    from fastapi.responses import Response

    png_bytes = ImageGenerator().generate.remote(
        prompt=prompt,
        width=width,
        height=height,
    )

    # A raw image/png body lets browsers and curl consume the result directly.
    return Response(content=png_bytes, media_type="image/png")

# --- Batch Generation ---
@app.function(timeout=3600)
def generate_batch(prompts: list[str], output_dir: str = "/output"):
    """Generate one image per prompt and upload them all to a volume.

    Args:
        prompts: Text prompts; one image is generated per entry.
        output_dir: Directory prefix inside the 'generated-images' volume.

    Returns:
        A human-readable summary string.
    """
    volume = modal.Volume.from_name("generated-images", create_if_missing=True)

    generator = ImageGenerator()

    # BUGFIX: the original re-opened volume.batch_upload() inside the loop for
    # every single image, defeating the point of batching, and generated the
    # images strictly serially. Fan the prompts out with .map() (parallel
    # container calls) and stage all uploads in one batch context.
    with volume.batch_upload() as batch:
        for i, image_bytes in enumerate(generator.generate.map(prompts)):
            batch.put_file(
                io.BytesIO(image_bytes),
                f"{output_dir}/image_{i:04d}.png",
            )

    return f"Generated {len(prompts)} images"

# --- CLI ---
@app.local_entrypoint()
def main(
    prompt: str = "A beautiful sunset over mountains, digital art",
    output: str = "output.png",
):
    """CLI entrypoint: generate a single image and write it to a local file."""
    print(f"Generating: {prompt}")

    png_data = ImageGenerator().generate.remote(prompt=prompt)

    with open(output, "wb") as f:
        f.write(png_data)

    print(f"Saved to {output}")

Usage

# Download model first
modal run image_gen.py::download_model

# Generate image via CLI
modal run image_gen.py --prompt "A cat astronaut" --output cat.png

# Deploy API
modal deploy image_gen.py

# Generate via API
curl "https://your-workspace--image-generation-generate-image-direct.modal.run?prompt=A%20cat%20astronaut" \
  --output cat.png

Source

git clone https://github.com/samarth777/modal-skills

View on GitHub: https://github.com/samarth777/modal-skills/blob/main/skills/image-gen/SKILL.md

Overview

This skill demonstrates a complete end-to-end Stable Diffusion image generation service deployed with Modal. It caches the pretrained model on a dedicated volume, loads a FP16 DiffusionPipeline on a GPU, and exposes API endpoints to generate PNG images from prompts (including base64 responses).

How This Skill Works

The solution builds a Modal app image with Debian Slim, installs the required libraries (torch, diffusers, transformers, etc.), and uses a persistent model cache volume named sdxl-cache. It downloads the pretrained model to the volume, then loads the pipeline on a GPU (CUDA), enabling optional model CPU offload for memory efficiency. A FastAPI-based API exposes endpoints to generate images from prompts and return either a base64 string or a direct PNG response.

When to Use It

  • Expose an on-demand image generation API for a web or mobile app
  • Prototype and deploy Stable Diffusion with model caching on Modal
  • Generate batches of product or marketing visuals from prompts
  • Provide both base64 and direct PNG image responses via API endpoints
  • Demonstrate end-to-end deployment of a diffusion model with GPU acceleration

Quick Start

  1. Step 1: Set MODEL_ID to stabilityai/stable-diffusion-xl-base-1.0 and GPU_TYPE to A10; build the Modal image with Debian Slim and install required packages
  2. Step 2: Create a model cache volume (sdxl-cache), implement download_model to fetch the pretrained model to the volume, and load the pipeline when each container starts
  3. Step 3: Deploy the app and call the API endpoints (POST /generate_image or GET) with a prompt to obtain a PNG or base64 image

Best Practices

  • Use a model cache volume (sdxl-cache) to avoid re-downloading large models
  • Pin exact package versions (e.g., torch, diffusers) and use FP16 for memory efficiency
  • Enable memory optimizations like enable_model_cpu_offload to fit large models on GPU
  • Protect access with secrets (huggingface-secret) and HF_TOKEN for model download
  • Return images as PNG or base64 to support diverse client integrations

Example Use Cases

  • Generate product images for an e-commerce catalog from textual prompts
  • Create marketing banners and social visuals from simple descriptions
  • Produce game concept art or prototype assets for rapid iterations
  • Generate personalized avatars or stylized portraits for apps
  • Automate batch artwork generation for design sprints and experiments

Frequently Asked Questions

Add this skill to your agents
Sponsor this space

Reach thousands of developers