Image Gen
Scanned
npx machina-cli add skill samarth777/modal-skills/image-gen --openclaw
Image Generation Service Example
A complete example of deploying a Stable Diffusion image generation service.
import modal
import io
import base64
# --- Configuration ---
# Hugging Face repo id of the pipeline. It is passed to `from_pretrained` when
# downloading and is also the sub-path under MODEL_PATH where the weights are saved.
MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0"
# GPU type handed to the ImageGenerator class through its `gpu=` option.
GPU_TYPE = "A10"
# --- Image Definition ---
# Container image shared by every function in the app (passed as the app default).
image = (
    # torch==2.1.0 ships no wheels for Python 3.12, so the build uses 3.11.
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "torch==2.1.0",
        "diffusers==0.25.0",
        # NOTE(review): transformers/accelerate/safetensors are still unpinned;
        # consider pinning them to releases contemporary with diffusers 0.25.0.
        "transformers",
        "accelerate",
        "safetensors",
        # diffusers 0.25.0 imports `cached_download`, which huggingface_hub
        # removed in 0.26, so an unpinned install breaks at import time.
        "huggingface_hub[hf_transfer]<0.26",
        # torch 2.1 wheels are built against NumPy 1.x.
        "numpy<2",
        "Pillow",
        # Needed by the `modal.fastapi_endpoint` functions; Modal does not
        # install FastAPI into the image automatically.
        "fastapi[standard]",
    )
    # Turns on the hf_transfer download backend installed via the extra above.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)
app = modal.App("image-generation", image=image)
# --- Model Cache ---
# Named volume that stores the saved pipeline; created if it does not exist yet.
model_volume = modal.Volume.from_name("sdxl-cache", create_if_missing=True)
# Path at which the volume is mounted in functions that attach it.
MODEL_PATH = "/models"
# --- Download Model ---
@app.function(
    volumes={MODEL_PATH: model_volume},
    secrets=[modal.Secret.from_name("huggingface-secret")],
    timeout=3600,
    cpu=4,
    memory=16384,
)
def download_model():
    """Download the fp16 pipeline for MODEL_ID and save it onto the cache volume.

    The Hugging Face token is read from the ``HF_TOKEN`` environment variable
    (``None`` when it is not set). The pipeline is written to
    ``MODEL_PATH/MODEL_ID`` and the volume is committed afterwards.
    """
    import os

    import torch
    from diffusers import DiffusionPipeline

    hf_token = os.environ.get("HF_TOKEN")
    target_dir = f"{MODEL_PATH}/{MODEL_ID}"

    pipeline = DiffusionPipeline.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        variant="fp16",
        use_safetensors=True,
        token=hf_token,
    )
    pipeline.save_pretrained(target_dir)

    # Persist the files written above to the volume.
    model_volume.commit()
# --- Generation Service ---
@app.cls(
    gpu=GPU_TYPE,
    volumes={MODEL_PATH: model_volume},
    # `scaledown_window` is the current name of the deprecated
    # `container_idle_timeout` option (same meaning, in seconds).
    scaledown_window=120,
)
class ImageGenerator:
    """GPU-backed class that loads the saved pipeline once and renders PNGs."""

    @modal.enter()
    def load_pipeline(self):
        """Load the fp16 pipeline from the cache volume into ``self.pipe``."""
        import torch
        from diffusers import DiffusionPipeline

        self.pipe = DiffusionPipeline.from_pretrained(
            f"{MODEL_PATH}/{MODEL_ID}",
            torch_dtype=torch.float16,
            variant="fp16",
        )
        # Model CPU offload handles device placement itself. Moving the whole
        # pipeline to CUDA first is redundant: offload sends the weights back
        # to the CPU before installing its hooks.
        self.pipe.enable_model_cpu_offload()

    @modal.method()
    def generate(
        self,
        prompt: str,
        negative_prompt: str = "",
        width: int = 1024,
        height: int = 1024,
        num_inference_steps: int = 30,
        guidance_scale: float = 7.5,
        seed: int | None = None,
    ) -> bytes:
        """Run the pipeline for one prompt and return the first image as PNG bytes.

        Args:
            prompt: Text prompt forwarded to the pipeline.
            negative_prompt: Negative prompt forwarded to the pipeline.
            width: Requested image width, forwarded unchanged.
            height: Requested image height, forwarded unchanged.
            num_inference_steps: Step count, forwarded unchanged.
            guidance_scale: Guidance scale, forwarded unchanged.
            seed: When given, seeds a CUDA ``torch.Generator`` for the run;
                when ``None`` no generator is passed.

        Returns:
            The PNG-encoded bytes of the first image the pipeline produced.
        """
        import torch

        rng = None
        if seed is not None:
            rng = torch.Generator(device="cuda").manual_seed(seed)

        # Named `rendered` so it does not shadow the module-level `image`.
        rendered = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            generator=rng,
        ).images[0]

        # Serialize in memory; callers receive raw PNG bytes.
        buffer = io.BytesIO()
        rendered.save(buffer, format="PNG")
        return buffer.getvalue()
# --- Web API ---
@app.function()
@modal.fastapi_endpoint(method="POST", docs=True)
def generate_image(body: dict) -> dict:
    """POST endpoint: generate an image from a JSON body and return it as base64.

    Reads ``prompt`` (required) plus optional ``negative_prompt``, ``width``,
    ``height``, ``steps``, ``guidance_scale`` and ``seed`` from ``body``.

    Returns:
        ``{"image": <base64-encoded PNG>}``.

    Raises:
        HTTPException: 400 when ``prompt`` is missing or not a string, instead
            of the unhandled ``KeyError`` (HTTP 500) a bare lookup would give.
    """
    from fastapi import HTTPException

    prompt = body.get("prompt")
    if not isinstance(prompt, str):
        raise HTTPException(
            status_code=400,
            detail="Request body must include a string 'prompt' field.",
        )

    generator = ImageGenerator()
    image_bytes = generator.generate.remote(
        prompt=prompt,
        negative_prompt=body.get("negative_prompt", ""),
        width=body.get("width", 1024),
        height=body.get("height", 1024),
        num_inference_steps=body.get("steps", 30),
        guidance_scale=body.get("guidance_scale", 7.5),
        seed=body.get("seed"),
    )

    # JSON cannot carry raw bytes, so the PNG is returned base64-encoded.
    image_b64 = base64.b64encode(image_bytes).decode()
    return {"image": image_b64}
# --- Direct Image Response ---
@app.function()
@modal.fastapi_endpoint(method="GET")
def generate_image_direct(
    prompt: str,
    width: int = 1024,
    height: int = 1024,
):
    """GET endpoint: generate one image and return it as an ``image/png`` response."""
    from fastapi.responses import Response

    png_bytes = ImageGenerator().generate.remote(
        prompt=prompt,
        width=width,
        height=height,
    )
    return Response(content=png_bytes, media_type="image/png")
# --- Batch Generation ---
@app.function(timeout=3600)
def generate_batch(prompts: list[str], output_dir: str = "/output"):
    """Render each prompt in turn and upload the PNGs to the "generated-images" volume.

    Files are named ``image_0000.png``, ``image_0001.png``, ... under
    ``output_dir``, one upload batch per image. Returns a summary string.
    """
    images_volume = modal.Volume.from_name("generated-images", create_if_missing=True)
    worker = ImageGenerator()

    for index, text in enumerate(prompts):
        png_bytes = worker.generate.remote(prompt=text)
        destination = f"{output_dir}/image_{index:04d}.png"
        with images_volume.batch_upload() as upload:
            upload.put_file(io.BytesIO(png_bytes), destination)

    return f"Generated {len(prompts)} images"
# --- CLI ---
@app.local_entrypoint()
def main(
    prompt: str = "A beautiful sunset over mountains, digital art",
    output: str = "output.png",
):
    """Local entrypoint: generate one image remotely and write it to ``output``."""
    print(f"Generating: {prompt}")

    png_bytes = ImageGenerator().generate.remote(prompt=prompt)
    with open(output, "wb") as out_file:
        out_file.write(png_bytes)

    print(f"Saved to {output}")
Usage
# Download model first
modal run image_gen.py::download_model
# Generate image via CLI
modal run image_gen.py --prompt "A cat astronaut" --output cat.png
# Deploy API
modal deploy image_gen.py
# Generate via API
curl "https://your-workspace--image-generation-generate-image-direct.modal.run?prompt=A%20cat%20astronaut" \
--output cat.png
Source
git clone https://github.com/samarth777/modal-skills/blob/main/skills/image-gen/SKILL.md
View on GitHub
Overview
This skill demonstrates a complete end-to-end Stable Diffusion image generation service deployed with Modal. It caches the pretrained model on a dedicated volume, loads an FP16 DiffusionPipeline on a GPU, and exposes API endpoints to generate PNG images from prompts (including base64 responses).
How This Skill Works
The solution builds a Modal app image with Debian Slim, installs the required libraries (torch, diffusers, transformers, etc.), and uses a persistent model cache volume named sdxl-cache. It downloads the pretrained model to the volume, then loads the pipeline on a GPU (CUDA), enabling optional model CPU offload for memory efficiency. A FastAPI-based API exposes endpoints to generate images from prompts and return either a base64 string or a direct PNG response.
When to Use It
- Expose an on-demand image generation API for a web or mobile app
- Prototype and deploy Stable Diffusion with model caching on Modal
- Generate batches of product or marketing visuals from prompts
- Provide both base64 and direct PNG image responses via API endpoints
- Demonstrate end-to-end deployment of a diffusion model with GPU acceleration
Quick Start
- Step 1: Set MODEL_ID to stabilityai/stable-diffusion-xl-base-1.0 and GPU_TYPE to A10; build the Modal image with Debian Slim and install required packages
- Step 2: Create a model cache volume (sdxl-cache), implement download_model to fetch the pretrained model, and run the app so the pipeline is loaded when the container starts
- Step 3: Deploy the app and call the API endpoints (POST /generate_image or GET) with a prompt to obtain a PNG or base64 image
Best Practices
- Use a model cache volume (sdxl-cache) to avoid re-downloading large models
- Pin exact package versions (e.g., torch, diffusers) and use FP16 for memory efficiency
- Enable memory optimizations like enable_model_cpu_offload to fit large models on GPU
- Protect access with secrets (huggingface-secret) and HF_TOKEN for model download
- Return images as PNG or base64 to support diverse client integrations
Example Use Cases
- Generate product images for an e-commerce catalog from textual prompts
- Create marketing banners and social visuals from simple descriptions
- Produce game concept art or prototype assets for rapid iterations
- Generate personalized avatars or stylized portraits for apps
- Automate batch artwork generation for design sprints and experiments