← 返回首页
🧠

Stable Diffusion

📂 llm ⏱ 3 min 428 words

---
title: "Stable Diffusion"
description: "掌握Stable Diffusion的潜在扩散模型原理、文本到图像生成以及图像到图像转换技术"
tags: ["Stable Diffusion", "潜在扩散", "文本到图像", "img2img"]
category: "llm"
icon: "🧠"
---

Stable Diffusion

Stable Diffusion简介

Stable Diffusion是由Stability AI于2022年发布的开源图像生成模型。它基于潜在扩散模型(Latent Diffusion Model, LDM)架构,能够在潜在空间中进行高效的去噪过程,从而生成高质量的图像。

与传统的像素级扩散模型相比,Stable Diffusion在压缩的潜在空间中操作,大幅降低了计算成本,使得在消费级GPU上运行成为可能。

核心架构

1. VAE(变分自编码器)

VAE负责将图像压缩到潜在空间,以及从潜在表示重建图像。

from diffusers import AutoencoderKL
import torch

class VAEDecoder:
    """Wrap a pretrained ``AutoencoderKL`` for latent encoding and decoding.

    Despite the name, both directions are exposed: ``encode`` maps images to
    scaled latents and ``decode`` maps scaled latents back to images.
    """

    # Fallback used when the loaded VAE config exposes no ``scaling_factor``.
    DEFAULT_SCALING_FACTOR = 0.18215

    def __init__(self, model_name="stabilityai/sd-vae-ft-mse"):
        """Load the pretrained VAE ``model_name`` and switch it to eval mode."""
        self.vae = AutoencoderKL.from_pretrained(model_name)
        self.vae.eval()

    @property
    def scaling_factor(self):
        """Return the latent scaling factor, preferring the VAE's own config."""
        config = getattr(self.vae, "config", None)
        return getattr(config, "scaling_factor", self.DEFAULT_SCALING_FACTOR)

    def encode(self, image):
        """Encode an image batch into scaled latents.

        Args:
            image: Tensor with values in [0, 1]; the original comment states
                a [B, C, H, W] layout.

        Returns:
            A latent sampled from the VAE posterior, multiplied by
            ``scaling_factor``.
        """
        with torch.no_grad():
            # The VAE is fed inputs in [-1, 1].
            image = 2 * image - 1
            posterior = self.vae.encode(image).latent_dist
            # Draw a sample from the posterior, then apply the latent scaling.
            z = posterior.sample() * self.scaling_factor
        return z

    def decode(self, z):
        """Decode scaled latents back into an image tensor in [0, 1].

        Args:
            z: Latents as produced by ``encode`` (already scaled).

        Returns:
            The decoded image, mapped from [-1, 1] to [0, 1] and clamped so
            decoder overshoot cannot leave that range.
        """
        with torch.no_grad():
            # Undo the latent scaling before running the decoder.
            decoded = self.vae.decode(z / self.scaling_factor).sample
            decoded = ((decoded + 1) / 2).clamp(0, 1)
        return decoded

    def reconstruct(self, image):
        """Round-trip ``image`` through ``encode`` and ``decode``."""
        return self.decode(self.encode(image))

2. U-Net去噪网络

U-Net是扩散模型的核心,负责预测噪声。

from diffusers import UNet2DConditionModel

class NoisePredictor:
    """Wrap a pretrained conditional U-Net for inference-time noise prediction."""

    def __init__(self, model_name="stabilityai/stable-diffusion-2-1"):
        """Load the ``unet`` subfolder of ``model_name`` and switch to eval mode."""
        self.unet = UNet2DConditionModel.from_pretrained(
            model_name, subfolder="unet"
        )
        self.unet.eval()

    @staticmethod
    def _duplicate_timestep(timestep):
        """Return a timestep suitable for the doubled (uncond + cond) batch."""
        # torch.cat rejects 0-dim tensors, which is what iterating a scheduler's
        # timesteps yields. Scalars and single-element tensors are passed
        # through unchanged; diffusers' U-Net is expected to broadcast them
        # over the batch.
        if not torch.is_tensor(timestep) or timestep.numel() == 1:
            return timestep
        return torch.cat([timestep, timestep])

    def predict_noise(self, latent, timestep, encoder_hidden_states):
        """Predict the noise for ``latent`` at ``timestep``.

        Args:
            latent: Noisy latent passed to the U-Net.
            timestep: Diffusion timestep passed to the U-Net.
            encoder_hidden_states: Conditioning embeddings.

        Returns:
            The ``sample`` field of the U-Net output, computed without
            gradient tracking.
        """
        with torch.no_grad():
            noise_pred = self.unet(
                latent,
                timestep,
                encoder_hidden_states=encoder_hidden_states
            ).sample
        return noise_pred

    def predict_noise_with_cfg(self, latent, timestep,
                               encoder_hidden_states_uncond,
                               encoder_hidden_states_cond,
                               guidance_scale=7.5):
        """Predict noise using classifier-free guidance (CFG).

        The unconditional and conditional branches are evaluated in a single
        batched forward pass and combined as
        ``uncond + guidance_scale * (cond - uncond)``.

        Args:
            latent: Noisy latent; duplicated along the batch dimension.
            timestep: Python scalar, 0-dim tensor, single-element tensor, or
                per-sample tensor.
            encoder_hidden_states_uncond: Embeddings of the empty/negative prompt.
            encoder_hidden_states_cond: Embeddings of the actual prompt.
            guidance_scale: Weight of the conditional direction.

        Returns:
            The guided noise prediction, computed without gradient tracking.
        """
        # Inference only: match predict_noise and avoid building an autograd graph.
        with torch.no_grad():
            # Stack the unconditional and conditional inputs into one batch.
            latent_input = torch.cat([latent, latent])
            timestep_input = self._duplicate_timestep(timestep)
            encoder_input = torch.cat([
                encoder_hidden_states_uncond,
                encoder_hidden_states_cond
            ])

            noise_pred = self.unet(
                latent_input,
                timestep_input,
                encoder_hidden_states=encoder_input
            ).sample

            # Split the two halves back apart and apply the guidance formula.
            noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (
                noise_pred_cond - noise_pred_uncond
            )

        return noise_pred

3. 文本编码器

文本编码器(CLIP)负责将文本提示转换为嵌入向量,作为U-Net去噪时的条件输入。

from transformers import CLIPTextModel, CLIPTokenizer

class TextEncoder:
    """Tokenize prompts and embed them with a pretrained CLIP text model."""

    def __init__(self, model_name="stabilityai/stable-diffusion-2-1"):
        """Load the tokenizer and text encoder subfolders of ``model_name``."""
        self.tokenizer = CLIPTokenizer.from_pretrained(
            model_name, subfolder="tokenizer"
        )
        self.text_encoder = CLIPTextModel.from_pretrained(
            model_name, subfolder="text_encoder"
        )
        self.text_encoder.eval()

    def _embed(self, text, max_length):
        """Tokenize ``text`` (a string or list of strings) and return embeddings."""
        # Always pad to max_length so single and batched prompts yield the same
        # sequence length and can be concatenated (e.g. for CFG batches).
        tokens = self.tokenizer(
            text,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt"
        )
        input_ids = tokens.input_ids
        # Keep the ids on the encoder's device when the encoder exposes one.
        device = getattr(self.text_encoder, "device", None)
        if device is not None:
            input_ids = input_ids.to(device)
        with torch.no_grad():
            return self.text_encoder(input_ids).last_hidden_state

    def encode(self, text, max_length=77):
        """Encode a single text prompt.

        Args:
            text: Prompt string.
            max_length: Token length to pad/truncate to.

        Returns:
            The text encoder's ``last_hidden_state`` for the prompt.
        """
        return self._embed(text, max_length)

    def encode_batch(self, texts, max_length=77):
        """Encode a list of prompts with the same padding as ``encode``.

        Args:
            texts: List of prompt strings.
            max_length: Token length to pad/truncate every prompt to.

        Returns:
            The text encoder's ``last_hidden_state`` for the whole batch.
        """
        return self._embed(texts, max_length)

完整的文生图流程

from diffusers import DDIMScheduler
import torch

class StableDiffusionPipeline:
    """Convenience wrapper around diffusers' text-to-image and img2img pipelines.

    The class deliberately shares its name with diffusers' own pipeline, which
    is why that one is imported under the alias ``SDPipeline``.
    """

    def __init__(self, model_name="stabilityai/stable-diffusion-2-1", device=None):
        """Load the text-to-image pipeline and move it to ``device``.

        Args:
            model_name: Hub id or local path of the pretrained pipeline.
            device: Target device; defaults to "cuda" when available, else "cpu".
        """
        from diffusers import StableDiffusionPipeline as SDPipeline
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.pipe = SDPipeline.from_pretrained(model_name)
        self.pipe.to(device)
        # Built lazily on the first img2img call, reusing the loaded weights.
        self._img2img_pipe = None

    def _make_generator(self, seed):
        """Return a seeded generator, or None to use the global RNG."""
        # A freshly constructed torch.Generator starts from a fixed default
        # seed, so passing one unseeded would make "random" runs repeat.
        if seed is None:
            return None
        return torch.Generator(self.device).manual_seed(seed)

    def _get_img2img_pipe(self):
        """Return the img2img pipeline, creating it from shared components."""
        if self._img2img_pipe is None:
            from diffusers import StableDiffusionImg2ImgPipeline
            # The text-to-image pipeline rejects image/strength arguments, so
            # a dedicated img2img pipeline is built on the same components.
            self._img2img_pipe = StableDiffusionImg2ImgPipeline(
                **self.pipe.components
            )
        return self._img2img_pipe

    def generate(self, prompt, negative_prompt="",
                 num_inference_steps=50, guidance_scale=7.5,
                 width=512, height=512, seed=None):
        """Generate one image from a text prompt.

        Args:
            prompt: Text describing the desired image.
            negative_prompt: Text describing what to avoid.
            num_inference_steps: Number of denoising steps.
            guidance_scale: Classifier-free guidance weight.
            width: Output width in pixels.
            height: Output height in pixels.
            seed: Seed for reproducible output; None leaves sampling unseeded.

        Returns:
            The first image of the pipeline output.
        """
        image = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            width=width,
            height=height,
            generator=self._make_generator(seed)
        ).images[0]

        return image

    def img2img(self, prompt, init_image, strength=0.75,
                num_inference_steps=50, guidance_scale=7.5,
                negative_prompt="", seed=None):
        """Transform ``init_image`` according to ``prompt``.

        Args:
            prompt: Text describing the desired result.
            init_image: Starting image handed to the img2img pipeline.
            strength: How strongly the starting image is altered.
            num_inference_steps: Number of denoising steps.
            guidance_scale: Classifier-free guidance weight.
            negative_prompt: Text describing what to avoid.
            seed: Seed for reproducible output; None leaves sampling unseeded.

        Returns:
            The first image of the pipeline output.
        """
        image = self._get_img2img_pipe()(
            prompt=prompt,
            image=init_image,
            strength=strength,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            negative_prompt=negative_prompt,
            generator=self._make_generator(seed)
        ).images[0]
        return image

# Usage example
from diffusers.utils import load_image

pipe = StableDiffusionPipeline()

# Text-to-image
image = pipe.generate(
    prompt="a beautiful sunset over mountains, digital art",
    negative_prompt="blurry, low quality",
    num_inference_steps=50,
    guidance_scale=7.5,
    seed=42
)
image.save("sunset.png")

# Image-to-image
# load_image reads the local file and returns it as an RGB PIL image.
init_image = load_image("input.png")
stylized = pipe.img2img(
    prompt="oil painting style",
    init_image=init_image,
    strength=0.6
)
# Persist the result so the example produces a visible output.
stylized.save("stylized.png")

采样器配置

from diffusers import (
    DDIMScheduler,
    PNDMScheduler,
    EulerDiscreteScheduler,
    DPMSolverMultistepScheduler
)

def configure_scheduler(scheduler_type="ddim", num_train_timesteps=1000):
    """Build the requested sampler with Stable Diffusion's noise schedule.

    Only the requested scheduler is instantiated.

    Args:
        scheduler_type: One of "ddim", "pndm", "euler" or "dpm++".
        num_train_timesteps: Number of diffusion steps used during training.

    Returns:
        A newly constructed scheduler instance.

    Raises:
        KeyError: If ``scheduler_type`` is not a supported name.
    """
    # Noise-schedule settings shared by every supported scheduler.
    common = {
        "num_train_timesteps": num_train_timesteps,
        "beta_start": 0.00085,
        "beta_end": 0.012,
        "beta_schedule": "scaled_linear",
    }
    # Scheduler class plus any scheduler-specific keyword arguments.
    registry = {
        "ddim": (
            DDIMScheduler,
            {
                "clip_sample": False,
                # NOTE(review): the published Stable Diffusion scheduler
                # configs appear to use set_alpha_to_one=False - confirm
                # before relying on this value.
                "set_alpha_to_one": True,
                "steps_offset": 1,
            },
        ),
        "pndm": (PNDMScheduler, {}),
        "euler": (EulerDiscreteScheduler, {}),
        "dpm++": (DPMSolverMultistepScheduler, {}),
    }
    try:
        scheduler_cls, extra = registry[scheduler_type]
    except KeyError:
        raise KeyError(
            f"unknown scheduler_type {scheduler_type!r}; "
            f"expected one of {sorted(registry)}"
        ) from None
    return scheduler_cls(**common, **extra)

总结

Stable Diffusion通过潜在扩散模型实现了高效的图像生成。掌握其VAE、U-Net和文本编码器的工作原理,对于理解和定制图像生成模型至关重要。