Stable Diffusion
---
title: "Stable Diffusion"
description: "掌握Stable Diffusion的潜在扩散模型原理、文本到图像生成以及图像到图像转换技术"
tags: ["Stable Diffusion", "潜在扩散", "文本到图像", "img2img"]
category: "llm"
icon: "🧠"
---
Stable Diffusion
Stable Diffusion简介
Stable Diffusion是由CompVis、Runway与Stability AI合作开发、于2022年发布的开源图像生成模型。它基于潜在扩散模型(Latent Diffusion Model, LDM)架构,在潜在空间中高效地完成去噪过程,从而生成高质量的图像。
与传统的像素级扩散模型相比,Stable Diffusion在压缩的潜在空间中操作,大幅降低了计算成本,使得在消费级GPU上运行成为可能。
核心架构
1. VAE(变分自编码器)
VAE负责将图像压缩到潜在空间,以及从潜在表示重建图像。
from diffusers import AutoencoderKL
import torch
class VAEDecoder:
    """Wrap a pretrained ``AutoencoderKL`` for image <-> latent conversion.

    Despite its name, the class both encodes images into latents and decodes
    latents back into images.

    Attributes:
        vae: The loaded ``AutoencoderKL`` model, switched to eval mode.
        scaling_factor: Multiplier applied to latents after encoding and
            divided out again before decoding.
    """

    # Fallback latent scale (the value this class used to hard-code).
    # ``__init__`` overrides it per instance.
    scaling_factor = 0.18215

    def __init__(self, model_name="stabilityai/sd-vae-ft-mse", scaling_factor=None):
        """Load the VAE and resolve the latent scaling factor.

        Args:
            model_name: Model id or local path handed to
                ``AutoencoderKL.from_pretrained``.
            scaling_factor: Explicit latent scale. When ``None`` the value is
                read from ``vae.config.scaling_factor`` and falls back to the
                class default if the config does not define one.
        """
        self.vae = AutoencoderKL.from_pretrained(model_name)
        self.vae.eval()
        if scaling_factor is None:
            # Different VAE checkpoints ship different scales, so prefer the
            # checkpoint's own config over a hard-coded constant.
            scaling_factor = getattr(
                self.vae.config, "scaling_factor", type(self).scaling_factor
            )
        self.scaling_factor = scaling_factor

    def encode(self, image):
        """Encode an image batch into scaled latents.

        Args:
            image: Tensor laid out as [B, C, H, W] with values in [0, 1].

        Returns:
            A latent sampled from the encoder posterior, multiplied by
            ``scaling_factor``. Sampling is stochastic, so repeated calls on
            the same image may differ.
        """
        with torch.no_grad():
            # Rescale from [0, 1] to [-1, 1] before encoding.
            image = 2 * image - 1
            posterior = self.vae.encode(image).latent_dist
            # Reparameterised sample from the posterior.
            z = posterior.sample()
            return z * self.scaling_factor

    def decode(self, z):
        """Decode scaled latents back into an image batch.

        Args:
            z: Latents as produced by :meth:`encode` (already scaled).

        Returns:
            The decoder output mapped from [-1, 1] to [0, 1]. Values are not
            clamped.
        """
        with torch.no_grad():
            # Undo the scaling applied in ``encode``.
            z = z / self.scaling_factor
            decoded = self.vae.decode(z).sample
            return (decoded + 1) / 2

    def reconstruct(self, image):
        """Round-trip an image through ``encode`` and ``decode``."""
        return self.decode(self.encode(image))
2. U-Net去噪网络
U-Net是扩散模型的核心,负责预测噪声。
from diffusers import UNet2DConditionModel
class NoisePredictor:
    """Run the conditional U-Net of a Stable Diffusion checkpoint.

    Attributes:
        unet: The loaded ``UNet2DConditionModel``, switched to eval mode.
    """

    def __init__(self, model_name="stabilityai/stable-diffusion-2-1"):
        """Load the ``unet`` subfolder of ``model_name``."""
        self.unet = UNet2DConditionModel.from_pretrained(
            model_name, subfolder="unet"
        )
        self.unet.eval()

    def predict_noise(self, latent, timestep, encoder_hidden_states):
        """Predict the noise for ``latent`` at ``timestep``.

        Args:
            latent: Noisy latent batch fed to the U-Net.
            timestep: Diffusion timestep passed through to the U-Net.
            encoder_hidden_states: Text conditioning embeddings.

        Returns:
            The ``.sample`` tensor of the U-Net output, computed without
            gradient tracking.
        """
        with torch.no_grad():
            noise_pred = self.unet(
                latent,
                timestep,
                encoder_hidden_states=encoder_hidden_states
            ).sample
        return noise_pred

    def predict_noise_with_cfg(self, latent, timestep,
                               encoder_hidden_states_uncond,
                               encoder_hidden_states_cond,
                               guidance_scale=7.5):
        """Predict noise using classifier-free guidance (CFG).

        The unconditional and conditional branches are evaluated in a single
        batched U-Net call and then combined as
        ``uncond + guidance_scale * (cond - uncond)``.

        Args:
            latent: Noisy latent batch.
            timestep: Diffusion timestep. May be a Python number, a 0-dim
                tensor, or a 1-D tensor with one entry per latent.
            encoder_hidden_states_uncond: Embeddings of the empty/negative
                prompt.
            encoder_hidden_states_cond: Embeddings of the actual prompt.
            guidance_scale: CFG weight.

        Returns:
            The guided noise prediction, with the same batch size as
            ``latent`` and no gradient tracking.
        """
        # Inference only: matches ``predict_noise`` and avoids building an
        # autograd graph for the doubled batch.
        with torch.no_grad():
            # Stack the unconditional and conditional inputs along the batch.
            latent_input = torch.cat([latent, latent])
            encoder_input = torch.cat([
                encoder_hidden_states_uncond,
                encoder_hidden_states_cond
            ])

            if torch.is_tensor(timestep) and timestep.dim() > 0:
                timestep_input = torch.cat([timestep, timestep])
            else:
                # ``torch.cat`` rejects 0-dim tensors and plain numbers, so
                # pass scalar timesteps through unchanged and rely on the
                # U-Net to broadcast them over the batch.
                timestep_input = timestep

            noise_pred = self.unet(
                latent_input,
                timestep_input,
                encoder_hidden_states=encoder_input
            ).sample

            # Split the halves back apart and apply the guidance formula.
            noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (
                noise_pred_cond - noise_pred_uncond
            )
        return noise_pred
3. 文本编码器
from transformers import CLIPTextModel, CLIPTokenizer
class TextEncoder:
    """Turn text prompts into CLIP hidden states for U-Net conditioning.

    Attributes:
        tokenizer: ``CLIPTokenizer`` loaded from the ``tokenizer`` subfolder.
        text_encoder: ``CLIPTextModel`` loaded from the ``text_encoder``
            subfolder, switched to eval mode.
    """

    def __init__(self, model_name="stabilityai/stable-diffusion-2-1"):
        """Load the tokenizer and text encoder of ``model_name``."""
        self.tokenizer = CLIPTokenizer.from_pretrained(
            model_name, subfolder="tokenizer"
        )
        self.text_encoder = CLIPTextModel.from_pretrained(
            model_name, subfolder="text_encoder"
        )
        self.text_encoder.eval()

    def _embed(self, text, max_length):
        """Tokenize ``text`` (a string or list of strings) and encode it."""
        tokens = self.tokenizer(
            text,
            # Always pad to ``max_length`` so every call yields the same
            # sequence length; embeddings from separate calls (e.g. prompt
            # and negative prompt) can then be concatenated.
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt"
        )
        # The tokenizer returns CPU tensors; move them to wherever the
        # encoder weights live.
        input_ids = tokens.input_ids.to(self.text_encoder.device)
        with torch.no_grad():
            return self.text_encoder(input_ids).last_hidden_state

    def encode(self, text, max_length=77):
        """Encode a single text prompt.

        Args:
            text: The prompt to encode.
            max_length: Token length the prompt is padded/truncated to.

        Returns:
            The encoder's ``last_hidden_state`` for the prompt.
        """
        return self._embed(text, max_length)

    def encode_batch(self, texts, max_length=77):
        """Encode several text prompts in one forward pass.

        Args:
            texts: List of prompts.
            max_length: Token length every prompt is padded/truncated to.

        Returns:
            The encoder's ``last_hidden_state`` for the batch. The sequence
            length equals the one produced by :meth:`encode` for the same
            ``max_length``.
        """
        return self._embed(texts, max_length)
完整的文生图流程
from diffusers import DDIMScheduler
import torch
class StableDiffusionPipeline:
    """Convenience wrapper around the diffusers Stable Diffusion pipelines.

    Attributes:
        pipe: The diffusers text-to-image pipeline.
        device: Device string the pipeline was moved to.
    """

    # Image-to-image pipeline, built lazily on the first ``img2img`` call.
    _img2img_pipe = None

    def __init__(self, model_name="stabilityai/stable-diffusion-2-1", device=None):
        """Load the text-to-image pipeline and move it to ``device``.

        Args:
            model_name: Model id or local path for ``from_pretrained``.
            device: Target device. When ``None``, ``"cuda"`` is used if
                available and ``"cpu"`` otherwise.
        """
        # Local import with an alias because this class shadows the
        # diffusers class of the same name.
        from diffusers import StableDiffusionPipeline as SDPipeline

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.pipe = SDPipeline.from_pretrained(model_name)
        self.pipe.to(device)
        self._img2img_pipe = None

    def generate(self, prompt, negative_prompt="",
                 num_inference_steps=50, guidance_scale=7.5,
                 width=512, height=512, seed=None):
        """Generate one image from a text prompt.

        Args:
            prompt: Text describing the desired image.
            negative_prompt: Text describing what to avoid.
            num_inference_steps: Number of denoising steps.
            guidance_scale: Classifier-free guidance weight.
            width: Output width in pixels.
            height: Output height in pixels.
            seed: Seed for reproducible output. ``None`` lets the pipeline
                draw fresh random noise on every call.

        Returns:
            The first image of the pipeline result.
        """
        # Only build a generator when a seed is given: an unseeded
        # ``torch.Generator`` starts from a fixed default seed, which would
        # make "random" calls repeat the same image.
        generator = None
        if seed is not None:
            generator = torch.Generator(self.device).manual_seed(seed)
        image = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            width=width,
            height=height,
            generator=generator
        ).images[0]
        return image

    def img2img(self, prompt, init_image, strength=0.75,
                num_inference_steps=50, guidance_scale=7.5):
        """Transform ``init_image`` according to ``prompt``.

        Args:
            prompt: Text describing the desired result.
            init_image: Starting image.
            strength: How strongly the starting image is altered.
            num_inference_steps: Number of denoising steps.
            guidance_scale: Classifier-free guidance weight.

        Returns:
            The first image of the pipeline result.
        """
        if self._img2img_pipe is None:
            from diffusers import StableDiffusionImg2ImgPipeline

            # The text-to-image pipeline does not implement ``image`` /
            # ``strength``. Build the dedicated img2img pipeline from the
            # already-loaded components so no weights are loaded twice.
            self._img2img_pipe = StableDiffusionImg2ImgPipeline(
                **self.pipe.components
            )
        image = self._img2img_pipe(
            prompt=prompt,
            image=init_image,
            strength=strength,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale
        ).images[0]
        return image
# Usage example
# ``load_image`` ships with diffusers and returns an RGB PIL image, so the
# example needs no import beyond the diffusers package used throughout.
from diffusers.utils import load_image

pipe = StableDiffusionPipeline()

# Text-to-image
image = pipe.generate(
    prompt="a beautiful sunset over mountains, digital art",
    negative_prompt="blurry, low quality",
    num_inference_steps=50,
    guidance_scale=7.5,
    seed=42
)
image.save("sunset.png")

# Image-to-image
init_image = load_image("input.png")
stylized = pipe.img2img(
    prompt="oil painting style",
    init_image=init_image,
    strength=0.6
)
采样器配置
from diffusers import (
DDIMScheduler,
PNDMScheduler,
EulerDiscreteScheduler,
DPMSolverMultistepScheduler
)
def configure_scheduler(scheduler_type="ddim", num_train_timesteps=1000):
    """Build a noise scheduler with the beta schedule used in this tutorial.

    Only the requested scheduler is instantiated.

    Args:
        scheduler_type: One of ``"ddim"``, ``"pndm"``, ``"euler"`` or
            ``"dpm++"``.
        num_train_timesteps: Number of diffusion steps used during training.

    Returns:
        A new scheduler instance of the requested type.

    Raises:
        KeyError: If ``scheduler_type`` is not one of the supported names.
    """
    # Beta-schedule settings shared by every scheduler below.
    shared = {
        "num_train_timesteps": num_train_timesteps,
        "beta_start": 0.00085,
        "beta_end": 0.012,
        "beta_schedule": "scaled_linear",
    }
    # Factories rather than instances: nothing is built until selected.
    factories = {
        # ``reverse_betas_squared`` was dropped here: it is not a
        # ``DDIMScheduler`` argument and made construction fail.
        "ddim": lambda: DDIMScheduler(
            clip_sample=False,
            set_alpha_to_one=True,
            steps_offset=1,
            **shared,
        ),
        "pndm": lambda: PNDMScheduler(**shared),
        "euler": lambda: EulerDiscreteScheduler(**shared),
        "dpm++": lambda: DPMSolverMultistepScheduler(**shared),
    }
    try:
        factory = factories[scheduler_type]
    except KeyError:
        raise KeyError(
            f"unknown scheduler_type {scheduler_type!r}; "
            f"expected one of {sorted(factories)}"
        ) from None
    return factory()
总结
Stable Diffusion通过潜在扩散模型实现了高效的图像生成。掌握其VAE、U-Net和文本编码器的工作原理,对于理解和定制图像生成模型至关重要。