# Source: Hugging Face Spaces - waveydaveygravy/IP-Adapter-Face-ID-Plus-Controlnet
# Space status at capture time: Runtime error
# File size: 5,294 Bytes
# Revision: 8d3fbf3 (presumably the commit hash)

import cv2
from insightface.app import FaceAnalysis
from insightface.utils import face_align
import torch
import os
from datetime import datetime
import torch
import gradio as gr
from diffusers import (
    StableDiffusionPipeline,
    DDIMScheduler,
    AutoencoderKL,
    StableDiffusionControlNetPipeline,
    ControlNetModel,
)
from PIL import Image
from ip_adapter.ip_adapter_faceid import IPAdapterFaceIDPlus
from diffusers.utils import load_image
import numpy as np

# date_time = now.strftime("%Y-%m-%d_%H-%M-%S")


def _safe_filename_stem(text, max_length=100):
    """Return *text* made safe to embed in a file name.

    Whitespace, control characters and characters that are path separators
    or otherwise rejected by common file systems are replaced with ``_``.
    The result is truncated to *max_length* characters so that a long prompt
    cannot push the file name past file-system length limits.
    """
    unsafe_chars = '<>:"/\\|?*'
    cleaned = "".join(
        "_" if (ch.isspace() or ord(ch) < 32 or ch in unsafe_chars) else ch
        for ch in (text or "")
    )
    return cleaned[:max_length]


def _list_depth_maps(depth_map_dir):
    """Return the sorted names of the ``.jpg``/``.png`` files in *depth_map_dir*.

    The extension match is case-insensitive and sub-directories are skipped.
    Sorting makes the index used in the output file names reproducible,
    because ``os.listdir`` returns entries in arbitrary order.
    """
    return sorted(
        name
        for name in os.listdir(depth_map_dir)
        if name.lower().endswith((".jpg", ".png"))
        and os.path.isfile(os.path.join(depth_map_dir, name))
    )


def _detect_reference_face(face_reference_image):
    """Detect the first face in a PIL image and return its identity inputs.

    Returns:
        A ``(faceid_embeds, face_image)`` tuple: the normalised face embedding
        as a tensor with a leading batch dimension, and the 224x224 aligned
        face crop.

    Raises:
        gr.Error: if no face is detected in the image.
    """
    app = FaceAnalysis(
        name="buffalo_l", providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
    )
    app.prepare(ctx_id=0, det_size=(640, 640))

    # Gradio hands over an RGB PIL image, whereas the upstream IP-Adapter
    # FaceID example feeds InsightFace a cv2.imread() image, i.e. BGR.
    # Convert so detection, embedding and the crop see the same channel
    # order as in that example.
    rgb_pixels = np.array(face_reference_image.convert("RGB"))
    bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

    faces = app.get(bgr_pixels)
    if not faces:
        raise gr.Error("No face was detected in the face reference image.")

    first_face = faces[0]
    faceid_embeds = torch.from_numpy(first_face.normed_embedding).unsqueeze(0)
    face_image = face_align.norm_crop(
        bgr_pixels, landmark=first_face.kps, image_size=224
    )  # you can also segment the face
    return faceid_embeds, face_image


def _load_ip_model(v2):
    """Build the depth-ControlNet Stable Diffusion pipeline wrapped by IP-Adapter."""
    base_model_path = "SG161222/Realistic_Vision_V4.0_noVAE"
    vae_model_path = "stabilityai/sd-vae-ft-mse"
    image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
    # NOTE(review): the v1 checkpoint is an absolute /content path while the
    # v2 checkpoint is resolved relative to the working directory - confirm
    # that both files really live where these paths point.
    ip_ckpt = (
        "/content/ip-adapter-faceid-plus_sd15.bin"
        if not v2
        else "ip-adapter-faceid-plusv2_sd15.bin"
    )
    device = "cuda"

    controlnet_model_path = "lllyasviel/control_v11f1p_sd15_depth"
    controlnet = ControlNetModel.from_pretrained(
        controlnet_model_path, torch_dtype=torch.float16
    )

    noise_scheduler = DDIMScheduler(
        num_train_timesteps=1000,
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        clip_sample=False,
        set_alpha_to_one=False,
        steps_offset=1,
    )

    vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)

    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        controlnet=controlnet,
        scheduler=noise_scheduler,
        vae=vae,
        feature_extractor=None,
        safety_checker=None,
    )

    return IPAdapterFaceIDPlus(pipe, image_encoder_path, ip_ckpt, device)


def _render_depth_maps(
    prompt,
    negative_prompt,
    depth_map_dir,
    depth_map_files,
    face_image,
    faceid_embeds,
    s_scale,
    num_inference_steps,
    v2,
    output_dir,
    date_time,
):
    """Generate, save and return one image per entry of *depth_map_files*."""
    ip_model = _load_ip_model(v2)
    # The stem does not change between iterations, so compute it once.
    name_stem = _safe_filename_stem(prompt)
    images = []

    for idx, filename in enumerate(depth_map_files):
        depth_map = load_image(os.path.join(depth_map_dir, filename))

        image = ip_model.generate(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image=depth_map,
            face_image=face_image,
            faceid_embeds=faceid_embeds,
            shortcut=v2,
            s_scale=s_scale,
            num_samples=1,  # Generate one image per depth map
            width=512,
            height=512,
            num_inference_steps=num_inference_steps,
            seed=2023,
        )[0]

        # Save the image with the prompt name, date/time, and depth map index
        image_name = f"{name_stem}_{date_time}_{idx}_0.png"
        image.save(os.path.join(output_dir, image_name))
        images.append(image)

    return images


def generate_image(
    prompt,
    negative_prompt,
    depth_map_dir,
    face_reference_image,
    s_scale,
    num_inference_steps,
    v2,
):
    """Generate one face-conditioned image per depth map found in a directory.

    Each ``.jpg``/``.png`` file in *depth_map_dir* is used as the ControlNet
    conditioning image, while the face found in *face_reference_image*
    supplies the identity. Every result is written to ``/content/output`` and
    also returned.

    Args:
        prompt: Text prompt; also used (sanitised) in the output file names.
        negative_prompt: Negative text prompt.
        depth_map_dir: Directory containing the depth map images.
        face_reference_image: PIL image containing the reference face.
        s_scale: Face structure strength forwarded to the IP-Adapter.
        num_inference_steps: Number of sampling steps.
        v2: Whether to use the "plus v2" adapter checkpoint and its shortcut.

    Returns:
        The generated images, ordered by depth map file name. The list is
        empty when the directory holds no matching files.

    Raises:
        gr.Error: if no face reference image is given, the depth map
            directory does not exist, or no face is detected.
    """
    # Create the output directory if it doesn't exist
    output_dir = "/content/output"
    os.makedirs(output_dir, exist_ok=True)

    # Validate the cheap inputs before any model is loaded, so that mistakes
    # are reported immediately instead of after the model loading.
    if face_reference_image is None:
        raise gr.Error("Please provide a face reference image.")
    depth_map_dir = (depth_map_dir or "").strip()
    if not os.path.isdir(depth_map_dir):
        raise gr.Error(f"Depth map directory not found: {depth_map_dir!r}")

    depth_map_files = _list_depth_maps(depth_map_dir)
    if not depth_map_files:
        return []

    date_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    faceid_embeds, face_image = _detect_reference_face(face_reference_image)

    try:
        return _render_depth_maps(
            prompt,
            negative_prompt,
            depth_map_dir,
            depth_map_files,
            face_image,
            faceid_embeds,
            s_scale,
            int(num_inference_steps),  # sliders may deliver the value as a float
            v2,
            output_dir,
            date_time,
        )
    finally:
        # Runs on failure too. On success the helper has already returned,
        # so the models it loaded are no longer referenced from this frame.
        torch.cuda.empty_cache()


# Gradio UI: a single row with two columns, followed by the button that
# triggers generation.
with gr.Blocks() as demo:
    with gr.Row():
        # First column: text inputs, the reference face and the adapter toggle.
        with gr.Column():
            prompt = gr.Textbox(label="Prompt")
            negative_prompt = gr.Textbox(label="Negative Prompt")
            depth_map_dir = gr.Textbox(label="Depth Map Directory")
            face_reference_image = gr.Image(type="pil", label="Face Reference Image")
            v2 = gr.Checkbox(value=False, label="Use v2 Adapter")

        # Second column: tuning sliders and the gallery that shows the results.
        with gr.Column():
            s_scale = gr.Slider(
                minimum=0, maximum=3, step=0.1, value=0.6,
                label="Face Structure strength",
            )
            num_inference_steps = gr.Slider(
                minimum=1, maximum=50, step=1, value=10, label="steps"
            )
            gallery = gr.Gallery(label="Generated Images")

    generate_btn = gr.Button("Generate Images")
    # The order of `inputs` must match the parameter order of generate_image.
    generate_btn.click(
        fn=generate_image,
        inputs=[
            prompt, negative_prompt, depth_map_dir, face_reference_image,
            s_scale, num_inference_steps, v2,
        ],
        outputs=gallery,
    )

# Start the app with a public share link and debug output enabled.
demo.launch(share=True, debug=True)