# Gradio app: generate images with IP-Adapter FaceID Plus + ControlNet (depth),
# batching over a directory of depth maps and conditioning on a face reference image.
import os
from datetime import datetime

import cv2
import gradio as gr
import numpy as np
import torch
from diffusers import (
    DDIMScheduler,
    AutoencoderKL,
    StableDiffusionControlNetPipeline,
    ControlNetModel,
)
from diffusers.utils import load_image
from insightface.app import FaceAnalysis
from insightface.utils import face_align
from ip_adapter.ip_adapter_faceid import IPAdapterFaceIDPlus


def generate_image(
    prompt,
    negative_prompt,
    depth_map_dir,
    face_reference_image,
    s_scale,
    num_inference_steps,
    v2,
):
    # Timestamp used to build unique output filenames
    date_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Create the output directory if it doesn't exist
    output_dir = "/content/output"
    os.makedirs(output_dir, exist_ok=True)

    # Detect the face in the reference image with insightface
    app = FaceAnalysis(
        name="buffalo_l", providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
    )
    app.prepare(ctx_id=0, det_size=(640, 640))

    # Gradio hands us an RGB PIL image; insightface expects a BGR numpy array
    face_reference_image_np = cv2.cvtColor(
        np.array(face_reference_image), cv2.COLOR_RGB2BGR
    )
    faces = app.get(face_reference_image_np)
    faceid_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
    face_image = face_align.norm_crop(
        face_reference_image_np, landmark=faces[0].kps, image_size=224
    )  # you can also segment the face

    base_model_path = "SG161222/Realistic_Vision_V4.0_noVAE"
    vae_model_path = "stabilityai/sd-vae-ft-mse"
    image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
    ip_ckpt = (
        "/content/ip-adapter-faceid-plus_sd15.bin"
        if not v2
        else "ip-adapter-faceid-plusv2_sd15.bin"
    )
    device = "cuda"

    # ControlNet depth model
    controlnet_model_path = "lllyasviel/control_v11f1p_sd15_depth"
    controlnet = ControlNetModel.from_pretrained(
        controlnet_model_path, torch_dtype=torch.float16
    )

    noise_scheduler = DDIMScheduler(
        num_train_timesteps=1000,
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        clip_sample=False,
        set_alpha_to_one=False,
        steps_offset=1,
    )
    vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        controlnet=controlnet,
        scheduler=noise_scheduler,
        vae=vae,
        feature_extractor=None,
        safety_checker=None,
    )

    # Load the IP-Adapter FaceID Plus weights on top of the pipeline
    ip_model = IPAdapterFaceIDPlus(pipe, image_encoder_path, ip_ckpt, device)

    depth_map_files = [
        f for f in os.listdir(depth_map_dir) if f.endswith((".jpg", ".png"))
    ]
    images = []
    for idx, filename in enumerate(depth_map_files):
        depth_map_path = os.path.join(depth_map_dir, filename)
        depth_map = load_image(depth_map_path)

        image = ip_model.generate(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image=depth_map,
            face_image=face_image,
            faceid_embeds=faceid_embeds,
            shortcut=v2,
            s_scale=s_scale,
            num_samples=1,  # generate one image per depth map
            width=512,
            height=512,
            num_inference_steps=num_inference_steps,
            seed=2023,
        )[0]

        # Save the image with the prompt name, date/time, and depth map index
        image_name = f"{prompt.replace(' ', '_')}_{date_time}_{idx}_0.png"
        image_path = os.path.join(output_dir, image_name)
        image.save(image_path)
        images.append(image)

    torch.cuda.empty_cache()
    return images


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(label="Prompt")
            negative_prompt = gr.Textbox(label="Negative Prompt")
            depth_map_dir = gr.Textbox(label="Depth Map Directory")
            face_reference_image = gr.Image(label="Face Reference Image", type="pil")
            v2 = gr.Checkbox(label="Use v2 Adapter", value=False)
        with gr.Column():
            s_scale = gr.Slider(
                label="Face Structure strength",
                value=0.6,
                step=0.1,
                minimum=0,
                maximum=3,
            )
            num_inference_steps = gr.Slider(
                label="steps", value=10, step=1, minimum=1, maximum=50
            )
            gallery = gr.Gallery(label="Generated Images")
            generate_btn = gr.Button("Generate Images")

    generate_btn.click(
        fn=generate_image,
        inputs=[
            prompt,
            negative_prompt,
            depth_map_dir,
            face_reference_image,
            s_scale,
            num_inference_steps,
            v2,
        ],
        outputs=gallery,
    )

demo.launch(share=True, debug=True)