Spaces:

modelscope
/

AnyText

Running on A10G

App Files Files Community

tastelikefeet committed on Jan 3

Commit

de7836d

•

1 Parent(s): fdc24bb

first version

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
app.py +401 -0
bert_tokenizer.py +421 -0
cldm/cldm.py +617 -0
cldm/ddim_hacked.py +317 -0
cldm/embedding_manager.py +165 -0
cldm/hack.py +111 -0
cldm/logger.py +76 -0
cldm/model.py +30 -0
cldm/recognizer.py +303 -0
dataset_util.py +77 -0
example_images/banner.png +0 -0
example_images/edit1.png +0 -0
example_images/edit10.png +0 -0
example_images/edit11.png +0 -0
example_images/edit12.png +0 -0
example_images/edit13.png +0 -0
example_images/edit14.png +0 -0
example_images/edit2.png +0 -0
example_images/edit3.png +0 -0
example_images/edit4.png +0 -0
example_images/edit5.png +0 -0
example_images/edit6.png +0 -0
example_images/edit7.png +0 -0
example_images/edit8.png +0 -0
example_images/edit9.png +0 -0
example_images/gen1.png +0 -0
example_images/gen10.png +0 -0
example_images/gen11.png +0 -0
example_images/gen12.png +0 -0
example_images/gen13.png +0 -0
example_images/gen14.png +0 -0
example_images/gen15.png +0 -0
example_images/gen16.png +0 -0
example_images/gen2.png +0 -0
example_images/gen3.png +0 -0
example_images/gen4.png +0 -0
example_images/gen5.png +0 -0
example_images/gen6.png +0 -0
example_images/gen7.png +0 -0
example_images/gen8.png +0 -0
example_images/gen9.png +0 -0
example_images/ref1.jpg +0 -0
example_images/ref10.jpg +0 -0
example_images/ref11.jpg +0 -0
example_images/ref12.png +0 -0
example_images/ref13.jpg +0 -0
example_images/ref14.png +0 -0
example_images/ref2.jpg +0 -0
example_images/ref3.jpg +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.ttf filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,401 @@

+'''
+AnyText: Multilingual Visual Text Generation And Editing
+Paper: https://arxiv.org/abs/2311.03054
+Code: https://github.com/tyxsspa/AnyText
+Copyright (c) Alibaba, Inc. and its affiliates.
+'''
+import os
+from modelscope.pipelines import pipeline
+import cv2
+import gradio as gr
+import numpy as np
+import re
+from gradio.components import Component
+from util import check_channels, resize_image, save_images
+import json
+# Number of position-box checkbox/slider groups created for the drag-rectangle UI.
+BBOX_MAX_NUM = 8
+img_save_folder = 'SaveImages'
+# When False the pipeline is not built at import time and `inference` stays undefined.
+load_model = True
+if load_model:
+    inference = pipeline(
+        'my-anytext-task',
+        model='damo/cv_anytext_text_generation_editing',
+        model_revision='v1.1.0',
+    )
+def count_lines(prompt):
+    """Return how many double-quoted text segments ``prompt`` contains.
+
+    Curly quotes (“ and ”) are treated as straight double quotes before
+    matching.  A prompt without any quoted segment counts as one line.
+    """
+    normalized = prompt.replace('“', '"').replace('”', '"')
+    quoted = re.findall('"(.*?)"', normalized)
+    # No quoted text at all is still reported as a single (blank) line.
+    return len(quoted) if quoted else 1
+def generate_rectangles(w, h, n, max_trys=200):
+    """Randomly place ``n`` non-overlapping boxes on a ``w`` x ``h`` mask.
+
+    Returns an ``(h, w, 1)`` uint8 image that is 255 inside every accepted
+    box and 0 elsewhere.  A candidate whose corners leave the image, or whose
+    bounding box touches an already accepted one, counts as a failed attempt.
+    ``gr.Error`` is raised when ``max_trys`` failures accumulate before ``n``
+    boxes have been placed.
+    """
+    mask = np.zeros((h, w, 1), dtype=np.uint8)
+    placed = []
+    attempts = 0
+    longest_side = max(w, h)
+    # Minimum edge length: 30% of the longer side for up to three boxes, 20% otherwise.
+    low_edge = int(longest_side*0.3 if n <= 3 else longest_side*0.2)
+    while attempts < max_trys:
+        # Box size: width capped at 80% of the image, height derived from a
+        # random aspect ratio and clamped to [low_edge, 80% of the height].
+        rect_w = np.random.randint(max((w*0.5)//n, low_edge), w)
+        rect_w = min(rect_w, int(w*0.8))
+        ratio = np.random.uniform(4, 10)
+        rect_h = min(max(low_edge, int(rect_w/ratio)), int(h*0.8))
+        # Rotation: 70% unrotated, 10% in [0, 40), 10% in [140, 180), 10% in [85, 95).
+        rotation_angle = 0
+        rand_value = np.random.rand()
+        if 0.7 <= rand_value < 0.8:
+            rotation_angle = np.random.randint(0, 40)
+        elif 0.8 <= rand_value < 0.9:
+            rotation_angle = np.random.randint(140, 180)
+        elif rand_value >= 0.9:
+            rotation_angle = np.random.randint(85, 95)
+        # Random top-left offset for the unrotated box.
+        x = np.random.randint(0, w - rect_w)
+        y = np.random.randint(0, h - rect_h)
+        # Corner points of the rotated box, then shifted to the chosen offset.
+        corners = np.int32(cv2.boxPoints(((rect_w/2, rect_h/2), (rect_w, rect_h), rotation_angle)))
+        corners += (x, y)
+        # Reject candidates that cross the image border or collide with an earlier box.
+        outside = np.any(corners < 0) or np.any(corners[:, 0] >= w) or np.any(corners[:, 1] >= h)
+        if outside or any(check_overlap_polygon(corners, other) for other in placed):
+            attempts += 1
+            continue
+        cv2.fillPoly(mask, [corners], 255)
+        placed.append(corners)
+        if len(placed) == n:
+            break
+    print("attempts:", attempts)
+    if len(placed) != n:
+        raise gr.Error(f'Failed in auto generate positions after {attempts} attempts, try again!')
+    return mask
+def check_overlap_polygon(rect_pts1, rect_pts2):
+    """Return True when the bounding rectangles of two point sets touch or overlap.
+
+    Each point set is reduced to its convex hull and then to an axis-aligned
+    bounding rectangle, so the comparison is on those rectangles rather than
+    on the exact polygons.
+    """
+    x1, y1, w1, h1 = cv2.boundingRect(cv2.convexHull(rect_pts1))
+    x2, y2, w2, h2 = cv2.boundingRect(cv2.convexHull(rect_pts2))
+    overlaps_x = x1 + w1 >= x2 and x2 + w2 >= x1
+    overlaps_y = y1 + h1 >= y2 and y2 + h2 >= y1
+    return overlaps_x and overlaps_y
+def draw_rects(width, height, rects):
+    """Rasterise fractional ``(x, y, w, h)`` boxes into a ``(height, width, 1)`` uint8 mask.
+
+    Every entry of ``rects`` is scaled by the canvas size (x and w by
+    ``width``, y and h by ``height``) and drawn as a filled rectangle of
+    value 255 on a zero background.
+    """
+    canvas = np.zeros((height, width, 1), dtype=np.uint8)
+    for rect in rects:
+        left = int(rect[0] * width)
+        top = int(rect[1] * height)
+        right = left + int(rect[2] * width)
+        bottom = top + int(rect[3] * height)
+        # Thickness -1 fills the rectangle.
+        cv2.rectangle(canvas, (left, top), (right, bottom), 255, -1)
+    return canvas
+def process(mode, prompt, pos_radio, sort_radio, revise_pos, show_debug, draw_img, rect_img, ref_img, ori_img, img_count, ddim_steps, w, h, strength, cfg_scale, seed, eta, a_prompt, n_prompt, *rect_list):
+    """Build the text-position mask for one request and run the AnyText pipeline.
+
+    ``mode`` is ``'gen'`` (text generation) or ``'edit'`` (text editing).
+    In generation mode the mask comes from the sketch in ``draw_img``, from
+    the checked boxes in ``rect_list`` (first ``BBOX_MAX_NUM`` entries are
+    the enable flags, the rest are flattened x/y/w/h values), or from
+    ``generate_rectangles``, depending on ``pos_radio``.  In edit mode the
+    mask comes from the brush mask in ``ref_img`` (or from ``ref_img``
+    itself when no mask was drawn), and ``w``/``h`` are replaced by the size
+    of the resized ``ori_img``.  ``rect_img`` is accepted but not read.
+
+    Returns the pipeline results and a ``gr.Markdown`` holding the debug
+    text.  Raises ``gr.Error`` when an edit request has no images, when
+    ``mode`` or ``pos_radio`` is not recognised, or when the pipeline
+    reports a negative return code.
+    """
+    n_lines = count_lines(prompt)
+    # Text Generation
+    if mode == 'gen':
+        # create pos_imgs
+        if pos_radio == 'Manual-draw(手绘)':
+            if draw_img is not None:
+                pos_imgs = 255 - draw_img['image']
+                if 'mask' in draw_img:
+                    pos_imgs = pos_imgs.astype(np.float32) + draw_img['mask'][..., 0:3].astype(np.float32)
+                    pos_imgs = pos_imgs.clip(0, 255).astype(np.uint8)
+            else:
+                # Empty mask laid out as (height, width, channels) uint8, the
+                # same layout draw_rects and generate_rectangles produce.
+                pos_imgs = np.zeros((h, w, 1), dtype=np.uint8)
+        elif pos_radio == 'Manual-rect(拖框)':
+            rect_check = rect_list[:BBOX_MAX_NUM]
+            rect_xywh = rect_list[BBOX_MAX_NUM:]
+            checked_rects = []
+            for idx, c in enumerate(rect_check):
+                if c:
+                    _xywh = rect_xywh[4*idx:4*(idx+1)]
+                    checked_rects += [_xywh]
+            pos_imgs = draw_rects(w, h, checked_rects)
+        elif pos_radio == 'Auto-rand(随机)':
+            pos_imgs = generate_rectangles(w, h, n_lines, max_trys=500)
+        else:
+            # Without this branch pos_imgs would be unbound further down.
+            raise gr.Error(f'Unknown position method: {pos_radio}')
+    # Text Editing
+    elif mode == 'edit':
+        revise_pos = False  # disable pos revise in edit mode
+        if ref_img is None or ori_img is None:
+            raise gr.Error('No reference image, please upload one for edit!')
+        edit_image = ori_img.clip(1, 255)  # for mask reason
+        edit_image = check_channels(edit_image)
+        edit_image = resize_image(edit_image, max_length=768)
+        h, w = edit_image.shape[:2]
+        if isinstance(ref_img, dict) and 'mask' in ref_img and ref_img['mask'].mean() > 0:
+            pos_imgs = 255 - edit_image
+            edit_mask = cv2.resize(ref_img['mask'][..., 0:3], (w, h))
+            pos_imgs = pos_imgs.astype(np.float32) + edit_mask.astype(np.float32)
+            pos_imgs = pos_imgs.clip(0, 255).astype(np.uint8)
+        else:
+            if isinstance(ref_img, dict) and 'image' in ref_img:
+                ref_img = ref_img['image']
+            pos_imgs = 255 - ref_img  # example input ref_img is used as pos
+    else:
+        # Without this branch pos_imgs would be unbound further down.
+        raise gr.Error(f'Unknown mode: {mode}')
+    # Dump the inverted mask (channel order reversed) next to the app for inspection.
+    cv2.imwrite('pos_imgs.png', 255-pos_imgs[..., ::-1])
+    params = {
+        "sort_priority": sort_radio,
+        "show_debug": show_debug,
+        "revise_pos": revise_pos,
+        "image_count": img_count,
+        "ddim_steps": ddim_steps,
+        "image_width": w,
+        "image_height": h,
+        "strength": strength,
+        "cfg_scale": cfg_scale,
+        "eta": eta,
+        "a_prompt": a_prompt,
+        "n_prompt": n_prompt
+    }
+    input_data = {
+        "prompt": prompt,
+        "seed": seed,
+        "draw_pos": pos_imgs,
+        "ori_image": ori_img,
+    }
+    results, rtn_code, rtn_warning, debug_info = inference(input_data, mode=mode, **params)
+    if rtn_code >= 0:
+        # save_images(results, img_save_folder)
+        # print(f'Done, result images are saved in: {img_save_folder}')
+        if rtn_warning:
+            gr.Warning(rtn_warning)
+    else:
+        raise gr.Error(rtn_warning)
+    return results, gr.Markdown(debug_info, visible=show_debug)
+def create_canvas(w=512, h=512, c=3, line=5):
+    """Return an ``(h, w, c)`` uint8 canvas with grid lines and a centre marker.
+
+    The canvas is filled with 200, rows and columns at multiples of
+    ``w // line`` are set to 150, and a 16x16 patch around the centre is set
+    to ``[200, 0, 0]`` (so the marker assignment expects ``c == 3``).
+    """
+    image = np.full((h, w, c), 200, dtype=np.uint8)
+    # The grid spacing is taken from the width for both axes.  Fall back to 1
+    # when the canvas is narrower than ``line`` pixels, where the spacing
+    # would otherwise be 0 and the modulo test would divide by zero.
+    step = abs(w // line) or 1
+    # Strided slices mark every row/column whose index is a multiple of step.
+    image[::step, :, :] = 150
+    image[:, ::step, :] = 150
+    image[h//2-8:h//2+8, w//2-8:w//2+8, :] = [200, 0, 0]
+    return image
+def resize_w(w, img1, img2):
+    """Resize both images to width ``w`` while keeping each image's own height.
+
+    ``img2`` may be a dict, in which case its ``'image'`` entry is used.
+    Returns the two resized images as a list.
+    """
+    second = img2['image'] if isinstance(img2, dict) else img2
+    resized_first = cv2.resize(img1, (w, img1.shape[0]))
+    resized_second = cv2.resize(second, (w, second.shape[0]))
+    return [resized_first, resized_second]
+def resize_h(h, img1, img2):
+    """Resize both images to height ``h`` while keeping each image's own width.
+
+    ``img2`` may be a dict, in which case its ``'image'`` entry is used.
+    Returns the two resized images as a list.
+    """
+    second = img2['image'] if isinstance(img2, dict) else img2
+    resized_first = cv2.resize(img1, (img1.shape[1], h))
+    resized_second = cv2.resize(second, (second.shape[1], h))
+    return [resized_first, resized_second]
+# JavaScript boolean literal interpolated into the onBoxChange / onBoxEnableClick callbacks.
+is_t2i = 'true'
+block = gr.Blocks(css='style.css', theme=gr.themes.Soft()).queue()
+# Read the helper script with an explicit encoding so the result does not
+# depend on the platform's locale default.
+with open('javascript/bboxHint.js', 'r', encoding='utf-8') as file:
+    value = file.read()
+# JSON-encode the script so it can be embedded as a JS string literal.
+escaped_value = json.dumps(value)
+with block:
+    block.load(fn=None,
+               _js=f"""() => {{
+               const script = document.createElement("script");
+               const text =  document.createTextNode({escaped_value});
+               script.appendChild(text);
+               document.head.appendChild(script);
+               }}""")
+    gr.HTML('<div style="text-align: center; margin: 20px auto;"> \
+            <img id="banner" src="https://modelscope.cn/api/v1/studio/damo/studio_anytext/repo?Revision=master&FilePath=example_images/banner.png&View=true" alt="anytext"> <br>  \
+            [<a href="https://arxiv.org/abs/2311.03054" style="color:blue; font-size:18px;">arXiv</a>] \
+            [<a href="https://github.com/tyxsspa/AnyText" style="color:blue; font-size:18px;">Code</a>] \
+            [<a href="https://modelscope.cn/models/damo/cv_anytext_text_generation_editing/summary" style="color:blue; font-size:18px;">ModelScope</a>]\
+            version: 1.1.0 </div>')
+    with gr.Row(variant='compact'):
+        with gr.Column():
+            with gr.Accordion('🕹Instructions(说明)', open=False,):
+                with gr.Tabs():
+                    with gr.Tab("English"):
+                        gr.Markdown('<span style="color:navy;font-size:20px">Run Examples</span>')
+                        gr.Markdown('<span style="color:black;font-size:16px">AnyText has two modes: Text Generation and Text Editing, and we provides a variety of examples. Select one, click on [Run!] button to run.</span>')
+                        gr.Markdown('<span style="color:gray;font-size:12px">Please note, before running examples, ensure the manual draw area is empty, otherwise may get wrong results. Additionally, different examples use \
+                                     different parameters (such as resolution, seed, etc.). When generate your own, please pay attention to the parameter changes, or refresh the page to restore the default parameters.</span>')
+                        gr.Markdown('<span style="color:navy;font-size:20px">Text Generation</span>')
+                        gr.Markdown('<span style="color:black;font-size:16px">Enter the textual description (in Chinese or English) of the image you want to generate in [Prompt]. Each text line that needs to be generated should be \
+                                     enclosed in double quotes. Then, manually draw the specified position for each text line to generate the image.</span>\
+                                     <span style="color:red;font-size:16px">The drawing of text positions is crucial to the quality of the resulting image</span>, \
+                                     <span style="color:black;font-size:16px">please do not draw too casually or too small. The number of positions should match the number of text lines, and the size of each position should be matched \
+                                     as closely as possible to the length or width of the corresponding text line. If [Manual-draw] is inconvenient, you can try dragging rectangles [Manual-rect] or random positions [Auto-rand].</span>')
+                        gr.Markdown('<span style="color:gray;font-size:12px">When generating multiple lines, each position is matched with the text line according to a certain rule. The [Sort Position] option is used to \
+                                     determine whether to prioritize sorting from top to bottom or from left to right. You can open the [Show Debug] option in the parameter settings to observe the text position and glyph image \
+                                     in the result. You can also select the [Revise Position] which uses the bounding box of the rendered text as the revised position. However, it is occasionally found that the creativity of the \
+                                     generated text is slightly lower using this method.</span>')
+                        gr.Markdown('<span style="color:navy;font-size:20px">Text Editing</span>')
+                        gr.Markdown('<span style="color:black;font-size:16px">Please upload an image in [Ref] as a reference image, then adjust the brush size, and mark the area(s) to be edited. Input the textual description and \
+                                     the new text to be modified in [Prompt], then generate the image.</span>')
+                        gr.Markdown('<span style="color:gray;font-size:12px">The reference image can be of any resolution, but it will be internally processed with a limit that the longer side cannot exceed 768 pixels, and the \
+                                     width and height will both be scaled to multiples of 64.</span>')
+                    with gr.Tab("简体中文"):
+                        gr.Markdown('<span style="color:navy;font-size:20px">运行示例</span>')
+                        gr.Markdown('<span style="color:black;font-size:16px">AnyText有两种运行模式：文字生成和文字编辑，每种模式下提供了丰富的示例，选择一个，点击[Run!]即可。</span>')
+                        gr.Markdown('<span style="color:gray;font-size:12px">请注意，运行示例前确保手绘位置区域是空的，防止影响示例结果，另外不同示例使用不同的参数（如分辨率，种子数等），如果要自行生成时，请留意参数变化，或刷新页面恢复到默认参数。</span>')
+                        gr.Markdown('<span style="color:navy;font-size:20px">文字生成</span>')
+                        gr.Markdown('<span style="color:black;font-size:16px">在Prompt中输入描述提示词（支持中英文），需要生成的每一行文字用双引号包裹，然后依次手绘指定每行文字的位置，生成图片。</span>\
+                                     <span style="color:red;font-size:16px">文字位置的绘制对成图质量很关键</span>, \
+                                     <span style="color:black;font-size:16px">请不要画的太随意或太小，位置的数量要与文字行数量一致，每个位置的尺寸要与对应的文字行的长短或宽高尽量匹配。如果手绘（Manual-draw）不方便，\
+                                     可以尝试拖框矩形（Manual-rect）或随机生成（Auto-rand）。</span>')
+                        gr.Markdown('<span style="color:gray;font-size:12px">多行生成时，每个位置按照一定规则排序后与文字行做对应，Sort Position选项用于确定排序时优先从上到下还是从左到右。\
+                                     可以在参数设置中打开Show Debug选项，在结果图像中观察文字位置和字形图。也可以勾选Revise Position选项，这样会用渲染文字的外接矩形作为修正后的位置，不过偶尔发现这样生成的文字创造性略低。</span>')
+                        gr.Markdown('<span style="color:navy;font-size:20px">文字编辑</span>')
+                        gr.Markdown('<span style="color:black;font-size:16px">请上传一张待编辑的图片作为参考图(Ref)，然后调整笔触大小后，在参考图上涂抹要编辑的位置，在Prompt中输入描述提示词和要修改的文字内容，生成图片。</span>')
+                        gr.Markdown('<span style="color:gray;font-size:12px">参考图可以为任意分辨率，但内部处理时会限制长边不能超过768，并且宽高都被缩放为64的整数倍。</span>')
+            with gr.Accordion('🛠Parameters(参数)', open=False):
+                with gr.Row(variant='compact'):
+                    # Label restored: the Chinese part had been replaced by U+FFFD characters.
+                    img_count = gr.Slider(label="Image Count(图片数)", minimum=1, maximum=12, value=4, step=1)
+                    ddim_steps = gr.Slider(label="Steps(步数)", minimum=1, maximum=100, value=20, step=1)
+                with gr.Row(variant='compact'):
+                    image_width = gr.Slider(label="Image Width(宽度)", minimum=256, maximum=768, value=512, step=64)
+                    image_height = gr.Slider(label="Image Height(高度)", minimum=256, maximum=768, value=512, step=64)
+                with gr.Row(variant='compact'):
+                    strength = gr.Slider(label="Strength(控制力度)", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
+                    cfg_scale = gr.Slider(label="CFG-Scale(CFG强度)", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
+                with gr.Row(variant='compact'):
+                    seed = gr.Slider(label="Seed(种子数)", minimum=-1, maximum=99999999, step=1, randomize=False, value=-1)
+                    eta = gr.Number(label="eta (DDIM)", value=0.0)
+                with gr.Row(variant='compact'):
+                    show_debug = gr.Checkbox(label='Show Debug(调试信息)', value=False)
+                    gr.Markdown('<span style="color:silver;font-size:12px">whether show glyph image and debug information in the result(是否在结果中显示glyph图以及调试信息)</span>')
+                a_prompt = gr.Textbox(label="Added Prompt(附加提示词)", value='best quality, extremely detailed,4k, HD, supper legible text,  clear text edges,  clear strokes, neat writing, no watermarks')
+                n_prompt = gr.Textbox(label="Negative Prompt(负向提示词)", value='low-res, bad anatomy, extra digit, fewer digits, cropped, worst quality, low quality, watermark, unreadable text, messy words, distorted text, disorganized writing, advertising picture')
+            prompt = gr.Textbox(label="Prompt(提示词)")
+            with gr.Tabs() as tab_modes:
+                with gr.Tab("🖼Text Generation(文字生成)", elem_id='MD-tab-t2i') as mode_gen:
+                    pos_radio = gr.Radio(["Manual-draw(手绘)", "Manual-rect(拖框)", "Auto-rand(随机)"], value='Manual-draw(手绘)', label="Pos-Method(位置方式)", info="choose a method to specify text positions(选择方法用于指定文字位置).")
+                    with gr.Row():
+                        sort_radio = gr.Radio(["↕", "↔"], value='↕', label="Sort Position(位置排序)", info="position sorting priority(位置排序时的优先级)")
+                        revise_pos = gr.Checkbox(label='Revise Position(修正位置)', value=False)
+                        # gr.Markdown('<span style="color:silver;font-size:12px">try to revise according to text\'s bounding rectangle(尝试通过渲染后的文字行的外接矩形框修正位置)</span>')
+                    with gr.Row(variant='compact'):
+                        rect_cb_list: list[Component] = []
+                        rect_xywh_list: list[Component] = []
+                        for i in range(BBOX_MAX_NUM):
+                            e = gr.Checkbox(label=f'{i}', value=False, visible=False, min_width='10')
+                            x = gr.Slider(label='x', value=0.4, minimum=0.0, maximum=1.0, step=0.0001, elem_id=f'MD-t2i-{i}-x', visible=False)
+                            y = gr.Slider(label='y', value=0.4, minimum=0.0, maximum=1.0, step=0.0001, elem_id=f'MD-t2i-{i}-y',  visible=False)
+                            w = gr.Slider(label='w', value=0.2, minimum=0.0, maximum=1.0, step=0.0001, elem_id=f'MD-t2i-{i}-w',  visible=False)
+                            h = gr.Slider(label='h', value=0.2, minimum=0.0, maximum=1.0, step=0.0001, elem_id=f'MD-t2i-{i}-h',  visible=False)
+                            x.change(fn=None, inputs=x, outputs=x, _js=f'v => onBoxChange({is_t2i}, {i}, "x", v)', show_progress=False, queue=False)
+                            y.change(fn=None, inputs=y, outputs=y, _js=f'v => onBoxChange({is_t2i}, {i}, "y", v)', show_progress=False, queue=False)
+                            w.change(fn=None, inputs=w, outputs=w, _js=f'v => onBoxChange({is_t2i}, {i}, "w", v)', show_progress=False, queue=False)
+                            h.change(fn=None, inputs=h, outputs=h, _js=f'v => onBoxChange({is_t2i}, {i}, "h", v)', show_progress=False, queue=False)
+                            e.change(fn=None, inputs=e, outputs=e, _js=f'e => onBoxEnableClick({is_t2i}, {i}, e)', queue=False)
+                            rect_cb_list.extend([e])
+                            rect_xywh_list.extend([x, y, w, h])
+                    rect_img = gr.Image(value=create_canvas(), label="Rext Position(方框位置)", elem_id="MD-bbox-rect-t2i", show_label=False, visible=False)
+                    draw_img = gr.Image(value=create_canvas(), label="Draw Position(绘制位置)", visible=True, tool='sketch', show_label=False, brush_radius=60)
+                    def re_draw():
+                        """Return a fresh sketch canvas plus 512 for the width and height sliders."""
+                        fresh_canvas = gr.Image(value=create_canvas(), tool='sketch')
+                        width_reset = gr.Slider(value=512)
+                        height_reset = gr.Slider(value=512)
+                        return [fresh_canvas, width_reset, height_reset]
+                    draw_img.clear(re_draw, None, [draw_img, image_width, image_height])
+                    image_width.release(resize_w, [image_width, rect_img, draw_img], [rect_img, draw_img])
+                    image_height.release(resize_h, [image_height, rect_img, draw_img], [rect_img, draw_img])
+                    def change_options(selected_option):
+                        """Return component updates matching the chosen position method.
+
+                        The box checkboxes and the rectangle image are visible only for
+                        'Manual-rect', the sketch image only for 'Manual-draw', the sort
+                        radio is hidden for 'Auto-rand', and the revise-position checkbox
+                        is set to whether 'Auto-rand' is selected.
+                        """
+                        is_rect = selected_option == 'Manual-rect(拖框)'
+                        is_draw = selected_option == 'Manual-draw(手绘)'
+                        is_rand = selected_option == 'Auto-rand(随机)'
+                        # The same checkbox update is repeated for every box slot.
+                        updates = [gr.Checkbox(visible=is_rect)] * BBOX_MAX_NUM
+                        updates.append(gr.Image(visible=is_rect))
+                        updates.append(gr.Image(visible=is_draw))
+                        updates.append(gr.Radio(visible=not is_rand))
+                        updates.append(gr.Checkbox(value=is_rand))
+                        return updates
+                    pos_radio.change(change_options, pos_radio, rect_cb_list + [rect_img, draw_img, sort_radio, revise_pos], show_progress=False, queue=False)
+                    with gr.Row():
+                        gr.Markdown("")
+                        run_gen = gr.Button(value="Run(运行)!", scale=0.3, elem_classes='run')
+                        gr.Markdown("")
+                    def exp_gen_click():
+                        """Return 512 for both size sliders (all examples are 512x512, refresh draw_img)."""
+                        width_reset = gr.Slider(value=512)
+                        height_reset = gr.Slider(value=512)
+                        return [width_reset, height_reset]
+                    exp_gen = gr.Examples(
+                        [
+                            ['一只浣熊站在黑板前，上面写着"深度学习"', "example_images/gen1.png", "Manual-draw(手绘)", "↕", False, 4, 81808278],
+                            ['一个儿童蜡笔画，森林里有一个可爱的蘑菇形状的房子，标题是"森林小屋"', "example_images/gen16.png", "Manual-draw(手绘)", "↕", False, 4, 40173333],
+                            ['一个精美设计的logo，画的是一个黑白风格的厨师，带着厨师帽，logo下方写着“深夜食堂”', "example_images/gen14.png", "Manual-draw(手绘)", "↕", False, 4, 6970544],
+                            ['photo of caramel macchiato coffee on the table, top-down perspective, with "Any" "Text" written on it using cream', "example_images/gen9.png", "Manual-draw(手绘)", "↕", False, 4, 66273235],
+                            ['一张户外雪地靴的电商广告，上面写着 “双12大促！”，“立减50”，“加绒加厚”，“穿脱方便”，“温暖24小时送达”， “包邮”，高级设计感，精美构图', "example_images/gen15.png", "Manual-draw(手绘)", "↕", False, 4, 66980376],
+                            ['Sign on the clean building that reads "科学" and "과학"  and "ステップ" and "SCIENCE"', "example_images/gen6.png", "Manual-draw(手绘)", "↕", True, 4, 13246309],
+                            ['一个精致的马克杯，上面雕刻着一首中国古诗，内容是 "花落知多少" "夜来风雨声" "处处闻啼鸟" "春眠不觉晓"', "example_images/gen3.png", "Manual-draw(手绘)", "↔", False, 4, 60358279],
+                            ['A delicate square cake, cream and fruit, with "CHEERS" "to the" and "GRADUATE" written in chocolate', "example_images/gen8.png", "Manual-draw(手绘)", "↕", False, 4, 93424638],
+                            ['一件精美的毛衣，上面有针织的文字："通义丹青"', "example_images/gen4.png", "Manual-draw(手绘)", "↕", False, 4, 48769450],
+                            ['一个双肩包的特写照，上面用针织文字写着”为了无法“ ”计算的价值“', "example_images/gen12.png", "Manual-draw(手绘)", "↕", False, 4, 35552323],
+                            ['A nice drawing in pencil of Michael Jackson,  with the words "Micheal" and "Jackson" written on it', "example_images/gen7.png", "Manual-draw(手绘)", "↕", False, 4, 83866922],
+                            ['一个漂亮的蜡笔画，有行星，宇航员，还有宇宙飞船，上面写的是"去火星旅行", "王小明", "11月1日"', "example_images/gen5.png", "Manual-draw(手绘)", "↕", False, 4, 42328250],
+                            ['一个装饰华丽的蛋糕，上面用奶油写着“阿里云”和"APSARA"', "example_images/gen13.png", "Manual-draw(手绘)", "↕", False, 4, 62357019],
+                            ['一张关于墙上的彩色涂鸦艺术的摄影作品，上面写着“人工智能" 和 "神经网络"', "example_images/gen10.png", "Manual-draw(手绘)", "↕", False, 4, 64722007],
+                            ['一枚中国古代铜钱,  上面的文字是 "康"  "寶" "通" "熙"', "example_images/gen2.png", "Manual-draw(手绘)", "↕", False, 4, 24375031],
+                            ['a well crafted ice sculpture that made with "Happy" and "Holidays". Dslr photo, perfect illumination', "example_images/gen11.png", "Manual-draw(手绘)", "↕", True, 4, 64901362],
+                        ],
+                        [prompt, draw_img, pos_radio, sort_radio, revise_pos, img_count, seed],
+                        examples_per_page=5,
+                    )
+                    exp_gen.dataset.click(exp_gen_click, None, [image_width, image_height])
+                with gr.Tab("🎨Text Editing(文字编辑)") as mode_edit:
+                    with gr.Row(variant='compact'):
+                        ref_img = gr.Image(label='Ref(参考图)', source='upload')
+                        ori_img = gr.Image(label='Ori(原图)')
+                    def upload_ref(x):
+                        """Return a sketch-tool update for the reference image and ``x`` as the original image."""
+                        sketchable_ref = gr.Image(type="numpy", brush_radius=60, tool='sketch')
+                        original_copy = gr.Image(value=x)
+                        return [sketchable_ref, original_copy]
+                    def clear_ref(x):
+                        """Return an upload-source update with no drawing tool; ``x`` is not used."""
+                        upload_only = gr.Image(source='upload', tool=None)
+                        return upload_only
+                    ref_img.upload(upload_ref, ref_img, [ref_img, ori_img])
+                    ref_img.clear(clear_ref, ref_img, ref_img)
+                    with gr.Row():
+                        gr.Markdown("")
+                        run_edit = gr.Button(value="Run(运行)!", scale=0.3, elem_classes='run')
+                        gr.Markdown("")
+                    gr.Examples(
+                        [
+                            ['精美的书法作品，上面写着“志” “存” “高” ”远“', "example_images/ref10.jpg", "example_images/edit10.png", 4, 98053044],
+                            ['一个表情包，小猪说 "下班"', "example_images/ref2.jpg", "example_images/edit2.png", 2, 43304008],
+                            ['Characters written in chalk on the blackboard that says "DADDY"', "example_images/ref8.jpg", "example_images/edit8.png", 4, 73556391],
+                            ['一个中国古代铜钱，上面写着"乾" "隆"', "example_images/ref12.png", "example_images/edit12.png", 4, 89159482],
+                            ['黑板上写着"Here"', "example_images/ref11.jpg", "example_images/edit11.png", 2, 15353513],
+                            ['A letter picture that says "THER"', "example_images/ref6.jpg", "example_images/edit6.png", 4, 72321415],
+                            ['一堆水果, 中间写着“UIT”', "example_images/ref13.jpg", "example_images/edit13.png", 4, 54263567],
+                            ['一个漫画，上面写着" "', "example_images/ref14.png", "example_images/edit14.png", 4, 94081527],
+                            ['一个黄色标志牌，上边写着"不要" 和 "大意"', "example_images/ref3.jpg", "example_images/edit3.png", 2, 64010349],
+                            ['A cake with colorful characters that reads "EVERYDAY"', "example_images/ref7.jpg", "example_images/edit7.png", 4, 8943410],
+                            ['一个青铜鼎，上面写着"  "和"  "', "example_images/ref4.jpg", "example_images/edit4.png", 4, 71139289],
+                            ['一个建筑物前面的字母标牌， 上面写着 " "', "example_images/ref5.jpg", "example_images/edit5.png", 4, 50416289],
+                        ],
+                        [prompt, ori_img, ref_img, img_count, seed],
+                        examples_per_page=5,
+                    )
+        with gr.Column():
+            result_gallery = gr.Gallery(label='Result(结果)', show_label=True, preview=True, columns=2, allow_preview=True, height=600)
+            result_info = gr.Markdown('', visible=False)
+    ips = [prompt, pos_radio, sort_radio, revise_pos, show_debug, draw_img, rect_img, ref_img, ori_img, img_count, ddim_steps, image_width, image_height, strength, cfg_scale, seed, eta, a_prompt, n_prompt, *(rect_cb_list+rect_xywh_list)]
+    run_gen.click(fn=process, inputs=[gr.State('gen')] + ips, outputs=[result_gallery, result_info])
+    run_edit.click(fn=process, inputs=[gr.State('edit')] + ips, outputs=[result_gallery, result_info])
+# Bind to all interfaces only when GRADIO_LISTEN is set to a non-empty value.
+_listen_all = os.getenv('GRADIO_LISTEN', '') != ''
+# Optional proxy prefix; used as the root path with a leading slash when set.
+_proxy_path = os.getenv('GRADIO_PROXY_PATH')
+block.launch(
+    server_name='0.0.0.0' if _listen_all else "127.0.0.1",
+    share=False,
+    root_path=f"/{_proxy_path}" if _proxy_path else ""
+)
+# block.launch(server_name='0.0.0.0')

bert_tokenizer.py ADDED Viewed

	@@ -0,0 +1,421 @@

+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+from __future__ import absolute_import, division, print_function
+import collections
+import re
+import unicodedata
+import six
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+    """Raise ``ValueError`` when ``do_lower_case`` contradicts the checkpoint name.
+
+    The model directory name is pulled out of ``init_checkpoint`` and looked
+    up in two hard-coded lists of known lowercased and cased models.  Nothing
+    happens when the checkpoint is empty, when its path does not match the
+    ``<model>/bert_model.ckpt`` pattern, or when the model is in neither list.
+    """
+    # The casing flag is supplied by the user and is not stored with the
+    # checkpoint, so the only available check is a heuristic on the path.
+    if not init_checkpoint:
+        return
+    match = re.match('^.*?([A-Za-z0-9_-]+)/bert_model.ckpt', init_checkpoint)
+    if match is None:
+        return
+    model_name = match.group(1)
+    lower_models = (
+        'uncased_L-24_H-1024_A-16',
+        'uncased_L-12_H-768_A-12',
+        'multilingual_L-12_H-768_A-12',
+        'chinese_L-12_H-768_A-12',
+    )
+    cased_models = (
+        'cased_L-12_H-768_A-12',
+        'cased_L-24_H-1024_A-16',
+        'multi_cased_L-12_H-768_A-12',
+    )
+    if model_name in lower_models and not do_lower_case:
+        actual_flag, case_name, opposite_flag = 'False', 'lowercased', 'True'
+    elif model_name in cased_models and do_lower_case:
+        actual_flag, case_name, opposite_flag = 'True', 'cased', 'False'
+    else:
+        # Flag and model agree, or the model is unknown.
+        return
+    raise ValueError(
+        'You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. '
+        'However, `%s` seems to be a %s model, so you '
+        'should pass in `--do_lower_case=%s` so that the fine-tuning matches '
+        'how the model was pre-training. If this error is wrong, please '
+        'just comment out this check.' %
+        (actual_flag, init_checkpoint, model_name, case_name,
+         opposite_flag))
+def convert_to_unicode(text):
+    """Return ``text`` as a Unicode string, decoding byte strings as UTF-8.
+
+    Undecodable bytes are dropped (``'ignore'``).  Raises ``ValueError`` for
+    any other input type, or when neither ``six.PY3`` nor ``six.PY2`` is set.
+    """
+    if six.PY3:
+        if isinstance(text, str):
+            return text
+        if isinstance(text, bytes):
+            return text.decode('utf-8', 'ignore')
+        raise ValueError('Unsupported string type: %s' % (type(text)))
+    if six.PY2:
+        # On Python 2 ``str`` is the byte string and ``unicode`` the text type.
+        if isinstance(text, str):
+            return text.decode('utf-8', 'ignore')
+        if isinstance(text, unicode):
+            return text
+        raise ValueError('Unsupported string type: %s' % (type(text)))
+    raise ValueError('Not running on Python2 or Python 3?')
+def printable_text(text):
+    """Returns text encoded in a way suitable for print or `tf.logging`."""
+    # These functions want `str` for both Python2 and Python3, but in one case
+    # it's a Unicode string and in the other it's a byte string.
+    if six.PY3:
+        if isinstance(text, str):
+            return text
+        elif isinstance(text, bytes):
+            return text.decode('utf-8', 'ignore')
+        else:
+            raise ValueError('Unsupported string type: %s' % (type(text)))
+    elif six.PY2:
+        if isinstance(text, str):
+            return text
+        elif isinstance(text, unicode):
+            return text.encode('utf-8')
+        else:
+            raise ValueError('Unsupported string type: %s' % (type(text)))
+    else:
+        raise ValueError('Not running on Python2 or Python 3?')
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    index = 0
+    with open(vocab_file, 'r', encoding='utf-8') as reader:
+        while True:
+            token = convert_to_unicode(reader.readline())
+            if not token:
+                break
+            token = token.strip()
+            vocab[token] = index
+            index += 1
+    return vocab
+def convert_by_vocab(vocab, items):
+    """Converts a sequence of [tokens|ids] using the vocab."""
+    output = []
+    for item in items:
+        output.append(vocab[item])
+    return output
+def convert_tokens_to_ids(vocab, tokens):
+    return convert_by_vocab(vocab, tokens)
+def convert_ids_to_tokens(inv_vocab, ids):
+    return convert_by_vocab(inv_vocab, ids)
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+class FullTokenizer(object):
+    """Runs end-to-end tokenziation."""
+    def __init__(self, vocab_file, do_lower_case=True):
+        self.vocab = load_vocab(vocab_file)
+        self.inv_vocab = {v: k for k, v in self.vocab.items()}
+        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+    def tokenize(self, text):
+        split_tokens = []
+        for token in self.basic_tokenizer.tokenize(text):
+            for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                split_tokens.append(sub_token)
+        return split_tokens
+    def convert_tokens_to_ids(self, tokens):
+        return convert_by_vocab(self.vocab, tokens)
+    def convert_ids_to_tokens(self, ids):
+        return convert_by_vocab(self.inv_vocab, ids)
+    @staticmethod
+    def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
+        """ Converts a sequence of tokens (string) in a single string. """
+        def clean_up_tokenization(out_string):
+            """ Clean up a list of simple English tokenization artifacts
+            like spaces before punctuations and abreviated forms.
+            """
+            out_string = (
+                out_string.replace(' .', '.').replace(' ?', '?').replace(
+                    ' !', '!').replace(' ,', ',').replace(" ' ", "'").replace(
+                        " n't", "n't").replace(" 'm", "'m").replace(
+                            " 's", "'s").replace(" 've",
+                                                 "'ve").replace(" 're", "'re"))
+            return out_string
+        text = ' '.join(tokens).replace(' ##', '').strip()
+        if clean_up_tokenization_spaces:
+            clean_text = clean_up_tokenization(text)
+            return clean_text
+        else:
+            return text
+    def vocab_size(self):
+        return len(self.vocab)
+class BasicTokenizer(object):
+    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+    def __init__(self, do_lower_case=True):
+        """Constructs a BasicTokenizer.
+        Args:
+          do_lower_case: Whether to lower case the input.
+        """
+        self.do_lower_case = do_lower_case
+    def tokenize(self, text):
+        """Tokenizes a piece of text."""
+        text = convert_to_unicode(text)
+        text = self._clean_text(text)
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        text = self._tokenize_chinese_chars(text)
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if self.do_lower_case:
+                token = token.lower()
+                token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token))
+        output_tokens = whitespace_tokenize(' '.join(split_tokens))
+        return output_tokens
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize('NFD', text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == 'Mn':
+                continue
+            output.append(char)
+        return ''.join(output)
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+        return [''.join(x) for x in output]
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(' ')
+                output.append(char)
+                output.append(' ')
+            else:
+                output.append(char)
+        return ''.join(output)
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like the all of the other languages.
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
+                or (cp >= 0x20000 and cp <= 0x2A6DF)
+                or (cp >= 0x2A700 and cp <= 0x2B73F)
+                or (cp >= 0x2B740 and cp <= 0x2B81F)
+                or (cp >= 0x2B820 and cp <= 0x2CEAF)
+                or (cp >= 0xF900 and cp <= 0xFAFF)
+                or (cp >= 0x2F800 and cp <= 0x2FA1F)):
+            return True
+        return False
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(' ')
+            else:
+                output.append(char)
+        return ''.join(output)
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenziation."""
+    def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+        This uses a greedy longest-match-first algorithm to perform tokenization
+        using the given vocabulary.
+        For example:
+          input = "unaffable"
+          output = ["un", "##aff", "##able"]
+        Args:
+          text: A single token or whitespace separated tokens. This should have
+            already been passed through `BasicTokenizer.
+        Returns:
+          A list of wordpiece tokens.
+        """
+        text = convert_to_unicode(text)
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = ''.join(chars[start:end])
+                    if start > 0:
+                        substr = '##' + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+def _is_whitespace(char):
+    """Checks whether `chars` is a whitespace character."""
+    # \t, \n, and \r are technically contorl characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == ' ' or char == '\t' or char == '\n' or char == '\r':
+        return True
+    cat = unicodedata.category(char)
+    if cat == 'Zs':
+        return True
+    return False
+def _is_control(char):
+    """Checks whether `chars` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == '\t' or char == '\n' or char == '\r':
+        return False
+    cat = unicodedata.category(char)
+    if cat in ('Cc', 'Cf'):
+        return True
+    return False
+def _is_punctuation(char):
+    """Checks whether `chars` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
+            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith('P'):
+        return True
+    return False

cldm/cldm.py ADDED Viewed

	@@ -0,0 +1,617 @@

+import einops
+import torch
+import torch as th
+import torch.nn as nn
+import copy
+from easydict import EasyDict as edict
+from ldm.modules.diffusionmodules.util import (
+    conv_nd,
+    linear,
+    zero_module,
+    timestep_embedding,
+)
+from einops import rearrange, repeat
+from torchvision.utils import make_grid
+from ldm.modules.attention import SpatialTransformer
+from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock
+from ldm.models.diffusion.ddpm import LatentDiffusion
+from ldm.util import log_txt_as_img, exists, instantiate_from_config
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
+from .recognizer import TextRecognizer, create_predictor
+def count_parameters(model):
+    return sum(p.numel() for p in model.parameters() if p.requires_grad)
+class ControlledUnetModel(UNetModel):
+    def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs):
+        hs = []
+        with torch.no_grad():
+            t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+            emb = self.time_embed(t_emb)
+            h = x.type(self.dtype)
+            for module in self.input_blocks:
+                h = module(h, emb, context)
+                hs.append(h)
+            h = self.middle_block(h, emb, context)
+        if control is not None:
+            h += control.pop()
+        for i, module in enumerate(self.output_blocks):
+            if only_mid_control or control is None:
+                h = torch.cat([h, hs.pop()], dim=1)
+            else:
+                h = torch.cat([h, hs.pop() + control.pop()], dim=1)
+            h = module(h, emb, context)
+        h = h.type(x.dtype)
+        return self.out(h)
+class ControlNet(nn.Module):
+    """Control branch that turns glyph/position hints into UNet residuals.
+    It holds a stack of input blocks plus a middle block, each followed by a
+    1x1 convolution built by `make_zero_conv`, and three extra modules
+    (`glyph_block`, `position_block`, `fuse_block`) that encode the text hints
+    into a feature map added after the first input block. `forward` returns
+    the list of zero-conv outputs.
+    """
+    def __init__(
+            self,
+            image_size,
+            in_channels,
+            model_channels,
+            glyph_channels,
+            position_channels,
+            num_res_blocks,
+            attention_resolutions,
+            dropout=0,
+            channel_mult=(1, 2, 4, 8),
+            conv_resample=True,
+            dims=2,
+            use_checkpoint=False,
+            use_fp16=False,
+            num_heads=-1,
+            num_head_channels=-1,
+            num_heads_upsample=-1,
+            use_scale_shift_norm=False,
+            resblock_updown=False,
+            use_new_attention_order=False,
+            use_spatial_transformer=False,  # custom transformer support
+            transformer_depth=1,  # custom transformer support
+            context_dim=None,  # custom transformer support
+            n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
+            legacy=True,
+            disable_self_attentions=None,
+            num_attention_blocks=None,
+            disable_middle_self_attn=False,
+            use_linear_in_transformer=False,
+    ):
+        """Builds the input blocks, hint encoders, middle block and zero convs.
+        Args (only those whose role is visible in this constructor):
+          in_channels: Channels of the tensor fed to the first convolution.
+          model_channels: Base width; level `i` uses
+            `channel_mult[i] * model_channels` channels.
+          glyph_channels: Input channels of `glyph_block`.
+          position_channels: Input channels of `position_block`.
+          num_res_blocks: An int (same count on every level) or a per-level
+            sequence with the same length as `channel_mult`.
+          attention_resolutions: Downsample factors (1, 2, 4, ...) at which an
+            attention layer follows each residual block.
+          num_heads / num_head_channels: At least one must differ from -1.
+          use_spatial_transformer / context_dim: Must be given together.
+        Raises:
+          ValueError: If `num_res_blocks` is a sequence of the wrong length.
+          AssertionError: On the inconsistent argument combinations checked
+            below.
+        """
+        super().__init__()
+        if use_spatial_transformer:
+            assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
+        if context_dim is not None:
+            assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
+            from omegaconf.listconfig import ListConfig
+            # Convert an omegaconf list into a plain Python list.
+            if type(context_dim) == ListConfig:
+                context_dim = list(context_dim)
+        if num_heads_upsample == -1:
+            num_heads_upsample = num_heads
+        if num_heads == -1:
+            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
+        if num_head_channels == -1:
+            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
+        self.dims = dims
+        self.image_size = image_size
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        if isinstance(num_res_blocks, int):
+            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
+        else:
+            if len(num_res_blocks) != len(channel_mult):
+                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
+                                 "as a list/tuple (per-level) with the same length as channel_mult")
+            self.num_res_blocks = num_res_blocks
+        if disable_self_attentions is not None:
+            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
+            assert len(disable_self_attentions) == len(channel_mult)
+        if num_attention_blocks is not None:
+            assert len(num_attention_blocks) == len(self.num_res_blocks)
+            assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
+            # NOTE(review): the message names UNetModel although this is the
+            # ControlNet constructor.
+            print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
+                  f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
+                  f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
+                  f"attention will still not be set.")
+        self.attention_resolutions = attention_resolutions
+        self.dropout = dropout
+        self.channel_mult = channel_mult
+        self.conv_resample = conv_resample
+        self.use_checkpoint = use_checkpoint
+        self.dtype = th.float16 if use_fp16 else th.float32
+        self.num_heads = num_heads
+        self.num_head_channels = num_head_channels
+        self.num_heads_upsample = num_heads_upsample
+        self.predict_codebook_ids = n_embed is not None
+        time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
+        )
+        # First input block: a single 3x3 convolution to `model_channels`.
+        self.input_blocks = nn.ModuleList(
+            [
+                TimestepEmbedSequential(
+                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
+                )
+            ]
+        )
+        # One zero conv per input block; kept in step with `input_blocks`.
+        self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)])
+        # Glyph encoder: 3x3 convs with SiLU, four of them with stride 2,
+        # ending in 256 channels.
+        self.glyph_block = TimestepEmbedSequential(
+            conv_nd(dims, glyph_channels, 8, 3, padding=1),
+            nn.SiLU(),
+            conv_nd(dims, 8, 8, 3, padding=1),
+            nn.SiLU(),
+            conv_nd(dims, 8, 16, 3, padding=1, stride=2),
+            nn.SiLU(),
+            conv_nd(dims, 16, 16, 3, padding=1),
+            nn.SiLU(),
+            conv_nd(dims, 16, 32, 3, padding=1, stride=2),
+            nn.SiLU(),
+            conv_nd(dims, 32, 32, 3, padding=1),
+            nn.SiLU(),
+            conv_nd(dims, 32, 96, 3, padding=1, stride=2),
+            nn.SiLU(),
+            conv_nd(dims, 96, 96, 3, padding=1),
+            nn.SiLU(),
+            conv_nd(dims, 96, 256, 3, padding=1, stride=2),
+            nn.SiLU(),
+        )
+        # Position encoder: same pattern with three stride-2 convs, ending in
+        # 64 channels. It downsamples one step less than `glyph_block`, so the
+        # glyph input is presumably twice the spatial size of the position
+        # input (the two outputs are concatenated in `forward`) - TODO confirm.
+        self.position_block = TimestepEmbedSequential(
+            conv_nd(dims, position_channels, 8, 3, padding=1),
+            nn.SiLU(),
+            conv_nd(dims, 8, 8, 3, padding=1),
+            nn.SiLU(),
+            conv_nd(dims, 8, 16, 3, padding=1, stride=2),
+            nn.SiLU(),
+            conv_nd(dims, 16, 16, 3, padding=1),
+            nn.SiLU(),
+            conv_nd(dims, 16, 32, 3, padding=1, stride=2),
+            nn.SiLU(),
+            conv_nd(dims, 32, 32, 3, padding=1),
+            nn.SiLU(),
+            conv_nd(dims, 32, 64, 3, padding=1, stride=2),
+            nn.SiLU(),
+        )
+        # Fuses 256 glyph + 64 position channels plus 4 more, which `forward`
+        # fills with `text_info['masked_x']`.
+        self.fuse_block = zero_module(conv_nd(dims, 256+64+4, model_channels, 3, padding=1))
+        self._feature_size = model_channels
+        input_block_chans = [model_channels]
+        ch = model_channels
+        # Current downsample factor; doubled after every downsampling block.
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for nr in range(self.num_res_blocks[level]):
+                layers = [
+                    ResBlock(
+                        ch,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=mult * model_channels,
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = mult * model_channels
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+                    if legacy:
+                        # num_heads = 1
+                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
+                    # Attention is added to the first
+                    # `num_attention_blocks[level]` residual blocks only (to
+                    # all of them when that option is not given).
+                    if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
+                        layers.append(
+                            AttentionBlock(
+                                ch,
+                                use_checkpoint=use_checkpoint,
+                                num_heads=num_heads,
+                                num_head_channels=dim_head,
+                                use_new_attention_order=use_new_attention_order,
+                            ) if not use_spatial_transformer else SpatialTransformer(
+                                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
+                                disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
+                                use_checkpoint=use_checkpoint
+                            )
+                        )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                self.zero_convs.append(self.make_zero_conv(ch))
+                self._feature_size += ch
+                input_block_chans.append(ch)
+            # Every level except the last ends with a downsampling block.
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True,
+                        )
+                        if resblock_updown
+                        else Downsample(
+                            ch, conv_resample, dims=dims, out_channels=out_ch
+                        )
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+                self.zero_convs.append(self.make_zero_conv(ch))
+                ds *= 2
+                self._feature_size += ch
+        # Head configuration for the middle block, using the final width `ch`.
+        if num_head_channels == -1:
+            dim_head = ch // num_heads
+        else:
+            num_heads = ch // num_head_channels
+            dim_head = num_head_channels
+        if legacy:
+            # num_heads = 1
+            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+        # Middle block: ResBlock -> attention -> ResBlock.
+        self.middle_block = TimestepEmbedSequential(
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+            AttentionBlock(
+                ch,
+                use_checkpoint=use_checkpoint,
+                num_heads=num_heads,
+                num_head_channels=dim_head,
+                use_new_attention_order=use_new_attention_order,
+            ) if not use_spatial_transformer else SpatialTransformer(  # always uses a self-attn
+                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
+                disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
+                use_checkpoint=use_checkpoint
+            ),
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+        )
+        self.middle_block_out = self.make_zero_conv(ch)
+        self._feature_size += ch
+    def make_zero_conv(self, channels):
+        """Returns a 1x1 conv (`channels` -> `channels`) wrapped in `zero_module`."""
+        return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))
+    def forward(self, x, hint, text_info, timesteps, context, **kwargs):
+        """Computes the list of control residuals.
+        Args:
+          x: Input tensor with four dimensions; cast to `self.dtype`.
+          hint: Accepted for interface compatibility; not used in this method.
+          text_info: Dict providing 'glyphs' and 'positions' (sequences of
+            tensors that are concatenated along dim 1) and 'masked_x'.
+          timesteps: Passed to `timestep_embedding`.
+          context: Conditioning forwarded to every block.
+          **kwargs: Ignored.
+        Returns:
+          A list with one zero-conv output per input block, followed by the
+          `middle_block_out` output.
+        """
+        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+        emb = self.time_embed(t_emb)
+        # guided_hint from text_info
+        # B, C, H, W are unpacked but not used below.
+        B, C, H, W = x.shape
+        # Collapse all per-line maps into one single-channel map by summing
+        # over the concatenated channel dimension.
+        glyphs = torch.cat(text_info['glyphs'], dim=1).sum(dim=1, keepdim=True)
+        positions = torch.cat(text_info['positions'], dim=1).sum(dim=1, keepdim=True)
+        enc_glyph = self.glyph_block(glyphs, emb, context)
+        enc_pos = self.position_block(positions, emb, context)
+        guided_hint = self.fuse_block(torch.cat([enc_glyph, enc_pos, text_info['masked_x']], dim=1))
+        outs = []
+        h = x.type(self.dtype)
+        for module, zero_conv in zip(self.input_blocks, self.zero_convs):
+            # The fused hint is added exactly once, to the output of the first
+            # input block; it is then cleared.
+            if guided_hint is not None:
+                h = module(h, emb, context)
+                h += guided_hint
+                guided_hint = None
+            else:
+                h = module(h, emb, context)
+            outs.append(zero_conv(h, emb, context))
+        h = self.middle_block(h, emb, context)
+        outs.append(self.middle_block_out(h, emb, context))
+        return outs
+class ControlLDM(LatentDiffusion):
+    def __init__(self, control_stage_config, control_key, glyph_key, position_key, only_mid_control, loss_alpha=0, loss_beta=0, with_step_weight=False, use_vae_upsample=False, latin_weight=1.0, embedding_manager_config=None, *args, **kwargs):
+        """Builds the control model and the optional embedding manager / OCR recognizer.
+        Args:
+          control_stage_config: Config passed to `instantiate_from_config` to
+            build `self.control_model`.
+          control_key, glyph_key, position_key: Batch keys read by `get_input`.
+          only_mid_control: Forwarded to the diffusion model in `apply_model`.
+          loss_alpha, loss_beta, with_step_weight, use_vae_upsample,
+            latin_weight: Stored on the instance; not otherwise used here
+            apart from the `loss_alpha` / `loss_beta` check below.
+          embedding_manager_config: Optional config; an embedding manager is
+            created only when it is given and its `params.valid` is truthy.
+          *args, **kwargs: Forwarded to the parent constructor.
+        """
+        super().__init__(*args, **kwargs)
+        self.control_model = instantiate_from_config(control_stage_config)
+        self.control_key = control_key
+        self.glyph_key = glyph_key
+        self.position_key = position_key
+        self.only_mid_control = only_mid_control
+        # One multiplier per control output; `apply_model` zips these with the
+        # control model's outputs.
+        self.control_scales = [1.0] * 13
+        self.loss_alpha = loss_alpha
+        self.loss_beta = loss_beta
+        self.with_step_weight = with_step_weight
+        self.use_vae_upsample = use_vae_upsample
+        self.latin_weight = latin_weight
+        if embedding_manager_config is not None and embedding_manager_config.params.valid:
+            self.embedding_manager = self.instantiate_embedding_manager(embedding_manager_config, self.cond_stage_model)
+            for param in self.embedding_manager.embedding_parameters():
+                param.requires_grad = True
+        else:
+            self.embedding_manager = None
+        if self.loss_alpha > 0 or self.loss_beta > 0 or self.embedding_manager:
+            # NOTE(review): `embedding_manager_config` defaults to None; with a
+            # positive loss_alpha/loss_beta and no config this attribute access
+            # fails on None - confirm callers always pass a config here.
+            if embedding_manager_config.params.emb_type == 'ocr':
+                self.text_predictor = create_predictor().eval()
+                # Rebinds the local name `args` (the *args tuple has already
+                # been consumed by the parent constructor above).
+                args = edict()
+                args.rec_image_shape = "3, 48, 320"
+                args.rec_batch_num = 6
+                args.rec_char_dict_path = './ocr_recog/ppocr_keys_v1.txt'
+                self.cn_recognizer = TextRecognizer(args, self.text_predictor)
+                # The recognizer's predictor is not trained.
+                for param in self.text_predictor.parameters():
+                    param.requires_grad = False
+                if self.embedding_manager:
+                    self.embedding_manager.recog = self.cn_recognizer
+    @torch.no_grad()
+    def get_input(self, batch, k, bs=None, *args, **kwargs):
+        """Prepares the latent input and the conditioning dict from a batch.
+        Args:
+          batch: Dict holding, besides what the parent `get_input` reads, the
+            entries under `self.control_key`, `self.glyph_key`,
+            `self.position_key` and the keys 'inv_mask', 'gly_line',
+            'n_lines', 'language', 'texts' and 'img'.
+          k: Not used; the parent is always called with `self.first_stage_key`.
+          bs: Optional number of leading samples to keep.
+          *args, **kwargs: Forwarded to the parent `get_input`.
+        Returns:
+          A tuple `(x, cond)` where `cond` has the keys 'c_crossattn',
+          'c_concat' and 'text_info'.
+        """
+        if self.embedding_manager is None:  # fill in full caption
+            self.fill_caption(batch)
+        x, c, mx = super().get_input(batch, self.first_stage_key, mask_k='masked_img', *args, **kwargs)
+        control = batch[self.control_key]  # for log_images and loss_alpha, not real control
+        if bs is not None:
+            control = control[:bs]
+        control = control.to(self.device)
+        control = einops.rearrange(control, 'b h w c -> b c h w')
+        control = control.to(memory_format=torch.contiguous_format).float()
+        inv_mask = batch['inv_mask']
+        if bs is not None:
+            inv_mask = inv_mask[:bs]
+        inv_mask = inv_mask.to(self.device)
+        inv_mask = einops.rearrange(inv_mask, 'b h w c -> b c h w')
+        inv_mask = inv_mask.to(memory_format=torch.contiguous_format).float()
+        glyphs = batch[self.glyph_key]
+        gly_line = batch['gly_line']
+        positions = batch[self.position_key]
+        n_lines = batch['n_lines']
+        language = batch['language']
+        texts = batch['texts']
+        assert len(glyphs) == len(positions)
+        # NOTE(review): the elements of the batch's own lists are replaced in
+        # place below, so calling this method twice on the same batch would
+        # apply the 'b h w c -> b c h w' rearrange twice - confirm callers
+        # never reuse a batch.
+        for i in range(len(glyphs)):
+            if bs is not None:
+                glyphs[i] = glyphs[i][:bs]
+                gly_line[i] = gly_line[i][:bs]
+                positions[i] = positions[i][:bs]
+                # Re-sliced on every iteration; `language` and `texts` are not
+                # truncated to `bs`.
+                n_lines = n_lines[:bs]
+            glyphs[i] = glyphs[i].to(self.device)
+            gly_line[i] = gly_line[i].to(self.device)
+            positions[i] = positions[i].to(self.device)
+            glyphs[i] = einops.rearrange(glyphs[i], 'b h w c -> b c h w')
+            gly_line[i] = einops.rearrange(gly_line[i], 'b h w c -> b c h w')
+            positions[i] = einops.rearrange(positions[i], 'b h w c -> b c h w')
+            glyphs[i] = glyphs[i].to(memory_format=torch.contiguous_format).float()
+            gly_line[i] = gly_line[i].to(memory_format=torch.contiguous_format).float()
+            positions[i] = positions[i].to(memory_format=torch.contiguous_format).float()
+        # Everything the control model and the text-related code need.
+        info = {}
+        info['glyphs'] = glyphs
+        info['positions'] = positions
+        info['n_lines'] = n_lines
+        info['language'] = language
+        info['texts'] = texts
+        info['img'] = batch['img']  # nhwc, (-1,1)
+        info['masked_x'] = mx
+        info['gly_line'] = gly_line
+        info['inv_mask'] = inv_mask
+        return x, dict(c_crossattn=[c], c_concat=[control], text_info=info)
+    def apply_model(self, x_noisy, t, cond, *args, **kwargs):
+        assert isinstance(cond, dict)
+        diffusion_model = self.model.diffusion_model
+        _cond = torch.cat(cond['c_crossattn'], 1)
+        _hint = torch.cat(cond['c_concat'], 1)
+        control = self.control_model(x=x_noisy, timesteps=t, context=_cond, hint=_hint, text_info=cond['text_info'])
+        control = [c * scale for c, scale in zip(control, self.control_scales)]
+        eps = diffusion_model(x=x_noisy, timesteps=t, context=_cond, control=control, only_mid_control=self.only_mid_control)
+        return eps
+    def instantiate_embedding_manager(self, config, embedder):
+        model = instantiate_from_config(config, embedder=embedder)
+        return model
+    @torch.no_grad()
+    def get_unconditional_conditioning(self, N):
+        return self.get_learned_conditioning(dict(c_crossattn=[[""] * N], text_info=None))
+    def get_learned_conditioning(self, c):
+        if self.cond_stage_forward is None:
+            if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode):
+                if self.embedding_manager is not None and c['text_info'] is not None:
+                    self.embedding_manager.encode_text(c['text_info'])
+                if isinstance(c, dict):
+                    cond_txt = c['c_crossattn'][0]
+                else:
+                    cond_txt = c
+                if self.embedding_manager is not None:
+                    cond_txt = self.cond_stage_model.encode(cond_txt, embedding_manager=self.embedding_manager)
+                else:
+                    cond_txt = self.cond_stage_model.encode(cond_txt)
+                if isinstance(c, dict):
+                    c['c_crossattn'][0] = cond_txt
+                else:
+                    c = cond_txt
+                if isinstance(c, DiagonalGaussianDistribution):
+                    c = c.mode()
+            else:
+                c = self.cond_stage_model(c)
+        else:
+            assert hasattr(self.cond_stage_model, self.cond_stage_forward)
+            c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
+        return c
+    def fill_caption(self, batch, place_holder='*'):
+        bs = len(batch['n_lines'])
+        cond_list = copy.deepcopy(batch[self.cond_stage_key])
+        for i in range(bs):
+            n_lines = batch['n_lines'][i]
+            if n_lines == 0:
+                continue
+            cur_cap = cond_list[i]
+            for j in range(n_lines):
+                r_txt = batch['texts'][j][i]
+                cur_cap = cur_cap.replace(place_holder, f'"{r_txt}"', 1)
+            cond_list[i] = cur_cap
+        batch[self.cond_stage_key] = cond_list
+    @torch.no_grad()
+    def log_images(self, batch, N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None,
+                   quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
+                   plot_diffusion_rows=False, unconditional_guidance_scale=9.0, unconditional_guidance_label=None,
+                   use_ema_scope=True,
+                   **kwargs):
+        use_ddim = ddim_steps is not None
+        log = dict()
+        z, c = self.get_input(batch, self.first_stage_key, bs=N)
+        if self.cond_stage_trainable:
+            with torch.no_grad():
+                c = self.get_learned_conditioning(c)
+        c_crossattn = c["c_crossattn"][0][:N]
+        c_cat = c["c_concat"][0][:N]
+        text_info = c["text_info"]
+        text_info['glyphs'] = [i[:N] for i in text_info['glyphs']]
+        text_info['gly_line'] = [i[:N] for i in text_info['gly_line']]
+        text_info['positions'] = [i[:N] for i in text_info['positions']]
+        text_info['n_lines'] = text_info['n_lines'][:N]
+        text_info['masked_x'] = text_info['masked_x'][:N]
+        text_info['img'] = text_info['img'][:N]
+        N = min(z.shape[0], N)
+        n_row = min(z.shape[0], n_row)
+        log["reconstruction"] = self.decode_first_stage(z)
+        log["masked_image"] = self.decode_first_stage(text_info['masked_x'])
+        log["control"] = c_cat * 2.0 - 1.0
+        log["img"] = text_info['img'].permute(0, 3, 1, 2)  # log source image if needed
+        # get glyph
+        glyph_bs = torch.stack(text_info['glyphs'])
+        glyph_bs = torch.sum(glyph_bs, dim=0) * 2.0 - 1.0
+        log["glyph"] = torch.nn.functional.interpolate(glyph_bs, size=(512, 512), mode='bilinear', align_corners=True,)
+        # fill caption
+        if not self.embedding_manager:
+            self.fill_caption(batch)
+        captions = batch[self.cond_stage_key]
+        log["conditioning"] = log_txt_as_img((512, 512), captions, size=16)
+        if plot_diffusion_rows:
+            # get diffusion row
+            diffusion_row = list()
+            z_start = z[:n_row]
+            for t in range(self.num_timesteps):
+                if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
+                    t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
+                    t = t.to(self.device).long()
+                    noise = torch.randn_like(z_start)
+                    z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
+                    diffusion_row.append(self.decode_first_stage(z_noisy))
+            diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
+            diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
+            diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
+            diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
+            log["diffusion_row"] = diffusion_grid
+        if sample:
+            # get denoise row
+            samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c], "text_info": text_info},
+                                                     batch_size=N, ddim=use_ddim,
+                                                     ddim_steps=ddim_steps, eta=ddim_eta)
+            x_samples = self.decode_first_stage(samples)
+            log["samples"] = x_samples
+            if plot_denoise_rows:
+                denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
+                log["denoise_row"] = denoise_grid
+        if unconditional_guidance_scale > 1.0:
+            uc_cross = self.get_unconditional_conditioning(N)
+            uc_cat = c_cat  # torch.zeros_like(c_cat)
+            uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross['c_crossattn'][0]], "text_info": text_info}
+            samples_cfg, tmps = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c_crossattn], "text_info": text_info},
+                                                batch_size=N, ddim=use_ddim,
+                                                ddim_steps=ddim_steps, eta=ddim_eta,
+                                                unconditional_guidance_scale=unconditional_guidance_scale,
+                                                unconditional_conditioning=uc_full,
+                                                )
+            x_samples_cfg = self.decode_first_stage(samples_cfg)
+            log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
+            pred_x0 = False  # wether log pred_x0
+            if pred_x0:
+                for idx in range(len(tmps['pred_x0'])):
+                    pred_x0 = self.decode_first_stage(tmps['pred_x0'][idx])
+                    log[f"pred_x0_{tmps['index'][idx]}"] = pred_x0
+        return log
+    @torch.no_grad()
+    def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
+        ddim_sampler = DDIMSampler(self)
+        b, c, h, w = cond["c_concat"][0].shape
+        shape = (self.channels, h // 8, w // 8)
+        samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, log_every_t=5, **kwargs)
+        return samples, intermediates
+    def configure_optimizers(self):
+        lr = self.learning_rate
+        params = list(self.control_model.parameters())
+        if self.embedding_manager:
+            params += list(self.embedding_manager.embedding_parameters())
+        if not self.sd_locked:
+            # params += list(self.model.diffusion_model.input_blocks.parameters())
+            # params += list(self.model.diffusion_model.middle_block.parameters())
+            params += list(self.model.diffusion_model.output_blocks.parameters())
+            params += list(self.model.diffusion_model.out.parameters())
+        if self.unlockKV:
+            nCount = 0
+            for name, param in self.model.diffusion_model.named_parameters():
+                if 'attn2.to_k' in name or 'attn2.to_v' in name:
+                    params += [param]
+                    nCount += 1
+            print(f'Cross attention is unlocked, and {nCount} Wk or Wv are added to potimizers!!!')
+        opt = torch.optim.AdamW(params, lr=lr)
+        return opt
+    def low_vram_shift(self, is_diffusing):
+        if is_diffusing:
+            self.model = self.model.cuda()
+            self.control_model = self.control_model.cuda()
+            self.first_stage_model = self.first_stage_model.cpu()
+            self.cond_stage_model = self.cond_stage_model.cpu()
+        else:
+            self.model = self.model.cpu()
+            self.control_model = self.control_model.cpu()
+            self.first_stage_model = self.first_stage_model.cuda()
+            self.cond_stage_model = self.cond_stage_model.cuda()

cldm/ddim_hacked.py ADDED Viewed

	@@ -0,0 +1,317 @@

+"""SAMPLING ONLY."""
+import torch
+import numpy as np
+from tqdm import tqdm
+from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, extract_into_tensor
+class DDIMSampler(object):
+    def __init__(self, model, schedule="linear", **kwargs):
+        super().__init__()
+        self.model = model
+        self.ddpm_num_timesteps = model.num_timesteps
+        self.schedule = schedule
+    def register_buffer(self, name, attr):
+        if type(attr) == torch.Tensor:
+            if attr.device != torch.device("cuda"):
+                attr = attr.to(torch.device("cuda"))
+        setattr(self, name, attr)
+    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
+        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
+                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
+        alphas_cumprod = self.model.alphas_cumprod
+        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
+        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
+        self.register_buffer('betas', to_torch(self.model.betas))
+        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
+        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
+        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
+        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
+        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
+        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
+        # ddim sampling parameters
+        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
+                                                                                   ddim_timesteps=self.ddim_timesteps,
+                                                                                   eta=ddim_eta,verbose=verbose)
+        self.register_buffer('ddim_sigmas', ddim_sigmas)
+        self.register_buffer('ddim_alphas', ddim_alphas)
+        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
+        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
+        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
+            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
+                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
+        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
+    @torch.no_grad()
+    def sample(self,
+               S,
+               batch_size,
+               shape,
+               conditioning=None,
+               callback=None,
+               normals_sequence=None,
+               img_callback=None,
+               quantize_x0=False,
+               eta=0.,
+               mask=None,
+               x0=None,
+               temperature=1.,
+               noise_dropout=0.,
+               score_corrector=None,
+               corrector_kwargs=None,
+               verbose=True,
+               x_T=None,
+               log_every_t=100,
+               unconditional_guidance_scale=1.,
+               unconditional_conditioning=None, # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
+               dynamic_threshold=None,
+               ucg_schedule=None,
+               **kwargs
+               ):
+        if conditioning is not None:
+            if isinstance(conditioning, dict):
+                ctmp = conditioning[list(conditioning.keys())[0]]
+                while isinstance(ctmp, list): ctmp = ctmp[0]
+                cbs = ctmp.shape[0]
+                if cbs != batch_size:
+                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
+            elif isinstance(conditioning, list):
+                for ctmp in conditioning:
+                    if ctmp.shape[0] != batch_size:
+                        print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
+            else:
+                if conditioning.shape[0] != batch_size:
+                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
+        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
+        # sampling
+        C, H, W = shape
+        size = (batch_size, C, H, W)
+        print(f'Data shape for DDIM sampling is {size}, eta {eta}')
+        samples, intermediates = self.ddim_sampling(conditioning, size,
+                                                    callback=callback,
+                                                    img_callback=img_callback,
+                                                    quantize_denoised=quantize_x0,
+                                                    mask=mask, x0=x0,
+                                                    ddim_use_original_steps=False,
+                                                    noise_dropout=noise_dropout,
+                                                    temperature=temperature,
+                                                    score_corrector=score_corrector,
+                                                    corrector_kwargs=corrector_kwargs,
+                                                    x_T=x_T,
+                                                    log_every_t=log_every_t,
+                                                    unconditional_guidance_scale=unconditional_guidance_scale,
+                                                    unconditional_conditioning=unconditional_conditioning,
+                                                    dynamic_threshold=dynamic_threshold,
+                                                    ucg_schedule=ucg_schedule
+                                                    )
+        return samples, intermediates
+    @torch.no_grad()
+    def ddim_sampling(self, cond, shape,
+                      x_T=None, ddim_use_original_steps=False,
+                      callback=None, timesteps=None, quantize_denoised=False,
+                      mask=None, x0=None, img_callback=None, log_every_t=100,
+                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+                      unconditional_guidance_scale=1., unconditional_conditioning=None, dynamic_threshold=None,
+                      ucg_schedule=None):
+        device = self.model.betas.device
+        b = shape[0]
+        if x_T is None:
+            img = torch.randn(shape, device=device)
+        else:
+            img = x_T
+        if timesteps is None:
+            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
+        elif timesteps is not None and not ddim_use_original_steps:
+            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
+            timesteps = self.ddim_timesteps[:subset_end]
+        intermediates = {'x_inter': [img], 'pred_x0': [img]}
+        time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
+        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
+        print(f"Running DDIM Sampling with {total_steps} timesteps")
+        iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
+        for i, step in enumerate(iterator):
+            index = total_steps - i - 1
+            ts = torch.full((b,), step, device=device, dtype=torch.long)
+            if mask is not None:
+                assert x0 is not None
+                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
+                img = img_orig * mask + (1. - mask) * img
+            if ucg_schedule is not None:
+                assert len(ucg_schedule) == len(time_range)
+                unconditional_guidance_scale = ucg_schedule[i]
+            outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
+                                      quantize_denoised=quantize_denoised, temperature=temperature,
+                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
+                                      corrector_kwargs=corrector_kwargs,
+                                      unconditional_guidance_scale=unconditional_guidance_scale,
+                                      unconditional_conditioning=unconditional_conditioning,
+                                      dynamic_threshold=dynamic_threshold)
+            img, pred_x0 = outs
+            if callback: callback(i)
+            if img_callback: img_callback(pred_x0, i)
+            if index % log_every_t == 0 or index == total_steps - 1:
+                intermediates['x_inter'].append(img)
+                intermediates['pred_x0'].append(pred_x0)
+        return img, intermediates
+    @torch.no_grad()
+    def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
+                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+                      unconditional_guidance_scale=1., unconditional_conditioning=None,
+                      dynamic_threshold=None):
+        b, *_, device = *x.shape, x.device
+        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
+            model_output = self.model.apply_model(x, t, c)
+        else:
+            model_t = self.model.apply_model(x, t, c)
+            model_uncond = self.model.apply_model(x, t, unconditional_conditioning)
+            model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond)
+        if self.model.parameterization == "v":
+            e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
+        else:
+            e_t = model_output
+        if score_corrector is not None:
+            assert self.model.parameterization == "eps", 'not implemented'
+            e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
+        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
+        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
+        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
+        sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
+        # select parameters corresponding to the currently considered timestep
+        a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
+        a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
+        sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
+        sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
+        # current prediction for x_0
+        if self.model.parameterization != "v":
+            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
+        else:
+            pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)
+        if quantize_denoised:
+            pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
+        if dynamic_threshold is not None:
+            raise NotImplementedError()
+        # direction pointing to x_t
+        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
+        noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
+        if noise_dropout > 0.:
+            noise = torch.nn.functional.dropout(noise, p=noise_dropout)
+        x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
+        return x_prev, pred_x0
+    @torch.no_grad()
+    def encode(self, x0, c, t_enc, use_original_steps=False, return_intermediates=None,
+               unconditional_guidance_scale=1.0, unconditional_conditioning=None, callback=None):
+        timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
+        num_reference_steps = timesteps.shape[0]
+        assert t_enc <= num_reference_steps
+        num_steps = t_enc
+        if use_original_steps:
+            alphas_next = self.alphas_cumprod[:num_steps]
+            alphas = self.alphas_cumprod_prev[:num_steps]
+        else:
+            alphas_next = self.ddim_alphas[:num_steps]
+            alphas = torch.tensor(self.ddim_alphas_prev[:num_steps])
+        x_next = x0
+        intermediates = []
+        inter_steps = []
+        for i in tqdm(range(num_steps), desc='Encoding Image'):
+            t = torch.full((x0.shape[0],), timesteps[i], device=self.model.device, dtype=torch.long)
+            if unconditional_guidance_scale == 1.:
+                noise_pred = self.model.apply_model(x_next, t, c)
+            else:
+                assert unconditional_conditioning is not None
+                e_t_uncond, noise_pred = torch.chunk(
+                    self.model.apply_model(torch.cat((x_next, x_next)), torch.cat((t, t)),
+                                           torch.cat((unconditional_conditioning, c))), 2)
+                noise_pred = e_t_uncond + unconditional_guidance_scale * (noise_pred - e_t_uncond)
+            xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next
+            weighted_noise_pred = alphas_next[i].sqrt() * (
+                    (1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt()) * noise_pred
+            x_next = xt_weighted + weighted_noise_pred
+            if return_intermediates and i % (
+                    num_steps // return_intermediates) == 0 and i < num_steps - 1:
+                intermediates.append(x_next)
+                inter_steps.append(i)
+            elif return_intermediates and i >= num_steps - 2:
+                intermediates.append(x_next)
+                inter_steps.append(i)
+            if callback: callback(i)
+        out = {'x_encoded': x_next, 'intermediate_steps': inter_steps}
+        if return_intermediates:
+            out.update({'intermediates': intermediates})
+        return x_next, out
+    @torch.no_grad()
+    def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
+        # fast, but does not allow for exact reconstruction
+        # t serves as an index to gather the correct alphas
+        if use_original_steps:
+            sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
+            sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
+        else:
+            sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
+            sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas
+        if noise is None:
+            noise = torch.randn_like(x0)
+        return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
+                extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)
+    @torch.no_grad()
+    def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
+               use_original_steps=False, callback=None):
+        timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
+        timesteps = timesteps[:t_start]
+        time_range = np.flip(timesteps)
+        total_steps = timesteps.shape[0]
+        print(f"Running DDIM Sampling with {total_steps} timesteps")
+        iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
+        x_dec = x_latent
+        for i, step in enumerate(iterator):
+            index = total_steps - i - 1
+            ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
+            x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
+                                          unconditional_guidance_scale=unconditional_guidance_scale,
+                                          unconditional_conditioning=unconditional_conditioning)
+            if callback: callback(i)
+        return x_dec

cldm/embedding_manager.py ADDED Viewed

	@@ -0,0 +1,165 @@

+'''
+Copyright (c) Alibaba, Inc. and its affiliates.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from functools import partial
+from ldm.modules.diffusionmodules.util import conv_nd, linear
+def get_clip_token_for_string(tokenizer, string):
+    batch_encoding = tokenizer(string, truncation=True, max_length=77, return_length=True,
+                               return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
+    tokens = batch_encoding["input_ids"]
+    assert torch.count_nonzero(tokens - 49407) == 2, f"String '{string}' maps to more than a single token. Please use another string"
+    return tokens[0, 1]
+def get_bert_token_for_string(tokenizer, string):
+    token = tokenizer(string)
+    assert torch.count_nonzero(token) == 3, f"String '{string}' maps to more than a single token. Please use another string"
+    token = token[0, 1]
+    return token
+def get_clip_vision_emb(encoder, processor, img):
+    _img = img.repeat(1, 3, 1, 1)*255
+    inputs = processor(images=_img, return_tensors="pt")
+    inputs['pixel_values'] = inputs['pixel_values'].to(img.device)
+    outputs = encoder(**inputs)
+    emb = outputs.image_embeds
+    return emb
+def get_recog_emb(encoder, img_list):
+    _img_list = [(img.repeat(1, 3, 1, 1)*255)[0] for img in img_list]
+    encoder.predictor.eval()
+    _, preds_neck = encoder.pred_imglist(_img_list, show_debug=False)
+    return preds_neck
+def pad_H(x):
+    _, _, H, W = x.shape
+    p_top = (W - H) // 2
+    p_bot = W - H - p_top
+    return F.pad(x, (0, 0, p_top, p_bot))
+class EncodeNet(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(EncodeNet, self).__init__()
+        chan = 16
+        n_layer = 4  # downsample
+        self.conv1 = conv_nd(2, in_channels, chan, 3, padding=1)
+        self.conv_list = nn.ModuleList([])
+        _c = chan
+        for i in range(n_layer):
+            self.conv_list.append(conv_nd(2, _c, _c*2, 3, padding=1, stride=2))
+            _c *= 2
+        self.conv2 = conv_nd(2, _c, out_channels, 3, padding=1)
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        self.act = nn.SiLU()
+    def forward(self, x):
+        x = self.act(self.conv1(x))
+        for layer in self.conv_list:
+            x = self.act(layer(x))
+        x = self.act(self.conv2(x))
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        return x
+class EmbeddingManager(nn.Module):
+    def __init__(
+            self,
+            embedder,
+            valid=True,
+            glyph_channels=20,
+            position_channels=1,
+            placeholder_string='*',
+            add_pos=False,
+            emb_type='ocr',
+            **kwargs
+    ):
+        super().__init__()
+        if hasattr(embedder, 'tokenizer'):  # using Stable Diffusion's CLIP encoder
+            get_token_for_string = partial(get_clip_token_for_string, embedder.tokenizer)
+            token_dim = 768
+            if hasattr(embedder, 'vit'):
+                assert emb_type == 'vit'
+                self.get_vision_emb = partial(get_clip_vision_emb, embedder.vit, embedder.processor)
+            self.get_recog_emb = None
+        else:  # using LDM's BERT encoder
+            get_token_for_string = partial(get_bert_token_for_string, embedder.tknz_fn)
+            token_dim = 1280
+        self.token_dim = token_dim
+        self.emb_type = emb_type
+        self.add_pos = add_pos
+        if add_pos:
+            self.position_encoder = EncodeNet(position_channels, token_dim)
+        if emb_type == 'ocr':
+            self.proj = linear(40*64, token_dim)
+        if emb_type == 'conv':
+            self.glyph_encoder = EncodeNet(glyph_channels, token_dim)
+        self.placeholder_token = get_token_for_string(placeholder_string)
+    def encode_text(self, text_info):
+        if self.get_recog_emb is None and self.emb_type == 'ocr':
+            self.get_recog_emb = partial(get_recog_emb, self.recog)
+        gline_list = []
+        pos_list = []
+        for i in range(len(text_info['n_lines'])):  # sample index in a batch
+            n_lines = text_info['n_lines'][i]
+            for j in range(n_lines):  # line
+                gline_list += [text_info['gly_line'][j][i:i+1]]
+                if self.add_pos:
+                    pos_list += [text_info['positions'][j][i:i+1]]
+        if len(gline_list) > 0:
+            if self.emb_type == 'ocr':
+                recog_emb = self.get_recog_emb(gline_list)
+                enc_glyph = self.proj(recog_emb.reshape(recog_emb.shape[0], -1))
+            elif self.emb_type == 'vit':
+                enc_glyph = self.get_vision_emb(pad_H(torch.cat(gline_list, dim=0)))
+            elif self.emb_type == 'conv':
+                enc_glyph = self.glyph_encoder(pad_H(torch.cat(gline_list, dim=0)))
+            if self.add_pos:
+                enc_pos = self.position_encoder(torch.cat(gline_list, dim=0))
+                enc_glyph = enc_glyph+enc_pos
+        self.text_embs_all = []
+        n_idx = 0
+        for i in range(len(text_info['n_lines'])):  # sample index in a batch
+            n_lines = text_info['n_lines'][i]
+            text_embs = []
+            for j in range(n_lines):  # line
+                text_embs += [enc_glyph[n_idx:n_idx+1]]
+                n_idx += 1
+            self.text_embs_all += [text_embs]
+    def forward(
+            self,
+            tokenized_text,
+            embedded_text,
+    ):
+        b, device = tokenized_text.shape[0], tokenized_text.device
+        for i in range(b):
+            idx = tokenized_text[i] == self.placeholder_token.to(device)
+            if sum(idx) > 0:
+                if i >= len(self.text_embs_all):
+                    print('truncation for log images...')
+                    break
+                text_emb = torch.cat(self.text_embs_all[i], dim=0)
+                if sum(idx) != len(text_emb):
+                    print('truncation for long caption...')
+                embedded_text[i][idx] = text_emb[:sum(idx)]
+        return embedded_text
+    def embedding_parameters(self):
+        return self.parameters()

cldm/hack.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import torch
+import einops
+import ldm.modules.encoders.modules
+import ldm.modules.attention
+from transformers import logging
+from ldm.modules.attention import default
+def disable_verbosity():
+    logging.set_verbosity_error()
+    print('logging improved.')
+    return
+def enable_sliced_attention():
+    ldm.modules.attention.CrossAttention.forward = _hacked_sliced_attentin_forward
+    print('Enabled sliced_attention.')
+    return
+def hack_everything(clip_skip=0):
+    disable_verbosity()
+    ldm.modules.encoders.modules.FrozenCLIPEmbedder.forward = _hacked_clip_forward
+    ldm.modules.encoders.modules.FrozenCLIPEmbedder.clip_skip = clip_skip
+    print('Enabled clip hacks.')
+    return
+# Written by Lvmin
+def _hacked_clip_forward(self, text):
+    """Encode prompts as three consecutive 75-token windows and concatenate the results.
+    Each prompt is tokenized without special tokens or truncation, cut into three
+    75-token windows, and each window is wrapped in BOS/EOS and padded to 77 ids.
+    The windows are encoded as one flat batch and joined along the sequence axis,
+    giving 3 * 77 positions per prompt. Tokens beyond the third window are dropped.
+    """
+    PAD = self.tokenizer.pad_token_id
+    EOS = self.tokenizer.eos_token_id
+    BOS = self.tokenizer.bos_token_id
+    def tokenize(t):
+        # Raw ids only: BOS/EOS are added per window further down.
+        return self.tokenizer(t, truncation=False, add_special_tokens=False)["input_ids"]
+    def transformer_encode(t):
+        if self.clip_skip > 1:
+            # Take the clip_skip-th hidden state from the end and apply the final layer norm.
+            rt = self.transformer(input_ids=t, output_hidden_states=True)
+            return self.transformer.text_model.final_layer_norm(rt.hidden_states[-self.clip_skip])
+        else:
+            return self.transformer(input_ids=t, output_hidden_states=False).last_hidden_state
+    def split(x):
+        # Exactly three windows; anything past index 225 is ignored.
+        return x[75 * 0: 75 * 1], x[75 * 1: 75 * 2], x[75 * 2: 75 * 3]
+    def pad(x, p, i):
+        # Truncate to i items, or right-pad with p up to i items.
+        return x[:i] if len(x) >= i else x + [p] * (i - len(x))
+    raw_tokens_list = tokenize(text)
+    tokens_list = []
+    for raw_tokens in raw_tokens_list:
+        raw_tokens_123 = split(raw_tokens)
+        raw_tokens_123 = [[BOS] + raw_tokens_i + [EOS] for raw_tokens_i in raw_tokens_123]
+        raw_tokens_123 = [pad(raw_tokens_i, PAD, 77) for raw_tokens_i in raw_tokens_123]
+        tokens_list.append(raw_tokens_123)
+    tokens_list = torch.IntTensor(tokens_list).to(self.device)
+    # Flatten (prompt, window) into the batch axis so all windows are encoded in one call.
+    feed = einops.rearrange(tokens_list, 'b f i -> (b f) i')
+    y = transformer_encode(feed)
+    # Regroup the three windows of each prompt along the sequence axis.
+    z = einops.rearrange(y, '(b f) i c -> b (f i) c', f=3)
+    return z
+# Stolen from https://github.com/basujindal/stable-diffusion/blob/main/optimizedSD/splitAttention.py
+def _hacked_sliced_attentin_forward(self, x, context=None, mask=None):
+    h = self.heads
+    q = self.to_q(x)
+    context = default(context, x)
+    k = self.to_k(context)
+    v = self.to_v(context)
+    del context, x
+    q, k, v = map(lambda t: einops.rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+    limit = k.shape[0]
+    att_step = 1
+    q_chunks = list(torch.tensor_split(q, limit // att_step, dim=0))
+    k_chunks = list(torch.tensor_split(k, limit // att_step, dim=0))
+    v_chunks = list(torch.tensor_split(v, limit // att_step, dim=0))
+    q_chunks.reverse()
+    k_chunks.reverse()
+    v_chunks.reverse()
+    sim = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device)
+    del k, q, v
+    for i in range(0, limit, att_step):
+        q_buffer = q_chunks.pop()
+        k_buffer = k_chunks.pop()
+        v_buffer = v_chunks.pop()
+        sim_buffer = torch.einsum('b i d, b j d -> b i j', q_buffer, k_buffer) * self.scale
+        del k_buffer, q_buffer
+        # attention, what we cannot get enough of, by chunks
+        sim_buffer = sim_buffer.softmax(dim=-1)
+        sim_buffer = torch.einsum('b i j, b j d -> b i d', sim_buffer, v_buffer)
+        del v_buffer
+        sim[i:i + att_step, :, :] = sim_buffer
+        del sim_buffer
+    sim = einops.rearrange(sim, '(b h) n d -> b n (h d)', h=h)
+    return self.to_out(sim)

cldm/logger.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import os
+import numpy as np
+import torch
+import torchvision
+from PIL import Image
+from pytorch_lightning.callbacks import Callback
+from pytorch_lightning.utilities.distributed import rank_zero_only
+class ImageLogger(Callback):
+    def __init__(self, batch_frequency=2000, max_images=4, clamp=True, increase_log_steps=True,
+                 rescale=True, disabled=False, log_on_batch_idx=False, log_first_step=False,
+                 log_images_kwargs=None):
+        super().__init__()
+        self.rescale = rescale
+        self.batch_freq = batch_frequency
+        self.max_images = max_images
+        if not increase_log_steps:
+            self.log_steps = [self.batch_freq]
+        self.clamp = clamp
+        self.disabled = disabled
+        self.log_on_batch_idx = log_on_batch_idx
+        self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {}
+        self.log_first_step = log_first_step
+    @rank_zero_only
+    def log_local(self, save_dir, split, images, global_step, current_epoch, batch_idx):
+        root = os.path.join(save_dir, "image_log", split)
+        for k in images:
+            grid = torchvision.utils.make_grid(images[k], nrow=4)
+            if self.rescale:
+                grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w
+            grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
+            grid = grid.numpy()
+            grid = (grid * 255).astype(np.uint8)
+            filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format(k, global_step, current_epoch, batch_idx)
+            path = os.path.join(root, filename)
+            os.makedirs(os.path.split(path)[0], exist_ok=True)
+            Image.fromarray(grid).save(path)
+    def log_img(self, pl_module, batch, batch_idx, split="train"):
+        check_idx = batch_idx  # if self.log_on_batch_idx else pl_module.global_step
+        if (self.check_frequency(check_idx) and  # batch_idx % self.batch_freq == 0
+                hasattr(pl_module, "log_images") and
+                callable(pl_module.log_images) and
+                self.max_images > 0):
+            logger = type(pl_module.logger)
+            is_train = pl_module.training
+            if is_train:
+                pl_module.eval()
+            with torch.no_grad():
+                images = pl_module.log_images(batch, split=split, **self.log_images_kwargs)
+            for k in images:
+                N = min(images[k].shape[0], self.max_images)
+                images[k] = images[k][:N]
+                if isinstance(images[k], torch.Tensor):
+                    images[k] = images[k].detach().cpu()
+                    if self.clamp:
+                        images[k] = torch.clamp(images[k], -1., 1.)
+            self.log_local(pl_module.logger.save_dir, split, images,
+                           pl_module.global_step, pl_module.current_epoch, batch_idx)
+            if is_train:
+                pl_module.train()
+    def check_frequency(self, check_idx):
+        return check_idx % self.batch_freq == 0
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
+        if not self.disabled:
+            self.log_img(pl_module, batch, batch_idx, split="train")

cldm/model.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import os
+import torch
+from omegaconf import OmegaConf
+from ldm.util import instantiate_from_config
+def get_state_dict(d):
+    """Return ``d['state_dict']`` when that key exists, otherwise ``d`` itself."""
+    return d.get('state_dict', d)
+def load_state_dict(ckpt_path, location='cpu'):
+    _, extension = os.path.splitext(ckpt_path)
+    if extension.lower() == ".safetensors":
+        import safetensors.torch
+        state_dict = safetensors.torch.load_file(ckpt_path, device=location)
+    else:
+        state_dict = get_state_dict(torch.load(ckpt_path, map_location=torch.device(location)))
+    state_dict = get_state_dict(state_dict)
+    print(f'Loaded state_dict from [{ckpt_path}]')
+    return state_dict
+def create_model(config_path, cond_stage_path=None):
+    config = OmegaConf.load(config_path)
+    if cond_stage_path:
+        config.model.params.cond_stage_config.params.version = cond_stage_path  # use pre-downloaded ckpts, in case blocked
+    model = instantiate_from_config(config.model).cpu()
+    print(f'Loaded model config from [{config_path}]')
+    return model

cldm/recognizer.py ADDED Viewed

	@@ -0,0 +1,303 @@

+'''
+Copyright (c) Alibaba, Inc. and its affiliates.
+'''
+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+import cv2
+import numpy as np
+import math
+import traceback
+from easydict import EasyDict as edict
+import time
+from ocr_recog.RecModel import RecModel
+import torch
+import torch.nn.functional as F
+from skimage.transform._geometric import _umeyama as get_sym_mat
+def min_bounding_rect(img):
+    ret, thresh = cv2.threshold(img, 127, 255, 0)
+    contours, hierarchy = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    if len(contours) == 0:
+        print('Bad contours, using fake bbox...')
+        return np.array([[0, 0], [100, 0], [100, 100], [0, 100]])
+    max_contour = max(contours, key=cv2.contourArea)
+    rect = cv2.minAreaRect(max_contour)
+    box = cv2.boxPoints(rect)
+    box = np.int0(box)
+    # sort
+    x_sorted = sorted(box, key=lambda x: x[0])
+    left = x_sorted[:2]
+    right = x_sorted[2:]
+    left = sorted(left, key=lambda x: x[1])
+    (tl, bl) = left
+    right = sorted(right, key=lambda x: x[1])
+    (tr, br) = right
+    if tl[1] > bl[1]:
+        (tl, bl) = (bl, tl)
+    if tr[1] > br[1]:
+        (tr, br) = (br, tr)
+    return np.array([tl, tr, br, bl])
+def adjust_image(box, img):
+    """Warp ``img`` so the quadrilateral ``box`` becomes an axis-aligned rectangle at the origin.
+    Args:
+        box: four corner points; indices 0-1 and 2-3 are used as the horizontal edges,
+            0-3 and 1-2 as the vertical edges.
+        img: tensor unpacked as (C, H, W).
+    Returns:
+        The warped image clamped to [0, 255] and cropped to the rectangle size.
+    """
+    pts1 = np.float32([box[0], box[1], box[2], box[3]])
+    # Target size: the longer of each pair of opposite edges.
+    width = max(np.linalg.norm(pts1[0]-pts1[1]), np.linalg.norm(pts1[2]-pts1[3]))
+    height = max(np.linalg.norm(pts1[0]-pts1[3]), np.linalg.norm(pts1[1]-pts1[2]))
+    pts2 = np.float32([[0, 0], [width, 0], [width, height], [0, height]])
+    # get transform matrix
+    M = get_sym_mat(pts1, pts2, estimate_scale=True)
+    C, H, W = img.shape
+    # T maps pixel coordinates to the [-1, 1] range used by affine_grid.
+    T = np.array([[2 / W, 0, -1], [0, 2 / H, -1], [0, 0, 1]])
+    # Express M in normalized coordinates and invert it: the sampling grid needs
+    # the output-to-input mapping.
+    theta = np.linalg.inv(T @ M @ np.linalg.inv(T))
+    theta = torch.from_numpy(theta[:2, :]).unsqueeze(0).type(torch.float32).to(img.device)
+    grid = F.affine_grid(theta, torch.Size([1, C, H, W]), align_corners=True)
+    result = F.grid_sample(img.unsqueeze(0), grid, align_corners=True)
+    result = torch.clamp(result.squeeze(0), 0, 255)
+    # crop
+    result = result[:, :int(height), :int(width)]
+    return result
+'''
+mask: numpy.ndarray, mask of textual, HWC
+src_img: torch.Tensor, source image, CHW
+'''
+def crop_image(src_img, mask):
+    box = min_bounding_rect(mask)
+    result = adjust_image(box, src_img)
+    if len(result.shape) == 2:
+        result = torch.stack([result]*3, axis=-1)
+    return result
+def create_predictor(model_dir=None, model_lang='ch', is_onnx=False):
+    model_file_path = model_dir
+    if model_file_path is not None and not os.path.exists(model_file_path):
+        raise ValueError("not find model file path {}".format(model_file_path))
+    if is_onnx:
+        import onnxruntime as ort
+        sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider'])  # 'TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'
+        return sess
+    else:
+        if model_lang == 'ch':
+            n_class = 6625
+        elif model_lang == 'en':
+            n_class = 97
+        else:
+            raise ValueError(f"Unsupported OCR recog model_lang: {model_lang}")
+        rec_config = edict(
+            in_channels=3,
+            backbone=edict(type='MobileNetV1Enhance', scale=0.5, last_conv_stride=[1, 2], last_pool_type='avg'),
+            neck=edict(type='SequenceEncoder', encoder_type="svtr", dims=64, depth=2, hidden_dims=120, use_guide=True),
+            head=edict(type='CTCHead', fc_decay=0.00001, out_channels=n_class, return_feats=True)
+        )
+        rec_model = RecModel(rec_config)
+        if model_file_path is not None:
+            rec_model.load_state_dict(torch.load(model_file_path, map_location="cpu"))
+            rec_model.eval()
+        return rec_model.eval()
+def _check_image_file(path):
+    img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff'}
+    return any([path.lower().endswith(e) for e in img_end])
+def get_image_file_list(img_file):
+    imgs_lists = []
+    if img_file is None or not os.path.exists(img_file):
+        raise Exception("not found any img file in {}".format(img_file))
+    if os.path.isfile(img_file) and _check_image_file(img_file):
+        imgs_lists.append(img_file)
+    elif os.path.isdir(img_file):
+        for single_file in os.listdir(img_file):
+            file_path = os.path.join(img_file, single_file)
+            if os.path.isfile(file_path) and _check_image_file(file_path):
+                imgs_lists.append(file_path)
+    if len(imgs_lists) == 0:
+        raise Exception("not found any img file in {}".format(img_file))
+    imgs_lists = sorted(imgs_lists)
+    return imgs_lists
+class TextRecognizer(object):
+    def __init__(self, args, predictor):
+        self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
+        self.rec_batch_num = args.rec_batch_num
+        self.predictor = predictor
+        self.chars = self.get_char_dict(args.rec_char_dict_path)
+        self.char2id = {x: i for i, x in enumerate(self.chars)}
+        self.is_onnx = not isinstance(self.predictor, torch.nn.Module)
+    # img: CHW
+    def resize_norm_img(self, img, max_wh_ratio):
+        imgC, imgH, imgW = self.rec_image_shape
+        assert imgC == img.shape[0]
+        imgW = int((imgH * max_wh_ratio))
+        h, w = img.shape[1:]
+        ratio = w / float(h)
+        if math.ceil(imgH * ratio) > imgW:
+            resized_w = imgW
+        else:
+            resized_w = int(math.ceil(imgH * ratio))
+        resized_image = torch.nn.functional.interpolate(
+            img.unsqueeze(0),
+            size=(imgH, resized_w),
+            mode='bilinear',
+            align_corners=True,
+        )
+        resized_image /= 255.0
+        resized_image -= 0.5
+        resized_image /= 0.5
+        padding_im = torch.zeros((imgC, imgH, imgW), dtype=torch.float32).to(img.device)
+        padding_im[:, :, 0:resized_w] = resized_image[0]
+        return padding_im
+    # img_list: list of tensors with shape chw 0-255
+    def pred_imglist(self, img_list, show_debug=False, is_ori=False):
+        img_num = len(img_list)
+        assert img_num > 0
+        # Calculate the aspect ratio of all text bars
+        width_list = []
+        for img in img_list:
+            width_list.append(img.shape[2] / float(img.shape[1]))
+        # Sorting can speed up the recognition process
+        indices = torch.from_numpy(np.argsort(np.array(width_list)))
+        batch_num = self.rec_batch_num
+        preds_all = [None] * img_num
+        preds_neck_all = [None] * img_num
+        for beg_img_no in range(0, img_num, batch_num):
+            end_img_no = min(img_num, beg_img_no + batch_num)
+            norm_img_batch = []
+            imgC, imgH, imgW = self.rec_image_shape[:3]
+            max_wh_ratio = imgW / imgH
+            for ino in range(beg_img_no, end_img_no):
+                h, w = img_list[indices[ino]].shape[1:]
+                if h > w * 1.2:
+                    img = img_list[indices[ino]]
+                    img = torch.transpose(img, 1, 2).flip(dims=[1])
+                    img_list[indices[ino]] = img
+                    h, w = img.shape[1:]
+                # wh_ratio = w * 1.0 / h
+                # max_wh_ratio = max(max_wh_ratio, wh_ratio)  # comment to not use different ratio
+            for ino in range(beg_img_no, end_img_no):
+                norm_img = self.resize_norm_img(img_list[indices[ino]], max_wh_ratio)
+                norm_img = norm_img.unsqueeze(0)
+                norm_img_batch.append(norm_img)
+            norm_img_batch = torch.cat(norm_img_batch, dim=0)
+            if show_debug:
+                for i in range(len(norm_img_batch)):
+                    _img = norm_img_batch[i].permute(1, 2, 0).detach().cpu().numpy()
+                    _img = (_img + 0.5)*255
+                    _img = _img[:, :, ::-1]
+                    file_name = f'{indices[beg_img_no + i]}'
+                    file_name = file_name + '_ori' if is_ori else file_name
+                    cv2.imwrite(file_name + '.jpg', _img)
+            if self.is_onnx:
+                input_dict = {}
+                input_dict[self.predictor.get_inputs()[0].name] = norm_img_batch.detach().cpu().numpy()
+                outputs = self.predictor.run(None, input_dict)
+                preds = {}
+                preds['ctc'] = torch.from_numpy(outputs[0])
+                preds['ctc_neck'] = [torch.zeros(1)] * img_num
+            else:
+                preds = self.predictor(norm_img_batch)
+            for rno in range(preds['ctc'].shape[0]):
+                preds_all[indices[beg_img_no + rno]] = preds['ctc'][rno]
+                preds_neck_all[indices[beg_img_no + rno]] = preds['ctc_neck'][rno]
+        return torch.stack(preds_all, dim=0), torch.stack(preds_neck_all, dim=0)
+    def get_char_dict(self, character_dict_path):
+        character_str = []
+        with open(character_dict_path, "rb") as fin:
+            lines = fin.readlines()
+            for line in lines:
+                line = line.decode('utf-8').strip("\n").strip("\r\n")
+                character_str.append(line)
+        dict_character = list(character_str)
+        dict_character = ['sos'] + dict_character + [' ']  # eos is space
+        return dict_character
+    def get_text(self, order):
+        char_list = [self.chars[text_id] for text_id in order]
+        return ''.join(char_list)
+    def decode(self, mat):
+        text_index = mat.detach().cpu().numpy().argmax(axis=1)
+        ignored_tokens = [0]
+        selection = np.ones(len(text_index), dtype=bool)
+        selection[1:] = text_index[1:] != text_index[:-1]
+        for ignored_token in ignored_tokens:
+            selection &= text_index != ignored_token
+        return text_index[selection], np.where(selection)[0]
+    def get_ctcloss(self, preds, gt_text, weight):
+        if not isinstance(weight, torch.Tensor):
+            weight = torch.tensor(weight).to(preds.device)
+        ctc_loss = torch.nn.CTCLoss(reduction='none')
+        log_probs = preds.log_softmax(dim=2).permute(1, 0, 2)  # NTC-->TNC
+        targets = []
+        target_lengths = []
+        for t in gt_text:
+            targets += [self.char2id.get(i, len(self.chars)-1) for i in t]
+            target_lengths += [len(t)]
+        targets = torch.tensor(targets).to(preds.device)
+        target_lengths = torch.tensor(target_lengths).to(preds.device)
+        input_lengths = torch.tensor([log_probs.shape[0]]*(log_probs.shape[1])).to(preds.device)
+        loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
+        loss = loss / input_lengths * weight
+        return loss
+def main():
+    """Smoke test: recognize the images in ./test_imgs_cn, time ten passes and print CTC losses."""
+    rec_model_dir = "./ocr_weights/ppv3_rec.pth"
+    predictor = create_predictor(rec_model_dir)
+    args = edict()
+    args.rec_image_shape = "3, 48, 320"
+    args.rec_char_dict_path = './ocr_weights/ppocr_keys_v1.txt'
+    args.rec_batch_num = 6
+    text_recognizer = TextRecognizer(args, predictor)
+    image_dir = './test_imgs_cn'
+    # One ground-truth string per image; the list length is hard-coded to 14.
+    gt_text = ['韩国小馆']*14
+    image_file_list = get_image_file_list(image_dir)
+    valid_image_file_list = []
+    img_list = []
+    for image_file in image_file_list:
+        img = cv2.imread(image_file)
+        if img is None:
+            print("error in loading image:{}".format(image_file))
+            continue
+        valid_image_file_list.append(image_file)
+        # HWC array -> CHW float tensor
+        img_list.append(torch.from_numpy(img).permute(2, 0, 1).float())
+    try:
+        tic = time.time()
+        times = []
+        for i in range(10):
+            preds, _ = text_recognizer.pred_imglist(img_list)  # get text
+            preds_all = preds.softmax(dim=2)
+            # Elapsed milliseconds for this pass; the timer restarts right after.
+            times += [(time.time()-tic)*1000.]
+            tic = time.time()
+        print(times)
+        # Mean time per image, leaving out the first pass.
+        print(np.mean(times[1:]) / len(preds_all))
+        weight = np.ones(len(gt_text))
+        loss = text_recognizer.get_ctcloss(preds, gt_text, weight)
+        for i in range(len(valid_image_file_list)):
+            pred = preds_all[i]
+            order, idx = text_recognizer.decode(pred)
+            text = text_recognizer.get_text(order)
+            print(f'{valid_image_file_list[i]}: pred/gt="{text}"/"{gt_text[i]}", loss={loss[i]:.2f}')
+    except Exception as E:
+        # Best-effort demo: print the traceback instead of propagating.
+        print(traceback.format_exc(), E)
+if __name__ == "__main__":
+    main()

dataset_util.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import json
+import pathlib
+__all__ = ['load', 'save', 'show_bbox_on_image']
+def load(file_path: str):
+    file_path = pathlib.Path(file_path)
+    func_dict = {'.txt': load_txt, '.json': load_json, '.list': load_txt}
+    assert file_path.suffix in func_dict
+    return func_dict[file_path.suffix](file_path)
+def load_txt(file_path: str):
+    with open(file_path, 'r', encoding='utf8') as f:
+        content = [x.strip().strip('\ufeff').strip('\xef\xbb\xbf') for x in f.readlines()]
+    return content
+def load_json(file_path: str):
+    with open(file_path, 'r', encoding='utf8') as f:
+        content = json.load(f)
+    return content
+def save(data, file_path):
+    file_path = pathlib.Path(file_path)
+    func_dict = {'.txt': save_txt, '.json': save_json}
+    assert file_path.suffix in func_dict
+    return func_dict[file_path.suffix](data, file_path)
+def save_txt(data, file_path):
+    if not isinstance(data, list):
+        data = [data]
+    with open(file_path, mode='w', encoding='utf8') as f:
+        f.write('\n'.join(data))
+def save_json(data, file_path):
+    with open(file_path, 'w', encoding='utf-8') as json_file:
+        json.dump(data, json_file, ensure_ascii=False, indent=4)
+def show_bbox_on_image(image, polygons=None, txt=None, color=None, font_path='./font/Arial_Unicode.ttf'):
+    from PIL import ImageDraw, ImageFont
+    image = image.convert('RGB')
+    draw = ImageDraw.Draw(image)
+    if len(txt) == 0:
+        txt = None
+    if color is None:
+        color = (255, 0, 0)
+    if txt is not None:
+        font = ImageFont.truetype(font_path, 20)
+    for i, box in enumerate(polygons):
+        box = box[0]
+        if txt is not None:
+            draw.text((int(box[0][0]) + 20, int(box[0][1]) - 20), str(txt[i]), fill='red', font=font)
+        for j in range(len(box) - 1):
+            draw.line((box[j][0], box[j][1], box[j + 1][0], box[j + 1][1]), fill=color, width=2)
+        draw.line((box[-1][0], box[-1][1], box[0][0], box[0][1]), fill=color, width=2)
+    return image
+def show_glyphs(glyphs, name):
+    import numpy as np
+    import cv2
+    size = 64
+    gap = 5
+    n_char = 20
+    canvas = np.ones((size, size*n_char + gap*(n_char-1), 1))*0.5
+    x = 0
+    for i in range(glyphs.shape[-1]):
+        canvas[:, x:x + size, :] = glyphs[..., i:i+1]
+        x += size+gap
+    cv2.imwrite(name, canvas*255)