Commit 8881820 by yibolu (parent: ec9c23f)

update lyrasd2

Files changed (50)
  1. .gitattributes +5 -16
  2. .gitignore +4 -2
  3. CHANGELOG.md +0 -4
  4. LISENCE +0 -494
  5. README.md +104 -50
  6. control_bird_canny.png +0 -0
  7. controlnet_img2img_demo.py +62 -0
  8. controlnet_txt2img_demo.py +63 -0
  9. demo.py +0 -12
  10. img2img_demo.py +47 -0
  11. lyraSD/__init__.py +0 -1
  12. lyraSD/inference.py +0 -85
  13. lyraSD/muse_trt/__init__.py +0 -10
  14. lyraSD/muse_trt/models.py +0 -149
  15. lyraSD/muse_trt/sd_img2img.py +0 -368
  16. lyraSD/muse_trt/sd_text2img.py +0 -292
  17. lyraSD/muse_trt/super.py +0 -64
  18. lyraSD/muse_trt/utilities.py +0 -538
  19. lyrasd_model/__init__.py +5 -0
  20. lyrasd_model/lora_util.py +54 -0
  21. lyrasd_model/lyrasd_controlnet_img2img_pipeline.py +637 -0
  22. lyrasd_model/lyrasd_controlnet_txt2img_pipeline.py +547 -0
  23. lyrasd_model/lyrasd_img2img_pipeline.py +554 -0
  24. lyraSD/muse_trt/libnvinfer_plugin.so → lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so +2 -2
  25. sd1.4-engine/superx4-512-512.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm86.so +2 -2
  26. sd1.4-engine/clip.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so +2 -2
  27. sd1.4-engine/vae-decoder.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so +2 -2
  28. lyrasd_model/lyrasd_lib/placeholder.txt +0 -0
  29. lyrasd_model/lyrasd_txt2img_pipeline.py +458 -0
  30. models/README.md +12 -0
  31. output/img2img_demo.jpg +0 -0
  32. output/img2img_input.jpg +0 -0
  33. output/text2img_demo.jpg +0 -0
  34. outputs/res_controlnet_img2img_0.png +0 -0
  35. outputs/res_controlnet_txt2img_0.png +0 -0
  36. outputs/res_img2img_0.png +0 -0
  37. outputs/res_txt2img_0.png +0 -0
  38. outputs/res_txt2img_lora_0.png +0 -0
  39. requirements.txt +2 -0
  40. sd1.4-engine/feature_extractor/preprocessor_config.json +0 -28
  41. sd1.4-engine/scheduler/scheduler_config.json +0 -14
  42. sd1.4-engine/text_encoder/config.json +0 -25
  43. sd1.4-engine/tokenizer/merges.txt +0 -0
  44. sd1.4-engine/tokenizer/special_tokens_map.json +0 -24
  45. sd1.4-engine/tokenizer/tokenizer_config.json +0 -34
  46. sd1.4-engine/tokenizer/vocab.json +0 -0
  47. sd1.4-engine/unet_fp16.plan +0 -3
  48. sd1.4-engine/vae/config.json +0 -31
  49. sd1.4-engine/vae/diffusion_pytorch_model.bin +0 -3
  50. txt2img_demo.py +44 -0
.gitattributes CHANGED
@@ -25,6 +25,7 @@
  *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
  *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -32,19 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
- lyraSD/muse_trt/libnvinfer_plugin.so filter=lfs diff=lfs merge=lfs -text
- sd1.5-engine/clip.plan filter=lfs diff=lfs merge=lfs -text
- sd1.5-engine/superx4.plan filter=lfs diff=lfs merge=lfs -text
- sd1.5-engine/unet_fp16.plan filter=lfs diff=lfs merge=lfs -text
- sd1.5-engine/vae-decoder.plan filter=lfs diff=lfs merge=lfs -text
- sd1.5-engine/vae-encoder.plan filter=lfs diff=lfs merge=lfs -text
- sd1.5-engine/scheduler filter=lfs diff=lfs merge=lfs -text
- sd1.5-engine/superx4-512-512.plan filter=lfs diff=lfs merge=lfs -text
- sd1.5-engine/text_encoder filter=lfs diff=lfs merge=lfs -text
- sd1.5-engine/tokenizer filter=lfs diff=lfs merge=lfs -text
- sd1.5-engine/vae filter=lfs diff=lfs merge=lfs -text
- sd1.5-engine/feature_extractor filter=lfs diff=lfs merge=lfs -text
- sd1.4-engine/clip.plan filter=lfs diff=lfs merge=lfs -text
- sd1.4-engine/superx4-512-512.plan filter=lfs diff=lfs merge=lfs -text
- sd1.4-engine/unet_fp16.plan filter=lfs diff=lfs merge=lfs -text
- sd1.4-engine/vae-decoder.plan filter=lfs diff=lfs merge=lfs -text
+ lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so filter=lfs diff=lfs merge=lfs -text
+ lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm86.so filter=lfs diff=lfs merge=lfs -text
+ lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so filter=lfs diff=lfs merge=lfs -text
+ lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,3 +1,5 @@
- *.un~
+ .idea
  *.pyc
- __pycache__/
+ .vscode
+ __pycache__
+ models/lyrasd*
CHANGELOG.md DELETED
@@ -1,4 +0,0 @@
- ## V1.0
-
- - Add accelerated Stable Diffusion pretrained model v1.4 (from: https://huggingface.co/CompVis/stable-diffusion-v1-4)
- - Add accelerated Real-ESRGAN(4x) (from https://github.com/xinntao/Real-ESRGAN)
LISENCE DELETED
@@ -1,494 +0,0 @@
1
- CreativeML Open RAIL-M License
2
-
3
- Copyright (c) 2023 Tencent Music Entertainment
4
-
5
-
6
- Terms of the CreativeML Open RAIL-M:
7
- --------------------------------------------------------------------
8
- Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
9
-
10
- CreativeML Open RAIL-M
11
- dated August 22, 2022
12
-
13
- Section I: PREAMBLE
14
-
15
- Multimodal generative models are being widely adopted and used, and have the potential to transform the way artists, among other individuals, conceive and benefit from AI or ML technologies as a tool for content creation.
16
-
17
- Notwithstanding the current and potential benefits that these artifacts can bring to society at large, there are also concerns about potential misuses of them, either due to their technical limitations or ethical considerations.
18
-
19
- In short, this license strives for both the open and responsible downstream use of the accompanying model. When it comes to the open character, we took inspiration from open source permissive licenses regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be able to enforce the license in case potential misuses of the Model may occur. At the same time, we strive to promote open and responsible research on generative models for art and content generation.
20
-
21
- Even though downstream derivative versions of the model could be released under different licensing terms, the latter will always have to include - at minimum - the same use-based restrictions as the ones in the original license (this license). We believe in the intersection between open and responsible AI development; thus, this License aims to strike a balance between both in order to enable responsible open-science in the field of AI.
22
-
23
- This License governs the use of the model (and its derivatives) and is informed by the model card associated with the model.
24
-
25
- NOW THEREFORE, You and Licensor agree as follows:
26
-
27
- 1. Definitions
28
-
29
- - "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.
30
- - "Data" means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.
31
- - "Output" means the results of operating a Model as embodied in informational content resulting therefrom.
32
- - "Model" means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.
33
- - "Derivatives of the Model" means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.
34
- - "Complementary Material" means the accompanying source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, etc, if any.
35
- - "Distribution" means any transmission, reproduction, publication or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means - e.g. API-based or web access.
36
- - "Licensor" means the copyright owner or entity authorized by the copyright owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.
37
- - "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application - e.g. chatbot, translator, image generator.
38
- - "Third Parties" means individuals or legal entities that are not under common control with Licensor or You.
39
- - "Contribution" means any work of authorship, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
40
- - "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model.
41
-
42
- Section II: INTELLECTUAL PROPERTY RIGHTS
43
-
44
- Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.
45
-
46
- 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model.
47
- 3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution incorporated within the Model and/or Complementary Material constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Work shall terminate as of the date such litigation is asserted or filed.
48
-
49
- Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
50
-
51
- 4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the following conditions:
52
- Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply to the use of Complementary Material.
53
- You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License;
54
- You must cause any modified files to carry prominent notices stating that You changed the files;
55
- You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model, Derivatives of the Model.
56
- You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.
57
- 5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph (paragraph 5).
58
- 6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are accountable for the Output you generate and its subsequent uses. No use of the output can contravene any provision as stated in the License.
59
-
60
- Section IV: OTHER PROVISIONS
61
-
62
- 7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model through electronic means, or modify the Output of the Model based on updates. You shall undertake reasonable efforts to use the latest version of the Model.
63
- 8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.
64
- 9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the Complementary Material and assume any risks associated with Your exercise of permissions under this License.
65
- 10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model and the Complementary Material (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
66
- 11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
67
- 12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
68
-
69
- END OF TERMS AND CONDITIONS
70
-
71
-
72
-
73
- Attachment A
74
-
75
- Use Restrictions
76
-
77
- You agree not to use the Model or Derivatives of the Model:
78
- - In any way that violates any applicable national, federal, state, local or international law or regulation;
79
- - For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
80
- - To generate or disseminate verifiably false information and/or content with the purpose of harming others;
81
- - To generate or disseminate personal identifiable information that can be used to harm an individual;
82
- - To defame, disparage or otherwise harass others;
83
- - For fully automated decision making that adversely impacts an individual’s legal rights or otherwise creates or modifies a binding, enforceable obligation;
84
- - For any use intended to or which has the effect of discriminating against or harming individuals or groups based on online or offline social behavior or known or predicted personal or personality characteristics;
85
- - To exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
86
- - For any use intended to or which has the effect of discriminating against individuals or groups based on legally protected characteristics or categories;
87
- - To provide medical advice and medical results interpretation;
88
- - To generate or disseminate information for the purpose to be used
89
-
90
-
91
-
92
-
93
- Other dependencies and licenses:
94
-
95
-
96
- Open Source Software Licensed under the CreativeML Open RAIL-M License:
97
- --------------------------------------------------------------------
98
- 1. stable-diffuison
99
- Files:https://huggingface.co/CompVis/stable-diffusion-v1-4
100
- License:CreativeML Open RAIL-M
101
- For details:https://huggingface.co/spaces/CompVis/stable-diffusion-license
102
-
103
- A copy of the MIT License is included in this file.
104
-
105
-
106
- Open Source Software Licensed under the Apache License Version 2.0:
107
- --------------------------------------------------------------------
108
- 1. huggingface/diffusers
109
- File:https://github.com/huggingface/diffusers
110
- License:Apache License V2
111
- For details:https://github.com/huggingface/diffusers/blob/main/LICENSE
112
-
113
- 2. huggingface/transformers
114
- Copyright 2018- The Hugging Face team. All rights reserved.
115
-
116
- 3. NVIDIA/TensorRT
117
- Copyright 2021 NVIDIA Corporation
118
-
119
- Licensed under the Apache License, Version 2.0 (the "License");
120
- you may not use this file except in compliance with the License.
121
- You may obtain a copy of the License at
122
-
123
- http://www.apache.org/licenses/LICENSE-2.0
124
-
125
- Unless required by applicable law or agreed to in writing, software
126
- distributed under the License is distributed on an "AS IS" BASIS,
127
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
128
- See the License for the specific language governing permissions and
129
- limitations under the License.
130
-
131
- 4. TensorRT/tools/Polygraphy
132
- Copyright 2020 NVIDIA Corporation
133
-
134
- Licensed under the Apache License, Version 2.0 (the "License");
135
- you may not use this file except in compliance with the License.
136
- You may obtain a copy of the License at
137
-
138
- http://www.apache.org/licenses/LICENSE-2.0
139
-
140
- Unless required by applicable law or agreed to in writing, software
141
- distributed under the License is distributed on an "AS IS" BASIS,
142
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
143
- See the License for the specific language governing permissions and
144
- limitations under the License.
145
-
146
-
147
- Terms of the Apache License Version 2.0:
148
- --------------------------------------------------------------------
149
- Apache License
150
-
151
- Version 2.0, January 2004
152
-
153
- http://www.apache.org/licenses/
154
-
155
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
156
- 1. Definitions.
157
-
158
- "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
159
-
160
- "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
161
-
162
- "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
163
-
164
- "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
165
-
166
- "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
167
-
168
- "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
169
-
170
- "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
171
-
172
- "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
173
-
174
- "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
175
-
176
- "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
177
-
178
- 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
179
-
180
- 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
181
-
182
- 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
183
-
184
- You must give any other recipients of the Work or Derivative Works a copy of this License; and
185
-
186
- You must cause any modified files to carry prominent notices stating that You changed the files; and
187
-
188
- You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
189
-
190
- If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
191
-
192
- You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
193
-
194
- 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
195
-
196
- 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
197
-
198
- 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
199
-
200
- 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
201
-
202
- 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
203
-
204
- END OF TERMS AND CONDITIONS
205
-
206
-
207
- Open Source Software Licensed under the Modified BSD License:
208
- --------------------------------------------------------------------
209
- 1. Numpy
210
- Copyright (c) 2005-2023, NumPy Developers.
211
- All rights reserved.
212
-
213
- Terms of the Modified BSD License:
214
- -------------------------------------------------------------------
215
- This project is licensed under the terms of the Modified BSD License, as follows:
216
-
217
- Copyright (c) 2005-2023, NumPy Developers.
218
- All rights reserved.
219
-
220
- Redistribution and use in source and binary forms, with or without
221
- modification, are permitted provided that the following conditions are met:
222
-
223
- Redistributions of source code must retain the above copyright notice, this
224
- list of conditions and the following disclaimer.
225
-
226
- Redistributions in binary form must reproduce the above copyright notice, this
227
- list of conditions and the following disclaimer in the documentation and/or
228
- other materials provided with the distribution.
229
-
230
- Neither the name of the NumPy Developers nor the names of any contributors
231
- may be used to endorse or promote products derived from this
232
- software without specific prior written permission.
233
-
234
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
235
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
236
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
237
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
238
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
239
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
240
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
241
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
242
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
243
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
244
-
245
- 2. pytorch
246
-
247
- From PyTorch:
248
-
249
- Copyright (c) 2016- Facebook, Inc (Adam Paszke)
250
- Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
251
- Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
252
- Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
253
- Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
254
- Copyright (c) 2011-2013 NYU (Clement Farabet)
255
- Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
256
- Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
257
- Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
258
-
259
- From Caffe2:
260
-
261
- Copyright (c) 2016-present, Facebook Inc. All rights reserved.
262
-
263
- All contributions by Facebook:
264
- Copyright (c) 2016 Facebook Inc.
265
-
266
- All contributions by Google:
267
- Copyright (c) 2015 Google Inc.
268
- All rights reserved.
269
-
270
- All contributions by Yangqing Jia:
271
- Copyright (c) 2015 Yangqing Jia
272
- All rights reserved.
273
-
274
- All contributions by Kakao Brain:
275
- Copyright 2019-2020 Kakao Brain
276
-
277
- All contributions by Cruise LLC:
278
- Copyright (c) 2022 Cruise LLC.
279
- All rights reserved.
280
-
281
- All contributions from Caffe:
282
- Copyright(c) 2013, 2014, 2015, the respective contributors
283
- All rights reserved.
284
-
285
- All other contributions:
286
- Copyright(c) 2015, 2016 the respective contributors
287
- All rights reserved.
288
-
289
- Caffe2 uses a copyright model similar to Caffe: each contributor holds
290
- copyright over their contributions to Caffe2. The project versioning records
291
- all such contribution and copyright details. If a contributor wants to further
292
- mark their specific copyright on a particular contribution, they should
293
- indicate their copyright solely in the commit message of the change when it is
294
- committed.
295
-
296
- All rights reserved.
297
-
298
-
299
- Terms of the Modified BSD License:
300
- -------------------------------------------------------------------
301
- This project is licensed under the terms of the Modified BSD License, as follows:
302
-
303
- Redistribution and use in source and binary forms, with or without
304
- modification, are permitted provided that the following conditions are met:
305
-
306
- 1. Redistributions of source code must retain the above copyright
307
- notice, this list of conditions and the following disclaimer.
308
-
309
- 2. Redistributions in binary form must reproduce the above copyright
310
- notice, this list of conditions and the following disclaimer in the
311
- documentation and/or other materials provided with the distribution.
312
-
313
- 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
314
- and IDIAP Research Institute nor the names of its contributors may be
315
- used to endorse or promote products derived from this software without
316
- specific prior written permission.
317
-
318
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
319
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
320
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
321
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
322
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
323
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
324
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
325
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
326
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
327
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
328
- POSSIBILITY OF SUCH DAMAGE.
329
-
330
-
331
-
332
- Open Source Software Licensed under the BSD 3-Clause License:
333
- --------------------------------------------------------------------
334
- 1. scipy
335
- Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
336
-
337
- Terms of the BSD 3-Clause License:
338
- --------------------------------------------------------------------
339
- Redistribution and use in source and binary forms, with or without
340
- modification, are permitted provided that the following conditions are met:
341
-
342
- * Redistributions of source code must retain the above copyright notice, this
343
- list of conditions and the following disclaimer.
344
-
345
- * Redistributions in binary form must reproduce the above copyright notice,
346
- this list of conditions and the following disclaimer in the documentation
347
- and/or other materials provided with the distribution.
348
-
349
- * Neither the name of the copyright holder nor the names of its
350
- contributors may be used to endorse or promote products derived from
351
- this software without specific prior written permission.
352
-
353
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
354
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
355
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
356
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
357
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
358
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
359
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
360
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
361
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
362
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
363
-
364
-
365
- Open Source Software Licensed under the Python Software Foundation License Version 2:
366
- --------------------------------------------------------------------------
367
- 1. Python/cpython
368
- Copyright © 2001-2023 Python Software Foundation. All rights reserved
369
-
370
-
371
- A. HISTORY OF THE SOFTWARE
372
- ==========================
373
-
374
- Python was created in the early 1990s by Guido van Rossum at Stichting
375
- Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands
376
- as a successor of a language called ABC. Guido remains Python's
377
- principal author, although it includes many contributions from others.
378
-
379
- In 1995, Guido continued his work on Python at the Corporation for
380
- National Research Initiatives (CNRI, see https://www.cnri.reston.va.us)
381
- in Reston, Virginia where he released several versions of the
382
- software.
383
-
384
- In May 2000, Guido and the Python core development team moved to
385
- BeOpen.com to form the BeOpen PythonLabs team. In October of the same
386
- year, the PythonLabs team moved to Digital Creations, which became
387
- Zope Corporation. In 2001, the Python Software Foundation (PSF, see
388
- https://www.python.org/psf/) was formed, a non-profit organization
389
- created specifically to own Python-related Intellectual Property.
390
- Zope Corporation was a sponsoring member of the PSF.
391
-
392
- All Python releases are Open Source (see https://opensource.org for
393
- the Open Source Definition). Historically, most, but not all, Python
394
- releases have also been GPL-compatible; the table below summarizes
395
- the various releases.
396
-
397
- Release Derived Year Owner GPL-
398
- from compatible? (1)
399
-
400
- 0.9.0 thru 1.2 1991-1995 CWI yes
401
- 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes
402
- 1.6 1.5.2 2000 CNRI no
403
- 2.0 1.6 2000 BeOpen.com no
404
- 1.6.1 1.6 2001 CNRI yes (2)
405
- 2.1 2.0+1.6.1 2001 PSF no
406
- 2.0.1 2.0+1.6.1 2001 PSF yes
407
- 2.1.1 2.1+2.0.1 2001 PSF yes
408
- 2.1.2 2.1.1 2002 PSF yes
409
- 2.1.3 2.1.2 2002 PSF yes
410
- 2.2 and above 2.1.1 2001-now PSF yes
411
-
412
- Footnotes:
413
-
414
- (1) GPL-compatible doesn't mean that we're distributing Python under
415
- the GPL. All Python licenses, unlike the GPL, let you distribute
416
- a modified version without making your changes open source. The
417
- GPL-compatible licenses make it possible to combine Python with
418
- other software that is released under the GPL; the others don't.
419
-
420
- (2) According to Richard Stallman, 1.6.1 is not GPL-compatible,
421
- because its license has a choice of law clause. According to
422
- CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1
423
- is "not incompatible" with the GPL.
424
-
425
- Thanks to the many outside volunteers who have worked under Guido's
426
- direction to make these releases possible.
427
-
428
-
429
- B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON
430
- ===============================================================
431
-
432
- Python software and documentation are licensed under the
433
- Python Software Foundation License Version 2.
434
-
435
- Starting with Python 3.8.6, examples, recipes, and other code in
436
- the documentation are dual licensed under the PSF License Version 2
437
- and the Zero-Clause BSD license.
438
-
439
- Some software incorporated into Python is under different licenses.
440
- The licenses are listed with code falling under that license.
441
-
442
-
443
- PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
444
- --------------------------------------------
445
-
446
- 1. This LICENSE AGREEMENT is between the Python Software Foundation
447
- ("PSF"), and the Individual or Organization ("Licensee") accessing and
448
- otherwise using this software ("Python") in source or binary form and
449
- its associated documentation.
450
-
451
- 2. Subject to the terms and conditions of this License Agreement, PSF hereby
452
- grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
453
- analyze, test, perform and/or display publicly, prepare derivative works,
454
- distribute, and otherwise use Python alone or in any derivative version,
455
- provided, however, that PSF's License Agreement and PSF's notice of copyright,
456
- i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
457
- 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation;
458
- All Rights Reserved" are retained in Python alone or in any derivative version
459
- prepared by Licensee.
460
-
461
- 3. In the event Licensee prepares a derivative work that is based on
462
- or incorporates Python or any part thereof, and wants to make
463
- the derivative work available to others as provided herein, then
464
- Licensee hereby agrees to include in any such work a brief summary of
465
- the changes made to Python.
466
-
467
- 4. PSF is making Python available to Licensee on an "AS IS"
468
- basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
469
- IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
470
- DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
471
- FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
472
- INFRINGE ANY THIRD PARTY RIGHTS.
473
-
474
- 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
475
- FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
476
- A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
477
- OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
478
-
479
- 6. This License Agreement will automatically terminate upon a material
480
- breach of its terms and conditions.
481
-
482
- 7. Nothing in this License Agreement shall be deemed to create any
483
- relationship of agency, partnership, or joint venture between PSF and
484
- Licensee. This License Agreement does not grant permission to use PSF
485
- trademarks or trade name in a trademark sense to endorse or promote
486
- products or services of Licensee, or any third party.
487
-
488
- 8. By copying, installing or otherwise using Python, Licensee
489
- agrees to be bound by the terms and conditions of this License
490
- Agreement.
491
-
492
-
493
-
README.md CHANGED
@@ -4,19 +4,23 @@ language:
  - en
 tags:
  - art
- - tensorRT
+ - Stable Diffusion
 ---
- ## Model Card for lyraSD
+ ## Model Card for lyraSD2

- lyraSD is currently the **fastest Stable Diffusion model** available, boasting an inference cost of only **0.435 seconds** for a 512x512 image, accelerating the process up to **10 times faster** than the original version.
+ lyraSD2 is currently the fastest Stable Diffusion implementation whose outputs are 100% aligned with **diffusers**, with an inference cost of only **0.52 seconds** for a 512x512 image, up to **80% faster** than the original version.

 Among its main features are:

- - weights: original Stable Diffusion 1.4 weights
- - input image: 512x512 (in img2img mode)
- - output image: 512x512
- - device requirements: Nvidia Ampere architecture (A100) or compatable
- - super-resultion: 4x by default, optional.
+ - 4 commonly used pipelines:
+   - Text2Img
+   - Img2Img
+   - ControlNetText2Img
+   - ControlNetImg2Img
+ - 100% likeness to diffusers output
+ - ControlNet hot swap: ControlNet model weights can be hot-swapped within 0.4s (0s if cached)
+ - Lora hot swap: a Lora can be hot-swapped within 0.5s (0.1s if cached)
+ - device requirements: Nvidia Ampere architecture (A100, A10) or compatible

 ## Speed

@@ -25,75 +29,125 @@ Among its main features are:
 - device: Nvidia A100 40G
 - img size: 512x512
 - precision: fp16
- - steps: 30
- - solver: LMSD
+ - steps: 20
+ - sampler: EulerA

- ### text2img
- |model|time cost(ms)|memory(MB)|
- |:-:|:-:|:-:|
- |Pytorch SD|~5000ms|~10240|
- |lyraSD|~435ms|~4026|
-
- ### superResolution(SR)
- |model|time cost(ms)|memory(MB)|
- |:-:|:-:|:-:|
- |Pytorch SR|~720ms|~6650|
- |lyraSD|~26ms|~1600|
+ ### Text2Img
+ |model|time cost(ms)|
+ |:-:|:-:|
+ |torch2.0.1 + diffusers|~667ms|
+ |lyraSD2|~528ms|
+
+ ### ControlNet-Text2Img
+ |model|time cost(ms)|
+ |:-:|:-:|
+ |torch2.0.1 + diffusers|~930ms|
+ |lyraSD2|~745ms|

 ## Model Sources

- - **Repository:** https://huggingface.co/CompVis/stable-diffusion-v1-4
+ - **Checkpoint:** https://civitai.com/models/7371/rev-animated
+ - **ControlNet:** https://huggingface.co/lllyasviel/sd-controlnet-canny
+ - **Lora:** https://civitai.com/models/18323?modelVersionId=46846

- ## Uses
+ ## Text2Img Uses

 ```python
- from lyraSD import LyraSD
-
- t2imodel = LyraSD("text2img", "./sd1.4-engine")
- t2imodel.inference(prompt="A fantasy landscape, trending on artstation", use_super=False)
-
- from PIL import Image
- i2imodel = LyraSD("img2img", "./sd1.4-engine")
- demo_img = Image.open("output/text2img_demo.jpg")
- i2imodel.inference(prompt="A fantasy landscape, trending on artstation", image=demo_img)
+ import torch
+ import time
+
+ from lyrasd_model import LyraSdTxt2ImgPipeline
+
+ # Path to the model directory, which should contain:
+ # 1. the clip model
+ # 2. the converted, optimized unet model, placed in its unet_bins folder
+ # 3. the vae model
+ # 4. the scheduler config
+
+ # LyraSD's compiled C++ shared library, containing the C++/CUDA compute kernels
+ lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so"
+ model_path = "./models/lyrasd_rev_animated"
+ lora_path = "./models/lyrasd_xiaorenshu_lora"
+
+ # Build the Txt2Img pipeline
+ model = LyraSdTxt2ImgPipeline(model_path, lib_path)
+
+ # Load a lora
+ # Arguments: lora location, name, lora strength, lora precision
+ model.load_lora(lora_path, "xiaorenshu", 0.4, "fp32")
+
+ # Prepare the inputs and hyperparameters
+ prompt = "a cat, cute, cartoon, concise, traditional, chinese painting, Tang and Song Dynasties, masterpiece, 4k, 8k, UHD, best quality"
+ negative_prompt = "(((horrible))), (((scary))), (((naked))), (((large breasts))), high saturation, colorful, human:2, body:2, low quality, bad quality, lowres, out of frame, duplicate, watermark, signature, text, frames, cut, cropped, malformed limbs, extra limbs, (((missing arms))), (((missing legs)))"
+ height, width = 512, 512
+ steps = 30
+ guidance_scale = 7
+ generator = torch.Generator().manual_seed(123)
+ num_images = 1
+
+ start = time.perf_counter()
+ # Run inference
+ images = model(prompt, height, width, steps,
+                guidance_scale, negative_prompt, num_images,
+                generator=generator)
+ print("image gen cost: ", time.perf_counter() - start)
+ # Save the generated images
+ for i, image in enumerate(images):
+     image.save(f"outputs/res_txt2img_lora_{i}.png")
+
+ # Unload the lora; arguments: lora name, whether to clear the lora cache
+ # model.unload_lora("xiaorenshu", True)

 ```
 ## Demo output

- ### text2img
- ![text2img_demo](./output/text2img_demo.jpg)
-
- ### img2img
-
- ![img2img_input](./output/img2img_input.jpg)
-
- ![img2img_demo](./output/img2img_demo.jpg)
+ ### Text2Img
+ #### Text2Img without Lora
+ ![text2img_demo](./outputs/res_txt2img_0.png)
+
+ #### Text2Img with Lora
+ ![text2img_lora_demo](./outputs/res_txt2img_lora_0.png)
+
+ ### Img2Img
+
+ #### Img2Img input
+ <img src="https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/seaside_town.png?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1692601590;1865401590&q-key-time=1692601590;1865401590&q-header-list=&q-url-param-list=&q-signature=ca04ca92d990d94813029c0d9ef29537e5f4637c" alt="img2img input" width="512"/>
+
+ #### Img2Img output
+ ![img2img_demo](./outputs/res_img2img_0.png)
+
+ ### ControlNet Text2Img
+
+ #### Control Image
+ ![controlnet_control_image](./control_bird_canny.png)
+
+ #### ControlNet Text2Img Output
+ ![controlnet_txt2img_demo](./outputs/res_controlnet_txt2img_0.png)

- ## Environment
-
- - hardware: Nvidia Ampere architecture (A100) or compatable
- - docker image avaible: https://hub.docker.com/r/bigmoyan/lyra_aigc/tags
- ```
- docker pull bigmoyan/lyra_aigc:v0.1
+ ## Docker Environment Recommendation
+
+ - For Cuda 11.X: we recommend `nvcr.io/nvidia/pytorch:22.12-py3`
+ - For Cuda 12.0: we recommend `nvcr.io/nvidia/pytorch:23.02-py3`
+
+ ```bash
+ docker pull nvcr.io/nvidia/pytorch:23.02-py3
+ docker run --rm -it --gpus all -v ./:/lyraSD2 nvcr.io/nvidia/pytorch:23.02-py3
+
+ pip install -r requirements.txt
+ python txt2img_demo.py
 ```

 ## Citation
 ``` bibtex
- @Misc{lyraSD2023,
-   author = {Kangjian Wu, Zhengtao Wang, Bin Wu},
-   title = {lyraSD: Accelerating Stable Diffusion by 10x+},
-   howpublished = {\url{https://huggingface.co/TMElyralab/lyraSD}},
+ @Misc{lyraSD2_2023,
+   author = {Kangjian Wu, Zhengtao Wang, Yibo Lu, Haoxiong Su, Bin Wu},
+   title = {lyraSD2: Accelerating Stable Diffusion with best flexibility},
+   howpublished = {\url{https://huggingface.co/TMElyralab/lyraSD2}},
   year = {2023}
 }
 ```

 ## Report bug
- - start a discussion to report any bugs! --> https://huggingface.co/TMElyralab/lyraSD/discussions
+ - start a discussion to report any bugs! --> https://huggingface.co/TMElyralab/lyraSD2/discussions
 - report bug with a `[bug]` mark in the title.
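The model card above claims that a Lora can be hot-swapped in about 0.5s (0.1s when cached). A minimal sketch of what such a swap looks like with the `load_lora`/`unload_lora` calls shown in the Text2Img example; the second Lora path and name ("./models/lyrasd_other_lora", "other") are hypothetical placeholders, not files shipped with this commit.

```python
# Hedged sketch: hot-swapping Loras with the load_lora / unload_lora API from the
# Text2Img example above. The second Lora ("other") is a hypothetical placeholder.
import time
import torch

from lyrasd_model import LyraSdTxt2ImgPipeline

lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so"
model = LyraSdTxt2ImgPipeline("./models/lyrasd_rev_animated", lib_path)
generator = torch.Generator().manual_seed(123)

# First Lora: load, generate, then unload while keeping it in the cache (False).
start = time.perf_counter()
model.load_lora("./models/lyrasd_xiaorenshu_lora", "xiaorenshu", 0.4, "fp32")
print("first lora load:", time.perf_counter() - start)  # ~0.5s per the model card
images = model("a cat, chinese painting", 512, 512, 20, 7.0, "low quality", 1, generator=generator)
model.unload_lora("xiaorenshu", False)

# Second Lora: hypothetical path and name, same call pattern.
model.load_lora("./models/lyrasd_other_lora", "other", 0.6, "fp32")
images = model("a cat, chinese painting", 512, 512, 20, 7.0, "low quality", 1, generator=generator)
model.unload_lora("other", True)  # True: also drop it from the cache

# Re-loading a Lora that is still cached should be much faster (~0.1s per the model card).
start = time.perf_counter()
model.load_lora("./models/lyrasd_xiaorenshu_lora", "xiaorenshu", 0.4, "fp32")
print("cached lora reload:", time.perf_counter() - start)
```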
control_bird_canny.png ADDED
controlnet_img2img_demo.py ADDED
@@ -0,0 +1,62 @@
+ from io import BytesIO
+
+ import requests
+ import torch
+ from PIL import Image
+
+ from lyrasd_model import LyraSdControlnetImg2ImgPipeline
+
+ # Path that holds the model files; it should contain the following components:
+ # 1. the CLIP model
+ # 2. the converted, optimized UNet model
+ # 3. the VAE model
+ # 4. the scheduler config
+
+ # LyraSD's compiled C++ shared library, which contains the C++/CUDA compute internals
+ lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
+ model_path = "./models/lyrasd_rev_animated"
+ canny_controlnet_path = "./models/lyrasd_canny"
+
+ # Build the ControlNet Img2Img pipeline
+ model = LyraSdControlnetImg2ImgPipeline(model_path, lib_path)
+
+ # Load a ControlNet model (at most 3 can be loaded at once)
+ model.load_controlnet_model("canny", canny_controlnet_path, "fp32")
+
+ control_img = Image.open("control_bird_canny.png")
+
+ # Prepare the inputs and hyperparameters
+ prompt = "a bird"
+ negative_prompt = "NSFW"
+ height, width = 512, 512
+ steps = 20
+ guidance_scale = 7.5
+ generator = torch.Generator().manual_seed(123)
+ num_images = 1
+
+ # Up to 3 ControlNets can be used at once for a multi-ControlNet effect; the lengths of these parameters must line up.
+ # The outer length of the ControlNet image list must match the lengths of controlnet_scale and controlnet_names,
+ # while each inner list length must match the batch size; entries are matched by index.
+ controlnet_images = [[control_img]]
+ controlnet_scale = [0.5]
+ controlnet_names = ['canny']
+
+ # Fetch an image from COS to use as the init image
+ init_image_url = "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/seaside_town.png?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1692601590;1865401590&q-key-time=1692601590;1865401590&q-header-list=&q-url-param-list=&q-signature=ca04ca92d990d94813029c0d9ef29537e5f4637c"
+ init_image = BytesIO(requests.get(init_image_url).content)
+ init_image = Image.open(init_image).convert('RGB')
+ init_image = init_image.resize((width, height), Image.Resampling.LANCZOS)
+ guess_mode = False
+ strength = 0.8
+
+ # Run inference
+ images = model(prompt, init_image, strength, height, width, steps,
+                guidance_scale, negative_prompt, num_images,
+                generator=generator, controlnet_images=controlnet_images,
+                controlnet_scale=controlnet_scale, controlnet_names=controlnet_names,
+                guess_mode=guess_mode
+                )
+
+ # Save the generated images
+ for i, image in enumerate(images):
+     image.save(f"outputs/res_controlnet_img2img_{i}.png")
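The demo above attaches a single canny ControlNet. As a minimal sketch of the multi-ControlNet usage described in its comments — assuming a second converted ControlNet directory (e.g. `./models/lyrasd_depth`) and a matching preprocessed control image exist, neither of which ships with this repo — the per-ControlNet lists are simply extended in lockstep:

```python
# Hypothetical second ControlNet; the directory and control image are assumptions.
model.load_controlnet_model("depth", "./models/lyrasd_depth", "fp32")
depth_img = Image.open("control_bird_depth.png")

# One outer entry per ControlNet, matched by index with controlnet_scale and
# controlnet_names; each inner list follows the batch size.
controlnet_images = [[control_img], [depth_img]]
controlnet_scale = [0.5, 0.7]
controlnet_names = ["canny", "depth"]

images = model(prompt, init_image, strength, height, width, steps,
               guidance_scale, negative_prompt, num_images,
               generator=generator, controlnet_images=controlnet_images,
               controlnet_scale=controlnet_scale, controlnet_names=controlnet_names,
               guess_mode=guess_mode)
```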
controlnet_txt2img_demo.py ADDED
@@ -0,0 +1,63 @@
+ import torch
+ import time
+ from PIL import Image
+
+ from lyrasd_model import LyraSdControlnetTxt2ImgPipeline
+
+ # Path that holds the model files; it should contain the following components:
+ # 1. the CLIP model
+ # 2. the converted, optimized UNet model
+ # 3. the converted, optimized ControlNet model
+ # 4. the VAE model
+ # 5. the scheduler config
+
+ # LyraSD's compiled C++ shared library, which contains the C++/CUDA compute internals
+ lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
+ model_path = "./models/lyrasd_rev_animated"
+ canny_controlnet_path = "./models/lyrasd_canny"
+ # Build the ControlNet Txt2Img pipeline
+ pipe = LyraSdControlnetTxt2ImgPipeline(model_path, lib_path)
+
+ # Load a ControlNet model (at most 3 can be loaded at once)
+ start = time.perf_counter()
+ pipe.load_controlnet_model("canny", canny_controlnet_path, "fp32")
+ print(f"controlnet load cost: {time.perf_counter() - start}")
+ # get_loaded_controlnet returns the list of ControlNets that are currently loaded
+ print(pipe.get_loaded_controlnet())
+
+ # unload_controlnet_model unloads a ControlNet by name
+ # pipe.unload_controlnet_model("canny")
+
+ control_img = Image.open("control_bird_canny.png")
+
+ # Prepare the inputs and hyperparameters
+ prompt = "a blue bird"
+ negative_prompt = "NSFW"
+ height, width = 512, 512
+ steps = 20
+ guidance_scale = 7.5
+ generator = torch.Generator().manual_seed(123)
+ num_images = 1
+ guess_mode = False
+
+ # Up to 3 ControlNets can be used at once for a multi-ControlNet effect; the lengths of these parameters must line up.
+ # The outer length of the ControlNet image list must match the lengths of controlnet_scale and controlnet_names,
+ # while each inner list length must match the batch size; entries are matched by index.
+ controlnet_images = [[control_img]]
+ controlnet_scale = [0.5]
+ controlnet_names = ['canny']
+
+ # Run inference; the results are returned as PIL.Image objects
+
+ start = time.perf_counter()
+ images = pipe(prompt, height, width, steps,
+               guidance_scale, negative_prompt, num_images,
+               generator=generator, controlnet_images=controlnet_images,
+               controlnet_scale=controlnet_scale, controlnet_names=controlnet_names,
+               guess_mode=guess_mode
+               )
+ print("cur cost: ", time.perf_counter() - start)
+
+ # Save the generated images
+ for i, image in enumerate(images):
+     image.save(f"./outputs/res_controlnet_txt2img_{i}.png")
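Because ControlNets are attached to the pipeline by name at runtime, they can also be released and re-attached on the same pipeline instance. A minimal sketch reusing the objects from the demo above:

```python
# Release the canny ControlNet when it is no longer needed ...
pipe.unload_controlnet_model("canny")
print(pipe.get_loaded_controlnet())  # 'canny' should no longer be listed

# ... and attach it again later without rebuilding the pipeline.
pipe.load_controlnet_model("canny", canny_controlnet_path, "fp32")
print(pipe.get_loaded_controlnet())
```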
demo.py DELETED
@@ -1,12 +0,0 @@
- from lyraSD import LyraSD
-
- t2imodel = LyraSD("text2img", "./sd1.4-engine")
- t2imodel.inference(prompt="A fantasy landscape, trending on artstation", use_super=True)
-
-
- from PIL import Image
- i2imodel = LyraSD("img2img", "./sd1.4-engine")
- demo_img = Image.open("output/img2img_input.jpg")
- i2imodel.inference(prompt="A fantasy landscape, trending on artstation",
-                    image=demo_img, use_super=True)
-
img2img_demo.py ADDED
@@ -0,0 +1,47 @@
+ from io import BytesIO
+
+ import requests
+ import torch
+ from PIL import Image
+
+ from lyrasd_model import LyraSDImg2ImgPipeline
+
+ # Path that holds the model files; it should contain the following components:
+ # 1. the CLIP model
+ # 2. the converted, optimized UNet model
+ # 3. the VAE model
+ # 4. the scheduler config
+
+ # LyraSD's compiled C++ shared library, which contains the C++/CUDA compute internals
+ lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
+ model_path = "./models/lyrasd_rev_animated"
+
+ # Build the Img2Img pipeline
+ model = LyraSDImg2ImgPipeline(model_path, lib_path)
+
+ # Prepare the inputs and hyperparameters
+ prompt = "a cat, cartoon style"
+ negative_prompt = "NSFW"
+ height, width = 512, 512
+ steps = 20
+ guidance_scale = 7.5
+ generator = torch.Generator().manual_seed(123)
+ num_images = 1
+
+ # Fetch an image from COS to use as the init image
+ init_image_url = "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/seaside_town.png?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1692601590;1865401590&q-key-time=1692601590;1865401590&q-header-list=&q-url-param-list=&q-signature=ca04ca92d990d94813029c0d9ef29537e5f4637c"
+ init_image = BytesIO(requests.get(init_image_url).content)
+ init_image = Image.open(init_image).convert('RGB')
+ init_image = init_image.resize((width, height), Image.Resampling.LANCZOS)
+
+ strength = 0.8
+
+ # Run inference
+ images = model(prompt, init_image, strength, steps,
+                guidance_scale, negative_prompt, num_images,
+                generator=generator
+                )
+
+ # Save the generated images
+ for i, image in enumerate(images):
+     image.save(f"./outputs/res_img2img_{i}.png")
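The `strength` argument controls how far the result may drift from the init image: lower values stay closer to the input, higher values give the prompt more influence. A small sketch of sweeping it, reusing the pipeline and inputs from the demo above (the output file names are hypothetical):

```python
# Sweep strength; lower values preserve more of the init image.
for s in (0.3, 0.6, 0.9):
    out = model(prompt, init_image, s, steps,
                guidance_scale, negative_prompt, num_images,
                generator=torch.Generator().manual_seed(123))
    out[0].save(f"./outputs/res_img2img_strength_{s}.png")
```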
lyraSD/__init__.py DELETED
@@ -1 +0,0 @@
- from .inference import LyraSD
lyraSD/inference.py DELETED
@@ -1,85 +0,0 @@
1
- import os
2
- from PIL import Image
3
- from .muse_trt import TRTStableDiffusionText2ImgPipeline
4
- from .muse_trt import TRTStableDiffusionImg2ImgPipeline
5
- import numpy as np
6
-
7
-
8
- class LyraSD(object):
9
- def __init__(self, sd_mode, engine_dir,o_height=512, o_width=512, device="cuda:0"):
10
- self.sd_mode = sd_mode
11
- self.device = device
12
- self.o_height = o_height
13
- self.o_width = o_width
14
- if self.sd_mode == "text2img":
15
- self.pipeline = TRTStableDiffusionText2ImgPipeline(
16
- engine_dir = engine_dir,
17
- o_height = o_height,
18
- o_width = o_width,
19
- device=device
20
- )
21
- elif self.sd_mode == "img2img":
22
- self.pipeline = TRTStableDiffusionImg2ImgPipeline(
23
- engine_dir = engine_dir,
24
- o_height = o_height,
25
- o_width = o_width,
26
- device=device
27
- )
28
- else:
29
- raise ValueError("Invalid sd_mode: {}".format(self.sd_mode))
30
-
31
-
32
-
33
- def inference(self, prompt,
34
- image=None,
35
- save_dir="./output",
36
- save_basename="sd-",
37
- negative_prompts='',
38
- strength=0.8,
39
- height=None,
40
- width =None,
41
- num_images_per_prompt=1,
42
- num_inference_steps=50,
43
- guidance_scale=7.5,
44
- use_super=False,
45
- ):
46
-
47
- if self.sd_mode=="text2img" and prompt is None:
48
- raise ValueError("prompt must be set on text2img mode")
49
-
50
- if self.sd_mode=="img2img" and image is None:
51
- raise ValueError("image must be set on img2img mode")
52
-
53
- save_basename += f"{self.sd_mode}"
54
- if height is None:
55
- height = self.o_height
56
- if width is None:
57
- width = self.o_width
58
-
59
- if self.sd_mode=="text2img":
60
- result_image = self.pipeline(prompt=prompt, negative_prompt=negative_prompts,
61
- num_inference_steps= num_inference_steps,
62
- num_images_per_prompt=num_images_per_prompt,
63
- guidance_scale=guidance_scale,
64
- height=height,
65
- width=width,
66
- use_super=use_super)
67
- elif self.sd_mode=="img2img":
68
- result_image = self.pipeline(prompt=prompt,
69
- image=image,
70
- negative_prompt=negative_prompts,
71
- strength = strength,
72
- num_inference_steps= num_inference_steps,
73
- num_images_per_prompt=num_images_per_prompt,
74
- guidance_scale=guidance_scale,
75
- height=height,
76
- width=width,
77
- use_super=use_super)
78
-
79
-
80
- for i in range(result_image.shape[0]):
81
- result_image = Image.fromarray(result_image[0])
82
- result_image = result_image.resize((512, 512))
83
- result_image.save(os.path.join(save_dir, save_basename + "-{}.jpg".format(i)))
84
-
85
- return result_image
lyraSD/muse_trt/__init__.py DELETED
@@ -1,10 +0,0 @@
- import ctypes
- import os
-
- current_workdir = os.path.dirname(__file__)
-
- ctypes.cdll.LoadLibrary(os.path.join(current_workdir, "libnvinfer_plugin.so"))
-
- from .sd_img2img import TRTStableDiffusionImg2ImgPipeline
- from .sd_text2img import TRTStableDiffusionText2ImgPipeline
- from .super import SuperX4TRTInfer
lyraSD/muse_trt/models.py DELETED
@@ -1,149 +0,0 @@
1
- r"""models components"""
2
- from collections import OrderedDict
3
- from copy import deepcopy
4
- from typing import Any, Dict, Optional, Union
5
-
6
- import numpy as np
7
- import torch
8
- from cuda import cudart
9
- from diffusers import ControlNetModel
10
- from diffusers.models import AutoencoderKL, UNet2DConditionModel
11
- from torch import nn
12
- from torch.nn import functional as F
13
- from transformers import CLIPTextModel
14
-
15
-
16
- class BaseModel():
17
- def __init__(
18
- self,
19
- local_model_path=None,
20
- hf_token=None,
21
- text_maxlen=77,
22
- embedding_dim=768,
23
- fp16=False,
24
- device='cuda',
25
- verbose=True,
26
- max_batch_size=16
27
- ):
28
- self.fp16 = fp16
29
- self.device = device
30
- self.verbose = verbose
31
- self.hf_token = hf_token
32
- self.local_model_path = local_model_path
33
-
34
- # Defaults
35
- self.text_maxlen = text_maxlen
36
- self.embedding_dim = embedding_dim
37
- self.min_batch = 1
38
- self.max_batch = max_batch_size
39
- self.min_latent_shape = 256 // 8 # min image resolution: 256x256
40
- self.max_latent_shape = 1024 // 8 # max image resolution: 1024x1024
41
-
42
- def get_model(self):
43
- pass
44
-
45
- def get_shape_dict(self, batch_size, image_height, image_width):
46
- return None
47
-
48
- def check_dims(self, batch_size, image_height, image_width):
49
- assert batch_size >= self.min_batch and batch_size <= self.max_batch
50
- assert image_height % 8 == 0 or image_width % 8 == 0
51
- latent_height = image_height // 8
52
- latent_width = image_width // 8
53
- assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape
54
- assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape
55
- return (latent_height, latent_width)
56
-
57
-
58
- class CLIP(BaseModel):
59
- def get_model(self):
60
- if self.hf_token is None and self.local_model_path is not None:
61
- clip_model = CLIPTextModel.from_pretrained(
62
- self.local_model_path, subfolder="text_encoder").to(self.device)
63
- else:
64
- clip_model = CLIPTextModel.from_pretrained(
65
- "openai/clip-vit-large-patch14").to(self.device)
66
- return clip_model
67
-
68
- def get_shape_dict(self, batch_size, image_height, image_width):
69
- self.check_dims(batch_size, image_height, image_width)
70
- return {
71
- 'input_ids': (batch_size, self.text_maxlen),
72
- 'text_embeddings': (batch_size, self.text_maxlen, self.embedding_dim)
73
- }
74
-
75
-
76
- class UNet(BaseModel):
77
- def get_model(self):
78
- model_opts = {'revision': 'fp16',
79
- 'torch_dtype': torch.float16} if self.fp16 else {}
80
- print(model_opts)
81
- if self.hf_token is None and self.local_model_path is not None:
82
- unet_model = UNet2DConditionModel.from_pretrained(
83
- self.local_model_path, subfolder="unet",
84
- **model_opts
85
- ).to(self.device)
86
- else:
87
- unet_model = UNet2DConditionModel.from_pretrained(
88
- "CompVis/stable-diffusion-v1-4",
89
- subfolder="unet",
90
- use_auth_token=self.hf_token,
91
- **model_opts).to(self.device)
92
- return unet_model
93
-
94
- def get_shape_dict(self, batch_size, image_height, image_width):
95
- latent_height, latent_width = self.check_dims(
96
- batch_size, image_height, image_width)
97
- return {
98
- 'sample': (2*batch_size, 4, latent_height, latent_width),
99
- 'encoder_hidden_states': (2*batch_size, self.text_maxlen, self.embedding_dim),
100
- 'latent': (2*batch_size, 4, latent_height, latent_width)
101
- }
102
-
103
- class VAEEncoderModule(nn.Module):
104
- def __init__(self, local_model_path, device) -> None:
105
- super().__init__()
106
- self.vae = AutoencoderKL.from_pretrained(
107
- local_model_path, subfolder="vae"
108
- ).to(device)
109
-
110
- def forward(self, x):
111
- h = self.vae.encoder(x)
112
- moments = self.vae.quant_conv(h)
113
- return moments
114
-
115
-
116
- class VAEEncoder(BaseModel):
117
- def get_model(self):
118
- vae_encoder = VAEEncoderModule(self.local_model_path, self.device)
119
- return vae_encoder
120
- def get_shape_dict(self, batch_size, image_height, image_width):
121
- image_height, image_width = self.check_dims(
122
- batch_size, image_height, image_width)
123
- return {
124
- 'images': (batch_size, 3, image_height, image_width),
125
- 'latent': (batch_size, 8, image_height//8, image_width//8),
126
- }
127
-
128
-
129
- class VAEDecoder(BaseModel):
130
- def get_model(self):
131
- if self.hf_token is None and self.local_model_path is not None:
132
- vae = AutoencoderKL.from_pretrained(
133
- self.local_model_path, subfolder="vae"
134
- ).to(self.device)
135
- else:
136
- vae = AutoencoderKL.from_pretrained(
137
- "CompVis/stable-diffusion-v1-4",
138
- subfolder="vae",
139
- use_auth_token=self.hf_token).to(self.device)
140
- vae.forward = vae.decode
141
- return vae
142
-
143
- def get_shape_dict(self, batch_size, image_height, image_width):
144
- latent_height, latent_width = self.check_dims(
145
- batch_size, image_height, image_width)
146
- return {
147
- 'latent': (batch_size, 4, latent_height, latent_width),
148
- 'images': (batch_size, 3, image_height, image_width)
149
- }
lyraSD/muse_trt/sd_img2img.py DELETED
@@ -1,368 +0,0 @@
1
- r"""
2
- StableDiffusion Img2Img Pipeline by TensorRT.
3
- It has included SuperResolutionX4 TensorRT Engine.
4
-
5
- Inspired by: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
6
- https://developer.nvidia.com/tensorrt
7
- """
8
-
9
- import inspect
10
- import os
11
- from typing import List, Optional, Union
12
-
13
- import numpy as np
14
- import PIL.Image
15
- import tensorrt as trt
16
- import torch
17
- import time
18
- from diffusers import AutoencoderKL
19
- from diffusers.schedulers import DPMSolverMultistepScheduler
20
- from diffusers.models.vae import DiagonalGaussianDistribution
21
- from diffusers.utils import PIL_INTERPOLATION, randn_tensor
22
- from polygraphy import cuda
23
- from transformers import CLIPTokenizer
24
-
25
- from .models import CLIP, UNet, VAEDecoder, VAEEncoder
26
- from .super import SuperX4TRTInfer
27
- from .utilities import TRT_LOGGER, Engine
28
-
29
-
30
- def preprocess(image):
31
- if isinstance(image, torch.Tensor):
32
- return image
33
- elif isinstance(image, PIL.Image.Image):
34
- image = [image]
35
-
36
- if isinstance(image[0], PIL.Image.Image):
37
- w, h = image[0].size
38
- w, h = map(lambda x: x - x % 8, (w, h)) # resize to integer multiple of 8
39
-
40
- image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
41
- image = np.concatenate(image, axis=0)
42
- image = np.array(image).astype(np.float32) / 255.0
43
- image = image.transpose(0, 3, 1, 2)
44
- image = 2.0 * image - 1.0
45
- image = torch.from_numpy(image)
46
- elif isinstance(image[0], torch.Tensor):
47
- image = torch.cat(image, dim=0)
48
- return image
49
-
50
-
51
- class TRTStableDiffusionImg2ImgPipeline:
52
- def __init__(self, engine_dir: str, o_height: int = 1300, o_width: int = 750, device: str = 'cuda:0'):
53
- self.device = torch.device(device)
54
- super().__init__()
55
- self.vae = AutoencoderKL.from_pretrained(
56
- os.path.join(engine_dir, 'vae'),
57
- torch_dtype=torch.float16
58
- ).to(self.device)
59
-
60
- self.tokenizer = CLIPTokenizer.from_pretrained(
61
- os.path.join(engine_dir, 'tokenizer')
62
- )
63
- self.scheduler = DPMSolverMultistepScheduler.from_pretrained(
64
- os.path.join(engine_dir, 'scheduler')
65
- )
66
-
67
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
68
- self.trt_torch_models_cls = {
69
- 'clip': CLIP(),
70
- 'unet_fp16': UNet(),
71
- #'vae-encoder': VAEEncoder(),
72
- 'vae-decoder': VAEDecoder()
73
- }
74
-
75
- self.engine = {}
76
- # Build engines
77
- for model_name, _ in self.trt_torch_models_cls.items():
78
- engine = Engine(model_name, engine_dir)
79
- self.engine[model_name] = engine
80
- # Separate iteration to activate engines
81
- for model_name, _ in self.trt_torch_models_cls.items():
82
- self.engine[model_name].activate()
83
- self.stream = cuda.Stream()
84
-
85
- self.super = SuperX4TRTInfer(
86
- engine_dir,
87
- model_name='superx4-512-512.plan',
88
- fp16=True,
89
- )
90
-
91
- def runEngine(self, model_name, feed_dict):
92
- engine = self.engine[model_name]
93
- return engine.infer(feed_dict, self.stream)
94
-
95
- def _torch_decode_latents(self, latents):
96
- latents = 1 / self.vae.config.scaling_factor * latents
97
- image = self.vae.decode(latents).sample
98
- image = (image / 2 + 0.5).clamp(0, 1)
99
- # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
100
- image = image.cpu().permute(0, 2, 3, 1).float().numpy()
101
- image = (image * 255).round()
102
- return image
103
-
104
- def _trt_decode_latents(self, latents):
105
- latents = 1 / self.vae.config.scaling_factor * latents
106
- sample_inp = cuda.DeviceView(
107
- ptr=latents.data_ptr(), shape=latents.shape, dtype=np.float32)
108
- image = self.runEngine('vae-decoder', {"latent": sample_inp})['images']
109
- image = (image / 2 + 0.5).clamp(0, 1)
110
- # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
111
- image = image.cpu().permute(0, 2, 3, 1).float().numpy()
112
- image = (image * 255).round()
113
-
114
- return image
115
-
116
- def prepare_extra_step_kwargs(self, generator, eta):
117
- # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
118
- # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
119
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
120
- # and should be between [0, 1]
121
-
122
- accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
123
- extra_step_kwargs = {}
124
- if accepts_eta:
125
- extra_step_kwargs["eta"] = eta
126
-
127
- # check if the scheduler accepts generator
128
- accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
129
- if accepts_generator:
130
- extra_step_kwargs["generator"] = generator
131
- return extra_step_kwargs
132
-
133
- def get_timesteps(self, num_inference_steps, strength, device):
134
- # get the original timestep using init_timestep
135
- init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
136
-
137
- t_start = max(num_inference_steps - init_timestep, 0)
138
- timesteps = self.scheduler.timesteps[t_start:]
139
-
140
- return timesteps, num_inference_steps - t_start
141
-
142
- def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
143
- if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
144
- raise ValueError(
145
- f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
146
- )
147
-
148
- image = image.to(device=device, dtype=dtype)
149
-
150
- batch_size = batch_size * num_images_per_prompt
151
- if isinstance(generator, list) and len(generator) != batch_size:
152
- raise ValueError(
153
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
154
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
155
- )
156
-
157
- if isinstance(generator, list):
158
- init_latents = [
159
- self.vae.encode(image[i: i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
160
- ]
161
- init_latents = torch.cat(init_latents, dim=0)
162
- else:
163
- init_latents = self.vae.encode(image).latent_dist.sample(generator)
164
-
165
- init_latents = self.vae.config.scaling_factor * init_latents
166
-
167
- if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
168
- raise ValueError(
169
- f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
170
- )
171
- else:
172
- init_latents = torch.cat([init_latents], dim=0)
173
-
174
- shape = init_latents.shape
175
- noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
176
-
177
- # get latents
178
- init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
179
- latents = init_latents
180
-
181
- return latents
182
-
183
- def _default_height_width(self, height, width, image):
184
- if isinstance(image, list):
185
- image = image[0]
186
-
187
- if height is None:
188
- if isinstance(image, PIL.Image.Image):
189
- height = image.height
190
- elif isinstance(image, torch.Tensor):
191
- height = image.shape[3]
192
-
193
- height = (height // 8) * 8 # round down to nearest multiple of 8
194
-
195
- if width is None:
196
- if isinstance(image, PIL.Image.Image):
197
- width = image.width
198
- elif isinstance(image, torch.Tensor):
199
- width = image.shape[2]
200
-
201
- width = (width // 8) * 8 # round down to nearest multiple of 8
202
-
203
- return height, width
204
-
205
- def _trt_encode_prompt(self, prompt, negative_prompt, num_images_per_prompt,):
206
- # Tokenize input
207
- text_input_ids = self.tokenizer(
208
- prompt,
209
- padding="max_length",
210
- max_length=self.tokenizer.model_max_length,
211
- return_tensors="pt",
212
- ).input_ids.type(torch.int32).to(self.device)
213
-
214
- # CLIP text encoder
215
- text_input_ids_inp = cuda.DeviceView(
216
- ptr=text_input_ids.data_ptr(), shape=text_input_ids.shape, dtype=np.int32
217
- )
218
- text_embeddings = self.runEngine('clip', {"input_ids": text_input_ids_inp})['text_embeddings']
219
-
220
- # Duplicate text embeddings for each generation per prompt
221
- bs_embed, seq_len, _ = text_embeddings.shape
222
- text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
223
- text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
224
-
225
- max_length = text_input_ids.shape[-1]
226
- uncond_input_ids = self.tokenizer(
227
- negative_prompt,
228
- padding="max_length",
229
- max_length=max_length,
230
- truncation=True,
231
- return_tensors="pt",
232
- ).input_ids.type(torch.int32).to(self.device)
233
- uncond_input_ids_inp = cuda.DeviceView(
234
- ptr=uncond_input_ids.data_ptr(), shape=uncond_input_ids.shape, dtype=np.int32)
235
- uncond_embeddings = self.runEngine('clip', {"input_ids": uncond_input_ids_inp})['text_embeddings']
236
-
237
- # Duplicate unconditional embeddings for each generation per prompt
238
- seq_len = uncond_embeddings.shape[1]
239
- uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
240
- uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
241
-
242
- # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
243
- text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
244
- text_embeddings = text_embeddings.to(dtype=torch.float16)
245
-
246
- return text_embeddings
247
-
248
- @torch.no_grad()
249
- def __call__(
250
- self,
251
- prompt: Union[str, List[str]] = None,
252
- image: Union[torch.Tensor, PIL.Image.Image] = None,
253
- strength: float = 0.8,
254
- height: Optional[int] = None,
255
- width: Optional[int] = None,
256
- num_inference_steps: int = 50,
257
- guidance_scale: float = 7.5,
258
- negative_prompt: Optional[Union[str, List[str]]] = None,
259
- num_images_per_prompt: Optional[int] = 1,
260
- eta: float = 0.0,
261
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
262
- latents: Optional[torch.FloatTensor] = None,
263
- prompt_embeds: Optional[torch.FloatTensor] = None,
264
- use_super: bool = True,
265
- ):
266
- # 1. Default height and width to unet
267
- height, width = self._default_height_width(height, width, image)
268
-
269
- # 2. Define call parameters and Allocate the cuda buffers for TRT Engine bindings.
270
- if prompt is not None and isinstance(prompt, str):
271
- batch_size = 1
272
- elif prompt is not None and isinstance(prompt, list):
273
- batch_size = len(prompt)
274
- else:
275
- batch_size = prompt_embeds.shape[0]
276
-
277
- # Allocate buffers for TensorRT engine bindings
278
- for model_name, obj in self.trt_torch_models_cls.items():
279
- self.engine[model_name].allocate_buffers(
280
- shape_dict=obj.get_shape_dict(batch_size, height, width),
281
- device=self.device
282
- )
283
-
284
- do_classifier_free_guidance = guidance_scale > 1.0
285
-
286
- with trt.Runtime(TRT_LOGGER) as runtime:
287
- torch.cuda.synchronize()
288
-
289
- # 3. Encode input prompt. TRT Clip model.
290
- prompt_embeds = self._trt_encode_prompt(
291
- prompt, negative_prompt, num_images_per_prompt
292
- )
293
-
294
- # 4. Prepare mask, image, and controlnet_conditioning_image
295
- image = preprocess(image)
296
-
297
- # 5. Prepare timesteps.
298
- self.scheduler.set_timesteps(num_inference_steps, device=self.device)
299
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, self.device)
300
- latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
301
-
302
- # 6. Prepare latent variables. It will use VAE-Enoder(currently the encoder is torch model, not trt)
303
- latents = self.prepare_latents(
304
- image,
305
- latent_timestep,
306
- batch_size,
307
- num_images_per_prompt,
308
- prompt_embeds.dtype,
309
- self.device,
310
- generator,
311
- )
312
-
313
- # 7. Prepare extra step kwargs and Set lantens/controlnet_conditioning_image/prompt_embeds to special dtype.
314
- # The dytpe must be equal to the following to ensure that the NAN can not be issued in trt engine.
315
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
316
- latents = latents.to(dtype=torch.float32)
317
- prompt_embeds = prompt_embeds.to(dtype=torch.float16)
318
-
319
- # 8. Denoising loop
320
- for i, t in enumerate(timesteps):
321
- # expand the latents if we are doing classifier free guidance
322
- latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
323
-
324
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
325
-
326
- # predict the noise residual
327
-
328
- dtype = np.float16
329
- if t.dtype != torch.float32:
330
- timestep_float = t.float()
331
- else:
332
- timestep_float = t
333
-
334
- sample_inp = cuda.DeviceView(
335
- ptr=latent_model_input.data_ptr(), shape=latent_model_input.shape, dtype=np.float32
336
- )
337
- timestep_inp = cuda.DeviceView(
338
- ptr=timestep_float.data_ptr(), shape=timestep_float.shape, dtype=np.float32
339
- )
340
- embeddings_inp = cuda.DeviceView(
341
- ptr=prompt_embeds.data_ptr(), shape=prompt_embeds.shape, dtype=dtype
342
- )
343
-
344
- noise_pred = self.engine['unet_fp16'].infer(
345
- {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp},
346
- self.stream)['latent']
347
- # perform guidance
348
- if do_classifier_free_guidance:
349
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
350
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
351
-
352
- # compute the previous noisy sample x_t -> x_t-1
353
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
354
-
355
- # 9. Use VAE-Decoder to decode the latents
356
- image = self._trt_decode_latents(latents)
357
-
358
- # 10. SuperX4 Resolution, Optional.
359
- if use_super:
360
- image = np.ascontiguousarray(np.transpose(image, (0, 3, 1, 2))).astype(np.float16)
361
- #image = self.super.infer(np.transpose(image.astype(np.float16), (0, 3, 1, 2)))
362
- image = self.super.infer(image)
363
- image = np.uint8(np.transpose(image, (0, 2, 3, 1)))
364
- else:
365
- image = np.uint8(image)
366
-
367
- return image
-
lyraSD/muse_trt/sd_text2img.py DELETED
@@ -1,292 +0,0 @@
1
- r"""
2
- StableDiffusion Text2Img Pipeline by TensorRT.
3
- It has included SuperResolutionX4 TensorRT Engine.
4
-
5
- Inspired by: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
6
- https://developer.nvidia.com/tensorrt
7
- """
8
-
9
- import inspect
10
- import os
11
- from typing import List, Optional, Union
12
-
13
- import numpy as np
14
- import tensorrt as trt
15
- import torch
16
- from diffusers import AutoencoderKL
17
- from diffusers.schedulers import DPMSolverMultistepScheduler
18
- from diffusers.utils import PIL_INTERPOLATION, randn_tensor
19
- from polygraphy import cuda
20
- from transformers import CLIPTokenizer
21
-
22
- from .models import CLIP, UNet, VAEDecoder
23
- from .super import SuperX4TRTInfer
24
- from .utilities import TRT_LOGGER, Engine
25
-
26
-
27
- class TRTStableDiffusionText2ImgPipeline:
28
- def __init__(self, engine_dir: str, o_height: int = 512, o_width: int = 512, device: str = 'cuda:0'):
29
- self.device = torch.device(device)
30
- super().__init__()
31
- self.vae = AutoencoderKL.from_pretrained(
32
- os.path.join(engine_dir, 'vae'),
33
- torch_dtype=torch.float16
34
- ).to(self.device)
35
-
36
- self.tokenizer = CLIPTokenizer.from_pretrained(
37
- os.path.join(engine_dir, 'tokenizer')
38
- )
39
- self.scheduler = DPMSolverMultistepScheduler.from_pretrained(
40
- os.path.join(engine_dir, 'scheduler')
41
- )
42
-
43
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
44
- self.trt_torch_models_cls = {
45
- 'clip': CLIP(),
46
- 'unet_fp16': UNet(),
47
- 'vae-decoder': VAEDecoder()
48
- }
49
-
50
- self.engine = {}
51
- # Build engines
52
- for model_name, _ in self.trt_torch_models_cls.items():
53
- engine = Engine(model_name, engine_dir)
54
- self.engine[model_name] = engine
55
- # Separate iteration to activate engines
56
- for model_name, _ in self.trt_torch_models_cls.items():
57
- self.engine[model_name].activate()
58
- self.stream = cuda.Stream()
59
-
60
- self.super = SuperX4TRTInfer(
61
- engine_dir=engine_dir,
62
- model_name='superx4-512-512.plan',
63
- fp16=True
64
- )
65
-
66
- def runEngine(self, model_name, feed_dict):
67
- engine = self.engine[model_name]
68
- return engine.infer(feed_dict, self.stream)
69
-
70
- def _torch_decode_latents(self, latents):
71
- latents = 1 / self.vae.config.scaling_factor * latents
72
- image = self.vae.decode(latents).sample
73
- image = (image / 2 + 0.5).clamp(0, 1)
74
- # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
75
- image = image.cpu().permute(0, 2, 3, 1).float().numpy()
76
- image = (image * 255).round()
77
- return image
78
-
79
- def _trt_decode_latents(self, latents):
80
- latents = 1 / self.vae.config.scaling_factor * latents
81
- sample_inp = cuda.DeviceView(
82
- ptr=latents.data_ptr(), shape=latents.shape, dtype=np.float32)
83
- image = self.runEngine('vae-decoder', {"latent": sample_inp})['images']
84
- image = (image / 2 + 0.5).clamp(0, 1)
85
- # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
86
- image = image.cpu().permute(0, 2, 3, 1).float().numpy()
87
- image = (image * 255).round()
88
-
89
- return image
90
-
91
- def prepare_extra_step_kwargs(self, generator, eta):
92
- # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
93
- # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
94
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
95
- # and should be between [0, 1]
96
-
97
- accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
98
- extra_step_kwargs = {}
99
- if accepts_eta:
100
- extra_step_kwargs["eta"] = eta
101
-
102
- # check if the scheduler accepts generator
103
- accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
104
- if accepts_generator:
105
- extra_step_kwargs["generator"] = generator
106
- return extra_step_kwargs
107
-
108
- def get_timesteps(self, num_inference_steps, strength, device):
109
- # get the original timestep using init_timestep
110
- init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
111
-
112
- t_start = max(num_inference_steps - init_timestep, 0)
113
- timesteps = self.scheduler.timesteps[t_start:]
114
-
115
- return timesteps, num_inference_steps - t_start
116
-
117
- def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
118
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
119
- if isinstance(generator, list) and len(generator) != batch_size:
120
- raise ValueError(
121
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
122
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
123
- )
124
-
125
- if latents is None:
126
- latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
127
- else:
128
- latents = latents.to(device)
129
-
130
- # scale the initial noise by the standard deviation required by the scheduler
131
- latents = latents * self.scheduler.init_noise_sigma
132
- return latents
133
-
134
- def _trt_encode_prompt(self, prompt, negative_prompt, num_images_per_prompt,):
135
- # Tokenize input
136
- text_input_ids = self.tokenizer(
137
- prompt,
138
- padding="max_length",
139
- max_length=self.tokenizer.model_max_length,
140
- return_tensors="pt",
141
- ).input_ids.type(torch.int32).to(self.device)
142
-
143
- # CLIP text encoder
144
- text_input_ids_inp = cuda.DeviceView(
145
- ptr=text_input_ids.data_ptr(), shape=text_input_ids.shape, dtype=np.int32
146
- )
147
- text_embeddings = self.runEngine('clip', {"input_ids": text_input_ids_inp})['text_embeddings']
148
-
149
- # Duplicate text embeddings for each generation per prompt
150
- bs_embed, seq_len, _ = text_embeddings.shape
151
- text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
152
- text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
153
-
154
- max_length = text_input_ids.shape[-1]
155
- uncond_input_ids = self.tokenizer(
156
- negative_prompt,
157
- padding="max_length",
158
- max_length=max_length,
159
- truncation=True,
160
- return_tensors="pt",
161
- ).input_ids.type(torch.int32).to(self.device)
162
- uncond_input_ids_inp = cuda.DeviceView(
163
- ptr=uncond_input_ids.data_ptr(), shape=uncond_input_ids.shape, dtype=np.int32)
164
- uncond_embeddings = self.runEngine('clip', {"input_ids": uncond_input_ids_inp})['text_embeddings']
165
-
166
- # Duplicate unconditional embeddings for each generation per prompt
167
- seq_len = uncond_embeddings.shape[1]
168
- uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
169
- uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
170
-
171
- # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
172
- text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
173
- text_embeddings = text_embeddings.to(dtype=torch.float16)
174
-
175
- return text_embeddings
176
-
177
- @torch.no_grad()
178
- def __call__(
179
- self,
180
- prompt: Union[str, List[str]] = None,
181
- height: Optional[int] = None,
182
- width: Optional[int] = None,
183
- num_inference_steps: int = 50,
184
- guidance_scale: float = 7.5,
185
- negative_prompt: Optional[Union[str, List[str]]] = None,
186
- num_images_per_prompt: Optional[int] = 1,
187
- eta: float = 0.0,
188
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
189
- latents: Optional[torch.FloatTensor] = None,
190
- prompt_embeds: Optional[torch.FloatTensor] = None,
191
- use_super: bool = True,
192
- ):
193
- # 1. Default height and width to unet
194
- assert height is not None, "height can not be None"
195
- assert width is not None, "width can not be None"
196
-
197
- # 2. Define call parameters and Allocate the cuda buffers for TRT Engine bindings.
198
- if prompt is not None and isinstance(prompt, str):
199
- batch_size = 1
200
- elif prompt is not None and isinstance(prompt, list):
201
- batch_size = len(prompt)
202
- else:
203
- batch_size = prompt_embeds.shape[0]
204
-
205
- # Allocate buffers for TensorRT engine bindings
206
- for model_name, obj in self.trt_torch_models_cls.items():
207
- self.engine[model_name].allocate_buffers(
208
- shape_dict=obj.get_shape_dict(batch_size, height, width),
209
- device=self.device
210
- )
211
-
212
- do_classifier_free_guidance = guidance_scale > 1.0
213
-
214
- with trt.Runtime(TRT_LOGGER) as runtime:
215
- torch.cuda.synchronize()
216
-
217
- # 3. Encode input prompt. TRT Clip model.
218
- prompt_embeds = self._trt_encode_prompt(
219
- prompt, negative_prompt, num_images_per_prompt
220
- )
221
-
222
- # 4. Prepare timesteps.
223
- self.scheduler.set_timesteps(num_inference_steps, device=self.device)
224
- timesteps = self.scheduler.timesteps
225
-
226
- # 5. Prepare latent variables. It will use VAE-Enoder(currently the encoder is torch model, not trt)
227
- num_channels_latents = 4
228
- latents = self.prepare_latents(
229
- batch_size*num_images_per_prompt,
230
- num_channels_latents,
231
- height,
232
- width,
233
- prompt_embeds.dtype,
234
- self.device,
235
- generator,
236
- latents
237
- )
238
-
239
- # 6. Prepare extra step kwargs and Set lantens/controlnet_conditioning_image/prompt_embeds to special dtype.
240
- # The dytpe must be equal to the following to ensure that the NAN can not be issued in trt engine.
241
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
242
- latents = latents.to(dtype=torch.float32)
243
- prompt_embeds = prompt_embeds.to(dtype=torch.float16)
244
-
245
- # 7. Denoising loop
246
- for i, t in enumerate(timesteps):
247
- # expand the latents if we are doing classifier free guidance
248
- latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
249
-
250
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
251
-
252
- # predict the noise residual
253
-
254
- dtype = np.float16
255
- if t.dtype != torch.float32:
256
- timestep_float = t.float()
257
- else:
258
- timestep_float = t
259
-
260
- sample_inp = cuda.DeviceView(
261
- ptr=latent_model_input.data_ptr(), shape=latent_model_input.shape, dtype=np.float32
262
- )
263
- timestep_inp = cuda.DeviceView(
264
- ptr=timestep_float.data_ptr(), shape=timestep_float.shape, dtype=np.float32
265
- )
266
- embeddings_inp = cuda.DeviceView(
267
- ptr=prompt_embeds.data_ptr(), shape=prompt_embeds.shape, dtype=dtype
268
- )
269
-
270
- noise_pred = self.engine['unet_fp16'].infer(
271
- {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp},
272
- self.stream)['latent']
273
- # perform guidance
274
- if do_classifier_free_guidance:
275
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
276
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
277
-
278
- # compute the previous noisy sample x_t -> x_t-1
279
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
280
-
281
- # 8. Use VAE-Decoder to decode the latents
282
- image = self._trt_decode_latents(latents)
283
-
284
- # 9. SuperX4 Resolution, Optional.
285
- if use_super:
286
- image = np.ascontiguousarray(np.transpose(image, (0, 3, 1, 2))).astype(np.float16)
287
- #image = self.super.infer(np.transpose(image.astype(np.float16), (0, 3, 1, 2)))
288
- image = self.super.infer(image)
289
- image = np.uint8(np.transpose(image, (0, 2, 3, 1)))
290
- else:
291
- image = np.uint8(image)
292
- return image
lyraSD/muse_trt/super.py DELETED
@@ -1,64 +0,0 @@
1
- r"""use tensorrt engine to infer, a useful pipeline"""
2
-
3
- import os
4
-
5
- import numpy as np
6
- from polygraphy import cuda
7
- from polygraphy.backend.common import bytes_from_path
8
- from polygraphy.backend.trt import engine_from_bytes
9
-
10
-
11
- class SuperX4TRTInfer:
12
- def __init__(self, engine_dir,
13
- model_name='superx4.plan',
14
- o_height=None,
15
- o_width=None,
16
- fp16=True,
17
- ) -> None:
18
- engine_path = os.path.join(engine_dir, model_name)
19
- self.engine = engine_from_bytes(bytes_from_path(engine_path))
20
- self.context = self.engine.create_execution_context()
21
-
22
- self.o_height = o_height
23
- self.o_width = o_width
24
- self.fp = fp16
25
- self.dtype = np.float16 if fp16 else np.float32
26
-
27
- self.stream = cuda.Stream()
28
-
29
- def infer(self, x):
30
- batch_size, channel, height, width = x.shape
31
- if self.o_height is None or self.o_width is None:
32
- o_height = height*4
33
- o_width = width*4
34
- else:
35
- o_height = self.o_height
36
- o_width = self.o_width
37
-
38
- h_output = np.empty([batch_size, channel, o_height, o_width], dtype=self.dtype)
39
-
40
- # allocate device memory
41
- d_input = cuda.wrapper().malloc(1 * x.nbytes)
42
- d_output = cuda.wrapper().malloc(1*h_output.nbytes)
43
-
44
- bindings = [int(d_input), int(d_output)]
45
-
46
- # transfer input data to device
47
- cuda.wrapper().memcpy(d_input, x.ctypes.data, x.nbytes, cuda.MemcpyKind.HostToDevice, self.stream.ptr)
48
-
49
- # execute model
50
- noerror = self.context.execute_async_v2(bindings, self.stream.ptr)
51
- if not noerror:
52
- raise ValueError(f"ERROR: inference failed.")
53
-
54
- # transfer predictions back
55
- cuda.wrapper().memcpy(h_output.ctypes.data, d_output, h_output.nbytes, cuda.MemcpyKind.DeviceToHost, self.stream.ptr)
56
- cuda.wrapper().free(d_input)
57
- cuda.wrapper().free(d_output)
58
-
59
- return h_output
60
-
61
- def teardown(self):
62
- del self.engine
63
- self.stream.free()
64
- del self.stream
lyraSD/muse_trt/utilities.py DELETED
@@ -1,538 +0,0 @@
1
- r"""utils components"""
2
-
3
- from collections import OrderedDict
4
- from copy import copy
5
- import numpy as np
6
- import os
7
- import math
8
- from PIL import Image
9
- from polygraphy.backend.common import bytes_from_path
10
- from polygraphy.backend.trt import CreateConfig, Profile
11
- from polygraphy.backend.trt import engine_from_bytes, engine_from_network, network_from_onnx_path, save_engine
12
- from polygraphy.backend.trt import util as trt_util
13
- from polygraphy import cuda
14
- import random
15
- from scipy import integrate
16
- import tensorrt as trt
17
- import torch
18
-
19
- TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
20
-
21
-
22
- class Engine():
23
- def __init__(
24
- self,
25
- model_name,
26
- engine_dir,
27
- memory_pool_size=None
28
- ):
29
- self.engine_path = os.path.join(engine_dir, model_name+'.plan')
30
- self.engine = None
31
- self.context = None
32
- self.buffers = OrderedDict()
33
- self.tensors = OrderedDict()
34
- self.memory_pool_size = memory_pool_size
35
-
36
- def __del__(self):
37
- [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)]
38
- del self.engine
39
- del self.context
40
- del self.buffers
41
- del self.tensors
42
-
43
- def build(self, onnx_path, fp16, input_profile=None, enable_preview=False):
44
- print(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
45
- p = Profile()
46
- if input_profile:
47
- for name, dims in input_profile.items():
48
- assert len(dims) == 3
49
- p.add(name, min=dims[0], opt=dims[1], max=dims[2])
50
-
51
- preview_features = []
52
- if enable_preview:
53
- trt_version = [int(i) for i in trt.__version__.split(".")]
54
- # FASTER_DYNAMIC_SHAPES_0805 should only be used for TRT 8.5.1 or above.
55
- if trt_version[0] > 8 or \
56
- (trt_version[0] == 8 and (trt_version[1] > 5 or (trt_version[1] == 5 and trt_version[2] >= 1))):
57
- preview_features = [trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]
58
-
59
- if self.memory_pool_size is not None:
60
- memory_pool_limits = {trt.MemoryPoolType.WORKSPACE: (self.memory_pool_size*(2 ** 30))}
61
- print(memory_pool_limits)
62
- else:
63
- memory_pool_limits = None
64
- engine = engine_from_network(
65
- network_from_onnx_path(onnx_path),
66
- config=CreateConfig(
67
- fp16=fp16, profiles=[p], preview_features=preview_features, memory_pool_limits=memory_pool_limits
68
- )
69
- )
70
- save_engine(engine, path=self.engine_path)
71
-
72
- def activate(self):
73
- print(f"Loading TensorRT engine: {self.engine_path}")
74
- self.engine = engine_from_bytes(bytes_from_path(self.engine_path))
75
- self.context = self.engine.create_execution_context()
76
-
77
- def allocate_buffers(self, shape_dict=None, device='cuda'):
78
- for idx in range(trt_util.get_bindings_per_profile(self.engine)):
79
- binding = self.engine[idx]
80
- if shape_dict and binding in shape_dict:
81
- shape = shape_dict[binding]
82
- else:
83
- shape = self.engine.get_binding_shape(binding)
84
- dtype = trt_util.np_dtype_from_trt(self.engine.get_binding_dtype(binding))
85
- if self.engine.binding_is_input(binding):
86
- self.context.set_binding_shape(idx, shape)
87
- # Workaround to convert np dtype to torch
88
- np_type_tensor = np.empty(shape=[], dtype=dtype)
89
- torch_type_tensor = torch.from_numpy(np_type_tensor)
90
- tensor = torch.empty(tuple(shape), dtype=torch_type_tensor.dtype).to(device=device)
91
- self.tensors[binding] = tensor
92
- self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype)
93
-
94
- def infer(self, feed_dict, stream):
95
- start_binding, end_binding = trt_util.get_active_profile_bindings(self.context)
96
- # shallow copy of ordered dict
97
- device_buffers = copy(self.buffers)
98
- for name, buf in feed_dict.items():
99
- assert isinstance(buf, cuda.DeviceView)
100
- device_buffers[name] = buf
101
- bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()]
102
- noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr)
103
- if not noerror:
104
- raise ValueError(f"ERROR: inference failed.")
105
-
106
- return self.tensors
107
-
108
-
109
- class LMSDiscreteScheduler():
110
- def __init__(
111
- self,
112
- device='cuda',
113
- beta_start=0.00085,
114
- beta_end=0.012,
115
- num_train_timesteps=1000,
116
- ):
117
- self.num_train_timesteps = num_train_timesteps
118
- self.order = 4
119
-
120
- self.beta_start = beta_start
121
- self.beta_end = beta_end
122
- betas = (torch.linspace(beta_start**0.5, beta_end**0.5, self.num_train_timesteps, dtype=torch.float32) ** 2)
123
- alphas = 1.0 - betas
124
- self.alphas_cumprod = torch.cumprod(alphas, dim=0)
125
-
126
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
127
- sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
128
- self.sigmas = torch.from_numpy(sigmas)
129
-
130
- # standard deviation of the initial noise distribution
131
- self.init_noise_sigma = self.sigmas.max()
132
-
133
- self.device = device
134
-
135
- def set_timesteps(self, steps):
136
- self.num_inference_steps = steps
137
-
138
- timesteps = np.linspace(0, self.num_train_timesteps - 1, steps, dtype=float)[::-1].copy()
139
- sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
140
- sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
141
- sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
142
- self.sigmas = torch.from_numpy(sigmas).to(device=self.device)
143
-
144
- # Move all timesteps to correct device beforehand
145
- self.timesteps = torch.from_numpy(timesteps).to(device=self.device).float()
146
- self.derivatives = []
147
-
148
- def scale_model_input(self, sample: torch.FloatTensor, idx, *args, **kwargs) -> torch.FloatTensor:
149
- return sample * self.latent_scales[idx]
150
-
151
- def configure(self):
152
- order = self.order
153
- self.lms_coeffs = []
154
- self.latent_scales = [1./((sigma**2 + 1) ** 0.5) for sigma in self.sigmas]
155
-
156
- def get_lms_coefficient(order, t, current_order):
157
- """
158
- Compute a linear multistep coefficient.
159
- """
160
- def lms_derivative(tau):
161
- prod = 1.0
162
- for k in range(order):
163
- if current_order == k:
164
- continue
165
- prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k])
166
- return prod
167
- integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0]
168
- return integrated_coeff
169
-
170
- for step_index in range(self.num_inference_steps):
171
- order = min(step_index + 1, order)
172
- self.lms_coeffs.append([get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)])
173
-
174
- def step(self, output, latents, idx, timestep):
175
- # compute the previous noisy sample x_t -> x_t-1
176
- # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
177
- sigma = self.sigmas[idx]
178
- pred_original_sample = latents - sigma * output
179
- # 2. Convert to an ODE derivative
180
- derivative = (latents - pred_original_sample) / sigma
181
- self.derivatives.append(derivative)
182
- if len(self.derivatives) > self.order:
183
- self.derivatives.pop(0)
184
- # 3. Compute previous sample based on the derivatives path
185
- prev_sample = latents + sum(
186
- coeff * derivative for coeff, derivative in zip(self.lms_coeffs[idx], reversed(self.derivatives))
187
- )
188
-
189
- return prev_sample
190
-
191
-
192
- class DPMScheduler():
193
- def __init__(
194
- self,
195
- beta_start=0.00085,
196
- beta_end=0.012,
197
- num_train_timesteps=1000,
198
- solver_order=2,
199
- predict_epsilon=True,
200
- thresholding=False,
201
- dynamic_thresholding_ratio=0.995,
202
- sample_max_value=1.0,
203
- algorithm_type="dpmsolver++",
204
- solver_type="midpoint",
205
- lower_order_final=True,
206
- device='cuda',
207
- ):
208
- # this schedule is very specific to the latent diffusion model.
209
- self.betas = (
210
- torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
211
- )
212
-
213
- self.device = device
214
- self.alphas = 1.0 - self.betas
215
- self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
216
- # Currently we only support VP-type noise schedule
217
- self.alpha_t = torch.sqrt(self.alphas_cumprod)
218
- self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
219
- self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
220
-
221
- # standard deviation of the initial noise distribution
222
- self.init_noise_sigma = 1.0
223
-
224
- self.algorithm_type = algorithm_type
225
- self.predict_epsilon = predict_epsilon
226
- self.thresholding = thresholding
227
- self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
228
- self.sample_max_value = sample_max_value
229
- self.lower_order_final = lower_order_final
230
-
231
- # settings for DPM-Solver
232
- if algorithm_type not in ["dpmsolver", "dpmsolver++"]:
233
- raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
234
- if solver_type not in ["midpoint", "heun"]:
235
- raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
236
-
237
- # setable values
238
- self.num_inference_steps = None
239
- self.solver_order = solver_order
240
- self.num_train_timesteps = num_train_timesteps
241
- self.solver_type = solver_type
242
-
243
- self.first_order_first_coef = []
244
- self.first_order_second_coef = []
245
-
246
- self.second_order_first_coef = []
247
- self.second_order_second_coef = []
248
- self.second_order_third_coef = []
249
-
250
- self.third_order_first_coef = []
251
- self.third_order_second_coef = []
252
- self.third_order_third_coef = []
253
- self.third_order_fourth_coef = []
254
-
255
- def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
256
- return sample
257
-
258
- def configure(self):
259
- lower_order_nums = 0
260
- for step_index in range(self.num_inference_steps):
261
- step_idx = step_index
262
- timestep = self.timesteps[step_idx]
263
-
264
- prev_timestep = 0 if step_idx == len(self.timesteps) - 1 else self.timesteps[step_idx + 1]
265
-
266
- self.dpm_solver_first_order_coefs_precompute(timestep, prev_timestep)
267
-
268
- timestep_list = [self.timesteps[step_index - 1], timestep]
269
- self.multistep_dpm_solver_second_order_coefs_precompute(timestep_list, prev_timestep)
270
-
271
- timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
272
- self.multistep_dpm_solver_third_order_coefs_precompute(timestep_list, prev_timestep)
273
-
274
- if lower_order_nums < self.solver_order:
275
- lower_order_nums += 1
276
-
277
- def dpm_solver_first_order_coefs_precompute(self, timestep, prev_timestep):
278
- lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
279
- alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
280
- sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep]
281
- h = lambda_t - lambda_s
282
- if self.algorithm_type == "dpmsolver++":
283
- self.first_order_first_coef.append(sigma_t / sigma_s)
284
- self.first_order_second_coef.append(alpha_t * (torch.exp(-h) - 1.0))
285
- elif self.algorithm_type == "dpmsolver":
286
- self.first_order_first_coef.append(alpha_t / alpha_s)
287
- self.first_order_second_coef.append(sigma_t * (torch.exp(h) - 1.0))
288
-
289
- def multistep_dpm_solver_second_order_coefs_precompute(self, timestep_list, prev_timestep):
290
- t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
291
- lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
292
- alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
293
- sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
294
- h = lambda_t - lambda_s0
295
- if self.algorithm_type == "dpmsolver++":
296
- # See https://arxiv.org/abs/2211.01095 for detailed derivations
297
- if self.solver_type == "midpoint":
298
- self.second_order_first_coef.append(sigma_t / sigma_s0)
299
- self.second_order_second_coef.append((alpha_t * (torch.exp(-h) - 1.0)))
300
- self.second_order_third_coef.append(0.5 * (alpha_t * (torch.exp(-h) - 1.0)))
301
- elif self.solver_type == "heun":
302
- self.second_order_first_coef.append(sigma_t / sigma_s0)
303
- self.second_order_second_coef.append((alpha_t * (torch.exp(-h) - 1.0)))
304
- self.second_order_third_coef.append(alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0))
305
- elif self.algorithm_type == "dpmsolver":
306
- # See https://arxiv.org/abs/2206.00927 for detailed derivations
307
- if self.solver_type == "midpoint":
308
- self.second_order_first_coef.append(alpha_t / alpha_s0)
309
- self.second_order_second_coef.append((sigma_t * (torch.exp(h) - 1.0)))
310
- self.second_order_third_coef.append(0.5 * (sigma_t * (torch.exp(h) - 1.0)))
311
- elif self.solver_type == "heun":
312
- self.second_order_first_coef.append(alpha_t / alpha_s0)
313
- self.second_order_second_coef.append((sigma_t * (torch.exp(h) - 1.0)))
314
- self.second_order_third_coef.append((sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)))
315
-
316
- def multistep_dpm_solver_third_order_coefs_precompute(self, timestep_list, prev_timestep):
317
- t, s0 = prev_timestep, timestep_list[-1]
318
- lambda_t, lambda_s0 = (
319
- self.lambda_t[t],
320
- self.lambda_t[s0]
321
- )
322
- alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
323
- sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
324
- h = lambda_t - lambda_s0
325
- if self.algorithm_type == "dpmsolver++":
326
- self.third_order_first_coef.append(sigma_t / sigma_s0)
327
- self.third_order_second_coef.append(alpha_t * (torch.exp(-h) - 1.0))
328
- self.third_order_third_coef.append(alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0))
329
- self.third_order_fourth_coef.append(alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5))
330
- elif self.algorithm_type == "dpmsolver":
331
- self.third_order_first_coef.append(alpha_t / alpha_s0)
332
- self.third_order_second_coef.append(sigma_t * (torch.exp(h) - 1.0))
333
- self.third_order_third_coef.append(sigma_t * ((torch.exp(h) - 1.0) / h - 1.0))
334
- self.third_order_fourth_coef.append(sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5))
335
-
336
- def set_timesteps(self, num_inference_steps):
337
- self.num_inference_steps = num_inference_steps
338
- timesteps = (
339
- np.linspace(0, self.num_train_timesteps - 1, num_inference_steps + 1)
340
- .round()[::-1][:-1]
341
- .copy()
342
- .astype(np.int32)
343
- )
344
- self.timesteps = torch.from_numpy(timesteps).to(self.device)
345
- self.model_outputs = [
346
- None,
347
- ] * self.solver_order
348
- self.lower_order_nums = 0
349
-
350
- def convert_model_output(
351
- self, model_output, timestep, sample
352
- ):
353
- # DPM-Solver++ needs to solve an integral of the data prediction model.
354
- if self.algorithm_type == "dpmsolver++":
355
- if self.predict_epsilon:
356
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
357
- x0_pred = (sample - sigma_t * model_output) / alpha_t
358
- else:
359
- x0_pred = model_output
360
- if self.thresholding:
361
- # Dynamic thresholding in https://arxiv.org/abs/2205.11487
362
- dynamic_max_val = torch.quantile(
363
- torch.abs(x0_pred).reshape((x0_pred.shape[0], -1)), self.dynamic_thresholding_ratio, dim=1
364
- )
365
- dynamic_max_val = torch.maximum(
366
- dynamic_max_val,
367
- self.sample_max_value * torch.ones_like(dynamic_max_val).to(dynamic_max_val.device),
368
- )[(...,) + (None,) * (x0_pred.ndim - 1)]
369
- x0_pred = torch.clamp(x0_pred, -dynamic_max_val, dynamic_max_val) / dynamic_max_val
370
- return x0_pred
371
- # DPM-Solver needs to solve an integral of the noise prediction model.
372
- elif self.algorithm_type == "dpmsolver":
373
- if self.predict_epsilon:
374
- return model_output
375
- else:
376
- alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
377
- epsilon = (sample - alpha_t * model_output) / sigma_t
378
- return epsilon
379
-
380
- def dpm_solver_first_order_update(
381
- self,
382
- idx,
383
- model_output,
384
- sample
385
- ):
386
- first_coef = self.first_order_first_coef[idx]
387
- second_coef = self.first_order_second_coef[idx]
388
-
389
- if self.algorithm_type == "dpmsolver++":
390
- x_t = first_coef * sample - second_coef * model_output
391
- elif self.algorithm_type == "dpmsolver":
392
- x_t = first_coef * sample - second_coef * model_output
393
- return x_t
394
-
395
- def multistep_dpm_solver_second_order_update(
396
- self,
397
- idx,
398
- model_output_list,
399
- timestep_list,
400
- prev_timestep,
401
- sample
402
- ):
403
- t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
404
- m0, m1 = model_output_list[-1], model_output_list[-2]
405
- lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
406
- h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
407
- r0 = h_0 / h
408
- D0, D1 = m0, (1.0 / r0) * (m0 - m1)
409
-
410
- first_coef = self.second_order_first_coef[idx]
411
- second_coef = self.second_order_second_coef[idx]
412
- third_coef = self.second_order_third_coef[idx]
413
-
414
- if self.algorithm_type == "dpmsolver++":
415
- # See https://arxiv.org/abs/2211.01095 for detailed derivations
416
- if self.solver_type == "midpoint":
417
- x_t = (
418
- first_coef * sample
419
- - second_coef * D0
420
- - third_coef * D1
421
- )
422
- elif self.solver_type == "heun":
423
- x_t = (
424
- first_coef * sample
425
- - second_coef * D0
426
- + third_coef * D1
427
- )
428
- elif self.algorithm_type == "dpmsolver":
429
- # See https://arxiv.org/abs/2206.00927 for detailed derivations
430
- if self.solver_type == "midpoint":
431
- x_t = (
432
- first_coef * sample
433
- - second_coef * D0
434
- - third_coef * D1
435
- )
436
- elif self.solver_type == "heun":
437
- x_t = (
438
- first_coef * sample
439
- - second_coef * D0
440
- - third_coef * D1
441
- )
442
- return x_t
443
-
444
- def multistep_dpm_solver_third_order_update(
445
- self,
446
- idx,
447
- model_output_list,
448
- timestep_list,
449
- prev_timestep,
450
- sample
451
- ):
452
- t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
453
- m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
454
- lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
455
- self.lambda_t[t],
456
- self.lambda_t[s0],
457
- self.lambda_t[s1],
458
- self.lambda_t[s2],
459
- )
460
- h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
461
- r0, r1 = h_0 / h, h_1 / h
462
- D0 = m0
463
- D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
464
- D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
465
- D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
466
-
467
- first_coef = self.third_order_first_coef[idx]
468
- second_coef = self.third_order_second_coef[idx]
469
- third_coef = self.third_order_third_coef[idx]
470
- fourth_coef = self.third_order_fourth_coef[idx]
471
-
472
- if self.algorithm_type == "dpmsolver++":
473
- # See https://arxiv.org/abs/2206.00927 for detailed derivations
474
- x_t = (
475
- first_coef * sample
476
- - second_coef * D0
477
- + third_coef * D1
478
- - fourth_coef * D2
479
- )
480
- elif self.algorithm_type == "dpmsolver":
481
- # See https://arxiv.org/abs/2206.00927 for detailed derivations
482
- x_t = (
483
- first_coef * sample
484
- - second_coef * D0
485
- - third_coef * D1
486
- - fourth_coef * D2
487
- )
488
- return x_t
489
-
490
- def step(self, output, latents, step_index, timestep):
491
- if self.num_inference_steps is None:
492
- raise ValueError(
493
- "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
494
- )
495
-
496
- prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
497
- lower_order_final = (
498
- (step_index == len(self.timesteps) - 1) and self.lower_order_final and len(self.timesteps) < 15
499
- )
500
- lower_order_second = (
501
- (step_index == len(self.timesteps) - 2) and self.lower_order_final and len(self.timesteps) < 15
502
- )
503
-
504
- output = self.convert_model_output(output, timestep, latents)
505
- for i in range(self.solver_order - 1):
506
- self.model_outputs[i] = self.model_outputs[i + 1]
507
- self.model_outputs[-1] = output
508
-
509
- if self.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
510
- prev_sample = self.dpm_solver_first_order_update(step_index, output, latents)
511
- elif self.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
512
- timestep_list = [self.timesteps[step_index - 1], timestep]
513
- prev_sample = self.multistep_dpm_solver_second_order_update(
514
- step_index, self.model_outputs, timestep_list, prev_timestep, latents
515
- )
516
- else:
517
- timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
518
- prev_sample = self.multistep_dpm_solver_third_order_update(
519
- step_index, self.model_outputs, timestep_list, prev_timestep, latents
520
- )
521
-
522
- if self.lower_order_nums < self.solver_order:
523
- self.lower_order_nums += 1
524
-
525
- return prev_sample
526
-
527
-
528
- def save_image(images, image_path_dir, image_name_prefix):
529
- """
530
- Save the generated images to png files.
531
- """
532
- images = ((images + 1) * 255 / 2).clamp(0, 255).detach().permute(0, 2, 3, 1).round().type(torch.uint8).cpu().numpy()
533
- for i in range(images.shape[0]):
534
- image_path = os.path.join(image_path_dir, image_name_prefix+str(i+1)+'-'+str(random.randint(1000, 9999))+'.png')
535
- print(f"Saving image {i+1} / {images.shape[0]} to: {image_path}")
536
- Image.fromarray(images[i]).save(image_path)
537
-
538
-
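For reference, the first-order DPM-Solver++ update that the removed scheduler precomputes coefficients for can be written compactly. A minimal sketch in plain PyTorch, assuming alpha_t, sigma_t and lambda_t come from the scheduler's precomputed tables (the function name and argument layout are illustrative, not part of the original code):

import torch

def dpmpp_first_order_step(sample, x0_pred, alpha_t, sigma_t, sigma_s, lambda_t, lambda_s):
    # h is the log-SNR (lambda) gap between the target step and the current step
    h = lambda_t - lambda_s
    first_coef = sigma_t / sigma_s                  # matches first_order_first_coef above
    second_coef = alpha_t * (torch.exp(-h) - 1.0)   # matches first_order_second_coef above
    # x_t = (sigma_t / sigma_s) * x_s - alpha_t * (exp(-h) - 1) * x0_pred
    return first_coef * sample - second_coef * x0_pred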
lyrasd_model/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from . import lyrasd_img2img_pipeline, lyrasd_txt2img_pipeline, lyrasd_controlnet_txt2img_pipeline, lyrasd_controlnet_img2img_pipeline
2
+ from .lyrasd_txt2img_pipeline import LyraSdTxt2ImgPipeline
3
+ from .lyrasd_img2img_pipeline import LyraSDImg2ImgPipeline
4
+ from .lyrasd_controlnet_txt2img_pipeline import LyraSdControlnetTxt2ImgPipeline
5
+ from .lyrasd_controlnet_img2img_pipeline import LyraSdControlnetImg2ImgPipeline
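These re-exports make the four pipelines importable directly from the package root. A minimal sketch, assuming the txt2img constructor takes the same (model_path, lib_so_path) arguments as the ControlNet pipelines added below (both paths are placeholders):

from lyrasd_model import LyraSdTxt2ImgPipeline

# hypothetical local paths to a diffusers-layout model directory and the lyrasd .so library
pipe = LyraSdTxt2ImgPipeline("path/to/sd_model", "path/to/libth_lyrasd.so")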
lyrasd_model/lora_util.py ADDED
@@ -0,0 +1,54 @@
1
+ import os
2
+ import torch
3
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
4
+ import numpy as np
5
+
6
+ def add_text_lora_layer(clip_model, lora_model_path="Misaka.safetensors", alpha=1.0, lora_file_format="fp32", device="cuda:0"):
7
+ if lora_file_format == "fp32":
8
+ model_dtype = np.float32
9
+ elif lora_file_format == "fp16":
10
+ model_dtype = np.float16
11
+ else:
12
+ raise Exception(f"unsupported model dtype: {lora_file_format}")
13
+ all_files = os.scandir(lora_model_path)
14
+ unload_dict = []
15
+ # directly update weight in diffusers model
16
+ for file in all_files:
17
+
18
+ if 'text' in file.name:
19
+ layer_infos = file.name.split('.')[0].split('text_model_')[-1].split('_')
20
+ curr_layer = clip_model.text_model
21
+ else:
22
+ continue
23
+
24
+ # find the target layer
25
+ temp_name = layer_infos.pop(0)
26
+ while len(layer_infos) > -1:
27
+ try:
28
+ curr_layer = curr_layer.__getattr__(temp_name)
29
+ if len(layer_infos) > 0:
30
+ temp_name = layer_infos.pop(0)
31
+ # if temp_name == "self":
32
+ # temp_name += "_" + layer_infos.pop(0)
33
+ # elif temp_name != "mlp" and len(layer_infos) == 1:
34
+ # temp_name += "_" + layer_infos.pop(0)
35
+ elif len(layer_infos) == 0:
36
+ break
37
+ except Exception:
38
+ if len(temp_name) > 0:
39
+ temp_name += '_'+layer_infos.pop(0)
40
+ else:
41
+ temp_name = layer_infos.pop(0)
42
+ data = torch.from_numpy(np.fromfile(file.path, dtype=model_dtype)).to(clip_model.dtype).to(clip_model.device).reshape(curr_layer.weight.data.shape)
43
+ if len(curr_layer.weight.data.shape) == 4:
44
+ adding_weight = alpha * data.permute(0,3,1,2)
45
+ else:
46
+ adding_weight = alpha * data
47
+ curr_layer.weight.data += adding_weight
48
+
49
+ curr_layer_unload_data = {
50
+ "layer": curr_layer,
51
+ "added_weight": adding_weight
52
+ }
53
+ unload_dict.append(curr_layer_unload_data)
54
+ return unload_dict
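The returned unload_dict records every patched text-encoder layer together with the delta that was merged in, so the LoRA can be reverted later; this is exactly how the pipelines' unload_lora consumes it. A minimal sketch, assuming a diffusers-layout model directory and a directory of per-layer LoRA weight bins (paths are placeholders):

import torch
from transformers import CLIPTextModel
from lyrasd_model.lora_util import add_text_lora_layer

text_encoder = CLIPTextModel.from_pretrained("path/to/sd_model", subfolder="text_encoder").to(torch.float16).to("cuda:0")
unload_dict = add_text_lora_layer(text_encoder, "path/to/lora_bins/", alpha=0.8, lora_file_format="fp16")

# revert the merge by subtracting the recorded deltas
for entry in unload_dict:
    entry["layer"].weight.data -= entry["added_weight"]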
lyrasd_model/lyrasd_controlnet_img2img_pipeline.py ADDED
@@ -0,0 +1,637 @@
1
+ import torch
2
+ from typing import Any, Callable, Dict, List, Optional, Union
3
+ from diffusers.schedulers import KarrasDiffusionSchedulers
4
+ from diffusers.loaders import TextualInversionLoaderMixin
5
+ from diffusers.models import AutoencoderKL
6
+ from diffusers.utils import randn_tensor, logging, deprecate
7
+ from diffusers.schedulers import EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, DPMSolverMultistepScheduler
8
+ from diffusers.utils import PIL_INTERPOLATION
9
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
10
+ import os
11
+ import numpy as np
12
+ import warnings
13
+ from .lora_util import add_text_lora_layer
14
+ import gc
15
+
16
+ from PIL import Image
17
+ import PIL
18
+
19
+ import inspect
20
+
21
+ import time
22
+
23
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
24
+
25
+ def numpy_to_pil(images):
26
+ """
27
+ Convert a numpy image or a batch of images to a PIL image.
28
+ """
29
+ if images.ndim == 3:
30
+ images = images[None, ...]
31
+ images = (images * 255).round().astype("uint8")
32
+ if images.shape[-1] == 1:
33
+ # special case for grayscale (single channel) images
34
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
35
+ else:
36
+ pil_images = [Image.fromarray(image) for image in images]
37
+
38
+ return pil_images
39
+
40
+
41
+ def preprocess(image):
42
+ warnings.warn(
43
+ "The preprocess method is deprecated and will be removed in a future version. Please"
44
+ " use VaeImageProcessor.preprocess instead",
45
+ FutureWarning,
46
+ )
47
+ if isinstance(image, torch.Tensor):
48
+ return image
49
+ elif isinstance(image, PIL.Image.Image):
50
+ image = [image]
51
+
52
+ if isinstance(image[0], PIL.Image.Image):
53
+ w, h = image[0].size
54
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
55
+
56
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
57
+ image = np.concatenate(image, axis=0)
58
+ image = np.array(image).astype(np.float32) / 255.0
59
+ image = image.transpose(0, 3, 1, 2)
60
+ image = 2.0 * image - 1.0
61
+ image = torch.from_numpy(image)
62
+ elif isinstance(image[0], torch.Tensor):
63
+ image = torch.cat(image, dim=0)
64
+ return image
65
+
66
+ class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
67
+ def __init__(self, model_path, lib_so_path, model_dtype='fp32', device=torch.device("cuda"), dtype=torch.float16) -> None:
68
+ self.device = device
69
+ self.dtype = dtype
70
+
71
+ torch.classes.load_library(lib_so_path)
72
+
73
+ self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
74
+ self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
75
+ self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
76
+
77
+ self.unet_in_channels = 4
78
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
79
+ self.vae.enable_tiling()
80
+ self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
81
+ 3, # max num of controlnets
82
+ "fp16" # inference dtype (can only use fp16 for now)
83
+ )
84
+
85
+ unet_path = os.path.join(model_path, "unet_bins/")
86
+
87
+ self.reload_unet_model(unet_path, model_dtype)
88
+
89
+ self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
90
+
91
+ def load_controlnet_model(self, model_name, controlnet_path, model_dtype="fp32"):
92
+ if len(controlnet_path) > 0 and controlnet_path[-1] != "/":
93
+ controlnet_path = controlnet_path + "/"
94
+ self.unet.load_controlnet_model(model_name, controlnet_path, model_dtype)
95
+
96
+ def unload_controlnet_model(self, model_name):
97
+ self.unet.unload_controlnet_model(model_name, True)
98
+
99
+ def get_loaded_controlnet(self):
100
+ return self.unet.get_loaded_controlnet()
101
+
102
+ def reload_unet_model(self, unet_path, unet_file_format='fp32'):
103
+ if len(unet_path) > 0 and unet_path[-1] != "/":
104
+ unet_path = unet_path + "/"
105
+ return self.unet.reload_unet_model(unet_path, unet_file_format)
106
+
107
+ def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
108
+ if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
109
+ lora_model_path = lora_model_path + "/"
110
+ lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
111
+ self.loaded_lora[lora_name] = lora
112
+ self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
113
+
114
+ def unload_lora(self, lora_name, clean_cache=False):
115
+ for layer_data in self.loaded_lora[lora_name]:
116
+ layer = layer_data['layer']
117
+ added_weight = layer_data['added_weight']
118
+ layer.weight.data -= added_weight
119
+ self.unet.unload_lora(lora_name, clean_cache)
120
+ del self.loaded_lora[lora_name]
121
+ gc.collect()
122
+ torch.cuda.empty_cache()
123
+
124
+ def clean_lora_cache(self):
125
+ self.unet.clean_lora_cache()
126
+
127
+ def get_loaded_lora(self):
128
+ return self.unet.get_loaded_lora()
129
+
130
+ def _encode_prompt(
131
+ self,
132
+ prompt,
133
+ device,
134
+ num_images_per_prompt,
135
+ do_classifier_free_guidance,
136
+ negative_prompt=None,
137
+ prompt_embeds: Optional[torch.FloatTensor] = None,
138
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
139
+ ):
140
+ r"""
141
+ Encodes the prompt into text encoder hidden states.
142
+
143
+ Args:
144
+ prompt (`str` or `List[str]`, *optional*):
145
+ prompt to be encoded
146
+ device: (`torch.device`):
147
+ torch device
148
+ num_images_per_prompt (`int`):
149
+ number of images that should be generated per prompt
150
+ do_classifier_free_guidance (`bool`):
151
+ whether to use classifier free guidance or not
152
+ negative_prompt (`str` or `List[str]`, *optional*):
153
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
154
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
155
+ less than `1`).
156
+ prompt_embeds (`torch.FloatTensor`, *optional*):
157
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
158
+ provided, text embeddings will be generated from `prompt` input argument.
159
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
160
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
161
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
162
+ argument.
163
+ """
164
+ if prompt is not None and isinstance(prompt, str):
165
+ batch_size = 1
166
+ elif prompt is not None and isinstance(prompt, list):
167
+ batch_size = len(prompt)
168
+ else:
169
+ batch_size = prompt_embeds.shape[0]
170
+
171
+ if prompt_embeds is None:
172
+ # textual inversion: process multi-vector tokens if necessary
173
+ if isinstance(self, TextualInversionLoaderMixin):
174
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
175
+
176
+ text_inputs = self.tokenizer(
177
+ prompt,
178
+ padding="max_length",
179
+ max_length=self.tokenizer.model_max_length,
180
+ truncation=True,
181
+ return_tensors="pt",
182
+ )
183
+ text_input_ids = text_inputs.input_ids
184
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
185
+
186
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
187
+ text_input_ids, untruncated_ids
188
+ ):
189
+ removed_text = self.tokenizer.batch_decode(
190
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
191
+ )
192
+ logger.warning(
193
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
194
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
195
+ )
196
+
197
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
198
+ attention_mask = text_inputs.attention_mask.to(device)
199
+ else:
200
+ attention_mask = None
201
+
202
+ prompt_embeds = self.text_encoder(
203
+ text_input_ids.to(device),
204
+ attention_mask=attention_mask,
205
+ )
206
+ prompt_embeds = prompt_embeds[0]
207
+
208
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
209
+
210
+ bs_embed, seq_len, _ = prompt_embeds.shape
211
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
212
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
213
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
214
+
215
+ # get unconditional embeddings for classifier free guidance
216
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
217
+ uncond_tokens: List[str]
218
+ if negative_prompt is None:
219
+ uncond_tokens = [""] * batch_size
220
+ elif type(prompt) is not type(negative_prompt):
221
+ raise TypeError(
222
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
223
+ f" {type(prompt)}."
224
+ )
225
+ elif isinstance(negative_prompt, str):
226
+ uncond_tokens = [negative_prompt]
227
+ elif batch_size != len(negative_prompt):
228
+ raise ValueError(
229
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
230
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
231
+ " the batch size of `prompt`."
232
+ )
233
+ else:
234
+ uncond_tokens = negative_prompt
235
+
236
+ # textual inversion: process multi-vector tokens if necessary
237
+ if isinstance(self, TextualInversionLoaderMixin):
238
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
239
+
240
+ max_length = prompt_embeds.shape[1]
241
+ uncond_input = self.tokenizer(
242
+ uncond_tokens,
243
+ padding="max_length",
244
+ max_length=max_length,
245
+ truncation=True,
246
+ return_tensors="pt",
247
+ )
248
+
249
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
250
+ attention_mask = uncond_input.attention_mask.to(device)
251
+ else:
252
+ attention_mask = None
253
+
254
+ negative_prompt_embeds = self.text_encoder(
255
+ uncond_input.input_ids.to(device),
256
+ attention_mask=attention_mask,
257
+ )
258
+ negative_prompt_embeds = negative_prompt_embeds[0]
259
+
260
+ if do_classifier_free_guidance:
261
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
262
+ seq_len = negative_prompt_embeds.shape[1]
263
+
264
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
265
+
266
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
267
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
268
+
269
+ # For classifier free guidance, we need to do two forward passes.
270
+ # Here we concatenate the unconditional and text embeddings into a single batch
271
+ # to avoid doing two forward passes
272
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
273
+
274
+ return prompt_embeds
275
+
276
+
277
+ def decode_latents(self, latents):
278
+ latents = 1 / self.vae.config.scaling_factor * latents
279
+ image = self.vae.decode(latents).sample
280
+ image = (image / 2 + 0.5).clamp(0, 1)
281
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
282
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
283
+ return image
284
+
285
+ def check_inputs(
286
+ self,
287
+ prompt,
288
+ height,
289
+ width,
290
+ negative_prompt=None,
291
+ prompt_embeds=None,
292
+ negative_prompt_embeds=None,
293
+ ):
294
+ if height % 64 != 0 or width % 64 != 0: # this initial version only supports height and width that are multiples of 64
295
+ raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
296
+
297
+ if prompt is not None and prompt_embeds is not None:
298
+ raise ValueError(
299
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
300
+ " only forward one of the two."
301
+ )
302
+ elif prompt is None and prompt_embeds is None:
303
+ raise ValueError(
304
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
305
+ )
306
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
307
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
308
+
309
+ if negative_prompt is not None and negative_prompt_embeds is not None:
310
+ raise ValueError(
311
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
312
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
313
+ )
314
+
315
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
316
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
317
+ raise ValueError(
318
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
319
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
320
+ f" {negative_prompt_embeds.shape}."
321
+ )
322
+
323
+ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
324
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
325
+ raise ValueError(
326
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
327
+ )
328
+
329
+ image = image.to(device=device, dtype=dtype)
330
+
331
+ batch_size = batch_size * num_images_per_prompt
332
+
333
+ if image.shape[1] == 4:
334
+ init_latents = image
335
+
336
+ else:
337
+ if isinstance(generator, list) and len(generator) != batch_size:
338
+ raise ValueError(
339
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
340
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
341
+ )
342
+
343
+ elif isinstance(generator, list):
344
+ init_latents = [
345
+ self.vae.encode(image[i: i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
346
+ ]
347
+ init_latents = torch.cat(init_latents, dim=0)
348
+ else:
349
+ init_latents = self.vae.encode(image).latent_dist.sample(generator)
350
+
351
+ init_latents = self.vae.config.scaling_factor * init_latents
352
+
353
+ if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
354
+ # expand init_latents for batch_size
355
+ deprecation_message = (
356
+ f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
357
+ " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
358
+ " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
359
+ " your script to pass as many initial images as text prompts to suppress this warning."
360
+ )
361
+ deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
362
+ additional_image_per_prompt = batch_size // init_latents.shape[0]
363
+ init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
364
+ elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
365
+ raise ValueError(
366
+ f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
367
+ )
368
+ else:
369
+ init_latents = torch.cat([init_latents], dim=0)
370
+
371
+ shape = init_latents.shape
372
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
373
+
374
+ # get latents
375
+ init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
376
+ latents = init_latents
377
+
378
+ return latents
379
+
380
+ def prepare_image(
381
+ self,
382
+ image,
383
+ width,
384
+ height,
385
+ batch_size,
386
+ num_images_per_prompt,
387
+ device,
388
+ dtype,
389
+ do_classifier_free_guidance=False,
390
+ guess_mode=False,
391
+ ):
392
+ if not isinstance(image, torch.Tensor):
393
+ if isinstance(image, PIL.Image.Image):
394
+ image = [image]
395
+
396
+ if isinstance(image[0], PIL.Image.Image):
397
+ images = []
398
+
399
+ for image_ in image:
400
+ image_ = image_.convert("RGB")
401
+ image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
402
+ image_ = np.array(image_)
403
+ image_ = image_[None, :]
404
+ images.append(image_)
405
+
406
+ image = images
407
+
408
+ image = np.concatenate(image, axis=0)
409
+ image = np.array(image).astype(np.float32) / 255.0
410
+ image = torch.from_numpy(image)
411
+ elif isinstance(image[0], torch.Tensor):
412
+ image = torch.cat(image, dim=0)
413
+
414
+ image_batch_size = image.shape[0]
415
+
416
+ if image_batch_size == 1:
417
+ repeat_by = batch_size
418
+ else:
419
+ # image batch size is the same as prompt batch size
420
+ repeat_by = num_images_per_prompt
421
+
422
+ image = image.repeat_interleave(repeat_by, dim=0)
423
+
424
+ image = image.to(device=device, dtype=dtype)
425
+
426
+ if do_classifier_free_guidance and not guess_mode:
427
+ image = torch.cat([image] * 2)
428
+
429
+ return image
430
+
431
+ def prepare_extra_step_kwargs(self, generator, eta):
432
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
433
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
434
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
435
+ # and should be between [0, 1]
436
+
437
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
438
+ extra_step_kwargs = {}
439
+ if accepts_eta:
440
+ extra_step_kwargs["eta"] = eta
441
+
442
+ # check if the scheduler accepts generator
443
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
444
+ if accepts_generator:
445
+ extra_step_kwargs["generator"] = generator
446
+ return extra_step_kwargs
447
+
448
+ def get_timesteps(self, num_inference_steps, strength, device):
449
+ # get the original timestep using init_timestep
450
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
451
+
452
+ t_start = max(num_inference_steps - init_timestep, 0)
453
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
454
+
455
+ return timesteps, num_inference_steps - t_start
456
+
457
+
458
+ @torch.no_grad()
459
+ def __call__(
460
+ self,
461
+ prompt: Union[str, List[str]] = None,
462
+ image: Union[
463
+ torch.FloatTensor,
464
+ PIL.Image.Image,
465
+ np.ndarray,
466
+ List[torch.FloatTensor],
467
+ List[PIL.Image.Image],
468
+ List[np.ndarray],
469
+ ] = None,
470
+ strength: float = 0.8,
471
+ height: Optional[int] = None,
472
+ width: Optional[int] = None,
473
+ num_inference_steps: int = 50,
474
+ guidance_scale: float = 7.5,
475
+ negative_prompt: Optional[Union[str, List[str]]] = None,
476
+ num_images_per_prompt: Optional[int] = 1,
477
+ controlnet_images: Optional[List[PIL.Image.Image]] = None,
478
+ controlnet_scale: Optional[List[float]] = None,
479
+ controlnet_names: Optional[List[str]] = None,
480
+ guess_mode = False,
481
+ eta: float = 0.0,
482
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
483
+ latents: Optional[torch.FloatTensor] = None,
484
+ prompt_embeds: Optional[torch.FloatTensor] = None,
485
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
486
+ ):
487
+ r"""
488
+ Function invoked when calling the pipeline for generation.
489
+
490
+ Args:
491
+ prompt (`str` or `List[str]`, *optional*):
492
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
493
+ instead.
494
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
495
+ The height in pixels of the generated image.
496
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
497
+ The width in pixels of the generated image.
498
+ num_inference_steps (`int`, *optional*, defaults to 50):
499
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
500
+ expense of slower inference.
501
+ guidance_scale (`float`, *optional*, defaults to 7.5):
502
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
503
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
504
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
505
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
506
+ usually at the expense of lower image quality.
507
+ negative_prompt (`str` or `List[str]`, *optional*):
508
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
509
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
510
+ less than `1`).
511
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
512
+ The number of images to generate per prompt.
513
+ eta (`float`, *optional*, defaults to 0.0):
514
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
515
+ [`schedulers.DDIMScheduler`], will be ignored for others.
516
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
517
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
518
+ to make generation deterministic.
519
+ latents (`torch.FloatTensor`, *optional*):
520
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
521
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
522
+ tensor will be generated by sampling using the supplied random `generator`.
523
+ prompt_embeds (`torch.FloatTensor`, *optional*):
524
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
525
+ provided, text embeddings will be generated from `prompt` input argument.
526
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
527
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
528
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
529
+ argument.
530
+
531
+ """
532
+ # 1. Check inputs. Raise error if not correct
533
+ self.check_inputs(
534
+ prompt, height, width, negative_prompt, prompt_embeds, negative_prompt_embeds
535
+ )
536
+
537
+ # 2. Define call parameters
538
+ if prompt is not None and isinstance(prompt, str):
539
+ batch_size = 1
540
+ elif prompt is not None and isinstance(prompt, list):
541
+ batch_size = len(prompt)
542
+ else:
543
+ batch_size = prompt_embeds.shape[0]
544
+
545
+ device = self.device
546
+
547
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
548
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
549
+ # corresponds to doing no classifier free guidance.
550
+ do_classifier_free_guidance = guidance_scale > 1.0
551
+
552
+
553
+ # 3. Encode input prompt
554
+ start = time.perf_counter()
555
+ prompt_embeds = self._encode_prompt(
556
+ prompt,
557
+ device,
558
+ num_images_per_prompt,
559
+ do_classifier_free_guidance,
560
+ negative_prompt,
561
+ prompt_embeds=prompt_embeds,
562
+ negative_prompt_embeds=negative_prompt_embeds,
563
+ )
564
+ control_images = []
565
+
566
+ # 4 prepare controlnet images
567
+ for image_ in controlnet_images:
568
+ image_ = self.prepare_image(
569
+ image=image_,
570
+ width=width,
571
+ height=height,
572
+ batch_size=batch_size * num_images_per_prompt,
573
+ num_images_per_prompt=num_images_per_prompt,
574
+ device=device,
575
+ dtype=prompt_embeds.dtype,
576
+ do_classifier_free_guidance=do_classifier_free_guidance
577
+ )
578
+
579
+ control_images.append(image_)
580
+
581
+ control_scales = []
582
+
583
+ scales = [1.0, ] * 13
584
+ if guess_mode:
585
+ scales = torch.logspace(-1, 0, 13).tolist()
586
+
587
+ for scale in controlnet_scale:
588
+ scales_ = [d * scale for d in scales]
589
+ control_scales.append(scales_)
590
+
591
+ image = preprocess(image)
592
+
593
+ # 5. set timesteps
594
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
595
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
596
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
597
+
598
+ # 6. Prepare latent variables
599
+ latents = self.prepare_latents(
600
+ image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
601
+ )
602
+
603
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
604
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
605
+
606
+ # 8. Denoising loop
607
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
608
+
609
+ start_unet = time.perf_counter()
610
+ for i, t in enumerate(timesteps):
611
+ # expand the latents if we are doing classifier free guidance
612
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
613
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
614
+ latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
615
+
616
+ # controlnet_names, control_images and control_scales are forwarded to the fused UNet op; guess_mode toggles ControlNet guess mode
617
+ noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, controlnet_names, control_images, control_scales, guess_mode)
618
+
619
+ noise_pred = noise_pred.permute(0, 3, 1, 2)
620
+ # perform guidance
621
+
622
+ if do_classifier_free_guidance:
623
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
624
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
625
+
626
+ # compute the previous noisy sample x_t -> x_t-1
627
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
628
+
629
+ torch.cuda.synchronize()
630
+
631
+ start = time.perf_counter()
632
+ image = self.decode_latents(latents)
633
+ torch.cuda.synchronize()
634
+ image = numpy_to_pil(image)
635
+
636
+ return image
637
+ # return None
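A hedged end-to-end sketch of how this pipeline is driven, following the constructor, load_controlnet_model and __call__ signatures above (all paths, the ControlNet name and the input images are placeholders):

import torch
from PIL import Image
from lyrasd_model import LyraSdControlnetImg2ImgPipeline

# hypothetical local paths; model_dtype must match the on-disk format of the unet_bins / controlnet bins
pipe = LyraSdControlnetImg2ImgPipeline("path/to/sd_model", "path/to/libth_lyrasd.so", model_dtype="fp32")
pipe.load_controlnet_model("canny", "path/to/canny_controlnet_bins", "fp32")

init_image = Image.open("path/to/input.png").convert("RGB")
control_image = Image.open("path/to/canny.png")

images = pipe(
    prompt="a bird standing on a branch, best quality",
    image=init_image,
    strength=0.8,
    height=512, width=512,  # must be multiples of 64
    num_inference_steps=30,
    guidance_scale=7.5,
    controlnet_images=[control_image],
    controlnet_scale=[0.8],
    controlnet_names=["canny"],
    generator=torch.Generator("cuda").manual_seed(0),
)
images[0].save("out.png")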
lyrasd_model/lyrasd_controlnet_txt2img_pipeline.py ADDED
@@ -0,0 +1,547 @@
1
+ import torch
2
+ from typing import Any, Callable, Dict, List, Optional, Union
3
+ from diffusers.schedulers import KarrasDiffusionSchedulers
4
+ from diffusers.loaders import TextualInversionLoaderMixin
5
+ from diffusers.models import AutoencoderKL
6
+ from diffusers.utils import randn_tensor, logging
7
+ from diffusers.schedulers import EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, DPMSolverMultistepScheduler
8
+ from diffusers.utils import PIL_INTERPOLATION
9
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
10
+ import os
11
+ import numpy as np
12
+ from .lora_util import add_text_lora_layer
13
+ import gc
14
+ from PIL import Image
15
+ import PIL
16
+
17
+ import inspect
18
+
19
+ import time
20
+
21
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
22
+
23
+ def numpy_to_pil(images):
24
+ """
25
+ Convert a numpy image or a batch of images to a PIL image.
26
+ """
27
+ if images.ndim == 3:
28
+ images = images[None, ...]
29
+ images = (images * 255).round().astype("uint8")
30
+ if images.shape[-1] == 1:
31
+ # special case for grayscale (single channel) images
32
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
33
+ else:
34
+ pil_images = [Image.fromarray(image) for image in images]
35
+
36
+ return pil_images
37
+
38
+
39
+ class LyraSdControlnetTxt2ImgPipeline(TextualInversionLoaderMixin):
40
+ def __init__(self, model_path, lib_so_path, model_dtype='fp32', device=torch.device("cuda"), dtype=torch.float16) -> None:
41
+ self.device = device
42
+ self.dtype = dtype
+ self.loaded_lora = {}  # tracks LoRA weight deltas merged into the text encoder so unload_lora can revert them
43
+
44
+ torch.classes.load_library(lib_so_path)
45
+
46
+ self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
47
+ self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
48
+ self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
49
+ self.unet_in_channels = 4
50
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
51
+ self.vae.enable_tiling()
52
+ self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
53
+ 3, # max num of controlnets
54
+ "fp16" # inference dtype (can only use fp16 for now)
55
+ )
56
+
57
+ unet_path = os.path.join(model_path, "unet_bins/")
58
+ self.reload_unet_model(unet_path, model_dtype)
59
+
60
+ self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
61
+
62
+ def load_controlnet_model(self, model_name, controlnet_path, model_dtype="fp32"):
63
+ if len(controlnet_path) > 0 and controlnet_path[-1] != "/":
64
+ controlnet_path = controlnet_path + "/"
65
+ self.unet.load_controlnet_model(model_name, controlnet_path, model_dtype)
66
+
67
+ def unload_controlnet_model(self, model_name):
68
+ self.unet.unload_controlnet_model(model_name, True)
69
+
70
+ def get_loaded_controlnet(self):
71
+ return self.unet.get_loaded_controlnet()
72
+
73
+ def reload_unet_model(self, unet_path, unet_file_format='fp32'):
74
+ if len(unet_path) > 0 and unet_path[-1] != "/":
75
+ unet_path = unet_path + "/"
76
+ return self.unet.reload_unet_model(unet_path, unet_file_format)
77
+
78
+ def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
79
+ if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
80
+ lora_model_path = lora_model_path + "/"
81
+ lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
82
+ self.loaded_lora[lora_name] = lora
83
+ self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
84
+
85
+ def unload_lora(self, lora_name, clean_cache=False):
86
+ for layer_data in self.loaded_lora[lora_name]:
87
+ layer = layer_data['layer']
88
+ added_weight = layer_data['added_weight']
89
+ layer.weight.data -= added_weight
90
+ self.unet.unload_lora(lora_name, clean_cache)
91
+ del self.loaded_lora[lora_name]
92
+ gc.collect()
93
+ torch.cuda.empty_cache()
94
+
95
+ def clean_lora_cache(self):
96
+ self.unet.clean_lora_cache()
97
+
98
+ def get_loaded_lora(self):
99
+ return self.unet.get_loaded_lora()
100
+
101
+ def _encode_prompt(
102
+ self,
103
+ prompt,
104
+ device,
105
+ num_images_per_prompt,
106
+ do_classifier_free_guidance,
107
+ negative_prompt=None,
108
+ prompt_embeds: Optional[torch.FloatTensor] = None,
109
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
110
+ ):
111
+ r"""
112
+ Encodes the prompt into text encoder hidden states.
113
+
114
+ Args:
115
+ prompt (`str` or `List[str]`, *optional*):
116
+ prompt to be encoded
117
+ device: (`torch.device`):
118
+ torch device
119
+ num_images_per_prompt (`int`):
120
+ number of images that should be generated per prompt
121
+ do_classifier_free_guidance (`bool`):
122
+ whether to use classifier free guidance or not
123
+ negative_prompt (`str` or `List[str]`, *optional*):
124
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
125
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
126
+ less than `1`).
127
+ prompt_embeds (`torch.FloatTensor`, *optional*):
128
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
129
+ provided, text embeddings will be generated from `prompt` input argument.
130
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
131
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
132
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
133
+ argument.
134
+ """
135
+ if prompt is not None and isinstance(prompt, str):
136
+ batch_size = 1
137
+ elif prompt is not None and isinstance(prompt, list):
138
+ batch_size = len(prompt)
139
+ else:
140
+ batch_size = prompt_embeds.shape[0]
141
+
142
+ if prompt_embeds is None:
143
+ # textual inversion: process multi-vector tokens if necessary
144
+ if isinstance(self, TextualInversionLoaderMixin):
145
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
146
+
147
+ text_inputs = self.tokenizer(
148
+ prompt,
149
+ padding="max_length",
150
+ max_length=self.tokenizer.model_max_length,
151
+ truncation=True,
152
+ return_tensors="pt",
153
+ )
154
+ text_input_ids = text_inputs.input_ids
155
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
156
+
157
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
158
+ text_input_ids, untruncated_ids
159
+ ):
160
+ removed_text = self.tokenizer.batch_decode(
161
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
162
+ )
163
+ logger.warning(
164
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
165
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
166
+ )
167
+
168
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
169
+ attention_mask = text_inputs.attention_mask.to(device)
170
+ else:
171
+ attention_mask = None
172
+
173
+ prompt_embeds = self.text_encoder(
174
+ text_input_ids.to(device),
175
+ attention_mask=attention_mask,
176
+ )
177
+ prompt_embeds = prompt_embeds[0]
178
+
179
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
180
+
181
+ bs_embed, seq_len, _ = prompt_embeds.shape
182
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
183
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
184
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
185
+
186
+ # get unconditional embeddings for classifier free guidance
187
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
188
+ uncond_tokens: List[str]
189
+ if negative_prompt is None:
190
+ uncond_tokens = [""] * batch_size
191
+ elif type(prompt) is not type(negative_prompt):
192
+ raise TypeError(
193
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
194
+ f" {type(prompt)}."
195
+ )
196
+ elif isinstance(negative_prompt, str):
197
+ uncond_tokens = [negative_prompt]
198
+ elif batch_size != len(negative_prompt):
199
+ raise ValueError(
200
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
201
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
202
+ " the batch size of `prompt`."
203
+ )
204
+ else:
205
+ uncond_tokens = negative_prompt
206
+
207
+ # textual inversion: process multi-vector tokens if necessary
208
+ if isinstance(self, TextualInversionLoaderMixin):
209
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
210
+
211
+ max_length = prompt_embeds.shape[1]
212
+ uncond_input = self.tokenizer(
213
+ uncond_tokens,
214
+ padding="max_length",
215
+ max_length=max_length,
216
+ truncation=True,
217
+ return_tensors="pt",
218
+ )
219
+
220
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
221
+ attention_mask = uncond_input.attention_mask.to(device)
222
+ else:
223
+ attention_mask = None
224
+
225
+ negative_prompt_embeds = self.text_encoder(
226
+ uncond_input.input_ids.to(device),
227
+ attention_mask=attention_mask,
228
+ )
229
+ negative_prompt_embeds = negative_prompt_embeds[0]
230
+
231
+ if do_classifier_free_guidance:
232
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
233
+ seq_len = negative_prompt_embeds.shape[1]
234
+
235
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
236
+
237
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
238
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
239
+
240
+ # For classifier free guidance, we need to do two forward passes.
241
+ # Here we concatenate the unconditional and text embeddings into a single batch
242
+ # to avoid doing two forward passes
243
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
244
+
245
+ return prompt_embeds
246
+
247
+
248
+ def decode_latents(self, latents):
249
+ latents = 1 / self.vae.config.scaling_factor * latents
250
+ image = self.vae.decode(latents).sample
251
+ image = (image / 2 + 0.5).clamp(0, 1)
252
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
253
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
254
+ return image
255
+
256
+ def check_inputs(
257
+ self,
258
+ prompt,
259
+ height,
260
+ width,
261
+ negative_prompt=None,
262
+ prompt_embeds=None,
263
+ negative_prompt_embeds=None,
264
+ ):
265
+ if height % 64 != 0 or width % 64 != 0: # this initial version only supports height and width that are multiples of 64
266
+ raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
267
+
268
+ if prompt is not None and prompt_embeds is not None:
269
+ raise ValueError(
270
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
271
+ " only forward one of the two."
272
+ )
273
+ elif prompt is None and prompt_embeds is None:
274
+ raise ValueError(
275
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
276
+ )
277
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
278
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
279
+
280
+ if negative_prompt is not None and negative_prompt_embeds is not None:
281
+ raise ValueError(
282
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
283
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
284
+ )
285
+
286
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
287
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
288
+ raise ValueError(
289
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
290
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
291
+ f" {negative_prompt_embeds.shape}."
292
+ )
293
+
294
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
295
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
296
+ if isinstance(generator, list) and len(generator) != batch_size:
297
+ raise ValueError(
298
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
299
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
300
+ )
301
+
302
+ if latents is None:
303
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
304
+ else:
305
+ latents = latents.to(device)
306
+
307
+ # scale the initial noise by the standard deviation required by the scheduler
308
+ latents = latents * self.scheduler.init_noise_sigma
309
+ return latents
310
+
311
+ def prepare_image(
312
+ self,
313
+ image,
314
+ width,
315
+ height,
316
+ batch_size,
317
+ num_images_per_prompt,
318
+ device,
319
+ dtype,
320
+ do_classifier_free_guidance=False,
321
+ guess_mode=False,
322
+ ):
323
+ if not isinstance(image, torch.Tensor):
324
+ if isinstance(image, PIL.Image.Image):
325
+ image = [image]
326
+
327
+ if isinstance(image[0], PIL.Image.Image):
328
+ images = []
329
+
330
+ for image_ in image:
331
+ image_ = image_.convert("RGB")
332
+ image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
333
+ image_ = np.array(image_)
334
+ image_ = image_[None, :]
335
+ images.append(image_)
336
+
337
+ image = images
338
+
339
+ image = np.concatenate(image, axis=0)
340
+ image = np.array(image).astype(np.float32) / 255.0
341
+ image = torch.from_numpy(image)
342
+ elif isinstance(image[0], torch.Tensor):
343
+ image = torch.cat(image, dim=0)
344
+
345
+ image_batch_size = image.shape[0]
346
+
347
+ if image_batch_size == 1:
348
+ repeat_by = batch_size
349
+ else:
350
+ # image batch size is the same as prompt batch size
351
+ repeat_by = num_images_per_prompt
352
+
353
+ image = image.repeat_interleave(repeat_by, dim=0)
354
+
355
+ image = image.to(device=device, dtype=dtype)
356
+
357
+ if do_classifier_free_guidance and not guess_mode:
358
+ image = torch.cat([image] * 2)
359
+
360
+ return image
361
+
362
+ def prepare_extra_step_kwargs(self, generator, eta):
363
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
364
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
365
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
366
+ # and should be between [0, 1]
367
+
368
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
369
+ extra_step_kwargs = {}
370
+ if accepts_eta:
371
+ extra_step_kwargs["eta"] = eta
372
+
373
+ # check if the scheduler accepts generator
374
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
375
+ if accepts_generator:
376
+ extra_step_kwargs["generator"] = generator
377
+ return extra_step_kwargs
378
+
379
+ @torch.no_grad()
380
+ def __call__(
381
+ self,
382
+ prompt: Union[str, List[str]] = None,
383
+ height: Optional[int] = None,
384
+ width: Optional[int] = None,
385
+ num_inference_steps: int = 50,
386
+ guidance_scale: float = 7.5,
387
+ negative_prompt: Optional[Union[str, List[str]]] = None,
388
+ num_images_per_prompt: Optional[int] = 1,
389
+ controlnet_images: Optional[List[PIL.Image.Image]] = None,
390
+ controlnet_scale: Optional[List[float]] = None,
391
+ controlnet_names: Optional[List[str]] = None,
392
+ guess_mode = False,
393
+ eta: float = 0.0,
394
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
395
+ latents: Optional[torch.FloatTensor] = None,
396
+ prompt_embeds: Optional[torch.FloatTensor] = None,
397
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
398
+ ):
399
+ r"""
400
+ Function invoked when calling the pipeline for generation.
401
+
402
+ Args:
403
+ prompt (`str` or `List[str]`, *optional*):
404
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
405
+ instead.
406
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
407
+ The height in pixels of the generated image.
408
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
409
+ The width in pixels of the generated image.
410
+ num_inference_steps (`int`, *optional*, defaults to 50):
411
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
412
+ expense of slower inference.
413
+ guidance_scale (`float`, *optional*, defaults to 7.5):
414
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
415
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
416
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
417
+ 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
418
+ usually at the expense of lower image quality.
419
+ negative_prompt (`str` or `List[str]`, *optional*):
420
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
421
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
422
+ less than `1`).
423
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
424
+ The number of images to generate per prompt.
425
+ eta (`float`, *optional*, defaults to 0.0):
426
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
427
+ [`schedulers.DDIMScheduler`], will be ignored for others.
428
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
429
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
430
+ to make generation deterministic.
431
+ latents (`torch.FloatTensor`, *optional*):
432
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
433
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
434
+ tensor will be generated by sampling using the supplied random `generator`.
435
+ prompt_embeds (`torch.FloatTensor`, *optional*):
436
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
437
+ provided, text embeddings will be generated from `prompt` input argument.
438
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
439
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
440
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
441
+ argument.
442
+
443
+ """
444
+ # 1. Check inputs. Raise error if not correct
445
+ self.check_inputs(
446
+ prompt, height, width, negative_prompt, prompt_embeds, negative_prompt_embeds
447
+ )
448
+
449
+ # 2. Define call parameters
450
+ if prompt is not None and isinstance(prompt, str):
451
+ batch_size = 1
452
+ elif prompt is not None and isinstance(prompt, list):
453
+ batch_size = len(prompt)
454
+ else:
455
+ batch_size = prompt_embeds.shape[0]
456
+
457
+ device = self.device
458
+
459
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
460
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
461
+ # corresponds to doing no classifier free guidance.
462
+ do_classifier_free_guidance = guidance_scale > 1.0
463
+
464
+ # 3. Encode input prompt
465
+ prompt_embeds = self._encode_prompt(
466
+ prompt,
467
+ device,
468
+ num_images_per_prompt,
469
+ do_classifier_free_guidance,
470
+ negative_prompt,
471
+ prompt_embeds=prompt_embeds,
472
+ negative_prompt_embeds=negative_prompt_embeds,
473
+ )
474
+ control_images = []
475
+
476
+ for image_ in controlnet_images:
477
+ image_ = self.prepare_image(
478
+ image=image_,
479
+ width=width,
480
+ height=height,
481
+ batch_size=batch_size * num_images_per_prompt,
482
+ num_images_per_prompt=num_images_per_prompt,
483
+ device=device,
484
+ dtype=prompt_embeds.dtype,
485
+ do_classifier_free_guidance=do_classifier_free_guidance
486
+ )
487
+
488
+ control_images.append(image_)
489
+
490
+ control_scales = []
491
+
492
+ scales = [1.0, ] * 13
493
+ if guess_mode:
494
+ scales = torch.logspace(-1, 0, 13).tolist()
495
+
496
+ for scale in controlnet_scale:
497
+ scales_ = [d * scale for d in scales]
498
+ control_scales.append(scales_)
499
+
500
+ # 4. Prepare timesteps
501
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
502
+ timesteps = self.scheduler.timesteps
503
+
504
+ # 5. Prepare latent variables
505
+ num_channels_latents = self.unet_in_channels
506
+ latents = self.prepare_latents(
507
+ batch_size * num_images_per_prompt,
508
+ num_channels_latents,
509
+ height,
510
+ width,
511
+ prompt_embeds.dtype,
512
+ device,
513
+ generator,
514
+ latents,
515
+ )
516
+
517
+ # # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
518
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
519
+
520
+ # 7. Denoising loop
521
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
522
+
523
+ start_unet = time.perf_counter()
524
+ for i, t in enumerate(timesteps):
525
+ # expand the latents if we are doing classifier free guidance
526
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
527
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
528
+ latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
529
+
530
+ # pass the ControlNet conditioning (names, images, scales, guess mode) through to the fused UNet forward
531
+ noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, controlnet_names, control_images, control_scales, guess_mode)
532
+
533
+ noise_pred = noise_pred.permute(0, 3, 1, 2)
534
+ # perform guidance
535
+
536
+ if do_classifier_free_guidance:
537
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
538
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
539
+
540
+ # compute the previous noisy sample x_t -> x_t-1
541
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
542
+
543
+ image = self.decode_latents(latents)
544
+ image = numpy_to_pil(image)
545
+
546
+ return image
547
+ # return None
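For orientation, here is a minimal invocation sketch for the ControlNet text-to-image pipeline above. The import name `LyraSdControlnetTxt2ImgPipeline`, the constructor arguments (assumed to mirror the txt2img/img2img pipelines below), the ControlNet loading step, and all paths are assumptions; the keyword arguments follow the `__call__` signature shown above, which returns a list of `PIL.Image.Image`.

```python
import torch
from PIL import Image

# Assumed export name and constructor; both sit outside this excerpt.
from lyrasd_model import LyraSdControlnetTxt2ImgPipeline

pipe = LyraSdControlnetTxt2ImgPipeline(
    "./models/lyrasd_rev_animated",                         # placeholder model directory
    "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so",  # pick the .so matching your CUDA version and GPU arch
)
# Any ControlNet referenced by `controlnet_names` must have been loaded through the
# pipeline's ControlNet loading API beforehand (not shown in this excerpt).

canny_image = Image.open("./condition_canny.png").convert("RGB")  # placeholder conditioning image

images = pipe(
    prompt="a bird standing on a branch, best quality",
    height=512,
    width=512,
    num_inference_steps=30,
    guidance_scale=7.5,
    negative_prompt="low quality, blurry",
    controlnet_images=[canny_image],
    controlnet_scale=[1.0],
    controlnet_names=["canny"],
    generator=torch.Generator("cuda").manual_seed(0),
)
images[0].save("res_controlnet_txt2img_0.png")
```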
lyrasd_model/lyrasd_img2img_pipeline.py ADDED
@@ -0,0 +1,554 @@
1
+ import inspect
2
+ import logging
3
+ import os
4
+ import warnings
5
+ from typing import Callable, List, Optional, Union
6
+
7
+ import numpy as np
8
+ import PIL
9
+ import torch
10
+ from diffusers.loaders import TextualInversionLoaderMixin
11
+ from diffusers.models import AutoencoderKL
12
+ from diffusers.schedulers import EulerAncestralDiscreteScheduler
13
+ from diffusers.utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor
14
+ from PIL import Image
15
+ from transformers import CLIPTextModel, CLIPTokenizer
16
+ from .lora_util import add_text_lora_layer
17
+ import gc
18
+
19
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
20
+
21
+
22
+ def numpy_to_pil(images):
23
+ """
24
+ Convert a numpy image or a batch of images to a PIL image.
25
+ """
26
+ if images.ndim == 3:
27
+ images = images[None, ...]
28
+ images = (images * 255).round().astype("uint8")
29
+ if images.shape[-1] == 1:
30
+ # special case for grayscale (single channel) images
31
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
32
+ else:
33
+ pil_images = [Image.fromarray(image) for image in images]
34
+
35
+ return pil_images
36
+
37
+
38
+ def preprocess(image):
39
+ warnings.warn(
40
+ "The preprocess method is deprecated and will be removed in a future version. Please"
41
+ " use VaeImageProcessor.preprocess instead",
42
+ FutureWarning,
43
+ )
44
+ if isinstance(image, torch.Tensor):
45
+ return image
46
+ elif isinstance(image, PIL.Image.Image):
47
+ image = [image]
48
+
49
+ if isinstance(image[0], PIL.Image.Image):
50
+ w, h = image[0].size
51
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
52
+
53
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
54
+ image = np.concatenate(image, axis=0)
55
+ image = np.array(image).astype(np.float32) / 255.0
56
+ image = image.transpose(0, 3, 1, 2)
57
+ image = 2.0 * image - 1.0
58
+ image = torch.from_numpy(image)
59
+ elif isinstance(image[0], torch.Tensor):
60
+ image = torch.cat(image, dim=0)
61
+ return image
62
+
63
+
64
+ class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
65
+ def __init__(self, model_path, lib_so_path, model_dtype='fp32', device=torch.device("cuda"), dtype=torch.float16) -> None:
66
+ self.device = device
67
+ self.dtype = dtype
68
+
69
+ torch.classes.load_library(lib_so_path)
70
+
71
+ self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
72
+ self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
73
+ self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
74
+ unet_path = os.path.join(model_path, "unet_bins/")
75
+
76
+ self.unet_in_channels = 4
77
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
78
+ self.vae.enable_tiling()
79
+ self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
80
+ 3, # max num of controlnets
81
+ "fp16" # inference dtype (can only use fp16 for now)
82
+ )
83
+
84
+ self.reload_unet_model(unet_path, model_dtype)
85
+
86
+ self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
+
+ self.loaded_lora = {}  # LoRA layers merged into the text encoder, keyed by lora_name (used by load_lora/unload_lora)
87
+
88
+ def reload_unet_model(self, unet_path, unet_file_format='fp32'):
89
+ if len(unet_path) > 0 and unet_path[-1] != "/":
90
+ unet_path = unet_path + "/"
91
+ return self.unet.reload_unet_model(unet_path, unet_file_format)
92
+
93
+ def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
94
+ if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
95
+ lora_model_path = lora_model_path + "/"
96
+ lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
97
+ self.loaded_lora[lora_name] = lora
98
+ self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
99
+
100
+ def unload_lora(self, lora_name, clean_cache=False):
101
+ for layer_data in self.loaded_lora[lora_name]:
102
+ layer = layer_data['layer']
103
+ added_weight = layer_data['added_weight']
104
+ layer.weight.data -= added_weight
105
+ self.unet.unload_lora(lora_name, clean_cache)
106
+ del self.loaded_lora[lora_name]
107
+ gc.collect()
108
+ torch.cuda.empty_cache()
109
+
110
+ def clean_lora_cache(self):
111
+ self.unet.clean_lora_cache()
112
+
113
+ def get_loaded_lora(self):
114
+ return self.unet.get_loaded_lora()
115
+
116
+
117
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
118
+ def _encode_prompt(
119
+ self,
120
+ prompt,
121
+ device,
122
+ num_images_per_prompt,
123
+ do_classifier_free_guidance,
124
+ negative_prompt=None,
125
+ prompt_embeds: Optional[torch.FloatTensor] = None,
126
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
127
+ ):
128
+ r"""
129
+ Encodes the prompt into text encoder hidden states.
130
+
131
+ Args:
132
+ prompt (`str` or `List[str]`, *optional*):
133
+ prompt to be encoded
134
+ device: (`torch.device`):
135
+ torch device
136
+ num_images_per_prompt (`int`):
137
+ number of images that should be generated per prompt
138
+ do_classifier_free_guidance (`bool`):
139
+ whether to use classifier free guidance or not
140
+ negative_prompt (`str` or `List[str]`, *optional*):
141
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
142
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
143
+ less than `1`).
144
+ prompt_embeds (`torch.FloatTensor`, *optional*):
145
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
146
+ provided, text embeddings will be generated from `prompt` input argument.
147
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
148
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
149
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
150
+ argument.
151
+ """
152
+
153
+ if prompt is not None and isinstance(prompt, str):
154
+ batch_size = 1
155
+ elif prompt is not None and isinstance(prompt, list):
156
+ batch_size = len(prompt)
157
+ else:
158
+ batch_size = prompt_embeds.shape[0]
159
+
160
+ if prompt_embeds is None:
161
+ # textual inversion: process multi-vector tokens if necessary
162
+ if isinstance(self, TextualInversionLoaderMixin):
163
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
164
+
165
+ text_inputs = self.tokenizer(
166
+ prompt,
167
+ padding="max_length",
168
+ max_length=self.tokenizer.model_max_length,
169
+ truncation=True,
170
+ return_tensors="pt",
171
+ )
172
+ text_input_ids = text_inputs.input_ids
173
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
174
+
175
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
176
+ text_input_ids, untruncated_ids
177
+ ):
178
+ removed_text = self.tokenizer.batch_decode(
179
+ untruncated_ids[:, self.tokenizer.model_max_length - 1: -1]
180
+ )
181
+ logger.warning(
182
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
183
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
184
+ )
185
+
186
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
187
+ attention_mask = text_inputs.attention_mask.to(device)
188
+ else:
189
+ attention_mask = None
190
+
191
+ prompt_embeds = self.text_encoder(
192
+ text_input_ids.to(device),
193
+ attention_mask=attention_mask,
194
+ )
195
+ prompt_embeds = prompt_embeds[0]
196
+
197
+ if self.text_encoder is not None:
198
+ prompt_embeds_dtype = self.text_encoder.dtype
199
+ elif self.unet is not None:
200
+ prompt_embeds_dtype = self.unet.dtype
201
+ else:
202
+ prompt_embeds_dtype = prompt_embeds.dtype
203
+
204
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
205
+
206
+ bs_embed, seq_len, _ = prompt_embeds.shape
207
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
208
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
209
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
210
+
211
+ # get unconditional embeddings for classifier free guidance
212
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
213
+ uncond_tokens: List[str]
214
+ if negative_prompt is None:
215
+ uncond_tokens = [""] * batch_size
216
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
217
+ raise TypeError(
218
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
219
+ f" {type(prompt)}."
220
+ )
221
+ elif isinstance(negative_prompt, str):
222
+ uncond_tokens = [negative_prompt]
223
+ elif batch_size != len(negative_prompt):
224
+ raise ValueError(
225
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
226
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
227
+ " the batch size of `prompt`."
228
+ )
229
+ else:
230
+ uncond_tokens = negative_prompt
231
+
232
+ # textual inversion: process multi-vector tokens if necessary
233
+ if isinstance(self, TextualInversionLoaderMixin):
234
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
235
+
236
+ max_length = prompt_embeds.shape[1]
237
+ uncond_input = self.tokenizer(
238
+ uncond_tokens,
239
+ padding="max_length",
240
+ max_length=max_length,
241
+ truncation=True,
242
+ return_tensors="pt",
243
+ )
244
+
245
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
246
+ attention_mask = uncond_input.attention_mask.to(device)
247
+ else:
248
+ attention_mask = None
249
+
250
+ negative_prompt_embeds = self.text_encoder(
251
+ uncond_input.input_ids.to(device),
252
+ attention_mask=attention_mask,
253
+ )
254
+ negative_prompt_embeds = negative_prompt_embeds[0]
255
+
256
+ if do_classifier_free_guidance:
257
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
258
+ seq_len = negative_prompt_embeds.shape[1]
259
+
260
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
261
+
262
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
263
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
264
+
265
+ # For classifier free guidance, we need to do two forward passes.
266
+ # Here we concatenate the unconditional and text embeddings into a single batch
267
+ # to avoid doing two forward passes
268
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
269
+
270
+ return prompt_embeds
271
+
272
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
273
+
274
+ def decode_latents(self, latents):
275
+ latents = 1 / self.vae.config.scaling_factor * latents
276
+ image = self.vae.decode(latents).sample
277
+ image = (image / 2 + 0.5).clamp(0, 1)
278
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
279
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
280
+ return image
281
+
282
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
283
+ def prepare_extra_step_kwargs(self, generator, eta):
284
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
285
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
286
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
287
+ # and should be between [0, 1]
288
+
289
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
290
+ extra_step_kwargs = {}
291
+ if accepts_eta:
292
+ extra_step_kwargs["eta"] = eta
293
+
294
+ # check if the scheduler accepts generator
295
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
296
+ if accepts_generator:
297
+ extra_step_kwargs["generator"] = generator
298
+ return extra_step_kwargs
299
+
300
+ def check_inputs(
301
+ self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
302
+ ):
303
+ if strength < 0 or strength > 1:
304
+ raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
305
+
306
+ if (callback_steps is None) or (
307
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
308
+ ):
309
+ raise ValueError(
310
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
311
+ f" {type(callback_steps)}."
312
+ )
313
+
314
+ if prompt is not None and prompt_embeds is not None:
315
+ raise ValueError(
316
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
317
+ " only forward one of the two."
318
+ )
319
+ elif prompt is None and prompt_embeds is None:
320
+ raise ValueError(
321
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
322
+ )
323
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
324
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
325
+
326
+ if negative_prompt is not None and negative_prompt_embeds is not None:
327
+ raise ValueError(
328
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
329
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
330
+ )
331
+
332
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
333
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
334
+ raise ValueError(
335
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
336
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
337
+ f" {negative_prompt_embeds.shape}."
338
+ )
339
+
340
+ def get_timesteps(self, num_inference_steps, strength, device):
341
+ # get the original timestep using init_timestep
342
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
343
+
344
+ t_start = max(num_inference_steps - init_timestep, 0)
345
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
346
+
347
+ return timesteps, num_inference_steps - t_start
348
+
349
+ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
350
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
351
+ raise ValueError(
352
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
353
+ )
354
+
355
+ image = image.to(device=device, dtype=dtype)
356
+
357
+ batch_size = batch_size * num_images_per_prompt
358
+
359
+ if image.shape[1] == 4:
360
+ init_latents = image
361
+
362
+ else:
363
+ if isinstance(generator, list) and len(generator) != batch_size:
364
+ raise ValueError(
365
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
366
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
367
+ )
368
+
369
+ elif isinstance(generator, list):
370
+ init_latents = [
371
+ self.vae.encode(image[i: i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
372
+ ]
373
+ init_latents = torch.cat(init_latents, dim=0)
374
+ else:
375
+ init_latents = self.vae.encode(image).latent_dist.sample(generator)
376
+
377
+ init_latents = self.vae.config.scaling_factor * init_latents
378
+
379
+ if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
380
+ # expand init_latents for batch_size
381
+ deprecation_message = (
382
+ f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
383
+ " images (`image`). Initial images are now being duplicated to match the number of text prompts. Note"
384
+ " that this behavior is deprecated and will be removed in version 1.0.0. Please make sure to update"
385
+ " your script to pass as many initial images as text prompts to suppress this warning."
386
+ )
387
+ deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
388
+ additional_image_per_prompt = batch_size // init_latents.shape[0]
389
+ init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
390
+ elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
391
+ raise ValueError(
392
+ f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
393
+ )
394
+ else:
395
+ init_latents = torch.cat([init_latents], dim=0)
396
+
397
+ shape = init_latents.shape
398
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
399
+
400
+ # get latents
401
+ init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
402
+ latents = init_latents
403
+
404
+ return latents
405
+
406
+ @torch.no_grad()
407
+ def __call__(
408
+ self,
409
+ prompt: Union[str, List[str]] = None,
410
+ image: Union[
411
+ torch.FloatTensor,
412
+ PIL.Image.Image,
413
+ np.ndarray,
414
+ List[torch.FloatTensor],
415
+ List[PIL.Image.Image],
416
+ List[np.ndarray],
417
+ ] = None,
418
+ strength: float = 0.8,
419
+ num_inference_steps: Optional[int] = 50,
420
+ guidance_scale: Optional[float] = 7.5,
421
+ negative_prompt: Optional[Union[str, List[str]]] = None,
422
+ num_images_per_prompt: Optional[int] = 1,
423
+ eta: Optional[float] = 0.0,
424
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
425
+ prompt_embeds: Optional[torch.FloatTensor] = None,
426
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
427
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
428
+ callback_steps: int = 1,
429
+ ):
430
+ r"""
431
+ The call function to the pipeline for generation.
432
+
433
+ Args:
434
+ prompt (`str` or `List[str]`, *optional*):
435
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
436
+ image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
437
+ `Image` or tensor representing an image batch to be used as the starting point. Can also accept image
438
+ latents as `image`, but if passing latents directly it is not encoded again.
439
+ strength (`float`, *optional*, defaults to 0.8):
440
+ Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
441
+ starting point and more noise is added the higher the `strength`. The number of denoising steps depends
442
+ on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
443
+ process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
444
+ essentially ignores `image`.
445
+ num_inference_steps (`int`, *optional*, defaults to 50):
446
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
447
+ expense of slower inference. This parameter is modulated by `strength`.
448
+ guidance_scale (`float`, *optional*, defaults to 7.5):
449
+ A higher guidance scale value encourages the model to generate images closely linked to the text
450
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
451
+ negative_prompt (`str` or `List[str]`, *optional*):
452
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
453
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
454
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
455
+ The number of images to generate per prompt.
456
+ eta (`float`, *optional*, defaults to 0.0):
457
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
458
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
459
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
460
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
461
+ generation deterministic.
462
+ prompt_embeds (`torch.FloatTensor`, *optional*):
463
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
464
+ provided, text embeddings are generated from the `prompt` input argument.
465
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
466
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
467
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
468
+ callback (`Callable`, *optional*):
469
+ A function that calls every `callback_steps` steps during inference. The function is called with the
470
+ following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
471
+ callback_steps (`int`, *optional*, defaults to 1):
472
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
473
+ every step.
474
+
475
+ Examples:
476
+
477
+ Returns:
478
+ `List[PIL.Image.Image]`:
+ A list with the generated images (this pipeline does not return a `StableDiffusionPipelineOutput`
+ and does not run a safety checker).
483
+ """
484
+ # 1. Check inputs. Raise error if not correct
485
+ self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
486
+
487
+ # 2. Define call parameters
488
+ if prompt is not None and isinstance(prompt, str):
489
+ batch_size = 1
490
+ elif prompt is not None and isinstance(prompt, list):
491
+ batch_size = len(prompt)
492
+ else:
493
+ batch_size = prompt_embeds.shape[0]
494
+
495
+ device = self.device
496
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
497
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
498
+ # corresponds to doing no classifier free guidance.
499
+ do_classifier_free_guidance = guidance_scale > 1.0
500
+
501
+ # 3. Encode input prompt
502
+ prompt_embeds = self._encode_prompt(
503
+ prompt,
504
+ device,
505
+ num_images_per_prompt,
506
+ do_classifier_free_guidance,
507
+ negative_prompt,
508
+ prompt_embeds=prompt_embeds,
509
+ negative_prompt_embeds=negative_prompt_embeds,
510
+ )
511
+
512
+ # 4. Preprocess image
513
+ image = preprocess(image)
514
+
515
+ # 5. set timesteps
516
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
517
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
518
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
519
+
520
+ # 6. Prepare latent variables
521
+ latents = self.prepare_latents(
522
+ image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
523
+ )
524
+
525
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
526
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
527
+
528
+ # 8. Denoising loop
529
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
530
+
531
+ for i, t in enumerate(timesteps):
532
+ # expand the latents if we are doing classifier free guidance
533
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
534
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
535
+ latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
536
+
537
+ # predict the noise residual
538
+ # the trailing four None arguments are ControlNet inputs; None is passed as a placeholder for now
539
+ noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, None, None, None, None)
540
+
541
+ noise_pred = noise_pred.permute(0, 3, 1, 2)
542
+
543
+ # perform guidance
544
+ if do_classifier_free_guidance:
545
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
546
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
547
+
548
+ # compute the previous noisy sample x_t -> x_t-1
549
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
550
+
551
+ image = self.decode_latents(latents)
552
+ image = numpy_to_pil(image)
553
+
554
+ return image
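A corresponding sketch for `LyraSDImg2ImgPipeline`. The constructor and `__call__` keyword arguments match the class defined above; the import path and all file paths are placeholders, and `model_dtype` must match the format of the exported `unet_bins`.

```python
import torch
from PIL import Image

# The class is defined in lyrasd_model/lyrasd_img2img_pipeline.py above;
# whether it is re-exported from the package __init__ is not shown here.
from lyrasd_model.lyrasd_img2img_pipeline import LyraSDImg2ImgPipeline

pipe = LyraSDImg2ImgPipeline(
    "./models/lyrasd_rev_animated",                         # placeholder model directory
    "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so",  # pick the .so matching your CUDA version and GPU arch
    model_dtype="fp32",                                     # format of the exported unet_bins (assumption)
)

init_image = Image.open("./input.jpg").convert("RGB").resize((512, 512))  # placeholder input image

images = pipe(
    prompt="a fantasy landscape, trending on artstation",
    image=init_image,
    strength=0.8,            # with 50 steps, strength=0.8 runs the last 40 denoising steps
    num_inference_steps=50,
    guidance_scale=7.5,
    generator=torch.Generator("cuda").manual_seed(0),
)
images[0].save("res_img2img_0.png")
```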
lyraSD/muse_trt/libnvinfer_plugin.so → lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53cbcc8a47524652bb8e0399a2fbbcfc0b785f11bdc491bbb6a71e4b888ee124
3
- size 85198184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0689ed5d3b55f5033a8869d5f23ce900793aa0ab7fdc4a3e3c0a0f3a243c83da
3
+ size 65441456
sd1.4-engine/superx4-512-512.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm86.so RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4c37b1fa1a9966104975398cf7b4b4a3ed722335ccde47145c9eb8316030797
3
- size 10211800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8e27e715fa3a17ce25bf23b772e0dd355d0780c1bd93cfeeb12ef45b0ba2444
3
+ size 65389176
sd1.4-engine/clip.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc71dac2ad328d4e49d99fad6bbdcb1ce201d2cd376da34f55565c6a60188d1a
3
- size 247251489
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2eaa9067ad8eb1d20872afa71ed9497f62d930819704d15e5e8bf559623eca7
3
+ size 65498752
sd1.4-engine/vae-decoder.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4fd093053ffd431b8331ac11e179a31122b8636c5f2f18b8ae9614b714706d4b
3
- size 100138684
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d0c909ff2498934c6d1ed8f46af6cdc7812872177c0a4e7ca0ee99bf88fcb65
3
+ size 65519232
lyrasd_model/lyrasd_lib/placeholder.txt ADDED
File without changes
lyrasd_model/lyrasd_txt2img_pipeline.py ADDED
@@ -0,0 +1,458 @@
1
+ import inspect
2
+ import os
3
+ import time
4
+ from typing import Any, Callable, Dict, List, Optional, Union
5
+
6
+ import torch
7
+ from diffusers.loaders import TextualInversionLoaderMixin
8
+ from diffusers.models import AutoencoderKL
9
+ from diffusers.schedulers import (DPMSolverMultistepScheduler,
10
+ EulerAncestralDiscreteScheduler,
11
+ EulerDiscreteScheduler,
12
+ KarrasDiffusionSchedulers)
13
+ from diffusers.utils import logging, randn_tensor
14
+ from PIL import Image
15
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
16
+ import gc
17
+ import numpy as np
18
+
19
+ from .lora_util import add_text_lora_layer
20
+
21
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
22
+
23
+
24
+ def numpy_to_pil(images):
25
+ """
26
+ Convert a numpy image or a batch of images to a PIL image.
27
+ """
28
+ if images.ndim == 3:
29
+ images = images[None, ...]
30
+ images = (images * 255).round().astype("uint8")
31
+ if images.shape[-1] == 1:
32
+ # special case for grayscale (single channel) images
33
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
34
+ else:
35
+ pil_images = [Image.fromarray(image) for image in images]
36
+
37
+ return pil_images
38
+
39
+
40
+ class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
41
+ def __init__(self, model_path, lib_so_path, model_dtype="fp32", device=torch.device("cuda"), dtype=torch.float16) -> None:
42
+ self.device = device
43
+ self.dtype = dtype
44
+
45
+ torch.classes.load_library(lib_so_path)
46
+
47
+ self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
48
+ self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
49
+ self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
50
+ unet_path = os.path.join(model_path, "unet_bins/")
51
+
52
+ self.unet_in_channels = 4
53
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
54
+ self.vae.enable_tiling()
55
+ self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
56
+ 3, # max num of controlnets
57
+ "fp16" # inference dtype (can only use fp16 for now)
58
+ )
59
+
62
+ self.reload_unet_model(unet_path, model_dtype)
63
+
64
+ self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
65
+
66
+ self.loaded_lora = {}
67
+
68
+ def reload_unet_model(self, unet_path, unet_file_format='fp32'):
69
+ if len(unet_path) > 0 and unet_path[-1] != "/":
70
+ unet_path = unet_path + "/"
71
+ return self.unet.reload_unet_model(unet_path, unet_file_format)
72
+
73
+ def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
74
+ if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
75
+ lora_model_path = lora_model_path + "/"
76
+ lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
77
+ self.loaded_lora[lora_name] = lora
78
+ self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
79
+
80
+ def unload_lora(self, lora_name, clean_cache=False):
81
+ for layer_data in self.loaded_lora[lora_name]:
82
+ layer = layer_data['layer']
83
+ added_weight = layer_data['added_weight']
84
+ layer.weight.data -= added_weight
85
+ self.unet.unload_lora(lora_name, clean_cache)
86
+ del self.loaded_lora[lora_name]
87
+ gc.collect()
88
+ torch.cuda.empty_cache()
89
+
90
+ def clean_lora_cache(self):
91
+ self.unet.clean_lora_cache()
92
+
93
+ def get_loaded_lora(self):
94
+ return self.unet.get_loaded_lora()
95
+
96
+ def _encode_prompt(
97
+ self,
98
+ prompt,
99
+ device,
100
+ num_images_per_prompt,
101
+ do_classifier_free_guidance,
102
+ negative_prompt=None,
103
+ prompt_embeds: Optional[torch.FloatTensor] = None,
104
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
105
+ ):
106
+ r"""
107
+ Encodes the prompt into text encoder hidden states.
108
+
109
+ Args:
110
+ prompt (`str` or `List[str]`, *optional*):
111
+ prompt to be encoded
112
+ device: (`torch.device`):
113
+ torch device
114
+ num_images_per_prompt (`int`):
115
+ number of images that should be generated per prompt
116
+ do_classifier_free_guidance (`bool`):
117
+ whether to use classifier free guidance or not
118
+ negative_prompt (`str` or `List[str]`, *optional*):
119
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
120
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
121
+ less than `1`).
122
+ prompt_embeds (`torch.FloatTensor`, *optional*):
123
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
124
+ provided, text embeddings will be generated from `prompt` input argument.
125
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
126
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
127
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
128
+ argument.
129
+ """
130
+ if prompt is not None and isinstance(prompt, str):
131
+ batch_size = 1
132
+ elif prompt is not None and isinstance(prompt, list):
133
+ batch_size = len(prompt)
134
+ else:
135
+ batch_size = prompt_embeds.shape[0]
136
+
137
+ if prompt_embeds is None:
138
+ # textual inversion: process multi-vector tokens if necessary
139
+ if isinstance(self, TextualInversionLoaderMixin):
140
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
141
+
142
+ text_inputs = self.tokenizer(
143
+ prompt,
144
+ padding="max_length",
145
+ max_length=self.tokenizer.model_max_length,
146
+ truncation=True,
147
+ return_tensors="pt",
148
+ )
149
+ text_input_ids = text_inputs.input_ids
150
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
151
+
152
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
153
+ text_input_ids, untruncated_ids
154
+ ):
155
+ removed_text = self.tokenizer.batch_decode(
156
+ untruncated_ids[:, self.tokenizer.model_max_length - 1: -1]
157
+ )
158
+ logger.warning(
159
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
160
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
161
+ )
162
+
163
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
164
+ attention_mask = text_inputs.attention_mask.to(device)
165
+ else:
166
+ attention_mask = None
167
+
168
+ prompt_embeds = self.text_encoder(
169
+ text_input_ids.to(device),
170
+ attention_mask=attention_mask,
171
+ )
172
+ prompt_embeds = prompt_embeds[0]
173
+
174
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
175
+
176
+ bs_embed, seq_len, _ = prompt_embeds.shape
177
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
178
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
179
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
180
+
181
+ # get unconditional embeddings for classifier free guidance
182
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
183
+ uncond_tokens: List[str]
184
+ if negative_prompt is None:
185
+ uncond_tokens = [""] * batch_size
186
+ elif type(prompt) is not type(negative_prompt):
187
+ raise TypeError(
188
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
189
+ f" {type(prompt)}."
190
+ )
191
+ elif isinstance(negative_prompt, str):
192
+ uncond_tokens = [negative_prompt]
193
+ elif batch_size != len(negative_prompt):
194
+ raise ValueError(
195
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
196
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
197
+ " the batch size of `prompt`."
198
+ )
199
+ else:
200
+ uncond_tokens = negative_prompt
201
+
202
+ # textual inversion: process multi-vector tokens if necessary
203
+ if isinstance(self, TextualInversionLoaderMixin):
204
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
205
+
206
+ max_length = prompt_embeds.shape[1]
207
+ uncond_input = self.tokenizer(
208
+ uncond_tokens,
209
+ padding="max_length",
210
+ max_length=max_length,
211
+ truncation=True,
212
+ return_tensors="pt",
213
+ )
214
+
215
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
216
+ attention_mask = uncond_input.attention_mask.to(device)
217
+ else:
218
+ attention_mask = None
219
+
220
+ negative_prompt_embeds = self.text_encoder(
221
+ uncond_input.input_ids.to(device),
222
+ attention_mask=attention_mask,
223
+ )
224
+ negative_prompt_embeds = negative_prompt_embeds[0]
225
+
226
+ if do_classifier_free_guidance:
227
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
228
+ seq_len = negative_prompt_embeds.shape[1]
229
+
230
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
231
+
232
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
233
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
234
+
235
+ # For classifier free guidance, we need to do two forward passes.
236
+ # Here we concatenate the unconditional and text embeddings into a single batch
237
+ # to avoid doing two forward passes
238
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
239
+
240
+ return prompt_embeds
241
+
242
+ def decode_latents(self, latents):
243
+ latents = 1 / self.vae.config.scaling_factor * latents
244
+ image = self.vae.decode(latents).sample
245
+ image = (image / 2 + 0.5).clamp(0, 1)
246
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
247
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
248
+ return image
249
+
250
+ def check_inputs(
251
+ self,
252
+ prompt,
253
+ height,
254
+ width,
255
+ negative_prompt=None,
256
+ prompt_embeds=None,
257
+ negative_prompt_embeds=None,
258
+ ):
259
+ if height % 64 != 0 or width % 64 != 0: # the initial version only supports height and width that are multiples of 64
260
+ raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
261
+
262
+ if prompt is not None and prompt_embeds is not None:
263
+ raise ValueError(
264
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
265
+ " only forward one of the two."
266
+ )
267
+ elif prompt is None and prompt_embeds is None:
268
+ raise ValueError(
269
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
270
+ )
271
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
272
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
273
+
274
+ if negative_prompt is not None and negative_prompt_embeds is not None:
275
+ raise ValueError(
276
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
277
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
278
+ )
279
+
280
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
281
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
282
+ raise ValueError(
283
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
284
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
285
+ f" {negative_prompt_embeds.shape}."
286
+ )
287
+
288
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
289
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
290
+ if isinstance(generator, list) and len(generator) != batch_size:
291
+ raise ValueError(
292
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
293
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
294
+ )
295
+
296
+ if latents is None:
297
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
298
+ else:
299
+ latents = latents.to(device)
300
+
301
+ # scale the initial noise by the standard deviation required by the scheduler
302
+ latents = latents * self.scheduler.init_noise_sigma
303
+ return latents
304
+
305
+ def prepare_extra_step_kwargs(self, generator, eta):
306
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
307
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
308
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
309
+ # and should be between [0, 1]
310
+
311
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
312
+ extra_step_kwargs = {}
313
+ if accepts_eta:
314
+ extra_step_kwargs["eta"] = eta
315
+
316
+ # check if the scheduler accepts generator
317
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
318
+ if accepts_generator:
319
+ extra_step_kwargs["generator"] = generator
320
+ return extra_step_kwargs
321
+
322
+ @torch.no_grad()
323
+ def __call__(
324
+ self,
325
+ prompt: Union[str, List[str]] = None,
326
+ height: Optional[int] = None,
327
+ width: Optional[int] = None,
328
+ num_inference_steps: int = 50,
329
+ guidance_scale: float = 7.5,
330
+ negative_prompt: Optional[Union[str, List[str]]] = None,
331
+ num_images_per_prompt: Optional[int] = 1,
332
+ eta: float = 0.0,
333
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
334
+ latents: Optional[torch.FloatTensor] = None,
335
+ prompt_embeds: Optional[torch.FloatTensor] = None,
336
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
337
+ ):
338
+ r"""
339
+ Function invoked when calling the pipeline for generation.
340
+
341
+ Args:
342
+ prompt (`str` or `List[str]`, *optional*):
343
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
344
+ instead.
345
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
346
+ The height in pixels of the generated image.
347
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
348
+ The width in pixels of the generated image.
349
+ num_inference_steps (`int`, *optional*, defaults to 50):
350
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
351
+ expense of slower inference.
352
+ guidance_scale (`float`, *optional*, defaults to 7.5):
353
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
354
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
355
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
356
+ 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
357
+ usually at the expense of lower image quality.
358
+ negative_prompt (`str` or `List[str]`, *optional*):
359
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
360
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
361
+ less than `1`).
362
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
363
+ The number of images to generate per prompt.
364
+ eta (`float`, *optional*, defaults to 0.0):
365
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
366
+ [`schedulers.DDIMScheduler`], will be ignored for others.
367
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
368
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
369
+ to make generation deterministic.
370
+ latents (`torch.FloatTensor`, *optional*):
371
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
372
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
373
+ tensor will be generated by sampling using the supplied random `generator`.
374
+ prompt_embeds (`torch.FloatTensor`, *optional*):
375
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
376
+ provided, text embeddings will be generated from `prompt` input argument.
377
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
378
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
379
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
380
+ argument.
381
+
382
+ """
383
+ # 1. Check inputs. Raise error if not correct
384
+ self.check_inputs(
385
+ prompt, height, width, negative_prompt, prompt_embeds, negative_prompt_embeds
386
+ )
387
+
388
+ # 2. Define call parameters
389
+ if prompt is not None and isinstance(prompt, str):
390
+ batch_size = 1
391
+ elif prompt is not None and isinstance(prompt, list):
392
+ batch_size = len(prompt)
393
+ else:
394
+ batch_size = prompt_embeds.shape[0]
395
+
396
+ device = self.device
397
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
398
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
399
+ # corresponds to doing no classifier free guidance.
400
+ do_classifier_free_guidance = guidance_scale > 1.0
401
+
402
+ # 3. Encode input prompt
403
+ prompt_embeds = self._encode_prompt(
404
+ prompt,
405
+ device,
406
+ num_images_per_prompt,
407
+ do_classifier_free_guidance,
408
+ negative_prompt,
409
+ prompt_embeds=prompt_embeds,
410
+ negative_prompt_embeds=negative_prompt_embeds,
411
+ )
412
+
413
+ # 4. Prepare timesteps
414
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
415
+ timesteps = self.scheduler.timesteps
416
+
417
+ # 5. Prepare latent variables
418
+ num_channels_latents = self.unet_in_channels
419
+ latents = self.prepare_latents(
420
+ batch_size * num_images_per_prompt,
421
+ num_channels_latents,
422
+ height,
423
+ width,
424
+ prompt_embeds.dtype,
425
+ device,
426
+ generator,
427
+ latents,
428
+ )
429
+
430
+ # # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
431
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
432
+
433
+ # 7. Denoising loop
434
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
435
+
436
+ for i, t in enumerate(timesteps):
437
+ # expand the latents if we are doing classifier free guidance
438
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
439
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
440
+ latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
441
+
442
+ # the trailing four None arguments are ControlNet inputs; None is passed as a placeholder for now
443
+ noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, None, None, None, None)
444
+
445
+ noise_pred = noise_pred.permute(0, 3, 1, 2)
446
+ # perform guidance
447
+
448
+ if do_classifier_free_guidance:
449
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
450
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
451
+
452
+ # compute the previous noisy sample x_t -> x_t-1
453
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
454
+
455
+ image = self.decode_latents(latents)
456
+ image = numpy_to_pil(image)
457
+
458
+ return image
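Finally, a sketch for `LyraSdTxt2ImgPipeline`, including the LoRA load/unload cycle implemented above. The import path, model and LoRA paths, the LoRA strength, and the weight file format are placeholders; the method signatures follow the class definition.

```python
import torch

# The class is defined in lyrasd_model/lyrasd_txt2img_pipeline.py above;
# whether it is re-exported from the package __init__ is not shown here.
from lyrasd_model.lyrasd_txt2img_pipeline import LyraSdTxt2ImgPipeline

pipe = LyraSdTxt2ImgPipeline(
    "./models/lyrasd_rev_animated",                         # placeholder model directory
    "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so",  # pick the .so matching your CUDA version and GPU arch
)

# Optionally merge a converted LoRA into both the text encoder and the fused UNet.
pipe.load_lora("./models/lyrasd_xiaorenshu_lora", "xiaorenshu", 0.4, "fp32")

images = pipe(
    prompt="a cat sitting on a bench, cartoon style, best quality",
    height=512,              # must be a multiple of 64
    width=512,
    num_inference_steps=30,
    guidance_scale=7.5,
    negative_prompt="low quality",
    generator=torch.Generator("cuda").manual_seed(0),
)
images[0].save("res_txt2img_lora_0.png")

# Subtract the merged LoRA weights again when they are no longer needed.
pipe.unload_lora("xiaorenshu", clean_cache=True)
```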
models/README.md ADDED
@@ -0,0 +1,12 @@
1
+ # Models
2
+ ### Download the checkpoints into this directory and unzip them here
3
+
4
+ ```bash
5
+ wget -O lyrasd_rev_animated.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/lyrasd_rev_animated.tar.gz?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1694078210;1866878210&q-key-time=1694078210;1866878210&q-header-list=&q-url-param-list=&q-signature=6046546135631dee9e8be7d8e061a77e8790e675"
6
+ wget -O lyrasd_canny.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/lyrasd_canny.tar.gz?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1694078194;1866878194&q-key-time=1694078194;1866878194&q-header-list=&q-url-param-list=&q-signature=efb713ee650a0ee3c954fb3a0e148c37ef13cd3b"
7
+ wget -O lyrasd_xiaorenshu_lora.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/lyrasd_xiaorenshu_lora.tar.gz?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1694078234;1866878234&q-key-time=1694078234;1866878234&q-header-list=&q-url-param-list=&q-signature=fb9a577a54ea6dedd9be696e40b96b71a1b23b5d"
8
+
9
+ tar -xvf lyrasd_rev_animated.tar.gz
10
+ tar -xvf lyrasd_canny.tar.gz
11
+ tar -xvf lyrasd_xiaorenshu_lora.tar.gz
12
+ ```
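
After unpacking, the demo scripts expect each model folder to hold the converted UNet weights in `unet_bins/` alongside the text encoder, VAE and scheduler files described in `txt2img_demo.py`. A hedged sanity-check sketch; only `unet_bins` is named explicitly in the demos, the other subfolder names here are assumptions based on the usual diffusers layout:

```python
from pathlib import Path

# Hypothetical layout check: "unet_bins" comes from the demo comments,
# the remaining names are assumed, standard diffusers-style subfolders.
model_path = Path("./models/lyrasd_rev_animated")
expected = ["unet_bins", "text_encoder", "vae", "scheduler"]
missing = [name for name in expected if not (model_path / name).exists()]
print("missing subfolders:", missing or "none")
```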
output/img2img_demo.jpg DELETED
Binary file (22 kB)
 
output/img2img_input.jpg DELETED
Binary file (97.3 kB)
 
output/text2img_demo.jpg DELETED
Binary file (42.2 kB)
 
outputs/res_controlnet_img2img_0.png ADDED
outputs/res_controlnet_txt2img_0.png ADDED
outputs/res_img2img_0.png ADDED
outputs/res_txt2img_0.png ADDED
outputs/res_txt2img_lora_0.png ADDED
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ diffusers
+ transformers
sd1.4-engine/feature_extractor/preprocessor_config.json DELETED
@@ -1,28 +0,0 @@
- {
-   "crop_size": {
-     "height": 224,
-     "width": 224
-   },
-   "do_center_crop": true,
-   "do_convert_rgb": true,
-   "do_normalize": true,
-   "do_rescale": true,
-   "do_resize": true,
-   "feature_extractor_type": "CLIPFeatureExtractor",
-   "image_mean": [
-     0.48145466,
-     0.4578275,
-     0.40821073
-   ],
-   "image_processor_type": "CLIPImageProcessor",
-   "image_std": [
-     0.26862954,
-     0.26130258,
-     0.27577711
-   ],
-   "resample": 3,
-   "rescale_factor": 0.00392156862745098,
-   "size": {
-     "shortest_edge": 224
-   }
- }
sd1.4-engine/scheduler/scheduler_config.json DELETED
@@ -1,14 +0,0 @@
- {
-   "_class_name": "PNDMScheduler",
-   "_diffusers_version": "0.14.0",
-   "beta_end": 0.012,
-   "beta_schedule": "scaled_linear",
-   "beta_start": 0.00085,
-   "clip_sample": false,
-   "num_train_timesteps": 1000,
-   "prediction_type": "epsilon",
-   "set_alpha_to_one": false,
-   "skip_prk_steps": true,
-   "steps_offset": 1,
-   "trained_betas": null
- }
sd1.4-engine/text_encoder/config.json DELETED
@@ -1,25 +0,0 @@
- {
-   "_name_or_path": "openai/clip-vit-large-patch14",
-   "architectures": [
-     "CLIPTextModel"
-   ],
-   "attention_dropout": 0.0,
-   "bos_token_id": 0,
-   "dropout": 0.0,
-   "eos_token_id": 2,
-   "hidden_act": "quick_gelu",
-   "hidden_size": 768,
-   "initializer_factor": 1.0,
-   "initializer_range": 0.02,
-   "intermediate_size": 3072,
-   "layer_norm_eps": 1e-05,
-   "max_position_embeddings": 77,
-   "model_type": "clip_text_model",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 12,
-   "pad_token_id": 1,
-   "projection_dim": 768,
-   "torch_dtype": "float32",
-   "transformers_version": "4.25.1",
-   "vocab_size": 49408
- }
sd1.4-engine/tokenizer/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
sd1.4-engine/tokenizer/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
- {
-   "bos_token": {
-     "content": "<|startoftext|>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "eos_token": {
-     "content": "<|endoftext|>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": "<|endoftext|>",
-   "unk_token": {
-     "content": "<|endoftext|>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   }
- }
sd1.4-engine/tokenizer/tokenizer_config.json DELETED
@@ -1,34 +0,0 @@
- {
-   "add_prefix_space": false,
-   "bos_token": {
-     "__type": "AddedToken",
-     "content": "<|startoftext|>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "do_lower_case": true,
-   "eos_token": {
-     "__type": "AddedToken",
-     "content": "<|endoftext|>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "errors": "replace",
-   "model_max_length": 77,
-   "name_or_path": "openai/clip-vit-large-patch14",
-   "pad_token": "<|endoftext|>",
-   "special_tokens_map_file": "./special_tokens_map.json",
-   "tokenizer_class": "CLIPTokenizer",
-   "unk_token": {
-     "__type": "AddedToken",
-     "content": "<|endoftext|>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   }
- }
sd1.4-engine/tokenizer/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
sd1.4-engine/unet_fp16.plan DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0373e858a69bb44fe0f21c4990de3ae18d415b4d99ca44e4809ea48fc3482e5a
- size 1725864976
sd1.4-engine/vae/config.json DELETED
@@ -1,31 +0,0 @@
- {
-   "_class_name": "AutoencoderKL",
-   "_diffusers_version": "0.14.0",
-   "_name_or_path": "stabilityai/sd-vae-ft-mse",
-   "act_fn": "silu",
-   "block_out_channels": [
-     128,
-     256,
-     512,
-     512
-   ],
-   "down_block_types": [
-     "DownEncoderBlock2D",
-     "DownEncoderBlock2D",
-     "DownEncoderBlock2D",
-     "DownEncoderBlock2D"
-   ],
-   "in_channels": 3,
-   "latent_channels": 4,
-   "layers_per_block": 2,
-   "norm_num_groups": 32,
-   "out_channels": 3,
-   "sample_size": 256,
-   "scaling_factor": 0.18215,
-   "up_block_types": [
-     "UpDecoderBlock2D",
-     "UpDecoderBlock2D",
-     "UpDecoderBlock2D",
-     "UpDecoderBlock2D"
-   ]
- }
sd1.4-engine/vae/diffusion_pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a4302e1efa25f3a47ceb7536bc335715ad9d1f203e90c2d25507600d74006e89
- size 334715313
txt2img_demo.py ADDED
@@ -0,0 +1,44 @@
+ import torch
+ import time
+
+ from lyrasd_model import LyraSdTxt2ImgPipeline
+
+ # Path to the model files; the folder should contain:
+ # 1. the CLIP model
+ # 2. the converted, optimized UNet model, placed in its unet_bins folder
+ # 3. the VAE model
+ # 4. the scheduler config
+
+ # LyraSD's compiled C++ shared library, which contains the C++/CUDA compute kernels
+ lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
+ model_path = "./models/lyrasd_rev_animated"
+ lora_path = "./models/lyrasd_xiaorenshu_lora"
+
+ # Build the Txt2Img pipeline
+ model = LyraSdTxt2ImgPipeline(model_path, lib_path)
+
+ # load lora
+ # Arguments: LoRA location, name, LoRA strength, and LoRA weight precision
+ model.load_lora(lora_path, "xiaorenshu", 0.4, "fp32")
+
+ # Prepare the inputs and hyperparameters
+ prompt = "a cat, cute, cartoon, concise, traditional, chinese painting, Tang and Song Dynasties, masterpiece, 4k, 8k, UHD, best quality"
+ negative_prompt = "(((horrible))), (((scary))), (((naked))), (((large breasts))), high saturation, colorful, human:2, body:2, low quality, bad quality, lowres, out of frame, duplicate, watermark, signature, text, frames, cut, cropped, malformed limbs, extra limbs, (((missing arms))), (((missing legs)))"
+ height, width = 512, 512
+ steps = 30
+ guidance_scale = 7
+ generator = torch.Generator().manual_seed(123)
+ num_images = 1
+
+ start = time.perf_counter()
+ # Run inference
+ images = model(prompt, height, width, steps,
+                guidance_scale, negative_prompt, num_images,
+                generator=generator)
+ print("image gen cost: ", time.perf_counter() - start)
+ # Save the generated images
+ for i, image in enumerate(images):
+     image.save(f"outputs/res_txt2img_lora_{i}.png")
+
+ # unload lora: arguments are the LoRA name and whether to clear the LoRA cache
+ # model.unload_lora("xiaorenshu", True)
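
Following the commented hint at the end of the demo, a LoRA can be detached and re-attached at a different strength between generations. A minimal sketch that reuses only the calls and argument orders shown in `txt2img_demo.py` (the 0.8 strength is an illustrative value, not from the diff):

```python
# Sketch only: continues from the variables defined in txt2img_demo.py above.
# unload_lora takes the LoRA name and whether to clear its cache.
model.unload_lora("xiaorenshu", True)

# Re-attach the same LoRA at a higher (illustrative) strength, then generate again.
model.load_lora(lora_path, "xiaorenshu", 0.8, "fp32")
images = model(prompt, height, width, steps,
               guidance_scale, negative_prompt, num_images,
               generator=generator)
```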