Merge branch 'main' of https://huggingface.co/spaces/FantasticGNU/AnomalyGPT
Files changed:
- .gitattributes +2 -0
- README.md +5 -0
- app.py +24 -32
- capsule_crack.png +3 -0
- carpet_normal.jpg +0 -0
- hazelnut_cut.png +3 -0
- header.py +1 -1
- model/ImageBind/data.py +1 -1
- model/openllama.py +25 -5
- requirements.txt +2 -0
.gitattributes
CHANGED
@@ -36,3 +36,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+hazelnut_cut.png filter=lfs diff=lfs merge=lfs -text
+capsule_crack.png filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,5 @@
+---
+license: cc-by-sa-4.0
+title: AnomalyGPT
+sdk: gradio
+---
app.py
CHANGED
@@ -14,14 +14,13 @@ args = {
     'model': 'openllama_peft',
     'imagebind_ckpt_path': './pretrained_ckpt/imagebind_ckpt/imagebind_huge.pth',
     'vicuna_ckpt_path': './pretrained_ckpt/vicuna_ckpt/7b_v0',
-    'anomalygpt_ckpt_path': './ckpt/
+    'anomalygpt_ckpt_path': './ckpt/train_supervised/pytorch_model.pt',
     'delta_ckpt_path': './pretrained_ckpt/pandagpt_ckpt/7b/pytorch_model.pt',
     'stage': 2,
     'max_tgt_len': 128,
     'lora_r': 32,
     'lora_alpha': 32,
-    'lora_dropout': 0.1
-    'layers': [7,15,23,31]
+    'lora_dropout': 0.1
 }
 
 model = OpenLLAMAPEFTModel(**args)
@@ -29,10 +28,9 @@ delta_ckpt = torch.load(args['delta_ckpt_path'], map_location=torch.device('cpu'))
 model.load_state_dict(delta_ckpt, strict=False)
 delta_ckpt = torch.load(args['anomalygpt_ckpt_path'], map_location=torch.device('cpu'))
 model.load_state_dict(delta_ckpt, strict=False)
-model = model.eval()
-
-
-output = None
+model = model.eval().to(torch.bfloat16)#.half()#.cuda()
+# model.image_decoder = model.image_decoder.cuda()
+# model.prompt_learner = model.prompt_learner.cuda()
 
 """Override Chatbot.postprocess"""
 def postprocess(self, y):
@@ -127,7 +125,7 @@ def predict(
     history.append((input, response))
 
 
-    plt.imshow(pixel_output.reshape(224,224).detach().cpu(), cmap='binary_r')
+    plt.imshow(pixel_output.to(torch.float16).reshape(224,224).detach().cpu(), cmap='binary_r')
     plt.axis('off')
     plt.savefig('output.png',bbox_inches='tight',pad_inches = 0)
 
@@ -156,57 +154,48 @@ def predict(
     eroded_image = cv2.erode(image, kernel, iterations=1)
     cv2.imwrite('output.png', eroded_image)
 
-    global output
     output = PILImage.open('output.png').convert('L')
 
 
-    return chatbot, history, modality_cache
+    return chatbot, history, modality_cache, output
 
 
-def get_image():
-    global output
-    return output if output else "ffffff.png"
-
 
 def reset_user_input():
     return gr.update(value='')
 
-def reset_dialog():
-    return [], []
 
 def reset_state():
-
-    output = None
-    return None, None, [], [], []
-
+    return gr.update(value=''), None, None, [], [], [], PILImage.open('ffffff.png')
 
+examples = ['hazelnut_cut.png','capsule_crack.png','carpet_normal.jpg']
 
 with gr.Blocks() as demo:
     gr.HTML("""<h1 align="center">Demo of AnomalyGPT</h1>""")
 
     with gr.Row():
         with gr.Column(scale=1):
-            with gr.Row(scale=3):
-                image_path = gr.Image(type="filepath", label="Query Image", value=None)
-            with gr.Row(scale=3):
-                normal_img_path = gr.Image(type="filepath", label="Normal Image", value=None)
             with gr.Row():
-
+                image_path = gr.Image(type="filepath", label="Query Image", value=examples[0])
             with gr.Row():
-
+                normal_img_path = gr.Image(type="filepath", label="Normal Image (optional)", value=None)
+            with gr.Row():
+                gr.Examples(examples=examples, inputs=[image_path])
             with gr.Row():
+                max_length = gr.Slider(0, 512, value=512, step=1.0, label="Max length", interactive=True)
+                top_p = gr.Slider(0, 1, value=0.01, step=0.01, label="Top P", interactive=True)
                 temperature = gr.Slider(0, 1, value=1.0, step=0.01, label="Temperature", interactive=True)
 
 
         with gr.Column(scale=3):
            with gr.Row():
                with gr.Column(scale=6):
-                    chatbot = gr.Chatbot().style(height=
+                    chatbot = gr.Chatbot().style(height=440)
                with gr.Column(scale=4):
                     # gr.Image(output)
-                    image_output = gr.Image(
+                    image_output = gr.Image(interactive=False, label="Localization Output", type='pil',value=PILImage.open('ffffff.png'))
            with gr.Row():
-                user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=
+                user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=12).style(container=False)
            with gr.Row():
                with gr.Column(scale=2):
                     submitBtn = gr.Button("Submit", variant="primary")
@@ -230,19 +219,22 @@ with gr.Blocks() as demo:
     ], [
         chatbot,
         history,
-        modality_cache
+        modality_cache,
+        image_output
     ],
     show_progress=True
     )
 
     submitBtn.click(reset_user_input, [], [user_input])
    emptyBtn.click(reset_state, outputs=[
+        user_input,
         image_path,
         normal_img_path,
         chatbot,
         history,
-        modality_cache
+        modality_cache,
+        image_output
     ], show_progress=True)
 
 
-demo.queue().launch(
+demo.queue().launch()
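A note on the dtype changes in app.py above: the model now runs in bfloat16 on CPU, and the predicted anomaly map is cast to float16 before plotting because NumPy has no bfloat16 dtype, so matplotlib cannot consume the tensor directly. A minimal sketch of that step, using a random placeholder in place of the real pixel_output:

import torch
import matplotlib.pyplot as plt

# Placeholder for the 224x224 pixel-level anomaly map (bfloat16, as in app.py).
pixel_output = torch.rand(1, 1, 224, 224).to(torch.bfloat16)

# Cast before handing the tensor to matplotlib; NumPy cannot represent bfloat16,
# so float16 (as in the diff) or float32 both work for plotting.
heatmap = pixel_output.to(torch.float16).reshape(224, 224).detach().cpu()

plt.imshow(heatmap, cmap='binary_r')
plt.axis('off')
plt.savefig('output.png', bbox_inches='tight', pad_inches=0)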
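The Gradio changes route the localization mask through the event system instead of a module-level global: predict() returns the PIL mask as an extra output mapped to image_output, reset_state() returns one value per component in the emptyBtn outputs list, and gr.Examples pre-fills the query image. A rough, self-contained sketch of that wiring pattern, assuming Gradio 3.x as used by this Space; predict_stub and the Clear button are stand-ins for the app's real predict() and emptyBtn:

import gradio as gr
from PIL import Image as PILImage

examples = ['hazelnut_cut.png', 'capsule_crack.png', 'carpet_normal.jpg']
# The Space ships these images in this commit; create blank stand-ins so the
# sketch also runs outside the repository.
for name in examples:
    PILImage.new('RGB', (224, 224), 'white').save(name)

def predict_stub(query, history):
    # Stand-in for the real predict(): appends a chat turn and returns a PIL
    # mask as the extra value that Gradio routes into image_output.
    history = history + [(query, "stub response")]
    return history, history, PILImage.new('L', (224, 224), 255)

def reset_state():
    # One return value per component listed in outputs=, in the same order.
    return gr.update(value=''), None, [], [], PILImage.new('L', (224, 224), 255)

with gr.Blocks() as demo:
    image_path = gr.Image(type="filepath", label="Query Image", value=examples[0])
    gr.Examples(examples=examples, inputs=[image_path])
    chatbot = gr.Chatbot()
    image_output = gr.Image(interactive=False, label="Localization Output", type='pil')
    user_input = gr.Textbox(show_label=False, placeholder="Input...")
    history = gr.State([])

    user_input.submit(predict_stub, [user_input, history],
                      [chatbot, history, image_output])
    gr.Button("Clear").click(reset_state,
                             outputs=[user_input, image_path, chatbot, history, image_output])

# demo.queue().launch()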
capsule_crack.png
ADDED (Git LFS)
carpet_normal.jpg
ADDED
hazelnut_cut.png
ADDED (Git LFS)
header.py
CHANGED
@@ -25,7 +25,7 @@ import logging
 from copy import deepcopy
 import ipdb
 import argparse
-import data
+from model.ImageBind import data
 from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaConfig
 from torch.nn.utils.rnn import pad_sequence
 from peft import LoraConfig, TaskType, get_peft_model
model/ImageBind/data.py
CHANGED
@@ -23,7 +23,7 @@ from torchvision.transforms._transforms_video import NormalizeVideo
 
 DEFAULT_AUDIO_FRAME_SHIFT_MS = 10  # in milliseconds
 
-BPE_PATH = "
+BPE_PATH = "./model/ImageBind/bpe/bpe_simple_vocab_16e6.txt.gz"
 
 
 def waveform2melspec(waveform, sample_rate, num_mel_bins, target_length):
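The new BPE_PATH resolves against the process working directory, which works because app.py is launched from the repository root. A small alternative sketch (not what this commit does) that resolves the vocabulary file relative to data.py itself, so the import is independent of the CWD:

import os

# Resolve the BPE vocabulary next to this module rather than the current
# working directory; the committed code hard-codes "./model/ImageBind/..." instead.
BPE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        "bpe", "bpe_simple_vocab_16e6.txt.gz")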
model/openllama.py
CHANGED
@@ -10,6 +10,8 @@ import kornia as K
 
 import torch
 from torch.nn.utils import rnn
+from transformers import AutoConfig, AutoModelForCausalLM
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch, infer_auto_device_map
 
 CLASS_NAMES = ['bottle', 'cable', 'capsule', 'carpet', 'grid', 'hazelnut', 'leather', 'metal nut', 'pill', 'screw', 'tile', 'toothbrush', 'transistor', 'wood', 'zipper', 'object',
                'candle', 'cashew', 'chewinggum', 'fryum', 'macaroni', 'pcb', 'pipe fryum']
@@ -165,17 +167,21 @@ class OpenLLAMAPEFTModel(nn.Module):
         max_tgt_len = args['max_tgt_len']
         stage = args['stage']
 
+        self.device = torch.device('cpu') # torch.cuda.current_device()
+
         print (f'Initializing visual encoder from {imagebind_ckpt_path} ...')
 
         self.visual_encoder, self.visual_hidden_size = imagebind_model.imagebind_huge(args)
+        self.visual_encoder.to(self.device)
         imagebind_ckpt = torch.load(imagebind_ckpt_path, map_location=torch.device('cpu'))
         self.visual_encoder.load_state_dict(imagebind_ckpt, strict=True)
+
 
         self.iter = 0
 
-        self.image_decoder = LinearLayer(1280, 1024, 4)
+        self.image_decoder = LinearLayer(1280, 1024, 4).to(self.device)
 
-        self.prompt_learner = PromptLearner(1, 4096)
+        self.prompt_learner = PromptLearner(1, 4096).to(self.device)
 
         self.loss_focal = FocalLoss()
         self.loss_dice = BinaryDiceLoss()
@@ -199,11 +205,25 @@ class OpenLLAMAPEFTModel(nn.Module):
             target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']
         )
 
-
+        # config = AutoConfig.from_pretrained(vicuna_ckpt_path)
+        # with init_empty_weights():
+        #     self.llama_model = AutoModelForCausalLM.from_config(config)
+
+        # # device_map = infer_auto_device_map(self.llama_model, no_split_module_classes=["OPTDecoderLayer"], dtype="float16")
+        # # print(device_map)
+        # device_map = {'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10.self_attn': 0, 'model.layers.10.mlp.gate_proj': 0, 'model.layers.10.mlp.down_proj': 'cpu', 'model.layers.10.mlp.up_proj': 'cpu', 'model.layers.10.mlp.act_fn': 'cpu', 'model.layers.10.input_layernorm': 'cpu', 'model.layers.10.post_attention_layernorm': 'cpu', 'model.layers.11': 'cpu', 'model.layers.12': 'cpu', 'model.layers.13': 'cpu', 'model.layers.14': 'cpu', 'model.layers.15': 'cpu', 'model.layers.16': 'cpu', 'model.layers.17': 'cpu', 'model.layers.18': 'cpu', 'model.layers.19': 'cpu', 'model.layers.20': 'cpu', 'model.layers.21': 'cpu', 'model.layers.22': 'cpu', 'model.layers.23': 'cpu', 'model.layers.24': 'disk', 'model.layers.25': 'disk', 'model.layers.26': 'disk', 'model.layers.27': 'disk', 'model.layers.28': 'disk', 'model.layers.29': 'disk', 'model.layers.30': 'disk', 'model.layers.31.self_attn': 'disk', 'model.layers.31.mlp.gate_proj': 'disk', 'model.layers.31.mlp.down_proj': 'disk', 'model.layers.31.mlp.up_proj': 'disk', 'model.layers.31.mlp.act_fn': 'disk', 'model.layers.31.input_layernorm': 'disk', 'model.layers.31.post_attention_layernorm': 'disk', 'model.norm': 'disk', 'lm_head': 'disk'}
+        # # self.llama_model = load_checkpoint_and_dispatch(self.llama_model, vicuna_ckpt_path, device_map=device_map, offload_folder="offload", offload_state_dict = True)
+        # # self.llama_model.to(torch.float16)
+        # # try:
+        self.llama_model = AutoModelForCausalLM.from_pretrained(vicuna_ckpt_path, torch_dtype=torch.bfloat16, device_map='auto', offload_folder="offload", offload_state_dict = True)
+        # # except:
+        #     pass
+        # finally:
+        #     print(self.llama_model.hf_device_map)
         self.llama_model = get_peft_model(self.llama_model, peft_config)
         self.llama_model.print_trainable_parameters()
 
-        self.llama_tokenizer = LlamaTokenizer.from_pretrained(vicuna_ckpt_path, use_fast=False)
+        self.llama_tokenizer = LlamaTokenizer.from_pretrained(vicuna_ckpt_path, use_fast=False, torch_dtype=torch.bfloat16, device_map='auto', offload_folder="offload", offload_state_dict = True)
         self.llama_tokenizer.pad_token = self.llama_tokenizer.eos_token
         self.llama_tokenizer.padding_side = "right"
         print ('Language decoder initialized.')
@@ -213,7 +233,7 @@ class OpenLLAMAPEFTModel(nn.Module):
         )
 
         self.max_tgt_len = max_tgt_len
-
+
 
 
     def rot90_img(self,x,k):
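The openllama.py change replaces the plain LlamaForCausalLM load with AutoModelForCausalLM plus accelerate-backed dispatch, so the 7B Vicuna weights are loaded in bfloat16 and split across the available devices with CPU/disk offload instead of being materialized in fp32 on a single device. A minimal sketch of that loading pattern, assuming a local Vicuna-7B checkpoint directory; the tokenizer itself is plain Python on CPU and needs no dtype or device placement:

import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer

# Assumed local checkpoint path; any Vicuna-7B directory would do.
vicuna_ckpt_path = './pretrained_ckpt/vicuna_ckpt/7b_v0'

# device_map='auto' lets accelerate place layers on GPU/CPU (and disk via
# offload_folder) instead of loading the whole model onto one device.
llama_model = AutoModelForCausalLM.from_pretrained(
    vicuna_ckpt_path,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    offload_folder='offload',
    offload_state_dict=True,
)

# The slow (sentencepiece-based) tokenizer, as selected by use_fast=False.
llama_tokenizer = LlamaTokenizer.from_pretrained(vicuna_ckpt_path, use_fast=False)
llama_tokenizer.pad_token = llama_tokenizer.eos_token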
requirements.txt
CHANGED
@@ -23,3 +23,5 @@ torchaudio==0.13.1
 torchvision==0.14.1
 tqdm==4.64.1
 transformers==4.29.1
+sentencepiece
+accelerate
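The two new requirements back the code changes above: sentencepiece is needed by the slow LlamaTokenizer (use_fast=False), and accelerate provides the device_map='auto' dispatch and offloading. A tiny, optional sanity check for the Space's environment:

# Both packages are pulled in indirectly by transformers at model/tokenizer load time.
import sentencepiece
import accelerate

print("sentencepiece", sentencepiece.__version__)
print("accelerate", accelerate.__version__)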