pengdaqian committed
Commit 171f55b
1 parent: f60bff4
Files changed (19)
  1. Dockerfile +28 -0
  2. README.md +10 -0
  3. audio_to_text.py +75 -0
  4. data.py +58 -0
  5. dedup_audio_text_80.json +3 -0
  6. dedup_audio_text_80.safetensors +3 -0
  7. demo.wav +0 -0
  8. main.py +105 -0
  9. model.py +36 -0
  10. phone.jpg +0 -0
  11. pipeline.py +36 -0
  12. prompt.json +3 -0
  13. requirements.txt +11 -0
  14. test.py +7 -0
  15. text_data.py +20 -0
  16. text_to_audio_embedding.py +80 -0
  17. text_to_img.py +46 -0
  18. utils.py +16 -0
  19. warm_up.py +20 -0
Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM python:3.8
+ 
+ RUN useradd -m -u 1000 user
+ 
+ USER user
+ 
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ 
+ COPY requirements.txt requirements.txt
+ RUN pip3 install --no-cache-dir -r requirements.txt
+ 
+ RUN mkdir -p $HOME/app/cache/transformers
+ RUN chmod 777 $HOME/app/cache/transformers
+ 
+ ENV TRANSFORMERS_CACHE $HOME/app/cache/transformers
+ ENV HF_HOME $HOME/app/cache/transformers
+ 
+ WORKDIR $HOME/app
+ 
+ COPY --chown=user . $HOME/app
+ 
+ # cache model
+ RUN python3 warm_up.py
+ 
+ EXPOSE 7860
+ 
+ CMD ["sh", "-c", "python3 main.py"]
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Audio Img
+ emoji: 🏃
+ colorFrom: purple
+ colorTo: pink
+ sdk: docker
+ pinned: false
+ ---
+ 
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
audio_to_text.py ADDED
@@ -0,0 +1,75 @@
+ import random
+ 
+ import librosa
+ import laion_clap
+ import torch
+ import numpy as np
+ from safetensors import safe_open
+ from transformers import pipeline
+ 
+ 
+ class AudioPipeline(object):
+     def __init__(self, audio_text_path, audio_text_embeddings_path):
+         self.model = laion_clap.CLAP_Module(enable_fusion=False)
+         self.model.load_ckpt()  # download the default pretrained checkpoint
+         self.audio_text_path = audio_text_path
+         self.audio_text_embeddings_path = audio_text_embeddings_path
+         self.gpt2_pipe = pipeline('text-generation', model='Gustavosta/MagicPrompt-Stable-Diffusion', tokenizer='gpt2')
+ 
+     def audio_embedding(self, file_path):
+         # quantization helpers: round-trip through int16 to match CLAP's preprocessing
+         def int16_to_float32(x):
+             return (x / 32767.0).astype(np.float32)
+ 
+         def float32_to_int16(x):
+             x = np.clip(x, a_min=-1., a_max=1.)
+             return (x * 32767.).astype(np.int16)
+ 
+         # Get audio embeddings from audio data
+         audio_data, _ = librosa.load(file_path, sr=48000)  # sample rate should be 48000
+         audio_data = audio_data.reshape(1, -1)  # make it (1, T) or (N, T)
+         audio_data = torch.from_numpy(
+             int16_to_float32(float32_to_int16(audio_data))).float()  # quantize before sending it to the model
+         audio_embed = self.model.get_audio_embedding_from_data(x=audio_data, use_tensor=True)
+         return audio_embed
+ 
+     def load_candidate_text(self):
+         import json
+         with open(self.audio_text_path, 'r') as f:
+             texts = json.load(f)
+ 
+         tensors = {}
+         with safe_open(self.audio_text_embeddings_path, framework="pt", device=0) as f:
+             for k in f.keys():
+                 tensors[k] = f.get_tensor(k)
+         text_embed = tensors["text_embed"]
+         return texts, text_embed
+ 
+     def audio2txt(self, filepath):
+         audio_embed = self.audio_embedding(filepath)
+         texts, text_embed = self.load_candidate_text()
+         # Score the audio embedding against every candidate caption embedding
+         result_tensor = torch.matmul(audio_embed, text_embed.transpose(0, 1))
+         similarity_scores = torch.softmax(result_tensor, dim=1)
+         topk_scores, topk_indices = torch.topk(similarity_scores, k=10, dim=1)
+         print("Top 10 similarity scores:", topk_scores)
+         print("Top 10 sentence indices:", topk_indices)
+ 
+         topK_sentences = [texts[idx].replace("The sounds of", "") for idx in topk_indices[0].tolist()]
+         starting_text = topK_sentences[0]
+         response = self.gpt2_pipe(starting_text, max_length=(len(starting_text) + random.randint(60, 90)),
+                                   num_return_sequences=4)
+         response_list = []
+         for x in response:
+             resp = x['generated_text'].strip()
+             # keep only expansions that grow the prompt and do not end mid-clause
+             if resp != starting_text and len(resp) > (len(starting_text) + 4) and not resp.endswith((":", "-", "—")):
+                 response_list.append(resp)
+         # fall back to the retrieved caption if GPT-2 produced nothing usable
+         return response_list[0] if response_list else starting_text
+ 
+ 
+ if __name__ == "__main__":
+     audio_pipeline = AudioPipeline(audio_text_path='/root/autodl-tmp/dedup_audio_text_80.json',
+                                    audio_text_embeddings_path='/root/autodl-tmp/audio_text_embeddings.safetensors')
+     texts = audio_pipeline.audio2txt('/root/autodl-tmp/下载.wav')
+     print(texts)
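The retrieval step in audio_to_text.py can be exercised on its own. Below is a minimal sketch, assuming laion_clap and safetensors are installed, that a demo.wav sits in the working directory, and that the caption list and its CLAP text embeddings have already been produced by data.py / text_to_audio_embedding.py (both paths are placeholders):

    import json
    import numpy as np
    import laion_clap
    from safetensors.numpy import load_file

    # Placeholder paths; point these at the caption list and the embeddings
    # produced by data.py / text_to_audio_embedding.py.
    CAPTIONS_PATH = "dedup_audio_text_80.json"
    EMBEDDINGS_PATH = "audio_text_embeddings.safetensors"

    model = laion_clap.CLAP_Module(enable_fusion=False)
    model.load_ckpt()  # default pretrained CLAP checkpoint

    with open(CAPTIONS_PATH) as f:
        captions = json.load(f)
    text_embed = load_file(EMBEDDINGS_PATH)["text_embed"]  # (N, D) float32 array

    # Embed one audio file and rank every candidate caption by dot product.
    audio_embed = model.get_audio_embedding_from_filelist(x=["demo.wav"], use_tensor=False)  # (1, D)
    scores = audio_embed @ text_embed.T               # (1, N) similarity scores
    top_idx = np.argsort(scores[0])[::-1][:5]         # five best-matching captions
    for idx in top_idx:
        print(f"{scores[0, idx]:.3f}  {captions[idx]}")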
data.py ADDED
@@ -0,0 +1,58 @@
+ from sentence_transformers import util
+ import json
+ from safetensors.numpy import load_file
+ import numpy as np
+ import torch
+ from safetensors.torch import save_file
+ 
+ 
+ def dedup_similar_sentences(sentences, threshold=0.9, batch_size=64):
+     file = load_file("/root/autodl-tmp/dedup_audio_text.safetensors")
+     sentence_embeddings = torch.tensor(file['text_embed']).cuda()
+ 
+     batch_idx = 0
+     while batch_idx * batch_size < len(sentences):
+         start_idx = batch_idx * batch_size
+         end_idx = min((batch_idx + 1) * batch_size, len(sentences))
+ 
+         batch_embeddings = sentence_embeddings[start_idx:end_idx]
+         cosine_scores = torch.matmul(batch_embeddings, sentence_embeddings.T) / (
+                 torch.norm(batch_embeddings, dim=1)[:, None] *
+                 torch.norm(sentence_embeddings, dim=1))
+ 
+         duplicate_indices = torch.where((cosine_scores > threshold) & (cosine_scores < 1.0))
+         duplicate_indices_list = duplicate_indices[1].tolist()
+         remove_idx = set(duplicate_indices_list)
+ 
+         # Update sentences and sentence_embeddings to remove duplicates
+         keep_indices = [idx for idx in range(len(sentences)) if idx not in remove_idx]
+         sentences = [sentences[idx] for idx in keep_indices]
+         sentence_embeddings = sentence_embeddings[keep_indices]
+ 
+         print(len(sentences))
+         print(sentence_embeddings.shape)
+         # Update batch_idx accordingly
+         batch_idx = start_idx // batch_size + 1
+ 
+     uq_sentences = sentences
+ 
+     return uq_sentences, sentence_embeddings
+ 
+ 
+ def read_default_prompt():
+     with open('/root/autodl-tmp/dedup_audio_text.json', 'r') as f:
+         data = json.load(f)
+     return data
+ 
+ 
+ if __name__ == '__main__':
+     all_texts = read_default_prompt()
+     unique_sentences, unique_embeddings = dedup_similar_sentences(all_texts, threshold=0.8)
+     with open("/root/autodl-tmp/dedup_audio_text_80.json", "w") as outfile:
+         json.dump(unique_sentences, outfile)
+ 
+     tensors = {
+         "text_embed": unique_embeddings,
+     }
+     save_file(tensors, "/root/autodl-tmp/dedup_audio_text_80.safetensors")
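The batch loop in data.py implements a thresholded cosine-similarity dedup. The sketch below illustrates the underlying idea on a toy tensor with a simpler greedy pass (normalize rows, compare every pair, drop anything above the threshold other than the row itself); it is not the exact procedure used above:

    import torch

    def dedup_by_cosine(embeddings: torch.Tensor, threshold: float = 0.8):
        """Return indices of rows to keep, dropping near-duplicate rows.

        embeddings: (N, D) float tensor, one row per sentence.
        """
        normed = torch.nn.functional.normalize(embeddings, dim=1)
        cosine = normed @ normed.T  # (N, N) pairwise cosine scores
        keep, removed = [], set()
        for i in range(cosine.size(0)):
            if i in removed:
                continue
            keep.append(i)
            # Everything too similar to row i (other than i itself) is a duplicate.
            dup = torch.where(cosine[i] > threshold)[0].tolist()
            removed.update(j for j in dup if j != i)
        return keep

    # Toy example: rows 0 and 1 are nearly identical, so only one survives.
    emb = torch.tensor([[1.0, 0.0], [0.99, 0.01], [0.0, 1.0]])
    print(dedup_by_cosine(emb, threshold=0.8))  # -> [0, 2]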
dedup_audio_text_80.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba048799f695abb7ff2e23a21e41f5f921ff3420b6973fdfa21a1323b48f6b74
+ size 4982806
dedup_audio_text_80.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40324130f06f88f852dc189bd8634dd3c03d607e028c0d424f9fbcb5d288b619
+ size 173641816
demo.wav ADDED
Binary file (261 kB).
main.py ADDED
@@ -0,0 +1,105 @@
+ import io
+ import os
+ from typing import Union, List
+ 
+ import requests
+ import uvicorn
+ from fastapi import BackgroundTasks, FastAPI
+ 
+ import model
+ from audio_to_text import AudioPipeline
+ from text_to_img import init_text2img_pipe, predict
+ from utils import read_from_url
+ from minio import Minio
+ import uuid
+ 
+ app = FastAPI()
+ 
+ 
+ def write_scan_audio_result(audio_id: int, scans: List[int], url: str, callback: str):
+     score_general_threshold = 0.35
+     score_character_threshold = 0.85
+ 
+     image_url = audio_file_to_image_url(url)
+ 
+     if image_url is None:
+         image_url = ""
+ 
+     print(image_url)
+ 
+     callBackReq = model.AudioScanCallbackRequest(id=audio_id, isValid=True, image_url=image_url)
+ 
+     try:
+         requests.post(callback, json=callBackReq.dict())
+     except Exception as ex:
+         print(ex)
+ 
+     # tags = list(map(lambda x: model.AudioScanTag(type="Moderation",
+     #                                              confidence=x['confidence']), nsfw_tags))
+ 
+     # AudioScanResponse.tags expects a list, so return an empty list rather than None
+     ret = model.AudioScanResponse(ok=True, error="", deleted=False, blockedFor=[], tags=[])
+     return ret
+ 
+ 
+ def write_scan_model_result(model_name: str, callback: str):
+     pass
+ 
+ 
+ @app.post("/audio-scan")
+ async def image_scan_handler(req: model.AudioScanRequest, background_tasks: BackgroundTasks):
+     if not req.wait:
+         background_tasks.add_task(write_scan_audio_result,
+                                   audio_id=req.audioId,
+                                   scans=req.scans,
+                                   url=req.url, callback=req.callbackUrl)
+         return model.AudioScanResponse(ok=True, error="", deleted=False, blockedFor=[], tags=[])
+     else:
+         ret = write_scan_audio_result(audio_id=req.audioId,
+                                       scans=req.scans,
+                                       url=req.url, callback=req.callbackUrl)
+         return ret
+ 
+ 
+ def audio_file_to_image_url(audiofile):
+     file_path = read_from_url(audiofile)
+     text = auto_pipeline.audio2txt(file_path)
+     negative_prompt = [
+         "(watermark:2)", "signature", "username", "(text:2)", "website",
+         "(worst quality:2)", "(low quality:2)", "(normal quality:2)", "polar lowres", "jpeg",
+         "((monochrome))", "((grayscale))", "sketches", "Paintings",
+         "(blurry:2)", "cropped", "lowres", "error", "sketches",
+         "(duplicate:1.331)", "(morbid:1.21)", "(mutilated:1.21)", "(tranny:1.331)",
+         "(bad proportions:1.331)",
+     ]
+     images = predict(text, " ".join(negative_prompt), text2img_pipeline)
+     for image in images:
+         in_mem_file = io.BytesIO()
+         image.save(in_mem_file, format='png', pnginfo=None)
+         in_mem_file.seek(0)
+ 
+         object_name = str(uuid.uuid4())  # MinIO expects the object name as a string
+ 
+         s3_client.put_object(
+             bucket_name=os.environ.get("S3_BUCKET"),
+             object_name=object_name,
+             data=in_mem_file,
+             length=in_mem_file.getbuffer().nbytes,
+             content_type="image/png"
+         )
+         image_url = f'{os.environ.get("PUB_VIEW_URL")}/{object_name}'
+         return image_url
+ 
+ 
+ global auto_pipeline, text2img_pipeline, s3_client
+ 
+ if __name__ == "__main__":
+     auto_pipeline = AudioPipeline(audio_text_path='/home/user/app/dedup_audio_text_80.json',
+                                   audio_text_embeddings_path='/home/user/app/audio_text_embeddings.safetensors')
+     text2img_pipeline = init_text2img_pipe()
+     s3_client = Minio(
+         os.environ.get("S3_ENDPOINT"),
+         access_key=os.environ.get("S3_ACCESS_KEY"),
+         secret_key=os.environ.get("S3_SECRET_KEY"),
+     )
+ 
+     uvicorn.run(app, host="0.0.0.0", port=7860)
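Once main.py is running, the /audio-scan route can be exercised with a plain requests call; a small sketch, where the host, audio URL and callback URL are placeholders:

    import requests

    # Placeholder values; adjust host/port, audio URL and callback URL for your deployment.
    payload = {
        "audioId": 123,
        "url": "https://example.com/some-audio.wav",
        "wait": True,          # False queues the scan as a background task instead
        "scans": [],
        "callbackUrl": "https://example.com/audio-scan-callback",
    }

    resp = requests.post("http://localhost:7860/audio-scan", json=payload)
    print(resp.status_code)
    print(resp.json())  # AudioScanResponse fields: ok, error, deleted, blockedFor, tags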
model.py ADDED
@@ -0,0 +1,36 @@
+ from typing import List, Optional
+ 
+ from pydantic import BaseModel
+ 
+ 
+ class AudioScanRequest(BaseModel):
+     audioId: int
+     url: str
+     wait: bool
+     scans: List[int]
+     callbackUrl: str
+ 
+ 
+ class AudioScanTag(BaseModel):
+     type: str
+     name: str
+ 
+ 
+ class AudioScanResponse(BaseModel):
+     ok: bool
+     error: str
+     deleted: bool
+     blockedFor: List[str]
+     tags: List[AudioScanTag]
+ 
+ 
+ class AudioTag(BaseModel):
+     tag: str
+     id: Optional[int]
+     confidence: int
+ 
+ 
+ class AudioScanCallbackRequest(BaseModel):
+     id: int
+     isValid: bool
+     image_url: str
phone.jpg ADDED
pipeline.py ADDED
@@ -0,0 +1,36 @@
+ import torch
+ 
+ import matplotlib.pyplot as plt
+ from datasets import load_dataset
+ from diffusers import DiffusionPipeline
+ from transformers import (
+     WhisperForConditionalGeneration,
+     WhisperProcessor,
+ )
+ 
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ 
+ ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ 
+ audio_sample = ds[3]
+ 
+ text = audio_sample["text"].lower()
+ speech_data = audio_sample["audio"]["array"]
+ 
+ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
+ processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+ 
+ diffuser_pipeline = DiffusionPipeline.from_pretrained(
+     "CompVis/stable-diffusion-v1-4",
+     custom_pipeline="audio_to_image_pipeline.py",
+     speech_model=model,
+     speech_processor=processor,
+     torch_dtype=torch.float16,
+ )
+ 
+ diffuser_pipeline.enable_attention_slicing()
+ diffuser_pipeline = diffuser_pipeline.to(device)
+ 
+ output = diffuser_pipeline(speech_data)
+ plt.imshow(output.images[0])
prompt.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf7cb6869d3ea66d777f225002e9a7df1de92162bdae2832ba03dca3865c79c2
+ size 117
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ uvicorn[standard]
+ fastapi
+ huggingface_hub
+ Pillow
+ onnxruntime
+ numpy
+ diffusers
+ transformers
+ torch
+ requests
+ librosa
test.py ADDED
@@ -0,0 +1,7 @@
+ from optimum.onnxruntime import ORTStableDiffusionPipeline
+ 
+ model_id = "eatdianatoday/yiwu"
+ pipe = ORTStableDiffusionPipeline.from_pretrained(model_id, export=True)
+ prompt = "a photo of an astronaut riding a horse on mars"
+ images = pipe(prompt).images[0]
+ pipe.save_pretrained("/root/autodl-tmp/onnx-novelai-diffusion")
text_data.py ADDED
@@ -0,0 +1,20 @@
+ import pandas as pd
+ import json
+ 
+ # read the CSV files into pandas DataFrames
+ df = pd.read_csv('/root/autodl-tmp/audioset_balanced_train.csv')
+ 
+ captions = df[" caption"].tolist()
+ df = pd.read_csv('/root/autodl-tmp/Epidemic_all_debiased.csv', on_bad_lines='skip')
+ captions_2 = df[' caption2'].tolist()
+ df = pd.read_csv('/root/autodl-tmp/audioset_eval.csv', on_bad_lines='skip')
+ captions_3 = df['caption'].tolist()
+ df = pd.read_csv('/root/autodl-tmp/audioset_unbalanced_train.csv', on_bad_lines='skip')
+ captions_4 = df[' caption'].tolist()
+ 
+ captions = captions + captions_2 + captions_3 + captions_4
+ print(len(captions))
+ 
+ with open("audio_text.json", "w") as outfile:
+     # write the captions as a JSON array to the file
+     json.dump(captions, outfile)
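The leading spaces in column names such as " caption" come from the source CSV headers. As an alternative, a hypothetical helper like the one below reads a caption column while tolerating whitespace around header names (file paths and column names are assumptions):

    import pandas as pd

    def load_captions(csv_path: str, column: str = "caption") -> list:
        """Read one caption column from a CSV, tolerating spaces around header names."""
        df = pd.read_csv(csv_path, on_bad_lines="skip", skipinitialspace=True)
        df.columns = [c.strip() for c in df.columns]  # " caption" -> "caption"
        return df[column].dropna().astype(str).tolist()

    # Hypothetical usage with placeholder paths:
    # captions = load_captions("audioset_eval.csv") + load_captions("Epidemic_all_debiased.csv", column="caption2")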
text_to_audio_embedding.py ADDED
@@ -0,0 +1,80 @@
+ import gc
+ import json
+ import librosa
+ import laion_clap
+ import torch
+ import numpy as np
+ import time
+ from itertools import islice
+ from safetensors import safe_open
+ from safetensors.numpy import save_file
+ 
+ 
+ def read_default_prompt():
+     with open('/root/autodl-tmp/dedup_audio_text_80.json', 'r') as f:
+         data = json.load(f)
+     return data
+ 
+ 
+ def init_audio_pipe():
+     # quantization helpers
+     def int16_to_float32(x):
+         return (x / 32767.0).astype(np.float32)
+ 
+     def float32_to_int16(x):
+         x = np.clip(x, a_min=-1., a_max=1.)
+         return (x * 32767.).astype(np.int16)
+ 
+     model = laion_clap.CLAP_Module(enable_fusion=False)
+     model.load_ckpt()  # download the default pretrained checkpoint
+ 
+     # Get audio embeddings from audio data
+     audio_data, _ = librosa.load('/root/autodl-tmp/下载.wav', sr=48000)  # sample rate should be 48000
+     audio_data = audio_data.reshape(1, -1)  # make it (1, T) or (N, T)
+     audio_data = torch.from_numpy(
+         int16_to_float32(float32_to_int16(audio_data))).float()  # quantize before sending it to the model
+     audio_embed = model.get_audio_embedding_from_data(x=audio_data, use_tensor=True)
+     # print(audio_embed[:, -20:])
+     print(audio_embed)
+     print(audio_embed.shape)
+ 
+     # Get text embeddings from the captions as numpy arrays:
+     start_time = time.time()
+ 
+     # iterate over the text data in batches and save the embeddings with safetensors
+     text_data = read_default_prompt()
+     batch_size = 256
+     num_batches = int(np.ceil(len(text_data) / batch_size))
+ 
+     text_embed = []
+     for i in range(num_batches):
+         # Get the next batch of text data
+         batch_data = list(islice(text_data, i * batch_size, (i + 1) * batch_size))
+ 
+         # Embed the batch of text data
+         batch_embed = model.get_text_embedding(batch_data, use_tensor=False)
+ 
+         # Append the batch embeddings to the list
+         text_embed.append(batch_embed)
+ 
+     # Concatenate the embeddings from all batches into a single array
+     text_embed = np.concatenate(text_embed)
+ 
+     # Save the embeddings to a file
+     print(text_embed)
+     print(text_embed.shape)
+ 
+     tensors = {
+         "text_embed": text_embed,
+     }
+     save_file(tensors, "/root/autodl-tmp/audio_text_embeddings.safetensors")
+     # end_time = time.time()
+     # print(end_time - start_time)
+     #
+     # result_tensor = torch.matmul(audio_embed, text_embed.transpose(0, 1))
+     # similarity_scores = torch.softmax(result_tensor, dim=1)
+     # print(similarity_scores)
+ 
+ 
+ if __name__ == "__main__":
+     init_audio_pipe()
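After text_to_audio_embedding.py has run, the saved tensor can be sanity-checked by loading it back and comparing its first dimension against the caption list; a brief sketch, with placeholder paths and assuming the 512-dimensional CLAP text embedding:

    import json
    from safetensors.numpy import load_file

    # Placeholder paths matching the script above.
    with open("dedup_audio_text_80.json") as f:
        captions = json.load(f)
    text_embed = load_file("audio_text_embeddings.safetensors")["text_embed"]

    print(text_embed.shape)  # expected: (len(captions), 512) for CLAP text embeddings
    assert text_embed.shape[0] == len(captions), "one embedding row per caption"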
text_to_img.py ADDED
@@ -0,0 +1,46 @@
+ import torch
+ from diffusers import UnCLIPScheduler, DDPMScheduler, StableUnCLIPPipeline
+ from diffusers.models import PriorTransformer
+ from transformers import CLIPTokenizer, CLIPTextModelWithProjection
+ 
+ 
+ def init_text2img_pipe():
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     data_type = torch.float16 if torch.cuda.is_available() else torch.float32
+ 
+     prior_model_id = "kakaobrain/karlo-v1-alpha"
+     prior = PriorTransformer.from_pretrained(prior_model_id, subfolder="prior", torch_dtype=data_type)
+ 
+     prior_text_model_id = "openai/clip-vit-large-patch14"
+     prior_tokenizer = CLIPTokenizer.from_pretrained(prior_text_model_id)
+     prior_text_model = CLIPTextModelWithProjection.from_pretrained(prior_text_model_id, torch_dtype=data_type)
+     prior_scheduler = UnCLIPScheduler.from_pretrained(prior_model_id, subfolder="prior_scheduler")
+     prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
+ 
+     stable_unclip_model_id = "stabilityai/stable-diffusion-2-1-unclip-small"
+ 
+     pipe = StableUnCLIPPipeline.from_pretrained(
+         stable_unclip_model_id,
+         torch_dtype=data_type,
+         variant="fp16",
+         prior_tokenizer=prior_tokenizer,
+         prior_text_encoder=prior_text_model,
+         prior=prior,
+         prior_scheduler=prior_scheduler,
+     )
+     return pipe.to(device)
+ 
+ 
+ def predict(prompt: str, negative_prompt: str, pipeline):
+     return pipeline(prompt=prompt,
+                     negative_prompt=negative_prompt,
+                     height=600,
+                     width=400,
+                     num_inference_steps=60).images
+ 
+ 
+ if __name__ == "__main__":
+     text2img_pipeline = init_text2img_pipe()
+     images = predict("a dog", "a cat", text2img_pipeline)
+     for idx, image in enumerate(images):
+         image.save(f"/root/autodl-tmp/image_{idx}.png")
utils.py ADDED
@@ -0,0 +1,16 @@
+ import requests
+ 
+ headers = {
+     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
+ }
+ 
+ 
+ def read_from_url(file_path):
+     if file_path.startswith("http"):
+         tmp_path = f'/tmp/clamd_{file_path.split("/")[-1].split("?")[0]}'
+         print("tmp_path ", tmp_path)
+         resp = requests.get(file_path, headers=headers).content
+         with open(tmp_path, "wb") as f:
+             f.write(resp)
+         return tmp_path
+     return file_path
warm_up.py ADDED
@@ -0,0 +1,20 @@
+ from audio_to_text import AudioPipeline
+ from text_to_img import init_text2img_pipe, predict
+ 
+ if __name__ == "__main__":
+     negative_prompt = [
+         "(watermark:2)", "signature", "username", "(text:2)", "website",
+         "(worst quality:2)", "(low quality:2)", "(normal quality:2)", "polar lowres", "jpeg",
+         "((monochrome))", "((grayscale))", "sketches", "Paintings",
+         "(blurry:2)", "cropped", "lowres", "error", "sketches",
+         "(duplicate:1.331)", "(morbid:1.21)", "(mutilated:1.21)", "(tranny:1.331)",
+         "(bad proportions:1.331)",
+     ]
+     pipeline = AudioPipeline(audio_text_path='/home/user/app/dedup_audio_text_80.json',
+                              audio_text_embeddings_path='/home/user/app/audio_text_embeddings.safetensors')
+     text = pipeline.audio2txt('/home/user/app/demo.wav')
+ 
+     text2img_pipeline = init_text2img_pipe()
+     images = predict(text, " ".join(negative_prompt), text2img_pipeline)
+     for idx, image in enumerate(images):
+         image.save(f"/root/autodl-tmp/image_{idx}.png")