Commit 171f55b
pengdaqian committed
Parent(s): f60bff4

add more
Files changed:
- Dockerfile +28 -0
- README.md +10 -0
- audio_to_text.py +75 -0
- data.py +58 -0
- dedup_audio_text_80.json +3 -0
- dedup_audio_text_80.safetensors +3 -0
- demo.wav +0 -0
- main.py +105 -0
- model.py +36 -0
- phone.jpg +0 -0
- pipeline.py +36 -0
- prompt.json +3 -0
- requirements.txt +15 -0
- test.py +7 -0
- text_data.py +20 -0
- text_to_audio_embedding.py +80 -0
- text_to_img.py +46 -0
- utils.py +16 -0
- warm_up.py +20 -0
Dockerfile
ADDED
@@ -0,0 +1,28 @@
+FROM python:3.8
+
+RUN useradd -m -u 1000 user
+
+USER user
+
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+COPY requirements.txt requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+RUN mkdir -p $HOME/app/cache/transformers
+RUN chmod 777 $HOME/app/cache/transformers
+
+ENV TRANSFORMERS_CACHE $HOME/app/cache/transformers
+ENV HF_HOME $HOME/app/cache/transformers
+
+WORKDIR $HOME/app
+
+COPY --chown=user . $HOME/app
+
+# cache model
+RUN python3 warm_up.py
+
+EXPOSE 7860
+
+CMD ["sh", "-c", "python3 main.py"]
README.md
ADDED
@@ -0,0 +1,10 @@
+---
+title: Audio Img
+emoji: 🏃
+colorFrom: purple
+colorTo: pink
+sdk: docker
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
audio_to_text.py
ADDED
@@ -0,0 +1,75 @@
+import random
+
+import librosa
+import laion_clap
+import torch
+import numpy as np
+from safetensors import safe_open
+from transformers import pipeline
+
+
+class AudioPipeline(object):
+    def __init__(self, audio_text_path, audio_text_embeddings_path):
+        self.model = laion_clap.CLAP_Module(enable_fusion=False)
+        self.model.load_ckpt()  # download the default pretrained checkpoint
+        self.audio_text_path = audio_text_path
+        self.audio_text_embeddings_path = audio_text_embeddings_path
+        self.gpt2_pipe = pipeline('text-generation', model='Gustavosta/MagicPrompt-Stable-Diffusion', tokenizer='gpt2')
+
+    def audio_embedding(self, file_path):
+        # quantization helpers
+        def int16_to_float32(x):
+            return (x / 32767.0).astype(np.float32)
+
+        def float32_to_int16(x):
+            x = np.clip(x, a_min=-1., a_max=1.)
+            return (x * 32767.).astype(np.int16)
+
+        # Get audio embeddings from audio data
+        audio_data, _ = librosa.load(file_path, sr=48000)  # sample rate must be 48000 for CLAP
+        audio_data = audio_data.reshape(1, -1)  # make it (1, T) or (N, T)
+        audio_data = torch.from_numpy(
+            int16_to_float32(float32_to_int16(audio_data))).float()  # quantize before sending it to the model
+        audio_embed = self.model.get_audio_embedding_from_data(x=audio_data, use_tensor=True)
+        return audio_embed
+
+    def load_candidate_text(self):
+        import json
+        with open(self.audio_text_path, 'r') as f:
+            texts = json.load(f)
+
+        tensors = {}
+        with safe_open(self.audio_text_embeddings_path, framework="pt", device=0) as f:
+            for k in f.keys():
+                tensors[k] = f.get_tensor(k)
+        text_embed = tensors["text_embed"]
+        return texts, text_embed
+
+    def audio2txt(self, filepath):
+        audio_embed = self.audio_embedding(filepath)
+        texts, text_embed = self.load_candidate_text()
+        # Score the audio embedding against every candidate caption embedding
+        result_tensor = torch.matmul(audio_embed, text_embed.transpose(0, 1))
+        similarity_scores = torch.softmax(result_tensor, dim=1)
+        topk_scores, topk_indices = torch.topk(similarity_scores, k=10, dim=1)
+        print("Top 10 similarity scores:", topk_scores)
+        print("Top 10 sentence indices:", topk_indices)
+
+        topK_sentences = [texts[idx].replace("The sounds of", "") for idx in topk_indices[0].tolist()]
+        starting_text = topK_sentences[0]
+        response = self.gpt2_pipe(starting_text, max_length=(len(starting_text) + random.randint(60, 90)),
+                                  num_return_sequences=4)
+        response_list = []
+        for x in response:
+            resp = x['generated_text'].strip()
+            if resp != starting_text and len(resp) > (len(starting_text) + 4) and resp.endswith(
+                    (":", "-", "—")) is False:
+                response_list.append(resp)
+        return response_list[0] if response_list else starting_text  # fall back to the raw caption if GPT-2 gave nothing usable
+
+
+if __name__ == "__main__":
+    pipeline = AudioPipeline(audio_text_path='/root/autodl-tmp/dedup_audio_text_80.json',
+                             audio_text_embeddings_path='/root/autodl-tmp/audio_text_embeddings.safetensors')
+    texts = pipeline.audio2txt('/root/autodl-tmp/下载.wav')  # '下载' is Chinese for 'download'
+    print(texts)
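
The retrieval step in audio2txt is a plain dot-product search: CLAP maps audio and text into a shared embedding space, so the best captions for a clip are the rows of text_embed with the largest inner products. A minimal sketch of just that step, with random tensors standing in for real CLAP embeddings (shapes assumed for illustration):

import torch

# stand-ins for real CLAP outputs: 1 audio embedding, 1000 candidate caption embeddings
audio_embed = torch.randn(1, 512)
text_embed = torch.randn(1000, 512)

# same math as audio2txt: similarity logits, softmax, top-k candidate indices
logits = torch.matmul(audio_embed, text_embed.transpose(0, 1))  # shape (1, 1000)
scores = torch.softmax(logits, dim=1)
topk_scores, topk_indices = torch.topk(scores, k=10, dim=1)
print(topk_indices[0].tolist())  # indices into the caption list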
data.py
ADDED
@@ -0,0 +1,58 @@
+from sentence_transformers import util
+import json
+from safetensors.numpy import load_file
+import numpy as np
+import torch
+from safetensors.torch import save_file
+
+
+def dedup_similar_sentences(sentences, threshold=0.9, batch_size=64):
+    file = load_file("/root/autodl-tmp/dedup_audio_text.safetensors")
+    sentence_embeddings = torch.tensor(file['text_embed']).cuda()
+
+    batch_idx = 0
+    while batch_idx * batch_size < len(sentences):
+        start_idx = batch_idx * batch_size
+        end_idx = min((batch_idx + 1) * batch_size, len(sentences))
+
+        batch_embeddings = sentence_embeddings[start_idx:end_idx]
+        cosine_scores = torch.matmul(batch_embeddings, sentence_embeddings.T) / (
+                torch.norm(batch_embeddings, dim=1)[:, None] *
+                torch.norm(sentence_embeddings, dim=1))
+
+        duplicate_indices = torch.where((cosine_scores > threshold) & (cosine_scores < 1.0))
+        duplicate_indices_list = duplicate_indices[1].tolist()
+        remove_idx = set(duplicate_indices_list)
+
+        # Update sentences and sentence_embeddings to remove duplicates
+        keep_indices = [idx for idx in range(len(sentences)) if idx not in remove_idx]
+        sentences = [sentences[idx] for idx in keep_indices]
+        sentence_embeddings = sentence_embeddings[keep_indices]
+
+        print(len(sentences))
+        print(sentence_embeddings.shape)
+        # Update batch_idx accordingly
+        batch_idx = start_idx // batch_size + 1
+
+    uq_sentences = sentences
+
+    return uq_sentences, sentence_embeddings
+
+
+def read_default_prompt():
+    import json
+    with open('/root/autodl-tmp/dedup_audio_text.json', 'r') as f:
+        data = json.load(f)
+    return data
+
+
+if __name__ == '__main__':
+    all_texts = read_default_prompt()
+    unique_sentences, unique_embeddings = dedup_similar_sentences(all_texts, threshold=0.8)
+    with open("/root/autodl-tmp/dedup_audio_text_80.json", "w") as outfile:
+        json.dump(unique_sentences, outfile)
+
+    tensors = {
+        "text_embed": unique_embeddings,
+    }
+    save_file(tensors, "/root/autodl-tmp/dedup_audio_text_80.safetensors")
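
dedup_similar_sentences flags any sentence whose cosine similarity to another exceeds the threshold and keeps the rest; the < 1.0 comparison is how it skips self-matches, which assumes the diagonal comes out exactly 1.0. A toy check of the scoring step with hand-made embeddings, masking the diagonal explicitly instead (slightly more robust to float rounding):

import torch

# three toy embeddings: rows 0 and 1 point in nearly the same direction, row 2 is orthogonal
emb = torch.tensor([[1.0, 0.0], [0.99, 0.14], [0.0, 1.0]])

# cosine similarity matrix, same formula as dedup_similar_sentences
scores = torch.matmul(emb, emb.T) / (torch.norm(emb, dim=1)[:, None] * torch.norm(emb, dim=1))
scores.fill_diagonal_(0.0)  # exclude self-matches explicitly rather than testing < 1.0

dup_rows, dup_cols = torch.where(scores > 0.8)
print(dup_cols.tolist())  # [1, 0]: rows 0 and 1 flag each other as duplicates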
dedup_audio_text_80.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba048799f695abb7ff2e23a21e41f5f921ff3420b6973fdfa21a1323b48f6b74
+size 4982806
dedup_audio_text_80.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40324130f06f88f852dc189bd8634dd3c03d607e028c0d424f9fbcb5d288b619
+size 173641816
demo.wav
ADDED
Binary file (261 kB).
main.py
ADDED
@@ -0,0 +1,105 @@
+import io
+import os
+from typing import Union, List
+
+import requests
+import uvicorn
+from fastapi import BackgroundTasks, FastAPI
+
+import model
+from audio_to_text import AudioPipeline
+from text_to_img import init_text2img_pipe, predict
+from utils import read_from_url
+from minio import Minio
+import uuid
+
+app = FastAPI()
+
+
+def write_scan_audio_result(audio_id: int, scans: List[int], url: str, callback: str):
+    score_general_threshold = 0.35
+    score_character_threshold = 0.85
+
+    image_url = audio_file_to_image_url(url)
+
+    if image_url is None:
+        image_url = ""
+
+    print(image_url)
+
+    callBackReq = model.AudioScanCallbackRequest(id=audio_id, isValid=True, image_url=image_url)
+
+    try:
+        requests.post(callback, json=callBackReq.dict())
+    except Exception as ex:
+        print(ex)
+
+    # tags = list(map(lambda x: model.AudioScanTag(type="Moderation",
+    #                                              confidence=x['confidence']), nsfw_tags))
+
+    ret = model.AudioScanResponse(ok=True, error="", deleted=False, blockedFor=[], tags=None)
+    return ret
+
+
+def write_scan_model_result(model_name: str, callback: str):
+    pass
+
+
+@app.post("/audio-scan")
+async def image_scan_handler(req: model.AudioScanRequest, background_tasks: BackgroundTasks):
+    if not req.wait:
+        background_tasks.add_task(write_scan_audio_result,
+                                  audio_id=req.audioId,
+                                  scans=req.scans,
+                                  url=req.url, callback=req.callbackUrl)
+        return model.AudioScanResponse(ok=True, error="", deleted=False, blockedFor=[], tags=[])
+    else:
+        ret = write_scan_audio_result(audio_id=req.audioId,
+                                      scans=req.scans,
+                                      url=req.url, callback=req.callbackUrl)
+        return ret
+
+
+def audio_file_to_image_url(audiofile):
+    file_path = read_from_url(audiofile)
+    text = auto_pipeline.audio2txt(file_path)
+    negative_prompt = [
+        "(watermark:2)", "signature", "username", "(text:2)", "website",
+        "(worst quality:2)", "(low quality:2)", "(normal quality:2)", "polar lowres", "jpeg",
+        "((monochrome))", "((grayscale))", "sketches", "Paintings",
+        "(blurry:2)", "cropped", "lowres", "error", "sketches",
+        "(duplicate:1.331)", "(morbid:1.21)", "(mutilated:1.21)", "(tranny:1.331)",
+        "(bad proportions:1.331)",
+    ]
+    images = predict(text, " ".join(negative_prompt), text2img_pipeline)
+    for image in images:
+        in_mem_file = io.BytesIO()
+        image.save(in_mem_file, format='png', pnginfo=None)
+        in_mem_file.seek(0)
+
+        object_name = str(uuid.uuid4())  # Minio expects the object name as a string
+
+        s3_client.put_object(
+            bucket_name=os.environ.get("S3_BUCKET"),
+            object_name=object_name,
+            data=in_mem_file,
+            length=in_mem_file.getbuffer().nbytes,
+            content_type="image/png"
+        )
+        image_url = f'{os.environ.get("PUB_VIEW_URL")}/{object_name}'
+        return image_url
+
+
+global auto_pipeline, text2img_pipeline, s3_client
+
+if __name__ == "__main__":
+    auto_pipeline = AudioPipeline(audio_text_path='/home/user/app/dedup_audio_text_80.json',
+                                  audio_text_embeddings_path='/home/user/app/audio_text_embeddings.safetensors')
+    text2img_pipeline = init_text2img_pipe()
+    s3_client = Minio(
+        os.environ.get("S3_ENDPOINT"),
+        access_key=os.environ.get("S3_ACCESS_KEY"),
+        secret_key=os.environ.get("S3_SECRET_KEY"),
+    )
+
+    uvicorn.run(app, host="0.0.0.0", port=7860)
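
With the server up, /audio-scan takes a JSON body matching model.AudioScanRequest and either answers inline (wait=true) or queues the scan as a background task. A hypothetical client call (the host, IDs, and URLs below are made up; only the field names come from model.py):

import requests

payload = {
    "audioId": 123,                                # hypothetical ID
    "url": "https://example.com/demo.wav",         # hypothetical audio URL
    "wait": True,                                  # False returns immediately and scans in the background
    "scans": [],
    "callbackUrl": "https://example.com/callback", # hypothetical callback receiver
}
resp = requests.post("http://localhost:7860/audio-scan", json=payload)
print(resp.json())  # AudioScanResponse: ok, error, deleted, blockedFor, tags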
model.py
ADDED
@@ -0,0 +1,36 @@
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+
+class AudioScanRequest(BaseModel):
+    audioId: int
+    url: str
+    wait: bool
+    scans: List[int]
+    callbackUrl: str
+
+
+class AudioScanTag(BaseModel):
+    type: str
+    name: str
+
+
+class AudioScanResponse(BaseModel):
+    ok: bool
+    error: str
+    deleted: bool
+    blockedFor: List[str]
+    tags: Optional[List[AudioScanTag]]  # main.py passes tags=None, so this field must be optional
+
+
+class AudioTag(BaseModel):
+    tag: str
+    id: Optional[int]
+    confidence: int
+
+
+class AudioScanCallbackRequest(BaseModel):
+    id: int
+    isValid: bool
+    image_url: str
phone.jpg
ADDED
pipeline.py
ADDED
@@ -0,0 +1,36 @@
+import torch
+
+import matplotlib.pyplot as plt
+from datasets import load_dataset
+from diffusers import DiffusionPipeline
+from transformers import (
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+audio_sample = ds[3]
+
+text = audio_sample["text"].lower()
+speech_data = audio_sample["audio"]["array"]
+
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
+processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+
+diffuser_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="audio_to_image_pipeline.py",
+    speech_model=model,
+    speech_processor=processor,
+
+    torch_dtype=torch.float16,
+)
+
+diffuser_pipeline.enable_attention_slicing()
+diffuser_pipeline = diffuser_pipeline.to(device)
+
+output = diffuser_pipeline(speech_data)
+plt.imshow(output.images[0])
prompt.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf7cb6869d3ea66d777f225002e9a7df1de92162bdae2832ba03dca3865c79c2
+size 117
requirements.txt
ADDED
@@ -0,0 +1,15 @@
+uvicorn[standard]
+fastapi
+huggingface_hub
+Pillow
+onnxruntime
+numpy
+diffusers
+transformers
+torch
+requests
+librosa
+# imported at runtime (audio_to_text.py, main.py) but previously missing, which broke the Docker build
+laion-clap
+safetensors
+minio
test.py
ADDED
@@ -0,0 +1,7 @@
+from optimum.onnxruntime import ORTStableDiffusionPipeline
+
+model_id = "eatdianatoday/yiwu"
+pipe = ORTStableDiffusionPipeline.from_pretrained(model_id, export=True)
+prompt = "a photo of an astronaut riding a horse on mars"
+images = pipe(prompt).images[0]
+pipe.save_pretrained("/root/autodl-tmp/onnx-novelai-diffusion")
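
Once exported, the ONNX pipeline can be reloaded straight from the output directory without export=True; a short sketch using the same optimum API:

from optimum.onnxruntime import ORTStableDiffusionPipeline

# reload the pipeline saved by test.py
pipe = ORTStableDiffusionPipeline.from_pretrained("/root/autodl-tmp/onnx-novelai-diffusion")
image = pipe("a photo of an astronaut riding a horse on mars").images[0]
image.save("astronaut.png")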
text_data.py
ADDED
@@ -0,0 +1,20 @@
+import pandas as pd
+import json
+
+# read the csv file into a pandas DataFrame
+df = pd.read_csv('/root/autodl-tmp/audioset_balanced_train.csv')
+
+captions = df[" caption"].tolist()
+df = pd.read_csv('/root/autodl-tmp/Epidemic_all_debiased.csv', on_bad_lines='skip')
+captions_2 = df[' caption2'].tolist()
+df = pd.read_csv('/root/autodl-tmp/audioset_eval.csv', on_bad_lines='skip')
+captions_3 = df['caption'].tolist()
+df = pd.read_csv('/root/autodl-tmp/audioset_unbalanced_train.csv', on_bad_lines='skip')
+captions_4 = df[' caption'].tolist()
+
+captions = captions + captions_2 + captions_3 + captions_4
+print(len(captions))
+
+with open("audio_text.json", "w") as outfile:
+    # write the data as a JSON string to the file
+    json.dump(captions, outfile)
text_to_audio_embedding.py
ADDED
@@ -0,0 +1,80 @@
+import gc
+import json
+import librosa
+import laion_clap
+import torch
+import numpy as np
+import time
+from itertools import islice
+from safetensors import safe_open
+from safetensors.numpy import save_file
+
+def read_default_prompt():
+    import json
+    with open('/root/autodl-tmp/dedup_audio_text_80.json', 'r') as f:
+        data = json.load(f)
+    return data
+
+
+def init_audio_pipe():
+    # quantization helpers
+    def int16_to_float32(x):
+        return (x / 32767.0).astype(np.float32)
+
+    def float32_to_int16(x):
+        x = np.clip(x, a_min=-1., a_max=1.)
+        return (x * 32767.).astype(np.int16)
+
+    model = laion_clap.CLAP_Module(enable_fusion=False)
+    model.load_ckpt()  # download the default pretrained checkpoint
+
+    # Get audio embeddings from audio data
+    audio_data, _ = librosa.load('/root/autodl-tmp/下载.wav', sr=48000)  # sample rate must be 48000
+    audio_data = audio_data.reshape(1, -1)  # make it (1, T) or (N, T)
+    audio_data = torch.from_numpy(
+        int16_to_float32(float32_to_int16(audio_data))).float()  # quantize before sending it to the model
+    audio_embed = model.get_audio_embedding_from_data(x=audio_data, use_tensor=True)
+    # print(audio_embed[:, -20:])
+    print(audio_embed)
+    print(audio_embed.shape)
+
+    # Get text embeddings from the candidate texts (as numpy arrays, use_tensor=False below)
+    start_time = time.time()
+
+    # iterate over text_data in batches and save the embeddings to audio_text_embeddings.safetensors
+    text_data = read_default_prompt()
+    batch_size = 256
+    num_batches = int(np.ceil(len(text_data) / batch_size))
+
+    text_embed = []
+    for i in range(num_batches):
+        # Get the next batch of text data
+        batch_data = list(islice(text_data, i * batch_size, (i + 1) * batch_size))
+
+        # Embed the batch of text data
+        batch_embed = model.get_text_embedding(batch_data, use_tensor=False)
+
+        # Append the batch embeddings to the list
+        text_embed.append(batch_embed)
+
+    # Concatenate the embeddings from all batches into a single array
+    text_embed = np.concatenate(text_embed)
+
+    # Save the embeddings to a file
+    print(text_embed)
+    print(text_embed.shape)
+
+    tensors = {
+        "text_embed": text_embed,
+    }
+    save_file(tensors, "/root/autodl-tmp/audio_text_embeddings.safetensors")
+    # end_time = time.time()
+    # print(end_time - start_time)
+    #
+    # result_tensor = torch.matmul(audio_embed, text_embed.transpose(0, 1))
+    # similarity_scores = torch.softmax(result_tensor, dim=1)
+    # print(similarity_scores)
+
+
+if __name__ == "__main__":
+    init_audio_pipe()
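
The resulting file holds a single tensor, text_embed, which audio_to_text.py later reads back with safe_open(framework="pt"). A quick sanity check of the artifact, using the numpy loader from the same library:

from safetensors.numpy import load_file

# reload the embeddings written by init_audio_pipe and check their shape
tensors = load_file("/root/autodl-tmp/audio_text_embeddings.safetensors")
print(tensors["text_embed"].shape)  # (num_captions, embedding_dim)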
text_to_img.py
ADDED
@@ -0,0 +1,46 @@
+import torch
+from diffusers import UnCLIPScheduler, DDPMScheduler, StableUnCLIPPipeline
+from diffusers.models import PriorTransformer
+from transformers import CLIPTokenizer, CLIPTextModelWithProjection
+
+
+def init_text2img_pipe():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    data_type = torch.float16 if torch.cuda.is_available() else torch.float32
+
+    prior_model_id = "kakaobrain/karlo-v1-alpha"
+    prior = PriorTransformer.from_pretrained(prior_model_id, subfolder="prior", torch_dtype=data_type)
+
+    prior_text_model_id = "openai/clip-vit-large-patch14"
+    prior_tokenizer = CLIPTokenizer.from_pretrained(prior_text_model_id)
+    prior_text_model = CLIPTextModelWithProjection.from_pretrained(prior_text_model_id, torch_dtype=data_type)
+    prior_scheduler = UnCLIPScheduler.from_pretrained(prior_model_id, subfolder="prior_scheduler")
+    prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
+
+    stable_unclip_model_id = "stabilityai/stable-diffusion-2-1-unclip-small"
+
+    pipe = StableUnCLIPPipeline.from_pretrained(
+        stable_unclip_model_id,
+        torch_dtype=data_type,
+        variant="fp16",
+        prior_tokenizer=prior_tokenizer,
+        prior_text_encoder=prior_text_model,
+        prior=prior,
+        prior_scheduler=prior_scheduler,
+    )
+    return pipe.to(device)
+
+
+def predict(prompt: str, negative_prompt: str, pipeline):
+    return pipeline(prompt=prompt,
+                    negative_prompt=negative_prompt,
+                    height=600,
+                    width=400,
+                    num_inference_steps=60).images
+
+
+if __name__ == "__main__":
+    text2img_pipeline = init_text2img_pipe()
+    images = predict("a dog", "a cat", text2img_pipeline)
+    for idx, image in enumerate(images):
+        image.save(f"/root/autodl-tmp/image_{idx}.png")
utils.py
ADDED
@@ -0,0 +1,16 @@
+import requests
+
+headers = {
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
+}
+
+def read_from_url(file_path):
+    if file_path.startswith("http"):
+        tmp_path = f'/tmp/clamd_{file_path.split("/")[-1].split("?")[0]}'
+        print("tmp_path ", tmp_path)
+        resp = requests.get(file_path, headers=headers).content
+        with open(tmp_path, "wb") as f:
+            f.write(resp)
+        return tmp_path
+    return file_path
+
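
read_from_url is a pass-through for local paths and a download helper for http(s) URLs, which is how main.py fetches remote audio before scanning. For example (example.com is a placeholder):

from utils import read_from_url

# remote URL: downloaded to /tmp/clamd_<name>, local path returned
local_path = read_from_url("https://example.com/demo.wav")

# local path: returned unchanged
same_path = read_from_url("/home/user/app/demo.wav")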
warm_up.py
ADDED
@@ -0,0 +1,20 @@
+from audio_to_text import AudioPipeline
+from text_to_img import init_text2img_pipe, predict
+
+if __name__ == "__main__":
+    negative_prompt = [
+        "(watermark:2)", "signature", "username", "(text:2)", "website",
+        "(worst quality:2)", "(low quality:2)", "(normal quality:2)", "polar lowres", "jpeg",
+        "((monochrome))", "((grayscale))", "sketches", "Paintings",
+        "(blurry:2)", "cropped", "lowres", "error", "sketches",
+        "(duplicate:1.331)", "(morbid:1.21)", "(mutilated:1.21)", "(tranny:1.331)",
+        "(bad proportions:1.331)",
+    ]
+    pipeline = AudioPipeline(audio_text_path='/home/user/app/dedup_audio_text_80.json',
+                             audio_text_embeddings_path='/home/user/app/audio_text_embeddings.safetensors')
+    text = pipeline.audio2txt('/home/user/app/demo.wav')
+
+    text2img_pipeline = init_text2img_pipe()
+    images = predict(text, " ".join(negative_prompt), text2img_pipeline)
+    for idx, image in enumerate(images):
+        image.save(f"image_{idx}.png")  # save to the working directory; /root is not writable inside the container