Commit 171f55b
pengdaqian committed
Parent(s): f60bff4

add more
Files changed:
- Dockerfile +28 -0
- README.md +10 -0
- audio_to_text.py +75 -0
- data.py +58 -0
- dedup_audio_text_80.json +3 -0
- dedup_audio_text_80.safetensors +3 -0
- demo.wav +0 -0
- main.py +105 -0
- model.py +36 -0
- phone.jpg +0 -0
- pipeline.py +36 -0
- prompt.json +3 -0
- requirements.txt +15 -0
- test.py +7 -0
- text_data.py +20 -0
- text_to_audio_embedding.py +80 -0
- text_to_img.py +46 -0
- utils.py +16 -0
- warm_up.py +20 -0
Dockerfile
ADDED
@@ -0,0 +1,28 @@
+FROM python:3.8
+
+RUN useradd -m -u 1000 user
+
+USER user
+
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+COPY requirements.txt requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+RUN mkdir -p $HOME/app/cache/transformers
+RUN chmod 777 $HOME/app/cache/transformers
+
+ENV TRANSFORMERS_CACHE $HOME/app/cache/transformers
+ENV HF_HOME $HOME/app/cache/transformers
+
+WORKDIR $HOME/app
+
+COPY --chown=user . $HOME/app
+
+# cache model
+RUN python3 warm_up.py
+
+EXPOSE 7860
+
+CMD ["sh", "-c", "python3 main.py"]
README.md
ADDED
@@ -0,0 +1,10 @@
+---
+title: Audio Img
+emoji: 🏃
+colorFrom: purple
+colorTo: pink
+sdk: docker
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
audio_to_text.py
ADDED
@@ -0,0 +1,75 @@
+import random
+
+import librosa
+import laion_clap
+import torch
+import numpy as np
+from safetensors import safe_open
+from transformers import pipeline
+
+
+class AudioPipeline(object):
+    def __init__(self, audio_text_path, audio_text_embeddings_path):
+        self.model = laion_clap.CLAP_Module(enable_fusion=False)
+        self.model.load_ckpt()  # download the default pretrained checkpoint
+        self.audio_text_path = audio_text_path
+        self.audio_text_embeddings_path = audio_text_embeddings_path
+        self.gpt2_pipe = pipeline('text-generation', model='Gustavosta/MagicPrompt-Stable-Diffusion', tokenizer='gpt2')
+
+    def audio_embedding(self, file_path):
+        # quantization helpers
+        def int16_to_float32(x):
+            return (x / 32767.0).astype(np.float32)
+
+        def float32_to_int16(x):
+            x = np.clip(x, a_min=-1., a_max=1.)
+            return (x * 32767.).astype(np.int16)
+
+        # Get audio embeddings from audio data
+        audio_data, _ = librosa.load(file_path, sr=48000)  # sample rate must be 48000 for CLAP
+        audio_data = audio_data.reshape(1, -1)  # make it (1, T) or (N, T)
+        audio_data = torch.from_numpy(
+            int16_to_float32(float32_to_int16(audio_data))).float()  # quantize before sending it to the model
+        audio_embed = self.model.get_audio_embedding_from_data(x=audio_data, use_tensor=True)
+        return audio_embed
+
+    def load_candidate_text(self):
+        import json
+        with open(self.audio_text_path, 'r') as f:
+            texts = json.load(f)
+
+        tensors = {}
+        with safe_open(self.audio_text_embeddings_path, framework="pt", device=0) as f:
+            for k in f.keys():
+                tensors[k] = f.get_tensor(k)
+        text_embed = tensors["text_embed"]
+        return texts, text_embed
+
+    def audio2txt(self, filepath):
+        audio_embed = self.audio_embedding(filepath)
+        texts, text_embed = self.load_candidate_text()
+        # Score the audio embedding against every candidate caption embedding
+        result_tensor = torch.matmul(audio_embed, text_embed.transpose(0, 1))
+        similarity_scores = torch.softmax(result_tensor, dim=1)
+        topk_scores, topk_indices = torch.topk(similarity_scores, k=10, dim=1)
+        print("Top 10 similarity scores:", topk_scores)
+        print("Top 10 sentence indices:", topk_indices)
+
+        topK_sentences = [texts[idx].replace("The sounds of", "") for idx in topk_indices[0].tolist()]
+        starting_text = topK_sentences[0]
+        response = self.gpt2_pipe(starting_text, max_length=(len(starting_text) + random.randint(60, 90)),
+                                  num_return_sequences=4)
+        response_list = []
+        for x in response:
+            resp = x['generated_text'].strip()
+            if resp != starting_text and len(resp) > (len(starting_text) + 4) and resp.endswith(
+                    (":", "-", "—")) is False:
+                response_list.append(resp)
+        return response_list[0] if response_list else starting_text  # fall back to the raw caption if GPT-2 gave nothing usable
+
+
+if __name__ == "__main__":
+    pipeline = AudioPipeline(audio_text_path='/root/autodl-tmp/dedup_audio_text_80.json',
+                             audio_text_embeddings_path='/root/autodl-tmp/audio_text_embeddings.safetensors')
+    texts = pipeline.audio2txt('/root/autodl-tmp/下载.wav')  # '下载' is Chinese for 'download'
+    print(texts)
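
The retrieval step in audio2txt is a plain dot-product search: CLAP maps audio and text into a shared embedding space, so the best captions for a clip are the rows of text_embed with the largest inner products. A minimal sketch of just that step, with random tensors standing in for real CLAP embeddings (shapes assumed for illustration):

import torch

# stand-ins for real CLAP outputs: 1 audio embedding, 1000 candidate caption embeddings
audio_embed = torch.randn(1, 512)
text_embed = torch.randn(1000, 512)

# same math as audio2txt: similarity logits, softmax, top-k candidate indices
logits = torch.matmul(audio_embed, text_embed.transpose(0, 1))  # shape (1, 1000)
scores = torch.softmax(logits, dim=1)
topk_scores, topk_indices = torch.topk(scores, k=10, dim=1)
print(topk_indices[0].tolist())  # indices into the caption list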
data.py
ADDED
@@ -0,0 +1,58 @@
+from sentence_transformers import util
+import json
+from safetensors.numpy import load_file
+import numpy as np
+import torch
+from safetensors.torch import save_file
+
+
+def dedup_similar_sentences(sentences, threshold=0.9, batch_size=64):
+    file = load_file("/root/autodl-tmp/dedup_audio_text.safetensors")
+    sentence_embeddings = torch.tensor(file['text_embed']).cuda()
+
+    batch_idx = 0
+    while batch_idx * batch_size < len(sentences):
+        start_idx = batch_idx * batch_size
+        end_idx = min((batch_idx + 1) * batch_size, len(sentences))
+
+        batch_embeddings = sentence_embeddings[start_idx:end_idx]
+        cosine_scores = torch.matmul(batch_embeddings, sentence_embeddings.T) / (
+                torch.norm(batch_embeddings, dim=1)[:, None] *
+                torch.norm(sentence_embeddings, dim=1))
+
+        duplicate_indices = torch.where((cosine_scores > threshold) & (cosine_scores < 1.0))
+        duplicate_indices_list = duplicate_indices[1].tolist()
+        remove_idx = set(duplicate_indices_list)
+
+        # Update sentences and sentence_embeddings to remove duplicates
+        keep_indices = [idx for idx in range(len(sentences)) if idx not in remove_idx]
+        sentences = [sentences[idx] for idx in keep_indices]
+        sentence_embeddings = sentence_embeddings[keep_indices]
+
+        print(len(sentences))
+        print(sentence_embeddings.shape)
+        # Update batch_idx accordingly
+        batch_idx = start_idx // batch_size + 1
+
+    uq_sentences = sentences
+
+    return uq_sentences, sentence_embeddings
+
+
+def read_default_prompt():
+    import json
+    with open('/root/autodl-tmp/dedup_audio_text.json', 'r') as f:
+        data = json.load(f)
+    return data
+
+
+if __name__ == '__main__':
+    all_texts = read_default_prompt()
+    unique_sentences, unique_embeddings = dedup_similar_sentences(all_texts, threshold=0.8)
+    with open("/root/autodl-tmp/dedup_audio_text_80.json", "w") as outfile:
+        json.dump(unique_sentences, outfile)
+
+    tensors = {
+        "text_embed": unique_embeddings,
+    }
+    save_file(tensors, "/root/autodl-tmp/dedup_audio_text_80.safetensors")
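
dedup_similar_sentences flags any sentence whose cosine similarity to another exceeds the threshold and keeps the rest; the < 1.0 comparison is how it skips self-matches, which assumes the diagonal comes out exactly 1.0. A toy check of the scoring step with hand-made embeddings, masking the diagonal explicitly instead (slightly more robust to float rounding):

import torch

# three toy embeddings: rows 0 and 1 point in nearly the same direction, row 2 is orthogonal
emb = torch.tensor([[1.0, 0.0], [0.99, 0.14], [0.0, 1.0]])

# cosine similarity matrix, same formula as dedup_similar_sentences
scores = torch.matmul(emb, emb.T) / (torch.norm(emb, dim=1)[:, None] * torch.norm(emb, dim=1))
scores.fill_diagonal_(0.0)  # exclude self-matches explicitly rather than testing < 1.0

dup_rows, dup_cols = torch.where(scores > 0.8)
print(dup_cols.tolist())  # [1, 0]: rows 0 and 1 flag each other as duplicates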
dedup_audio_text_80.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba048799f695abb7ff2e23a21e41f5f921ff3420b6973fdfa21a1323b48f6b74
+size 4982806
dedup_audio_text_80.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40324130f06f88f852dc189bd8634dd3c03d607e028c0d424f9fbcb5d288b619
+size 173641816
demo.wav
ADDED
Binary file (261 kB).
main.py
ADDED
@@ -0,0 +1,105 @@
+import io
+import os
+from typing import Union, List
+
+import requests
+import uvicorn
+from fastapi import BackgroundTasks, FastAPI
+
+import model
+from audio_to_text import AudioPipeline
+from text_to_img import init_text2img_pipe, predict
+from utils import read_from_url
+from minio import Minio
+import uuid
+
+app = FastAPI()
+
+
+def write_scan_audio_result(audio_id: int, scans: List[int], url: str, callback: str):
+    score_general_threshold = 0.35
+    score_character_threshold = 0.85
+
+    image_url = audio_file_to_image_url(url)
+
+    if image_url is None:
+        image_url = ""
+
+    print(image_url)
+
+    callBackReq = model.AudioScanCallbackRequest(id=audio_id, isValid=True, image_url=image_url)
+
+    try:
+        requests.post(callback, json=callBackReq.dict())
+    except Exception as ex:
+        print(ex)
+
+    # tags = list(map(lambda x: model.AudioScanTag(type="Moderation",
+    #                                              confidence=x['confidence']), nsfw_tags))
+
+    ret = model.AudioScanResponse(ok=True, error="", deleted=False, blockedFor=[], tags=None)
+    return ret
+
+
+def write_scan_model_result(model_name: str, callback: str):
+    pass
+
+
+@app.post("/audio-scan")
+async def image_scan_handler(req: model.AudioScanRequest, background_tasks: BackgroundTasks):
+    if not req.wait:
+        background_tasks.add_task(write_scan_audio_result,
+                                  audio_id=req.audioId,
+                                  scans=req.scans,
+                                  url=req.url, callback=req.callbackUrl)
+        return model.AudioScanResponse(ok=True, error="", deleted=False, blockedFor=[], tags=[])
+    else:
+        ret = write_scan_audio_result(audio_id=req.audioId,
+                                      scans=req.scans,
+                                      url=req.url, callback=req.callbackUrl)
+        return ret
+
+
+def audio_file_to_image_url(audiofile):
+    file_path = read_from_url(audiofile)
+    text = auto_pipeline.audio2txt(file_path)
+    negative_prompt = [
+        "(watermark:2)", "signature", "username", "(text:2)", "website",
+        "(worst quality:2)", "(low quality:2)", "(normal quality:2)", "polar lowres", "jpeg",
+        "((monochrome))", "((grayscale))", "sketches", "Paintings",
+        "(blurry:2)", "cropped", "lowres", "error", "sketches",
+        "(duplicate:1.331)", "(morbid:1.21)", "(mutilated:1.21)", "(tranny:1.331)",
+        "(bad proportions:1.331)",
+    ]
+    images = predict(text, " ".join(negative_prompt), text2img_pipeline)
+    for image in images:
+        in_mem_file = io.BytesIO()
+        image.save(in_mem_file, format='png', pnginfo=None)
+        in_mem_file.seek(0)
+
+        object_name = str(uuid.uuid4())  # Minio expects the object name as a string
+
+        s3_client.put_object(
+            bucket_name=os.environ.get("S3_BUCKET"),
+            object_name=object_name,
+            data=in_mem_file,
+            length=in_mem_file.getbuffer().nbytes,
+            content_type="image/png"
+        )
+        image_url = f'{os.environ.get("PUB_VIEW_URL")}/{object_name}'
+        return image_url
+
+
+global auto_pipeline, text2img_pipeline, s3_client
+
+if __name__ == "__main__":
+    auto_pipeline = AudioPipeline(audio_text_path='/home/user/app/dedup_audio_text_80.json',
+                                  audio_text_embeddings_path='/home/user/app/audio_text_embeddings.safetensors')
+    text2img_pipeline = init_text2img_pipe()
+    s3_client = Minio(
+        os.environ.get("S3_ENDPOINT"),
+        access_key=os.environ.get("S3_ACCESS_KEY"),
+        secret_key=os.environ.get("S3_SECRET_KEY"),
+    )
+
+    uvicorn.run(app, host="0.0.0.0", port=7860)
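
With the server up, /audio-scan takes a JSON body matching model.AudioScanRequest and either answers inline (wait=true) or queues the scan as a background task. A hypothetical client call (the host, IDs, and URLs below are made up; only the field names come from model.py):

import requests

payload = {
    "audioId": 123,                                # hypothetical ID
    "url": "https://example.com/demo.wav",         # hypothetical audio URL
    "wait": True,                                  # False returns immediately and scans in the background
    "scans": [],
    "callbackUrl": "https://example.com/callback", # hypothetical callback receiver
}
resp = requests.post("http://localhost:7860/audio-scan", json=payload)
print(resp.json())  # AudioScanResponse: ok, error, deleted, blockedFor, tags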
model.py
ADDED
@@ -0,0 +1,36 @@
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+
+class AudioScanRequest(BaseModel):
+    audioId: int
+    url: str
+    wait: bool
+    scans: List[int]
+    callbackUrl: str
+
+
+class AudioScanTag(BaseModel):
+    type: str
+    name: str
+
+
+class AudioScanResponse(BaseModel):
+    ok: bool
+    error: str
+    deleted: bool
+    blockedFor: List[str]
+    tags: Optional[List[AudioScanTag]]  # main.py passes tags=None, so this field must be optional
+
+
+class AudioTag(BaseModel):
+    tag: str
+    id: Optional[int]
+    confidence: int
+
+
+class AudioScanCallbackRequest(BaseModel):
+    id: int
+    isValid: bool
+    image_url: str
phone.jpg
ADDED
pipeline.py
ADDED
@@ -0,0 +1,36 @@
+import torch
+
+import matplotlib.pyplot as plt
+from datasets import load_dataset
+from diffusers import DiffusionPipeline
+from transformers import (
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+audio_sample = ds[3]
+
+text = audio_sample["text"].lower()
+speech_data = audio_sample["audio"]["array"]
+
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
+processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+
+diffuser_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="audio_to_image_pipeline.py",
+    speech_model=model,
+    speech_processor=processor,
+
+    torch_dtype=torch.float16,
+)
+
+diffuser_pipeline.enable_attention_slicing()
+diffuser_pipeline = diffuser_pipeline.to(device)
+
+output = diffuser_pipeline(speech_data)
+plt.imshow(output.images[0])
prompt.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf7cb6869d3ea66d777f225002e9a7df1de92162bdae2832ba03dca3865c79c2
+size 117
requirements.txt
ADDED
@@ -0,0 +1,15 @@
+uvicorn[standard]
+fastapi
+huggingface_hub
+Pillow
+onnxruntime
+numpy
+diffusers
+transformers
+torch
+requests
+librosa
+# imported at runtime (audio_to_text.py, main.py) but previously missing, which broke the Docker build
+laion-clap
+safetensors
+minio
test.py
ADDED
@@ -0,0 +1,7 @@
+from optimum.onnxruntime import ORTStableDiffusionPipeline
+
+model_id = "eatdianatoday/yiwu"
+pipe = ORTStableDiffusionPipeline.from_pretrained(model_id, export=True)
+prompt = "a photo of an astronaut riding a horse on mars"
+images = pipe(prompt).images[0]
+pipe.save_pretrained("/root/autodl-tmp/onnx-novelai-diffusion")
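
Once exported, the ONNX pipeline can be reloaded straight from the output directory without export=True; a short sketch using the same optimum API:

from optimum.onnxruntime import ORTStableDiffusionPipeline

# reload the pipeline saved by test.py
pipe = ORTStableDiffusionPipeline.from_pretrained("/root/autodl-tmp/onnx-novelai-diffusion")
image = pipe("a photo of an astronaut riding a horse on mars").images[0]
image.save("astronaut.png")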
text_data.py
ADDED
@@ -0,0 +1,20 @@
+import pandas as pd
+import json
+
+# read the csv file into a pandas DataFrame
+df = pd.read_csv('/root/autodl-tmp/audioset_balanced_train.csv')
+
+captions = df[" caption"].tolist()
+df = pd.read_csv('/root/autodl-tmp/Epidemic_all_debiased.csv', on_bad_lines='skip')
+captions_2 = df[' caption2'].tolist()
+df = pd.read_csv('/root/autodl-tmp/audioset_eval.csv', on_bad_lines='skip')
+captions_3 = df['caption'].tolist()
+df = pd.read_csv('/root/autodl-tmp/audioset_unbalanced_train.csv', on_bad_lines='skip')
+captions_4 = df[' caption'].tolist()
+
+captions = captions + captions_2 + captions_3 + captions_4
+print(len(captions))
+
+with open("audio_text.json", "w") as outfile:
+    # write the data as a JSON string to the file
+    json.dump(captions, outfile)
text_to_audio_embedding.py
ADDED
@@ -0,0 +1,80 @@
+import gc
+import json
+import librosa
+import laion_clap
+import torch
+import numpy as np
+import time
+from itertools import islice
+from safetensors import safe_open
+from safetensors.numpy import save_file
+
+def read_default_prompt():
+    import json
+    with open('/root/autodl-tmp/dedup_audio_text_80.json', 'r') as f:
+        data = json.load(f)
+    return data
+
+
+def init_audio_pipe():
+    # quantization helpers
+    def int16_to_float32(x):
+        return (x / 32767.0).astype(np.float32)
+
+    def float32_to_int16(x):
+        x = np.clip(x, a_min=-1., a_max=1.)
+        return (x * 32767.).astype(np.int16)
+
+    model = laion_clap.CLAP_Module(enable_fusion=False)
+    model.load_ckpt()  # download the default pretrained checkpoint
+
+    # Get audio embeddings from audio data
+    audio_data, _ = librosa.load('/root/autodl-tmp/下载.wav', sr=48000)  # sample rate must be 48000
+    audio_data = audio_data.reshape(1, -1)  # make it (1, T) or (N, T)
+    audio_data = torch.from_numpy(
+        int16_to_float32(float32_to_int16(audio_data))).float()  # quantize before sending it to the model
+    audio_embed = model.get_audio_embedding_from_data(x=audio_data, use_tensor=True)
+    # print(audio_embed[:, -20:])
+    print(audio_embed)
+    print(audio_embed.shape)
+
+    # Get text embeddings from the candidate texts (as numpy arrays, use_tensor=False below)
+    start_time = time.time()
+
+    # iterate over text_data in batches and save the embeddings to audio_text_embeddings.safetensors
+    text_data = read_default_prompt()
+    batch_size = 256
+    num_batches = int(np.ceil(len(text_data) / batch_size))
+
+    text_embed = []
+    for i in range(num_batches):
+        # Get the next batch of text data
+        batch_data = list(islice(text_data, i * batch_size, (i + 1) * batch_size))
+
+        # Embed the batch of text data
+        batch_embed = model.get_text_embedding(batch_data, use_tensor=False)
+
+        # Append the batch embeddings to the list
+        text_embed.append(batch_embed)
+
+    # Concatenate the embeddings from all batches into a single array
+    text_embed = np.concatenate(text_embed)
+
+    # Save the embeddings to a file
+    print(text_embed)
+    print(text_embed.shape)
+
+    tensors = {
+        "text_embed": text_embed,
+    }
+    save_file(tensors, "/root/autodl-tmp/audio_text_embeddings.safetensors")
+    # end_time = time.time()
+    # print(end_time - start_time)
+    #
+    # result_tensor = torch.matmul(audio_embed, text_embed.transpose(0, 1))
+    # similarity_scores = torch.softmax(result_tensor, dim=1)
+    # print(similarity_scores)
+
+
+if __name__ == "__main__":
+    init_audio_pipe()
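
The resulting file holds a single tensor, text_embed, which audio_to_text.py later reads back with safe_open(framework="pt"). A quick sanity check of the artifact, using the numpy loader from the same library:

from safetensors.numpy import load_file

# reload the embeddings written by init_audio_pipe and check their shape
tensors = load_file("/root/autodl-tmp/audio_text_embeddings.safetensors")
print(tensors["text_embed"].shape)  # (num_captions, embedding_dim)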
text_to_img.py
ADDED
@@ -0,0 +1,46 @@
+import torch
+from diffusers import UnCLIPScheduler, DDPMScheduler, StableUnCLIPPipeline
+from diffusers.models import PriorTransformer
+from transformers import CLIPTokenizer, CLIPTextModelWithProjection
+
+
+def init_text2img_pipe():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    data_type = torch.float16 if torch.cuda.is_available() else torch.float32
+
+    prior_model_id = "kakaobrain/karlo-v1-alpha"
+    prior = PriorTransformer.from_pretrained(prior_model_id, subfolder="prior", torch_dtype=data_type)
+
+    prior_text_model_id = "openai/clip-vit-large-patch14"
+    prior_tokenizer = CLIPTokenizer.from_pretrained(prior_text_model_id)
+    prior_text_model = CLIPTextModelWithProjection.from_pretrained(prior_text_model_id, torch_dtype=data_type)
+    prior_scheduler = UnCLIPScheduler.from_pretrained(prior_model_id, subfolder="prior_scheduler")
+    prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
+
+    stable_unclip_model_id = "stabilityai/stable-diffusion-2-1-unclip-small"
+
+    pipe = StableUnCLIPPipeline.from_pretrained(
+        stable_unclip_model_id,
+        torch_dtype=data_type,
+        variant="fp16",
+        prior_tokenizer=prior_tokenizer,
+        prior_text_encoder=prior_text_model,
+        prior=prior,
+        prior_scheduler=prior_scheduler,
+    )
+    return pipe.to(device)
+
+
+def predict(prompt: str, negative_prompt: str, pipeline):
+    return pipeline(prompt=prompt,
+                    negative_prompt=negative_prompt,
+                    height=600,
+                    width=400,
+                    num_inference_steps=60).images
+
+
+if __name__ == "__main__":
+    text2img_pipeline = init_text2img_pipe()
+    images = predict("a dog", "a cat", text2img_pipeline)
+    for idx, image in enumerate(images):
+        image.save(f"/root/autodl-tmp/image_{idx}.png")
utils.py
ADDED
@@ -0,0 +1,16 @@
+import requests
+
+headers = {
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
+}
+
+def read_from_url(file_path):
+    if file_path.startswith("http"):
+        tmp_path = f'/tmp/clamd_{file_path.split("/")[-1].split("?")[0]}'
+        print("tmp_path ", tmp_path)
+        resp = requests.get(file_path, headers=headers).content
+        with open(tmp_path, "wb") as f:
+            f.write(resp)
+        return tmp_path
+    return file_path
+
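
read_from_url is a pass-through for local paths and a download helper for http(s) URLs, which is how main.py fetches remote audio before scanning. For example (example.com is a placeholder):

from utils import read_from_url

# remote URL: downloaded to /tmp/clamd_<name>, local path returned
local_path = read_from_url("https://example.com/demo.wav")

# local path: returned unchanged
same_path = read_from_url("/home/user/app/demo.wav")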
warm_up.py
ADDED
@@ -0,0 +1,20 @@
+from audio_to_text import AudioPipeline
+from text_to_img import init_text2img_pipe, predict
+
+if __name__ == "__main__":
+    negative_prompt = [
+        "(watermark:2)", "signature", "username", "(text:2)", "website",
+        "(worst quality:2)", "(low quality:2)", "(normal quality:2)", "polar lowres", "jpeg",
+        "((monochrome))", "((grayscale))", "sketches", "Paintings",
+        "(blurry:2)", "cropped", "lowres", "error", "sketches",
+        "(duplicate:1.331)", "(morbid:1.21)", "(mutilated:1.21)", "(tranny:1.331)",
+        "(bad proportions:1.331)",
+    ]
+    pipeline = AudioPipeline(audio_text_path='/home/user/app/dedup_audio_text_80.json',
+                             audio_text_embeddings_path='/home/user/app/audio_text_embeddings.safetensors')
+    text = pipeline.audio2txt('/home/user/app/demo.wav')
+
+    text2img_pipeline = init_text2img_pipe()
+    images = predict(text, " ".join(negative_prompt), text2img_pipeline)
+    for idx, image in enumerate(images):
+        image.save(f"image_{idx}.png")  # save to the working directory; /root is not writable inside the container