Woleek committed on
Commit
f5f5100
1 Parent(s): c4e7950

Audio postprocessing

Browse files
Files changed (2) hide show
  1. app.py +21 -5
  2. requirements.txt +2 -1
app.py CHANGED
@@ -2,6 +2,8 @@ import torch
2
  import gradio as gr
3
  from transformers import ViTImageProcessor, ViTModel
4
  from audiodiffusion import AudioDiffusionPipeline, ImageEncoder
 
 
5
 
6
  device = "cuda" if torch.cuda.is_available() else "cpu"
7
  generator1 = torch.Generator(device)
@@ -13,6 +15,16 @@ processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k
13
  extractor = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
14
  image_encoder = ImageEncoder(processor, extractor)
15
 
 
 
 
 
 
 
 
 
 
 
16
  def _encode_image(image):
17
  return torch.unsqueeze(image_encoder.encode(image), axis=1).to(device)
18
 
@@ -28,9 +40,13 @@ def _generate_spectrogram(condition, steps, eta):
28
  )
29
  return images[0], (sample_rate, audios[0])
30
 
 
 
 
31
  def run_generation(image, steps, eta):
32
  condition = _encode_image(image)
33
  spectrogram, (sr, audio) = _generate_spectrogram(condition, steps, eta)
 
34
  return spectrogram, (sr, audio)
35
 
36
  with gr.Blocks(title="Image-based soundtrack generation") as demo:
@@ -44,21 +60,21 @@ with gr.Blocks(title="Image-based soundtrack generation") as demo:
44
  label="Conditioning image"
45
  )
46
  steps = gr.Slider(
47
- minimum=1,
48
  maximum=1000,
49
- step=1,
50
  value=50,
51
  label="Denoising steps"
52
  )
53
  eta = gr.Slider(
54
- minimum=0.1,
55
  maximum=1.0,
56
  step=0.1,
57
- value=0.9,
58
  label="η"
59
  )
60
  gr.Markdown('''
61
- Eta (η) is a variable that controls the level of interpolation between a deterministic DDIM (η=0.0) and a stochastic DDPM (η=1.0).
62
  ''')
63
  btn = gr.Button("Generate")
64
  clear = gr.ClearButton(image)
 
2
  import gradio as gr
3
  from transformers import ViTImageProcessor, ViTModel
4
  from audiodiffusion import AudioDiffusionPipeline, ImageEncoder
5
+ from pedalboard.io import AudioFile
6
+ from pedalboard import Pedalboard, NoiseGate, Compressor, LowShelfFilter, Gain, HighShelfFilter, Reverb
7
 
8
  device = "cuda" if torch.cuda.is_available() else "cpu"
9
  generator1 = torch.Generator(device)
 
15
  extractor = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
16
  image_encoder = ImageEncoder(processor, extractor)
17
 
18
+ board = Pedalboard([
19
+ NoiseGate(threshold_db=-60, ratio=10.0),
20
+ Compressor(threshold_db=60, ratio=1.0),
21
+ LowShelfFilter(cutoff_frequency_hz=220, gain_db=-10),
22
+ HighShelfFilter(cutoff_frequency_hz=1200, gain_db=-10),
23
+ Gain(gain_db=40),
24
+ Reverb(room_size=0.5),
25
+
26
+ ])
27
+
28
  def _encode_image(image):
29
  return torch.unsqueeze(image_encoder.encode(image), axis=1).to(device)
30
 
 
40
  )
41
  return images[0], (sample_rate, audios[0])
42
 
43
+ def _denoise_audio(audio, sr):
44
+ return board(audio, sr)
45
+
46
  def run_generation(image, steps, eta):
47
  condition = _encode_image(image)
48
  spectrogram, (sr, audio) = _generate_spectrogram(condition, steps, eta)
49
+ audio = _denoise_audio(audio, sr)
50
  return spectrogram, (sr, audio)
51
 
52
  with gr.Blocks(title="Image-based soundtrack generation") as demo:
 
60
  label="Conditioning image"
61
  )
62
  steps = gr.Slider(
63
+ minimum=10,
64
  maximum=1000,
65
+ step=10,
66
  value=50,
67
  label="Denoising steps"
68
  )
69
  eta = gr.Slider(
70
+ minimum=0.0,
71
  maximum=1.0,
72
  step=0.1,
73
+ value=0.6,
74
  label="η"
75
  )
76
  gr.Markdown('''
77
+ Eta (η) is a variable that controls the level of interpolation between deterministic (η=0.0) and stochastic (η=1.0) denoising schedule.
78
  ''')
79
  btn = gr.Button("Generate")
80
  clear = gr.ClearButton(image)
requirements.txt CHANGED
@@ -4,4 +4,5 @@ transformers==4.35.2
4
  numpy==1.23.5
5
  Pillow==9.3.0
6
  diffusers==0.23.1
7
- librosa==0.10.1
 
 
4
  numpy==1.23.5
5
  Pillow==9.3.0
6
  diffusers==0.23.1
7
+ librosa==0.10.1
8
+ pedalboard==0.8.6