Spaces:
Edmond98
/
Running on A100

Afrinetwork7 committed on
Commit
73af305
1 Parent(s): 311f9e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -22
app.py CHANGED
@@ -9,14 +9,15 @@ import logging
9
  import torch
10
  import librosa
11
  from pathlib import Path
12
- import magic # For MIME type detection
13
  from pydub import AudioSegment
 
14
  import traceback
15
  from logging.handlers import RotatingFileHandler
16
  import os
17
  import boto3
18
  from botocore.exceptions import NoCredentialsError
19
  import time
 
20
 
21
  # Import functions from other modules
22
  from asr import transcribe, ASR_LANGUAGES
@@ -61,32 +62,62 @@ class TTSRequest(BaseModel):
61
  language: str
62
  speed: float
63
 
64
- def detect_mime_type(input_bytes):
65
- mime = magic.Magic(mime=True)
66
- return mime.from_buffer(input_bytes)
67
-
68
- def extract_audio(input_bytes):
69
- mime_type = detect_mime_type(input_bytes)
70
-
71
- if mime_type.startswith('audio/'):
72
- return sf.read(io.BytesIO(input_bytes))
73
- elif mime_type.startswith('video/webm'):
74
- audio = AudioSegment.from_file(io.BytesIO(input_bytes), format="webm")
75
- audio_array = np.array(audio.get_array_of_samples())
76
- sample_rate = audio.frame_rate
77
  return audio_array, sample_rate
78
- else:
79
- raise ValueError(f"Unsupported MIME type: {mime_type}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  @app.post("/transcribe")
82
  async def transcribe_audio(request: AudioRequest):
83
  try:
84
  input_bytes = base64.b64decode(request.audio)
85
- audio_array, sample_rate = extract_audio(input_bytes)
86
-
87
- # Convert to mono if stereo
88
- if len(audio_array.shape) > 1:
89
- audio_array = audio_array.mean(axis=1)
90
 
91
  # Ensure audio_array is float32
92
  audio_array = audio_array.astype(np.float32)
@@ -206,7 +237,7 @@ async def synthesize_speech(request: TTSRequest):
206
  async def identify_language(request: AudioRequest):
207
  try:
208
  input_bytes = base64.b64decode(request.audio)
209
- audio_array, sample_rate = extract_audio(input_bytes)
210
  result = identify(audio_array)
211
  return JSONResponse(content={"language_identification": result})
212
  except Exception as e:
 
9
  import torch
10
  import librosa
11
  from pathlib import Path
 
12
  from pydub import AudioSegment
13
+ from moviepy.editor import VideoFileClip
14
  import traceback
15
  from logging.handlers import RotatingFileHandler
16
  import os
17
  import boto3
18
  from botocore.exceptions import NoCredentialsError
19
  import time
20
+ import tempfile
21
 
22
  # Import functions from other modules
23
  from asr import transcribe, ASR_LANGUAGES
 
62
  language: str
63
  speed: float
64
 
65
def _to_mono_float32(samples):
    """Return *samples* as a 1-D float32 array, averaging channels if present.

    A 2-D input is treated as (frames, channels) and averaged over axis 1;
    a 1-D input is only cast.
    """
    samples = np.asarray(samples)
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    return samples.astype(np.float32)


def _read_with_soundfile(path):
    """Decode *path* with ``sf.read`` and return (mono float32 samples, rate)."""
    samples, sample_rate = sf.read(path)
    # The downmix/cast keeps this path consistent with the other two decoders;
    # callers in this file no longer convert to mono themselves.
    return _to_mono_float32(samples), sample_rate


def _read_with_moviepy(path):
    """Decode the audio track of a video at *path* via ``VideoFileClip``.

    Raises ValueError when the clip has no audio track. The clip is always
    closed, including on failure.
    """
    video = VideoFileClip(path)
    try:
        audio = video.audio
        if audio is None:
            raise ValueError("Video file contains no audio")
        samples = _to_mono_float32(audio.to_soundarray())
        sample_rate = audio.fps
    finally:
        video.close()

    # Peak-normalise, but leave silent/empty audio untouched: dividing by a
    # zero peak would fill the array with NaNs.
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    if peak > 0:
        samples /= peak
    return samples, sample_rate


def _read_with_pydub(path):
    """Decode *path* with ``AudioSegment.from_file`` as a last resort."""
    audio = AudioSegment.from_file(path)
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)

    # Scale signed integer PCM by its full-scale value for the actual sample
    # width (2**7 for 1 byte, 2**15 for 2 bytes, ...).
    samples /= float(2 ** (8 * audio.sample_width - 1))

    # Samples arrive interleaved; fold any channel count down to mono.
    if audio.channels > 1:
        samples = samples.reshape((-1, audio.channels)).mean(axis=1)

    return samples, audio.frame_rate


def extract_audio_from_file(input_bytes):
    """Decode raw audio/video bytes into a mono float32 waveform.

    The bytes are written to a temporary file, then decoders are tried in
    order: soundfile, moviepy (video with an audio track), pydub. The first
    decoder that succeeds wins; any exception moves on to the next one.

    Args:
        input_bytes: Raw bytes of the uploaded media file.

    Returns:
        A tuple ``(audio_array, sample_rate)`` where ``audio_array`` is a
        1-D float32 numpy array.

    Raises:
        ValueError: If no decoder could read the data. The message carries
            the error of the last decoder tried (pydub).
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix='.tmp') as temp_file:
        temp_file.write(input_bytes)
        temp_file_path = temp_file.name

    try:
        last_error = None
        for decode in (_read_with_soundfile, _read_with_moviepy, _read_with_pydub):
            try:
                return decode(temp_file_path)
            except Exception as exc:  # any decoder failure means "try the next one"
                last_error = exc
        raise ValueError(f"Unsupported file format: {str(last_error)}") from last_error
    finally:
        # Clean up the temporary file
        os.unlink(temp_file_path)
115
 
116
  @app.post("/transcribe")
117
  async def transcribe_audio(request: AudioRequest):
118
  try:
119
  input_bytes = base64.b64decode(request.audio)
120
+ audio_array, sample_rate = extract_audio_from_file(input_bytes)
 
 
 
 
121
 
122
  # Ensure audio_array is float32
123
  audio_array = audio_array.astype(np.float32)
 
237
  async def identify_language(request: AudioRequest):
238
  try:
239
  input_bytes = base64.b64decode(request.audio)
240
+ audio_array, sample_rate = extract_audio_from_file(input_bytes)
241
  result = identify(audio_array)
242
  return JSONResponse(content={"language_identification": result})
243
  except Exception as e: