Is it possible to set/output segments as in OpenAI's API? For example avg_logprob, temperature, compression_ratio.
#132 opened by rodosabbath
Hey everyone.
Beginner here, trying to implement a small API around this model.
My idea is to have a JSON response similar to OpenAI's, where segment fields like tokens, seek, start, etc. are available.
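From what I can tell, one segment in OpenAI's verbose_json response looks roughly like this (field names from their API reference; the values here are made up):

{
    "id": 0,
    "seek": 0,
    "start": 0.0,
    "end": 3.2,
    "text": " Hello there.",
    "tokens": [50364, 2425, 456, 13],
    "temperature": 0.0,
    "avg_logprob": -0.28,
    "compression_ratio": 1.23,
    "no_speech_prob": 0.01
}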
I've read in the forums and docs here that the pipeline object can't return this, since it's specific to the raw model.
Looking further, it apparently is possible to return it using the generate() function, as listed here:
Its implementation is rather confusing to me, though; I've tried adapting it to my code but couldn't move forward with it.
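From the docs, the generate()-based approach seems to be something like the sketch below. I'm assuming a recent transformers version here (the long-form arguments such as return_segments, logprob_threshold and compression_ratio_threshold come from the generate() docs), and it only applies to audio longer than 30 s, which is what triggers the long-form decoding path:

import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

# load the file and convert to the 16 kHz mono audio Whisper expects
waveform, sr = torchaudio.load("audio.wav")
waveform = torchaudio.functional.resample(waveform.mean(dim=0), sr, 16000)

# truncation=False / padding="longest" keeps inputs longer than 30 s intact,
# so generate() runs sequential long-form decoding instead of a single window
inputs = processor(
    waveform.numpy(),
    sampling_rate=16000,
    return_tensors="pt",
    truncation=False,
    padding="longest",
    return_attention_mask=True,
)

out = model.generate(
    inputs.input_features,
    attention_mask=inputs.attention_mask,
    return_timestamps=True,
    return_segments=True,  # per-segment dicts instead of just token ids
    temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),  # temperature fallback schedule
    compression_ratio_threshold=1.35,
    logprob_threshold=-1.0,
    no_speech_threshold=0.6,
)

# out["segments"][0] is a list of per-segment dicts; the exact keys
# ("start", "end", "tokens", ...) may vary with the transformers version
for seg in out["segments"][0]:
    print(float(seg["start"]), float(seg["end"]), processor.decode(seg["tokens"]))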
If anyone can shed some light I'd appreciate it a lot! Below is what I built so far.
import os
import json

import torch
import torchaudio
from flask import Flask, Response, request
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline

app = Flask(__name__)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# load the model and processor once at startup, not per request
model_id = "openai/whisper-large-v3"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch_dtype)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


@app.route("/transcribe", methods=["POST"])
def transcribe():
    audio = request.files["audio"]

    # save the upload so both the pipeline and torchaudio can read it from disk
    os.makedirs("tmp", exist_ok=True)
    temp = os.path.join("tmp", audio.filename)
    audio.save(temp)

    transcricao = pipe(temp, return_timestamps=True)

    waveform, sample_rate = torchaudio.load(temp)
    duration = waveform.size(1) / sample_rate
    os.remove(temp)

    # use the start of the first chunk and the last non-None end as the
    # segment boundaries (the final chunk's end can be None on truncated audio)
    start = None
    end = None
    for chunk in transcricao["chunks"]:
        if start is None:
            start = chunk["timestamp"][0]
        if chunk["timestamp"][1] is not None:
            end = chunk["timestamp"][1]

    # the pipeline doesn't expose the decoder's token ids, so re-tokenize the
    # output text to get something shaped like OpenAI's "tokens" field
    tokens = processor.tokenizer(transcricao["text"], return_tensors="pt")
    input_ids = tokens.input_ids[0]

    seg = {
        "id": 0,
        "start": start,
        "end": end,
        "tokens": input_ids.tolist(),
    }

    response = {
        "task": "transcription",
        "duration": duration,
        "text": transcricao["text"],
        "segments": [seg],
    }
    response_json = json.dumps(
        {"status": 200, "msg": "Audio transcribed", "data": response},
        ensure_ascii=False,
        sort_keys=False,
    )
    return Response(response_json, mimetype="application/json")
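In case it's useful, this is how I've been calling the endpoint while testing (the /transcribe route, host and port are just my local setup):

import requests

with open("audio.wav", "rb") as f:
    r = requests.post(
        "http://127.0.0.1:5000/transcribe",
        files={"audio": ("audio.wav", f, "audio/wav")},
    )
print(r.json()["data"]["segments"])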