omkar334 committed on
Commit
c425f6c
1 Parent(s): 2468331

text-to-speech integration

Files changed (3)
  1. agent.py +4 -6
  2. app.py +38 -33
  3. sarvam.py +7 -5
agent.py CHANGED
@@ -44,7 +44,7 @@ async def call_agent(user_prompt, grade, subject):
             "src_lang": "Identify the language that the user query is in, type: str",
             "dest_lang": """Identify the target language from the user query if the function is either "translator" or "speaker". If language is not found, return "none",
             type: Enum["hindi", "bengali", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telugu", "english", "gujarati", "none"]""",
-            "source": "Identify the sentence that the user wants to translate or speak. Retu 'none', type: Optional[str]",
+            "source": "Identify the sentence that the user wants to translate or speak. Else return 'none', type: Optional[str]",
             "response": "Your response, type: Optional[str]",
         },
         llm=llm,
@@ -69,12 +69,10 @@ async def function_caller(user_prompt, collection, client):
         user_prompt = RAG_USER_PROMPT.format(data, user_prompt)
 
         response = await llm(system_prompt, user_prompt)
-
-        return response
+        return {"text": response}
 
     elif function == "translator":
-        return await translator(result["response"], result["src_lang"], result["dest_lang"])
+        return await translator(result["source"], result["src_lang"], result["dest_lang"])
 
     elif function == "speaker":
-        return await speaker(result["response"], result["dest_lang"])
-        # return base64.b64encode(b"audio data").decode()
+        return await speaker(result["source"])
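To make the intent of the routing change concrete, here is a hedged sketch of what a parsed agent reply might look like for a speech request and how function_caller now consumes it. The example values, the `from sarvam import ...` line, the standalone `route` wrapper, and the `"function"` key name are assumptions for illustration, not taken from the repo:

```python
from sarvam import speaker, translator  # async helpers defined in sarvam.py

# Hypothetical parsed agent output for: "Speak 'Photosynthesis needs sunlight' in Hindi"
result = {
    "function": "speaker",            # assumed name of the routing key
    "src_lang": "english",
    "dest_lang": "hindi",
    "source": "Photosynthesis needs sunlight",  # the sentence to translate/speak
    "response": None,
}

async def route(result: dict):
    # Mirrors the change above: translator/speaker now receive result["source"],
    # the sentence the user asked about, rather than the agent's own "response".
    if result["function"] == "translator":
        return await translator(result["source"], result["src_lang"], result["dest_lang"])
    if result["function"] == "speaker":
        return await speaker(result["source"])
```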
 
app.py CHANGED
@@ -1,5 +1,5 @@
 import base64
-import io
+import tempfile
 
 import gradio as gr
 from fastapi import FastAPI
@@ -28,44 +28,49 @@ class ChatQuery(BaseModel):
 
 @app.post("/chat")
 async def chat(query: ChatQuery):
-    result = await function_caller(query.query, query.collection, hclient)
-
-    if isinstance(result, str):
-        return {"text": result}
-    elif isinstance(result, bytes) or (isinstance(result, str) and result.startswith("data:audio")):
-        if isinstance(result, bytes):
-            audio_b64 = base64.b64encode(result).decode()
-        else:
-            audio_b64 = result.split(",")[1]  # Remove the "data:audio/wav;base64," prefix
-        return {"audio": audio_b64}
-    else:
-        return {"error": "Unexpected result type"}
+    return await function_caller(query.query, query.collection, hclient)
 
 
-async def gradio_interface(input_text, grade, subject, chapter):
+async def gradio_interface(input_text, grade, subject, chapter, history):
     collection = f"{grade}_{subject.lower()}_{chapter}"
     response = await chat(ChatQuery(query=input_text, collection=collection))
+
     if "text" in response:
-        return response["text"], None
-    elif "audio" in response:
-        audio_data = base64.b64decode(response["audio"])
-        return "Audio response generated", (44100, io.BytesIO(audio_data))
+        output = response["text"]
+        history.append((input_text, output))
+
+    elif "audios" in response:
+        audio_data = base64.b64decode(response["audios"][0])
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as audiofile:
+            audiofile.write(audio_data)
+            audiofile.flush()
+
+        return "", history, audiofile.name
+
     else:
-        return "Unexpected response format", None
-
-
-iface = gr.Interface(
-    fn=gradio_interface,
-    inputs=[
-        gr.Textbox(lines=2, placeholder="Enter your question here..."),
-        gr.Dropdown(choices=["1", "2", "3", "4", "5", "6", "7", "9", "10", "11", "12"], label="Grade", value="9", interactive=True),
-        gr.Dropdown(choices=["Math", "Science", "History"], label="Subject", value="Science", interactive=True),
-        gr.Dropdown(choices=["1", "2", "3", "4", "5", "6", "7", "9", "10", "11", "12", "13", "14", "15", "16"], label="Chapter", value="11", interactive=True),
-    ],
-    outputs=[gr.Textbox(label="Response"), gr.Audio(label="Audio Response")],
-    title="Agentic RAG Chatbot",
-    description="Ask a question and get an answer from the chatbot. The response may be text or audio.",
-)
+        output = "Unexpected response format"
+        history.append((input_text, output))
+
+    return "", history, None
+
+
+with gr.Blocks() as iface:
+    gr.Markdown("# Agentic RAG Chatbot")
+    gr.Markdown("Ask a question and get an answer from the chatbot. The response may be text or audio.")
+
+    with gr.Row():
+        grade = gr.Dropdown(choices=["1", "2", "3", "4", "5", "6", "7", "9", "10", "11", "12"], label="Grade", value="9", interactive=True)
+        subject = gr.Dropdown(choices=["Math", "Science", "History"], label="Subject", value="Science", interactive=True)
+        chapter = gr.Dropdown(choices=["1", "2", "3", "4", "5", "6", "7", "9", "10", "11", "12", "13", "14", "15", "16"], label="Chapter", value="11", interactive=True)
+
+    chatbot = gr.Chatbot(label="Chat History")
+    msg = gr.Textbox(label="Your message", placeholder="Type your message here...")
+
+    state = gr.State([])
+    audio_output = gr.Audio(label="Audio Response", type="filepath")  # Separate audio output component
+
+    msg.submit(gradio_interface, inputs=[msg, grade, subject, chapter, state], outputs=[msg, chatbot, audio_output])
 
 app = gr.mount_gradio_app(app, iface, path="/")
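The key piece of the new Gradio flow is turning the base64 WAV returned by /chat into something gr.Audio(type="filepath") can play. Below is a minimal sketch of that decode-to-tempfile step, pulled out as a standalone helper; the helper name is hypothetical, and app.py inlines this logic directly in gradio_interface:

```python
import base64
import tempfile
from typing import Optional


def audio_response_to_path(response: dict) -> Optional[str]:
    """Decode the first base64 clip in a {"audios": [...]} payload to a temp .wav path.

    Illustrative helper only; gradio_interface does this inline. The resulting path
    is what gr.Audio(type="filepath") plays back from disk.
    """
    if "audios" not in response:
        return None
    audio_bytes = base64.b64decode(response["audios"][0])
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        f.write(audio_bytes)
        return f.name
```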
 
sarvam.py CHANGED
@@ -36,11 +36,11 @@ async def translator(text, src, dest):
         headers = {"Content-Type": "application/json", "api-subscription-key": os.getenv("SARVAM_API_KEY")}
         async with session.post(url, headers=headers, json=payload) as response:
             if response.status == 200:
-                output = await response.text()
-                return output
+                output = await response.json()
+                return {"text": output["translated_text"]}
 
 
-async def speaker(text, dest):
+async def speaker(text, dest="hindi"):
     async with aiohttp.ClientSession() as session:
         url = "https://api.sarvam.ai/text-to-speech"
 
@@ -49,7 +49,7 @@ async def speaker(text, dest):
             "target_language_code": code_map[dest],
             "speaker": "meera",
             "pitch": 0,
-            "pace": 1.65,
+            "pace": 1.25,
             "loudness": 1.5,
             "speech_sample_rate": 8000,
             "enable_preprocessing": True,
@@ -58,5 +58,7 @@ async def speaker(text, dest):
         headers = {"Content-Type": "application/json", "api-subscription-key": os.getenv("SARVAM_API_KEY")}
         async with session.post(url, headers=headers, json=payload) as response:
             if response.status == 200:
-                output = await response.read()
+                output = await response.json()
                 return output
+            else:
+                print(response.status)
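For completeness, a small usage sketch of the updated speaker() helper. It assumes, as app.py does after this commit, that the parsed text-to-speech response JSON carries base64-encoded WAV clips under an "audios" key, and that SARVAM_API_KEY is set in the environment; the script and filenames are illustrative, not part of the repo:

```python
import asyncio
import base64

from sarvam import speaker  # helper defined in sarvam.py; needs SARVAM_API_KEY set


async def main():
    # speaker() now returns the parsed TTS response JSON (or None on a non-200 status)
    result = await speaker("नमस्ते, आप कैसे हैं?", dest="hindi")
    if result and "audios" in result:
        # decode the first base64 WAV clip and save it locally
        with open("sample.wav", "wb") as f:
            f.write(base64.b64decode(result["audios"][0]))


if __name__ == "__main__":
    asyncio.run(main())
```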