# DocChat_n_Talk / app.py
# (Hugging Face Space raw-view metadata removed; converted to a comment header.)
import os
import cv2
import numpy as np
from PIL import Image
import pytesseract
import gradio as gr
from pdf2image import convert_from_path
import PyPDF2
from llama_index.core import VectorStoreIndex, Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import get_response_synthesizer
from sentence_transformers import SentenceTransformer, util
import logging
from openai_tts_tool import generate_audio_and_text
import tempfile
# Set up logging configuration: timestamped INFO-level messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
# Initialize global state shared by the Gradio callbacks.
vector_index = None  # llama-index VectorStoreIndex; presumably built when documents are uploaded -- confirm in process_upload (elided here)
query_log = []  # query history; appended elsewhere in the full file (not visible in this view)
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  # embedding model, presumably backing the "Use Similarity Check" option -- confirm
# Languages offered for TTS audio/script generation.
# Each entry is a (language code, display name) tuple; the code (e.g. "en")
# is what is forwarded to generate_audio_and_text.
AVAILABLE_LANGUAGES = [
    ("en", "English"),
    ("ar", "Arabic"),
    ("de", "German"),
    ("mr", "Marathi"),
    ("kn", "Kannada"),
    ("tl", "Filipino (Tagalog)"),
    ("fr", "French"),
    ("gu", "Gujarati"),
    ("hi", "Hindi"),
    ("ml", "Malayalam"),
    ("ta", "Tamil"),
    ("te", "Telugu"),
    ("ur", "Urdu"),
    ("si", "Sinhala")
]
# Probe the tesseract binary for its installed OCR languages.
# `tesseract --list-langs` prints a header line followed by one language
# per line; drop the header and the trailing empty string from the split.
try:
    langs = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
except Exception:  # narrowed from bare `except:` (which also caught SystemExit/KeyboardInterrupt)
    langs = []
# os.popen does not raise when the command is missing -- it just yields empty
# output -- so the original fallback inside `except` could never trigger.
# Check the result explicitly instead.
if not langs:
    langs = ['eng']  # fallback to English if tesseract isn't properly configured
# ... (the existing helper functions -- e.g. process_upload and query_app,
# referenced by create_gradio_interface below -- remain here unchanged) ...
def create_gradio_interface():
    """Build and return the Gradio Blocks UI (unlaunched).

    Tabs:
      1. Upload Documents -- collects an OpenAI API key, files and an OCR
         language, then indexes them via ``process_upload``.
      2. Ask a Question -- queries the index via ``query_app``; the answer
         is mirrored into the TTS tab's text box.
      3. Generate Audio and Text -- TTS/script generation via
         ``generate_audio_and_text``.

    Returns:
        gr.Blocks: the assembled interface; the caller launches it.
    """
    with gr.Blocks(title="Document Processing and TTS App") as demo:
        gr.Markdown("# 📄 Document Processing, Text & Audio Generation App")

        with gr.Tab("📤 Upload Documents"):
            api_key_input = gr.Textbox(
                label="Enter OpenAI API Key",
                placeholder="Paste your OpenAI API Key here",
                type="password"
            )
            file_upload = gr.File(label="Upload Files", file_count="multiple", type="filepath")
            # `langs` is the module-level list probed from tesseract.
            lang_dropdown = gr.Dropdown(choices=langs, label="Select OCR Language", value='eng')
            upload_button = gr.Button("Upload and Index")
            upload_status = gr.Textbox(label="Status", interactive=False)

        with gr.Tab("❓ Ask a Question"):
            query_input = gr.Textbox(label="Enter your question")
            model_dropdown = gr.Dropdown(
                choices=["gpt-4-0125-preview", "gpt-3.5-turbo-0125"],
                label="Select Model",
                value="gpt-3.5-turbo-0125"
            )
            similarity_checkbox = gr.Checkbox(label="Use Similarity Check", value=False)
            query_button = gr.Button("Ask")
            answer_output = gr.Textbox(label="Answer", interactive=False)

        with gr.Tab("🗣️ Generate Audio and Text"):
            text_input = gr.Textbox(label="Enter text for generation")
            voice_type = gr.Dropdown(
                choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                label="Voice Type",
                value="alloy"
            )
            voice_speed = gr.Slider(
                minimum=0.25,
                maximum=4.0,
                value=1.0,
                label="Voice Speed"
            )
            # BUG FIX: Gradio Dropdown choice tuples are (display_name, value).
            # The original built (code, name), so the UI displayed the ISO code
            # and emitted the English name -- which never matched the default
            # value="en". Swap the tuple so the readable name is displayed and
            # the ISO code is the emitted value.
            language = gr.Dropdown(
                choices=[(name, code) for code, name in AVAILABLE_LANGUAGES],
                label="Language for Audio and Script",
                value="en",
                type="value"
            )
            output_option = gr.Radio(
                choices=["audio", "script_text", "both"],
                label="Output Option",
                value="both"
            )
            generate_button = gr.Button("Generate")
            audio_output = gr.Audio(label="Generated Audio")
            script_output = gr.File(label="Script Text File")
            status_output = gr.Textbox(label="Status", interactive=False)

        # Wire up the components.
        upload_button.click(
            fn=process_upload,
            inputs=[api_key_input, file_upload, lang_dropdown],
            outputs=[upload_status]
        )
        query_button.click(
            fn=query_app,
            inputs=[query_input, model_dropdown, similarity_checkbox, api_key_input],
            outputs=[answer_output]
        )
        # Mirror each new answer into the TTS tab's input box.
        answer_output.change(
            fn=lambda ans: ans,
            inputs=[answer_output],
            outputs=[text_input]
        )
        generate_button.click(
            fn=generate_audio_and_text,
            inputs=[
                api_key_input, text_input, model_dropdown, voice_type,
                voice_speed, language, output_option
            ],
            outputs=[audio_output, script_output, status_output]
        )

    return demo
# Build the UI once at module level: Hugging Face Spaces imports this module
# and expects a module-level `demo` object. (The original duplicated the
# create_gradio_interface() call in both the `if` and `else` branches.)
demo = create_gradio_interface()

if __name__ == "__main__":
    # Running the file directly starts a local server as well.
    demo.launch()