Spaces:
Runtime error
Runtime error
import gradio as gr | |
from io import BytesIO | |
import fitz | |
import openai | |
class TranslationAgent:
    """Stateful English-to-Italian translator backed by the OpenAI chat API.

    The agent keeps a short conversation history in ``self.memory``: the
    system prompt at index 0, followed by alternating user/assistant
    messages. The history is trimmed after every successful translation.
    """

    def __init__(self, openai_key):
        """Build the system prompt and register the OpenAI API key.

        Args:
            openai_key: API key assigned to the module-level
                ``openai.api_key`` (shared by every user of the module).
        """
        system_msg = (
            "You are a translator from english to italian.\n"
            " The only thing you do is to translate.\n"
            " You don't write anything other then the translation of the text you get.\n"
            " The user will only provide the text without asking anything, but what he wants is the translation.\n"
            " Never return the translation of a previously translated part!\n "
            "The text you will need to translate will often include none sense stuff because it is coming from a text extraction of a pdf file including images and table.\n"
            " Do your best to translate also this messy parts."
        )
        self.memory = [{"role": "system", "content": system_msg}]
        openai.api_key = openai_key

    def fade_memory(self):
        """Drop the two oldest non-system messages once five or more are stored."""
        if len(self.memory) >= 5:
            # Index 0 is the system prompt and must survive the trim.
            del self.memory[1:3]

    def translate_chunk(self, chunk):
        """Translate one chunk of text and record the exchange in memory.

        Args:
            chunk: English text to translate.

        Returns:
            The content of the first choice returned by the model.

        Raises:
            Whatever the OpenAI call raises; in that case ``self.memory`` is
            left exactly as it was before the call.
        """
        user_msg = {"role": "user", "content": chunk}
        # Send the history plus the new message without committing it yet:
        # if the request fails, no unanswered user turn is left behind to
        # pollute later requests or misalign fade_memory's pair removal.
        # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface;
        # confirm the pinned openai version still provides it.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=self.memory + [user_msg],
        )
        reply = response["choices"][0]["message"]["content"]
        self.memory.append(user_msg)
        self.memory.append({"role": "assistant", "content": reply})
        self.fade_memory()
        return reply
def extract_text_from_pdf(pdf, start, stop):
    """Return the concatenated text of pages ``start``..``stop`` of a PDF.

    Args:
        pdf: Bytes-like content of the PDF document.
        start: Index of the first page to include (pages are counted from 0).
        stop: Index of the last page to include (inclusive).

    Returns:
        The text of every selected page joined together with no separator;
        an empty string when no page falls inside the range.
    """
    page_texts = []
    with fitz.open(stream=BytesIO(pdf), filetype='pdf') as doc:
        for index, page in enumerate(doc):
            if not start <= index:
                # Still before the requested range.
                continue
            if not index <= stop:
                # Past the requested range: nothing further can match.
                break
            page_texts.append(page.get_text())
    return "".join(page_texts)
def split_text(text, chunk_size=100):
    """Split text into chunks of whole words that end on a sentence boundary.

    The text is tokenised on whitespace. A chunk is closed at the first word
    ending with ``'.'`` once the chunk holds at least ``chunk_size`` words;
    any words left over form a final, possibly shorter, chunk.

    Args:
        text: The text to split.
        chunk_size: Minimum number of words a chunk must contain before it
            may be closed.

    Returns:
        A list of chunks, each with its words joined by single spaces.
    """
    tokens = text.split()
    pieces = []
    begin = 0  # index of the first word of the chunk being built
    for end, token in enumerate(tokens, start=1):
        if token.endswith('.') and end - begin >= chunk_size:
            pieces.append(' '.join(tokens[begin:end]))
            begin = end
    # Flush whatever remains after the last closed chunk.
    if begin < len(tokens):
        pieces.append(' '.join(tokens[begin:]))
    return pieces
def translate_pdf(openai_key, pdf, start, stop):
    """Translate a page range of a PDF and save the result to disk.

    Args:
        openai_key: OpenAI API key handed to ``TranslationAgent``.
        pdf: Bytes-like content of the PDF, or ``None`` when no file was
            supplied.
        start: Index of the first page to translate.
        stop: Index of the last page to translate (inclusive).

    Returns:
        A ``(translated_text, status_message)`` tuple. When ``pdf`` is
        ``None`` the text is empty and the status explains why.
    """
    # Without a file there is nothing to extract; return the same
    # two-element shape the interface expects instead of failing later.
    if pdf is None:
        return "", "No PDF file provided"

    translator = TranslationAgent(openai_key)

    text = extract_text_from_pdf(pdf, start=start, stop=stop)
    chunks = split_text(text)

    # Each translated chunk keeps its trailing space, as before, so the
    # returned text is unchanged for callers.
    translated_chunks = [
        translator.translate_chunk(chunk) + " " for chunk in chunks
    ]
    translated_text = ' '.join(translated_chunks)

    # Italian text contains accented characters; an explicit encoding keeps
    # the write from depending on the platform's default locale.
    with open('translated.txt', 'w', encoding='utf-8') as f:
        f.write(translated_text)
    return translated_text, "Translation Successful"
# Gradio front end: collects the API key, the PDF and the page range, and
# shows the translated text together with a status message.
# The components come from the top-level ``gr`` namespace because the legacy
# ``gr.inputs`` module is deprecated in Gradio 3 and removed in Gradio 4.
iface = gr.Interface(
    title="Pdf Translator English -> Italian",
    fn=translate_pdf,
    inputs=[
        gr.Textbox(
            lines=1,
            label="OpenAI API key",
            placeholder="Enter your OpenAI API key here",
        ),
        gr.File(type="binary", label="PDF file"),
        gr.Number(label="Starting Page"),
        gr.Number(label="Final Page"),
    ],
    outputs=["text", "text"],
)
iface.launch(share=True)