# Spaces status captured with this file: Runtime error
import os
import re

import langid
from deep_translator import GoogleTranslator
from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TextConverter, PreProcessor, BM25Retriever, FARMReader
from haystack.utils import print_answers
class Sejarah:
    """Question answering over the text files of a local documents folder.

    On construction every file in the folder is converted to text, split into
    overlapping chunks and written to an in-memory BM25 document store. A
    retriever + reader pipeline is then kept on ``self.querying_pipeline``.
    Questions detected as English are translated to Malay before querying and
    the answer is translated back to English.
    """

    def __init__(self, documents_dir="documents"):
        """Index the documents and build the querying pipeline.

        Args:
            documents_dir: Folder whose files are indexed. Defaults to
                ``"documents"``, the location that used to be hard-coded.

        Raises:
            OSError: If ``documents_dir`` cannot be listed (for example when
                it does not exist).
        """
        document_store = InMemoryDocumentStore(use_bm25=True)

        # Indexing pipeline: file -> raw text -> cleaned chunks -> store.
        indexing_pipeline = Pipeline()
        text_converter = TextConverter()
        preprocessor = PreProcessor(
            clean_whitespace=True,
            clean_header_footer=True,
            clean_empty_lines=True,
            split_by="word",
            split_length=200,
            split_overlap=20,
            split_respect_sentence_boundary=True,
        )
        indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
        indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
        indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

        files_to_index = self._list_document_files(documents_dir)
        indexing_pipeline.run_batch(file_paths=files_to_index)

        # Querying pipeline: BM25 retrieval followed by an extractive reader.
        retriever = BM25Retriever(document_store=document_store)
        reader = FARMReader(model_name_or_path="primasr/malaybert-for-eqa-finetuned", use_gpu=True)
        self.querying_pipeline = Pipeline()
        self.querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
        self.querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

    @staticmethod
    def _list_document_files(directory):
        """Return the sorted paths of the regular files directly in ``directory``.

        Sub-directories are skipped so that only real files are handed to the
        text converter; sorting makes the indexing order reproducible.
        """
        candidates = (os.path.join(directory, name) for name in os.listdir(directory))
        return sorted(path for path in candidates if os.path.isfile(path))

    def language_converter(self, content, lang, method):
        """Translate ``content`` between English and Malay when needed.

        Args:
            content: Text to convert.
            lang: Language code of the user's question; only ``"en"``
                triggers a translation.
            method: ``"question"`` translates English -> Malay, any other
                value translates Malay -> English.

        Returns:
            The translated text, or ``content`` unchanged when ``lang`` is not
            ``"en"`` or when ``content`` is empty/``None``.
        """
        # Nothing to translate for non-English users, and empty text is never
        # sent to the translation service.
        if lang != "en" or not content:
            return content

        if method == "question":
            translated = GoogleTranslator(source='en', target='ms').translate(content)
            # Match "when" as a whole word regardless of case so that the
            # usual "When ..." question form is recognised as well.
            if re.search(r"\bwhen\b", content, re.IGNORECASE):
                # The translator's "apabila" is swapped for "bila";
                # presumably that form works better for retrieval/reading
                # -- TODO confirm. The capitalised variant is handled too.
                translated = translated.replace("apabila", "bila").replace("Apabila", "Bila")
            return translated

        return GoogleTranslator(source='ms', target='en').translate(content)

    def detect_language(self, content):
        """Return the language code that ``langid`` assigns to ``content``."""
        # langid.classify returns a pair; its first element is the code.
        code = langid.classify(content)[0]
        return code

    def interface(self, question):
        """Answer ``question`` and return the answer with its context.

        The question is translated to Malay when it is detected as English,
        and in that case answer and context are translated back to English.

        Args:
            question: The user's question.

        Returns:
            An ``(answer, context)`` pair of strings taken from the top-ranked
            answer, or ``("", "")`` when the pipeline returns no answers.
        """
        language = self.detect_language(question)
        converted_question = self.language_converter(question, language, "question")
        result = self.querying_pipeline.run(
            query=converted_question,
            params={
                "Retriever": {"top_k": 10},
                "Reader": {"top_k": 5},
            },
        )

        answers = result['answers']
        if not answers:
            # Nothing was extracted; indexing into the list would raise.
            return "", ""

        best = answers[0]
        answer = self.language_converter(best.answer, language, "answer")
        context = self.language_converter(best.context, language, "answer")
        return answer, context