import gradio as gr
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.callbacks import StreamingStdOutCallbackHandler
from langchain_community.chat_models import ChatLlamaCpp
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

callbacks = [StreamingStdOutCallbackHandler()]

print("creating llm started")
M_NAME = "finbro-v0.1.0-llama-3-8B-instruct-1m.gguf"

llm = ChatLlamaCpp(
    model_path=M_NAME,
    n_batch=8,
    temperature=0.85,
    max_tokens=256,
    top_p=0.95,
    top_k=10,
    callbacks=callbacks,
    n_ctx=2048,
    verbose=True,  # verbose output is required for the streaming callbacks
)
print("creating llm ended")

# Plain LLM chain (no retrieval)
template = """You are the financial expert:

### Instruction:
{question}

### Input:

### Response:
"""

prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain_model = LLMChain(prompt=prompt, llm=llm)


# Retrieval-augmented (RAG) chain
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

vectorstore = Chroma(
    collection_name="example_collection",
    embedding_function=hf,
    persist_directory="./chroma_langchain_db",  # where the index is saved locally; remove if not necessary
)

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

template = """You are the financial AI assistant.

{context}

Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)
print("retriever done")


def greet(question, model_type):
    print(f"question is {question}")
    if model_type == "With RAG":
        out_gen = rag_chain.invoke(question)
    else:
        out_gen = llm_chain_model.run(question)
    print(f"out is: {out_gen}")
    return out_gen


demo = gr.Interface(
    fn=greet,
    inputs=[
        "text",
        gr.Dropdown(
            ["Without RAG", "With RAG"],
            label="RAG status",
            info="With RAG, the output is slower but grounded in the retrieved documents",
        ),
    ],
    outputs="text",
)
demo.launch(debug=True, share=True)
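
# ---------------------------------------------------------------------------
# Note: the RAG path above assumes "./chroma_langchain_db" already contains an
# indexed "example_collection". A minimal sketch for building that index is
# shown below, kept commented out so the app runs unchanged; the source file
# "docs/finance.txt" and the chunking parameters are hypothetical assumptions,
# not part of the original script.
#
# from langchain_community.document_loaders import TextLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
#
# raw_docs = TextLoader("docs/finance.txt").load()
# splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# splits = splitter.split_documents(raw_docs)
# Chroma.from_documents(
#     documents=splits,
#     embedding=hf,
#     collection_name="example_collection",
#     persist_directory="./chroma_langchain_db",
# )
# ---------------------------------------------------------------------------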