"""RockyBot: a Streamlit tool for asking questions about news articles.

Flow:
1. The user enters up to three article URLs in the sidebar and clicks
   "Process URLs".  The pages are downloaded, split into chunks, embedded
   with a SentenceTransformer model, and the chunks plus their embeddings
   are pickled to ``file_path``.
2. The user types a question.  The pickled store is loaded, a FAISS L2 index
   is built over the embeddings, the closest chunks are retrieved, and they
   are passed as context to a Hugging Face hosted LLM.

Streamlit re-executes this script from the top on every interaction, so
nothing computed in step 1 survives as a plain variable into step 2; step 2
must rebuild its state from the pickle file.
"""
import os
import streamlit as st
import pickle
import time
from typing import Optional

import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import FAISS
import requests
import pandas as pd
from langchain_community.llms import HuggingFaceEndpoint
from langchain.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer, util
from langchain.schema import SystemMessage, HumanMessage, AIMessage
import faiss

from dotenv import load_dotenv

# Take environment variables from .env (presumably the Hugging Face API token
# that HuggingFaceEndpoint needs -- confirm against your deployment).
load_dotenv()

EMBEDDING_MODEL_NAME = 'bert-base-nli-mean-tokens'
TOP_K = 2  # number of chunks retrieved as context for each question


@st.cache_resource
def _load_embedding_model():
    """Return the sentence-embedding model, cached across Streamlit reruns."""
    return SentenceTransformer(EMBEDDING_MODEL_NAME)


def _retrieve_documents(query: str, k: int = TOP_K) -> list:
    """Return up to *k* chunks from ``docs`` closest to *query*.

    Reads the module-level ``model``, ``index`` and ``docs``; these must be
    bound (see the question-handling section at the bottom of the script)
    before this is called.
    """
    if not docs:
        return []
    # Encoding a one-element list yields a 2-D (1, d) array, which is the
    # batch shape ``index.search`` expects.
    query_vector = model.encode([query])
    _distances, neighbour_ids = index.search(query_vector, min(k, len(docs)))
    # FAISS pads missing neighbours with -1; skip those so that a negative
    # index does not silently select the last chunk.
    return [docs[i] for i in neighbour_ids.tolist()[0] if i >= 0]


def querypreprocess(query: str, k: int = TOP_K) -> str:
    """Return the text of the chunks most similar to *query*, space-joined.

    Args:
        query: The user's question.
        k: Maximum number of chunks to retrieve.

    Returns:
        The retrieved chunk texts joined by single spaces ("" if none).
    """
    return " ".join(doc.page_content for doc in _retrieve_documents(query, k))


def augmented_prompt(query: str, context: Optional[str] = None) -> str:
    """Build the LLM prompt that pairs *query* with retrieved context.

    Args:
        query: The user's question.
        context: Context text to embed in the prompt.  When omitted, it is
            retrieved with ``querypreprocess(query)``.

    Returns:
        The prompt string.
    """
    source_knowledge = querypreprocess(query) if context is None else context
    return (
        " using the contexts below, answer the query. \n"
        f"Contexts:\n{source_knowledge}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )


def _build_store(article_urls, path, status):
    """Download, chunk and embed *article_urls*, then pickle the result.

    Args:
        article_urls: Non-empty list of URLs to load.
        path: Destination pickle file.
        status: Streamlit placeholder used for progress messages.

    Returns:
        True when a store was written, False when the URLs yielded no text.
    """
    loader = UnstructuredURLLoader(urls=article_urls)
    status.text("Data Loading...Started...✅✅✅")
    data = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000,
        chunk_overlap=0,
    )
    status.text("Text Splitter...Started...✅✅✅")
    chunks = text_splitter.split_documents(data)
    if not chunks:
        return False

    sentence_embeddings = _load_embedding_model().encode(
        [chunk.page_content for chunk in chunks]
    )
    status.text("Embedding Vector Started Building...✅✅✅")
    # The placeholder is overwritten by the question box right after this
    # function returns; the pause presumably keeps the message visible.
    time.sleep(2)

    # Persist the chunks together with their embeddings: the embeddings alone
    # cannot be mapped back to text on a later rerun.
    with open(path, "wb") as f:
        pickle.dump({"embeddings": sentence_embeddings, "docs": chunks}, f)
    return True


def _load_store(path):
    """Load ``(embeddings, docs)`` from *path*.

    Returns:
        A ``(embeddings, docs)`` tuple, or None when the file is missing or
        does not contain a usable store (e.g. a file written by an older
        version of this script that pickled only the embeddings).
    """
    if not os.path.exists(path):
        return None
    # NOTE(security): pickle.load can execute arbitrary code.  This is only
    # acceptable because the file is written by this app itself; never point
    # ``file_path`` at data from an untrusted source.
    with open(path, "rb") as f:
        payload = pickle.load(f)
    if not isinstance(payload, dict):
        return None
    embeddings = payload.get("embeddings")
    documents = payload.get("docs")
    if embeddings is None or not documents or len(documents) != len(embeddings):
        return None
    return embeddings, documents


st.title("RockyBot: News Research Tool 📈")
st.sidebar.title("News Article URLs")

urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)]

process_url_clicked = st.sidebar.button("Process URLs")
file_path = "sentence_embeddings.pkl"

main_placeholder = st.empty()
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2")

if process_url_clicked:
    # Unfilled sidebar fields come back as empty strings; do not try to load them.
    entered_urls = [u.strip() for u in urls if u and u.strip()]
    if not entered_urls:
        st.sidebar.warning("Please enter at least one URL.")
    elif not _build_store(entered_urls, file_path, main_placeholder):
        st.sidebar.warning("No text could be extracted from the given URLs.")

query = main_placeholder.text_input("Question: ")
if query:
    store = _load_store(file_path)
    if store is None:
        st.warning(
            "No processed articles found. Enter URLs in the sidebar and "
            "click 'Process URLs' first."
        )
    else:
        # ``model``, ``index`` and ``docs`` are module-level names read by
        # ``_retrieve_documents``.
        sentence_embeddings, docs = store
        model = _load_embedding_model()
        index = faiss.IndexFlatL2(sentence_embeddings.shape[1])
        index.add(sentence_embeddings)

        matches = _retrieve_documents(query)
        prompt = augmented_prompt(
            query, context=" ".join(doc.page_content for doc in matches)
        )

        messages = [
            SystemMessage(content="You are a helpful assistant."),
            HumanMessage(content=query),
            AIMessage(content="I am Great, Thank You, How Can I Help You."),
            HumanMessage(content=prompt),
        ]
        result = llm.invoke(messages)

        st.header("Answer")
        # A plain LLM returns text; fall back to ``.content`` in case ``llm``
        # is ever swapped for a chat model that returns a message object.
        st.write(getattr(result, "content", result))

        # Display sources, if available.  Assumes the loader recorded each
        # chunk's origin under metadata["source"] -- TODO confirm.
        sources = []
        for doc in matches:
            source = (getattr(doc, "metadata", None) or {}).get("source")
            if source and source not in sources:
                sources.append(source)
        if sources:
            st.subheader("Sources:")
            for source in sources:
                st.write(source)