Spaces:

omkar334
/

agentic_rag

Runtime error

App Files Files Community

omkar334 committed on Sep 28

Commit

7244d35

•

1 Parent(s): b6e72f6

extractor agent

Browse files

Files changed (3) hide show

agent.py +22 -1
prompts.py +24 -2
scraper.py +25 -0

agent.py CHANGED Viewed

@@ -1,8 +1,15 @@
 from dotenv import load_dotenv
 from strictjson import strict_json_async
-from prompts import AGENT_PROMPT, RAG_SYS_PROMPT, RAG_USER_PROMPT
 from sarvam import speaker, translator
 load_dotenv()
@@ -47,6 +54,7 @@ async def call_agent(user_prompt, collection):
             "dest_lang": """Identify the target language from the user query if the function is either "translator" or "speaker". If language is not found, return "none",
                                     type: Enum["hindi", "bengali", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telugu", "english", "gujarati", "none"]""",
             "source": "Identify the sentence that the user wants to translate or speak. Else return 'none', type: Optional[str]",
             "response": "Your response, type: Optional[str]",
         },
         llm=llm,
@@ -66,6 +74,15 @@ async def retriever(user_prompt, collection, client):
     return await llm(system_prompt, user_prompt)
 async def function_caller(user_prompt, collection, client):
     result = await call_agent(user_prompt, collection)
     function = result["function"].lower()
@@ -82,3 +99,7 @@ async def function_caller(user_prompt, collection, client):
     elif function == "speaker":
         return await speaker(result["source"])

 from dotenv import load_dotenv
 from strictjson import strict_json_async
+from prompts import (
+    AGENT_PROMPT,
+    EXTRACT_SYS_PROMPT,
+    EXTRACT_USER_PROMPT,
+    RAG_SYS_PROMPT,
+    RAG_USER_PROMPT,
+)
 from sarvam import speaker, translator
+from scraper import extract
 load_dotenv()
             "dest_lang": """Identify the target language from the user query if the function is either "translator" or "speaker". If language is not found, return "none",
                                     type: Enum["hindi", "bengali", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telugu", "english", "gujarati", "none"]""",
             "source": "Identify the sentence that the user wants to translate or speak. Else return 'none', type: Optional[str]",
+            "url": "Identify if any URL or link is provided in the user query, type: str",
             "response": "Your response, type: Optional[str]",
         },
         llm=llm,
     return await llm(system_prompt, user_prompt)
+async def extractor(user_prompt, url):
+    text = extract(user_prompt)
+    system_prompt = EXTRACT_SYS_PROMPT.format(url)
+    user_prompt = EXTRACT_USER_PROMPT.format(text, user_prompt)
+    return await llm(system_prompt, user_prompt)
 async def function_caller(user_prompt, collection, client):
     result = await call_agent(user_prompt, collection)
     function = result["function"].lower()
     elif function == "speaker":
         return await speaker(result["source"])
+    elif function == "extractor":
+        response = await extractor(user_prompt, result["url"])
+        return {"text": response}

prompts.py CHANGED Viewed

@@ -3,24 +3,28 @@ AGENT_PROMPT = """You are an advanced AI agent designed to assist with a wide ra
 1. **retriever**: Fetches relevant information from the textbook database.
 2. **translator**: Translates text between languages.
 3. **speaker**: Converts text to speech.
 Your task is to carefully analyze the user's query and determine the most appropriate action:
 - If the query requires information related to the textbook or any related educational topic, use the **retriever** function.
 - If a translation is requested, use the **translator** function.
 - If the user wants text converted to speech, use the **speaker** function.
 - If the query falls outside the scope of the given textbook or does not require any of the specialized functions, provide a direct and informative response based on your knowledge.
 For each query, you must:
-1. Identify the primary function needed (retriever, translator, speaker, or none).
 2. Extract key keywords from the query.
 3. Identify the source language of the query.
 4. Determine the target language for translation or speech (if applicable).
 5. Isolate the specific text to be translated or spoken (if applicable).
-6. Provide a relevant and accurate response based on your knowledge if no function call is necessary.
 ### Important Guidelines:
 - Always prioritize using the **retriever** for queries that relate to educational content.
 - If a function needs to be called, set the 'response' field to null. Otherwise, provide a direct answer and set 'response' to "none."
 - Be precise in identifying languages and extracting relevant text for translation or speech.
 - If translation or speech is not requested, set 'dest_lang' to "none" and 'source' to "none."
@@ -45,3 +49,21 @@ Retrieved Information:
 User Query: {}
 Please formulate your response using the above information, ensuring it's appropriate for the grade level and subject as specified in the system prompt. If the retrieved information doesn't fully address the query, you may supplement with relevant knowledge, but clearly indicate when you're doing so."""

 1. **retriever**: Fetches relevant information from the textbook database.
 2. **translator**: Translates text between languages.
 3. **speaker**: Converts text to speech.
+4. **extractor**: Extracts text from a provided URL or link.
 Your task is to carefully analyze the user's query and determine the most appropriate action:
 - If the query requires information related to the textbook or any related educational topic, use the **retriever** function.
 - If a translation is requested, use the **translator** function.
 - If the user wants text converted to speech, use the **speaker** function.
+- If the user provides a URL or link, use the **extractor** function.
 - If the query falls outside the scope of the given textbook or does not require any of the specialized functions, provide a direct and informative response based on your knowledge.
 For each query, you must:
+1. Identify the primary function needed (retriever, translator, speaker, extractor, or none).
 2. Extract key keywords from the query.
 3. Identify the source language of the query.
 4. Determine the target language for translation or speech (if applicable).
 5. Isolate the specific text to be translated or spoken (if applicable).
+6. Identify if a URL or link is provided by the user.(if applicable).
+7. Provide a relevant and accurate response based on your knowledge if no function call is necessary.
 ### Important Guidelines:
 - Always prioritize using the **retriever** for queries that relate to educational content.
+- Use the **extractor** function if the user provides a URL or link. If URL is not present, set 'url' to "none"
 - If a function needs to be called, set the 'response' field to null. Otherwise, provide a direct answer and set 'response' to "none."
 - Be precise in identifying languages and extracting relevant text for translation or speech.
 - If translation or speech is not requested, set 'dest_lang' to "none" and 'source' to "none."
 User Query: {}
 Please formulate your response using the above information, ensuring it's appropriate for the grade level and subject as specified in the system prompt. If the retrieved information doesn't fully address the query, you may supplement with relevant knowledge, but clearly indicate when you're doing so."""
+# System prompt for the extractor agent. The single positional ``{}``
+# placeholder is filled with the page URL via ``.format(url)``.
+EXTRACT_SYS_PROMPT = """You are an extraction agent designed to retrieve relevant text content from web pages. The user has provided the following URL: {}.
+Your task is to use the extracted text to help answer the user's question  as accurately as possible.
+Be thorough in your extraction, focusing on the main body of the content while avoiding any irrelevant information like advertisements, navigation bars, or footnotes.
+If the text extracted from the webpage does not directly answer the user's question, use the extracted content as supporting information and provide an answer based on both the content and your own knowledge.
+"""
+# User prompt for the extractor agent. It has two positional ``{}``
+# placeholders, filled in order via ``.format(text, user_prompt)``: the text
+# retrieved from the page first, then the user's query.
+EXTRACT_USER_PROMPT = """
+Based on the following extracted information and the user query, provide a response that is clear, concise, and directly answers the user's question.
+If the extracted content from the webpage is not sufficient to fully answer the question, use your own knowledge to provide a complete and helpful response.
+Retrieved Information:
+{}
+User Query: {}
+"""

scraper.py CHANGED Viewed

@@ -2,9 +2,11 @@ import asyncio
 import base64
 import io
 import json
 from string import ascii_lowercase
 import aiohttp
 from client import HybridClient
 from headers import random_headers
@@ -121,3 +123,26 @@ def upload_book_from_json(json_file_path):
         hclient.create(collection)
         hclient.insert(collection, chunks)

 import base64
 import io
 import json
+import re
 from string import ascii_lowercase
 import aiohttp
+from bs4 import BeautifulSoup, Comment, Declaration
 from client import HybridClient
 from headers import random_headers
         hclient.create(collection)
         hclient.insert(collection, chunks)
+def is_visible_text(element):
+    if element.parent.name in ["style", "script", "[document]", "head", "title"]:
+        return False
+    elif re.match("<!--.*-->", str(element)):
+        return False
+    elif type(element) is Comment or type(element) is Declaration:
+        return False
+    elif len(str(element)) < 50:
+        return False
+    return True
+async def extract(url: str):
+    async with aiohttp.ClientSession() as session:
+        headers = random_headers()
+        async with session.get(url, headers=headers, timeout=10) as r:
+            r.raise_for_status()
+            content = await r.read()
+            texts = BeautifulSoup(content, "html.parser").findAll(string=True)
+            text = "".join(list(filter(is_visible_text, texts)))
+            return text