Spaces:
Runtime error
Runtime error
extractor agent
Browse files- agent.py +22 -1
- prompts.py +24 -2
- scraper.py +25 -0
agent.py
CHANGED
@@ -1,8 +1,15 @@
|
|
1 |
from dotenv import load_dotenv
|
2 |
from strictjson import strict_json_async
|
3 |
|
4 |
-
from prompts import
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
from sarvam import speaker, translator
|
|
|
6 |
|
7 |
load_dotenv()
|
8 |
|
@@ -47,6 +54,7 @@ async def call_agent(user_prompt, collection):
|
|
47 |
"dest_lang": """Identify the target language from the user query if the function is either "translator" or "speaker". If language is not found, return "none",
|
48 |
type: Enum["hindi", "bengali", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telugu", "english", "gujarati", "none"]""",
|
49 |
"source": "Identify the sentence that the user wants to translate or speak. Else return 'none', type: Optional[str]",
|
|
|
50 |
"response": "Your response, type: Optional[str]",
|
51 |
},
|
52 |
llm=llm,
|
@@ -66,6 +74,15 @@ async def retriever(user_prompt, collection, client):
|
|
66 |
return await llm(system_prompt, user_prompt)
|
67 |
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
async def function_caller(user_prompt, collection, client):
|
70 |
result = await call_agent(user_prompt, collection)
|
71 |
function = result["function"].lower()
|
@@ -82,3 +99,7 @@ async def function_caller(user_prompt, collection, client):
|
|
82 |
|
83 |
elif function == "speaker":
|
84 |
return await speaker(result["source"])
|
|
|
|
|
|
|
|
|
|
1 |
from dotenv import load_dotenv
|
2 |
from strictjson import strict_json_async
|
3 |
|
4 |
+
from prompts import (
|
5 |
+
AGENT_PROMPT,
|
6 |
+
EXTRACT_SYS_PROMPT,
|
7 |
+
EXTRACT_USER_PROMPT,
|
8 |
+
RAG_SYS_PROMPT,
|
9 |
+
RAG_USER_PROMPT,
|
10 |
+
)
|
11 |
from sarvam import speaker, translator
|
12 |
+
from scraper import extract
|
13 |
|
14 |
load_dotenv()
|
15 |
|
|
|
54 |
"dest_lang": """Identify the target language from the user query if the function is either "translator" or "speaker". If language is not found, return "none",
|
55 |
type: Enum["hindi", "bengali", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telugu", "english", "gujarati", "none"]""",
|
56 |
"source": "Identify the sentence that the user wants to translate or speak. Else return 'none', type: Optional[str]",
|
57 |
+
"url": "Identify if any URL or link is provided in the user query, type: str",
|
58 |
"response": "Your response, type: Optional[str]",
|
59 |
},
|
60 |
llm=llm,
|
|
|
74 |
return await llm(system_prompt, user_prompt)
|
75 |
|
76 |
|
77 |
+
async def extractor(user_prompt, url):
    """Answer ``user_prompt`` using text scraped from the page at ``url``.

    Args:
        user_prompt: The user's original query.
        url: The link taken from the agent result's ``"url"`` field.

    Returns:
        The result of ``llm`` called with the formatted extraction prompts.
    """
    # ``extract`` is an ``async def`` that takes the page URL, so it has to be
    # awaited and given ``url``. Calling it un-awaited with the raw query would
    # put a coroutine object into the prompt instead of the page text.
    text = await extract(url)

    system_prompt = EXTRACT_SYS_PROMPT.format(url)
    # Positional order matches the template: retrieved text first, query second.
    extraction_prompt = EXTRACT_USER_PROMPT.format(text, user_prompt)

    return await llm(system_prompt, extraction_prompt)
|
84 |
+
|
85 |
+
|
86 |
async def function_caller(user_prompt, collection, client):
|
87 |
result = await call_agent(user_prompt, collection)
|
88 |
function = result["function"].lower()
|
|
|
99 |
|
100 |
elif function == "speaker":
|
101 |
return await speaker(result["source"])
|
102 |
+
|
103 |
+
elif function == "extractor":
|
104 |
+
response = await extractor(user_prompt, result["url"])
|
105 |
+
return {"text": response}
|
prompts.py
CHANGED
@@ -3,24 +3,28 @@ AGENT_PROMPT = """You are an advanced AI agent designed to assist with a wide ra
|
|
3 |
1. **retriever**: Fetches relevant information from the textbook database.
|
4 |
2. **translator**: Translates text between languages.
|
5 |
3. **speaker**: Converts text to speech.
|
|
|
6 |
|
7 |
Your task is to carefully analyze the user's query and determine the most appropriate action:
|
8 |
|
9 |
- If the query requires information related to the textbook or any related educational topic, use the **retriever** function.
|
10 |
- If a translation is requested, use the **translator** function.
|
11 |
- If the user wants text converted to speech, use the **speaker** function.
|
|
|
12 |
- If the query falls outside the scope of the given textbook or does not require any of the specialized functions, provide a direct and informative response based on your knowledge.
|
13 |
|
14 |
For each query, you must:
|
15 |
-
1. Identify the primary function needed (retriever, translator, speaker, or none).
|
16 |
2. Extract key keywords from the query.
|
17 |
3. Identify the source language of the query.
|
18 |
4. Determine the target language for translation or speech (if applicable).
|
19 |
5. Isolate the specific text to be translated or spoken (if applicable).
|
20 |
-
6.
|
|
|
21 |
|
22 |
### Important Guidelines:
|
23 |
- Always prioritize using the **retriever** for queries that relate to educational content.
|
|
|
24 |
- If a function needs to be called, set the 'response' field to null. Otherwise, provide a direct answer and set 'response' to "none."
|
25 |
- Be precise in identifying languages and extracting relevant text for translation or speech.
|
26 |
- If translation or speech is not requested, set 'dest_lang' to "none" and 'source' to "none."
|
@@ -45,3 +49,21 @@ Retrieved Information:
|
|
45 |
User Query: {}
|
46 |
|
47 |
Please formulate your response using the above information, ensuring it's appropriate for the grade level and subject as specified in the system prompt. If the retrieved information doesn't fully address the query, you may supplement with relevant knowledge, but clearly indicate when you're doing so."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
1. **retriever**: Fetches relevant information from the textbook database.
|
4 |
2. **translator**: Translates text between languages.
|
5 |
3. **speaker**: Converts text to speech.
|
6 |
+
4. **extractor**: Extracts text from a provided URL or link.
|
7 |
|
8 |
Your task is to carefully analyze the user's query and determine the most appropriate action:
|
9 |
|
10 |
- If the query requires information related to the textbook or any related educational topic, use the **retriever** function.
|
11 |
- If a translation is requested, use the **translator** function.
|
12 |
- If the user wants text converted to speech, use the **speaker** function.
|
13 |
+
- If the user provides a URL or link, use the **extractor** function.
|
14 |
- If the query falls outside the scope of the given textbook or does not require any of the specialized functions, provide a direct and informative response based on your knowledge.
|
15 |
|
16 |
For each query, you must:
|
17 |
+
1. Identify the primary function needed (retriever, translator, speaker, extractor, or none).
|
18 |
2. Extract key keywords from the query.
|
19 |
3. Identify the source language of the query.
|
20 |
4. Determine the target language for translation or speech (if applicable).
|
21 |
5. Isolate the specific text to be translated or spoken (if applicable).
|
22 |
+
6. Identify if a URL or link is provided by the user.(if applicable).
|
23 |
+
7. Provide a relevant and accurate response based on your knowledge if no function call is necessary.
|
24 |
|
25 |
### Important Guidelines:
|
26 |
- Always prioritize using the **retriever** for queries that relate to educational content.
|
27 |
+
- Use the **extractor** function if the user provides a URL or link. If URL is not present, set 'url' to "none"
|
28 |
- If a function needs to be called, set the 'response' field to null. Otherwise, provide a direct answer and set 'response' to "none."
|
29 |
- Be precise in identifying languages and extracting relevant text for translation or speech.
|
30 |
- If translation or speech is not requested, set 'dest_lang' to "none" and 'source' to "none."
|
|
|
49 |
User Query: {}
|
50 |
|
51 |
Please formulate your response using the above information, ensuring it's appropriate for the grade level and subject as specified in the system prompt. If the retrieved information doesn't fully address the query, you may supplement with relevant knowledge, but clearly indicate when you're doing so."""
|
52 |
+
|
53 |
+
|
54 |
+
EXTRACT_SYS_PROMPT = """You are an extraction agent designed to retrieve relevant text content from web pages. The user has provided the following URL: {}.
|
55 |
+
Your task is to use the extracted text to help answer the user's question as accurately as possible.
|
56 |
+
Be thorough in your extraction, focusing on the main body of the content while avoiding any irrelevant information like advertisements, navigation bars, or footnotes.
|
57 |
+
If the text extracted from the webpage does not directly answer the user's question, use the extracted content as supporting information and provide an answer based on both the content and your own knowledge.
|
58 |
+
"""
|
59 |
+
|
60 |
+
|
61 |
+
EXTRACT_USER_PROMPT = """
|
62 |
+
Based on the following extracted information and the user query, provide a response that is clear, concise, and directly answers the user's question.
|
63 |
+
If the extracted content from the webpage is not sufficient to fully answer the question, use your own knowledge to provide a complete and helpful response.
|
64 |
+
|
65 |
+
Retrieved Information:
|
66 |
+
{}
|
67 |
+
|
68 |
+
User Query: {}
|
69 |
+
"""
|
scraper.py
CHANGED
@@ -2,9 +2,11 @@ import asyncio
|
|
2 |
import base64
|
3 |
import io
|
4 |
import json
|
|
|
5 |
from string import ascii_lowercase
|
6 |
|
7 |
import aiohttp
|
|
|
8 |
|
9 |
from client import HybridClient
|
10 |
from headers import random_headers
|
@@ -121,3 +123,26 @@ def upload_book_from_json(json_file_path):
|
|
121 |
|
122 |
hclient.create(collection)
|
123 |
hclient.insert(collection, chunks)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import base64
|
3 |
import io
|
4 |
import json
|
5 |
+
import re
|
6 |
from string import ascii_lowercase
|
7 |
|
8 |
import aiohttp
|
9 |
+
from bs4 import BeautifulSoup, Comment, Declaration
|
10 |
|
11 |
from client import HybridClient
|
12 |
from headers import random_headers
|
|
|
123 |
|
124 |
hclient.create(collection)
|
125 |
hclient.insert(collection, chunks)
|
126 |
+
|
127 |
+
|
128 |
+
def is_visible_text(element):
    """Return True when a parsed text node should be kept as page content.

    A node is rejected when any of the following holds:
      * its parent tag is one of style/script/[document]/head/title,
      * its string form matches an HTML comment (``<!-- ... -->``),
      * its string form is shorter than 50 characters,
      * it is a ``Comment`` or ``Declaration`` node.

    Args:
        element: A text node exposing ``.parent`` (with a ``.name``), as
            yielded by ``BeautifulSoup(...).find_all(string=True)``.

    Returns:
        bool: True if the node passes every filter, otherwise False.
    """
    parent = element.parent
    # A detached node has no parent; skip the tag-name filter instead of
    # raising AttributeError on ``None.name``.
    if parent is not None and parent.name in (
        "style",
        "script",
        "[document]",
        "head",
        "title",
    ):
        return False

    text = str(element)
    if re.match("<!--.*-->", text):
        return False
    # Short fragments are dropped; 50 characters is the cut-off.
    if len(text) < 50:
        return False
    return not isinstance(element, (Comment, Declaration))
|
138 |
+
|
139 |
+
|
140 |
+
async def extract(url: str):
    """Download ``url`` and return the page text kept by ``is_visible_text``.

    The page is fetched with headers from ``random_headers()`` and a
    10-second total timeout, parsed with the ``html.parser`` backend, and
    every text node that passes ``is_visible_text`` is concatenated with no
    separator between fragments.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: The concatenated text fragments (empty if none pass the filter).

    Raises:
        Whatever ``raise_for_status()`` raises for an HTTP error status, plus
        any connection or timeout error raised by aiohttp.
    """
    # Explicit ClientTimeout rather than a bare number for the request timeout.
    request_timeout = aiohttp.ClientTimeout(total=10)

    async with aiohttp.ClientSession() as session:
        headers = random_headers()
        async with session.get(url, headers=headers, timeout=request_timeout) as r:
            r.raise_for_status()
            content = await r.read()

    # The body is fully read above, so parsing can happen after the session
    # has been closed.
    soup = BeautifulSoup(content, "html.parser")
    return "".join(filter(is_visible_text, soup.find_all(string=True)))
|