omkar334 committed on
Commit
7244d35
1 Parent(s): b6e72f6

extractor agent

Browse files
Files changed (3) hide show
  1. agent.py +22 -1
  2. prompts.py +24 -2
  3. scraper.py +25 -0
agent.py CHANGED
@@ -1,8 +1,15 @@
1
  from dotenv import load_dotenv
2
  from strictjson import strict_json_async
3
 
4
- from prompts import AGENT_PROMPT, RAG_SYS_PROMPT, RAG_USER_PROMPT
 
 
 
 
 
 
5
  from sarvam import speaker, translator
 
6
 
7
  load_dotenv()
8
 
@@ -47,6 +54,7 @@ async def call_agent(user_prompt, collection):
47
  "dest_lang": """Identify the target language from the user query if the function is either "translator" or "speaker". If language is not found, return "none",
48
  type: Enum["hindi", "bengali", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telugu", "english", "gujarati", "none"]""",
49
  "source": "Identify the sentence that the user wants to translate or speak. Else return 'none', type: Optional[str]",
 
50
  "response": "Your response, type: Optional[str]",
51
  },
52
  llm=llm,
@@ -66,6 +74,15 @@ async def retriever(user_prompt, collection, client):
66
  return await llm(system_prompt, user_prompt)
67
 
68
 
 
 
 
 
 
 
 
 
 
69
  async def function_caller(user_prompt, collection, client):
70
  result = await call_agent(user_prompt, collection)
71
  function = result["function"].lower()
@@ -82,3 +99,7 @@ async def function_caller(user_prompt, collection, client):
82
 
83
  elif function == "speaker":
84
  return await speaker(result["source"])
 
 
 
 
 
1
  from dotenv import load_dotenv
2
  from strictjson import strict_json_async
3
 
4
+ from prompts import (
5
+ AGENT_PROMPT,
6
+ EXTRACT_SYS_PROMPT,
7
+ EXTRACT_USER_PROMPT,
8
+ RAG_SYS_PROMPT,
9
+ RAG_USER_PROMPT,
10
+ )
11
  from sarvam import speaker, translator
12
+ from scraper import extract
13
 
14
  load_dotenv()
15
 
 
54
  "dest_lang": """Identify the target language from the user query if the function is either "translator" or "speaker". If language is not found, return "none",
55
  type: Enum["hindi", "bengali", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telugu", "english", "gujarati", "none"]""",
56
  "source": "Identify the sentence that the user wants to translate or speak. Else return 'none', type: Optional[str]",
57
+ "url": "Identify if any URL or link is provided in the user query, type: str",
58
  "response": "Your response, type: Optional[str]",
59
  },
60
  llm=llm,
 
74
  return await llm(system_prompt, user_prompt)
75
 
76
 
77
async def extractor(user_prompt, url):
    """Answer ``user_prompt`` using text scraped from the page at ``url``.

    Args:
        user_prompt: The user's original query.
        url: Address of the web page to scrape, as identified by the agent.

    Returns:
        Whatever ``llm`` returns for the extraction system/user prompts.
    """
    # ``extract`` is a coroutine function that takes the page URL, so it must
    # be awaited and given ``url``. Passing the raw query (and not awaiting)
    # would format a coroutine object into the prompt instead of page text.
    page_text = await extract(url)

    system_prompt = EXTRACT_SYS_PROMPT.format(url)
    extraction_prompt = EXTRACT_USER_PROMPT.format(page_text, user_prompt)

    return await llm(system_prompt, extraction_prompt)
84
+
85
+
86
  async def function_caller(user_prompt, collection, client):
87
  result = await call_agent(user_prompt, collection)
88
  function = result["function"].lower()
 
99
 
100
  elif function == "speaker":
101
  return await speaker(result["source"])
102
+
103
+ elif function == "extractor":
104
+ response = await extractor(user_prompt, result["url"])
105
+ return {"text": response}
prompts.py CHANGED
@@ -3,24 +3,28 @@ AGENT_PROMPT = """You are an advanced AI agent designed to assist with a wide ra
3
  1. **retriever**: Fetches relevant information from the textbook database.
4
  2. **translator**: Translates text between languages.
5
  3. **speaker**: Converts text to speech.
 
6
 
7
  Your task is to carefully analyze the user's query and determine the most appropriate action:
8
 
9
  - If the query requires information related to the textbook or any related educational topic, use the **retriever** function.
10
  - If a translation is requested, use the **translator** function.
11
  - If the user wants text converted to speech, use the **speaker** function.
 
12
  - If the query falls outside the scope of the given textbook or does not require any of the specialized functions, provide a direct and informative response based on your knowledge.
13
 
14
  For each query, you must:
15
- 1. Identify the primary function needed (retriever, translator, speaker, or none).
16
  2. Extract key keywords from the query.
17
  3. Identify the source language of the query.
18
  4. Determine the target language for translation or speech (if applicable).
19
  5. Isolate the specific text to be translated or spoken (if applicable).
20
- 6. Provide a relevant and accurate response based on your knowledge if no function call is necessary.
 
21
 
22
  ### Important Guidelines:
23
  - Always prioritize using the **retriever** for queries that relate to educational content.
 
24
  - If a function needs to be called, set the 'response' field to null. Otherwise, provide a direct answer and set 'response' to "none."
25
  - Be precise in identifying languages and extracting relevant text for translation or speech.
26
  - If translation or speech is not requested, set 'dest_lang' to "none" and 'source' to "none."
@@ -45,3 +49,21 @@ Retrieved Information:
45
  User Query: {}
46
 
47
  Please formulate your response using the above information, ensuring it's appropriate for the grade level and subject as specified in the system prompt. If the retrieved information doesn't fully address the query, you may supplement with relevant knowledge, but clearly indicate when you're doing so."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  1. **retriever**: Fetches relevant information from the textbook database.
4
  2. **translator**: Translates text between languages.
5
  3. **speaker**: Converts text to speech.
6
+ 4. **extractor**: Extracts text from a provided URL or link.
7
 
8
  Your task is to carefully analyze the user's query and determine the most appropriate action:
9
 
10
  - If the query requires information related to the textbook or any related educational topic, use the **retriever** function.
11
  - If a translation is requested, use the **translator** function.
12
  - If the user wants text converted to speech, use the **speaker** function.
13
+ - If the user provides a URL or link, use the **extractor** function.
14
  - If the query falls outside the scope of the given textbook or does not require any of the specialized functions, provide a direct and informative response based on your knowledge.
15
 
16
  For each query, you must:
17
+ 1. Identify the primary function needed (retriever, translator, speaker, extractor, or none).
18
  2. Extract key keywords from the query.
19
  3. Identify the source language of the query.
20
  4. Determine the target language for translation or speech (if applicable).
21
  5. Isolate the specific text to be translated or spoken (if applicable).
22
+ 6. Identify if a URL or link is provided by the user.(if applicable).
23
+ 7. Provide a relevant and accurate response based on your knowledge if no function call is necessary.
24
 
25
  ### Important Guidelines:
26
  - Always prioritize using the **retriever** for queries that relate to educational content.
27
+ - Use the **extractor** function if the user provides a URL or link. If URL is not present, set 'url' to "none"
28
  - If a function needs to be called, set the 'response' field to null. Otherwise, provide a direct answer and set 'response' to "none."
29
  - Be precise in identifying languages and extracting relevant text for translation or speech.
30
  - If translation or speech is not requested, set 'dest_lang' to "none" and 'source' to "none."
 
49
  User Query: {}
50
 
51
  Please formulate your response using the above information, ensuring it's appropriate for the grade level and subject as specified in the system prompt. If the retrieved information doesn't fully address the query, you may supplement with relevant knowledge, but clearly indicate when you're doing so."""
52
+
53
+
54
+ EXTRACT_SYS_PROMPT = """You are an extraction agent designed to retrieve relevant text content from web pages. The user has provided the following URL: {}.
55
+ Your task is to use the extracted text to help answer the user's question as accurately as possible.
56
+ Be thorough in your extraction, focusing on the main body of the content while avoiding any irrelevant information like advertisements, navigation bars, or footnotes.
57
+ If the text extracted from the webpage does not directly answer the user's question, use the extracted content as supporting information and provide an answer based on both the content and your own knowledge.
58
+ """
59
+
60
+
61
+ EXTRACT_USER_PROMPT = """
62
+ Based on the following extracted information and the user query, provide a response that is clear, concise, and directly answers the user's question.
63
+ If the extracted content from the webpage is not sufficient to fully answer the question, use your own knowledge to provide a complete and helpful response.
64
+
65
+ Retrieved Information:
66
+ {}
67
+
68
+ User Query: {}
69
+ """
scraper.py CHANGED
@@ -2,9 +2,11 @@ import asyncio
2
  import base64
3
  import io
4
  import json
 
5
  from string import ascii_lowercase
6
 
7
  import aiohttp
 
8
 
9
  from client import HybridClient
10
  from headers import random_headers
@@ -121,3 +123,26 @@ def upload_book_from_json(json_file_path):
121
 
122
  hclient.create(collection)
123
  hclient.insert(collection, chunks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import base64
3
  import io
4
  import json
5
+ import re
6
  from string import ascii_lowercase
7
 
8
  import aiohttp
9
+ from bs4 import BeautifulSoup, Comment, Declaration
10
 
11
  from client import HybridClient
12
  from headers import random_headers
 
123
 
124
  hclient.create(collection)
125
  hclient.insert(collection, chunks)
126
+
127
+
128
def is_visible_text(element, min_length=50):
    """Return True if ``element`` is a text node worth keeping.

    A node is rejected when any of the following holds, checked in order:

    * its parent tag is one of ``style``, ``script``, ``[document]``,
      ``head`` or ``title``;
    * its string form starts with an inline ``<!-- ... -->`` comment;
    * it is a ``Comment`` or ``Declaration`` node (or a subclass of either);
    * its string form is shorter than ``min_length`` characters.

    Args:
        element: A text node exposing ``.parent.name`` and ``str()``.
        min_length: Minimum length, in characters, for a node to be kept.
            Defaults to 50, the threshold this filter has always used.

    Returns:
        bool: True when the node passes every check above.
    """
    if element.parent.name in ("style", "script", "[document]", "head", "title"):
        return False

    text = str(element)
    if re.match(r"<!--.*-->", text):
        return False

    # isinstance (rather than an exact type() comparison) also rejects
    # subclasses of the comment/declaration node types.
    if isinstance(element, (Comment, Declaration)):
        return False

    return len(text) >= min_length
138
+
139
+
140
async def extract(url: str):
    """Fetch ``url`` and return the visible text of the page.

    The page is downloaded with randomized request headers and a 10-second
    total timeout, parsed with BeautifulSoup's ``html.parser``, and reduced
    to the text nodes accepted by ``is_visible_text``.

    Args:
        url: Address of the page to download.

    Returns:
        str: The accepted text fragments, each stripped of surrounding
        whitespace and separated by a newline so that words from adjacent
        elements are not fused together.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an error
            status (raised by ``raise_for_status``).
    """
    # aiohttp expects a ClientTimeout object; a bare number is legacy usage.
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=random_headers(), timeout=timeout) as response:
            response.raise_for_status()
            content = await response.read()

    # Parse after the connection has been released; only the bytes are needed.
    soup = BeautifulSoup(content, "html.parser")
    fragments = soup.find_all(string=True)
    return "\n".join(
        str(fragment).strip() for fragment in fragments if is_visible_text(fragment)
    )