# agentic_rag / scraper.py
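"""Scraper for NCERT textbooks and web pages: downloads chapter PDFs,
chunks and indexes them into a hybrid vector store, and extracts the
visible text from HTML pages."""
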
import asyncio
import base64
import io
import json
from string import ascii_lowercase
import aiohttp
from bs4 import BeautifulSoup, Comment, Declaration
from client import HybridClient
from headers import random_headers
from preprocessing import index_pdf
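
# Grades 1-12 map to the one-letter prefixes "a"-"l" used in NCERT PDF filenames.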
grade_map = ascii_lowercase[:12]
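# Subject codes that form the middle part of NCERT textbook filenames.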
subject_map = {
"science": "esc1",
"geography": "ess1",
"economics": "ess2",
"history": "ess3",
"politics": "ess4",
}

def get_url(grade, subject, chapter):
    """Build the NCERT textbook PDF URL for a grade, subject, and chapter."""
    filename = grade_map[grade - 1] + subject_map[subject] + str(chapter).zfill(2)
    url = f"https://ncert.nic.in/textbook/pdf/{filename}.pdf"
    print(url)
    return url
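
# Usage example: get_url(10, "science", 1) -> https://ncert.nic.in/textbook/pdf/jesc101.pdf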

async def get_book(grade, subject, chapters=None):
    """Download a textbook's chapters; a failed download is taken to mean the book has ended."""
    book = {}
    if not chapters:
        chapters = range(1, 20)
    async with aiohttp.ClientSession() as session:
        print("Downloaded - ", end="")
        for i in chapters:
            url = get_url(grade, subject, i)
            pdf = await download(session, url)
            if pdf:
                collection = f"{grade}_{subject}_{i}"
                print(i, end=" ")
                book[collection] = pdf
            else:
                break
        print()
    return book

async def download(session: aiohttp.ClientSession, url: str, max_retries: int = 3) -> io.BytesIO | None:
    """Fetch a PDF with retries and exponential backoff; return None on failure."""
    for attempt in range(max_retries):
        try:
            headers = {"Accept": "application/pdf"} | random_headers()
            timeout = aiohttp.ClientTimeout(total=10)
            async with session.get(url, headers=headers, timeout=timeout) as r:
                r.raise_for_status()
                content = await r.read()
                return io.BytesIO(content)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                # Exponential backoff: sleep 2s, then 4s, before retrying.
                await asyncio.sleep(2 ** (attempt + 1))
            else:
                print(f"Max retries reached. Unable to download PDF from {url}")
    return None

async def upload_book(grade, subject, chapters=None):
    """Download a book and index each chapter's chunks into its own collection."""
    hclient = HybridClient()
    book = await get_book(grade, subject, chapters)
    for collection, pdf in book.items():
        print(collection)
        chunks = index_pdf(pdf, buffer=True)
        hclient.create(collection)
        hclient.insert(collection, chunks)

async def save_book_to_json(grade, subject, chapters=None):
    """Download and chunk a book, then dump the chunks to a JSON file."""
    book = await get_book(grade, subject, chapters)
    result = {}
    for collection, pdf in book.items():
        chunks = index_pdf(pdf, buffer=True)
        serializable_chunks = []
        for chunk in chunks:
            serializable_chunk = {}
            for key, value in chunk.items():
                if isinstance(value, bytes):
                    # Tag base64-encoded bytes so they can be restored on load.
                    serializable_chunk[key] = "base64:" + base64.b64encode(value).decode("utf-8")
                else:
                    serializable_chunk[key] = value
            serializable_chunks.append(serializable_chunk)
        result[collection] = serializable_chunks
    with open(f"{grade}_{subject}.json", "w") as f:
        json.dump(result, f)

def upload_book_from_json(json_file_path):
    """Load chunks saved by save_book_to_json and index them."""
    hclient = HybridClient()
    with open(json_file_path, "r") as f:
        data = json.load(f)
    for collection, serialized_chunks in data.items():
        chunks = []
        for serialized_chunk in serialized_chunks:
            chunk = {}
            for key, value in serialized_chunk.items():
                # Restore bytes values tagged with the base64 prefix during serialization.
                if isinstance(value, str) and value.startswith("base64:"):
                    chunk[key] = base64.b64decode(value[len("base64:"):])
                else:
                    chunk[key] = value
            chunks.append(chunk)
        hclient.create(collection)
        hclient.insert(collection, chunks)

def is_visible_text(element):
    """Keep only substantive text nodes from rendered page content."""
    # Skip text inside non-content tags.
    if element.parent.name in ["style", "script", "[document]", "head", "title"]:
        return False
    # Skip comments and doctype declarations.
    if isinstance(element, (Comment, Declaration)):
        return False
    # Skip short fragments such as menu labels and stray whitespace.
    if len(str(element)) < 50:
        return False
    return True

async def extract(url: str):
    """Fetch a page and return its visible text."""
    async with aiohttp.ClientSession() as session:
        headers = random_headers()
        timeout = aiohttp.ClientTimeout(total=10)
        async with session.get(url, headers=headers, timeout=timeout) as r:
            r.raise_for_status()
            content = await r.read()
    texts = BeautifulSoup(content, "html.parser").find_all(string=True)
    return "".join(filter(is_visible_text, texts))
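
# Minimal usage sketch (an assumed entry point; the grade/subject/chapter
# values are illustrative, not part of the original module):
if __name__ == "__main__":
    # Download, chunk, and index chapter 1 of the grade 10 science textbook.
    asyncio.run(upload_book(10, "science", chapters=[1]))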