# agentic_rag / scraper.py
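"""Scraper for NCERT textbooks and web pages: downloads chapter PDFs,
chunks and indexes them into a hybrid vector store, and extracts the
visible text from HTML pages."""
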
import asyncio
import base64
import io
import json
from string import ascii_lowercase
import aiohttp
from bs4 import BeautifulSoup, Comment, Declaration
from client import HybridClient
from headers import random_headers
from preprocessing import index_pdf
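
# Grades 1-12 map to the one-letter prefixes "a"-"l" used in NCERT PDF filenames.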
grade_map = ascii_lowercase[:12]
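# Subject codes that form the middle part of NCERT textbook filenames.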
subject_map = {
"science": "esc1",
"geography": "ess1",
"economics": "ess2",
"history": "ess3",
"politics": "ess4",
}

def get_url(grade, subject, chapter):
    """Build the NCERT textbook PDF URL for a grade, subject, and chapter."""
    filename = grade_map[grade - 1] + subject_map[subject] + str(chapter).zfill(2)
    url = f"https://ncert.nic.in/textbook/pdf/{filename}.pdf"
    print(url)
    return url
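
# Usage example: get_url(10, "science", 1) -> https://ncert.nic.in/textbook/pdf/jesc101.pdf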

async def get_book(grade, subject, chapters=None):
    """Download a textbook's chapters; a failed download is taken to mean the book has ended."""
    book = {}
    if not chapters:
        chapters = range(1, 20)
    async with aiohttp.ClientSession() as session:
        print("Downloaded - ", end="")
        for i in chapters:
            url = get_url(grade, subject, i)
            pdf = await download(session, url)
            if pdf:
                collection = f"{grade}_{subject}_{i}"
                print(i, end=" ")
                book[collection] = pdf
            else:
                break
        print()
    return book

async def download(session: aiohttp.ClientSession, url: str, max_retries: int = 3) -> io.BytesIO | None:
    """Fetch a PDF with retries and exponential backoff; return None on failure."""
    for attempt in range(max_retries):
        try:
            headers = {"Accept": "application/pdf"} | random_headers()
            timeout = aiohttp.ClientTimeout(total=10)
            async with session.get(url, headers=headers, timeout=timeout) as r:
                r.raise_for_status()
                content = await r.read()
                return io.BytesIO(content)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                # Exponential backoff: sleep 2s, then 4s, before retrying.
                await asyncio.sleep(2 ** (attempt + 1))
            else:
                print(f"Max retries reached. Unable to download PDF from {url}")
    return None

async def upload_book(grade, subject, chapters=None):
    """Download a book and index each chapter's chunks into its own collection."""
    hclient = HybridClient()
    book = await get_book(grade, subject, chapters)
    for collection, pdf in book.items():
        print(collection)
        chunks = index_pdf(pdf, buffer=True)
        hclient.create(collection)
        hclient.insert(collection, chunks)

async def save_book_to_json(grade, subject, chapters=None):
    """Download and chunk a book, then dump the chunks to a JSON file."""
    book = await get_book(grade, subject, chapters)
    result = {}
    for collection, pdf in book.items():
        chunks = index_pdf(pdf, buffer=True)
        serializable_chunks = []
        for chunk in chunks:
            serializable_chunk = {}
            for key, value in chunk.items():
                if isinstance(value, bytes):
                    # Tag base64-encoded bytes so they can be restored on load.
                    serializable_chunk[key] = "base64:" + base64.b64encode(value).decode("utf-8")
                else:
                    serializable_chunk[key] = value
            serializable_chunks.append(serializable_chunk)
        result[collection] = serializable_chunks
    with open(f"{grade}_{subject}.json", "w") as f:
        json.dump(result, f)

def upload_book_from_json(json_file_path):
    """Load chunks saved by save_book_to_json and index them."""
    hclient = HybridClient()
    with open(json_file_path, "r") as f:
        data = json.load(f)
    for collection, serialized_chunks in data.items():
        chunks = []
        for serialized_chunk in serialized_chunks:
            chunk = {}
            for key, value in serialized_chunk.items():
                # Restore bytes values tagged with the base64 prefix during serialization.
                if isinstance(value, str) and value.startswith("base64:"):
                    chunk[key] = base64.b64decode(value[len("base64:"):])
                else:
                    chunk[key] = value
            chunks.append(chunk)
        hclient.create(collection)
        hclient.insert(collection, chunks)

def is_visible_text(element):
    """Keep only substantive text nodes from rendered page content."""
    # Skip text inside non-content tags.
    if element.parent.name in ["style", "script", "[document]", "head", "title"]:
        return False
    # Skip comments and doctype declarations.
    if isinstance(element, (Comment, Declaration)):
        return False
    # Skip short fragments such as menu labels and stray whitespace.
    if len(str(element)) < 50:
        return False
    return True

async def extract(url: str):
    """Fetch a page and return its visible text."""
    async with aiohttp.ClientSession() as session:
        headers = random_headers()
        timeout = aiohttp.ClientTimeout(total=10)
        async with session.get(url, headers=headers, timeout=timeout) as r:
            r.raise_for_status()
            content = await r.read()
    texts = BeautifulSoup(content, "html.parser").find_all(string=True)
    return "".join(filter(is_visible_text, texts))
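
# Minimal usage sketch (an assumed entry point; the grade/subject/chapter
# values are illustrative, not part of the original module):
if __name__ == "__main__":
    # Download, chunk, and index chapter 1 of the grade 10 science textbook.
    asyncio.run(upload_book(10, "science", chapters=[1]))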