import asyncio
import base64
import io
import json
import re
from string import ascii_lowercase

import aiohttp
from bs4 import BeautifulSoup, Comment, Declaration

from client import HybridClient
from headers import random_headers
from preprocessing import index_pdf

# Grades 1-12 map to the letters "a"-"l" used in NCERT textbook filename prefixes.
grade_map = ascii_lowercase[:12]

subject_map = {
    "science": "esc1",
    "geography": "ess1",
    "economics": "ess2",
    "history": "ess3",
    "politics": "ess4",
}


def get_url(grade, subject, chapter):
    # Filename pattern: class letter (a-l) + subject code + two-digit chapter number,
    # e.g. grade 10, "geography", chapter 1 -> "jess101".
    filename = grade_map[grade - 1] + subject_map[subject] + str(chapter).zfill(2)
    url = f"https://ncert.nic.in/textbook/pdf/{filename}.pdf"
    print(url)
    return url


async def get_book(grade, subject, chapters=None):
    book = {}
    if not chapters:
        chapters = range(1, 20)

    async with aiohttp.ClientSession() as session:
        print("Downloaded - ", end="")
        for i in chapters:
            url = get_url(grade, subject, i)

            pdf = await download(session, url)

            if pdf:
                collection = f"{grade}_{subject}_{i}"
                print(i, end=" ")
                book[collection] = pdf
            else:
                break
    print()
    return book


async def download(session: aiohttp.ClientSession, url: str, max_retries: int = 3) -> io.BytesIO | None:
    for attempt in range(max_retries):
        try:
            headers = {"Accept": "application/pdf"} | random_headers()
            async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=10)) as r:
                r.raise_for_status()
                content = await r.read()
                return io.BytesIO(content)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                # Exponential backoff before retrying.
                await asyncio.sleep(2 ** (attempt + 1))
            else:
                print(f"Max retries reached. Unable to download PDF from {url}")
    return None


async def upload_book(grade, subject, chapters=None):
    hclient = HybridClient()

    book = await get_book(grade, subject, chapters)
    for collection, pdf in book.items():
        print(collection)
        chunks = index_pdf(pdf, buffer=True)

        hclient.create(collection)
        hclient.insert(collection, chunks)


async def save_book_to_json(grade, subject, chapters=None):
    book = await get_book(grade, subject, chapters)
    result = {}

    for collection, pdf in book.items():
        chunks = index_pdf(pdf, buffer=True)

        serializable_chunks = []
        for chunk in chunks:
            serializable_chunk = {}
            for key, value in chunk.items():
                if isinstance(value, bytes):
                    # Mark base64-encoded bytes with a "b64:" prefix so they can be
                    # recognised and restored by upload_book_from_json.
                    serializable_chunk[key] = "b64:" + base64.b64encode(value).decode("utf-8")
                else:
                    serializable_chunk[key] = value
            serializable_chunks.append(serializable_chunk)

        result[collection] = serializable_chunks

    with open(f"{grade}_{subject}.json", "w") as f:
        json.dump(result, f)


def upload_book_from_json(json_file_path):
    hclient = HybridClient()

    with open(json_file_path, "r") as f:
        data = json.load(f)

    for collection, serialized_chunks in data.items():
        chunks = []
        for serialized_chunk in serialized_chunks:
            chunk = {}
            for key, value in serialized_chunk.items():
                if isinstance(value, str) and value.startswith("b64:"):
                    # Restore bytes values written by save_book_to_json.
                    chunk[key] = base64.b64decode(value[4:])
                else:
                    chunk[key] = value
            chunks.append(chunk)

        hclient.create(collection)
        hclient.insert(collection, chunks)


def is_visible_text(element):
    if element.parent.name in ["style", "script", "[document]", "head", "title"]:
        return False
    elif re.match("<!--.*-->", str(element)):
        return False
    elif type(element) is Comment or type(element) is Declaration:
        return False
    elif len(str(element)) < 50:
        return False
    return True


async def extract(url: str):
    async with aiohttp.ClientSession() as session:
        headers = random_headers()
        async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=10)) as r:
            r.raise_for_status()
            content = await r.read()
            texts = BeautifulSoup(content, "html.parser").find_all(string=True)
            text = "".join(filter(is_visible_text, texts))
            return text
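

# Usage sketch (illustrative, not part of the original module): the coroutines above
# are driven with asyncio.run; the grade, subject, chapter range, and JSON path below
# are placeholder values chosen for the example.
if __name__ == "__main__":
    # Download, index, and upload class 10 history chapters 1-3.
    asyncio.run(upload_book(10, "history", chapters=range(1, 4)))

    # Alternatively, persist the indexed chunks first and upload later without
    # re-downloading:
    # asyncio.run(save_book_to_json(10, "history"))
    # upload_book_from_json("10_history.json")

    # The HTML extractor can be exercised the same way (URL is a placeholder):
    # print(asyncio.run(extract("https://example.com/article.html")))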