Spaces:
Runtime error
Runtime error
File size: 1,606 Bytes
d8f18fa 333a935 d8f18fa 333a935 d8f18fa 333a935 d8f18fa 333a935 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import asyncio
import io
from string import ascii_lowercase

import aiohttp

from client import HybridClient
from preprocessing import index_pdf
# Letter prefixes for textbook PDF filenames: the first ten lowercase
# letters ("a" through "j"), indexed by the grade value given to get_url.
grade_map = ascii_lowercase[:10]

# Subject name -> subject code embedded in the textbook PDF filename.
subject_map = dict(
    science="esc1",
    geography="ess1",
    economics="ess2",
    history="ess3",
    politics="ess4",
)
def get_url(grade, subject, chapter):
    """Build the ncert.nic.in PDF URL for one chapter of a textbook.

    The filename is the grade letter, the subject code and the chapter
    number zero-padded to two digits, concatenated in that order.

    Args:
        grade: Index into ``grade_map`` selecting the grade letter.
        subject: Key of ``subject_map`` (e.g. ``"science"``).
        chapter: Chapter number; converted with ``str`` and zero-padded.

    Returns:
        The chapter's PDF URL as a string.

    Raises:
        IndexError: If ``grade`` is outside the range of ``grade_map``.
        KeyError: If ``subject`` is not a key of ``subject_map``.
    """
    # NOTE(review): grade is used as a 0-based index here, so a 1-based
    # grade of 10 would raise IndexError -- confirm what callers pass.
    grade_code = grade_map[grade]
    subject_code = subject_map[subject]
    chapter_code = str(chapter).zfill(2)
    stem = grade_code + subject_code + chapter_code
    return f"https://ncert.nic.in/textbook/pdf/{stem}.pdf"
async def get_book(grade, subject):
    """Download a textbook chapter by chapter until a chapter is unavailable.

    Chapters are requested in order starting from 1. The loop stops at the
    first chapter for which ``download`` returns ``None``.

    Args:
        grade: Grade value forwarded to ``get_url``.
        subject: Subject name forwarded to ``get_url``.

    Returns:
        A dict mapping the collection name ``f"{grade}_{subject}"`` to the
        downloaded PDF buffer; empty if the first chapter is unavailable.
    """
    book = {}
    collection = f"{grade}_{subject}"
    chapter_num = 1
    async with aiohttp.ClientSession() as session:
        while True:
            url = get_url(grade, subject, chapter_num)
            # download is a coroutine function: without ``await`` the result
            # is a coroutine object, which is always truthy.
            pdf = await download(session, url)
            if pdf is None:
                break
            # NOTE(review): every chapter is stored under the same key, so
            # only the last downloaded chapter is kept. Left as-is because
            # changing the key would change the collection names callers
            # see -- confirm the intended behavior.
            book[collection] = pdf
            # Advance to the next chapter; otherwise the same URL would be
            # requested forever.
            chapter_num += 1
    return book
async def download(session, url):
    """Fetch ``url`` through ``session`` and return its body in memory.

    The response body is read in chunks of up to 1,000,000 bytes and
    collected into a buffer that is rewound to the start before returning.

    Args:
        session: Object whose ``get(url, timeout=...)`` returns an async
            context manager yielding the response.
        url: Address of the file to download.

    Returns:
        An ``io.BytesIO`` positioned at offset 0, or ``None`` if any
        exception was raised (the error is printed, not re-raised).
    """
    try:
        async with session.get(url, timeout=10) as response:
            # A non-success status raises here and is reported below.
            response.raise_for_status()
            buffer = io.BytesIO()
            body = response.content
            async for piece in body.iter_chunked(1_000_000):
                buffer.write(piece)
            buffer.seek(0)
            return buffer
    except Exception as exc:
        # Best effort: any failure is reported and signalled with None.
        print(f"Error downloading or processing PDF: {exc}")
    return None
def upload_book(grade, subject):
    """Download a textbook, index it, and store the chunks via HybridClient.

    For every ``(collection, pdf)`` pair returned by ``get_book``, the PDF is
    passed to ``index_pdf``, the collection is created, and the resulting
    chunks are inserted into it.

    Args:
        grade: Grade value forwarded to ``get_book``.
        subject: Subject name forwarded to ``get_book``.

    Raises:
        RuntimeError: If called while an event loop is already running in
            this thread (raised by ``asyncio.run``).
    """
    hclient = HybridClient()
    # get_book is a coroutine function; calling it only creates a coroutine
    # object, so it has to be run to completion to obtain the dict.
    book = asyncio.run(get_book(grade, subject))
    for collection, pdf in book.items():
        chunks = index_pdf(pdf)
        hclient.create(collection)
        hclient.insert(collection, chunks)
|