File size: 3,636 Bytes
76bc633
 
d8f18fa
76bc633
d8f18fa
 
 
 
333a935
76bc633
333a935
 
76bc633
d8f18fa
 
 
 
 
 
 
 
 
 
 
76bc633
d8f18fa
76bc633
d8f18fa
 
 
76bc633
333a935
76bc633
 
 
d8f18fa
76bc633
 
 
d8f18fa
76bc633
d8f18fa
 
76bc633
 
333a935
d8f18fa
 
76bc633
d8f18fa
 
 
76bc633
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8f18fa
 
76bc633
 
d8f18fa
2468331
76bc633
 
 
333a935
76bc633
 
333a935
 
76bc633
 
 
 
333a935
76bc633
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333a935
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import asyncio
import base64
import io
import json
from string import ascii_lowercase

import aiohttp

from client import HybridClient
from headers import random_headers
from preprocessing import index_pdf

# Grade letters used in file names: index 0 ("a") is grade 1, up to index 11 ("l").
grade_map = ascii_lowercase[:12]

# Subject name -> subject code inserted into the PDF file name.
subject_map = dict(
    science="esc1",
    geography="ess1",
    economics="ess2",
    history="ess3",
    politics="ess4",
)


def get_url(grade, subject, chapter):
    """Build the textbook PDF URL for one chapter.

    The file name is the grade letter from ``grade_map`` (grade 1 is the
    first letter), followed by the subject code from ``subject_map`` and the
    chapter number zero-padded to two digits.

    Args:
        grade: 1-based grade number.
        subject: A key of ``subject_map``.
        chapter: Chapter number.

    Returns:
        The URL string. The URL is also printed to stdout.

    Raises:
        IndexError: If ``grade`` is outside ``1..len(grade_map)``.
        KeyError: If ``subject`` is not a key of ``subject_map``.
    """
    # Without this check, grade <= 0 would wrap around through negative
    # indexing and silently produce the URL of a different grade.
    if not 1 <= grade <= len(grade_map):
        raise IndexError(f"grade must be between 1 and {len(grade_map)}, got {grade!r}")

    filename = grade_map[grade - 1] + subject_map[subject] + str(chapter).zfill(2)
    url = f"https://ncert.nic.in/textbook/pdf/{filename}.pdf"
    print(url)
    return url


async def get_book(grade, subject, chapters=None):
    """Download the chapters of one book and return them keyed by collection name.

    Args:
        grade: Grade number, forwarded to ``get_url``.
        subject: Subject name, forwarded to ``get_url``.
        chapters: Iterable of chapter numbers. When it is falsy (``None`` or
            empty), chapters 1 through 19 are tried.

    Returns:
        A dict mapping ``"{grade}_{subject}_{chapter}"`` to whatever
        ``download`` returned for that chapter. Downloading stops at the
        first chapter for which ``download`` yields a falsy result, so later
        chapters are not attempted.
    """
    chapter_numbers = chapters or range(1, 20)
    downloaded = {}

    async with aiohttp.ClientSession() as http:
        print("Downloaded - ", end="")
        for number in chapter_numbers:
            pdf = await download(http, get_url(grade, subject, number))
            if not pdf:
                # A failed chapter ends the run; the remaining ones are skipped.
                break

            name = f"{grade}_{subject}_{number}"
            print(number, end="")
            downloaded[name] = pdf

    # Terminate the progress line started above.
    print()
    return downloaded


async def download(session: aiohttp.ClientSession, url: str, max_retries: int = 3) -> io.BytesIO | None:
    """Fetch ``url`` and return the response body as an in-memory buffer.

    Up to ``max_retries`` attempts are made. After a failed attempt that is
    not the last one, the coroutine sleeps ``2 ** attempt_number`` seconds
    (2, 4, ...) before trying again.

    Args:
        session: aiohttp session used to issue the GET request.
        url: Address to download.
        max_retries: Maximum number of attempts.

    Returns:
        An ``io.BytesIO`` holding the body of the first successful response,
        or ``None`` when every attempt failed (or ``max_retries`` is below 1).
    """
    timeout = aiohttp.ClientTimeout(total=10)

    for attempt in range(max_retries):
        try:
            # Headers are rebuilt on every attempt so random_headers() is
            # consulted again for each request.
            headers = {"Accept": "application/pdf"} | random_headers()
            async with session.get(url, headers=headers, timeout=timeout) as r:
                r.raise_for_status()
                content = await r.read()
                return io.BytesIO(content)
        except Exception as e:
            # Deliberately broad: any failure counts as a failed attempt and
            # is reported instead of being propagated to the caller.
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                await asyncio.sleep(2 ** (attempt + 1))
            else:
                print(f"Max retries reached. Unable to download PDF from {url}")

    # Only reached once all attempts are exhausted. This return must stay
    # outside the loop, otherwise the first failure would end the retries.
    return None


async def upload_book(grade, subject, chapters=None):
    """Download a book, index each chapter and insert it into its own collection.

    Args:
        grade: Grade number, forwarded to ``get_book``.
        subject: Subject name, forwarded to ``get_book``.
        chapters: Optional chapter selection, forwarded to ``get_book``.

    For every downloaded chapter the collection name is printed, the PDF is
    passed to ``index_pdf(..., buffer=True)``, and the resulting chunks are
    inserted into a collection created under that name.
    """
    client = HybridClient()
    downloaded = await get_book(grade, subject, chapters)

    for name in downloaded:
        print(name)
        indexed = index_pdf(downloaded[name], buffer=True)

        # The collection is created only after indexing has succeeded.
        client.create(name)
        client.insert(name, indexed)


def _json_safe(value):
    """Return ``value`` unchanged unless it is ``bytes``, which becomes a ``"b'<base64>'"`` string."""
    if isinstance(value, bytes):
        encoded = base64.b64encode(value).decode("ascii")
        return f"b'{encoded}'"
    return value


async def save_book_to_json(grade, subject, chapters=None):
    """Download a book, index every chapter and write the chunks to a JSON file.

    The file is named ``{grade}_{subject}.json`` and maps each collection
    name to its list of chunk dicts.

    ``bytes`` values cannot be stored in JSON directly, so they are written
    as ``"b'<base64>'"`` strings. The ``b'...'`` wrapper is the marker that
    ``upload_book_from_json`` looks for when turning such values back into
    bytes; bare base64 text would be loaded back as an ordinary string.

    Args:
        grade: Grade number, forwarded to ``get_book``.
        subject: Subject name, forwarded to ``get_book``.
        chapters: Optional chapter selection, forwarded to ``get_book``.
    """
    book = await get_book(grade, subject, chapters)
    result = {}

    for collection, pdf in book.items():
        chunks = index_pdf(pdf, buffer=True)

        serializable_chunks = []
        for chunk in chunks:
            serializable_chunks.append({key: _json_safe(value) for key, value in chunk.items()})

        result[collection] = serializable_chunks

    with open(f"{grade}_{subject}.json", "w") as f:
        json.dump(result, f)


def upload_book_from_json(json_file_path):
    """Read chunks from a JSON file and insert them into their collections.

    The file is expected to hold an object mapping collection names to lists
    of chunk dicts. String values of the form ``"b'<payload>'"`` have the
    payload base64-decoded into ``bytes``; every other value is passed
    through unchanged.

    Args:
        json_file_path: Path of the JSON file to load.
    """
    client = HybridClient()

    with open(json_file_path, "r") as handle:
        payload = json.load(handle)

    def restore(raw):
        # Only strings wrapped in b'...' are treated as base64 data.
        marked = isinstance(raw, str) and raw.startswith("b'") and raw.endswith("'")
        return base64.b64decode(raw[2:-1]) if marked else raw

    for name, stored_chunks in payload.items():
        # Rebuild all chunks first so a decoding error surfaces before the
        # collection is created.
        rebuilt = [
            {field: restore(raw) for field, raw in stored.items()}
            for stored in stored_chunks
        ]

        client.create(name)
        client.insert(name, rebuilt)