import asyncio
import base64
import io
import json
import re
from string import ascii_lowercase

import aiohttp
from bs4 import BeautifulSoup, Comment, Declaration

from client import HybridClient
from headers import random_headers
from preprocessing import index_pdf

# Grades 1-12 map to the letters "a"-"l" used in NCERT textbook filename prefixes.
grade_map = ascii_lowercase[:12]

subject_map = {
    "science": "esc1",
    "geography": "ess1",
    "economics": "ess2",
    "history": "ess3",
    "politics": "ess4",
}


def get_url(grade, subject, chapter):
    # Filename pattern: class letter (a-l) + subject code + two-digit chapter number,
    # e.g. grade 10, "geography", chapter 1 -> "jess101".
    filename = grade_map[grade - 1] + subject_map[subject] + str(chapter).zfill(2)
    url = f"https://ncert.nic.in/textbook/pdf/{filename}.pdf"
    print(url)
    return url


async def get_book(grade, subject, chapters=None):
    book = {}
    if not chapters:
        chapters = range(1, 20)

    async with aiohttp.ClientSession() as session:
        print("Downloaded - ", end="")
        for i in chapters:
            url = get_url(grade, subject, i)

            pdf = await download(session, url)

            if pdf:
                collection = f"{grade}_{subject}_{i}"
                print(i, end=" ")
                book[collection] = pdf
            else:
                break
    print()
    return book


async def download(session: aiohttp.ClientSession, url: str, max_retries: int = 3) -> io.BytesIO | None:
    for attempt in range(max_retries):
        try:
            headers = {"Accept": "application/pdf"} | random_headers()
            async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=10)) as r:
                r.raise_for_status()
                content = await r.read()
                return io.BytesIO(content)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                # Exponential backoff before retrying.
                await asyncio.sleep(2 ** (attempt + 1))
            else:
                print(f"Max retries reached. Unable to download PDF from {url}")
    return None


async def upload_book(grade, subject, chapters=None):
    hclient = HybridClient()

    book = await get_book(grade, subject, chapters)
    for collection, pdf in book.items():
        print(collection)
        chunks = index_pdf(pdf, buffer=True)

        hclient.create(collection)
        hclient.insert(collection, chunks)


async def save_book_to_json(grade, subject, chapters=None):
    book = await get_book(grade, subject, chapters)
    result = {}

    for collection, pdf in book.items():
        chunks = index_pdf(pdf, buffer=True)

        serializable_chunks = []
        for chunk in chunks:
            serializable_chunk = {}
            for key, value in chunk.items():
                if isinstance(value, bytes):
                    # Mark base64-encoded bytes with a "b64:" prefix so they can be
                    # recognised and restored by upload_book_from_json.
                    serializable_chunk[key] = "b64:" + base64.b64encode(value).decode("utf-8")
                else:
                    serializable_chunk[key] = value
            serializable_chunks.append(serializable_chunk)

        result[collection] = serializable_chunks

    with open(f"{grade}_{subject}.json", "w") as f:
        json.dump(result, f)


def upload_book_from_json(json_file_path):
    hclient = HybridClient()

    with open(json_file_path, "r") as f:
        data = json.load(f)

    for collection, serialized_chunks in data.items():
        chunks = []
        for serialized_chunk in serialized_chunks:
            chunk = {}
            for key, value in serialized_chunk.items():
                if isinstance(value, str) and value.startswith("b64:"):
                    # Restore bytes values written by save_book_to_json.
                    chunk[key] = base64.b64decode(value[4:])
                else:
                    chunk[key] = value
            chunks.append(chunk)

        hclient.create(collection)
        hclient.insert(collection, chunks)


def is_visible_text(element):
    if element.parent.name in ["style", "script", "[document]", "head", "title"]:
        return False
    elif re.match("<!--.*-->", str(element)):
        return False
    elif type(element) is Comment or type(element) is Declaration:
        return False
    elif len(str(element)) < 50:
        return False
    return True


async def extract(url: str):
    async with aiohttp.ClientSession() as session:
        headers = random_headers()
        async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=10)) as r:
            r.raise_for_status()
            content = await r.read()
            texts = BeautifulSoup(content, "html.parser").find_all(string=True)
            text = "".join(filter(is_visible_text, texts))
            return text
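

# Usage sketch (illustrative, not part of the original module): the coroutines above
# are driven with asyncio.run; the grade, subject, chapter range, and JSON path below
# are placeholder values chosen for the example.
if __name__ == "__main__":
    # Download, index, and upload class 10 history chapters 1-3.
    asyncio.run(upload_book(10, "history", chapters=range(1, 4)))

    # Alternatively, persist the indexed chunks first and upload later without
    # re-downloading:
    # asyncio.run(save_book_to_json(10, "history"))
    # upload_book_from_json("10_history.json")

    # The HTML extractor can be exercised the same way (URL is a placeholder):
    # print(asyncio.run(extract("https://example.com/article.html")))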