omkar334 committed on
Commit
d8f18fa
1 Parent(s): df32b2c
Files changed (1) hide show
  1. scraper.py +54 -0
scraper.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ from string import ascii_lowercase
3
+
4
+ import aiohttp
5
+
6
# Characters looked up by grade when get_url builds a PDF filename:
# the first ten lowercase ASCII letters, "a" through "j".
grade_map = ascii_lowercase[0:10]

# Subject name -> code fragment that get_url places right after the grade
# character in the PDF filename.
subject_map = dict(
    science="esc1",
    geography="ess1",
    economics="ess2",
    history="ess3",
    politics="ess4",
)
15
+
16
+
17
def get_url(grade, subject, chapter):
    """Build the textbook PDF URL for a single chapter.

    The filename is the ``grade_map`` character at index ``grade``, then the
    ``subject_map`` code for ``subject``, then the chapter number left-padded
    with zeros to at least two digits.

    NOTE(review): ``grade`` indexes ``grade_map`` directly, so 0 selects "a".
    Confirm that callers pass the index they intend.

    Raises:
        IndexError: if ``grade`` is outside the range of ``grade_map``.
        KeyError: if ``subject`` is not a key of ``subject_map``.
    """
    grade_letter = grade_map[grade]
    subject_code = subject_map[subject]
    chapter_code = str(chapter).zfill(2)
    return (
        "https://ncert.nic.in/textbook/pdf/"
        f"{grade_letter}{subject_code}{chapter_code}.pdf"
    )
21
+
22
+
23
async def get_book(grade, subject):
    """Download every chapter of a book in order, starting at chapter 1.

    Chapters are requested one at a time until ``download`` returns ``None``
    for a chapter, which is treated as the end of the book.

    Args:
        grade: Index into ``grade_map``; passed through to ``get_url``.
        subject: Key of ``subject_map``; passed through to ``get_url``.

    Returns:
        A list of ``(pdf, grade, subject)`` tuples, one per downloaded
        chapter, where ``pdf`` is the stream returned by ``download``.
    """
    book = []
    chapter_num = 1
    async with aiohttp.ClientSession() as session:
        while True:
            url = get_url(grade, subject, chapter_num)

            # download() is a coroutine function. Without ``await`` the result
            # is an always-truthy coroutine object, so the loop could never
            # detect a failed download and would never terminate.
            pdf = await download(session, url)
            if pdf is None:
                break

            book.append((pdf, grade, subject))
            # Move on to the next chapter; otherwise chapter 1 is fetched
            # over and over.
            chapter_num += 1
    return book
38
+
39
+
40
async def download(session, url):
    """Fetch ``url`` and return the response body as an in-memory stream.

    Args:
        session: Client session used to issue the GET request; ``get_book``
            passes an ``aiohttp.ClientSession``.
        url: Address of the PDF to fetch.

    Returns:
        An ``io.BytesIO`` rewound to offset 0 that holds the response body,
        or ``None`` if anything goes wrong while requesting or reading it
        (including an error status rejected by ``raise_for_status``).
    """
    # aiohttp's request API takes a ClientTimeout; a bare number is only kept
    # as a backward-compatible shorthand for ``total=...``.
    timeout = aiohttp.ClientTimeout(total=10)

    try:
        async with session.get(url, timeout=timeout) as r:
            r.raise_for_status()

            pdf_content = io.BytesIO()
            # Read the body in ~1 MB pieces rather than in one call.
            async for chunk in r.content.iter_chunked(1_000_000):
                pdf_content.write(chunk)
    except Exception as e:
        # Deliberately broad: callers rely on ``None`` as the "no chapter
        # here" signal, so any failure is reported and turned into ``None``.
        print(f"Error downloading or processing PDF: {e}")
        return None

    pdf_content.seek(0)
    return pdf_content