ireneng committed
Commit 4692739
1 Parent(s): 4ed6507

change tokenizer and model instance to load summarizer model
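For context on this change: AutoTokenizer and AutoModelForSeq2SeqLM read the checkpoint's config and resolve to the matching concrete classes, so the architecture-specific Pegasus imports are no longer needed. A minimal sketch of the new loading path, assuming only the checkpoint name taken from the diff (the sample text is invented):

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# The Auto classes dispatch on the checkpoint's config, returning the same
# tokenizer/model that the explicit Pegasus imports named before.
tokenizer = AutoTokenizer.from_pretrained("pszemraj/pegasus-x-large-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/pegasus-x-large-book-summary")

# Invented sample input, just to exercise the summarization path end to end.
text = "A sweeping novel that follows three generations of a family on the California coast."
inputs = tokenizer(text, truncation=True, return_tensors="pt")
summary_ids = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])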

Files changed (1)
  1. app.py +106 -97
app.py CHANGED
@@ -4,123 +4,132 @@ import torch
  from torch import nn
  import pickle
  import pandas as pd
- from transformers import PegasusForConditionalGeneration, PegasusTokenizer
  import sentencepiece
  import string
  import requests
 
  @st.cache_resource
  def load_stuff():
-     # Load the data loader
-     dls= pd.read_pickle('dataloader.pkl')
-     # Create an instance of the model
-     learn = collab_learner(dls, use_nn=True,layers=[20,10],y_range=(0,10.5))
-     # Load the saved state dictionary
-     state_dict = torch.load('myModel.pth',map_location=torch.device('cpu'))
-     # Assign the loaded state dictionary to the model's load_state_dict() method
-     learn.model.load_state_dict(state_dict)
-     #load books dataframe
-     books = pd.read_csv('./data/BX_Books.csv', sep=';',encoding='latin-1')
-     #load tokenizer
-     tokenizer = PegasusTokenizer.from_pretrained("pszemraj/pegasus-x-large-book-summary")
-     #load model
-     model = PegasusForConditionalGeneration.from_pretrained("pszemraj/pegasus-x-large-book-summary")
-     return dls, learn, books, tokenizer, model
 
  dls, learn, books, tokenizer, model = load_stuff()
 
- #function to get recommendations
  def get_3_recs(book):
-     book_factors = learn.model.embeds[1].weight
-     idx = dls.classes['title'].o2i[book]
-     distances = nn.CosineSimilarity(dim=1)(book_factors, book_factors[idx][None])
-     idxs = distances.argsort(descending=True)[1:4]
-     recs = [dls.classes['title'][i] for i in idxs]
-     return recs
-
- #function to get descriptions from Google Books
  def search_book_description(title):
-     # Google Books API endpoint for book search
-     url = "https://www.googleapis.com/books/v1/volumes"
-     # Parameters for the book search
-     params = {
-         "q": title,
-         "maxResults": 1
-     }
-     # Send GET request to Google Books API
-     response = requests.get(url, params=params)
-     # Check if the request was successful
-     if response.status_code == 200:
-         # Parse the JSON response to extract the book description
-         data = response.json()
-
-         if "items" in data and len(data["items"]) > 0:
-             book_description = data["items"][0]["volumeInfo"].get("description", "No description available.")
-             return book_description
-         else:
-             print("No book found with the given title.")
              return None
-     else:
-         # If the request failed, print the error message
-         print("Error:", response.status_code, response.text)
-         return None
-
- #function to ensure summaries end with punctuation
  def cut(sum):
      last_punc_idx = max(sum.rfind(p) for p in string.punctuation)
-     output = sum[:last_punc_idx + 1]
      return output
 
 
- #function to summarize
  def summarize(des_list):
      if "No description available." in des_list:
-         idx = des_list.index("No description available.")
-         des = des_list.copy()
-         des.pop(idx)
-         rest = summarize(des)
-         rest.insert(idx,'No description available.')
-         return rest
-     else:
-         # Tokenize all the descriptions in the list
-         encoded_inputs = tokenizer(des_list, truncation=True, padding="longest", return_tensors="pt")
-
-         # Generate summaries for all the inputs
-         summaries = model.generate(**encoded_inputs, max_new_tokens=100)
-
-         # Decode the summaries and process them
-         outputs = tokenizer.batch_decode(summaries, skip_special_tokens=True)
-         outputs = list(map(cut, outputs))
-         return outputs
-
- #function to get cover images
  def get_covers(recs):
-     imgs = [books[books['Book-Title']==r]['Image-URL-L'].tolist()[0]for r in recs]
-     return imgs
 
- #streamlit app construction
- st.title('Your digital librarian')
- st.markdown("Hi there! I recommend you books based on one you love (which might not be in the same genre because that's boring) and give you my own synopsis of each book. Enjoy!")
  options = books["Book-Title"].tolist()
- input = st.selectbox('Select your favorite book', options)
  if st.button("Get recommendations"):
-     recs = get_3_recs(input)
-     descriptions = list(map(search_book_description,recs))
-     des_sums = summarize(descriptions)
-     imgs = get_covers(recs)
-
-     col1, col2, col3 = st.columns(3)
-     col1.image(imgs[0])
-     col1.markdown(f"**{recs[0]}**")
-     col1.write(des_sums[0])
-
-     col2.image(imgs[1])
-     col2.markdown(f"**{recs[1]}**")
-     col2.write(des_sums[1])
-
-     col3.image(imgs[2])
-     col3.markdown(f"**{recs[2]}**")
-     col3.write(des_sums[2])
-
-
-

  from torch import nn
  import pickle
  import pandas as pd
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  import sentencepiece
  import string
  import requests
 
+
  @st.cache_resource
  def load_stuff():
+     # Load the data loader
+     dls = pd.read_pickle("dataloader.pkl")
+     # Create an instance of the model
+     learn = collab_learner(dls, use_nn=True, layers=[20, 10], y_range=(0, 10.5))
+     # Load the saved state dictionary
+     state_dict = torch.load("myModel.pth", map_location=torch.device("cpu"))
+     # Assign the loaded state dictionary to the model's load_state_dict() method
+     learn.model.load_state_dict(state_dict)
+     # load books dataframe
+     books = pd.read_csv("./data/BX_Books.csv", sep=";", encoding="latin-1")
+     # load tokenizer
+     tokenizer = AutoTokenizer.from_pretrained("pszemraj/pegasus-x-large-book-summary")
+     # load model
+     model = AutoModelForSeq2SeqLM.from_pretrained(
+         "pszemraj/pegasus-x-large-book-summary"
+     )
+     return dls, learn, books, tokenizer, model
+
 
  dls, learn, books, tokenizer, model = load_stuff()
 
+
+ # function to get recommendations
  def get_3_recs(book):
+     book_factors = learn.model.embeds[1].weight
+     idx = dls.classes["title"].o2i[book]
+     distances = nn.CosineSimilarity(dim=1)(book_factors, book_factors[idx][None])
+     idxs = distances.argsort(descending=True)[1:4]
+     recs = [dls.classes["title"][i] for i in idxs]
+     return recs
+
+
+ # function to get descriptions from Google Books
  def search_book_description(title):
+     # Google Books API endpoint for book search
+     url = "https://www.googleapis.com/books/v1/volumes"
+     # Parameters for the book search
+     params = {"q": title, "maxResults": 1}
+     # Send GET request to Google Books API
+     response = requests.get(url, params=params)
+     # Check if the request was successful
+     if response.status_code == 200:
+         # Parse the JSON response to extract the book description
+         data = response.json()
+
+         if "items" in data and len(data["items"]) > 0:
+             book_description = data["items"][0]["volumeInfo"].get(
+                 "description", "No description available."
+             )
+             return book_description
+         else:
+             print("No book found with the given title.")
+             return None
+     else:
+         # If the request failed, print the error message
+         print("Error:", response.status_code, response.text)
          return None
+
+
+ # function to ensure summaries end with punctuation
  def cut(sum):
      last_punc_idx = max(sum.rfind(p) for p in string.punctuation)
+     output = sum[: last_punc_idx + 1]
      return output
 
 
+ # function to summarize
  def summarize(des_list):
      if "No description available." in des_list:
+         idx = des_list.index("No description available.")
+         des = des_list.copy()
+         des.pop(idx)
+         rest = summarize(des)
+         rest.insert(idx, "No description available.")
+         return rest
+     else:
+         # Tokenize all the descriptions in the list
+         encoded_inputs = tokenizer(
+             des_list, truncation=True, padding="longest", return_tensors="pt"
+         )
+
+         # Generate summaries for all the inputs
+         summaries = model.generate(**encoded_inputs, max_new_tokens=100)
+
+         # Decode the summaries and process them
+         outputs = tokenizer.batch_decode(summaries, skip_special_tokens=True)
+         outputs = list(map(cut, outputs))
+         return outputs
+
+
+ # function to get cover images
  def get_covers(recs):
+     imgs = [books[books["Book-Title"] == r]["Image-URL-L"].tolist()[0] for r in recs]
+     return imgs
 
+
+ # streamlit app construction
+ st.title("Your digital librarian")
+ st.markdown(
+     "Hi there! I recommend you books based on one you love (which might not be in the same genre because that's boring) and give you my own synopsis of each book. Enjoy!"
+ )
  options = books["Book-Title"].tolist()
+ input = st.selectbox("Select your favorite book", options)
  if st.button("Get recommendations"):
+     recs = get_3_recs(input)
+     descriptions = list(map(search_book_description, recs))
+     des_sums = summarize(descriptions)
+     imgs = get_covers(recs)
+
+     col1, col2, col3 = st.columns(3)
+     col1.image(imgs[0])
+     col1.markdown(f"**{recs[0]}**")
+     col1.write(des_sums[0])
+
+     col2.image(imgs[1])
+     col2.markdown(f"**{recs[1]}**")
+     col2.write(des_sums[1])
+
+     col3.image(imgs[2])
+     col3.markdown(f"**{recs[2]}**")
+     col3.write(des_sums[2])
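
For readers skimming the diff, the unchanged recommendation logic in get_3_recs ranks learned title embeddings by cosine similarity to the query title's embedding and skips the top hit (the query itself). A self-contained sketch of that step in plain PyTorch, with an invented toy embedding matrix and title list standing in for learn.model.embeds[1].weight and dls.classes["title"]:

import torch
from torch import nn

titles = ["Dune", "Emma", "Ulysses", "Beloved", "Middlemarch"]  # invented
book_factors = torch.randn(len(titles), 4)  # stand-in for learned embeddings

def top3_similar(title):
    idx = titles.index(title)
    # Cosine similarity between every embedding row and the query row.
    sims = nn.CosineSimilarity(dim=1)(book_factors, book_factors[idx][None])
    # argsort descending puts the query itself first; take the next three.
    top_idxs = sims.argsort(descending=True)[1:4]
    return [titles[i] for i in top_idxs]

print(top3_similar("Beloved"))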