This file is a space to construct functions, experiment and see changes directly instead of having to reload the app everytime. It serves as the draft for app.py and contains similar functions except for the streamlit app component

In [5]:
import streamlit as st
from fastai.collab import *
import torch
from torch import nn
import pickle
import pandas as pd
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import sentencepiece
import string
import requests

In [89]:
# Load the data loader 
dls= pd.read_pickle('dataloader.pkl')

# Create an instance of the model
learn = collab_learner(dls, use_nn=True,layers=[20,10],y_range=(0,10.5))

# Load the saved state dictionary
state_dict = torch.load('myModel.pth',map_location=torch.device('cpu'))

# Assign the loaded state dictionary to the model's load_state_dict() method
learn.model.load_state_dict(state_dict)

KeyboardInterrupt: 

In [None]:
def get_3_recs(book):
 book_factors = learn.model.embeds[1].weight
 idx = dls.classes['title'].o2i[book]
 distances = nn.CosineSimilarity(dim=1)(book_factors, book_factors[idx][None])
 idxs = distances.argsort(descending=True)[1:4]
 recs = [dls.classes['title'][i] for i in idxs]
 return recs

In [3]:
#load books dataframe
books_df = pd.read_csv('./data/BX_Books.csv', sep=';',encoding='latin-1')

In [12]:
#get covers
def get_covers(recs):
 imgs = [books_df[books_df['Book-Title']==r]['Image-URL-L'].tolist()[0]for r in recs]
 return imgs

get_covers(['1984', 'The Great Gatsby'])

['http://images.amazon.com/images/P/0451524934.01.LZZZZZZZ.jpg',
 'http://images.amazon.com/images/P/185326041X.01.LZZZZZZZ.jpg']

In [None]:
user_input = st.text_input("What's your favorite book?")
recs = get_3_recs(user_input)
st.write("Try these books:", ",".join(recs))

2023-06-24 16:15:04.552 
 command:

 streamlit run /Users/irenenguyen/mambaforge/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


Description Summarizer

Getting book description from Google Books API

In [None]:
def search_book_description(title):
 # Google Books API endpoint for book search
 url = "https://www.googleapis.com/books/v1/volumes"

 # Parameters for the book search
 params = {
 "q": title,
 "maxResults": 1
 }

 # Send GET request to Google Books API
 response = requests.get(url, params=params)

 # Check if the request was successful
 if response.status_code == 200:
 # Parse the JSON response to extract the book description
 data = response.json()

 if "items" in data and len(data["items"]) > 0:
 book_description = data["items"][0]["volumeInfo"].get("description", "No description available.")
 return book_description
 else:
 print("No book found with the given title.")
 return None
 else:
 # If the request failed, print the error message
 print("Error:", response.status_code, response.text)
 return None

Summarization Model

In [None]:
#load tokenizer
tokenizer = PegasusTokenizer.from_pretrained("pszemraj/pegasus-x-large-book-summary")
#load model
model = PegasusForConditionalGeneration.from_pretrained("pszemraj/pegasus-x-large-book-summary")

You are using a model of type pegasus_x to instantiate a model of type pegasus. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at pszemraj/pegasus-x-large-book-summary were not used when initializing PegasusForConditionalGeneration: ['model.encoder.layers.5.global_self_attn_layer_norm.bias', 'model.encoder.layers.2.global_self_attn_layer_norm.weight', 'model.encoder.layers.15.global_self_attn_layer_norm.weight', 'model.encoder.layers.11.global_self_attn_layer_norm.weight', 'model.encoder.layers.12.global_self_attn_layer_norm.bias', 'model.encoder.layers.4.global_self_attn_layer_norm.weight', 'model.encoder.layers.0.global_self_attn_layer_norm.bias', 'model.encoder.layers.10.global_self_attn_layer_norm.bias', 'model.encoder.layers.5.global_self_attn_layer_norm.weight', 'model.encoder.layers.7.global_self_attn_layer_norm.bias', 'model.encoder.layers.11.global_self_attn_layer_norm.bias', 'model.encoder.layers.13.global_sel

In [108]:
#function to ensure summaries end with punctuation
def cut(sum):
 last_punc_idx = max(sum.rfind(p) for p in string.punctuation)
 output = sum[:last_punc_idx + 1]
 return output


#function to summarize

def summarize(des_list):
 if "No description available." in des_list:
 idx = des_list.index("No description available.")
 des = des_list.copy()
 des.pop(idx)
 rest = summarize(des)
 rest.insert(idx,'No description available.')
 return rest
 else: 
 # Tokenize all the descriptions in the list
 encoded_inputs = tokenizer(des_list, truncation=True, padding="longest", return_tensors="pt")

 # Generate summaries for all the inputs
 summaries = model.generate(**encoded_inputs, max_new_tokens=100)

 # Decode the summaries and process them
 outputs = tokenizer.batch_decode(summaries, skip_special_tokens=True)
 outputs = list(map(cut, outputs))
 return outputs
