import streamlit as st
from fastai.collab import *
import torch
from torch import nn
import pickle
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sentencepiece  # required by the Pegasus tokenizer
import string
import requests


@st.cache_resource
def load_stuff():
    # Load the data loader
    dls = pd.read_pickle("dataloader.pkl")
    # Create an instance of the model
    learn = collab_learner(dls, use_nn=True, layers=[20, 10], y_range=(0, 10.5))
    # Load the saved state dictionary
    state_dict = torch.load("myModel.pth", map_location=torch.device("cpu"))
    # Load the saved weights into the model
    learn.model.load_state_dict(state_dict)
    # load books dataframe
    books = pd.read_csv("./data/BX_Books.csv", sep=";", encoding="latin-1")
    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("pszemraj/pegasus-x-large-book-summary")
    # load model
    model = AutoModelForSeq2SeqLM.from_pretrained(
        "pszemraj/pegasus-x-large-book-summary"
    )
    return dls, learn, books, tokenizer, model


dls, learn, books, tokenizer, model = load_stuff()
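# Note: @st.cache_resource keeps these objects alive across Streamlit reruns, so the
# fastai learner, the books dataframe, and the Pegasus checkpoint are loaded only once
# per server process.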


# function to get recommendations
def get_3_recs(book):
    # item (title) embedding weights; embeds[0] holds the user embeddings
    book_factors = learn.model.embeds[1].weight
    # index of the selected title in the dataloader's vocabulary
    idx = dls.classes["title"].o2i[book]
    # cosine similarity between the selected title's embedding and all title embeddings
    distances = nn.CosineSimilarity(dim=1)(book_factors, book_factors[idx][None])
    # take the three nearest titles, skipping the book itself at position 0
    idxs = distances.argsort(descending=True)[1:4]
    recs = [dls.classes["title"][i] for i in idxs]
    return recs
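# Usage sketch (illustrative only: the title must exist in dls.classes["title"], and
# the neighbours returned depend entirely on the trained embeddings):
#   get_3_recs("Some Favourite Title")  ->  ["Similar Title 1", "Similar Title 2", "Similar Title 3"]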


# function to get descriptions from Google Books
def search_book_description(title):
    # Google Books API endpoint for book search
    url = "https://www.googleapis.com/books/v1/volumes"
    # Parameters for the book search
    params = {"q": title, "maxResults": 1}
    # Send GET request to Google Books API
    response = requests.get(url, params=params)
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON response to extract the book description
        data = response.json()

        if "items" in data and len(data["items"]) > 0:
            book_description = data["items"][0]["volumeInfo"].get(
                "description", "No description available."
            )
            return book_description
        else:
            # fall back to the placeholder string so summarize() can still handle it
            print("No book found with the given title.")
            return "No description available."
    else:
        # If the request failed, print the error message and fall back as well
        print("Error:", response.status_code, response.text)
        return "No description available."
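# Usage sketch (hypothetical title; the text returned depends on the live Google
# Books API response):
#   search_book_description("Pride and Prejudice")
#   ->  a description string, or "No description available." if nothing is found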


# function to ensure summaries end with punctuation
def cut(summary):
    # position of the last punctuation character in the generated summary
    last_punc_idx = max(summary.rfind(p) for p in string.punctuation)
    if last_punc_idx == -1:
        return summary  # no punctuation at all: keep the text unchanged
    # drop any trailing, unfinished sentence fragment
    return summary[: last_punc_idx + 1]
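# Example of the truncation behaviour (deterministic, no model involved):
#   cut("He left. She stayed and")  ->  "He left."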


# function to summarize
def summarize(des_list):
    # skip placeholder entries: summarize the rest of the list, then put the
    # placeholder back in its original position
    if "No description available." in des_list:
        idx = des_list.index("No description available.")
        des = des_list.copy()
        des.pop(idx)
        rest = summarize(des)
        rest.insert(idx, "No description available.")
        return rest
    else:
        # Tokenize all the descriptions in the list
        encoded_inputs = tokenizer(
            des_list, truncation=True, padding="longest", return_tensors="pt"
        )

        # Generate summaries for all the inputs
        summaries = model.generate(**encoded_inputs, max_new_tokens=100)

        # Decode the summaries and process them
        outputs = tokenizer.batch_decode(summaries, skip_special_tokens=True)
        outputs = list(map(cut, outputs))
        return outputs
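# Usage sketch (outputs are generated by the Pegasus model and will vary):
#   summarize(["<long description>", "No description available."])
#   ->  ["<short generated synopsis>", "No description available."]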


# function to get cover images
def get_covers(recs):
    # first matching large cover URL ("Image-URL-L") for each recommended title
    imgs = [books[books["Book-Title"] == r]["Image-URL-L"].tolist()[0] for r in recs]
    return imgs


# streamlit app construction
st.title("Your digital librarian")
st.markdown(
    "Hi there! I recommend you books based on one you love (which might not be in the same genre because that's boring) and give you my own synopsis of each book. Enjoy!"
)
options = books["Book-Title"].tolist()
selected_book = st.selectbox("Select your favorite book", options)  # avoid shadowing the built-in input()
if st.button("Get recommendations"):
    recs = get_3_recs(selected_book)
    descriptions = list(map(search_book_description, recs))
    des_sums = summarize(descriptions)
    imgs = get_covers(recs)

    # one column per recommendation: cover image, bolded title, generated synopsis
    for col, img, rec, des_sum in zip(st.columns(3), imgs, recs, des_sums):
        col.image(img)
        col.markdown(f"**{rec}**")
        col.write(des_sum)
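# To try the app locally (assuming this file is saved as app.py and the model/data
# files referenced in load_stuff() are present), run:
#   streamlit run app.py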