File size: 2,254 Bytes
2063044
 
 
 
 
 
 
 
dd738af
b9578f7
 
dd738af
 
 
 
 
 
 
 
 
 
 
b9578f7
 
 
 
 
 
2063044
a2ff068
 
2063044
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer
import pickle
import os
import shutil
from langchain.document_loaders import BSHTMLLoader, DirectoryLoader

import subprocess
from glob import glob

# Fetch the corpus with a real subprocess call: the IPython shell escape
# (``!git clone ...``) used in the notebook is a syntax error in a .py file.
# Skip the clone when the checkout is already present so the script can be
# re-run without git failing on an existing directory.
if not os.path.isdir("./shakespeare"):
    subprocess.run(
        ["git", "clone", "https://github.com/TheMITTech/shakespeare"],
        check=True,  # stop here rather than continue with an empty corpus
    )

# Without ``recursive=True`` the ``**`` segment behaves like ``*``, so this
# matches HTML files exactly one directory below the checkout root.
files = glob("./shakespeare/**/*.html")

# ``exist_ok`` keeps a second run from crashing on the existing folder.
destination_folder = './data/'
os.makedirs(destination_folder, exist_ok=True)

# Flatten every matched file into the destination folder under its base name.
# The original expression ``html_file.split("/"[-1])`` indexed the separator
# string instead of the split result and raised TypeError (str + list).
# NOTE(review): files sharing a base name in different source directories
# land on the same destination path — confirm that is acceptable.
for html_file in files:
    shutil.move(
        html_file,
        os.path.join(destination_folder, os.path.basename(html_file)),
    )

# Load everything found under ./data/ as documents, using BSHTMLLoader as the
# per-file loader class.
bshtml_dir_loader = DirectoryLoader(
    './data/',
    loader_cls=BSHTMLLoader,
)
data = bshtml_dir_loader.load()

# Write the loaded documents to shakespeare.pkl ...
with open("shakespeare.pkl", "wb") as pickle_out:
    pickle_out.write(pickle.dumps(data))

# ... and immediately read them back, so ``data`` is the unpickled copy.
with open("shakespeare.pkl", "rb") as pickle_in:
    data = pickle.loads(pickle_in.read())

# Tokenizer of the bloomz-1b7 checkpoint, handed to the splitter below.
bloomz_tokenizer = AutoTokenizer.from_pretrained('bigscience/bloomz-1b7')

# Split on newlines into chunks of size 100 with no overlap between chunks.
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    bloomz_tokenizer,
    chunk_size=100,
    chunk_overlap=0,
    separator='\n',
)
documents = text_splitter.split_documents(data)

# Default HuggingFaceEmbeddings model; shared by the build and reopen steps.
embeddings = HuggingFaceEmbeddings()
persist_directory = "vector_db"

# Build the Chroma store from the chunks, persist it, then drop the
# reference to the in-memory handle.
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory=persist_directory,
)
vectordb.persist()
vectordb = None

# Reopen the store from the persisted directory; this handle is the one the
# retriever is created from later in the script.
vectordb_persist = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)

# Text-generation pipeline around bloomz-1b7, configured with temperature 0
# and a max_length of 500.
llm = HuggingFacePipeline.from_model_id(
    model_id="bigscience/bloomz-1b7",
    task="text-generation",
    model_kwargs=dict(temperature=0, max_length=500),
)

# Retriever view over the reopened vector store.
doc_retriever = vectordb_persist.as_retriever()

# "stuff"-type RetrievalQA chain combining the retriever with the LLM.
shakespeare_qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=doc_retriever,
)

def make_inference(query):
    """Run *query* through the module-level ``shakespeare_qa`` chain.

    Returns whatever ``shakespeare_qa.run`` returns for the given query.
    """
    return shakespeare_qa.run(query)

if __name__ == "__main__":
    # Build and launch the Gradio front end. gradio is imported inside the
    # guard, so it is only needed when the file is run as a script.
    import gradio as gr

    # Use the top-level ``gr.Textbox`` component for both input and output:
    # the ``gr.inputs`` / ``gr.outputs`` namespaces are deprecated in
    # Gradio 3.x and removed in Gradio 4.x.
    gr.Interface(
        fn=make_inference,
        inputs=gr.Textbox(lines=2, label="Query"),
        outputs=gr.Textbox(label="Response"),
        title="Ask_Shakespeare",
        description="️building_w_llms_qa_Shakespeare allows you to inquire about the Shakespeare's plays.",
    ).launch()