import re

import numpy as np
import openai
import pandas as pd
import streamlit as st
import streamlit_scrollable_textbox as stx
import torch
from gradio_client import Client
from InstructorEmbedding import INSTRUCTOR
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Plus
from transformers import AutoModelForMaskedLM, AutoTokenizer

@st.cache_resource
def get_data():
    # Load the cleaned earnings-call transcripts with metadata and keyword indices
    data = pd.read_csv("earnings_calls_cleaned_metadata_keywords_indices.csv")
    return data

# Preprocessing for BM25
def tokenizer(string, reg=r"[a-zA-Z'-]+|[0-9]+\.[0-9]+%|[0-9]+%"):
    # Split hyphenated words, then keep word tokens and percentage figures
    string = string.replace("-", " ")
    return " ".join(re.findall(reg, string))
def preprocess_text(text):
# Convert to lowercase
text = text.lower()
# Tokenize the text
tokens = word_tokenize(text)
# Remove stop words
stop_words = set(stopwords.words("english"))
tokens = [token for token in tokens if token not in stop_words]
# Stem the tokens
porter_stemmer = PorterStemmer()
tokens = [porter_stemmer.stem(token) for token in tokens]
# Join the tokens back into a single string
preprocessed_text = " ".join(tokens)
preprocessed_text = tokenizer(preprocessed_text)
return preprocessed_text
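
# Both the indexed corpus and incoming queries must pass through this same
# pipeline so BM25 scores are computed over a consistent stemmed vocabulary
# (see get_bm25_model below; the sentence here is illustrative).
# >>> preprocess_text("Revenue grew 4.5% year-over-year")
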
# Initialize models from HuggingFace
@st.cache_resource
def get_splade_sparse_embedding_model():
    model_id = "naver/splade-cocondenser-ensembledistil"
    # Use the GPU if one is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model_sparse = AutoModelForMaskedLM.from_pretrained(model_id)
    model_sparse.to(device)
    return model_sparse, tokenizer
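
# A minimal sketch (assumed helper, not part of the original file) of how a
# SPLADE sparse vector is typically derived from the masked-LM logits:
# log(1 + ReLU(logits)), masked by attention, then max-pooled over the sequence.
def splade_embed_sketch(text, model_sparse, tokenizer):
    device = next(model_sparse.parameters()).device
    tokens = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model_sparse(**tokens).logits
    # Weight by the attention mask so padding does not contribute, then take
    # the max over token positions to get one weight per vocabulary term
    weights = torch.log1p(torch.relu(logits)) * tokens["attention_mask"].unsqueeze(-1)
    return torch.max(weights, dim=1).values.squeeze()
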
@st.cache_resource
def get_instructor_embedding_model():
model = INSTRUCTOR("hkunlp/instructor-xl")
return model
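
# Usage sketch: INSTRUCTOR models encode (instruction, text) pairs; the
# instruction and sentence below are illustrative, not necessarily the ones
# used elsewhere in this app.
# >>> model = get_instructor_embedding_model()
# >>> embeddings = model.encode(
# ...     [["Represent the financial statement for retrieval:", "Q3 revenue grew 4.5%."]]
# ... )
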
# Remote models hosted as HuggingFace Spaces, called through gradio_client
@st.cache_resource
def get_instructor_embedding_model_api():
    client = Client("https://awinml-api-instructor-xl-2.hf.space/")
    return client

@st.cache_resource
def get_alpaca_model():
    client = Client("https://awinml-alpaca-cpp.hf.space")
    return client

@st.cache_resource
def get_vicuna_ner_1_model():
    client = Client("https://awinml-api-vicuna-openblas-ner-1.hf.space/")
    return client

@st.cache_resource
def get_vicuna_ner_2_model():
    client = Client("https://awinml-api-vicuna-openblas-ner-2.hf.space/")
    return client

@st.cache_resource
def get_vicuna_text_gen_model():
    client = Client("https://awinml-api-vicuna-openblas-4.hf.space/")
    return client

@st.cache_resource
def get_bm25_model(data):
    # Build a BM25+ index over the preprocessed transcript text
    corpus = data.Text.tolist()
    corpus_clean = [preprocess_text(x) for x in corpus]
    tokenized_corpus = [doc.split(" ") for doc in corpus_clean]
    bm25 = BM25Plus(tokenized_corpus)
    return corpus, bm25
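
# A hedged usage sketch (assumed helper; `top_k` is an illustrative parameter):
# queries are preprocessed exactly like the corpus, scored with BM25+, and the
# highest-scoring passages are returned.
def bm25_search_sketch(query, corpus, bm25, top_k=5):
    tokenized_query = preprocess_text(query).split(" ")
    scores = bm25.get_scores(tokenized_query)
    top_indices = np.argsort(scores)[::-1][:top_k]
    return [(int(i), corpus[i]) for i in top_indices]
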
@st.cache_resource
def save_key(api_key):
    # Cache the user-supplied API key across Streamlit reruns
    return api_key

# Text Generation
def vicuna_text_generate(prompt, model):
    # `model` is a gradio_client.Client returned by get_vicuna_text_gen_model()
    generated_text = model.predict(prompt, api_name="/predict")
    return generated_text

def gpt_turbo_model(prompt):
    # Uses the legacy (pre-1.0) openai SDK interface; openai.api_key must be set first
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt},
        ],
        temperature=0.01,
        max_tokens=1024,
    )
    return response["choices"][0]["message"]["content"]
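
# Usage sketch (assumed wiring; the widget label and prompt are placeholders):
# openai.api_key = save_key(st.text_input("OpenAI API key", type="password"))
# answer = gpt_turbo_model("Summarize the main risks discussed in the call.")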