import re

import openai
import streamlit_scrollable_textbox as stx
import pinecone
import streamlit as st

# The page layout is configured before the project modules are imported;
# the isort directive keeps the two import groups from being merged.
st.set_page_config(layout="wide")  # isort: split

from utils.entity_extraction import (
    clean_entities,
    extract_quarter_year,
    extract_ticker_spacy,
    format_entities_flan_alpaca,
    generate_alpaca_ner_prompt,
)
from utils.models import (
    generate_entities_flan_alpaca_checkpoint,
    generate_entities_flan_alpaca_inference_api,
    generate_text_flan_t5,
    get_data,
    get_flan_alpaca_xl_model,
    get_flan_t5_model,
    get_mpnet_embedding_model,
    get_sgpt_embedding_model,
    get_spacy_model,
    get_splade_sparse_embedding_model,
    get_t5_model,
    gpt_turbo_model,
    save_key,
)
from utils.prompts import (
    generate_flant5_prompt_instruct_chunk_context,
    generate_flant5_prompt_instruct_chunk_context_single,
    generate_flant5_prompt_instruct_complete_context,
    generate_flant5_prompt_summ_chunk_context,
    generate_flant5_prompt_summ_chunk_context_single,
    generate_gpt_j_two_shot_prompt_1,
    generate_gpt_j_two_shot_prompt_2,
    generate_gpt_prompt_alpaca,
    generate_gpt_prompt_alpaca_multi_doc,
    generate_gpt_prompt_original,
    generate_multi_doc_context,
    get_context_list_prompt,
)
from utils.retriever import (
    format_query,
    query_pinecone,
    query_pinecone_sparse,
    sentence_id_combine,
    text_lookup,
    year_quarter_range,
)
from utils.transcript_retrieval import retrieve_transcript
from utils.vector_index import (
    create_dense_embeddings,
    create_sparse_embeddings,
    hybrid_score_norm,
)

# Pre-filled example questions, one per query type.
SINGLE_DOC_DEFAULT_QUERY = (
    "What was discussed regarding Wearables revenue performance?"
)
MULTI_DOC_DEFAULT_QUERY = (
    "How has revenue from Wearables performed over the past 2 years?"
)
SPEAKER_CHOICES = ["Company Speaker", "Analyst"]

st.title("Question Answering on Earnings Call Transcripts")
st.write(
    "The app uses the quarterly earnings call transcripts for 10 companies "
    "(Apple, AMD, Amazon, Cisco, Google, Microsoft, Nvidia, ASML, Intel, "
    "Micron) for the years 2016 to 2020."
)

# Two equally wide columns: the question and its filters go in the left one.
col1, col2 = st.columns([3, 3], gap="medium")

with st.sidebar:
    ner_choice = st.selectbox("Select NER Model", ["Spacy", "Alpaca"])
    document_type = st.selectbox(
        "Select Query Type", ["Single-Document", "Multi-Document"]
    )

is_single_document = document_type == "Single-Document"

# The spaCy pipeline is only loaded when it is the selected NER backend.
if ner_choice == "Spacy":
    ner_model = get_spacy_model()

with col1:
    st.subheader("Question")
    query_text = st.text_area(
        "Input Query",
        value=(
            SINGLE_DOC_DEFAULT_QUERY
            if is_single_document
            else MULTI_DOC_DEFAULT_QUERY
        ),
    )

years_choice = ["2020", "2019", "2018", "2017", "2016", "All"]
quarters_choice = ["Q1", "Q2", "Q3", "Q4", "All"]
ticker_choice = [
    "AAPL",
    "CSCO",
    "MSFT",
    "ASML",
    "NVDA",
    "GOOGL",
    "MU",
    "INTC",
    "AMZN",
    "AMD",
]

if is_single_document:
    # Extract company / quarter / year mentions from the question so they can
    # pre-select the filter widgets below.
    if ner_choice != "Alpaca":
        company_ent = extract_ticker_spacy(query_text, ner_model)
        quarter_ent, year_ent = extract_quarter_year(query_text)
    else:
        ner_prompt = generate_alpaca_ner_prompt(query_text)
        entity_text = generate_entities_flan_alpaca_inference_api(ner_prompt)
        company_ent, quarter_ent, year_ent = format_entities_flan_alpaca(
            entity_text
        )

    ticker_index, quarter_index, year_index = clean_entities(
        company_ent, quarter_ent, year_ent
    )

    # The untouched example question carries no metadata, so every filter
    # starts on its first option instead of on the extracted entity.
    if query_text == SINGLE_DOC_DEFAULT_QUERY:
        year_default = quarter_default = ticker_default = 0
    else:
        year_default = year_index
        quarter_default = quarter_index
        ticker_default = ticker_index

    with col1:
        year = st.selectbox("Year", years_choice, index=year_default)
        quarter = st.selectbox(
            "Quarter", quarters_choice, index=quarter_default
        )
        ticker = st.selectbox("Company", ticker_choice, index=ticker_default)
        participant_type = st.selectbox("Speaker", SPEAKER_CHOICES)
else:
    # Multi-document queries cover a range of quarters; the range defaults
    # are fixed and do not depend on the question text.
    with col1:
        start_year = st.selectbox("Start Year", years_choice, index=2)
        start_quarter = st.selectbox(
            "Start Quarter", quarters_choice, index=0
        )
        end_year = st.selectbox("End Year", years_choice, index=0)
        end_quarter = st.selectbox("End Quarter", quarters_choice, index=0)
        ticker = st.selectbox("Company", ticker_choice, index=0)
        participant_type = st.selectbox("Speaker", SPEAKER_CHOICES)

with st.sidebar:
    st.subheader("Select Options:")
    num_results = int(
        st.number_input(
            "Number of Results to query",
            1,
            15,
            value=5 if is_single_document else 2,
        )
    )

    # Encoder used to embed the query for retrieval.
    encoder_models_choice = ["MPNET", "SGPT", "Hybrid MPNET - SPLADE"]
    encoder_model = st.selectbox("Select Encoder Model", encoder_models_choice)

# Decoder options: multi-document queries are restricted to GPT-3.5 Turbo.
decoder_models_choice = (
    ["GPT-3.5 Turbo", "T5", "FLAN-T5", "GPT-J"]
    if is_single_document
    else ["GPT-3.5 Turbo"]
)
with st.sidebar: decoder_model = st.selectbox("Select Decoder Model", decoder_models_choice) if encoder_model == "MPNET": # Connect to pinecone environment pinecone.init( api_key=st.secrets["pinecone_mpnet"], environment="us-east1-gcp" ) pinecone_index_name = "week2-all-mpnet-base" pinecone_index = pinecone.Index(pinecone_index_name) retriever_model = get_mpnet_embedding_model() elif encoder_model == "SGPT": # Connect to pinecone environment pinecone.init( api_key=st.secrets["pinecone_sgpt"], environment="us-east1-gcp" ) pinecone_index_name = "week2-sgpt-125m" pinecone_index = pinecone.Index(pinecone_index_name) retriever_model = get_sgpt_embedding_model() elif encoder_model == "Hybrid MPNET - SPLADE": pinecone.init( api_key=st.secrets["pinecone_hybrid_splade_mpnet"], environment="us-central1-gcp", ) pinecone_index_name = "splade-mpnet" pinecone_index = pinecone.Index(pinecone_index_name) retriever_model = get_mpnet_embedding_model() ( sparse_retriever_model, sparse_retriever_tokenizer, ) = get_splade_sparse_embedding_model() with st.sidebar: if document_type == "Single-Document": window = int(st.number_input("Sentence Window Size", 0, 10, value=1)) threshold = float( st.number_input( label="Similarity Score Threshold", step=0.05, format="%.2f", value=0.25, ) ) else: window = int(st.number_input("Sentence Window Size", 0, 10, value=0)) threshold = float( st.number_input( label="Similarity Score Threshold", step=0.05, format="%.2f", value=0.6, ) ) data = get_data() if document_type == "Single-Document": if encoder_model == "Hybrid SGPT - SPLADE": dense_query_embedding = create_dense_embeddings( query_text, retriever_model ) sparse_query_embedding = create_sparse_embeddings( query_text, sparse_retriever_model, sparse_retriever_tokenizer ) dense_query_embedding, sparse_query_embedding = hybrid_score_norm( dense_query_embedding, sparse_query_embedding, 0 ) query_results = query_pinecone_sparse( dense_query_embedding, sparse_query_embedding, num_results, pinecone_index, 
year, quarter, ticker, participant_type, threshold, ) else: dense_query_embedding = create_dense_embeddings( query_text, retriever_model ) query_results = query_pinecone( dense_query_embedding, num_results, pinecone_index, year, quarter, ticker, participant_type, threshold, ) if threshold <= 0.90: context_list = sentence_id_combine(data, query_results, lag=window) else: context_list = format_query(query_results) else: # Multi-Document Retreival if encoder_model == "Hybrid SGPT - SPLADE": dense_query_embedding = create_dense_embeddings( query_text, retriever_model ) sparse_query_embedding = create_sparse_embeddings( query_text, sparse_retriever_model, sparse_retriever_tokenizer ) dense_query_embedding, sparse_query_embedding = hybrid_score_norm( dense_query_embedding, sparse_query_embedding, 0 ) year_quarter_list = year_quarter_range( start_quarter, start_year, end_quarter, end_year ) context_group = [] for year, quarter in year_quarter_list: query_results = query_pinecone_sparse( dense_query_embedding, sparse_query_embedding, num_results, pinecone_index, year, quarter, ticker, participant_type, threshold, ) results_list = sentence_id_combine(data, query_results, lag=window) context_group.append((results_list, year, quarter)) else: dense_query_embedding = create_dense_embeddings( query_text, retriever_model ) year_quarter_list = year_quarter_range( start_quarter, start_year, end_quarter, end_year ) context_group = [] for year, quarter in year_quarter_list: query_results = query_pinecone( dense_query_embedding, num_results, pinecone_index, year, quarter, ticker, participant_type, threshold, ) results_list = sentence_id_combine(data, query_results, lag=window) context_group.append((results_list, year, quarter)) multi_doc_context = generate_multi_doc_context(context_group) if decoder_model == "GPT-3.5 Turbo": if document_type == "Single-Document": prompt = generate_gpt_prompt_alpaca(query_text, context_list) else: prompt = generate_gpt_prompt_alpaca_multi_doc( 
query_text, context_group ) with col2: with st.form("my_form"): edited_prompt = st.text_area( label="Model Prompt", value=prompt, height=400 ) openai_key = st.text_input( "Enter OpenAI key", value="", type="password", ) submitted = st.form_submit_button("Submit") if submitted: api_key = save_key(openai_key) openai.api_key = api_key generated_text = gpt_turbo_model(edited_prompt) st.subheader("Answer:") regex_pattern_sentences = ( "(?
{answer_text}
{context_text}
{context_text}