awinml committed on
Commit fd7f9d5
1 Parent(s): dbcbca3

Upload 17 files

app.py CHANGED
@@ -59,6 +59,12 @@ decoder_models_choice = ["GPT-3.5 Turbo", "Vicuna-7B"]
 with st.sidebar:
     st.subheader("Select Options:")
 
+    use_bm25 = st.checkbox("Use 2-Stage Retrieval (BM25)", value=True)
+
+    use_keyword_matching = st.checkbox(
+        "Use Exact Keyword Matching", value=False
+    )
+
     num_results = int(
         st.number_input("Number of Results to query", 1, 15, value=4)
     )
@@ -74,7 +80,6 @@ with st.sidebar:
         )
     )
 
-    use_bm25 = st.checkbox("Use 2-Stage Retrieval (BM25)", value=True)
     num_candidates = int(
         st.number_input(
             "Number of Candidates to Generate:",
@@ -84,9 +89,6 @@ with st.sidebar:
            value=50,
         )
     )
-    decoder_model = st.selectbox(
-        "Select Text Generation Model", decoder_models_choice
-    )
 
 
 col1, col2 = st.columns([3, 3], gap="medium")
@@ -94,9 +96,10 @@ col1, col2 = st.columns([3, 3], gap="medium")
 with col1:
     query_text = st.text_area(
         "Input Query",
-        value="How has the growth been for AMD in the PC market in 2020?",
+        value="How has the growth been for AMD in the PC market in Q1 and Q2 2020?",
     )
 
+
 # Extracting Document Entities from Question
 (
     companies,
@@ -116,11 +119,28 @@ ticker_year_quarter_tuples_list = ticker_year_quarter_tuples_creator(
     ticker_list, year_quarter_range_list
 )
 
+with col2:
+    if ticker_year_quarter_tuples_list != []:
+        st.markdown("**Companies mentioned in the question:**")
+        for i in ticker_list:
+            st.markdown("- " + i)
+        st.write("**Duration:**")
+        st.write(f"{start_quarter} {start_year} - {end_quarter} {end_year}")
+
 
 # Extract keywords from query
 all_keywords = extract_entities_keywords(query_text, vicuna_ner_2_model)
 if all_keywords != []:
     keywords = clean_keywords_all_combs(all_keywords)
+    store_keywords = keywords.copy()
+else:
+    keywords = None
+
+# Setting Keywords to None if use_keywords is False
+
+
+if use_keyword_matching == True:
+    keywords = store_keywords
 else:
     keywords = None
 
@@ -135,9 +155,7 @@ pinecone.init(
 pinecone_index_name = "week13-instructor-xl"
 pinecone_index = pinecone.Index(pinecone_index_name)
 retriever_model = get_instructor_embedding_model_api()
-instruction = (
-    "Represent the financial question for retrieving supporting documents:"
-)
+instruction = "Represent the finance query for retrieving related documents:"
 
 
 dense_query_embedding = create_dense_embeddings(
@@ -148,8 +166,9 @@ context_group = []
 if ticker_year_quarter_tuples_list != []:
     for ticker, quarter, year in ticker_year_quarter_tuples_list:
         if use_bm25 == True:
+            # Setting Ticker, Quarter, Year=None to trigger global bm25
             indices = get_indices_bm25(
-                data, ticker, quarter, year, num_candidates
+                data, query_text, None, None, None, num_candidates
             )
         else:
             indices = None
@@ -194,6 +213,12 @@ with col1:
         label="Model Prompt", value=prompt, height=400
     )
 
+with st.sidebar:
+    decoder_model = st.selectbox(
+        "Select Text Generation Model", decoder_models_choice
+    )
+
+
 if decoder_model == "GPT-3.5 Turbo":
     with col2:
         with st.form("gpt_form"):
@@ -224,9 +249,12 @@ if decoder_model == "GPT-3.5 Turbo":
 
 if decoder_model == "Vicuna-7B":
     with col2:
-        st.write("The Vicuna Model is running: ...")
-        st.write("The model takes 10-15 mins to generate the text.")
-        generated_text = vicuna_text_generate(prompt, vicuna_text_gen_model)
+        with st.spinner(
+            text="The Vicuna Model is running. The model takes approximately 10-15 mins to generate the text."
+        ):
+            generated_text = vicuna_text_generate(
+                prompt, vicuna_text_gen_model
+            )
         st.subheader("Answer:")
         regex_pattern_sentences = "(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
         generated_text_list = re.split(regex_pattern_sentences, generated_text)
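
A note on the keyword toggle added above: the extracted keywords are cleaned and copied into store_keywords, then keywords is reset to None unless the new "Use Exact Keyword Matching" checkbox is ticked. As written, store_keywords is only assigned when the NER step returns something, so enabling the checkbox on a query with no extracted keywords would raise a NameError. A minimal, self-contained sketch of the intended gating (the helper name gate_keywords and the inline list copy are illustrative, not part of this commit):

# Sketch only: gate_keywords is a hypothetical helper, and list(...) stands in
# for clean_keywords_all_combs(); only the checkbox logic mirrors the diff.
def gate_keywords(all_keywords, use_keyword_matching):
    """Return cleaned keywords only when exact keyword matching is enabled."""
    if not all_keywords:           # NER model extracted nothing
        return None
    keywords = list(all_keywords)  # placeholder for clean_keywords_all_combs()
    return keywords if use_keyword_matching else None

print(gate_keywords(["AMD", "PC market"], use_keyword_matching=True))   # ['AMD', 'PC market']
print(gate_keywords(["AMD", "PC market"], use_keyword_matching=False))  # None
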
utils/__pycache__/entity_extraction.cpython-38.pyc CHANGED
Binary files a/utils/__pycache__/entity_extraction.cpython-38.pyc and b/utils/__pycache__/entity_extraction.cpython-38.pyc differ
 
utils/__pycache__/models.cpython-38.pyc CHANGED
Binary files a/utils/__pycache__/models.cpython-38.pyc and b/utils/__pycache__/models.cpython-38.pyc differ
 
utils/__pycache__/retriever.cpython-38.pyc CHANGED
Binary files a/utils/__pycache__/retriever.cpython-38.pyc and b/utils/__pycache__/retriever.cpython-38.pyc differ
 
utils/__pycache__/transcript_retrieval.cpython-38.pyc CHANGED
Binary files a/utils/__pycache__/transcript_retrieval.cpython-38.pyc and b/utils/__pycache__/transcript_retrieval.cpython-38.pyc differ
 
utils/entity_extraction.py CHANGED
@@ -35,8 +35,9 @@ def extract_entities_docs(query, model):
     """
     prompt = generate_ner_docs_prompt(query)
     string_of_dict = model.predict(prompt, api_name="/predict")
-
-    entities_dict = literal_eval(string_of_dict)
+    print(string_of_dict)
+    string_of_dict = string_of_dict.strip()
+    entities_dict = literal_eval(f"""{string_of_dict}""")
     start_quarter, start_year = entities_dict["start-duration"]
     end_quarter, end_year = entities_dict["end-duration"]
     companies = entities_dict["companies"]
@@ -176,8 +177,9 @@ def extract_entities_keywords(query, model):
     """
     prompt = generate_ner_keywords_prompt(query)
     string_of_dict = model.predict(prompt, api_name="/predict")
-
-    entities_dict = literal_eval(string_of_dict)
+    print(string_of_dict)
+    string_of_dict = string_of_dict.strip()
+    entities_dict = literal_eval(f"""{string_of_dict}""")
     keywords_list = entities_dict["entities"]
     return keywords_list
 
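Both entity_extraction.py hunks apply the same change: the raw model output is printed for debugging, stripped of surrounding whitespace, and then passed to literal_eval (wrapping the string in f"""...""" does not alter it; the .strip() is what matters). A small sketch of that parsing step, assuming the model returns a Python-dict-like string; the try/except fallback is illustrative and not part of this commit:

# Sketch of defensively parsing the NER model's string output.
from ast import literal_eval


def parse_model_dict(string_of_dict):
    cleaned = string_of_dict.strip()
    try:
        return literal_eval(cleaned)   # expects a dict literal
    except (ValueError, SyntaxError):
        return {}                      # caller handles an unparsable reply


print(parse_model_dict('  {"entities": ["AMD", "PC market"]}  '))
# {'entities': ['AMD', 'PC market']}
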
utils/retriever.py CHANGED
@@ -55,6 +55,7 @@ def query_pinecone(
     filter_dict = {
         "QA_Flag": {"$eq": "Answer"},
     }
+
     if year is not None:
         filter_dict["Year"] = int(year)
     if quarter is not None:
@@ -66,6 +67,7 @@ def query_pinecone(
     if indices is not None:
         filter_dict["index"] = {"$in": indices}
 
+    print(filter_dict)
     xc = index.query(
         vector=dense_vec,
         top_k=top_k,
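
The only functional change in retriever.py is the print(filter_dict) call, which logs the Pinecone metadata filter just before the query runs. For reference, a standalone sketch (with made-up example values) of what that printed filter looks like once the year and BM25 candidate indices are set:

# Example values only; the keys and operators come from the diff above.
filter_dict = {
    "QA_Flag": {"$eq": "Answer"},
}
year, indices = 2020, [3, 17, 42]
if year is not None:
    filter_dict["Year"] = int(year)
if indices is not None:
    filter_dict["index"] = {"$in": indices}   # ids of BM25-selected passages
print(filter_dict)
# {'QA_Flag': {'$eq': 'Answer'}, 'Year': 2020, 'index': {'$in': [3, 17, 42]}}
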
utils/transcript_retrieval.py CHANGED
@@ -2,30 +2,19 @@
 
 
 def retrieve_transcript(data, year, quarter, ticker):
-    if year == "All" or quarter == "All":
-        row = (
-            data.loc[
-                (data.Ticker == ticker),
-                ["File_Name"],
-            ]
-            .drop_duplicates()
-            .iloc[0, 0]
-        )
-    else:
-        row = (
-            data.loc[
-                (data.Year == int(year))
-                & (data.Quarter == quarter)
-                & (data.Ticker == ticker),
-                ["File_Name"],
-            ]
-            .drop_duplicates()
-            .iloc[0, 0]
-        )
+    print(year, quarter, ticker)
+    row = data.loc[
+        (data.Year == int(year))
+        & (data.Quarter == quarter)
+        & (data.Ticker == ticker),
+        ["File_Name"],
+    ]
+    filename = row.iloc[0, 0]
+    print(filename)
     # convert row to a string and join values with "-"
     # row_str = "-".join(row.astype(str)) + ".txt"
     open_file = open(
-        f"Transcripts/{ticker}/{row}",
+        f"Transcripts/{ticker}/{filename}",
         "r",
     )
     file_text = open_file.read()
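
The rewritten retrieve_transcript drops the year == "All" / quarter == "All" branch, so every call now needs a concrete year, quarter, and ticker, and an empty match would fail at row.iloc[0, 0] with an IndexError. A sketch of the same lookup with an explicit empty-result guard and a with block for the file handle, both of which are illustrative additions and not part of this commit:

import pandas as pd


def retrieve_transcript_sketch(data: pd.DataFrame, year, quarter, ticker) -> str:
    # Same filter as the committed version.
    row = data.loc[
        (data.Year == int(year))
        & (data.Quarter == quarter)
        & (data.Ticker == ticker),
        ["File_Name"],
    ]
    if row.empty:
        raise ValueError(f"No transcript found for {ticker} {quarter} {year}")
    filename = row.iloc[0, 0]
    with open(f"Transcripts/{ticker}/{filename}", "r") as f:
        return f.read()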