awinml's picture
Upload 8 files
c5e4524
raw
history blame
6.02 kB
def query_pinecone_sparse(
    dense_vec,
    sparse_vec,
    top_k,
    index,
    year,
    quarter,
    ticker,
    participant_type,
    threshold=0.25,
):
    """Query a Pinecone index with dense and sparse vectors plus filters.

    Args:
        dense_vec: Dense query vector, passed to ``index.query`` as
            ``vector``.
        sparse_vec: Sparse query vector, passed as ``sparse_vector``.
        top_k: Maximum number of matches requested from the index.
        index: Object exposing a Pinecone-style ``query(**kwargs)`` method.
        year: ``"All"`` to search 2016-2020, otherwise a value accepted by
            ``int()`` naming a single year.
        quarter: ``"All"`` to search Q1-Q4, otherwise a single quarter
            label that is matched exactly.
        ticker: Ticker value that the ``Ticker`` metadata field must equal.
        participant_type: ``"Company Speaker"`` selects records flagged
            ``"Answer"``; any other value selects ``"Question"``.
        threshold: Minimum score a match needs to be kept (inclusive).

    Returns:
        The response returned by ``index.query`` with its ``"matches"``
        entry replaced by the matches whose score is >= ``threshold``.
    """
    participant = (
        "Answer" if participant_type == "Company Speaker" else "Question"
    )

    # "All" expands to the explicit set of years covered by this function.
    if year == "All":
        year_filter = {"$in": [2020, 2019, 2018, 2017, 2016]}
    else:
        year_filter = int(year)

    # Handle "All" independently of ``year``: previously a specific year
    # combined with quarter "All" filtered on the literal value "All".
    if quarter == "All":
        quarter_filter = {"$in": ["Q1", "Q2", "Q3", "Q4"]}
    else:
        quarter_filter = {"$eq": quarter}

    # search pinecone index for context passage with the answer
    xc = index.query(
        vector=dense_vec,
        sparse_vector=sparse_vec,
        top_k=top_k,
        filter={
            "Year": year_filter,
            "Quarter": quarter_filter,
            "Ticker": {"$eq": ticker},
            "QA_Flag": {"$eq": participant},
        },
        include_metadata=True,
    )

    # filter the context passages based on the score threshold
    xc["matches"] = [
        match for match in xc["matches"] if match["score"] >= threshold
    ]
    return xc
def query_pinecone(
    dense_vec,
    top_k,
    index,
    year,
    quarter,
    ticker,
    participant_type,
    threshold=0.25,
):
    """Query a Pinecone index with a dense vector plus metadata filters.

    Args:
        dense_vec: Dense query vector, passed to ``index.query`` as
            ``vector``.
        top_k: Maximum number of matches requested from the index.
        index: Object exposing a Pinecone-style ``query(**kwargs)`` method.
        year: ``"All"`` to search 2016-2020, otherwise a value accepted by
            ``int()`` naming a single year.
        quarter: ``"All"`` to search Q1-Q4, otherwise a single quarter
            label that is matched exactly.
        ticker: Ticker value that the ``Ticker`` metadata field must equal.
        participant_type: ``"Company Speaker"`` selects records flagged
            ``"Answer"``; any other value selects ``"Question"``.
        threshold: Minimum score a match needs to be kept (inclusive).

    Returns:
        The response returned by ``index.query`` with its ``"matches"``
        entry replaced by the matches whose score is >= ``threshold``.
    """
    participant = (
        "Answer" if participant_type == "Company Speaker" else "Question"
    )

    # "All" expands to the explicit set of years covered by this function.
    if year == "All":
        year_filter = {"$in": [2020, 2019, 2018, 2017, 2016]}
    else:
        year_filter = int(year)

    # Handle "All" independently of ``year``: previously a specific year
    # combined with quarter "All" filtered on the literal value "All".
    if quarter == "All":
        quarter_filter = {"$in": ["Q1", "Q2", "Q3", "Q4"]}
    else:
        quarter_filter = {"$eq": quarter}

    # search pinecone index for context passage with the answer
    xc = index.query(
        vector=dense_vec,
        top_k=top_k,
        filter={
            "Year": year_filter,
            "Quarter": quarter_filter,
            "Ticker": {"$eq": ticker},
            "QA_Flag": {"$eq": participant},
        },
        include_metadata=True,
    )

    # filter the context passages based on the score threshold
    xc["matches"] = [
        match for match in xc["matches"] if match["score"] >= threshold
    ]
    return xc
def format_query(query_results):
    """Return the ``Text`` metadata of every match, in result order.

    Args:
        query_results: Mapping with a ``"matches"`` sequence whose items
            each provide ``["metadata"]["Text"]``.

    Returns:
        List with one passage text per match.
    """
    passages = []
    for match in query_results["matches"]:
        metadata = match["metadata"]
        passages.append(metadata["Text"])
    return passages
def sentence_id_combine(data, query_results, lag=1):
    """Build context passages around the matched sentences.

    Each matched ``Sentence_id`` is widened to the window of IDs from
    ``id - lag`` to ``id + lag``. Windows that share at least one ID are
    merged into a single window, so a passage never combines sentences
    from unrelated windows and no sentence appears in two passages.
    Windows that do not overlap are kept separate.

    Args:
        data: pandas DataFrame with ``Sentence_id`` and ``Text`` columns.
        query_results: Mapping with a ``"matches"`` sequence whose items
            each provide ``["metadata"]["Sentence_id"]``.
        lag: Number of neighbouring sentence IDs taken on each side of a
            match.

    Returns:
        List of passages ordered by ascending window start. Each passage
        is the space-joined ``Text`` of the rows whose ``Sentence_id`` is
        in that window, in the order the rows appear in ``data``.
    """
    if lag < 0:
        # A negative lag makes every window empty, so there is no context.
        return []

    # Extract the distinct sentence IDs from the query results, ascending.
    matched_ids = sorted(
        {match["metadata"]["Sentence_id"] for match in query_results["matches"]}
    )

    # Build one window of IDs per match, merging into the previous window
    # whenever the two overlap. Chunking the de-duplicated IDs into fixed
    # groups of lag*2+1 (the earlier approach) misaligned the groups as
    # soon as two windows shared an ID.
    windows = []
    for sentence_id in matched_ids:
        window = [sentence_id + offset for offset in range(-lag, lag + 1)]
        if windows and window[0] <= windows[-1][-1]:
            last_covered = windows[-1][-1]
            windows[-1].extend(i for i in window if i > last_covered)
        else:
            windows.append(window)

    # Join the sentences belonging to each window into one passage.
    return [
        " ".join(data.loc[data["Sentence_id"].isin(window), "Text"].to_list())
        for window in windows
    ]
def text_lookup(data, sentence_ids):
    """Join the texts found at the given row positions into one string.

    Args:
        data: pandas Series of strings, or a DataFrame with a ``Text``
            column.
        sentence_ids: Integer row positions. They are used with ``.iloc``,
            so they are positional offsets into ``data`` rather than
            ``Sentence_id`` labels.

    Returns:
        The selected texts joined with ``". "``; an empty string when no
        positions are given.
    """
    # A DataFrame slice has no ``to_list``; select the text column first so
    # that both a Series and a DataFrame are accepted.
    texts = data["Text"] if getattr(data, "ndim", 1) == 2 else data
    return ". ".join(texts.iloc[sentence_ids].to_list())