Spaces:
Build error
Build error
import re | |
from nltk.stem import PorterStemmer, WordNetLemmatizer | |
# Keyword Extraction
def expand_list_of_lists(list_of_lists):
    """
    Flattens one level of nesting.

    Args:
        list_of_lists: An iterable whose elements are themselves iterables
            (e.g. a list of lists of strings).

    Returns:
        A new list holding every inner element, in encounter order.
    """
    return [item for group in list_of_lists for item in group]
def keywords_no_companies(texts):
    """
    Splits keyword phrases into lowercase words and removes company names.

    Args:
        texts: An iterable of strings (keyword phrases).

    Returns:
        A list of lowercase words in input order, excluding every word that
        is a known company name or ticker symbol. Empty tokens produced by
        repeated, leading or trailing whitespace are not returned.
    """
    # Company names and tickers to remove from the extracted entities.
    # A frozenset gives O(1) membership tests and ignores duplicate entries.
    company_names = frozenset(
        {
            "apple",
            "amd",
            "amazon",
            "cisco",
            "google",
            "microsoft",
            "nvidia",
            "asml",
            "intel",
            "micron",
            "aapl",
            "csco",
            "msft",
            "nvda",
            "googl",
            "mu",
            "intc",
            "amzn",
        }
    )
    # str.split() with no argument collapses whitespace runs, so phrases such
    # as "services  growth " never yield "" as a keyword.
    words = [word.lower() for text in texts for word in text.split()]
    return [word for word in words if word not in company_names]
def all_keywords_combs(texts):
    """
    Builds a pool of word variants for a collection of keyword phrases.

    Each phrase is split on single spaces. The result is the concatenation,
    in this order, of: the words as given, their lowercase forms, their
    Porter stems, and their WordNet lemmas. Stems and lemmas are computed
    from the words as given, not from the lowercase forms.

    Args:
        texts: An iterable of strings (keyword phrases).

    Returns:
        A list of strings four times as long as the number of split words.
    """
    words = expand_list_of_lists([phrase.split(" ") for phrase in texts])

    lowered = [word.lower() for word in words]

    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(word) for word in words]

    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in words]

    return words + lowered + stemmed + lemmatized
def extract_keywords(query_text, model):
    """
    Asks a text-generation model for the key entities in a query.

    Args:
        query_text: The user query to analyse.
        model: An object exposing ``predict(prompt)`` that returns the
            generated text as a string.

    Returns:
        The list produced by ``keywords_no_companies`` from the model's
        comma-separated response.
    """
    # The f prefix is required: without it the model receives the literal
    # text "{query_text}" instead of the actual query.
    prompt = f"###Instruction: Identify the key entities that accurately describe the context.\n\nInput:{query_text}\n\n###Response:"
    # Alternative prompt wording kept for reference:
    # prompt = f"###Instruction:Extract the important keywords which describe the context accurately.\n\nInput:{query_text}\n\n###Response:"
    response = model.predict(prompt)
    keywords = response.split(", ")
    return keywords_no_companies(keywords)
# Entity Extraction
def generate_alpaca_ner_prompt(query):
    """
    Builds the few-shot prompt used to extract Company, Quarter and Year.

    The prompt consists of fixed instructions, six worked examples, the
    given query, and a trailing "### Response:" marker.

    Args:
        query: The sentence to extract entities from; it is inserted
            verbatim after the worked examples.

    Returns:
        The complete prompt string.
    """
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Use the following guidelines to extract the entities representing the Company, Quarter, and Year in the sentence.
### Instruction:
- The output should be in the form "Company - Value, Quarter - Value, Year - Value".
- The output should be in the form "Company - None, Quarter - None, Year - None", if no entities are found.
- Only use entities that exist in the final sentence.
- If Company cannot be found in the sentence, return "none" for that entity.
- If Quarter cannot be found in the sentence, return "none" for that entity.
- If Year cannot be found in the sentence, return "none" for that entity.
- If there is ambiguity finding the entity, return "none" for that entity.
### Input:
What was discussed regarding Services revenue performance in Apple's Q3 2020 earnings call?
Company - Apple, Quarter - Q3, Year - 2020
How has the growth in Q1 been for the consumer market as seen by AMD?
Company - AMD, Quarter - Q1, Year - none
What was the long term view on GOOGL's cloud business growth as discussed in their earnings call?
Company - Google, Quarter - none, Year - none
What is Nvidia's outlook in the data center business in Q3 2020?
Company - Nvidia, Quarter - Q3, Year - 2020
What are the expansion plans of Amazon in the Asia Pacific region as discussed in their earnings call?
Company - Amazon, Quarter - none, Year - none
What did the Analysts ask about CSCO's cybersecurity business in the earnings call in 2016?
Company - Cisco, Quarter - none, Year - 2016
{query}
### Response:"""
def _entity_value(field):
    """Returns the value part of a "Label - Value" field, or None if absent or "none"."""
    parts = field.split(" - ")
    if len(parts) < 2:
        return None
    value = parts[1].strip()
    return None if value.lower() == "none" else value


def format_entities_flan_alpaca(values):
    """
    Extracts the text for each entity from the output generated by the
    Flan-Alpaca model.

    The expected input has the form
    "Company - Value, Quarter - Value, Year - Value".

    Args:
        values: The raw model output.

    Returns:
        A ``(company, quarter, year)`` tuple. The company is lowercased.
        An entity is None when its value is missing or is "none" in any
        letter case. All three are None when the input is not a string or
        does not split into exactly three ", "-separated fields.
    """
    try:
        company_string, quarter_string, year_string = values.split(", ")
    except (AttributeError, ValueError):
        # Not a string, or not exactly three fields: nothing can be trusted.
        company = quarter = year = None
    else:
        company = _entity_value(company_string)
        if company is not None:
            company = company.lower()
        quarter = _entity_value(quarter_string)
        year = _entity_value(year_string)
    print((company, quarter, year))
    return company, quarter, year
def extract_quarter_year(string):
    """
    Extracts a fiscal quarter and a four-digit year from free text.

    Args:
        string: The text to search.

    Returns:
        A ``(quarter, year)`` tuple. ``quarter`` is "Q1".."Q4" (matched
        case-insensitively and normalised to an uppercase "Q"), or None.
        ``year`` is the first standalone four-digit number as a string,
        or None.
    """
    # The lookarounds stop the match from landing inside a longer digit run,
    # e.g. "15000" must not be read as the year "1500".
    year_match = re.search(r"(?<!\d)\d{4}(?!\d)", string)
    year = year_match.group() if year_match else None

    # Only Q1-Q4 are real quarters; the trailing lookahead rejects "Q10".
    quarter_match = re.search(r"\bQ([1-4])(?!\d)", string, re.IGNORECASE)
    quarter = f"Q{quarter_match.group(1)}" if quarter_match else None

    return quarter, year
def extract_ticker_spacy(query, model):
    """
    Finds an organisation name in a query using a named-entity model.

    Args:
        query: The text to analyse.
        model: A callable that takes the query and returns an object with
            an ``ents`` iterable; each entity exposes ``label_`` and
            ``text`` (presumably a spaCy pipeline; verify against caller).

    Returns:
        The lowercased text of the entity labelled "ORG", or None when no
        such entity exists. If several entities share a label, the last
        one wins. The set of labels found is printed as a side effect.
    """
    text_by_label = {}
    for ent in model(query).ents:
        text_by_label[ent.label_] = ent.text
    print(text_by_label.keys())
    if "ORG" not in text_by_label:
        return None
    return text_by_label["ORG"].lower()
def clean_entities(company, quarter, year):
    """
    Converts extracted entities into positions within fixed choice lists.

    Args:
        company: A lowercase company name (e.g. "apple"), or None.
        quarter: A quarter label such as "Q1", or None.
        year: A year string such as "2020", or None.

    Returns:
        A ``(ticker_index, quarter_index, year_index)`` tuple. An unknown
        or missing company maps to index 0; an unknown or missing quarter
        or year maps to the last entry ("All") of its list.
    """
    tickers = [
        "AAPL",
        "CSCO",
        "MSFT",
        "ASML",
        "NVDA",
        "GOOGL",
        "MU",
        "INTC",
        "AMZN",
        "AMD",
    ]
    ticker_by_company = {
        "apple": "AAPL",
        "amd": "AMD",
        "amazon": "AMZN",
        "cisco": "CSCO",
        "google": "GOOGL",
        "microsoft": "MSFT",
        "nvidia": "NVDA",
        "asml": "ASML",
        "intel": "INTC",
        "micron": "MU",
    }
    years = ["2020", "2019", "2018", "2017", "2016", "All"]
    quarters = ["Q1", "Q2", "Q3", "Q4", "All"]

    # None is never a key, so a missing company falls through to the default.
    symbol = ticker_by_company.get(company)
    ticker_index = 0 if symbol is None else tickers.index(symbol)

    # "All" is the final entry of each list and doubles as the fallback.
    quarter_index = (
        quarters.index(quarter) if quarter in quarters else len(quarters) - 1
    )
    year_index = years.index(year) if year in years else len(years) - 1

    return ticker_index, quarter_index, year_index