Spaces:

awinml
/

2-qa-earnings-sentencewise

Build error

App Files Files Community

2-qa-earnings-sentencewise / utils /entity_extraction.py

awinml

Upload 17 files (#18)

eb6952b over 1 year ago

raw

history blame

6.9 kB

	import re

	from nltk.stem import PorterStemmer, WordNetLemmatizer

	# Keyword Extraction


	def expand_list_of_lists(list_of_lists):
	    """
	    Flatten one level of nesting.

	    Args:
	        list_of_lists: A list of lists of strings.
	    Returns:
	        A new list holding every inner string, in the order encountered.
	    """
	    return [item for group in list_of_lists for item in group]


	def keywords_no_companies(texts):
	    """
	    Split keyword phrases into lowercase words and drop company names.

	    Args:
	        texts: An iterable of strings (keyword phrases).
	    Returns:
	        A list of lowercase words, in input order, excluding every word
	        that is one of the known company names or ticker symbols.
	    """
	    # Company names and tickers to remove from the extracted entities.
	    # A frozenset gives constant-time membership tests and drops the
	    # duplicate entries the original list carried.
	    company_names = frozenset(
	        (
	            "apple",
	            "amd",
	            "amazon",
	            "cisco",
	            "google",
	            "microsoft",
	            "nvidia",
	            "asml",
	            "intel",
	            "micron",
	            "aapl",
	            "csco",
	            "msft",
	            "nvda",
	            "googl",
	            "mu",
	            "intc",
	            "amzn",
	        )
	    )

	    keywords = []
	    for text in texts:
	        # split() without an argument splits on any whitespace run, so
	        # repeated, leading or trailing spaces never produce empty-string
	        # keywords (split(" ") did).
	        for word in text.split():
	            lowered = word.lower()
	            if lowered not in company_names:
	                keywords.append(lowered)
	    return keywords


	def all_keywords_combs(texts):
	    """
	    Expand keyword phrases into words plus their normalized variants.

	    Args:
	        texts: An iterable of strings; each one is split on single spaces.
	    Returns:
	        One list made of four consecutive segments: the words as given,
	        their lowercase forms, their Porter stems and their WordNet
	        lemmas. Duplicates are not removed.
	    """
	    words = [word for text in texts for word in text.split(" ")]

	    # Lowercase variant of every word.
	    lowered = [word.lower() for word in words]

	    # Stemmed variant of every word.
	    porter = PorterStemmer()
	    stems = list(map(porter.stem, words))

	    # Lemmatized variant of every word.
	    # NOTE(review): lemmatization is applied to the words as given, not to
	    # the lowercased forms - confirm that this is intended.
	    wordnet = WordNetLemmatizer()
	    lemmas = list(map(wordnet.lemmatize, words))

	    return words + lowered + stems + lemmas


	def extract_keywords(query_text, model):
	    """
	    Ask `model` for the key entities in `query_text` and return keywords.

	    Args:
	        query_text: The text inserted into the prompt as the input.
	        model: An object whose `predict(prompt)` returns a string of
	            ", "-separated entities.
	    Returns:
	        The result of `keywords_no_companies` applied to the pieces of the
	        model response.
	    """
	    # The prompt must be an f-string: without the prefix the literal text
	    # "{query_text}" was sent to the model and the query was never used.
	    prompt = f"###Instruction: Identify the key entities that accurately describe the context.\n\nInput:{query_text}\n\n###Response:"
	    # Alternative prompt wording kept for reference:
	    #prompt = f"###Instruction:Extract the important keywords which describe the context accurately.\n\nInput:{query_text}\n\n###Response:"
	    response = model.predict(prompt)
	    keywords = response.split(", ")
	    return keywords_no_companies(keywords)


	# Entity Extraction


	def generate_alpaca_ner_prompt(query):
	# Build the prompt used to extract the Company, Quarter and Year entities
	# from a question: a fixed instruction section, six worked
	# question/answer examples, then `query` followed by the "### Response:"
	# marker. Returns the prompt as a single string.
	# NOTE(review): the guidelines spell the missing value both as "None" and
	# as "none", so whatever parses the answer should compare it
	# case-insensitively.
	prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Use the following guidelines to extract the entities representing the Company, Quarter, and Year in the sentence.

	### Instruction:
	- The output should be in the form "Company - Value, Quarter - Value, Year - Value".
	- The output should be in the form "Company - None, Quarter - None, Year - None", if no entities are found.
	- Only use entities that exist in the final sentence.
	- If Company cannot be found in the sentence, return "none" for that entity.
	- If Quarter cannot be found in the sentence, return "none" for that entity.
	- If Year cannot be found in the sentence, return "none" for that entity.
	- If there is ambiguity finding the entity, return "none" for that entity.

	### Input:

	What was discussed regarding Services revenue performance in Apple's Q3 2020 earnings call?
	Company - Apple, Quarter - Q3, Year - 2020

	How has the growth in Q1 been for the consumer market as seen by AMD?
	Company - AMD, Quarter - Q1, Year - none

	What was the long term view on GOOGL's cloud business growth as discussed in their earnings call?
	Company - Google, Quarter - none, Year - none

	What is Nvidia's outlook in the data center business in Q3 2020?
	Company - Nvidia, Quarter - Q3, Year - 2020

	What are the expansion plans of Amazon in the Asia Pacific region as discussed in their earnings call?
	Company - Amazon, Quarter - none, Year - none

	What did the Analysts ask about CSCO's cybersecurity business in the earnings call in 2016?
	Company - Cisco, Quarter - none, Year - 2016


	{query}
	### Response:"""
	return prompt


	def _entity_value(field):
	    """Return the value of a "Label - Value" field, or None if absent/"none"."""
	    parts = field.split(" - ")
	    if len(parts) < 2:
	        return None
	    value = parts[1].strip()
	    if not value or value.lower() == "none":
	        return None
	    return value


	def format_entities_flan_alpaca(values):
	    """
	    Extracts the text for each entity from the output generated by the
	    Flan-Alpaca model.

	    Args:
	        values: Model output, expected in the form
	            "Company - Value, Quarter - Value, Year - Value".
	    Returns:
	        A `(company, quarter, year)` tuple. The company is lowercased. An
	        entity is None when its value is missing or is "none" (any case).
	        All three are None when `values` is not a string made of exactly
	        three ", "-separated fields.
	    """
	    company = quarter = year = None
	    try:
	        # Surrounding whitespace (e.g. a trailing newline from the model)
	        # must not end up inside the last value.
	        company_string, quarter_string, year_string = values.strip().split(", ")
	    except (AttributeError, TypeError, ValueError):
	        # Not a string, or not exactly three fields: report no entities
	        # explicitly. The original reached this result only because bare
	        # excepts swallowed the NameError on the unbound *_string names.
	        pass
	    else:
	        company = _entity_value(company_string)
	        if company is not None:
	            company = company.lower()
	        quarter = _entity_value(quarter_string)
	        year = _entity_value(year_string)

	    print((company, quarter, year))
	    return company, quarter, year


	def extract_quarter_year(string):
	    """
	    Pull a quarter label and a four-digit year out of free text.

	    Args:
	        string: The text to search.
	    Returns:
	        A `(quarter, year)` tuple of strings. `quarter` is the first
	        "Q1".."Q4" found and `year` is the first run of exactly four
	        digits; each is None when nothing matches.
	    """
	    # Exactly four digits: the look-arounds stop the pattern from slicing
	    # a "year" out of a longer number such as an order id.
	    year_match = re.search(r"(?<!\d)\d{4}(?!\d)", string)
	    year = year_match.group() if year_match else None

	    # Only Q1-Q4 are real quarters; the look-ahead rejects e.g. "Q12".
	    quarter_match = re.search(r"Q[1-4](?!\d)", string)
	    quarter = quarter_match.group() if quarter_match else None

	    return quarter, year


	def extract_ticker_spacy(query, model):
	    """
	    Return the lowercase text of an ORG entity that `model` finds in `query`.

	    Args:
	        query: The text to analyse.
	        model: A callable returning an object whose `ents` items expose
	            `label_` and `text`.
	    Returns:
	        The lowercased text of the last entity labelled "ORG", or None when
	        there is no such entity.
	    """
	    parsed = model(query)

	    by_label = {}
	    for entity in parsed.ents:
	        # A later entity overwrites an earlier one that shares its label.
	        by_label[entity.label_] = entity.text
	    print(by_label.keys())

	    if "ORG" not in by_label:
	        return None
	    return by_label["ORG"].lower()


	def _choice_index(value, choices, default):
	    """Return the index of `value` in `choices` (case/space-insensitive), else `default`."""
	    if value is None:
	        return default
	    wanted = str(value).strip().lower()
	    for index, choice in enumerate(choices):
	        if choice.lower() == wanted:
	            return index
	    return default


	def clean_entities(company, quarter, year):
	    """
	    Map extracted entities to positions in the fixed choice lists.

	    Args:
	        company: Company name or ticker symbol, or None.
	        quarter: Quarter label such as "Q3" or "All", or None.
	        year: Year such as "2020" or "All", or None.
	    Returns:
	        A `(ticker_index, quarter_index, year_index)` tuple of indexes into
	        the ticker, quarter and year choice lists defined below. An unknown
	        or missing company maps to 0 (the first ticker); an unknown or
	        missing quarter or year maps to the last entry, "All".
	        NOTE(review): presumably these are default selections for UI
	        widgets - confirm against the caller.
	    """
	    company_ticker_map = {
	        "apple": "AAPL",
	        "amd": "AMD",
	        "amazon": "AMZN",
	        "cisco": "CSCO",
	        "google": "GOOGL",
	        "microsoft": "MSFT",
	        "nvidia": "NVDA",
	        "asml": "ASML",
	        "intel": "INTC",
	        "micron": "MU",
	    }

	    ticker_choice = [
	        "AAPL",
	        "CSCO",
	        "MSFT",
	        "ASML",
	        "NVDA",
	        "GOOGL",
	        "MU",
	        "INTC",
	        "AMZN",
	        "AMD",
	    ]
	    year_choice = ["2020", "2019", "2018", "2017", "2016", "All"]
	    quarter_choice = ["Q1", "Q2", "Q3", "Q4", "All"]

	    ticker = None
	    if company is not None:
	        # Accept a company name in any case, and fall back to treating the
	        # input as a ticker symbol, so "GOOGL" or "Apple" no longer
	        # silently resolve to the default.
	        name = str(company).strip().lower()
	        ticker = company_ticker_map.get(name, name)

	    ticker_index = _choice_index(ticker, ticker_choice, 0)
	    quarter_index = _choice_index(quarter, quarter_choice, len(quarter_choice) - 1)
	    year_index = _choice_index(year, year_choice, len(year_choice) - 1)
	    return ticker_index, quarter_index, year_index