# Spaces: awinml/2-qa-earnings-sentencewise
# Build error
# File size: 6,903 Bytes

import re

from nltk.stem import PorterStemmer, WordNetLemmatizer

# Keyword Extraction


def expand_list_of_lists(list_of_lists):
    """
    Flattens one level of nesting.
    Args:
      list_of_lists: An iterable whose items are themselves iterables
        (typically lists of strings).
    Returns:
      A new list holding every inner item, in the original order.
    """

    return [item for group in list_of_lists for item in group]


def keywords_no_companies(texts):
    """
    Splits keyword phrases into lowercase words and removes company names.

    Args:
      texts: An iterable of strings (keyword phrases).
    Returns:
      A list of lowercase words in input order, excluding every word that is
      a known company name or ticker symbol. Empty tokens are never returned.
    """
    # Company names and ticker symbols to remove from the extracted entities.
    # A set gives constant-time membership tests and ignores duplicates.
    company_names = {
        "apple",
        "amd",
        "amazon",
        "cisco",
        "google",
        "microsoft",
        "nvidia",
        "asml",
        "intel",
        "micron",
        "aapl",
        "csco",
        "msft",
        "nvda",
        "googl",
        "mu",
        "intc",
        "amzn",
    }

    keywords = []
    for text in texts:
        # split() without a separator collapses runs of whitespace and drops
        # leading/trailing whitespace, so no empty-string "keywords" and no
        # words with a newline attached are produced.
        for word in text.split():
            word = word.lower()
            if word not in company_names:
                keywords.append(word)
    return keywords


def all_keywords_combs(texts):
    """
    Builds a list of keyword variants.

    Each input string is split on single spaces. The returned list holds the
    words as given, followed by their lowercase forms, their Porter stems and
    their WordNet lemmas, in that order.
    Args:
      texts: A list of strings.
    Returns:
      A list of strings made of four consecutive segments of equal length
      (original words, lowercase, stems, lemmas).
    """
    words = expand_list_of_lists([text.split(" ") for text in texts])

    lowered = [word.lower() for word in words]

    # Stems and lemmas are derived from the words as given, not from the
    # lowercased copies.
    porter = PorterStemmer()
    stems = [porter.stem(word) for word in words]

    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(word) for word in words]

    return words + lowered + stems + lemmas


def extract_keywords(query_text, model):
    """
    Asks the model for the key entities of a query and returns them as keywords.

    Args:
      query_text: The text whose key entities should be extracted.
      model: An object exposing predict(prompt) that returns a string; the
        response is split on ", ".
    Returns:
      The list returned by keywords_no_companies for the split response.
    """
    # The prompt has to be an f-string: as a plain string the literal text
    # "{query_text}" was sent to the model and the query itself was ignored.
    prompt = f"###Instruction: Identify the key entities that accurately describe the context.\n\nInput:{query_text}\n\n###Response:"
    # Alternative prompt wording:
    # prompt = f"###Instruction:Extract the important keywords which describe the context accurately.\n\nInput:{query_text}\n\n###Response:"
    response = model.predict(prompt)
    keywords = response.split(", ")
    return keywords_no_companies(keywords)


# Entity Extraction


def generate_alpaca_ner_prompt(query):
    """
    Builds the few-shot prompt used to extract Company, Quarter and Year.

    The prompt consists of a fixed block (task description, output rules and
    six worked examples) followed by the query and a "### Response:" marker.
    Args:
      query: The sentence to extract entities from.
    Returns:
      The complete prompt string.
    """
    # Fixed part of the prompt; it contains no placeholders.
    instructions_and_examples = """Below is an instruction that describes a task, paired with an input that provides further context. Use the following guidelines to extract the entities representing the Company, Quarter, and Year in the sentence.

### Instruction:
- The output should be in the form "Company - Value, Quarter - Value, Year - Value".
- The output should be in the form "Company - None, Quarter - None, Year - None", if no entities are found.
- Only use entities that exist in the final sentence.
- If Company cannot be found in the sentence, return "none" for that entity.
- If Quarter cannot be found in the sentence, return "none" for that entity.
- If Year cannot be found in the sentence, return "none" for that entity.
- If there is ambiguity finding the entity, return "none" for that entity.

### Input:

What was discussed regarding Services revenue performance in Apple's Q3 2020 earnings call?
Company - Apple, Quarter - Q3, Year - 2020

How has the growth in Q1 been for the consumer market as seen by AMD?
Company - AMD, Quarter - Q1, Year - none

What was the long term view on GOOGL's cloud business growth as discussed in their earnings call?
Company - Google, Quarter - none, Year - none

What is Nvidia's outlook in the data center business in Q3 2020?
Company - Nvidia, Quarter - Q3, Year - 2020

What are the expansion plans of Amazon in the Asia Pacific region as discussed in their earnings call?
Company - Amazon, Quarter - none, Year - none

What did the Analysts ask about CSCO's cybersecurity business in the earnings call in 2016?
Company - Cisco, Quarter - none, Year - 2016


"""
    # The query goes last so the model completes right after the marker.
    return f"{instructions_and_examples}{query}\n### Response:"


def _parse_entity_field(field):
    """Returns the value of a "Label - Value" field, or None if absent/"none"."""
    parts = field.split(" - ")
    if len(parts) < 2:
        return None
    # Strip stray whitespace (e.g. a trailing newline from the model) so the
    # value can be compared against fixed choices later on.
    value = parts[1].strip()
    if not value or value.lower() == "none":
        return None
    return value


def format_entities_flan_alpaca(values):
    """
    Extracts the text for each entity from the output generated by the
    Flan-Alpaca model.

    Args:
      values: Model output expected in the form
        "Company - Value, Quarter - Value, Year - Value".
    Returns:
      A (company, quarter, year) tuple. The company is lowercased. An entity
      is None when its value is missing, empty or "none" (any casing). All
      three are None when the output is not a string made of exactly three
      ", "-separated fields.
    """
    company = quarter = year = None

    # Anything that is not a three-field string is treated as "no entities"
    # explicitly, instead of relying on swallowed exceptions.
    if isinstance(values, str):
        fields = values.split(", ")
        if len(fields) == 3:
            company, quarter, year = (_parse_entity_field(f) for f in fields)
            if company is not None:
                company = company.lower()

    print((company, quarter, year))
    return company, quarter, year


def extract_quarter_year(string):
    """
    Finds a quarter and a year in a string.

    Args:
      string: The text to search.
    Returns:
      A (quarter, year) tuple. quarter is the first "Q" followed by a digit,
      year is the first run of four digits; each is None when not found.
    """
    found_year = re.search(r"\d{4}", string)
    found_quarter = re.search(r"Q\d", string)

    year = None if found_year is None else found_year.group()
    # The match is always "Q" plus one digit, so it is returned unchanged.
    quarter = None if found_quarter is None else found_quarter.group()

    return quarter, year


def extract_ticker_spacy(query, model):
    """
    Returns the lowercased text of an "ORG" entity found in the query.

    Args:
      query: The text to analyse.
      model: A callable returning an object whose `ents` items expose
        `label_` and `text`.
    Returns:
      The lowercased text of the "ORG" entity, or None if there is none.
      When several entities share the "ORG" label, the last one is used.
    """
    doc = model(query)

    # One entry per label; a later entity overwrites an earlier one.
    text_by_label = {}
    for ent in doc.ents:
        text_by_label[ent.label_] = ent.text
    print(text_by_label.keys())

    if "ORG" not in text_by_label:
        return None
    return text_by_label["ORG"].lower()


def clean_entities(company, quarter, year):
    """
    Converts extracted entities into indices of the ticker, quarter and year
    choice lists.

    Args:
      company: A company name (e.g. "apple") or ticker symbol (e.g. "msft"),
        matched case-insensitively, or None.
      quarter: One of "Q1".."Q4" or "All", or None.
      year: One of "2016".."2020" or "All" (as a string), or None.
    Returns:
      A (ticker_index, quarter_index, year_index) tuple. An unknown or missing
      company maps to index 0; an unknown or missing quarter or year maps to
      the last entry ("All") of its list.
    """
    company_ticker_map = {
        "apple": "AAPL",
        "amd": "AMD",
        "amazon": "AMZN",
        "cisco": "CSCO",
        "google": "GOOGL",
        "microsoft": "MSFT",
        "nvidia": "NVDA",
        "asml": "ASML",
        "intel": "INTC",
        "micron": "MU",
    }

    ticker_choice = [
        "AAPL",
        "CSCO",
        "MSFT",
        "ASML",
        "NVDA",
        "GOOGL",
        "MU",
        "INTC",
        "AMZN",
        "AMD",
    ]
    year_choice = ["2020", "2019", "2018", "2017", "2016", "All"]
    quarter_choice = ["Q1", "Q2", "Q3", "Q4", "All"]

    ticker_index = 0
    if isinstance(company, str):
        name = company.strip().lower()
        # Accept a ticker symbol as well as a company name, so that "msft"
        # or "Google" no longer fall back to the first ticker.
        ticker = company_ticker_map.get(name, name.upper())
        if ticker in ticker_choice:
            ticker_index = ticker_choice.index(ticker)

    if quarter in quarter_choice:
        quarter_index = quarter_choice.index(quarter)
    else:
        quarter_index = len(quarter_choice) - 1

    if year in year_choice:
        year_index = year_choice.index(year)
    else:
        year_index = len(year_choice) - 1

    return ticker_index, quarter_index, year_index