# Spaces: Runtime error
# (Page-header residue captured with this listing; it is not part of the program.)
# NOTE: `import datetime` must stay ahead of `from datetime import datetime`
# so that the bare name `datetime` ends up bound to the class, which is what
# the code below calls (`datetime.today()`).
import datetime
import json
import os
import re
from datetime import datetime, date, time, timedelta

import gradio as gr
import openai
import pandas as pd
import regex
import requests
from bs4 import BeautifulSoup
from langchain.agents import initialize_agent, Tool, load_tools
from langchain.chains import LLMChain, TransformChain, SimpleSequentialChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAIChat
from langchain.prompts import PromptTemplate
from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader, LLMPredictor, Document, ServiceContext
from llama_index import download_loader
from sec_edgar_downloader._utils import get_filing_urls_to_download
# Filing types offered in the "Filing Categories" dropdown.
listofcategories = ["10-K", "10-Q", "8-K"]
# Patterns for the SEC "full submission" text format, where each exhibit is
# wrapped in <DOCUMENT>...</DOCUMENT> and labelled by a <TYPE> line.
_DOC_START_RE = re.compile(r'<DOCUMENT>')
_DOC_END_RE = re.compile(r'</DOCUMENT>')
_DOC_TYPE_RE = re.compile(r'<TYPE>[^\n]+')

# How many recent filings to pull per category; anything else defaults to 1.
_FILINGS_PER_CATEGORY = {'10-K': 1, '10-Q': 2, '8-K': 4}

# The function always returns this many values (six preset questions + one
# user/summary question).
_NUM_OUTPUTS = 7


def _repeat_message(message):
    """Return *message* repeated once per expected output slot."""
    return [message] * _NUM_OUTPUTS


def _find_filing_section(raw_submission, category):
    """Return the body of the <DOCUMENT> whose <TYPE> equals *category*, or None.

    If several documents carry the same type, the last one wins.
    """
    # Start offsets point just past "<DOCUMENT>", end offsets at "</DOCUMENT>",
    # so slicing start:end yields the content between the tags.
    starts = [m.end() for m in _DOC_START_RE.finditer(raw_submission)]
    ends = [m.start() for m in _DOC_END_RE.finditer(raw_submission)]
    types = [t[len('<TYPE>'):] for t in _DOC_TYPE_RE.findall(raw_submission)]

    section = None
    for doc_type, start, end in zip(types, starts, ends):
        # strip() tolerates trailing whitespace / "\r" after the type name.
        if doc_type.strip() == category:
            section = raw_submission[start:end]
    return section


def _to_ascii(text):
    """Return *text* with non-breaking spaces turned into spaces and other non-ASCII dropped."""
    # decode() is essential: str(bytes) would yield the "b'...'" repr with
    # escaped newlines, which breaks the sentence splitting that follows.
    return text.replace('\xa0', ' ').encode('ascii', 'ignore').decode('ascii')


def _split_into_passages(texts, min_words=10):
    """Join *texts*, split on '.', and keep fragments with more than *min_words* words."""
    joined = ". ".join(texts).replace('\n', ' ').strip()
    return [piece for piece in joined.split('.') if len(piece.split()) > min_words]


def _response_to_dataframe(response_text):
    """Best-effort: parse the first JSON object in *response_text* into a DataFrame."""
    try:
        # Recursive pattern (regex module) matching a balanced {...} object.
        pattern = regex.compile(r'\{(?:[^{}]|(?R))*\}')
        jsonextract = pattern.findall(response_text)[0]
        df_tmp = pd.read_json(jsonextract)
        if len(df_tmp.columns) <= 1:
            return pd.DataFrame(df_tmp[df_tmp.columns[0]].tolist())
        return df_tmp
    except Exception:
        # Deliberate fallback: any parsing problem yields a placeholder table.
        return pd.DataFrame({
            'message': ['Data insufficient to decipher'],
            'action': ['try again in a few hours'],
        })


def getstuff(openapikey, category_selector, ticker_input, user_question):
    """Download recent SEC filings for a ticker and answer questions about them.

    The matching filing section is pulled out of each full-submission file,
    reduced to plain ASCII text, split into sentence-like passages, indexed
    with GPTSimpleVectorIndex and queried once per question.

    Args:
        openapikey: OpenAI API key; exported as OPENAI_API_KEY.
        category_selector: Filing type, e.g. "10-K", "10-Q" or "8-K".
        ticker_input: Company ticker to look up.
        user_question: Extra question; when empty, a generic summary
            question is asked instead.

    Returns:
        A list of seven items. Normally each is "<question>\\n\\n<answer>"
        (a DataFrame instead when the question text contains "dataframe").
        When input is missing or no filing text could be retrieved, the
        list holds the same explanatory message seven times.
    """
    dateforfilesave = datetime.today().strftime("%d-%m-%Y %I:%M%p")
    print(ticker_input)
    print(user_question)
    print(dateforfilesave)

    # Guard clauses: validate inputs before touching the environment/network.
    if not str(openapikey or '').strip():
        return _repeat_message("Please provide OpenAPI Key")
    ticker = str(ticker_input or '').strip().upper()
    if not ticker:
        return _repeat_message("Please provide a ticker symbol")
    if not category_selector:
        return _repeat_message("Please select a filing category")
    user_question = user_question or ''

    # NOTE(review): this is process-wide, so concurrent users of the app
    # share whichever key was submitted last.
    os.environ['OPENAI_API_KEY'] = str(openapikey)

    num_filings_needed = _FILINGS_PER_CATEGORY.get(category_selector, 1)
    filings_temp = get_filing_urls_to_download(
        category_selector,
        ticker,
        num_filings_to_download=num_filings_needed,
        include_amends=False,
        before_date='2023-04-01',
        after_date='2022-01-01',
    )
    files = [filing.full_submission_url for filing in filings_temp]
    print('Came here1')

    # NOTE(review): the User-Agent value looks like an obfuscated e-mail
    # placeholder; SEC asks for a real contact address here - confirm.
    headers = {
        "User-Agent": '[email protected]',
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.sec.gov",
    }
    filetextcontentlist = []
    for url in files:
        try:
            resp = requests.get(url, headers=headers, timeout=60)
        except requests.RequestException as exc:
            print('Download failed for', url, exc)
            continue
        print('Came here2')
        if not resp.ok:
            print('Unexpected HTTP status', resp.status_code, 'for', url)
            continue
        section = _find_filing_section(resp.text, category_selector)
        if section is None:
            # No <DOCUMENT> of the requested type in this submission.
            continue
        item_content = BeautifulSoup(section, 'lxml')
        filetextcontentlist.append(_to_ascii(item_content.text))
    print('Came here3')

    newlist = _split_into_passages(filetextcontentlist)
    if not newlist:
        return _repeat_message(
            "No " + str(category_selector) + " filing text could be retrieved for " + ticker
        )

    documents = [Document(passage) for passage in newlist]
    index = GPTSimpleVectorIndex.from_documents(documents)
    print('Came here4')

    querylist = [
        'What are the main products/ services mentioned?',
        'What are the major risks?',
        "What are the top investment focus areas?",
        "What is the financial outlook of the company?",
        "What key technologies like AI, blockchain etc are mentioned?",
        "What other company names/ competitors are mentioned?",
    ]
    querylist.append(user_question if user_question != '' else 'What is the key summary?')

    llm_predictor = LLMPredictor(llm=OpenAIChat(temperature=0, model_name="gpt-3.5-turbo"))
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

    # Use roughly a third of the passages, capped at 20 but never fewer than
    # one (a tiny document set would otherwise ask for zero matches).
    top_k = max(1, min(len(documents) // 3, 20))

    answerlist = []
    for i, query in enumerate(querylist):
        print(i, "Query: ", query)
        response = index.query(
            query,
            service_context=service_context,
            response_mode="tree_summarize",
            similarity_top_k=top_k,
        )
        print(response.response)
        # Presumably response.response may be None when nothing was
        # synthesised; fall back to an empty string so concatenation is safe.
        answer_text = response.response or ''
        if 'dataframe' in query:
            answerlist.append(_response_to_dataframe(answer_text))
        else:
            answerlist.append(query + '\n\n' + answer_text)
    print('Came to return statement')
    return answerlist
# Gradio front end: one input row (filing type + ticker, question + API key,
# submit button) followed by three rows holding six answer "snapshots".
# NOTE(review): the source arrived with its indentation flattened; the
# nesting below is reconstructed from the Row/Column order - confirm layout.
with gr.Blocks() as demo:
    gr.Markdown("<h1><center>ChatGPT SEC Filings Question Answers</center></h1>")
    gr.Markdown(
        """What are the products & services? What are the risks? What is the outlook? and much more. \n\nThis is a demo & showcases ChatGPT integrated with real data. It shows how to get real-time data and marry it with ChatGPT capabilities.\n\nMultiple snapshots/ Question-Answers are provided for illustration (products, risks, focus areas, etc)\n\nNote: llama-index & gpt-3.5-turbo are used. The analysis takes more than 1-3 mins & may not always be consistent. If ChatGPT API is overloaded/ no API key is provided/ API quota is over you will get an error\n ![visitors](https://visitor-badge.glitch.me/badge?page_id=hra.ChatGPT-SEC-Docs-QA)"""
    )

    # --- Inputs ---
    with gr.Row():
        with gr.Column():
            category_selector = gr.Dropdown(
                listofcategories, label="Filing Categories", info="Select the filing you want..."
            )
            input1 = gr.Textbox(placeholder='Enter ticker (USA only)', lines=1, label='Ticker')
        with gr.Column():
            input2 = gr.Textbox(placeholder='Enter your question', lines=1, label='User Question')
            # Masked so the secret key is not shown in clear text on screen.
            textboxopenapi = gr.Textbox(
                placeholder="Enter OpenAPI Key...", lines=1, label='OpenAPI Key', type='password'
            )
        with gr.Column():
            btn = gr.Button("Generate \nAnswers")

    # --- Outputs: six snapshot boxes, two per row ---
    with gr.Row():
        with gr.Column():
            output1 = gr.Textbox(placeholder='', lines=4, label='Snapshot 1')
        with gr.Column():
            output2 = gr.Textbox(placeholder='', lines=4, label='Snapshot 2')
    with gr.Row():
        with gr.Column():
            output3 = gr.Textbox(placeholder='', lines=4, label='Snapshot 3')
        with gr.Column():
            output4 = gr.Textbox(placeholder='', lines=4, label='Snapshot 4')
    with gr.Row():
        with gr.Column():
            output5 = gr.Textbox(placeholder='', lines=4, label='Snapshot 5')
        with gr.Column():
            output6 = gr.Textbox(placeholder='', lines=4, label='Snapshot 6')

    # getstuff returns seven values: the first six fill the snapshot boxes and
    # the seventh is written back into the "User Question" box.
    btn.click(
        getstuff,
        inputs=[textboxopenapi, category_selector, input1, input2],
        outputs=[output1, output2, output3, output4, output5, output6, input2],
    )

demo.launch(debug=True)