hra committed on
Commit
db1003c
1 Parent(s): 8517284

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -0
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader, LLMPredictor, Document,ServiceContext
2
+ from langchain.llms import OpenAIChat
3
+ from llama_index import download_loader
4
+ from langchain.chains import LLMChain, TransformChain, SimpleSequentialChain
5
+ from langchain.prompts import PromptTemplate
6
+ from langchain.agents import initialize_agent, Tool,load_tools
7
+ from langchain.chat_models import ChatOpenAI
8
+
9
+ import gradio as gr
10
+ import pandas as pd
11
+ import openai
12
+
13
+ import datetime
14
+ from datetime import datetime, date, time, timedelta
15
+ import os
16
+ import regex
17
+ import requests
18
+ import json
19
+ from sec_edgar_downloader._utils import get_filing_urls_to_download
20
+
21
# SEC filing form types offered in the UI dropdown; getstuff() maps each to
# how many past filings it downloads.
listofcategories=["10-K", "10-Q","8-K"]
24
def _extract_10k_text(raw_filing):
    """Return the plain text of the 10-K <DOCUMENT> section of a raw EDGAR submission.

    EDGAR full-submission files bundle several exhibits, each wrapped in
    <DOCUMENT>...</DOCUMENT> tags and labelled by a "<TYPE>..." line. This
    locates the section whose type is '10-K', strips its HTML, and returns
    the ASCII-only text. Raises KeyError if the filing has no 10-K section.
    """
    # Function-scope import: the original code called BeautifulSoup without
    # ever importing it, which raised NameError at runtime.
    from bs4 import BeautifulSoup

    # `regex` is a drop-in superset of `re`; the original called `re.compile`
    # but the file only imports `regex`, so `re` was an undefined name.
    doc_start_pattern = regex.compile(r'<DOCUMENT>')
    doc_end_pattern = regex.compile(r'</DOCUMENT>')
    # <TYPE> followed by anything up to the newline, e.g. "<TYPE>10-K".
    type_pattern = regex.compile(r'<TYPE>[^\n]+')

    # Content spans: from each <DOCUMENT> tag's end to the matching
    # </DOCUMENT> tag's start, paired positionally with the <TYPE> labels.
    doc_start_is = [m.end() for m in doc_start_pattern.finditer(raw_filing)]
    doc_end_is = [m.start() for m in doc_end_pattern.finditer(raw_filing)]
    doc_types = [s[len('<TYPE>'):] for s in type_pattern.findall(raw_filing)]

    document = {}
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_filing[doc_start:doc_end]

    item_content = BeautifulSoup(document['10-K'], 'lxml')
    return str(item_content.text.encode('ascii', 'ignore'))


def getstuff(openapikey, category_selector, ticker_input):
    """Answer a fixed set of questions about a company's latest SEC filing.

    Downloads the most recent filing of the chosen type from EDGAR, indexes
    its 10-K section text with llama-index, and asks four canned questions.

    Parameters:
        openapikey: OpenAI API key entered by the user ('' triggers an error).
        category_selector: one of listofcategories ('10-K', '10-Q', '8-K').
        ticker_input: US stock ticker whose filings are fetched from EDGAR.

    Returns:
        A list of four answer strings, one per Gradio output textbox.
    """
    dateforfilesave = datetime.today().strftime("%d-%m-%Y %I:%M%p")
    print(ticker_input)
    print(dateforfilesave)

    if openapikey == '':
        # All four outputs are Textboxes, so return plain error strings.
        # (The original returned DataFrames for the first two outputs, which
        # a Textbox cannot render.)
        error = 'Error: Please provide OpenAPI key'
        return [error, error, error, error]

    os.environ['OPENAI_API_KEY'] = str(openapikey)

    # How many past filings to request per form type (default 1).
    num_filings_needed = {'10-K': 1, '8-K': 8, '10-Q': 4}.get(category_selector, 1)

    filings_temp = get_filing_urls_to_download(
        category_selector, ticker_input,
        num_filings_to_download=num_filings_needed,
        include_amends=False,
        before_date='2023-04-01', after_date='2022-01-01')
    # Only the most recent filing is actually analysed.
    files = [filings_temp[0].full_submission_url]
    print('Came here1')

    filetextcontentlist = []
    for each in files:
        # SEC requires a descriptive User-Agent on automated requests.
        headers = {
            "User-Agent": '[email protected]',
            "Accept-Encoding": "gzip, deflate",
            "Host": "www.sec.gov",
        }
        resp = requests.get(each, headers=headers)
        print('Came here2')
        filetextcontentlist.append(_extract_10k_text(resp.text))

    print('Came here3')
    # Flatten to sentences and keep only substantial ones (more than 8 words)
    # to cut indexing noise and token cost.
    temp = ". ".join(filetextcontentlist).replace('\xa024', ' ')
    temp = temp.replace('\n', ' ').strip()
    newlist = [each for each in temp.split('.') if len(each.split()) > 8]

    documents = [Document(t) for t in newlist]
    index = GPTSimpleVectorIndex.from_documents(documents)
    print('Came here4')

    querylist = ['What are the main products/ services mentioned?',
                 'What are the major risks?',
                 "What are the top investment focus areas?",
                 "What is the financial outlook of the company?"]

    # NOTE: the original also built an unused ChatOpenAI instance and an
    # unused ServiceContext; both were dead locals and have been removed.
    llm_predictor = LLMPredictor(llm=OpenAIChat(temperature=0, model_name="gpt-3.5-turbo"))

    answerlist = []
    for i, query in enumerate(querylist):
        print(i, "Query: ", query)
        response = index.query(
            query,
            llm_predictor=llm_predictor,
            response_mode="tree_summarize",
            similarity_top_k=min(int(len(documents) / 3), 20)
        )
        print(response.response)
        if 'dataframe' in query:
            # Dead branch for the current querylist; kept for queries that
            # request a JSON table: grab the first balanced {...} (recursive
            # pattern, needs the `regex` module) and parse it into a frame.
            try:
                pattern = regex.compile(r'\{(?:[^{}]|(?R))*\}')
                jsonextract = pattern.findall(response.response)[0]
                df_tmp = pd.read_json(jsonextract)
                if len(df_tmp.columns) <= 1:
                    df = pd.DataFrame(df_tmp[df_tmp.columns[0]].tolist())
                else:
                    df = df_tmp
            except (IndexError, ValueError):
                # No JSON found in the answer, or it was unparsable.
                df = pd.DataFrame()
                df['message'] = ['Data insufficient to decipher']
                df['action'] = ['try again in a few hours']
            answerlist.append(df)
        else:
            answerlist.append(response.response)

    print('Came to return statement')
    return answerlist
133
+
134
# --- Gradio UI --------------------------------------------------------------
# Layout: a header, an input row (filing-type dropdown + ticker, API key,
# button) and two output rows with four textboxes that receive the four
# answers produced by getstuff().
with gr.Blocks() as demo:
    gr.Markdown("<h1><center>ChatGPT SEC Filings Question Answers</center></h1>")
    gr.Markdown(
        """What are the products & services? What are the risks? What is the outlook? and much more. \n\nThis is a demo & showcases ChatGPT integrated with real data. It shows how to get real-time data and marry it with ChatGPT capabilities. This demonstrates 'Chain of Thought' thinking using ChatGPT.\n\n4 snapshots are provided for illustration (trends, sector outlook, news summary email, macro trends email)\n\nNote: llama-index & gpt-3.5-turbo are used. The analysis takes roughly 120 secs & may not always be consistent. If ChatGPT API is overloaded you will get an error\n ![visitors](https://visitor-badge.glitch.me/badge?page_id=hra.chatgpt-stock-news-snapshots)"""
    )

    # Input row: filing category + ticker, the API key, and the submit button.
    with gr.Row() as row:
        with gr.Column():
            category_selector=gr.Dropdown(
                listofcategories, label="Filing Categories", info="Select the filing you want..."
            )
            input1 = gr.Textbox(placeholder='Enter ticker (USA only)', lines=1,label='Ticker')
        with gr.Column():
            textboxopenapi = gr.Textbox(placeholder="Enter OpenAPI Key...", lines=1,label='OpenAPI Key')

        with gr.Column():
            btn = gr.Button("Generate \nAnswers")

    # Output rows: four textboxes, filled in order by getstuff()'s answers.
    with gr.Row() as row:
        with gr.Column():
            output1 = gr.Textbox(placeholder='', lines=4,label='Snapshot 1')
        with gr.Column():
            output2 = gr.Textbox(placeholder='', lines=4,label='Snapshot 2')
    with gr.Row() as row:
        with gr.Column():
            output3 = gr.Textbox(placeholder='', lines=4,label='Snapshot 3')
        with gr.Column():
            output4 = gr.Textbox(placeholder='', lines=4,label='Snapshot 4')

    # Wire the button: getstuff(api_key, category, ticker) -> four answers.
    btn.click(getstuff, inputs=[textboxopenapi,category_selector,input1],outputs=[output1,output2,output3,output4])


demo.launch(debug=True)