hra committed on
Commit
b37597c
1 Parent(s): 9cd95d7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -22
app.py CHANGED
@@ -30,6 +30,9 @@ def getstuff(openapikey,category_selector,ticker_input,user_question):
30
  print(dateforfilesave)
31
  if openapikey=='':
32
  return ["Please provide OpenAPI Key","Please provide OpenAPI Key","Please provide OpenAPI Key","Please provide OpenAPI Key","Please provide OpenAPI Key","Please provide OpenAPI Key","Please provide OpenAPI Key",]
 
 
 
33
 
34
  os.environ['OPENAI_API_KEY'] = str(openapikey)
35
 
@@ -43,7 +46,6 @@ def getstuff(openapikey,category_selector,ticker_input,user_question):
43
  num_filings_needed=1
44
  filings_temp=get_filing_urls_to_download(category_selector, ticker_input,num_filings_to_download=num_filings_needed,include_amends=False,before_date='2023-04-01',after_date='2022-01-01')
45
  files=[filings_temp[i].full_submission_url for i in range(len(filings_temp))]
46
- print('Came here1')
47
  filetextcontentlist=[]
48
  for each in files:
49
  headers = {
@@ -52,39 +54,26 @@ def getstuff(openapikey,category_selector,ticker_input,user_question):
52
  "Host": "www.sec.gov",
53
  }
54
  resp=requests.get(each,headers=headers)
55
- raw_10k = resp.text
56
  print('Came here2')
57
- # Regex to find <DOCUMENT> tags
58
  doc_start_pattern = re.compile(r'<DOCUMENT>')
59
  doc_end_pattern = re.compile(r'</DOCUMENT>')
60
- # Regex to find the <TYPE> tag preceding any characters, terminating at the new line
61
  type_pattern = re.compile(r'<TYPE>[^\n]+')
 
 
 
62
 
63
- # Create 3 lists with the span indices for each regex
64
-
65
- ### There are many <Document> Tags in this text file, each as specific exhibit like 10-K, EX-10.17 etc
66
- ### First filter will give us document tag start <end> and document tag end's <start>
67
- ### We will use this to later grab content in between these tags
68
- doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
69
- doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]
70
-
71
- ### The type filter is interesting: it looks for <TYPE> followed by a negated new-line class, i.e. terminate there, with a + sign
72
- ### to look for any char afterwards until new line \n. This will give us <TYPE> followed Section Name like '10-K'
73
- ### Once we have this, it returns a string array; the line below will find the content after <TYPE>, i.e. '10-K',
74
- ### as section names
75
- doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]
76
 
77
  document = {}
78
-
79
- # Create a loop to go through each section type and save only the 10-K section in the dictionary
80
  for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
81
  if doc_type == category_selector:
82
- document[doc_type] = raw_10k[doc_start:doc_end]
83
  item_content = BeautifulSoup(document[category_selector], 'lxml')
84
 
85
  filetextcontentlist.append(str(item_content.text.encode('ascii','ignore')))
86
 
87
- print('Came here3')
88
  temp=". ".join(filetextcontentlist).replace('\xa024',' ')
89
  temp=temp.replace('\n',' ').strip()
90
  temp=temp.split('.')
@@ -94,7 +83,6 @@ def getstuff(openapikey,category_selector,ticker_input,user_question):
94
  newlist.append(each)
95
  documents=[Document(t) for t in newlist]
96
  index = GPTSimpleVectorIndex.from_documents(documents)
97
- print('Came here4')
98
  querylist=['What are the main products/ services mentioned?','What are the major risks?',"What are the top investment focus areas?","What is the financial outlook of the company?","What key technologies like AI, blockchain etc are mentioned?","What other company names/ competitors are mentioned?"]
99
  if user_question=='':
100
  querylist.append('What is the key summary?')
 
30
  print(dateforfilesave)
31
  if openapikey=='':
32
  return ["Please provide OpenAPI Key","Please provide OpenAPI Key","Please provide OpenAPI Key","Please provide OpenAPI Key","Please provide OpenAPI Key","Please provide OpenAPI Key","Please provide OpenAPI Key",]
33
+
34
+ if ticker_input=='':
35
+ return ["Please enter Ticker","Please enter Ticker","Please enter Ticker","Please enter Ticker","Please enter Ticker","Please enter Ticker","Please enter Ticker",]
36
 
37
  os.environ['OPENAI_API_KEY'] = str(openapikey)
38
 
 
46
  num_filings_needed=1
47
  filings_temp=get_filing_urls_to_download(category_selector, ticker_input,num_filings_to_download=num_filings_needed,include_amends=False,before_date='2023-04-01',after_date='2022-01-01')
48
  files=[filings_temp[i].full_submission_url for i in range(len(filings_temp))]
 
49
  filetextcontentlist=[]
50
  for each in files:
51
  headers = {
 
54
  "Host": "www.sec.gov",
55
  }
56
  resp=requests.get(each,headers=headers)
57
+ rawfile = resp.text
58
  print('Came here2')
59
+ # Find text between <DOCUMENT> tags
60
  doc_start_pattern = re.compile(r'<DOCUMENT>')
61
  doc_end_pattern = re.compile(r'</DOCUMENT>')
 
62
  type_pattern = re.compile(r'<TYPE>[^\n]+')
63
+
64
+ doc_start_is = [tmp.end() for tmp in doc_start_pattern.finditer(rawfile)]
65
+ doc_end_is = [tmp.start() for tmp in doc_end_pattern.finditer(rawfile)]
66
 
67
+ doc_types = [tmp[len('<TYPE>'):] for tmp in type_pattern.findall(rawfile)]
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  document = {}
 
 
70
  for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
71
  if doc_type == category_selector:
72
+ document[doc_type] = rawfile[doc_start:doc_end]
73
  item_content = BeautifulSoup(document[category_selector], 'lxml')
74
 
75
  filetextcontentlist.append(str(item_content.text.encode('ascii','ignore')))
76
 
 
77
  temp=". ".join(filetextcontentlist).replace('\xa024',' ')
78
  temp=temp.replace('\n',' ').strip()
79
  temp=temp.split('.')
 
83
  newlist.append(each)
84
  documents=[Document(t) for t in newlist]
85
  index = GPTSimpleVectorIndex.from_documents(documents)
 
86
  querylist=['What are the main products/ services mentioned?','What are the major risks?',"What are the top investment focus areas?","What is the financial outlook of the company?","What key technologies like AI, blockchain etc are mentioned?","What other company names/ competitors are mentioned?"]
87
  if user_question=='':
88
  querylist.append('What is the key summary?')