Spaces:

nickmuchi
/

fintweet-GPT-Search

Build error

File size: 5,201 Bytes

##Variables

import os
import streamlit as st
import pathlib

from langchain.embeddings import HuggingFaceEmbeddings,HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chat_models.openai import ChatOpenAI
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain import VectorDBQA
import pandas as pd

from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer
from optimum.pipelines import pipeline
import tweepy
import pandas as pd
import numpy as np
import plotly_express as px
import plotly.graph_objects as go
from datetime import datetime as dt
from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode
from datasets import Dataset
from huggingface_hub import Repository

@st.experimental_singleton(suppress_st_warning=True)
def load_models():
    '''Load the sentiment and topic classification pipelines.

    Each pipeline is created for the module-level ``task`` and uses one
    model id for both the model and its tokenizer.

    Returns:
        A ``(sentiment_pipeline, topic_pipeline)`` pair, in that order.
    '''
    # Sentiment is built first, then topic, matching the order of the result.
    sent_pipe, topic_pipe = [
        pipeline(task, model=model_id, tokenizer=model_id)
        for model_id in (sent_model_id, topic_model_id)
    ]

    return sent_pipe, topic_pipe

@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def process_tweets(df,df_users):
    '''Join tweets with their authors and attach sentiment/topic predictions.

    Args:
        df: DataFrame of tweets. Its ``author`` column is cast to int64 for
            the join and its ``tweet`` column is what gets classified.
            The caller's frame is left unmodified.
        df_users: DataFrame of users, joined to ``df`` on ``author``.

    Returns:
        DataFrame with the columns ``creation_time``, ``username``, ``tweet``,
        ``sentiment``, ``topic``, ``sentiment_confidence`` and
        ``topic_confidence`` (confidences as whole-number percentages),
        sorted by ``creation_time``, newest first.
    '''
    # Cast on a new frame instead of assigning into ``df`` so the caller's
    # DataFrame is not changed as a side effect of this call.
    tweets = df.assign(author=df['author'].astype(np.int64))

    df_merged = tweets.merge(df_users, on='author')

    tweet_list = df_merged['tweet'].tolist()

    # Run the sentiment model first, then the topic model, on the same texts.
    sentiment = pd.DataFrame(sentiment_classifier(tweet_list)).rename(
        columns={'score':'sentiment_confidence','label':'sentiment'}
    )
    topic = pd.DataFrame(topic_classifier(tweet_list)).rename(
        columns={'score':'topic_confidence','label':'topic'}
    )

    # The merged frame and both prediction frames carry default integer
    # indexes, so the column-wise concat lines rows up by position.
    df_group = pd.concat([df_merged,sentiment,topic],axis=1)

    # Convert scores to percentages. The trailing round() removes binary
    # float artifacts left by the multiplication (e.g. 0.57 * 100 ->
    # 56.99999999999999).
    confidence_cols = ['sentiment_confidence','topic_confidence']
    df_group[confidence_cols] = df_group[confidence_cols].round(2).mul(100).round()

    df_tweets = df_group[['creation_time','username','tweet','sentiment','topic','sentiment_confidence','topic_confidence']]

    df_tweets = df_tweets.sort_values(by=['creation_time'],ascending=False)

    return df_tweets

@st.experimental_singleton(suppress_st_warning=True)
def create_vectorstore(texts,model,username,topic,creation_time):
    '''Create a FAISS vectorstore from texts and their per-text metadata.

    Args:
        texts: Texts to embed and index.
        model: Embedding model id. Supported values are
            ``"hkunlp/instructor-large"`` and
            ``"sentence-transformers/all-mpnet-base-v2"``.
        username: Values stored under the ``source`` metadata key.
        topic: Values stored under the ``topic`` metadata key.
        creation_time: Values stored under the ``extraction_time`` metadata key.

    Returns:
        The FAISS vectorstore built from ``texts``.

    Raises:
        ValueError: If ``model`` is not one of the supported model ids.
    '''

    if model == "hkunlp/instructor-large":
        emb = HuggingFaceInstructEmbeddings(model_name=model,
                                            query_instruction='Represent the Financial question for retrieving supporting documents: ',
                                            embed_instruction='Represent the Financial document for retrieval: ')

    elif model == "sentence-transformers/all-mpnet-base-v2":
        emb = HuggingFaceEmbeddings(model_name=model)

    else:
        # Fail with a clear message instead of hitting an unbound ``emb`` below.
        raise ValueError(
            f"Unsupported embedding model {model!r}; expected "
            "'hkunlp/instructor-large' or 'sentence-transformers/all-mpnet-base-v2'"
        )

    # zip() stops at the shortest input; the three sequences are assumed to
    # have one entry per text -- TODO confirm against the caller.
    metadatas = [
        {"source": user,"topic":top,"extraction_time":tme}
        for user,top,tme in zip(username,topic,creation_time)
    ]

    docsearch = FAISS.from_texts(texts, emb, metadatas=metadatas)

    return docsearch

    
@st.experimental_singleton(suppress_st_warning=True)
def embed_tweets(query,_prompt,_docsearch):
    '''Answer a query with a "stuff" QA chain over the tweet vectorstore.

    Args:
        query: Question passed to the chain under the ``query`` key.
        _prompt: Prompt handed to the chain via ``chain_type_kwargs``.
        _docsearch: Vectorstore the chain retrieves documents from.

    Returns:
        The chain's result for ``query``; source documents are requested via
        ``return_source_documents=True``.
    '''
    # NOTE(review): Streamlit's cache decorators skip hashing of parameters
    # whose names start with an underscore, so the cached result is presumably
    # keyed on ``query`` alone -- confirm this is intended when the prompt or
    # the vectorstore changes between calls.

    # The chain uses a plain ChatOpenAI client; the previously constructed but
    # never used streaming client has been dropped.
    chain = VectorDBQA.from_chain_type(
        ChatOpenAI(temperature=0),
        chain_type="stuff",
        vectorstore=_docsearch,
        chain_type_kwargs={"prompt": _prompt},
        return_source_documents=True,
        k=5,
    )

    result = chain({"query": query})

    return result

# Credentials read from the environment; the value is None when the variable
# is unset. Presumably the Twitter API bearer token -- verify against usage.
CONFIG = {"bearer_token": os.environ.get("bearer_token")}

# Task name and model ids consumed by load_models().
task = 'text-classification'
sent_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-fintwitter-classification'
topic_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-finance-topic-classification'

# Label tables keyed by the stringified position ("0", "1", ...) of each name.
_SENTIMENT_NAMES = ("Bearish", "Bullish", "Neutral")

_TOPIC_NAMES = (
    "Analyst Update",
    "Fed | Central Banks",
    "Company | Product News",
    "Treasuries | Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy | Oil",
    "Financials",
    "Currencies",
    "General News | Opinion",
    "Gold | Metals | Materials",
    "IPO",
    "Legal | Regulation",
    "M&A | Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
)

sentiments = {str(idx): name for idx, name in enumerate(_SENTIMENT_NAMES)}

topics = {str(idx): name for idx, name in enumerate(_TOPIC_NAMES)}

# Build both pipelines at import time; process_tweets() reads these
# module-level names when classifying tweets.
sentiment_classifier, topic_classifier = load_models()

def convert_user_names(user_name: list):
    '''Build a search query of ``from:<user>`` terms joined by `` OR ``.

    Returns an empty string when ``user_name`` is empty.
    '''
    return " OR ".join(f"from:{handle}" for handle in user_name)