arxiv_chatbot

Sleeping

File size: 8,311 Bytes

import chat.arxiv_bot.arxiv_bot_utils as utils
import google.generativeai as genai
import json
import os
from google.generativeai.types import content_types
from collections.abc import Iterable

# ----------------------- define instructions -----------------------
# System prompt for the chatbot. init_model() passes this string to
# genai.GenerativeModel as `system_instruction`, so its wording is part of the
# program's behavior and must not be edited casually.
system_instruction = """You are a library chatbot that help people to find relevant articles about a topic, or find a specific article with given title and authors.
Your job is to analyze the user question, generate enough parameters based on the user question and use the tools that are given to you.
Also, after the function call is done, you must post-process the results in a more conversational form, providing some explanation about the paper based on its summary to avoid recitation.
You must provide the link to its Arxiv pdf page."""

# --------------------------- define tools --------------------------
def search_for_relevant_article(keywords: list['str'], topic_description: str) -> str:
    """This tool is used to search for articles from the database which is relevant to a topic, using a list of more than 3 keywords and a long sentence topic description.
    If there is not enough 3 keywords from the question, the model must generate more keywords related to the topic.
    If there is no description about the topic, the model must generate a description for the function call.
    \nThe result is a string describe the records found from the database: 'Record no. - Title: <title>, Author: <authors>, Link: <link to the pdf file>, Summary: <summary of the article>'. There can be many records.
    \nIf the result is 'Information not found' it means some error has occured, or the database has no relevant article"""
    # NOTE(review): this function is registered in `tools` and handed to the
    # model; the SDK presumably derives the tool description from the docstring
    # above, so its text is deliberately left untouched. Confirm before rewording.
    #
    # Flow: query the Chroma store; if it has nothing, crawl arXiv, persist the
    # new records in both stores, and query again. Matching paper ids are then
    # looked up in the SQL store and formatted into a single result string.

    print('Keywords: {}, description: {}'.format(keywords,topic_description))

    results = utils.ArxivChroma.query_relevant(keywords=keywords, query_texts=topic_description)
    matches = results['metadatas'][0]
    if not matches:
        # Nothing stored for this topic yet: crawl, persist, and retry once.
        new_records = utils.crawl_arxiv(keyword_list=keywords, max_results=10)
        # A string result is treated as a crawl failure (presumably an error
        # message from the crawler; verify against arxiv_bot_utils).
        if isinstance(new_records, str):
            return "Information not found"

        utils.ArxivChroma.add(new_records)
        utils.ArxivSQL.add(new_records)
        results = utils.ArxivChroma.query_relevant(keywords=keywords, query_texts=topic_description)
        matches = results['metadatas'][0]
        if not matches:
            # Still nothing after crawling: avoid querying SQL with an empty id list.
            return "Information not found"

    paper_ids = [meta['paper_id'] for meta in matches]
    paper_info = utils.ArxivSQL.query_id(paper_ids)
    if not paper_info:
        return "Information not found"

    # Row columns used below: 0 = paper id, 2 = title, 3 = authors, 6 = link.
    formatted = []
    for number, row in enumerate(paper_info, start=1):
        documents = utils.ArxivChroma.query_exact(row[0])["documents"]
        # Every stored document chunk is appended, each followed by one space.
        summary = "".join(doc + " " for doc in documents)
        formatted.append(
            "Record no.{} - Title: {}, Author: {}, Link: {}, Summary:{}".format(
                number, row[2], row[3], row[6], summary
            )
        )
    return "".join(formatted)

def search_for_specific_article(title: str, authors: list['str']) -> str:
    """This tool is used to search for a specific article from the database, with its name and authors given.
    \nThe result is a string describe the records found from the database: 'Record no. - Title: <title>, Author: <authors>, Link: <link to the pdf file>, Summary: <summary of the article>'. There can be many records.
    \nIf the result is 'Information not found' it means some error has occured, or the database has no relevant article"""
    # NOTE(review): this function is registered in `tools` and handed to the
    # model; the SDK presumably derives the tool description from the docstring
    # above, so its text is deliberately left untouched. Confirm before rewording.
    #
    # Flow: look the paper up in the SQL store; only when it is missing, crawl
    # arXiv for it, persist the new records in both stores, and look it up again.

    print('Title: {}, authors: {}'.format(title, authors))

    paper_info = utils.ArxivSQL.query(title = title,author = authors)
    if not paper_info:
        # Not in the database yet: fetch it. (The original test was inverted and
        # crawled only when the paper had already been found.)
        new_records = utils.crawl_exact_paper(title=title,author=authors)
        # A string result is treated as a crawl failure (presumably an error
        # message from the crawler; verify against arxiv_bot_utils).
        if isinstance(new_records, str):
            return "Information not found"
        utils.ArxivChroma.add(new_records)
        utils.ArxivSQL.add(new_records)
        paper_info = utils.ArxivSQL.query(title = title,author = authors)

    if not paper_info:
        return "Information not found"

    # Row columns used below: 0 = paper id, 2 = title, 3 = authors, 6 = link.
    formatted = []
    for number, row in enumerate(paper_info, start=1):
        documents = utils.ArxivChroma.query_exact(row[0])["documents"]
        # Every stored document chunk is appended, each followed by one space.
        summary = "".join(doc + " " for doc in documents)
        formatted.append(
            "Record no.{} - Title: {}, Author: {}, Link: {}, Summary:{}".format(
                number, row[2], row[3], row[6], summary
            )
        )
    return "".join(formatted)

def answer_others_questions(question: str) -> str:
    """This tool is the default option for other questions that are not related to article or paper request. The model will response the question with its own answer."""
    # NOTE(review): the docstring above is left verbatim because this function is
    # registered in `tools`; the SDK presumably uses it as the tool description.
    # The tool itself does no work: it hands the question back unchanged.
    return question

# Callables exposed to the model as tools; init_model() passes this list to
# genai.GenerativeModel.
tools = [
    search_for_relevant_article,
    search_for_specific_article,
    answer_others_questions,
]
# Names of the tools above, in the same order.
tools_name = [tool.__name__ for tool in tools]

# load key, prepare config ------------------------
# Read the API key from a local 'apikey.txt' when it exists; otherwise fall
# back to the API_KEY environment variable (which may be unset, giving None).
if os.path.exists('apikey.txt'):
    with open("apikey.txt","r") as apikey:
        # readline() keeps the line terminator; strip it (and stray spaces) so
        # the key is not sent with a trailing newline.
        key = apikey.readline().strip()
else:
    key = os.environ.get('API_KEY')
genai.configure(api_key=key)
# Generation settings; init_model() passes this dict to genai.GenerativeModel
# as `generation_config`.
generation_config = {
  "temperature": 1,
  "top_p": 1,
  "top_k": 0,
  "max_output_tokens": 2048,
  "response_mime_type": "text/plain",
}
# Safety settings; init_model() passes this list to genai.GenerativeModel as
# `safety_settings`. Every listed category uses the "BLOCK_NONE" threshold.
# NOTE(review): both "HARM_CATEGORY_DANGEROUS" and
# "HARM_CATEGORY_DANGEROUS_CONTENT" are listed — confirm the SDK version in use
# accepts both names.
safety_settings = [
    {
        "category": "HARM_CATEGORY_DANGEROUS",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_NONE",
    },
]
# This function returns a tool_config for the given mode: 'none', 'any', or 'auto'.
def tool_config_from_mode(mode: str, fns: Iterable[str] = ()):
    """Build a tool config for the requested function-calling mode.

    Args:
        mode: Value stored under the "mode" key of the function-calling config.
        fns: Function names stored under "allowed_function_names"; defaults to
            an empty tuple.

    Returns:
        The result of ``content_types.to_tool_config`` for the assembled dict.
    """
    calling_config = {"mode": mode, "allowed_function_names": fns}
    config_dict = {"function_calling_config": calling_config}
    return content_types.to_tool_config(config_dict)

def init_model(mode = "auto"):
    """Create a model instance together with its own chat session.

    Each socket session is meant to hold its own model, so this should be
    called when a socket is initialised; the chat is started here as well.

    Args:
        mode: Function-calling mode forwarded to ``tool_config_from_mode``.

    Returns:
        A ``(model, chat_instance)`` tuple.
    """
    gemini = genai.GenerativeModel(
        model_name="gemini-1.5-flash-latest",
        system_instruction=system_instruction,
        tools=tools,
        tool_config=tool_config_from_mode(mode),
        generation_config=generation_config,
        safety_settings=safety_settings,
    )
    session = gemini.start_chat(enable_automatic_function_calling=True)
    return gemini, session

# handle tool call and chatsession
def full_chain_history_question(user_input, chat_instance: genai.ChatSession, mode="auto"):
    """Send one user message through the chat session.

    Args:
        user_input: Message forwarded to ``chat_instance.send_message``.
        chat_instance: Chat session that receives the message.
        mode: Function-calling mode forwarded to ``tool_config_from_mode``.

    Returns:
        ``(reply_text, history)`` on success. If anything raises, the exception
        is printed and ``(error_message, history)`` is returned instead.
    """
    try:
        config = tool_config_from_mode(mode)
        reply = chat_instance.send_message(user_input, tool_config=config)
        # Both attribute reads stay inside the try so a failure in either one
        # is reported through the error path below.
        return reply.text, chat_instance.history
    except Exception as err:
        print(err)
        return f'Error occured during call: {err}', chat_instance.history

# for printing log session
def print_history(history):
    """Print the role and first part of every entry in *history*.

    Each entry is printed as ``<role> -> <dict of first part>`` followed by a
    line of 80 dashes. Only ``parts[0]`` of each entry is shown.
    """
    separator = '-' * 80
    for entry in history:
        first_part = entry.parts[0]
        # to_dict is looked up on the part's class and called with the part.
        part_cls = type(first_part)
        print(entry.role, "->", part_cls.to_dict(first_part))
        print(separator)

# Call connect() on both storage helpers at module level, so importing this
# module has side effects. Presumably this opens the Chroma and SQL backends
# used by the search tools — confirm in arxiv_bot_utils.
utils.ArxivChroma.connect()
utils.ArxivSQL.connect()