import os

import nest_asyncio
from dotenv import load_dotenv
from jinja2 import Template
from pydantic import BaseModel, Field
from pymongo.mongo_client import MongoClient
from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.core.extractors import PydanticProgramExtractor
from llama_index.llms.openai import OpenAI

from core.prompt import ADD_METADATA_TEMPLATE
from core.summarization.summarizer import SummarizeGenerator

# Module-level setup: patch asyncio for nested event loops and load
# environment variables (MONGO_URI is read from the environment below).
nest_asyncio.apply()
load_dotenv()


class NodeMetadata(BaseModel):
    """Metadata for nodes, capturing topic and subtopic from the book."""

    topic: str = Field(
        ...,
        description="The main subject or category that the node is associated with, representing a broad theme within the book.",
    )
    subtopic: str = Field(
        ...,
        description="A more specific aspect or section under the main topic, refining the context of the node within the book.",
    )


def extract_topic(references, content_table):
    """Store the extracted content table and build a topic metadata extractor.

    The function:

    1. Connects to MongoDB using the ``MONGO_URI`` environment variable and
       pings the deployment (ping failures are printed, not raised).
    2. Runs ``SummarizeGenerator(references).extract_content_table`` on
       ``content_table``.
    3. Inserts one document into ``summarizer.topic_collection`` made of
       ``references["title"]`` plus the extracted dictionary.
    4. Renders ``ADD_METADATA_TEMPLATE`` with the extractor output and uses
       it to configure an OpenAI-backed ``PydanticProgramExtractor`` whose
       output class is ``NodeMetadata``.

    Args:
        references: Mapping that must contain a ``"title"`` key; it is also
            passed as-is to ``SummarizeGenerator``.
        content_table: Content table forwarded to
            ``SummarizeGenerator.extract_content_table``.

    Returns:
        The configured ``PydanticProgramExtractor``.

    Raises:
        KeyError: If ``references`` has no ``"title"`` entry.
        Exception: Errors from the summarizer, the MongoDB insert, or the
            extractor construction propagate unchanged.
    """
    # NOTE(review): if MONGO_URI is unset this passes None to MongoClient,
    # which presumably falls back to pymongo's default host - confirm that
    # this is acceptable for every deployment.
    uri = os.getenv("MONGO_URI")

    # The context manager closes the client (and its connection pool) once
    # the document has been written, even when a step in between raises.
    with MongoClient(uri) as client:
        # Best-effort connectivity check: a failed ping is only reported so
        # that the behaviour of the original implementation is preserved.
        try:
            client.admin.command('ping')
            print("Pinged your deployment. You successfully connected to MongoDB!")
        except Exception as e:
            print(e)

        collection = client["summarizer"]["topic_collection"]

        generate_content_table = SummarizeGenerator(references)
        extractor_output, extractor_dics = (
            generate_content_table.extract_content_table(content_table)
        )
        print(extractor_output)

        # Keys coming from the extractor dictionary are merged after
        # "title", so an extractor-provided "title" would take precedence.
        data_to_insert = {
            "title": references["title"],
            **extractor_dics,
        }
        collection.insert_one(data_to_insert)

    # Template.render already returns a string, so no str() wrapper is needed.
    add_metadata_template = Template(ADD_METADATA_TEMPLATE).render(
        extractor_output=extractor_output
    )
    print("add metadata template : ", add_metadata_template)

    llm = OpenAI(temperature=0.1, model="gpt-4o-mini")

    openai_program = OpenAIPydanticProgram.from_defaults(
        output_cls=NodeMetadata,
        prompt_template_str="{input}",
        extract_template_str=add_metadata_template,
        llm=llm,
    )

    topic_extractor = PydanticProgramExtractor(
        program=openai_program,
        input_key="input",
        show_progress=True,
        extract_template_str=add_metadata_template,
        llm=llm,
    )

    return topic_extractor