File size: 2,443 Bytes
9002555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import nest_asyncio
import os
from dotenv import load_dotenv
from jinja2 import Template
from pydantic import BaseModel, Field
from pymongo.mongo_client import MongoClient

from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.core.extractors import PydanticProgramExtractor
from llama_index.llms.openai import OpenAI

from core.prompt import ADD_METADATA_TEMPLATE
from core.summarization.summarizer import SummarizeGenerator

# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably required because the llama_index components used
# below are driven from an environment with an active loop — confirm.
nest_asyncio.apply()

# Load variables from a .env file into the process environment at import time;
# extract_topic later reads MONGO_URI through os.getenv.
load_dotenv()


# Structured-output schema: extract_topic passes this class as ``output_cls``
# to OpenAIPydanticProgram.
# NOTE(review): the class docstring and the Field descriptions are presumably
# surfaced to the LLM through the model's schema, so wording changes here act
# as prompt changes — confirm against llama_index before editing them.
class NodeMetadata(BaseModel):
    """Metadata for nodes, capturing topic and subtopic from the book."""

    # Required field (``...`` means no default is provided).
    topic: str = Field(
        ...,
        description="The main subject or category that the node is associated with, representing a broad theme within the book.",
    )
    # Required field (``...`` means no default is provided).
    subtopic: str = Field(
        ...,
        description="A more specific aspect or section under the main topic, refining the context of the node within the book.",
    )


def extract_topic(references, content_table):
    """Store the extracted content table and build a topic metadata extractor.

    The content table is processed by ``SummarizeGenerator``; the resulting
    dictionary is inserted, together with the title, into the
    ``topic_collection`` collection of the ``summarizer`` MongoDB database.
    The textual extractor output is then rendered into
    ``ADD_METADATA_TEMPLATE`` and used as the extraction prompt of the
    returned extractor.

    Args:
        references: Object describing the source document. It is passed to
            ``SummarizeGenerator`` and indexed with ``"title"``.
        content_table: Content table forwarded unchanged to
            ``SummarizeGenerator.extract_content_table``.

    Returns:
        A ``PydanticProgramExtractor`` whose program outputs ``NodeMetadata``.

    Raises:
        KeyError: If ``references`` is a mapping without a ``"title"`` key.
        Exception: Errors from summarisation, from the MongoDB insert or from
            building the llama_index objects propagate unchanged. Only a
            failed connectivity ping is reported and tolerated.
    """
    # NOTE(review): when MONGO_URI is unset this is None and MongoClient
    # presumably falls back to its default host — confirm that is intended.
    uri = os.getenv("MONGO_URI")

    # Use the client as a context manager so its connection pool and
    # background monitor threads are released on every exit path; the
    # original never closed the client.
    with MongoClient(uri) as client:
        # Best-effort connectivity check: a failure is only reported here;
        # a real outage will surface again from insert_one below.
        try:
            client.admin.command("ping")
            print("Pinged your deployment. You successfully connected to MongoDB!")
        except Exception as e:
            print(e)

        collection = client["summarizer"]["topic_collection"]

        generate_content_table = SummarizeGenerator(references)
        extractor_output, extractor_dics = (
            generate_content_table.extract_content_table(content_table)
        )
        print(extractor_output)

        # Keys in extractor_dics are unpacked last, so a "title" key there
        # would override the title taken from references.
        data_to_insert = {
            "title": references["title"],
            **extractor_dics,
        }
        collection.insert_one(data_to_insert)

    # Template.render already returns a string, so no extra str() is needed.
    add_metadata_template = Template(ADD_METADATA_TEMPLATE).render(
        extractor_output=extractor_output
    )

    print("add metadata template : ", add_metadata_template)

    llm = OpenAI(temperature=0.1, model="gpt-4o-mini")

    openai_program = OpenAIPydanticProgram.from_defaults(
        output_cls=NodeMetadata,
        prompt_template_str="{input}",
        extract_template_str=add_metadata_template,
        llm=llm,
    )

    topic_extractor = PydanticProgramExtractor(
        program=openai_program,
        input_key="input",
        show_progress=True,
        extract_template_str=add_metadata_template,
        llm=llm,
    )

    return topic_extractor