import os
import time
# Only the embedding helper is needed from src.helper; text_split is defined locally below
from src.helper import download_hugging_face_embeddings
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader

# Extract the text of every page from a single PDF file
def load_pdf(file_path):
    all_text = ""
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()  # Can return None for image-only pages
            if page_text:
                all_text += page_text + "\n"
    return all_text if all_text else None

# Split raw text into ~1000-character chunks for embedding
def text_split(text):
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_text(text)

# Load environment variables if not already set
load_dotenv()

# Load and process data
pdf_file_path = "data/Okelloetal.2008TourismanalysisManka.pdf"  # Update this path to your single PDF file
extracted_data = load_pdf(pdf_file_path)
if extracted_data is None:
    raise ValueError("No text could be extracted from the PDF. Please check the load_pdf function.")

print(f"Extracted {len(extracted_data)} characters from the PDF.")

# Split the extracted text into chunks
text_chunks = text_split(extracted_data)
if not text_chunks:
    raise ValueError("Text splitting produced no chunks. Please check the text_split function.")

print(f"Split the text into {len(text_chunks)} chunks.")

embeddings = download_hugging_face_embeddings()
if embeddings is None:
    raise ValueError("The embedding model failed to load. Please check the download_hugging_face_embeddings function.")

print(f"Loaded embedding model: {embeddings}")

# Ensure Pinecone API key is available
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise ValueError("PINECONE_API_KEY environment variable not set.")

# Initialize Pinecone client
pc = Pinecone(api_key=api_key)

# Specify cloud and region for the serverless index
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

# Define the index name
index_name = "healthbot"

# Create the index if it does not exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Must match the embedding model's output size (e.g. 384 for all-MiniLM-L6-v2)
        metric="cosine",
        spec=spec
    )
    # Wait for the index to be ready
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# Connect to the created index
index = pc.Index(index_name)
time.sleep(1)

# Add data to the index with reduced metadata: only a reference ID is stored
# in Pinecone, while the chunk text itself lives in an external store. Here an
# in-memory dictionary simulates that store (in production, use a database or
# object store so the chunks survive past this script's run).
text_chunk_store = {}

# Simulate storing a text chunk externally; returns its reference ID
def store_text_chunk(text_chunk):
    chunk_id = f"chunk_{len(text_chunk_store)}"
    text_chunk_store[chunk_id] = text_chunk
    return chunk_id

# Add text chunks to Pinecone with reference IDs (for large documents, batch
# several vectors per upsert call to cut down on network round trips)
for i, text_chunk in enumerate(text_chunks):
    chunk_id = store_text_chunk(text_chunk)
    embedding = embeddings.embed_query(text_chunk)  # Embed the text chunk
    index.upsert(
        vectors=[
            {
                "id": f"vec_{i}", 
                "values": embedding, 
                "metadata": {"chunk_id": chunk_id}  # Only store the reference ID as metadata
            }
        ],
        namespace="ns1"
    )

print("Indexing completed successfully.")