from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import BedrockEmbeddings
from langchain_aws import ChatBedrock
from langchain_community.vectorstores import Chroma

# The AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_DEFAULT_REGION
# environment variables must be set in the operating system's shell
# before running this script.


def initLLM():
    """Return a ChatBedrock chat model bound to the Claude 3 Sonnet model id."""
    return ChatBedrock(model_id="anthropic.claude-3-sonnet-20240229-v1:0")


def initEmbedder():
    """Return a BedrockEmbeddings instance bound to the Titan text embedding model id."""
    return BedrockEmbeddings(model_id='amazon.titan-embed-text-v1')


def initChromaDB(document_chunks, embbeder):
    """Build a Chroma vector store from already-split documents.

    Args:
        document_chunks: Documents to index, as produced by a text splitter.
        embbeder: Embedding object handed to Chroma as ``embedding``.
            (The misspelled parameter name is kept so keyword callers
            keep working.)

    Returns:
        The store returned by ``Chroma.from_documents``, created with
        ``persist_directory='./data'``.
    """
    return Chroma.from_documents(
        document_chunks,
        embedding=embbeder,
        persist_directory='./data',
    )


def embedding(thePathFile, embedder):
    """Load a PDF, split it into chunks and index the chunks in Chroma.

    Progress information (page count, chunk count, the embedder itself and
    the start/end of the load) is printed to stdout.

    Args:
        thePathFile: Path of the PDF file to load.
        embedder: Embedding object used to vectorize the chunks, or ``None``
            to only load and split the document without indexing it.

    Returns:
        The vector store built by ``initChromaDB``, or ``None`` when
        ``embedder`` is ``None``.
    """
    # Load the PDF file.
    loader = PyPDFLoader(thePathFile)
    pages = loader.load()
    print(len(pages))

    # Split on newlines with chunk_size=500 characters and a 100-character overlap.
    document_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=500,
        chunk_overlap=100,
    )
    document_chunks = document_splitter.split_documents(pages)
    print(len(document_chunks))
    print(embedder)

    # Without an embedder there is nothing to index. Return None explicitly:
    # the previous code fell through to `return vectorDB` with the name never
    # assigned, which raised UnboundLocalError.
    if embedder is None:
        return None

    print("Cargando a la base vectorial...")
    vectorDB = initChromaDB(document_chunks, embedder)
    print("Fin carga")
    return vectorDB


# Run the application.
if __name__ == "__main__":
    bedrock_llm = initLLM()
    bedrock_embedder = initEmbedder()
    chromaDB = embedding("el principito.pdf", bedrock_embedder)
    print(chromaDB)