{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# !python3 -m venv env \n", "# !source env/bin/activate \n", "# !pip3 install langchain\n", "# !pip3 install pypdf2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import PyPDF2\n", "import re" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "with open(\"bk_example.pdf\", \"rb\") as file:\n", " reader = PyPDF2.PdfReader(file)\n", " text_all = ''\n", " # Extract text from each page\n", " for page_num in range(len(reader.pages)):\n", " page = reader.pages[page_num]\n", " text = page.extract_text()\n", " text_all = text_all +text" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import getpass\n", "import os\n", "\n", "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", "os.environ[\"LANGCHAIN_API_KEY\"] = getpass.getpass()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from typing import Optional\n", "\n", "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n", "from langchain_core.pydantic_v1 import BaseModel, Field\n", "\n", "# Define a custom prompt to provide instructions and any additional context.\n", "# 1) You can add examples into the prompt template to improve extraction quality\n", "# 2) Introduce additional parameters to take context into account (e.g., include metadata\n", "# about the document from which the text was extracted.)\n", "prompt = ChatPromptTemplate.from_messages(\n", " [\n", " (\n", " \"system\",\n", " \"You are an expert extraction algorithm. \"\n", " \"Only extract relevant information from the text. \"\n", " \"If you do not know the value of an attribute asked to extract, \"\n", " \"return null for the attribute's value.\",\n", " ),\n", " # Please see the how-to about improving performance with\n", " # reference examples.\n", " # MessagesPlaceholder('examples'),\n", " (\"human\", \"{text}\"),\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from typing import Optional\n", "\n", "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n", "from langchain_core.pydantic_v1 import BaseModel, Field\n", "\n", "# Define a custom prompt to provide instructions and any additional context.\n", "# 1) You can add examples into the prompt template to improve extraction quality\n", "# 2) Introduce additional parameters to take context into account (e.g., include metadata\n", "# about the document from which the text was extracted.)\n", "prompt = ChatPromptTemplate.from_messages(\n", " [\n", " (\n", " \"system\",\n", " \"You are an expert extraction algorithm. \"\n", " \"Only extract relevant information from the text. \"\n", " \"If you do not know the value of an attribute asked to extract, \"\n", " \"return null for the attribute's value.\",\n", " ),\n", " # Please see the how-to about improving performance with\n", " # reference examples.\n", " # MessagesPlaceholder('examples'),\n", " (\"human\", \"{text}\"),\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'langchain_mistralai'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_mistralai\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ChatMistralAI\n\u001b[1;32m 3\u001b[0m llm \u001b[38;5;241m=\u001b[39m ChatMistralAI(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmistral-large-latest\u001b[39m\u001b[38;5;124m\"\u001b[39m, temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m 5\u001b[0m runnable \u001b[38;5;241m=\u001b[39m prompt \u001b[38;5;241m|\u001b[39m llm\u001b[38;5;241m.\u001b[39mwith_structured_output(schema\u001b[38;5;241m=\u001b[39mPerson)\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'langchain_mistralai'" ] } ], "source": [ "from langchain_mistralai import ChatMistralAI\n", "\n", "llm = ChatMistralAI(model=\"mistral-large-latest\", temperature=0)\n", "\n", "runnable = prompt | llm.with_structured_output(schema=Person)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from typing import List, Optional\n", "\n", "from langchain_core.pydantic_v1 import BaseModel, Field\n", "\n", "\n", "class Bankruptcy(BaseModel):\n", " \"\"\"Information about a bankruptcy declaration.\"\"\"\n", "\n", " # ^ Doc-string for the entity Person.\n", " # This doc-string is sent to the LLM as the description of the schema Person,\n", " # and it can help to improve extraction results.\n", "\n", " # Note that:\n", " # 1. Each field is an `optional` -- this allows the model to decline to extract it!\n", " # 2. Each field has a `description` -- this description is used by the LLM.\n", " # Having a good description can help improve extraction results.\n", " ssns: Optional[list] = Field(default=None, description=\"The ssns of the persons\")\n", " chapter: Optional[str] = Field(\n", " default=None, description=\"The chapter of the bankruptcy declaration\"\n", " )\n", " country: Optional[str] = Field(\n", " default=None, description=\"Country were the bankruptcy declaration is made\"\n", " )" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "class Data(BaseModel):\n", " \"\"\"Extracted data about bankruptcy declaration..\"\"\"\n", "\n", " # Creates a model so that we can extract multiple entities.\n", " people: List[Bankruptcy]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'prompt' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m runnable \u001b[38;5;241m=\u001b[39m \u001b[43mprompt\u001b[49m \u001b[38;5;241m|\u001b[39m llm\u001b[38;5;241m.\u001b[39mwith_structured_output(schema\u001b[38;5;241m=\u001b[39mData)\n\u001b[1;32m 2\u001b[0m runnable\u001b[38;5;241m.\u001b[39minvoke({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m: text_all})\n", "\u001b[0;31mNameError\u001b[0m: name 'prompt' is not defined" ] } ], "source": [ "runnable = prompt | llm.with_structured_output(schema=Data)\n", "runnable.invoke({\"text\": text_all})" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "#print(text_all)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#Find SSNs\n", "ssn_pattern = r'\\b(?:Social Security number|ITIN)\\D*(\\d{3}[−\\s]\\d{2}[−\\s]\\d{4})\\b'\n", "ssns = re.findall(ssn_pattern, text_all)\n", "\n", "def find_ssns(text):\n", " ssns = re.findall(ssn_pattern, text_all)\n", " return ssns" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "#Find chapter\n", "chapter_pattern = r'Notice of Chapter (\\d+) Bankruptcy Case \\d{1,2}/\\d{2}'\n", "\n", "def find_chapter(text):\n", " chapters = re.findall(chapter_pattern, text_all)\n", " return chapters[0]\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "country_code = {\"United States\": \"US\", \"Canada\":\"CA\"}\n", "\n", "country_pattern = r'\\b(?:United States|Canada)\\b'\n", "\n", "def find_country_code(text):\n", " country_match = re.search(country_pattern, text, re.IGNORECASE)\n", " return country_code.get(country_match[0],None) " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "#Find State\n", "state_pattern = r'\\nDistrict of (\\w+)'\n", "\n", "# Dictionaries for state codes\n", "us_states = {\n", " \"Alabama\": \"AL\", \"Alaska\": \"AK\", \"Arizona\": \"AZ\", \"Arkansas\": \"AR\", \"California\": \"CA\",\n", " \"Colorado\": \"CO\", \"Connecticut\": \"CT\", \"Delaware\": \"DE\", \"Florida\": \"FL\", \"Georgia\": \"GA\",\n", " \"Hawaii\": \"HI\", \"Idaho\": \"ID\", \"Illinois\": \"IL\", \"Indiana\": \"IN\", \"Iowa\": \"IA\",\n", " \"Kansas\": \"KS\", \"Kentucky\": \"KY\", \"Louisiana\": \"LA\", \"Maine\": \"ME\", \"Maryland\": \"MD\",\n", " \"Massachusetts\": \"MA\", \"Michigan\": \"MI\", \"Minnesota\": \"MN\", \"Mississippi\": \"MS\", \"Missouri\": \"MO\",\n", " \"Montana\": \"MT\", \"Nebraska\": \"NE\", \"Nevada\": \"NV\", \"New Hampshire\": \"NH\", \"New Jersey\": \"NJ\",\n", " \"New Mexico\": \"NM\", \"New York\": \"NY\", \"North Carolina\": \"NC\", \"North Dakota\": \"ND\", \"Ohio\": \"OH\",\n", " \"Oklahoma\": \"OK\", \"Oregon\": \"OR\", \"Pennsylvania\": \"PA\", \"Rhode Island\": \"RI\", \"South Carolina\": \"SC\",\n", " \"South Dakota\": \"SD\", \"Tennessee\": \"TN\", \"Texas\": \"TX\", \"Utah\": \"UT\", \"Vermont\": \"VT\",\n", " \"Virginia\": \"VA\", \"Washington\": \"WA\", \"West Virginia\": \"WV\", \"Wisconsin\": \"WI\", \"Wyoming\": \"WY\"\n", "}\n", "\n", "canadian_provinces = {\n", " \"Alberta\": \"AB\", \"British Columbia\": \"BC\", \"Manitoba\": \"MB\", \"New Brunswick\": \"NB\", \"Newfoundland and Labrador\": \"NL\",\n", " \"Northwest Territories\": \"NT\", \"Nova Scotia\": \"NS\", \"Nunavut\": \"NU\", \"Ontario\": \"ON\", \"Prince Edward Island\": \"PE\",\n", " \"Quebec\": \"QC\", \"Saskatchewan\": \"SK\", \"Yukon\": \"YT\"\n", "}\n", "\n", "def find_state_code(text,country_code):\n", " state_match = re.search(state_pattern, text)\n", " \n", " if state_match:\n", " # Extract the state or province name from the match\n", " state_name = state_match.group(1).strip()\n", " \n", " if country_code == 'US':\n", " state_code = us_states.get(state_name,None)\n", " elif country_code == 'CA':\n", " state_code = canadian_provinces.get(state_name,None)\n", " else:\n", " state_code = None\n", " \n", " return state_code\n", "\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "#Find stage\n", "stage_patterns = {\n", " 'Petition': r'\\b(case filed|petition filed|automatic stay)\\b',\n", " 'Discharge': r'\\b(discharge of debts|discharge order|case discharged)\\b',\n", " 'Dismissed': r'\\b(case dismissed|dismissal|converted to Chapter 7)\\b'\n", "}\n", "\n", "# Function to categorize bankruptcy stages from text\n", "def categorize_stage(text):\n", " categorized_stages = {'Petition': False, 'Discharge': False, 'Dismissed': False}\n", " \n", " for stage, pattern in stage_patterns.items():\n", " if re.search(pattern, text, re.IGNORECASE):\n", " categorized_stages[stage] = True\n", " \n", " # Determine the final stage based on the presence of keywords\n", " if categorized_stages['Petition']:\n", " return 'Petition'\n", " elif categorized_stages['Discharge']:\n", " return 'Discharge'\n", " elif categorized_stages['Dismissed']:\n", " return 'Dismissed'\n", " else:\n", " return 'Unknown'" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Data found: {'ssns': ['461−81−0513', '529−97−1200'], 'chapter': '13', 'country_code': 'US', 'state': 'UT', 'stage': 'Petition'}\n" ] } ], "source": [ "data = { \"ssns\": find_ssns(text_all),\n", " \"chapter\": find_chapter(text_all),\n", " \"country_code\": find_country_code(text_all),\n", " \"state\": find_state_code(text_all, find_country_code(text_all)),\n", " \"stage\": categorize_stage(text_all)\n", " }\n", "\n", "print(f\"Data found: {data}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 2 }