{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !python3 -m venv env \n",
    "# !source env/bin/activate \n",
    "# !pip3 install langchain\n",
    "# !pip3 install pypdf2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import PyPDF2\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"bk_example.pdf\", \"rb\") as file:\n",
    "    reader = PyPDF2.PdfReader(file)\n",
    "    text_all = ''\n",
    "    # Extract text from each page\n",
    "    for page_num in range(len(reader.pages)):\n",
    "        page = reader.pages[page_num]\n",
    "        text = page.extract_text()\n",
    "        text_all = text_all +text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import getpass\n",
    "import os\n",
    "\n",
    "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
    "os.environ[\"LANGCHAIN_API_KEY\"] = getpass.getpass()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Optional\n",
    "\n",
    "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
    "from langchain_core.pydantic_v1 import BaseModel, Field\n",
    "\n",
    "# Define a custom prompt to provide instructions and any additional context.\n",
    "# 1) You can add examples into the prompt template to improve extraction quality\n",
    "# 2) Introduce additional parameters to take context into account (e.g., include metadata\n",
    "#    about the document from which the text was extracted.)\n",
    "prompt = ChatPromptTemplate.from_messages(\n",
    "    [\n",
    "        (\n",
    "            \"system\",\n",
    "            \"You are an expert extraction algorithm. \"\n",
    "            \"Only extract relevant information from the text. \"\n",
    "            \"If you do not know the value of an attribute asked to extract, \"\n",
    "            \"return null for the attribute's value.\",\n",
    "        ),\n",
    "        # Please see the how-to about improving performance with\n",
    "        # reference examples.\n",
    "        # MessagesPlaceholder('examples'),\n",
    "        (\"human\", \"{text}\"),\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Optional\n",
    "\n",
    "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
    "from langchain_core.pydantic_v1 import BaseModel, Field\n",
    "\n",
    "# Define a custom prompt to provide instructions and any additional context.\n",
    "# 1) You can add examples into the prompt template to improve extraction quality\n",
    "# 2) Introduce additional parameters to take context into account (e.g., include metadata\n",
    "#    about the document from which the text was extracted.)\n",
    "prompt = ChatPromptTemplate.from_messages(\n",
    "    [\n",
    "        (\n",
    "            \"system\",\n",
    "            \"You are an expert extraction algorithm. \"\n",
    "            \"Only extract relevant information from the text. \"\n",
    "            \"If you do not know the value of an attribute asked to extract, \"\n",
    "            \"return null for the attribute's value.\",\n",
    "        ),\n",
    "        # Please see the how-to about improving performance with\n",
    "        # reference examples.\n",
    "        # MessagesPlaceholder('examples'),\n",
    "        (\"human\", \"{text}\"),\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'langchain_mistralai'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_mistralai\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ChatMistralAI\n\u001b[1;32m      3\u001b[0m llm \u001b[38;5;241m=\u001b[39m ChatMistralAI(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmistral-large-latest\u001b[39m\u001b[38;5;124m\"\u001b[39m, temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m      5\u001b[0m runnable \u001b[38;5;241m=\u001b[39m prompt \u001b[38;5;241m|\u001b[39m llm\u001b[38;5;241m.\u001b[39mwith_structured_output(schema\u001b[38;5;241m=\u001b[39mPerson)\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'langchain_mistralai'"
     ]
    }
   ],
   "source": [
    "from langchain_mistralai import ChatMistralAI\n",
    "\n",
    "llm = ChatMistralAI(model=\"mistral-large-latest\", temperature=0)\n",
    "\n",
    "runnable = prompt | llm.with_structured_output(schema=Person)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import List, Optional\n",
    "\n",
    "from langchain_core.pydantic_v1 import BaseModel, Field\n",
    "\n",
    "\n",
    "class Bankruptcy(BaseModel):\n",
    "    \"\"\"Information about a bankruptcy declaration.\"\"\"\n",
    "\n",
    "    # ^ Doc-string for the entity Person.\n",
    "    # This doc-string is sent to the LLM as the description of the schema Person,\n",
    "    # and it can help to improve extraction results.\n",
    "\n",
    "    # Note that:\n",
    "    # 1. Each field is an `optional` -- this allows the model to decline to extract it!\n",
    "    # 2. Each field has a `description` -- this description is used by the LLM.\n",
    "    # Having a good description can help improve extraction results.\n",
    "    ssns: Optional[list] = Field(default=None, description=\"The ssns of the persons\")\n",
    "    chapter: Optional[str] = Field(\n",
    "        default=None, description=\"The chapter of the bankruptcy declaration\"\n",
    "    )\n",
    "    country: Optional[str] = Field(\n",
    "        default=None, description=\"Country were the bankruptcy declaration is made\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Data(BaseModel):\n",
    "    \"\"\"Extracted data about bankruptcy declaration..\"\"\"\n",
    "\n",
    "    # Creates a model so that we can extract multiple entities.\n",
    "    people: List[Bankruptcy]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'prompt' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m runnable \u001b[38;5;241m=\u001b[39m \u001b[43mprompt\u001b[49m \u001b[38;5;241m|\u001b[39m llm\u001b[38;5;241m.\u001b[39mwith_structured_output(schema\u001b[38;5;241m=\u001b[39mData)\n\u001b[1;32m      2\u001b[0m runnable\u001b[38;5;241m.\u001b[39minvoke({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m: text_all})\n",
      "\u001b[0;31mNameError\u001b[0m: name 'prompt' is not defined"
     ]
    }
   ],
   "source": [
    "runnable = prompt | llm.with_structured_output(schema=Data)\n",
    "runnable.invoke({\"text\": text_all})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(text_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find SSNs\n",
    "ssn_pattern = r'\\b(?:Social Security number|ITIN)\\D*(\\d{3}[−\\s]\\d{2}[−\\s]\\d{4})\\b'\n",
    "ssns = re.findall(ssn_pattern, text_all)\n",
    "\n",
    "def find_ssns(text):\n",
    "    ssns = re.findall(ssn_pattern, text_all)\n",
    "    return ssns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find chapter\n",
    "chapter_pattern = r'Notice of Chapter (\\d+) Bankruptcy Case \\d{1,2}/\\d{2}'\n",
    "\n",
    "def find_chapter(text):\n",
    "    chapters = re.findall(chapter_pattern, text_all)\n",
    "    return chapters[0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "country_code = {\"United States\": \"US\", \"Canada\":\"CA\"}\n",
    "\n",
    "country_pattern = r'\\b(?:United States|Canada)\\b'\n",
    "\n",
    "def find_country_code(text):\n",
    "    country_match = re.search(country_pattern, text, re.IGNORECASE)\n",
    "    return country_code.get(country_match[0],None) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find State\n",
    "state_pattern = r'\\nDistrict of (\\w+)'\n",
    "\n",
    "# Dictionaries for state codes\n",
    "us_states = {\n",
    "    \"Alabama\": \"AL\", \"Alaska\": \"AK\", \"Arizona\": \"AZ\", \"Arkansas\": \"AR\", \"California\": \"CA\",\n",
    "    \"Colorado\": \"CO\", \"Connecticut\": \"CT\", \"Delaware\": \"DE\", \"Florida\": \"FL\", \"Georgia\": \"GA\",\n",
    "    \"Hawaii\": \"HI\", \"Idaho\": \"ID\", \"Illinois\": \"IL\", \"Indiana\": \"IN\", \"Iowa\": \"IA\",\n",
    "    \"Kansas\": \"KS\", \"Kentucky\": \"KY\", \"Louisiana\": \"LA\", \"Maine\": \"ME\", \"Maryland\": \"MD\",\n",
    "    \"Massachusetts\": \"MA\", \"Michigan\": \"MI\", \"Minnesota\": \"MN\", \"Mississippi\": \"MS\", \"Missouri\": \"MO\",\n",
    "    \"Montana\": \"MT\", \"Nebraska\": \"NE\", \"Nevada\": \"NV\", \"New Hampshire\": \"NH\", \"New Jersey\": \"NJ\",\n",
    "    \"New Mexico\": \"NM\", \"New York\": \"NY\", \"North Carolina\": \"NC\", \"North Dakota\": \"ND\", \"Ohio\": \"OH\",\n",
    "    \"Oklahoma\": \"OK\", \"Oregon\": \"OR\", \"Pennsylvania\": \"PA\", \"Rhode Island\": \"RI\", \"South Carolina\": \"SC\",\n",
    "    \"South Dakota\": \"SD\", \"Tennessee\": \"TN\", \"Texas\": \"TX\", \"Utah\": \"UT\", \"Vermont\": \"VT\",\n",
    "    \"Virginia\": \"VA\", \"Washington\": \"WA\", \"West Virginia\": \"WV\", \"Wisconsin\": \"WI\", \"Wyoming\": \"WY\"\n",
    "}\n",
    "\n",
    "canadian_provinces = {\n",
    "    \"Alberta\": \"AB\", \"British Columbia\": \"BC\", \"Manitoba\": \"MB\", \"New Brunswick\": \"NB\", \"Newfoundland and Labrador\": \"NL\",\n",
    "    \"Northwest Territories\": \"NT\", \"Nova Scotia\": \"NS\", \"Nunavut\": \"NU\", \"Ontario\": \"ON\", \"Prince Edward Island\": \"PE\",\n",
    "    \"Quebec\": \"QC\", \"Saskatchewan\": \"SK\", \"Yukon\": \"YT\"\n",
    "}\n",
    "\n",
    "def find_state_code(text,country_code):\n",
    "    state_match = re.search(state_pattern, text)\n",
    "    \n",
    "    if state_match:\n",
    "        # Extract the state or province name from the match\n",
    "        state_name = state_match.group(1).strip()\n",
    "    \n",
    "    if country_code == 'US':\n",
    "        state_code = us_states.get(state_name,None)\n",
    "    elif country_code == 'CA':\n",
    "        state_code = canadian_provinces.get(state_name,None)\n",
    "    else:\n",
    "        state_code = None\n",
    "    \n",
    "    return state_code\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find stage\n",
    "stage_patterns = {\n",
    "    'Petition': r'\\b(case filed|petition filed|automatic stay)\\b',\n",
    "    'Discharge': r'\\b(discharge of debts|discharge order|case discharged)\\b',\n",
    "    'Dismissed': r'\\b(case dismissed|dismissal|converted to Chapter 7)\\b'\n",
    "}\n",
    "\n",
    "# Function to categorize bankruptcy stages from text\n",
    "def categorize_stage(text):\n",
    "    categorized_stages = {'Petition': False, 'Discharge': False, 'Dismissed': False}\n",
    "    \n",
    "    for stage, pattern in stage_patterns.items():\n",
    "        if re.search(pattern, text, re.IGNORECASE):\n",
    "            categorized_stages[stage] = True\n",
    "    \n",
    "    # Determine the final stage based on the presence of keywords\n",
    "    if categorized_stages['Petition']:\n",
    "        return 'Petition'\n",
    "    elif categorized_stages['Discharge']:\n",
    "        return 'Discharge'\n",
    "    elif categorized_stages['Dismissed']:\n",
    "        return 'Dismissed'\n",
    "    else:\n",
    "        return 'Unknown'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data found: {'ssns': ['461−81−0513', '529−97−1200'], 'chapter': '13', 'country_code': 'US', 'state': 'UT', 'stage': 'Petition'}\n"
     ]
    }
   ],
   "source": [
    "data = { \"ssns\": find_ssns(text_all),\n",
    "        \"chapter\": find_chapter(text_all),\n",
    "        \"country_code\": find_country_code(text_all),\n",
    "        \"state\": find_state_code(text_all, find_country_code(text_all)),\n",
    "        \"stage\": categorize_stage(text_all)\n",
    "        }\n",
    "\n",
    "print(f\"Data found: {data}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}