Spaces:

LeoWalker
/

jobDescriptionParser

Sleeping

App Files Files Community

LeoWalker committed on May 10

Commit

4fad2af

•

1 Parent(s): 47cc304

set up to dump to MongoDB instead of PostgreSQL

Browse files

Files changed (3) hide show

notebooks/gj_error.ipynb +188 -0
notebooks/parse_description_test.ipynb +2 -2
utils/google_mongo_jobs.py +100 -0

notebooks/gj_error.ipynb ADDED Viewed

	@@ -0,0 +1,188 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from multiprocessing import process\n",
+    "import pandas as pd\n",
+    "import datetime as dt\n",
+    "import http.client\n",
+    "import json\n",
+    "import urllib.parse\n",
+    "import os\n",
+    "from pymongo import MongoClient\n",
+    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "load_dotenv()\n",
+    "\n",
+    "mongodb_conn = os.getenv('MONGODB_CONNECTION_STRING')\n",
+    "\n",
+    "# Global variables to keep track of searched job titles and cities\n",
+    "searched_jobs = set()\n",
+    "searched_cities = set()\n",
+    "\n",
+    "def google_job_search(job_title, city_state, start=0):\n",
+    "    '''\n",
+    "    job_title(str): \"Data Scientist\", \"Data Analyst\"\n",
+    "    city_state(str): \"Denver, CO\"\n",
+    "    '''\n",
+    "    query = f\"{job_title} {city_state}\"\n",
+    "    params = {\n",
+    "        \"api_key\": os.getenv('WEBSCRAPING_API_KEY'),\n",
+    "        \"engine\": \"google_jobs\",\n",
+    "        \"q\": query,\n",
+    "        \"hl\": \"en\",\n",
+    "        # \"google_domain\": \"google.com\",\n",
+    "        # \"start\": start,\n",
+    "        # \"chips\": f\"date_posted:{post_age}\",\n",
+    "    }\n",
+    "\n",
+    "    query_string = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)\n",
+    "\n",
+    "    conn = http.client.HTTPSConnection(\"serpapi.webscrapingapi.com\")\n",
+    "    try:\n",
+    "        conn.request(\"GET\", f\"/v1?{query_string}\")\n",
+    "        print(f\"GET /v1?{query_string}\")\n",
+    "        res = conn.getresponse()\n",
+    "        try:\n",
+    "            data = res.read()\n",
+    "        finally:\n",
+    "            res.close()\n",
+    "    finally:\n",
+    "        conn.close()\n",
+    "\n",
+    "    try:\n",
+    "        json_data = json.loads(data.decode(\"utf-8\"))\n",
+    "        jobs_results = json_data['google_jobs_results']\n",
+    "        return jobs_results\n",
+    "    except (KeyError, json.JSONDecodeError) as e:\n",
+    "        print(f\"Error occurred for search: {job_title} in {city_state}\")\n",
+    "        print(f\"Error message: {str(e)}\")\n",
+    "        print(f\"Data: {data}\")\n",
+    "        return None\n",
+    "\n",
+    "def mongo_dump(jobs_results, collection_name):\n",
+    "    client = MongoClient(mongodb_conn)\n",
+    "    db = client.job_search_db\n",
+    "    collection = db[collection_name]\n",
+    "    \n",
+    "    for job in jobs_results:\n",
+    "        job['retrieve_date'] = dt.datetime.today().strftime('%Y-%m-%d')\n",
+    "        collection.insert_one(job)\n",
+    "    \n",
+    "    print(f\"Dumped {len(jobs_results)} documents to MongoDB collection {collection_name}\")\n",
+    "\n",
+    "def process_batch(job, city_state, start=0):\n",
+    "    global searched_jobs, searched_cities\n",
+    "\n",
+    "    # Check if the job title and city have already been searched\n",
+    "    if (job, city_state) in searched_jobs:\n",
+    "        print(f'Skipping already searched job: {job} in {city_state}')\n",
+    "        return\n",
+    "\n",
+    "    jobs_results = google_job_search(job, city_state, start)\n",
+    "    if jobs_results is not None:\n",
+    "        print(f'City: {city_state} Job: {job} Start: {start}')\n",
+    "        mongo_dump(jobs_results, 'sf_bay_test_jobs')\n",
+    "\n",
+    "        # Add the job title and city to the searched sets\n",
+    "        searched_jobs.add((job, city_state))\n",
+    "        searched_cities.add(city_state)\n",
+    "\n",
+    "def main(job_list, city_state_list):\n",
+    "    for job in job_list:\n",
+    "        for city_state in city_state_list:\n",
+    "            output = process_batch(job, city_state)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "job_list = [\"Data Scientist\", \"Machine Learning Engineer\", \"AI Gen Engineer\", \"ML Ops\"]\n",
+    "city_state_list = [\"Atlanta, GA\", \"Austin, TX\", \"Boston, MA\", \"Chicago, IL\", \n",
+    "                \"Denver CO\", \"Dallas-Ft. Worth, TX\", \"Los Angeles, CA\",\n",
+    "                \"New York City NY\", \"San Francisco, CA\", \"Seattle, WA\",\n",
+    "                \"Palo Alto CA\", \"Mountain View CA\", \"San Jose, CA\"]\n",
+    "simple_city_state_list: list[str] = [\"Palo Alto CA\", \"San Francisco CA\", \"Mountain View CA\"]\n",
+    "main(job_list, simple_city_state_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping already searched job: Data Scientist in San Francisco, CA\n"
+     ]
+    }
+   ],
+   "source": [
+    "process_batch(\"Data Scientist\", \"San Francisco, CA\", 10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = MongoClient(mongodb_conn)\n",
+    "db = client.job_search_db\n",
+    "collection = db['sf_bay_test_jobs']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "datajobs",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

notebooks/parse_description_test.ipynb CHANGED Viewed

@@ -90,7 +90,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -295,7 +295,7 @@
        "[495 rows x 7 columns]"
       ]
      },
-     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }

   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
        "[495 rows x 7 columns]"
       ]
      },
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }

utils/google_mongo_jobs.py ADDED Viewed

	@@ -0,0 +1,100 @@

+from multiprocessing import process
+import pandas as pd
+import datetime as dt
+import http.client
+import json
+import urllib.parse
+import os
+from pymongo import MongoClient
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dotenv import load_dotenv
+load_dotenv()
+mongodb_conn = os.getenv('MONGODB_CONNECTION_STRING')
+# Global variables to keep track of searched job titles and cities
+searched_jobs = set()
+searched_cities = set()
+def google_job_search(job_title, city_state, start=0):
+    '''
+    job_title(str): "Data Scientist", "Data Analyst"
+    city_state(str): "Denver, CO"
+    '''
+    query = f"{job_title} {city_state}"
+    params = {
+        "api_key": os.getenv('WEBSCRAPING_API_KEY'),
+        "engine": "google_jobs",
+        "q": query,
+        "hl": "en",
+        # "google_domain": "google.com",
+        # "start": start,
+        # "chips": f"date_posted:{post_age}",
+    }
+    query_string = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
+    conn = http.client.HTTPSConnection("serpapi.webscrapingapi.com")
+    try:
+        conn.request("GET", f"/v1?{query_string}")
+        print(f"GET /v1?{query_string}")
+        res = conn.getresponse()
+        try:
+            data = res.read()
+        finally:
+            res.close()
+    finally:
+        conn.close()
+    try:
+        json_data = json.loads(data.decode("utf-8"))
+        jobs_results = json_data['google_jobs_results']
+        return jobs_results
+    except (KeyError, json.JSONDecodeError) as e:
+        print(f"Error occurred for search: {job_title} in {city_state}")
+        print(f"Error message: {str(e)}")
+        print(f"Data: {data}")
+        return None
+def mongo_dump(jobs_results, collection_name):
+    client = MongoClient(mongodb_conn)
+    db = client.job_search_db
+    collection = db[collection_name]
+    for job in jobs_results:
+        job['retrieve_date'] = dt.datetime.today().strftime('%Y-%m-%d')
+        collection.insert_one(job)
+    print(f"Dumped {len(jobs_results)} documents to MongoDB collection {collection_name}")
+def process_batch(job, city_state, start=0):
+    global searched_jobs, searched_cities
+    # Check if the job title and city have already been searched
+    if (job, city_state) in searched_jobs:
+        print(f'Skipping already searched job: {job} in {city_state}')
+        return
+    jobs_results = google_job_search(job, city_state, start)
+    if jobs_results is not None:
+        print(f'City: {city_state} Job: {job} Start: {start}')
+        mongo_dump(jobs_results, 'sf_bay_test_jobs')
+        # Add the job title and city to the searched sets
+        searched_jobs.add((job, city_state))
+        searched_cities.add(city_state)
+def main(job_list, city_state_list):
+    for job in job_list:
+        for city_state in city_state_list:
+            output = process_batch(job, city_state)
+if __name__ == "__main__":
+    job_list = ["Data Scientist", "Machine Learning Engineer", "AI Gen Engineer", "ML Ops"]
+    city_state_list = ["Atlanta, GA", "Austin, TX", "Boston, MA", "Chicago, IL",
+                    "Denver CO", "Dallas-Ft. Worth, TX", "Los Angeles, CA",
+                    "New York City NY", "San Francisco, CA", "Seattle, WA",
+                    "Palo Alto CA", "Mountain View CA", "San Jose, CA"]
+    simple_city_state_list: list[str] = ["Palo Alto CA", "San Francisco CA", "Mountain View CA"]
+    main(job_list, simple_city_state_list)