{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Job description from google jobs" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# import pandas as pd\n", "# # from serpapi import GoogleSearch\n", "# import sqlite3\n", "# import datetime as dt\n", "# import http.client\n", "# import json\n", "# import config\n", "# import urllib.parse\n", "# import os\n", "# from sqlalchemy import create_engine\n", "# import psycopg2\n", "\n", "# def google_job_search(job_title, city_state):\n", "# '''\n", "# job_title(str): \"Data Scientist\", \"Data Analyst\"\n", "# city_state(str): \"Denver, CO\"\n", "# post_age,(str)(optional): \"3day\", \"week\", \"month\"\n", "# '''\n", "# query = f\"{job_title} {city_state}\"\n", "# params = {\n", "# \"engine\": \"google_jobs\",\n", "# \"q\": query,\n", "# \"hl\": \"en\",\n", "# \"api_key\": os.getenv('SerpAPIkey'),\n", "# # \"chips\": f\"date_posted:{post_age}\",\n", "# }\n", "\n", "# query_string = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)\n", "\n", "# conn = http.client.HTTPSConnection(\"serpapi.webscrapingapi.com/v1\")\n", "# try:\n", "# conn.request(\"GET\", f\"/v1?{query_string}\")\n", "# res = conn.getresponse()\n", "# try:\n", "# data = res.read()\n", "# finally:\n", "# res.close()\n", "# finally:\n", "# conn.close()\n", "\n", "# try:\n", "# json_data = json.loads(data.decode(\"utf-8\"))\n", "# jobs_results = json_data['google_jobs_results']\n", "# job_columns = ['title', 'company_name', 'location', 'description']\n", "# df = pd.DataFrame(jobs_results, columns=job_columns)\n", "# return df\n", "# except (KeyError, json.JSONDecodeError) as e:\n", "# print(f\"Error occurred for search: {job_title} in {city_state}\")\n", "# print(f\"Error message: {str(e)}\")\n", "# return None\n", "\n", "# def sql_dump(df, table):\n", "# engine = create_engine(f\"postgresql://{os.getenv('MasterName')}:{os.getenv('MasterPass')}@{os.getenv('RDS_EndPoint')}:5432/postgres\")\n", "# with engine.connect() as conn:\n", "# df.to_sql(table, conn, if_exists='append', chunksize=1000, method='multi', index=False)\n", "\n", "# def main(job_list, city_state_list):\n", "# for job in job_list:\n", "# for city_state in city_state_list:\n", "# df_10jobs = google_job_search(job, city_state)\n", "# if df_10jobs is not None:\n", "# print(f'City: {city_state} Job: {job}')\n", "# print(df_10jobs.shape)\n", "# date = dt.datetime.today().strftime('%Y-%m-%d')\n", "# df_10jobs['retrieve_date'] = date\n", "# sql_dump(df_10jobs, 'datajobs24')\n", "\n", "# return None" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import sqlite3\n", "import datetime as dt\n", "import http.client\n", "import json\n", "import urllib.parse\n", "import os\n", "from sqlalchemy import create_engine\n", "from concurrent.futures import ThreadPoolExecutor, as_completed\n", "from dotenv import load_dotenv\n", "\n", "load_dotenv()\n", "\n", "def google_job_search(job_title, city_state, start=0):\n", " '''\n", " job_title(str): \"Data Scientist\", \"Data Analyst\"\n", " city_state(str): \"Denver, CO\"\n", " post_age,(str)(optional): \"3day\", \"week\", \"month\"\n", " '''\n", " query = f\"{job_title} {city_state}\"\n", " params = {\n", " \"api_key\": os.getenv('SerpAPIkey'),\n", " \"engine\": \"google_jobs\",\n", " \"q\": query,\n", " \"hl\": \"en\",\n", " \"start\": start,\n", " # \"chips\": f\"date_posted:{post_age}\",\n", " }\n", "\n", " query_string = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)\n", "\n", " conn = http.client.HTTPSConnection(\"serpapi.webscrapingapi.com\")\n", " try:\n", " conn.request(\"GET\", f\"/v1?{query_string}\")\n", " res = conn.getresponse()\n", " try:\n", " data = res.read()\n", " finally:\n", " res.close()\n", " finally:\n", " conn.close()\n", "\n", " try:\n", " json_data = json.loads(data.decode(\"utf-8\"))\n", " jobs_results = json_data['google_jobs_results']\n", " job_columns = ['title', 'company_name', 'location', 'description', 'extensions', 'job_id']\n", " df = pd.DataFrame(jobs_results, columns=job_columns)\n", " return df\n", " except (KeyError, json.JSONDecodeError) as e:\n", " print(f\"Error occurred for search: {job_title} in {city_state}\")\n", " print(f\"Error message: {str(e)}\")\n", " return None\n", "\n", "def sql_dump(df, table):\n", " engine = create_engine(f\"postgresql://{os.getenv('PSQL_MASTER_NAME')}:{os.getenv('PSQL_KEY')}@{os.getenv('RDS_ENDPOINT')}:5432/postgres\")\n", " with engine.connect() as conn:\n", " df.to_sql(table, conn, if_exists='append', chunksize=20, method='multi', index=False)\n", " print(f\"Dumped {df.shape} to SQL table {table}\")\n", "\n", "def process_batch(job, city_state, start):\n", " df_10jobs = google_job_search(job, city_state, start)\n", " if df_10jobs is not None:\n", " print(f'City: {city_state} Job: {job} Start: {start}')\n", " print(df_10jobs.shape)\n", " date = dt.datetime.today().strftime('%Y-%m-%d')\n", " df_10jobs['retrieve_date'] = date\n", " df_10jobs.drop_duplicates(subset=['job_id', 'company_name'], inplace=True)\n", " rows_affected = sql_dump(df_10jobs, 'usajobs24')\n", " print(f\"Rows affected: {rows_affected}\")\n", "\n", "def main(job_list, city_state_list):\n", " with ThreadPoolExecutor() as executor:\n", " futures = []\n", " for job in job_list:\n", " for city_state in city_state_list:\n", " for start in range(0, 2):\n", " future = executor.submit(process_batch, job, city_state, start)\n", " futures.append(future)\n", "\n", " for future in as_completed(futures):\n", " future.result()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# job_list = [\"Data Analyst\", \"Data Engineer\", \"Big Data Engineer\"]\n", "# simple_city_state_list = [\"Menlo Park CA\", \"Palo Alto CA\", \"San Francisco CA\", \"Mountain View CA\"]\n", "# main(job_list, simple_city_state_list)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Great now that we have written some data lets read it." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sqlalchemy import create_engine\n", "\n", "def read_data_from_db(table_name):\n", " engine = create_engine(f\"postgresql://{os.getenv('PSQL_MASTER_NAME')}:{os.getenv('PSQL_KEY')}@{os.getenv('RDS_ENDPOINT')}:5432/postgres\")\n", " \n", " try:\n", " with engine.connect() as conn:\n", " query = f'SELECT * FROM \"{table_name}\"'\n", " df = pd.read_sql(query, conn)\n", " return df\n", " except Exception as e:\n", " print(f\"Error occurred while reading data from the database: {str(e)}\")\n", " return None\n", "\n", "data24_df = read_data_from_db('usajobstest')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(417, 7)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data24_df.shape" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | title | \n", "company_name | \n", "location | \n", "description | \n", "extensions | \n", "job_id | \n", "retrieve_date | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "Business Intelligence Analyst | \n", "Nuvolum | \n", "San Francisco, CA | \n", "Nuvolum combines innovative, data-driven strat... | \n", "{\"3 days ago\",Full-time,\"No degree mentioned\"} | \n", "eyJqb2JfdGl0bGUiOiJCdXNpbmVzcyBJbnRlbGxpZ2VuY2... | \n", "2024-05-04 | \n", "
1 | \n", "Sr. Strategy and Business Intelligence Analyst | \n", "Sunrun | \n", "San Francisco, CA (+1 other) | \n", "Everything we do at Sunrun is driven by a dete... | \n", "{\"12 days ago\",Full-time,\"Health insurance\",\"D... | \n", "eyJqb2JfdGl0bGUiOiJTci4gU3RyYXRlZ3kgYW5kIEJ1c2... | \n", "2024-05-04 | \n", "
2 | \n", "Business Intelligence Analyst | \n", "Side | \n", "Anywhere | \n", "Side, Inc. seeks Business Intelligence Analyst... | \n", "{\"11 days ago\",\"151,736–157,000 a year\",\"Work ... | \n", "eyJqb2JfdGl0bGUiOiJCdXNpbmVzcyBJbnRlbGxpZ2VuY2... | \n", "2024-05-04 | \n", "
3 | \n", "Senior Business Intelligence Developer | \n", "TekNavigators Staffing | \n", "San Francisco, CA | \n", "Role: Senior BI Developer\\n\\nLocation: San Fra... | \n", "{\"20 hours ago\",Contractor,\"No degree mentioned\"} | \n", "eyJqb2JfdGl0bGUiOiJTZW5pb3IgQnVzaW5lc3MgSW50ZW... | \n", "2024-05-04 | \n", "
4 | \n", "Senior Business Intelligence Analyst | \n", "FIS Fidelity National Information Services | \n", "San Francisco, CA | \n", "Position Type : Full time Type Of Hire : Exper... | \n", "{\"19 days ago\",Full-time} | \n", "eyJqb2JfdGl0bGUiOiJTZW5pb3IgQnVzaW5lc3MgSW50ZW... | \n", "2024-05-04 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
412 | \n", "Business Intelligence Analyst - Diabetes Marke... | \n", "Medtronic | \n", "Anywhere | \n", "Careers that Change Lives\\n\\nWe are looking fo... | \n", "{\"10 days ago\",\"Work from home\",Full-time,\"No ... | \n", "eyJqb2JfdGl0bGUiOiJCdXNpbmVzcyBJbnRlbGxpZ2VuY2... | \n", "2024-05-04 | \n", "
413 | \n", "IT Analyst, Business Intelligence/Data Warehou... | \n", "Keck Medicine of USC | \n", "Alhambra, CA | \n", "Actively design and develop ETL solutions that... | \n", "{\"13 days ago\",Full-time} | \n", "eyJqb2JfdGl0bGUiOiJJVCBBbmFseXN0LCBCdXNpbmVzcy... | \n", "2024-05-04 | \n", "
414 | \n", "Director, Business Intelligence | \n", "Deutsch LA | \n", "Los Angeles, CA | \n", "DIRECTOR, BUSINESS INTELLIGENCE\\n\\nWe are seek... | \n", "{\"3 days ago\",Full-time,\"No degree mentioned\"} | \n", "eyJqb2JfdGl0bGUiOiJEaXJlY3RvciwgQnVzaW5lc3MgSW... | \n", "2024-05-04 | \n", "
415 | \n", "Business Intelligence Programmer 1 | \n", "U.S. Bank | \n", "Los Angeles, CA | \n", "At U.S. Bank, we’re on a journey to do our bes... | \n", "{\"3 days ago\",Full-time,\"Health insurance\",\"De... | \n", "eyJqb2JfdGl0bGUiOiJCdXNpbmVzcyBJbnRlbGxpZ2VuY2... | \n", "2024-05-04 | \n", "
416 | \n", "Business Intelligence Analyst | \n", "BIGO | \n", "Los Angeles, CA | \n", "Location: 10250 Constellation Blvd., Century C... | \n", "{\"1 day ago\",Full-time,\"Health insurance\",\"Den... | \n", "eyJqb2JfdGl0bGUiOiJCdXNpbmVzcyBJbnRlbGxpZ2VuY2... | \n", "2024-05-04 | \n", "
417 rows × 7 columns
\n", "\n", " | title | \n", "company_name | \n", "location | \n", "description | \n", "
---|---|---|---|---|
0 | \n", "Business Intelligence Analyst | \n", "Nuvolum | \n", "San Francisco, CA | \n", "Nuvolum combines innovative, data-driven strat... | \n", "
1 | \n", "Sr. Strategy and Business Intelligence Analyst | \n", "Sunrun | \n", "San Francisco, CA (+1 other) | \n", "Everything we do at Sunrun is driven by a dete... | \n", "
2 | \n", "Business Intelligence Analyst | \n", "Side | \n", "Anywhere | \n", "Side, Inc. seeks Business Intelligence Analyst... | \n", "
3 | \n", "Senior Business Intelligence Developer | \n", "TekNavigators Staffing | \n", "San Francisco, CA | \n", "Role: Senior BI Developer\\n\\nLocation: San Fra... | \n", "
4 | \n", "Senior Business Intelligence Analyst | \n", "FIS Fidelity National Information Services | \n", "San Francisco, CA | \n", "Position Type : Full time Type Of Hire : Exper... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
412 | \n", "Business Intelligence Analyst - Diabetes Marke... | \n", "Medtronic | \n", "Anywhere | \n", "Careers that Change Lives\\n\\nWe are looking fo... | \n", "
413 | \n", "IT Analyst, Business Intelligence/Data Warehou... | \n", "Keck Medicine of USC | \n", "Alhambra, CA | \n", "Actively design and develop ETL solutions that... | \n", "
414 | \n", "Director, Business Intelligence | \n", "Deutsch LA | \n", "Los Angeles, CA | \n", "DIRECTOR, BUSINESS INTELLIGENCE\\n\\nWe are seek... | \n", "
415 | \n", "Business Intelligence Programmer 1 | \n", "U.S. Bank | \n", "Los Angeles, CA | \n", "At U.S. Bank, we’re on a journey to do our bes... | \n", "
416 | \n", "Business Intelligence Analyst | \n", "BIGO | \n", "Los Angeles, CA | \n", "Location: 10250 Constellation Blvd., Century C... | \n", "
405 rows × 4 columns
\n", "