{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"%pip install -q -U pandarallel","metadata":{"execution":{"iopub.status.busy":"2023-07-13T20:36:39.815633Z","iopub.execute_input":"2023-07-13T20:36:39.816064Z","iopub.status.idle":"2023-07-13T20:36:56.731310Z","shell.execute_reply.started":"2023-07-13T20:36:39.816027Z","shell.execute_reply":"2023-07-13T20:36:56.729385Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":[
"from ast import literal_eval\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from pandarallel import pandarallel\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from tqdm import tqdm\n",
"\n",
"\n",
"def _adjacent_cosine_similarities(embeddings):\n",
"    \"\"\"Return the cosine similarity of every pair of consecutive embeddings.\n",
"\n",
"    Element ``k`` of the result compares ``embeddings[k]`` with\n",
"    ``embeddings[k + 1]``, so the result has ``len(embeddings) - 1`` items\n",
"    (none when fewer than two embeddings are given).\n",
"    \"\"\"\n",
"    # cosine_similarity only accepts 2-D input; atleast_2d turns a flat vector\n",
"    # into a single-row matrix and leaves already nested embeddings untouched.\n",
"    vectors = [np.atleast_2d(np.asarray(emb, dtype=float)) for emb in embeddings]\n",
"    pairs = zip(vectors, vectors[1:])\n",
"    n_pairs = max(len(vectors) - 1, 0)\n",
"    return [\n",
"        cosine_similarity(left, right)[0][0]\n",
"        for left, right in tqdm(pairs, total=n_pairs, desc=\"cosine similarity\")\n",
"    ]\n",
"\n",
"\n",
"def compute_rolling_text_sim(df, similarity_thresh=0.75):\n",
"    \"\"\"Merge each sentence with up to two following, similar sentences.\n",
"\n",
"    A window of three consecutive rows is slid over ``df``. Row ``i`` is\n",
"    joined (space separated) with row ``i + 1`` when their embeddings have a\n",
"    cosine similarity of at least ``similarity_thresh`` and both rows share\n",
"    the same ``File_Name`` and ``Paragraph_id``. Row ``i + 2`` is added as\n",
"    well when it passes the same test against row ``i + 1``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Must hold the columns listed in ``columns`` below plus ``Embedding``.\n",
"        Rows are used in their positional order, whatever the index is.\n",
"    similarity_thresh : float, default 0.75\n",
"        Minimum cosine similarity for two neighbouring rows to be merged.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per window, i.e. ``len(df) - 2`` rows (the last two input\n",
"        rows never start a window), with ``Text`` replaced by the merged\n",
"        text and without the ``Embedding`` column. Empty when ``df`` has\n",
"        fewer than three rows.\n",
"    \"\"\"\n",
"    # The Embedding column is deliberately left out of the result.\n",
"    columns = [\n",
"        \"Text\",\n",
"        \"Name\",\n",
"        \"Company\",\n",
"        \"Position\",\n",
"        \"Year\",\n",
"        \"Month\",\n",
"        \"Date\",\n",
"        \"Ticker\",\n",
"        \"Section\",\n",
"        \"Quarter\",\n",
"        \"Participant_Type\",\n",
"        \"Paragraph_id\",\n",
"        \"File_Name\",\n",
"        \"Sentence_id\",\n",
"        \"QA_Flag\",\n",
"    ]\n",
"\n",
"    # A window needs three rows; shorter frames produce an empty result.\n",
"    n_windows = max(len(df) - 2, 0)\n",
"\n",
"    # Plain lists give positional access that does not depend on the index.\n",
"    texts = df[\"Text\"].tolist()\n",
"    file_names = df[\"File_Name\"].tolist()\n",
"    paragraph_ids = df[\"Paragraph_id\"].tolist()\n",
"    # Each neighbouring pair is compared once and shared by overlapping windows.\n",
"    similarities = _adjacent_cosine_similarities(df[\"Embedding\"].tolist())\n",
"\n",
"    def can_merge(left):\n",
"        \"\"\"Tell whether row ``left`` and the row after it may be joined.\"\"\"\n",
"        return (\n",
"            similarities[left] >= similarity_thresh\n",
"            and file_names[left] == file_names[left + 1]\n",
"            and paragraph_ids[left] == paragraph_ids[left + 1]\n",
"        )\n",
"\n",
"    merged_texts = []\n",
"    for i in tqdm(range(n_windows), desc=\"merging windows\"):\n",
"        new_text = texts[i]\n",
"        if can_merge(i):\n",
"            new_text = new_text + \" \" + texts[i + 1]\n",
"            # The third sentence must stay inside the same file and paragraph\n",
"            # too, otherwise text leaks across paragraph boundaries.\n",
"            if can_merge(i + 1):\n",
"                new_text = new_text + \" \" + texts[i + 2]\n",
"        merged_texts.append(new_text)\n",
"\n",
"    # Build the frame in one go instead of appending row by row, which is\n",
"    # quadratic and relies on DataFrame.append (removed in pandas 2.0).\n",
"    new_df = df.loc[:, columns].iloc[:n_windows].reset_index(drop=True)\n",
"    new_df[\"Text\"] = merged_texts\n",
"    return new_df\n",
"\n",
"\n",
"pandarallel.initialize(progress_bar=True)\n",
"path = \"/kaggle/input/week18-generate-emb-csv/earnings_calls_cleaned_metadata_keywords_indices.csv\"\n",
"data = pd.read_csv(path)\n",
"print(data.columns)\n",
"# Parse the Embedding strings read from the CSV back into Python objects.\n",
"data[\"Embedding\"] = data[\"Embedding\"].parallel_apply(literal_eval)\n",
"\n",
"test_df = compute_rolling_text_sim(data)\n",
"print(test_df.head(20))\n",
"\n",
"test_df.to_csv(\"test_df.csv\", index=False)\n"
],"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2023-07-13T20:38:04.663724Z","iopub.execute_input":"2023-07-13T20:38:04.664130Z"},"trusted":true},"execution_count":null,"outputs":[{"name":"stdout","text":"INFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\nIndex(['Text', 'Name', 'Company', 'Position', 'Year', 'Month', 'Date',\n 'Ticker', 'Section', 'Quarter', 'Participant_Type', 'Paragraph_id',\n 'File_Name', 'Sentence_id', 'QA_Flag', 'Embedding'],\n 
dtype='object')\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=29393), Label(value='0 / 29393')))…","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"befdc9ceffa04e0eb7b818db2134b599"}},"metadata":{}},{"name":"stderr","text":" 0%| | 0/58783 [00:00