Merge pull request #22 from themarshallproject/create-id
Add a script that merges old and new data, also, MAKEFILE!
Weihua4455 committed Aug 22, 2022
2 parents 8b8a2d0 + 0bf5e44 commit 1882805
Showing 14 changed files with 329 additions and 31 deletions.
3 changes: 3 additions & 0 deletions .idea/.gitignore

8 changes: 8 additions & 0 deletions .idea/arpa20220125.iml

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

4 changes: 4 additions & 0 deletions .idea/misc.xml

8 changes: 8 additions & 0 deletions .idea/modules.xml

6 changes: 6 additions & 0 deletions .idea/vcs.xml

6 changes: 6 additions & 0 deletions Makefile
@@ -16,6 +16,9 @@ all: analysis/output_data/output.csv ## Download source data and run R analysis
.PHONY: clean
clean: clean/source_data clean/output_data ## Clean files

.PHONY: merge_old_and_new_data
merge_old_and_new_data: analysis/output_data/q1_data_with_303_vetted_info.csv

.PHONY: help
help: ## Display this help
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z0-9\%\\.\/_-]+:.*?##/ { printf "\033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
@@ -31,6 +34,9 @@ analysis/output_data/output.csv: analysis/source_data/input.csv ## Run R analysis
@echo "Running R analysis"
Rscript analysis/analysis.R

##@ Merge old and new datasets
analysis/output_data/q1_data_with_303_vetted_info.csv:
$(PYENV) python analysis/merge_two_datasets_with_unique_id.py

##@ Source files
analysis/source_data/April-2022-Quarterly-and-Annual-Reporting-Data-through-March-31-2022.xlsx: ## Download April twenty-two ARPA Data
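With the new target wired in, the merged dataset can be rebuilt on demand. A minimal sketch of the intended invocation from the repository root (assuming $(PYENV) is defined earlier in the Makefile to activate the project's Python environment):

make merge_old_and_new_data
# equivalent to running: $(PYENV) python analysis/merge_two_datasets_with_unique_id.py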
7 changes: 4 additions & 3 deletions analysis/classfications
@@ -1,5 +1,6 @@
-list_le = "police","PD","gun","law enforcement","public safety","crime","criminal","body cameras","tasers","armor","sheriff","officer"
+list_le = ["police","PD","gun","law enforcement","public safety","crime","criminal","body cameras","tasers","armor","sheriff","officer"]

-list_court = "court","public defenders","prosecutors","juvenile court","juvenile justice"
+list_court = ["court","public defenders","prosecutors","juvenile court","juvenile justice"]

-list_correction = "jail","prison","correction","incarcerated","inmate","guards","custody"
+list_correction = ["jail","prison","correction","incarcerated","inmate","guards","custody"]
+]
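The bracket fix turns each comma-separated run of keywords into a proper Python list. A hypothetical sketch of how such a list would drive the cj-related flagging (the description value and flag name are illustrative, not from the repo):

list_le = ["police", "PD", "gun", "law enforcement", "public safety", "crime",
           "criminal", "body cameras", "tasers", "armor", "sheriff", "officer"]

description = "Purchase body cameras for sheriff's deputies"  # illustrative value
# flag the project if any law-enforcement keyword appears in its description
is_law_enforcement = any(term.lower() in description.lower() for term in list_le)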
187 changes: 187 additions & 0 deletions analysis/create_id.ipynb
@@ -0,0 +1,187 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 86,
"id": "decreased-chocolate",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "invalid-possibility",
"metadata": {},
"outputs": [],
"source": [
"def read_data(filenames): \n",
" df_old = pd.read_csv(filenames[0])\n",
" df_new = pd.read_csv(filenames[1])\n",
" \n",
" return df_old, df_new"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "measured-illinois",
"metadata": {},
"outputs": [],
"source": [
"def get_unique_ids(df_old, df_new):\n",
" df_old[\"tmp_id\"] = df_old[\"Recipient Name\"] +\\\n",
" \"_\" + df_old[\"Project Name\"]\n",
" \n",
" df_new[\"tmp_id\"] = df_new[\"Recipient Name\"] +\\\n",
" \"_\" + df_new[\"Project Name\"]\n",
" \n",
" return df_old, df_new"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "average-bench",
"metadata": {},
"outputs": [],
"source": [
"def merge(df_old_with_id, df_new_with_id):\n",
" df_old_vetted = df_old_with_id[~df_old_with_id[\"vet\"].isnull()]\n",
" df_old_vetted_short = df_old_vetted[[\"tmp_id\", 'law_enforcement', 'court', 'corrections', 'cj_related','vet', 'reporter']]\n",
" \n",
" # merge the new dataset with rows from the old one that we looked at.\n",
" df_merge = pd.merge(df_new, df_old_vetted_short, on=\"tmp_id\", how=\"left\")\n",
" \n",
" # there are 33 rows that did not merge. Will export and investigate more.\n",
" id_joined = pd.merge(df_new_with_id, df_old_vetted_short, on=\"tmp_id\")[\"tmp_id\"].to_list()\n",
" df_did_not_merge = df_old_vetted[~df_old_vetted[\"tmp_id\"].isin(id_joined)]\n",
" \n",
" id_joined_all = pd.merge(df_new_with_id, df_old_with_id, on=\"tmp_id\")[\"tmp_id\"].to_list()\n",
" df_did_not_merge_all = df_old_with_id[~df_old_with_id[\"tmp_id\"].isin(id_joined_all)]\n",
"\n",
" return df_merge, df_did_not_merge, df_did_not_merge_all"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "returning-ceremony",
"metadata": {},
"outputs": [],
"source": [
"def export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path):\n",
" df_merge.to_csv(df_merge_path, index=False)\n",
" df_did_not_merge.to_csv(df_did_not_merge_path, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "urban-dietary",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 105,
"id": "violent-european",
"metadata": {},
"outputs": [],
"source": [
"filenames = [\"source_data/cj_related/cj_related_old.csv\", \"source_data/cj_related/cj_related_new.csv\"]\n",
"\n",
"# read in two datasets\n",
"df_old, df_new = read_data(filenames)\n",
"# get unique id by combining columns in the two datasets\n",
"df_old_with_id, df_new_with_id = get_unique_ids(df_old, df_new)\n",
"# let's do some merging!\n",
"df_merge, df_did_not_merge, df_did_not_merge_all = merge(df_old_with_id, df_new_with_id)\n",
"\n",
"# export\n",
"df_merge_path = \"output_data/q1_data_with_303_vetted_info.csv\"\n",
"df_did_not_merge_path = \"output_data/missing_from_q1.csv\"\n",
"export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"id": "central-meter",
"metadata": {},
"outputs": [],
"source": [
"df_did_not_merge_all.to_csv(\"output_data/not_merging.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "bridal-fireplace",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.03772282921219092"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"1640/len(df_new)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "increasing-liberal",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "listed-recipient",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "recorded-carpet",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
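The last executed cell above puts the flagged projects in context: 1640 / len(df_new) ≈ 0.0377, so if 1,640 is the number of cj-related rows, they amount to roughly 3.8% of the new Q1 dataset (implying df_new holds about 43,500 rows).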
60 changes: 60 additions & 0 deletions analysis/merge_two_datasets_with_unique_id.py
@@ -0,0 +1,60 @@
import pandas as pd

# Ana and Weihua vetted 336 cj-related projects from the 2021 dataset
# After getting the Q1 2022 dataset, we did not want to lose that progress and start over
# So we created this script, which creates unique IDs for cj-related projects and merges the vetted data
# from the old dataset to the new one.

# Ideally, you'll only need to run this script whenever new data comes in
# We did run into one problem: some projects from the old dataset do not exist in the new one.
# Why? Gotta report that out!
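# For illustration (hypothetical values): a tmp_id takes the form
# "<Recipient Name>_<Project Name>", e.g. "City of Springfield_Premium Pay for
# Essential Workers", which is what lets vetted rows line up across quarters.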

def read_data(filenames):
    df_old = pd.read_csv(filenames[0])
    df_new = pd.read_csv(filenames[1])

    return df_old, df_new


def get_unique_ids(df_old, df_new):
    df_old["tmp_id"] = df_old["Recipient Name"] + \
        "_" + df_old["Project Name"]

    df_new["tmp_id"] = df_new["Recipient Name"] + \
        "_" + df_new["Project Name"]

    return df_old, df_new


def merge(df_old_with_id, df_new_with_id):
    df_old_vetted = df_old_with_id[~df_old_with_id["vet"].isnull()]
    df_old_vetted_short = df_old_vetted[
        ["tmp_id", "law_enforcement", "court", "corrections", "cj_related", "vet", "reporter"]]

    # merge the new dataset with rows from the old one that we looked at
    df_merge = pd.merge(df_new_with_id, df_old_vetted_short, on="tmp_id", how="left")

    # there are 33 rows that did not merge; we'll export and investigate more
    id_joined = pd.merge(df_new_with_id, df_old_vetted_short, on="tmp_id")["tmp_id"].to_list()
    df_did_not_merge = df_old_vetted[~df_old_vetted["tmp_id"].isin(id_joined)]

    return df_merge, df_did_not_merge


def export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path):
    df_merge.to_csv(df_merge_path, index=False)
    df_did_not_merge.to_csv(df_did_not_merge_path, index=False)


if __name__ == "__main__":
    filenames = ["analysis/source_data/cj_related/cj_related_old.csv",
                 "analysis/source_data/cj_related/cj_related_new.csv"]

    # read in two datasets
    df_old, df_new = read_data(filenames)
    # get unique id by combining columns in the two datasets
    df_old_with_id, df_new_with_id = get_unique_ids(df_old, df_new)
    # let's do some merging!
    df_merge, df_did_not_merge = merge(df_old_with_id, df_new_with_id)

    # export
    df_merge_path = "analysis/output_data/q1_data_with_303_vetted_info.csv"
    df_did_not_merge_path = "analysis/output_data/missing_from_q1.csv"
    export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path)
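In practice the script runs from the repository root (so the relative analysis/ paths resolve), either through the Makefile target above or directly; a sketch, assuming a Python environment with pandas installed:

make merge_old_and_new_data
# or, equivalently:
python analysis/merge_two_datasets_with_unique_id.py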
3 changes: 3 additions & 0 deletions analysis/output_data/missing_from_q1.csv
Git LFS file not shown
3 changes: 3 additions & 0 deletions analysis/output_data/not_merging.csv
Git LFS file not shown
3 changes: 3 additions & 0 deletions analysis/output_data/q1_data_with_303_vetted_info.csv
Git LFS file not shown