Merge pull request #22 from themarshallproject/create-id
Add a script that merges old and new data, also, MAKEFILE!
Weihua4455 committed Aug 22, 2022
2 parents 8b8a2d0 + 0bf5e44 commit 1882805
Showing 14 changed files with 329 additions and 31 deletions.
3 changes: 3 additions & 0 deletions .idea/.gitignore

8 changes: 8 additions & 0 deletions .idea/arpa20220125.iml

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

4 changes: 4 additions & 0 deletions .idea/misc.xml

8 changes: 8 additions & 0 deletions .idea/modules.xml

6 changes: 6 additions & 0 deletions .idea/vcs.xml

6 changes: 6 additions & 0 deletions Makefile
@@ -16,6 +16,9 @@ all: analysis/output_data/output.csv ## Download source data and run R analysis
.PHONY: clean
clean: clean/source_data clean/output_data ## Clean files

.PHONY: merge_old_and_new_data
merge_old_and_new_data: analysis/output_data/q1_data_with_303_vetted_info.csv

.PHONY: help
help: ## Display this help
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z0-9\%\\.\/_-]+:.*?##/ { printf "\033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
@@ -31,6 +34,9 @@ analysis/output_data/output.csv: analysis/source_data/input.csv ## Run R analysis
@echo "Running R analysis"
Rscript analysis/analysis.R

##@ Merge old and new datasets
analysis/output_data/q1_data_with_303_vetted_info.csv:
$(PYENV) python analysis/merge_two_datasets_with_unique_id.py

##@ Source files
analysis/source_data/April-2022-Quarterly-and-Annual-Reporting-Data-through-March-31-2022.xlsx: ## Download April twenty-two ARPA Data
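With the new target wired in, the merged dataset can be rebuilt on demand. A minimal sketch of the intended invocation from the repository root (assuming $(PYENV) is defined earlier in the Makefile to activate the project's Python environment):

make merge_old_and_new_data
# equivalent to running: $(PYENV) python analysis/merge_two_datasets_with_unique_id.py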
7 changes: 4 additions & 3 deletions analysis/classfications
@@ -1,5 +1,6 @@
-list_le = "police","PD","gun","law enforcement","public safety","crime","criminal","body cameras","tasers","armor","sheriff","officer"
+list_le = ["police","PD","gun","law enforcement","public safety","crime","criminal","body cameras","tasers","armor","sheriff","officer"]

-list_court = "court","public defenders","prosecutors","juvenile court","juvenile justice"
+list_court = ["court","public defenders","prosecutors","juvenile court","juvenile justice"]

-list_correction = "jail","prison","correction","incarcerated","inmate","guards","custody"
+list_correction = ["jail","prison","correction","incarcerated","inmate","guards","custody"]
+]
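The bracket fix turns each comma-separated run of keywords into a proper Python list. A hypothetical sketch of how such a list would drive the cj-related flagging (the description value and flag name are illustrative, not from the repo):

list_le = ["police", "PD", "gun", "law enforcement", "public safety", "crime",
           "criminal", "body cameras", "tasers", "armor", "sheriff", "officer"]

description = "Purchase body cameras for sheriff's deputies"  # illustrative value
# flag the project if any law-enforcement keyword appears in its description
is_law_enforcement = any(term.lower() in description.lower() for term in list_le)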
187 changes: 187 additions & 0 deletions analysis/create_id.ipynb
@@ -0,0 +1,187 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 86,
"id": "decreased-chocolate",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "invalid-possibility",
"metadata": {},
"outputs": [],
"source": [
"def read_data(filenames): \n",
" df_old = pd.read_csv(filenames[0])\n",
" df_new = pd.read_csv(filenames[1])\n",
" \n",
" return df_old, df_new"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "measured-illinois",
"metadata": {},
"outputs": [],
"source": [
"def get_unique_ids(df_old, df_new):\n",
" df_old[\"tmp_id\"] = df_old[\"Recipient Name\"] +\\\n",
" \"_\" + df_old[\"Project Name\"]\n",
" \n",
" df_new[\"tmp_id\"] = df_new[\"Recipient Name\"] +\\\n",
" \"_\" + df_new[\"Project Name\"]\n",
" \n",
" return df_old, df_new"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "average-bench",
"metadata": {},
"outputs": [],
"source": [
"def merge(df_old_with_id, df_new_with_id):\n",
" df_old_vetted = df_old_with_id[~df_old_with_id[\"vet\"].isnull()]\n",
" df_old_vetted_short = df_old_vetted[[\"tmp_id\", 'law_enforcement', 'court', 'corrections', 'cj_related','vet', 'reporter']]\n",
" \n",
" # merge the new dataset with rows from the old one that we looked at.\n",
" df_merge = pd.merge(df_new, df_old_vetted_short, on=\"tmp_id\", how=\"left\")\n",
" \n",
" # there are 33 rows that did not merge. Will export and investigate more.\n",
" id_joined = pd.merge(df_new_with_id, df_old_vetted_short, on=\"tmp_id\")[\"tmp_id\"].to_list()\n",
" df_did_not_merge = df_old_vetted[~df_old_vetted[\"tmp_id\"].isin(id_joined)]\n",
" \n",
" id_joined_all = pd.merge(df_new_with_id, df_old_with_id, on=\"tmp_id\")[\"tmp_id\"].to_list()\n",
" df_did_not_merge_all = df_old_with_id[~df_old_with_id[\"tmp_id\"].isin(id_joined_all)]\n",
"\n",
" return df_merge, df_did_not_merge, df_did_not_merge_all"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "returning-ceremony",
"metadata": {},
"outputs": [],
"source": [
"def export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path):\n",
" df_merge.to_csv(df_merge_path, index=False)\n",
" df_did_not_merge.to_csv(df_did_not_merge_path, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "urban-dietary",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 105,
"id": "violent-european",
"metadata": {},
"outputs": [],
"source": [
"filenames = [\"source_data/cj_related/cj_related_old.csv\", \"source_data/cj_related/cj_related_new.csv\"]\n",
"\n",
"# read in two datasets\n",
"df_old, df_new = read_data(filenames)\n",
"# get unique id by combining columns in the two datasets\n",
"df_old_with_id, df_new_with_id = get_unique_ids(df_old, df_new)\n",
"# let's do some merging!\n",
"df_merge, df_did_not_merge, df_did_not_merge_all = merge(df_old_with_id, df_new_with_id)\n",
"\n",
"# export\n",
"df_merge_path = \"output_data/q1_data_with_303_vetted_info.csv\"\n",
"df_did_not_merge_path = \"output_data/missing_from_q1.csv\"\n",
"export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"id": "central-meter",
"metadata": {},
"outputs": [],
"source": [
"df_did_not_merge_all.to_csv(\"output_data/not_merging.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "bridal-fireplace",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.03772282921219092"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"1640/len(df_new)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "increasing-liberal",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "listed-recipient",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "recorded-carpet",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
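The last executed cell above puts the flagged projects in context: 1640 / len(df_new) ≈ 0.0377, so if 1,640 is the number of cj-related rows, they amount to roughly 3.8% of the new Q1 dataset (implying df_new holds about 43,500 rows).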
60 changes: 60 additions & 0 deletions analysis/merge_two_datasets_with_unique_id.py
@@ -0,0 +1,60 @@
import pandas as pd

# Ana and Weihua vetted 336 cj-related projects from the 2021 dataset
# After getting the Q1 2022 dataset, we did not want to lose that progress and start over
# So we created this script, which creates unique IDs for cj-related projects and merges the vetted data
# from the old dataset to the new one.

# Ideally, you'll only need to run this script whenever new data comes in
# We did run into one problem: some projects from the old dataset do not exist in the new one.
# Why? Gotta report that out!
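# For illustration (hypothetical values): a tmp_id takes the form
# "<Recipient Name>_<Project Name>", e.g. "City of Springfield_Premium Pay for
# Essential Workers", which is what lets vetted rows line up across quarters.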

def read_data(filenames):
    df_old = pd.read_csv(filenames[0])
    df_new = pd.read_csv(filenames[1])

    return df_old, df_new


def get_unique_ids(df_old, df_new):
    df_old["tmp_id"] = df_old["Recipient Name"] + \
        "_" + df_old["Project Name"]

    df_new["tmp_id"] = df_new["Recipient Name"] + \
        "_" + df_new["Project Name"]

    return df_old, df_new


def merge(df_old_with_id, df_new_with_id):
    df_old_vetted = df_old_with_id[~df_old_with_id["vet"].isnull()]
    df_old_vetted_short = df_old_vetted[
        ["tmp_id", "law_enforcement", "court", "corrections", "cj_related", "vet", "reporter"]]

    # merge the new dataset with rows from the old one that we looked at
    df_merge = pd.merge(df_new_with_id, df_old_vetted_short, on="tmp_id", how="left")

    # there are 33 rows that did not merge; we'll export and investigate more
    id_joined = pd.merge(df_new_with_id, df_old_vetted_short, on="tmp_id")["tmp_id"].to_list()
    df_did_not_merge = df_old_vetted[~df_old_vetted["tmp_id"].isin(id_joined)]

    return df_merge, df_did_not_merge


def export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path):
    df_merge.to_csv(df_merge_path, index=False)
    df_did_not_merge.to_csv(df_did_not_merge_path, index=False)


if __name__ == "__main__":
    filenames = ["analysis/source_data/cj_related/cj_related_old.csv",
                 "analysis/source_data/cj_related/cj_related_new.csv"]

    # read in two datasets
    df_old, df_new = read_data(filenames)
    # get unique id by combining columns in the two datasets
    df_old_with_id, df_new_with_id = get_unique_ids(df_old, df_new)
    # let's do some merging!
    df_merge, df_did_not_merge = merge(df_old_with_id, df_new_with_id)

    # export
    df_merge_path = "analysis/output_data/q1_data_with_303_vetted_info.csv"
    df_did_not_merge_path = "analysis/output_data/missing_from_q1.csv"
    export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path)
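In practice the script runs from the repository root (so the relative analysis/ paths resolve), either through the Makefile target above or directly; a sketch, assuming a Python environment with pandas installed:

make merge_old_and_new_data
# or, equivalently:
python analysis/merge_two_datasets_with_unique_id.py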
3 changes: 3 additions & 0 deletions analysis/output_data/missing_from_q1.csv
Git LFS file not shown
3 changes: 3 additions & 0 deletions analysis/output_data/not_merging.csv
Git LFS file not shown
3 changes: 3 additions & 0 deletions analysis/output_data/q1_data_with_303_vetted_info.csv
Git LFS file not shown