-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
Add a script that merges old and new data, also, MAKEFILE!
- Loading branch information
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
# Keyword lists used to flag ARPA-funded projects as criminal-justice-related.
# Terms are grouped by subject area; each is matched against project text.
# NOTE(review): the scraped diff showed both the old tuple form and the new
# list form plus a stray "]" — only the final list assignments are kept here.

# Law-enforcement-related terms.
list_le = ["police", "PD", "gun", "law enforcement", "public safety", "crime",
           "criminal", "body cameras", "tasers", "armor", "sheriff", "officer"]

# Court-system-related terms.
list_court = ["court", "public defenders", "prosecutors", "juvenile court",
              "juvenile justice"]

# Corrections-related terms.
list_correction = ["jail", "prison", "correction", "incarcerated", "inmate",
                   "guards", "custody"]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,187 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 86, | ||
"id": "decreased-chocolate", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 87, | ||
"id": "invalid-possibility", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def read_data(filenames): \n", | ||
" df_old = pd.read_csv(filenames[0])\n", | ||
" df_new = pd.read_csv(filenames[1])\n", | ||
" \n", | ||
" return df_old, df_new" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 88, | ||
"id": "measured-illinois", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_unique_ids(df_old, df_new):\n", | ||
" df_old[\"tmp_id\"] = df_old[\"Recipient Name\"] +\\\n", | ||
" \"_\" + df_old[\"Project Name\"]\n", | ||
" \n", | ||
" df_new[\"tmp_id\"] = df_new[\"Recipient Name\"] +\\\n", | ||
" \"_\" + df_new[\"Project Name\"]\n", | ||
" \n", | ||
" return df_old, df_new" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 104, | ||
"id": "average-bench", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def merge(df_old_with_id, df_new_with_id):\n", | ||
" df_old_vetted = df_old_with_id[~df_old_with_id[\"vet\"].isnull()]\n", | ||
" df_old_vetted_short = df_old_vetted[[\"tmp_id\", 'law_enforcement', 'court', 'corrections', 'cj_related','vet', 'reporter']]\n", | ||
" \n", | ||
" # merge the new dataset with rows from the old one that we looked at.\n", | ||
    "    df_merge = pd.merge(df_new_with_id, df_old_vetted_short, on=\"tmp_id\", how=\"left\")\n", | ||
" \n", | ||
" # there are 33 rows that did not merge. Will export and investigate more.\n", | ||
" id_joined = pd.merge(df_new_with_id, df_old_vetted_short, on=\"tmp_id\")[\"tmp_id\"].to_list()\n", | ||
" df_did_not_merge = df_old_vetted[~df_old_vetted[\"tmp_id\"].isin(id_joined)]\n", | ||
" \n", | ||
" id_joined_all = pd.merge(df_new_with_id, df_old_with_id, on=\"tmp_id\")[\"tmp_id\"].to_list()\n", | ||
" df_did_not_merge_all = df_old_with_id[~df_old_with_id[\"tmp_id\"].isin(id_joined_all)]\n", | ||
"\n", | ||
" return df_merge, df_did_not_merge, df_did_not_merge_all" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 90, | ||
"id": "returning-ceremony", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path):\n", | ||
" df_merge.to_csv(df_merge_path, index=False)\n", | ||
" df_did_not_merge.to_csv(df_did_not_merge_path, index=False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "urban-dietary", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 105, | ||
"id": "violent-european", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"filenames = [\"source_data/cj_related/cj_related_old.csv\", \"source_data/cj_related/cj_related_new.csv\"]\n", | ||
"\n", | ||
"# read in two datasets\n", | ||
"df_old, df_new = read_data(filenames)\n", | ||
"# get unique id by combining columns in the two datasets\n", | ||
"df_old_with_id, df_new_with_id = get_unique_ids(df_old, df_new)\n", | ||
"# let's do some merging!\n", | ||
"df_merge, df_did_not_merge, df_did_not_merge_all = merge(df_old_with_id, df_new_with_id)\n", | ||
"\n", | ||
"# export\n", | ||
"df_merge_path = \"output_data/q1_data_with_303_vetted_info.csv\"\n", | ||
"df_did_not_merge_path = \"output_data/missing_from_q1.csv\"\n", | ||
"export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 110, | ||
"id": "central-meter", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df_did_not_merge_all.to_csv(\"output_data/not_merging.csv\", index=False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 109, | ||
"id": "bridal-fireplace", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"0.03772282921219092" | ||
] | ||
}, | ||
"execution_count": 109, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"1640/len(df_new)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "increasing-liberal", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "listed-recipient", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "recorded-carpet", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import pandas as pd | ||
|
||
# Ana and Weihua vetted 336 cj-related projects from the 2021 dataset | ||
# After getting the Q1 2022 dataset, we did not want to lose that progress and start over | ||
# So we created this script that creates unique IDs for cj-related projects, and merges the vetted data | ||
# from the old dataset to the new one. | ||
|
||
# Ideally you'll only need to run this script once new data come in | ||
# We did run into one problem: some projects from the old dataset do not exist in the new one. | ||
# Why? Gotta report that out! | ||
|
||
def read_data(filenames):
    """Load the old and new CSV exports into DataFrames.

    Parameters:
        filenames: two-element sequence — [path_to_old_csv, path_to_new_csv].

    Returns:
        (df_old, df_new) tuple of pandas DataFrames, in that order.
    """
    old_path, new_path = filenames[0], filenames[1]
    return pd.read_csv(old_path), pd.read_csv(new_path)
|
||
|
||
def get_unique_ids(df_old, df_new):
    """Add a "tmp_id" join key to both frames, in place.

    The key is "Recipient Name" + "_" + "Project Name", which identifies a
    project across the two quarterly exports.

    Returns:
        The same (df_old, df_new) objects, each with a new "tmp_id" column.
    """
    for frame in (df_old, df_new):
        frame["tmp_id"] = frame["Recipient Name"] + "_" + frame["Project Name"]
    return df_old, df_new
|
||
|
||
def merge(df_old_with_id, df_new_with_id):
    """Carry vetting columns from the old dataset over to the new one.

    Parameters:
        df_old_with_id: old dataset with a "tmp_id" column; rows we vetted
            have a non-null "vet" value.
        df_new_with_id: new dataset with a "tmp_id" column.

    Returns:
        (df_merge, df_did_not_merge):
        df_merge — the new dataset left-joined with the vetting columns of
            the old rows we looked at (unmatched new rows get NaN there);
        df_did_not_merge — vetted old rows whose tmp_id has no match in the
            new dataset, exported separately for investigation.
    """
    # Keep only the old rows that were actually vetted.
    df_old_vetted = df_old_with_id[~df_old_with_id["vet"].isnull()]
    vet_columns = ["tmp_id", 'law_enforcement', 'court', 'corrections',
                   'cj_related', 'vet', 'reporter']
    df_old_vetted_short = df_old_vetted[vet_columns]

    # merge the new dataset with rows from the old one that we looked at.
    # BUG FIX: previously this used the module-level global `df_new` instead
    # of the `df_new_with_id` parameter, so the function only worked when run
    # inside the script's __main__ flow.
    df_merge = pd.merge(df_new_with_id, df_old_vetted_short,
                        on="tmp_id", how="left")

    # Vetted old rows that found no partner in the new data (inner join gives
    # the ids that DID match; everything else did not merge).
    id_joined = pd.merge(df_new_with_id, df_old_vetted_short,
                         on="tmp_id")["tmp_id"].to_list()
    df_did_not_merge = df_old_vetted[~df_old_vetted["tmp_id"].isin(id_joined)]

    return df_merge, df_did_not_merge
|
||
def export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path):
    """Write both result frames to CSV (no index column)."""
    for frame, path in ((df_merge, df_merge_path),
                        (df_did_not_merge, df_did_not_merge_path)):
        frame.to_csv(path, index=False)
|
||
if __name__ == "__main__":
    # Input: the previously vetted 2021 export and the fresh Q1 2022 export.
    filenames = [
        "analysis/source_data/cj_related/cj_related_old.csv",
        "analysis/source_data/cj_related/cj_related_new.csv",
    ]

    # read in two datasets
    df_old, df_new = read_data(filenames)
    # get unique id by combining columns in the two datasets
    df_old_with_id, df_new_with_id = get_unique_ids(df_old, df_new)
    # let's do some merging!
    df_merge, df_did_not_merge = merge(df_old_with_id, df_new_with_id)

    # export the merged data and the vetted rows that found no match
    df_merge_path = "analysis/output_data/q1_data_with_303_vetted_info.csv"
    df_did_not_merge_path = "analysis/output_data/missing_from_q1.csv"
    export(df_merge, df_did_not_merge, df_merge_path, df_did_not_merge_path)