From 34571d88c0fe420625e36f47e4a84558af2b577b Mon Sep 17 00:00:00 2001 From: msj Date: Thu, 12 Sep 2024 14:49:00 -0400 Subject: [PATCH] Add independent expenditure scraper --- Makefile | 8 +- independent_expenditures.mk | 5 + .../scrape_independent_expenditures.py | 164 ++++++++++++++++++ 3 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 independent_expenditures.mk create mode 100644 scrapers/financial_disclosure/scrape_independent_expenditures.py diff --git a/Makefile b/Makefile index 6f3354c..ff43fba 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -include lobbyists.mk disclosures.mk candidates.mk +include lobbyists.mk disclosures.mk candidates.mk independent_expenditures.mk .PHONY : all upload-to-s3 clean @@ -6,13 +6,15 @@ all : data/processed/disclosures.xlsx \ data/processed/candidate_committees.csv \ data/processed/candidate_committee_filings.csv \ data/processed/pac_committees.csv \ - data/processed/pac_committee_filings.csv + data/processed/pac_committee_filings.csv \ + data/processed/independent_expenditures.csv upload-to-s3 : data/processed/disclosures.xlsx \ data/processed/candidate_committees.csv \ data/processed/candidate_committee_filings.csv \ data/processed/pac_committees.csv \ - data/processed/pac_committee_filings.csv + data/processed/pac_committee_filings.csv \ + data/processed/independent_expenditures.csv for file in $^; do aws s3 cp $$file $(S3BUCKET) --acl public-read; done diff --git a/independent_expenditures.mk b/independent_expenditures.mk new file mode 100644 index 0000000..ad0fd13 --- /dev/null +++ b/independent_expenditures.mk @@ -0,0 +1,5 @@ +.PHONY: independent_expenditures +independent_expenditures : data/processed/independent_expenditures.csv + +data/processed/independent_expenditures.csv : + python -m scrapers.financial_disclosure.scrape_independent_expenditures > $@ diff --git a/scrapers/financial_disclosure/scrape_independent_expenditures.py 
import csv
import json
import logging
import sys

import scrapelib

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

# Server-side page size for the transaction search endpoint; the loop
# stops when a page comes back shorter than this.
PAGE_SIZE = 1000


class IndependentExpenditureScraper(scrapelib.Scraper):
    """Scrape independent-expenditure ("IE") transactions from the New
    Mexico Secretary of State's campaign finance site (CFIS).

    ``scrape()`` yields one flat dict per (expenditure, donor) pair,
    suitable for ``csv.DictWriter``.
    """

    # Election cycles to fetch; extend as new cycles open.
    election_years = ("2021", "2022", "2023", "2024")

    # Placeholder emitted when a transaction has no itemized donors (or
    # the donor lookup fails) so the parent expenditure is still written.
    _EMPTY_DONOR = {
        "DonorName": None,
        "DonorAmount": None,
        "DonorAddress": None,
    }

    def _get_donor_details(self, transaction_id):
        """Return a list of donor dicts for one IE transaction.

        Always returns at least one row. BUG FIX: the original returned
        ``None`` on a failed request, which made the caller's
        ``for donor in donors`` raise ``TypeError``; a failed lookup now
        yields a single all-``None`` donor row instead, matching the
        no-donors branch.
        """
        response = self.get(
            "https://login.cfis.sos.state.nm.us/api/Public/"
            f"GetIEDonors?transactionId={transaction_id}&transactionVersId=1"
        )

        if not response.ok:
            logger.error(
                "Donor lookup failed for transaction %s", transaction_id
            )
            return [dict(self._EMPTY_DONOR)]

        donor_details = response.json()
        if not donor_details:
            return [dict(self._EMPTY_DONOR)]

        return [
            {
                "DonorName": donor["DonorName"],
                "DonorAmount": donor["DonatedAmount"],
                "DonorAddress": donor["Donor"]["Address"]["CompleteAddress"],
            }
            for donor in donor_details
        ]

    def _expenditures(self, year):
        """Yield raw transaction dicts for one election year, paging
        through the search endpoint until a short page is returned.

        Exits the process with status 1 on an HTTP error so the Make
        target fails instead of keeping a truncated CSV.
        """
        payload = {
            "TransactionType": "IE",
            "CommitteeType": None,
            "ElectionYear": year,
            "CommitteeName": None,
            "TransactionCategoryCode": None,
            "AmountType": None,
            "ContributorPayeeName": None,
            "TransactionBeginDate": None,
            "TransactionEndDate": None,
            "ValidationRequired": 0,
            "pageNumber": 1,
            "pageSize": PAGE_SIZE,
            "sortDir": "asc",
            "sortedBy": "",
            "TransactionAmount": None,
            "TransactionUnderAmount": None,
            "pacType": "",
            "Occupation": None,
            "StateCode": None,
            "city": None,
            "Zipcode": None,
            "ZipExt": None,
            "Reason": None,
            "Stance": "",
        }

        page_number = 1
        result_count = PAGE_SIZE

        while result_count == PAGE_SIZE:
            logger.debug("Fetching page %s for %s", page_number, year)

            # BUG FIX: the original only set "PageNo", leaving the
            # payload's "pageNumber" stuck at 1, so pagination never
            # advanced. Send the current page under both spellings;
            # TODO confirm which key the CFIS API actually honors.
            _payload = dict(
                payload, pageNumber=page_number, PageNo=page_number
            )

            logger.debug(_payload)

            response = self.post(
                "https://login.cfis.sos.state.nm.us/api///"
                "Search/TransactionSearchInformation",
                data=json.dumps(_payload),
                headers={"Content-Type": "application/json"},
                # NOTE(security): cert verification disabled — presumably
                # the state site's TLS chain is broken; confirm before
                # trusting this data for anything sensitive.
                verify=False,
            )

            if not response.ok:
                logger.error(
                    "Failed to fetch results:\n%s",
                    response.content.decode("utf-8"),
                )
                # BUG FIX: bare sys.exit() exits with status 0, which
                # Make would treat as success.
                sys.exit(1)

            results = response.json()

            yield from results

            result_count = len(results)

            logger.debug(
                "Last page %s had %s results", page_number, result_count
            )

            page_number += 1

    def scrape(self):
        """Yield one flat row per (expenditure, donor) pair across all
        configured election years."""
        for year in IndependentExpenditureScraper.election_years:
            for result in self._expenditures(year):
                transaction_id = result["TransactionId"]

                for donor in self._get_donor_details(transaction_id):
                    yield {
                        "ReportingEntityName": result["Name"],
                        "ReportingEntityType": result["CommitteeType"],
                        "Payee": result["ContributorPayeeName"],
                        "PayeeType": result["EntityTypeDescription"],
                        "PayeeAddress": result["Address"],
                        "TransactionDate": result["TransactionDate"],
                        "ExpenditureAmount": result["Amount"],
                        "ExpenditureDescription": result[
                            "TransactionPurposeDescription"
                        ],
                        "ElectionYear": result["ElectionYear"],
                        "ElectionType": result["ElectionPeriod"],
                        "Reason": result["Reason"],
                        "Stance": result["Stance"],
                        **donor,
                    }


if __name__ == "__main__":
    # Write scraped rows as CSV on stdout (the Make target redirects to
    # data/processed/independent_expenditures.csv); extrasaction="ignore"
    # drops any row keys not listed below.
    writer = csv.DictWriter(
        sys.stdout,
        fieldnames=[
            "TransactionDate",
            "ReportingEntityName",
            "ReportingEntityType",
            "Payee",
            "PayeeType",
            "PayeeAddress",
            "ExpenditureAmount",
            "ExpenditureDescription",
            "ElectionType",
            "ElectionYear",
            "Reason",
            "Stance",
            "DonorName",
            "DonorAddress",
            "DonorAmount",
        ],
        extrasaction="ignore",
    )

    writer.writeheader()

    scraper = IndependentExpenditureScraper(
        requests_per_minute=60, retry_attempts=3, verify=False
    )
    for result in scraper.scrape():
        writer.writerow(result)