From df9e009baddba7f5d91ec6087cea050e44fe0387 Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Mon, 5 Aug 2024 15:58:57 -0400 Subject: [PATCH 01/15] mgmt cmd structure --- .../tdpservice/parsers/management/__init__.py | 0 .../parsers/management/commands/__init__.py | 0 .../parsers/management/commands/seed_db.py | 70 +++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 tdrs-backend/tdpservice/parsers/management/__init__.py create mode 100644 tdrs-backend/tdpservice/parsers/management/commands/__init__.py create mode 100644 tdrs-backend/tdpservice/parsers/management/commands/seed_db.py diff --git a/tdrs-backend/tdpservice/parsers/management/__init__.py b/tdrs-backend/tdpservice/parsers/management/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tdrs-backend/tdpservice/parsers/management/commands/__init__.py b/tdrs-backend/tdpservice/parsers/management/commands/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py new file mode 100644 index 0000000000..82773acf30 --- /dev/null +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -0,0 +1,70 @@ +"""`populate_stts` command.""" + +import csv +import json +import logging +from pathlib import Path + +from django.core.management import BaseCommand +from django.utils import timezone + +from ...models import STT, Region + +DATA_DIR = BASE_DIR = Path(__file__).resolve().parent / "data" +logger = logging.getLogger(__name__) + + +def _populate_regions(): + with open(DATA_DIR / "regions.csv") as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + Region.objects.get_or_create(id=row["Id"]) + Region.objects.get_or_create(id=1000) + +def _load_csv(filename, entity): + with open(DATA_DIR / filename) as csvfile: + reader = csv.DictReader(csvfile) + + for row in reader: + stt, stt_created = STT.objects.get_or_create(name=row["Name"]) + if stt_created: # These lines are spammy, should remove before merge + logger.debug("Created new entry for " + row["Name"]) + + stt.postal_code = row["Code"] + stt.region_id = row["Region"] + if filename == "tribes.csv": + stt.state = STT.objects.get(postal_code=row["Code"], type=STT.EntityType.STATE) + + chars = 3 if entity == STT.EntityType.TRIBE else 2 + stt.stt_code = str(row["STT_CODE"]).zfill(chars) + + stt.type = entity + stt.filenames = json.loads(row["filenames"].replace('\'', '"')) + stt.ssp = row["SSP"] + stt.sample = row["Sample"] + # TODO: Was seeing lots of references to STT.objects.filter(pk=... + # We could probably one-line this but we'd miss .save() signals + # https://stackoverflow.com/questions/41744096/ + # TODO: we should finish the last columns from the csvs: Sample, SSN_Encrypted + stt.save() + + +class Command(BaseCommand): + """Command class.""" + + help = "Populate regions, states, territories, and tribes." 
+ + def handle(self, *args, **options): + """Populate the various regions, states, territories, and tribes.""" + _populate_regions() + + stt_map = [ + ("states.csv", STT.EntityType.STATE), + ("territories.csv", STT.EntityType.TERRITORY), + ("tribes.csv", STT.EntityType.TRIBE) + ] + + for datafile, entity in stt_map: + _load_csv(datafile, entity) + + logger.info("STT import executed by Admin at %s", timezone.now()) From c333266ffc6de1aeab8d9afbba61db7645a4f27d Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Tue, 6 Aug 2024 10:42:03 -0400 Subject: [PATCH 02/15] Pushing after pair w/ Jan --- Taskfile.yml | 9 + .../parsers/management/commands/seed_db.py | 245 ++++++++++++++---- 2 files changed, 200 insertions(+), 54 deletions(-) diff --git a/Taskfile.yml b/Taskfile.yml index 74f3e9c7c6..c8d13b79d4 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -59,6 +59,15 @@ tasks: cmds: - docker-compose -f docker-compose.yml exec web sh -c "python ./manage.py shell" + backend-exec: + desc: Execute a command in the backend container + dir: tdrs-backend + vars: + CMD: '{{.CMD}}' + cmds: + - docker-compose -f docker-compose.yml up -d + - docker-compose -f docker-compose.yml exec web sh -c "python ./manage.py {{.CMD}}" + backend-pytest: desc: 'Run pytest in the backend container E.g: task backend-pytest PYTEST_ARGS="tdpservice/test/ -s -vv"' dir: tdrs-backend diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py index 82773acf30..152b5705f1 100644 --- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -1,6 +1,5 @@ -"""`populate_stts` command.""" +"""`seed_db` command.""" -import csv import json import logging from pathlib import Path @@ -8,63 +7,201 @@ from django.core.management import BaseCommand from django.utils import timezone -from ...models import STT, Region - -DATA_DIR = BASE_DIR = Path(__file__).resolve().parent / "data" +from tdpservice.parsers.schema_defs.utils import get_schema_options, get_text_from_df # maybe need other utilities +from tdpservice.parsers.test.factories import ParsingFileFactory, TanfT1Factory # maybe need other factories +from tdpservice.parsers.schema_defs.header import header +from tdpservice.parsers.schema_defs.tanf import t1, t2, t3, t4, t5, t6, t7 +from tdpservice.parsers import schema_defs +from tdpservice.data_files.models import DataFile logger = logging.getLogger(__name__) - -def _populate_regions(): - with open(DATA_DIR / "regions.csv") as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - Region.objects.get_or_create(id=row["Id"]) - Region.objects.get_or_create(id=1000) - -def _load_csv(filename, entity): - with open(DATA_DIR / filename) as csvfile: - reader = csv.DictReader(csvfile) - - for row in reader: - stt, stt_created = STT.objects.get_or_create(name=row["Name"]) - if stt_created: # These lines are spammy, should remove before merge - logger.debug("Created new entry for " + row["Name"]) - - stt.postal_code = row["Code"] - stt.region_id = row["Region"] - if filename == "tribes.csv": - stt.state = STT.objects.get(postal_code=row["Code"], type=STT.EntityType.STATE) - - chars = 3 if entity == STT.EntityType.TRIBE else 2 - stt.stt_code = str(row["STT_CODE"]).zfill(chars) - - stt.type = entity - stt.filenames = json.loads(row["filenames"].replace('\'', '"')) - stt.ssp = row["SSP"] - stt.sample = row["Sample"] - # TODO: Was seeing lots of references to STT.objects.filter(pk=... 
- # We could probably one-line this but we'd miss .save() signals - # https://stackoverflow.com/questions/41744096/ - # TODO: we should finish the last columns from the csvs: Sample, SSN_Encrypted - stt.save() - +''' +Create a tool/mechanism for generating "random" and internally consistent data + implement faker for test factories + create datafile generator from factories + + class ParsingFileFactory(factory.django.DjangoModelFactory): + # class DataFileSummaryFactory(factory.django.DjangoModelFactory): + # class ParserErrorFactory(factory.django.DjangoModelFactory): + class TanfT1Factory(factory.django.DjangoModelFactory): + ... + + parse generated datafiles + +Ensure tool is version/schema aware (we should be able to use django's migration table) +Tool will "fuzz" or generate out of range values to intentionally create issues [ stretch ] + utilize parsers/schema_defs/utils.py as a reference for generating the `file__data` line by line + have to start with header + + given a ParsingFileFactory, generate a file__data line by line + +Create django command such that tool can be pointed at deployed environments +''' + +schema_options = { + 'TAN': { + 'A': { + 'section': DataFile.Section.ACTIVE_CASE_DATA, + 'models': { + 'T1': schema_defs.tanf.t1, + 'T2': schema_defs.tanf.t2, + 'T3': schema_defs.tanf.t3, + } + }, + 'C': { + 'section': DataFile.Section.CLOSED_CASE_DATA, + 'models': { + 'T4': schema_defs.tanf.t4, + 'T5': schema_defs.tanf.t5, + } + }, + 'G': { + 'section': DataFile.Section.AGGREGATE_DATA, + 'models': { + 'T6': schema_defs.tanf.t6, + } + }, + 'S': { + 'section': DataFile.Section.STRATUM_DATA, + 'models': { + 'T7': schema_defs.tanf.t7, + } + } + }, + 'SSP': { + 'A': { + 'section': DataFile.Section.SSP_ACTIVE_CASE_DATA, + 'models': { + 'M1': schema_defs.ssp.m1, + 'M2': schema_defs.ssp.m2, + 'M3': schema_defs.ssp.m3, + } + }, + 'C': { + 'section': DataFile.Section.SSP_CLOSED_CASE_DATA, + 'models': { + 'M4': schema_defs.ssp.m4, + 'M5': schema_defs.ssp.m5, + } + }, + 'G': { + 'section': DataFile.Section.SSP_AGGREGATE_DATA, + 'models': { + 'M6': schema_defs.ssp.m6, + } + }, + 'S': { + 'section': DataFile.Section.SSP_STRATUM_DATA, + 'models': { + 'M7': schema_defs.ssp.m7, + } + } + }, + 'Tribal TAN': { + 'A': { + 'section': DataFile.Section.TRIBAL_ACTIVE_CASE_DATA, + 'models': { + 'T1': schema_defs.tribal_tanf.t1, + 'T2': schema_defs.tribal_tanf.t2, + 'T3': schema_defs.tribal_tanf.t3, + } + }, + 'C': { + 'section': DataFile.Section.TRIBAL_CLOSED_CASE_DATA, + 'models': { + 'T4': schema_defs.tribal_tanf.t4, + 'T5': schema_defs.tribal_tanf.t5, + } + }, + 'G': { + 'section': DataFile.Section.TRIBAL_AGGREGATE_DATA, + 'models': { + 'T6': schema_defs.tribal_tanf.t6, + } + }, + 'S': { + 'section': DataFile.Section.TRIBAL_STRATUM_DATA, + 'models': { + 'T7': schema_defs.tribal_tanf.t7, + } + }, + }, +} class Command(BaseCommand): """Command class.""" - help = "Populate regions, states, territories, and tribes." + help = "Populate datafiles, records, summaries, and errors for all STTs." 
def handle(self, *args, **options): - """Populate the various regions, states, territories, and tribes.""" - _populate_regions() - - stt_map = [ - ("states.csv", STT.EntityType.STATE), - ("territories.csv", STT.EntityType.TERRITORY), - ("tribes.csv", STT.EntityType.TRIBE) - ] - - for datafile, entity in stt_map: - _load_csv(datafile, entity) - - logger.info("STT import executed by Admin at %s", timezone.now()) + """Populate datafiles, records, summaries, and errors for all STTs.""" + + + # from file__prog_type -> get me the prog type for referencing the schema_def + """ def get_program_models(str_prog, str_section): + def get_program_model(str_prog, str_section, str_model): + def get_section_reference(str_prog, str_section): + def get_text_from_df(df): + def get_schema(line, section, program_type): """ + + + + + + """ parsing_file = ParsingFileFactory.build(year=2021, + quarter='Q2', + original_filename='t3_file.txt', + file__name='t3_file.txt', + section=DataFile.Section.CLOSED_CASE_DATA, + #**how do we make this** + file__data=b'', + ) """ + #parsing_file.save() + + x = get_text_from_df(parsing_file) + print(x) + + # t1 = schema_options.get('TAN').get('A').get('models').get('T1') + T1_fields = t1.schemas[0].fields + [print(i) for i in T1_fields] + + t1_line = '' + for field in T1_fields: + field_len = field.endIndex - field.startIndex + if field.type is "number": + t1_line += "0" * field_len + elif field.type is "string": + t1_line += "a" * field_len + else: + raise ValueError("Field type not recognized") + print(t1_line) + + # TODO: allowed values per field, try manual and if commonalities exist, create a function to generate + # TODO: can we utilize validators somehow to get a validValues(schemaMgr.fields[])? + + def validValues(): + '''Takes in a field and returns a list of valid values.''' + #niave implementation will just zero or 'a' fill the field + #brute implementation will use faker and we'll run it through the validators + # elegant implementation will use the validators to generate the valid values + pass + + + ''' + # utilize parsers/schema_defs/utils.py as a reference for getting the lists of STT/years/quarters/sections + for i in STT[]: + for y in years[] + for q in quarters[] + for s in sections[] + # now we're generating a binary file? 
b/c we'll need the DF FK reference + # TODO: + for r in rowSchemas[] + for f in rowSchemas.Fields[] + for v in f.validValues[] + (look at seed_records.py for how to generate random data) + # write a temp file + # ok now parse this thing (but turn off emails) + # generate a DFS + # dump db in full to a seed file + ''' + From 6d23208c87f0df509ff094836b0652e22eb2f21f Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Wed, 7 Aug 2024 13:12:27 -0400 Subject: [PATCH 03/15] Pushing latest prior to lunch --- .../parsers/management/commands/seed_db.py | 241 +++++++++++++++--- 1 file changed, 210 insertions(+), 31 deletions(-) diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py index 152b5705f1..5df6304690 100644 --- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -1,16 +1,19 @@ """`seed_db` command.""" -import json +import faker +import random import logging from pathlib import Path from django.core.management import BaseCommand from django.utils import timezone -from tdpservice.parsers.schema_defs.utils import get_schema_options, get_text_from_df # maybe need other utilities + from tdpservice.parsers.test.factories import ParsingFileFactory, TanfT1Factory # maybe need other factories from tdpservice.parsers.schema_defs.header import header -from tdpservice.parsers.schema_defs.tanf import t1, t2, t3, t4, t5, t6, t7 +from tdpservice.parsers.schema_defs.trailer import trailer +from tdpservice.parsers.schema_defs.utils import * # maybe need other utilities +# all models should be referenced by using the utils.py get_schema_options wrappers from tdpservice.parsers import schema_defs from tdpservice.data_files.models import DataFile logger = logging.getLogger(__name__) @@ -38,6 +41,7 @@ class TanfT1Factory(factory.django.DjangoModelFactory): Create django command such that tool can be pointed at deployed environments ''' +''' schema_options = { 'TAN': { 'A': { @@ -127,6 +131,186 @@ class TanfT1Factory(factory.django.DjangoModelFactory): }, }, } +''' + +# t1 fields +''' +RecordType(0-2) +RPT_MONTH_YEAR(2-8) +CASE_NUMBER(8-19) +COUNTY_FIPS_CODE(19-22) +STRATUM(22-24) +ZIP_CODE(24-29) +FUNDING_STREAM(29-30) +DISPOSITION(30-31) +NEW_APPLICANT(31-32) +NBR_FAMILY_MEMBERS(32-34) +FAMILY_TYPE(34-35) +RECEIVES_SUB_HOUSING(35-36) +RECEIVES_MED_ASSISTANCE(36-37) +RECEIVES_FOOD_STAMPS(37-38) +AMT_FOOD_STAMP_ASSISTANCE(38-42) +RECEIVES_SUB_CC(42-43) +AMT_SUB_CC(43-47) +CHILD_SUPPORT_AMT(47-51) +FAMILY_CASH_RESOURCES(51-55) +CASH_AMOUNT(55-59) +NBR_MONTHS(59-62) +CC_AMOUNT(62-66) +CHILDREN_COVERED(66-68) +CC_NBR_MONTHS(68-71) +TRANSP_AMOUNT(71-75) +TRANSP_NBR_MONTHS(75-78) +TRANSITION_SERVICES_AMOUNT(78-82) +TRANSITION_NBR_MONTHS(82-85) +OTHER_AMOUNT(85-89) +OTHER_NBR_MONTHS(89-92) +SANC_REDUCTION_AMT(92-96) +WORK_REQ_SANCTION(96-97) +FAMILY_SANC_ADULT(97-98) +SANC_TEEN_PARENT(98-99) +NON_COOPERATION_CSE(99-100) +FAILURE_TO_COMPLY(100-101) +OTHER_SANCTION(101-102) +RECOUPMENT_PRIOR_OVRPMT(102-106) +OTHER_TOTAL_REDUCTIONS(106-110) +FAMILY_CAP(110-111) +REDUCTIONS_ON_RECEIPTS(111-112) +OTHER_NON_SANCTION(112-113) +WAIVER_EVAL_CONTROL_GRPS(113-114) +FAMILY_EXEMPT_TIME_LIMITS(114-116) +FAMILY_NEW_CHILD(116-117) +BLANK(117-156) +''' +# https://faker.readthedocs.io/en/stable/providers/baseprovider.html#faker.providers.BaseProvider +class FieldFaker(faker.providers.BaseProvider): + def record_type(self): + return self.random_element(elements=('00', 
'01', '02')) + + def rpt_month_year(self): + return self.date_time_this_month(before_now=True, after_now=False).strftime('%m%Y') + + def case_number(self): + return self.random_int(min=1000000000, max=9999999999) + + def county_fips_code(self): + return self.random_int(min=0, max=999) + + def stratum(self): + return self.random_int(min=0, max=99) + + def zip_code(self): + return self.random_int(min=0, max=99999) + + def funding_stream(self): + return self.random_element(elements=('A', 'B', 'C')) + + def disposition(self): + return self.random_element(elements=('A', 'B', 'C')) + + def new_applicant(self): + return self.random_element(elements=('Y', 'N')) + + def nbr_family_members(self): + return self.random_int(min=0, max=99) + + def family_type(self): + return self.random_element(elements=('A', 'B', 'C')) + + def receives_sub_housing(self): + return self.random_element(elements=('Y', 'N')) + +def build_datafile(year, quarter, original_filename, file_name, section, file_data): + """Build a datafile.""" + return ParsingFileFactory.build( + year=year, + quarter=quarter, + original_filename=original_filename, + file__name=file_name, + section=section, + file__data=file_data, + ) + +def validValues(field): + '''Takes in a field and returns a line of valid values.''' + #niave implementation will just zero or 'a' fill the field + '''field_len = field.endIndex - field.startIndex + if field.type is "number": + line += "0" * field_len + elif field.type is "string": + line += "A" * field_len + else: + raise ValueError("Field type not recognized") + return line''' + #brute implementation will use faker and we'll run it through the validators + #pass + + # elegant implementation will use the validators to generate the valid values + + field_len = field.endIndex - field.startIndex + # check list of validators + # treat header/trailer special, it actually checks for string values + # check for zero or pad fill + # transformField might be tricky + if field.name == 'SSN': + # only used by recordtypes 2,3,5 + # TODO: reverse the TransformField logic to 'encrypt' a random number + field_format = '?' * field_len + else: + field_format = '#' * field_len + return faker.bothify(text=field_format) + + +def make_line(schemaMgr): + '''Takes in a schema manager and returns a line of data.''' + line = '' + + #TODO: check for header/trailer + if schemaMgr.record_type == 'HEADER' or schemaMgr.record_type == 'TRAILER': + for field in schemaMgr.fields: + line += validValues(field) + + else: + for field in schemaMgr.fields: + line += validValues(field) + return line + +def make_files(stt, year, quarter): + '''Given a STT, parameterize calls to build_datafile and make_line.''' + """Psuedo code""" + sections = stt.filenames + + for section in sections: + + # based on section, get models from schema_options + + models_in_section = get_program_models(stt.program, section) + temp_file = '' + #TODO: make header line + temp_file += make_line(header) + + # iterate over models and generate lines + for model in models_in_section: + if section in ['Active Case Data', 'Closed Case Data']: + # obviously, this first approach can't prevent duplicates (unlikely), + # nor can it ensure that the case data is internally consistent + # (e.g. 
a case with a child but no adult) + + # we should generate hundreds, thousands, tens of thousands of records + for i in range(random.randint(5, 9999)): + temp_file += make_line(model) + elif section in ['Aggregate Data', 'Stratum Data']: + # we should generate a smaller count of lines...maybe leave this as a TODO + # shouldn't this be based on the active/closed case data? + pass + + # make trailer line + temp_file += make_line(trailer) + + # build datafile + + # return dictionary of binary blobs + # return {'Active Case Data': b'...', 'Closed Case Data': b'...', 'Aggregate Data': b'...', 'Stratum Data': b'...'} class Command(BaseCommand): """Command class.""" @@ -144,9 +328,6 @@ def get_section_reference(str_prog, str_section): def get_text_from_df(df): def get_schema(line, section, program_type): """ - - - """ parsing_file = ParsingFileFactory.build(year=2021, quarter='Q2', @@ -163,45 +344,43 @@ def get_schema(line, section, program_type): """ # t1 = schema_options.get('TAN').get('A').get('models').get('T1') T1_fields = t1.schemas[0].fields + [print(i) for i in T1_fields] t1_line = '' for field in T1_fields: field_len = field.endIndex - field.startIndex - if field.type is "number": - t1_line += "0" * field_len - elif field.type is "string": - t1_line += "a" * field_len + # check list of validators + # treat header/trailer special, it actually checks for string values + # check for zero or pad fill + # transformField might be tricky + if field.name == 'SSN': + # only used by recordtypes 2,3,5 + # TODO: reverse the TransformField logic to 'encrypt' a random number + field_format = '?' * field_len else: - raise ValueError("Field type not recognized") + field_format = '#' * field_len + t1_line += faker.bothify(text=field_format) print(t1_line) # TODO: allowed values per field, try manual and if commonalities exist, create a function to generate # TODO: can we utilize validators somehow to get a validValues(schemaMgr.fields[])? - def validValues(): - '''Takes in a field and returns a list of valid values.''' - #niave implementation will just zero or 'a' fill the field - #brute implementation will use faker and we'll run it through the validators - # elegant implementation will use the validators to generate the valid values - pass - - ''' # utilize parsers/schema_defs/utils.py as a reference for getting the lists of STT/years/quarters/sections for i in STT[]: - for y in years[] - for q in quarters[] - for s in sections[] - # now we're generating a binary file? b/c we'll need the DF FK reference - # TODO: - for r in rowSchemas[] - for f in rowSchemas.Fields[] - for v in f.validValues[] - (look at seed_records.py for how to generate random data) - # write a temp file - # ok now parse this thing (but turn off emails) - # generate a DFS + for y in years[] # 1998 - 2099 + for q in quarters[] # 1-4 + for p in programs[] # TAN, SSP, Tribal TAN + for s in sections[] # [x for x['section'] in schema_options[p].keys()] + #need the DF FK reference? 
+ for m in models[] + for f in rowSchemas.Fields[] + for v in f.validValues[] + (look at seed_records.py for how to generate random data) + # write a temp file + # ok now parse this thing (but turn off emails) + # generate a DFS # dump db in full to a seed file ''' From fbdaecef7aa41b595006449340ed934d8fc820fc Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Wed, 7 Aug 2024 13:14:29 -0400 Subject: [PATCH 04/15] ignore for research/grep results --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2fee3eca0a..2974197b1d 100644 --- a/.gitignore +++ b/.gitignore @@ -108,4 +108,5 @@ tfapply cypress.env.json # Patches -*.patch \ No newline at end of file +*.patch +tdrs-backend/tdpservice/parsers/management/commands/*.txt From c3283c3bb5070686e12d8a6c18d904f3da936e9f Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Wed, 7 Aug 2024 17:13:43 -0400 Subject: [PATCH 05/15] End of day commit, successful run w/o syntax issues. Need to do something with files --- .../parsers/management/commands/seed_db.py | 170 ++++++------------ 1 file changed, 59 insertions(+), 111 deletions(-) diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py index 5df6304690..c2dd163bc2 100644 --- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -1,13 +1,13 @@ """`seed_db` command.""" -import faker +import sys import random import logging from pathlib import Path from django.core.management import BaseCommand -from django.utils import timezone - +from faker import Faker +fake = Faker() from tdpservice.parsers.test.factories import ParsingFileFactory, TanfT1Factory # maybe need other factories from tdpservice.parsers.schema_defs.header import header @@ -16,6 +16,8 @@ # all models should be referenced by using the utils.py get_schema_options wrappers from tdpservice.parsers import schema_defs from tdpservice.data_files.models import DataFile +from tdpservice.stts.models import STT + logger = logging.getLogger(__name__) ''' @@ -41,97 +43,7 @@ class TanfT1Factory(factory.django.DjangoModelFactory): Create django command such that tool can be pointed at deployed environments ''' -''' -schema_options = { - 'TAN': { - 'A': { - 'section': DataFile.Section.ACTIVE_CASE_DATA, - 'models': { - 'T1': schema_defs.tanf.t1, - 'T2': schema_defs.tanf.t2, - 'T3': schema_defs.tanf.t3, - } - }, - 'C': { - 'section': DataFile.Section.CLOSED_CASE_DATA, - 'models': { - 'T4': schema_defs.tanf.t4, - 'T5': schema_defs.tanf.t5, - } - }, - 'G': { - 'section': DataFile.Section.AGGREGATE_DATA, - 'models': { - 'T6': schema_defs.tanf.t6, - } - }, - 'S': { - 'section': DataFile.Section.STRATUM_DATA, - 'models': { - 'T7': schema_defs.tanf.t7, - } - } - }, - 'SSP': { - 'A': { - 'section': DataFile.Section.SSP_ACTIVE_CASE_DATA, - 'models': { - 'M1': schema_defs.ssp.m1, - 'M2': schema_defs.ssp.m2, - 'M3': schema_defs.ssp.m3, - } - }, - 'C': { - 'section': DataFile.Section.SSP_CLOSED_CASE_DATA, - 'models': { - 'M4': schema_defs.ssp.m4, - 'M5': schema_defs.ssp.m5, - } - }, - 'G': { - 'section': DataFile.Section.SSP_AGGREGATE_DATA, - 'models': { - 'M6': schema_defs.ssp.m6, - } - }, - 'S': { - 'section': DataFile.Section.SSP_STRATUM_DATA, - 'models': { - 'M7': schema_defs.ssp.m7, - } - } - }, - 'Tribal TAN': { - 'A': { - 'section': DataFile.Section.TRIBAL_ACTIVE_CASE_DATA, - 'models': { - 'T1': schema_defs.tribal_tanf.t1, - 'T2': 
schema_defs.tribal_tanf.t2, - 'T3': schema_defs.tribal_tanf.t3, - } - }, - 'C': { - 'section': DataFile.Section.TRIBAL_CLOSED_CASE_DATA, - 'models': { - 'T4': schema_defs.tribal_tanf.t4, - 'T5': schema_defs.tribal_tanf.t5, - } - }, - 'G': { - 'section': DataFile.Section.TRIBAL_AGGREGATE_DATA, - 'models': { - 'T6': schema_defs.tribal_tanf.t6, - } - }, - 'S': { - 'section': DataFile.Section.TRIBAL_STRATUM_DATA, - 'models': { - 'T7': schema_defs.tribal_tanf.t7, - } - }, - }, -} -''' + # t1 fields ''' @@ -183,7 +95,7 @@ class TanfT1Factory(factory.django.DjangoModelFactory): BLANK(117-156) ''' # https://faker.readthedocs.io/en/stable/providers/baseprovider.html#faker.providers.BaseProvider -class FieldFaker(faker.providers.BaseProvider): +""" class FieldFaker(faker.providers.BaseProvider): def record_type(self): return self.random_element(elements=('00', '01', '02')) @@ -219,7 +131,7 @@ def family_type(self): def receives_sub_housing(self): return self.random_element(elements=('Y', 'N')) - + """ def build_datafile(year, quarter, original_filename, file_name, section, file_data): """Build a datafile.""" return ParsingFileFactory.build( @@ -258,7 +170,7 @@ def validValues(field): field_format = '?' * field_len else: field_format = '#' * field_len - return faker.bothify(text=field_format) + return fake.bothify(text=field_format) def make_line(schemaMgr): @@ -273,18 +185,35 @@ def make_line(schemaMgr): else: for field in schemaMgr.fields: line += validValues(field) + return line +from tdpservice.data_files.models import DataFile + def make_files(stt, year, quarter): '''Given a STT, parameterize calls to build_datafile and make_line.''' - """Psuedo code""" - sections = stt.filenames + sections = stt.filenames.keys() + # {'Active Case Data': 'ADS.E2J.FTP1.TS05', 'Closed Case Data': 'ADS.E2J.FTP2.TS05', 'Aggregate Data': 'ADS.E2J.FTP3.TS05'}" + # "{'Active Case Data': 'ADS.E2J.NDM1.TS24', 'Closed Case Data': 'ADS.E2J.NDM2.TS24', 'Aggregate Data': 'ADS.E2J.NDM3.TS24', + # 'Stratum Data': 'ADS.E2J.NDM4.TS24', 'SSP Active Case Data': 'ADS.E2J.NDM1.MS24', 'SSP Closed Case Data': 'ADS.E2J.NDM2.MS24', 'SSP Aggregate Data': + # 'ADS.E2J.NDM3.MS24', 'SSP Stratum Data': 'ADS.E2J.NDM4.MS24'}" + files_for_quarter = {} - for section in sections: + for s in sections: # based on section, get models from schema_options + #if stt.ssp is True: + # we can match section to the schema_options + # elif stt.state is not None: + # we can declare prog_type to Tribal + + # given a leaf of 'section', get 'TAN' or 'SSP' or 'Tribal TAN' from schema_options - models_in_section = get_program_models(stt.program, section) + # match schema_options[_]['section'] to our section + text_dict = get_schema_options("", section=s, query='text') + prog_type = text_dict['program_type'] + section = text_dict['section'] + models_in_section = get_program_models(prog_type, section) temp_file = '' #TODO: make header line temp_file += make_line(header) @@ -308,9 +237,18 @@ def make_files(stt, year, quarter): temp_file += make_line(trailer) # build datafile - - # return dictionary of binary blobs - # return {'Active Case Data': b'...', 'Closed Case Data': b'...', 'Aggregate Data': b'...', 'Stratum Data': b'...'} + # TODO convert temp_file to bytes literal + datafile = build_datafile( + year=year, + quarter=quarter, + original_filename=f'{section}.txt', #this is awful + file_name=f'{section}.txt', #also bad + section=section, + file_data=temp_file, + ) + files_for_quarter[section] = datafile + + return files_for_quarter class Command(BaseCommand): 
"""Command class.""" @@ -339,15 +277,15 @@ def get_schema(line, section, program_type): """ ) """ #parsing_file.save() - x = get_text_from_df(parsing_file) - print(x) + #x = get_text_from_df(parsing_file) + #print(x) # t1 = schema_options.get('TAN').get('A').get('models').get('T1') - T1_fields = t1.schemas[0].fields + #T1_fields = t1.schemas[0].fields - [print(i) for i in T1_fields] + #[print(i) for i in T1_fields] - t1_line = '' + """ t1_line = '' for field in T1_fields: field_len = field.endIndex - field.startIndex # check list of validators @@ -361,15 +299,25 @@ def get_schema(line, section, program_type): """ else: field_format = '#' * field_len t1_line += faker.bothify(text=field_format) - print(t1_line) + print(t1_line) """ # TODO: allowed values per field, try manual and if commonalities exist, create a function to generate # TODO: can we utilize validators somehow to get a validValues(schemaMgr.fields[])? + stts = STT.objects.all() + for stt in stts: + # for y in years[2024] + for q in [1,2,3,4]: + placeholder = make_files(stt, 2024, q) + # save to db or upload endpoint + # parse the file? or use DFS factory? + # dump db in full + + ''' # utilize parsers/schema_defs/utils.py as a reference for getting the lists of STT/years/quarters/sections for i in STT[]: - for y in years[] # 1998 - 2099 + for y in years[] # 2020-2022 for q in quarters[] # 1-4 for p in programs[] # TAN, SSP, Tribal TAN for s in sections[] # [x for x['section'] in schema_options[p].keys()] From 03bab8e48100d1a2153c285efa517924a5433bc5 Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Fri, 23 Aug 2024 15:22:57 -0400 Subject: [PATCH 06/15] just the useful changes, sorry for the mess --- Taskfile.yml | 2 +- .../parsers/management/commands/seed_db.py | 138 +++++++++++++----- 2 files changed, 99 insertions(+), 41 deletions(-) diff --git a/Taskfile.yml b/Taskfile.yml index c8d13b79d4..a6ea63e7a8 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -66,7 +66,7 @@ tasks: CMD: '{{.CMD}}' cmds: - docker-compose -f docker-compose.yml up -d - - docker-compose -f docker-compose.yml exec web sh -c "python ./manage.py {{.CMD}}" + - docker-compose -f docker-compose.yml exec web sh -c "python manage.py populate_stts; python ./manage.py seed_db" backend-pytest: desc: 'Run pytest in the backend container E.g: task backend-pytest PYTEST_ARGS="tdpservice/test/ -s -vv"' diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py index c2dd163bc2..54d0c0d733 100644 --- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -8,7 +8,7 @@ from django.core.management import BaseCommand from faker import Faker fake = Faker() - +from django.core.files.base import ContentFile from tdpservice.parsers.test.factories import ParsingFileFactory, TanfT1Factory # maybe need other factories from tdpservice.parsers.schema_defs.header import header from tdpservice.parsers.schema_defs.trailer import trailer @@ -16,7 +16,11 @@ # all models should be referenced by using the utils.py get_schema_options wrappers from tdpservice.parsers import schema_defs from tdpservice.data_files.models import DataFile +from tdpservice.scheduling import parser_task from tdpservice.stts.models import STT +#import the system_user +from tdpservice.users.models import User + logger = logging.getLogger(__name__) @@ -132,18 +136,30 @@ def family_type(self): def receives_sub_housing(self): return 
self.random_element(elements=('Y', 'N')) """ -def build_datafile(year, quarter, original_filename, file_name, section, file_data): + + +def build_datafile(stt, year, quarter, original_filename, file_name, section, file_data): """Build a datafile.""" - return ParsingFileFactory.build( - year=year, - quarter=quarter, - original_filename=original_filename, - file__name=file_name, - section=section, - file__data=file_data, - ) - -def validValues(field): + + try: + d = DataFile.objects.create( + user=User.objects.get_or_create(username='system')[0], + stt=stt, + year=year, + quarter=quarter, + original_filename=original_filename, + section=section, + version=random.randint(1, 1993415), + ) + + + d.file.save(file_name, ContentFile(file_data)) + except django.db.utils.IntegrityError as e: + pass + return d + + +def validValues(schemaMgr, field): '''Takes in a field and returns a line of valid values.''' #niave implementation will just zero or 'a' fill the field '''field_len = field.endIndex - field.startIndex @@ -164,6 +180,9 @@ def validValues(field): # treat header/trailer special, it actually checks for string values # check for zero or pad fill # transformField might be tricky + if field.name == 'RecordType': + print(schemaMgr.record_type) + return schemaMgr.record_type if field.name == 'SSN': # only used by recordtypes 2,3,5 # TODO: reverse the TransformField logic to 'encrypt' a random number @@ -172,21 +191,25 @@ def validValues(field): field_format = '#' * field_len return fake.bothify(text=field_format) - -def make_line(schemaMgr): +from tdpservice.parsers.row_schema import RowSchema +def make_line(schemaMgr, section): '''Takes in a schema manager and returns a line of data.''' line = '' #TODO: check for header/trailer - if schemaMgr.record_type == 'HEADER' or schemaMgr.record_type == 'TRAILER': - for field in schemaMgr.fields: - line += validValues(field) - + if type(schemaMgr) is RowSchema: + if schemaMgr.record_type == 'HEADER': + line += 'HEADER20204{}01 TAN1 D'.format(section) + elif schemaMgr.record_type == 'TRAILER': + line += ' ' * 23 else: - for field in schemaMgr.fields: - line += validValues(field) + row_schema = schemaMgr.schemas[0] + for field in row_schema.fields: + print(field) + line += validValues(row_schema, field) - return line + print(line) + return line + '\n' from tdpservice.data_files.models import DataFile @@ -211,41 +234,48 @@ def make_files(stt, year, quarter): # match schema_options[_]['section'] to our section text_dict = get_schema_options("", section=s, query='text') - prog_type = text_dict['program_type'] - section = text_dict['section'] + prog_type = text_dict['program_type'] # TAN + section = text_dict['section'] # A models_in_section = get_program_models(prog_type, section) temp_file = '' #TODO: make header line - temp_file += make_line(header) + print("making file for section: ", section) + temp_file += make_line(header,section) # iterate over models and generate lines - for model in models_in_section: - if section in ['Active Case Data', 'Closed Case Data']: + for _, model in models_in_section.items(): + print(section) + if s in ['Active Case Data', 'Closed Case Data','Aggregate Data', 'Stratum Data']: + print('yes, secction in all of them') # obviously, this first approach can't prevent duplicates (unlikely), # nor can it ensure that the case data is internally consistent # (e.g. 
a case with a child but no adult) # we should generate hundreds, thousands, tens of thousands of records - for i in range(random.randint(5, 9999)): - temp_file += make_line(model) - elif section in ['Aggregate Data', 'Stratum Data']: - # we should generate a smaller count of lines...maybe leave this as a TODO - # shouldn't this be based on the active/closed case data? - pass + length = range(random.randint(5, 9)) + print(length) + for i in length: + temp_file += make_line(model,section) + #elif section in ['Aggregate Data', 'Stratum Data']: + # # we should generate a smaller count of lines...maybe leave this as a TODO + # # shouldn't this be based on the active/closed case data? + # pass # make trailer line - temp_file += make_line(trailer) + temp_file += make_line(trailer,section) # build datafile # TODO convert temp_file to bytes literal datafile = build_datafile( + stt=stt, year=year, quarter=quarter, original_filename=f'{section}.txt', #this is awful file_name=f'{section}.txt', #also bad section=section, - file_data=temp_file, + file_data=bytes(temp_file, 'utf-8'), ) + datafile.save() files_for_quarter[section] = datafile return files_for_quarter @@ -303,17 +333,45 @@ def get_schema(line, section, program_type): """ # TODO: allowed values per field, try manual and if commonalities exist, create a function to generate # TODO: can we utilize validators somehow to get a validValues(schemaMgr.fields[])? - - stts = STT.objects.all() - for stt in stts: - # for y in years[2024] - for q in [1,2,3,4]: - placeholder = make_files(stt, 2024, q) + from tdpservice.parsers.models import DataFileSummary + from tdpservice.parsers import parse + from tdpservice.parsers.test.factories import DataFileSummaryFactory + files_for_qtr = make_files(STT.objects.get(id=1), 2024, 1) + print(files_for_qtr) # file has no id, and no payload/content + for f in files_for_qtr.keys(): + df = files_for_qtr[f] + #dfs = DataFileSummary.objects.create(datafile=df, status=DataFileSummary.Status.PENDING) #maybe i need df.file_data? + dfs = DataFileSummaryFactory.build() + dfs.datafile = df + parse.parse_datafile(df, dfs) + + #files_for_qtr[0].save() + + # HALT + # stts = STT.objects.all() + # for stt in stts: + # print(stt) + # # for y in years[2024] + # for q in [1,2,3,4]: + # files_for_qtr = make_files(stt, 2024, q) + # print(files_for_qtr) # save to db or upload endpoint + + #for f in files_for_qtr: + + # parser_task.parse(f.id, should_send_submission_email=False) # parse the file? or use DFS factory? 
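        # (re: the DFS factory question above -- DataFileSummaryFactory.build()
        # only constructs an unsaved, in-memory summary; use .create(), or call
        # dfs.save(), if the summary row must actually exist in the db around parsing)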
# dump db in full + '''TODO: try out parameterization like so: + T2Factory.create( + RPT_MONTH_YEAR=202010, + CASE_NUMBER='123', + FAMILY_AFFILIATION=1, + ), + ''' + ''' # utilize parsers/schema_defs/utils.py as a reference for getting the lists of STT/years/quarters/sections for i in STT[]: From 8636858b702fb503b5a24448f670150b96301331 Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Wed, 28 Aug 2024 11:57:38 -0400 Subject: [PATCH 07/15] latest work for RPY to get around preparser blocker --- .../parsers/management/commands/seed_db.py | 43 +++++++++---------- tdrs-backend/tdpservice/parsers/parse.py | 6 +++ tdrs-backend/tdpservice/parsers/row_schema.py | 2 + 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py index 54d0c0d733..8cb98c4d92 100644 --- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -181,12 +181,18 @@ def validValues(schemaMgr, field): # check for zero or pad fill # transformField might be tricky if field.name == 'RecordType': - print(schemaMgr.record_type) return schemaMgr.record_type if field.name == 'SSN': # only used by recordtypes 2,3,5 # TODO: reverse the TransformField logic to 'encrypt' a random number field_format = '?' * field_len + elif field.name in ('RPT_MONTH_YEAR', 'CALENDAR_QUARTER'): + + lower = 1 # TODO: get quarter and use acceptable range for month + upper = 3 + # need to generate a two-digit month with leading zero using format() and randit() + month = '{}'.format(random.randint(lower, upper)).zfill(2) + field_format = '2024' + str(month) # fake.date_time_this_month(before_now=True, after_now=False).strftime('%m%Y') else: field_format = '#' * field_len return fake.bothify(text=field_format) @@ -196,20 +202,16 @@ def make_line(schemaMgr, section): '''Takes in a schema manager and returns a line of data.''' line = '' - #TODO: check for header/trailer if type(schemaMgr) is RowSchema: if schemaMgr.record_type == 'HEADER': - line += 'HEADER20204{}01 TAN1 D'.format(section) + line += 'HEADER20241{}01 TAN1 D'.format(section) elif schemaMgr.record_type == 'TRAILER': - line += ' ' * 23 + line += 'TRAILER' + '1' * 16 else: row_schema = schemaMgr.schemas[0] for field in row_schema.fields: - print(field) line += validValues(row_schema, field) - - print(line) - return line + '\n' + return line + ' \n' from tdpservice.data_files.models import DataFile @@ -238,23 +240,19 @@ def make_files(stt, year, quarter): section = text_dict['section'] # A models_in_section = get_program_models(prog_type, section) temp_file = '' - #TODO: make header line + print("making file for section: ", section) temp_file += make_line(header,section) # iterate over models and generate lines for _, model in models_in_section.items(): - print(section) if s in ['Active Case Data', 'Closed Case Data','Aggregate Data', 'Stratum Data']: - print('yes, secction in all of them') # obviously, this first approach can't prevent duplicates (unlikely), # nor can it ensure that the case data is internally consistent # (e.g. 
a case with a child but no adult) # we should generate hundreds, thousands, tens of thousands of records - length = range(random.randint(5, 9)) - print(length) - for i in length: + for i in range(random.randint(5, 9)): temp_file += make_line(model,section) #elif section in ['Aggregate Data', 'Stratum Data']: # # we should generate a smaller count of lines...maybe leave this as a TODO @@ -263,17 +261,16 @@ def make_files(stt, year, quarter): # make trailer line temp_file += make_line(trailer,section) - - # build datafile - # TODO convert temp_file to bytes literal + print(temp_file) + datafile = build_datafile( stt=stt, year=year, - quarter=quarter, - original_filename=f'{section}.txt', #this is awful - file_name=f'{section}.txt', #also bad - section=section, - file_data=bytes(temp_file, 'utf-8'), + quarter=f"Q{quarter}", + original_filename=f'{stt}-{section}-{year}Q{quarter}.txt', + file_name=f'{stt}-{section}-{year}Q{quarter}', + section=s, + file_data=bytes(temp_file.rstrip(), 'utf-8'), ) datafile.save() files_for_quarter[section] = datafile @@ -336,7 +333,7 @@ def get_schema(line, section, program_type): """ from tdpservice.parsers.models import DataFileSummary from tdpservice.parsers import parse from tdpservice.parsers.test.factories import DataFileSummaryFactory - files_for_qtr = make_files(STT.objects.get(id=1), 2024, 1) + files_for_qtr = make_files(STT.objects.get(id=1), 2024, 2) print(files_for_qtr) # file has no id, and no payload/content for f in files_for_qtr.keys(): df = files_for_qtr[f] diff --git a/tdrs-backend/tdpservice/parsers/parse.py b/tdrs-backend/tdpservice/parsers/parse.py index 6b1f1a338d..b4f5d2dfe7 100644 --- a/tdrs-backend/tdpservice/parsers/parse.py +++ b/tdrs-backend/tdpservice/parsers/parse.py @@ -33,6 +33,7 @@ def parse_datafile(datafile, dfs): logger.info(f"Preparser Error: {len(header_errors)} header errors encountered.") errors['header'] = header_errors bulk_create_errors({1: header_errors}, 1, flush=True) + print("ERRORS: ", errors) return errors field_values = schema_defs.header.get_field_values_by_names(header_line, @@ -466,8 +467,13 @@ def parse_datafile_lines(datafile, dfs, program_type, section, is_encrypted, cas def manager_parse_line(line, schema_manager, generate_error, datafile, is_encrypted=False): """Parse and validate a datafile line using SchemaManager.""" + if type(schema_manager) is row_schema.SchemaManager: schema_manager.datafile = datafile + elif type(schema_manager) is list: + print(line) + print("sMgr(" +str(type(schema_manager))+"): " +str(schema_manager)) + print("datafile: " +str(type(datafile))) try: schema_manager.update_encrypted_fields(is_encrypted) records = schema_manager.parse_and_validate(line, generate_error) diff --git a/tdrs-backend/tdpservice/parsers/row_schema.py b/tdrs-backend/tdpservice/parsers/row_schema.py index 7dd01556fe..8b27fd69e0 100644 --- a/tdrs-backend/tdpservice/parsers/row_schema.py +++ b/tdrs-backend/tdpservice/parsers/row_schema.py @@ -66,6 +66,8 @@ def parse_and_validate(self, line, generate_error): if is_quiet_preparser_errors: return None, True, [] logger.info(f"{len(preparsing_errors)} preparser error(s) encountered.") + logger.info(line) + logger.info(preparsing_errors) return None, False, preparsing_errors # parse line to model From cb204ab52849bd5ed2b0ec909a11888bb1b61800 Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Fri, 30 Aug 2024 14:03:46 -0400 Subject: [PATCH 08/15] Latest changes, removing comments/prints mostly. 
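
Threads the submission year through make_line()/validValues() so that
RPT_MONTH_YEAR and CALENDAR_QUARTER land in the submitted year (months
now drawn from the full 1-12 range), iterates every schema in a manager
to cover multi-record rows like T6, and adds make_seed() to dump the db
via the backup_db command. Field values themselves still come from
Faker's bothify(), which swaps each '#' in a format string for a random
digit and each '?' for a random letter, so a fixed-width field is filled
by sizing the format string to the field's slice. A minimal sketch of
that idea (fill_field and its arguments are illustrative, not part of
this patch):

    from faker import Faker

    fake = Faker()

    def fill_field(start_index, end_index, numeric=True):
        # one '#' (random digit) or '?' (random letter) per character in the slice
        width = end_index - start_index
        return fake.bothify(text=('#' if numeric else '?') * width)

    print(fill_field(8, 19))  # an 11-character, CASE_NUMBER-sized slot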
--- .../parsers/management/commands/seed_db.py | 164 +++++------------- 1 file changed, 48 insertions(+), 116 deletions(-) diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py index 8cb98c4d92..7cb49ae94d 100644 --- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -1,26 +1,25 @@ """`seed_db` command.""" -import sys + import random import logging -from pathlib import Path from django.core.management import BaseCommand from faker import Faker fake = Faker() from django.core.files.base import ContentFile -from tdpservice.parsers.test.factories import ParsingFileFactory, TanfT1Factory # maybe need other factories from tdpservice.parsers.schema_defs.header import header from tdpservice.parsers.schema_defs.trailer import trailer from tdpservice.parsers.schema_defs.utils import * # maybe need other utilities # all models should be referenced by using the utils.py get_schema_options wrappers -from tdpservice.parsers import schema_defs + from tdpservice.data_files.models import DataFile -from tdpservice.scheduling import parser_task +from tdpservice.parsers import parse +from tdpservice.parsers.test.factories import DataFileSummaryFactory +from tdpservice.scheduling import parser_task # not using this, don't have datafile id from tdpservice.stts.models import STT -#import the system_user from tdpservice.users.models import User - +from tdpservice.parsers.row_schema import RowSchema logger = logging.getLogger(__name__) @@ -49,55 +48,6 @@ class TanfT1Factory(factory.django.DjangoModelFactory): -# t1 fields -''' -RecordType(0-2) -RPT_MONTH_YEAR(2-8) -CASE_NUMBER(8-19) -COUNTY_FIPS_CODE(19-22) -STRATUM(22-24) -ZIP_CODE(24-29) -FUNDING_STREAM(29-30) -DISPOSITION(30-31) -NEW_APPLICANT(31-32) -NBR_FAMILY_MEMBERS(32-34) -FAMILY_TYPE(34-35) -RECEIVES_SUB_HOUSING(35-36) -RECEIVES_MED_ASSISTANCE(36-37) -RECEIVES_FOOD_STAMPS(37-38) -AMT_FOOD_STAMP_ASSISTANCE(38-42) -RECEIVES_SUB_CC(42-43) -AMT_SUB_CC(43-47) -CHILD_SUPPORT_AMT(47-51) -FAMILY_CASH_RESOURCES(51-55) -CASH_AMOUNT(55-59) -NBR_MONTHS(59-62) -CC_AMOUNT(62-66) -CHILDREN_COVERED(66-68) -CC_NBR_MONTHS(68-71) -TRANSP_AMOUNT(71-75) -TRANSP_NBR_MONTHS(75-78) -TRANSITION_SERVICES_AMOUNT(78-82) -TRANSITION_NBR_MONTHS(82-85) -OTHER_AMOUNT(85-89) -OTHER_NBR_MONTHS(89-92) -SANC_REDUCTION_AMT(92-96) -WORK_REQ_SANCTION(96-97) -FAMILY_SANC_ADULT(97-98) -SANC_TEEN_PARENT(98-99) -NON_COOPERATION_CSE(99-100) -FAILURE_TO_COMPLY(100-101) -OTHER_SANCTION(101-102) -RECOUPMENT_PRIOR_OVRPMT(102-106) -OTHER_TOTAL_REDUCTIONS(106-110) -FAMILY_CAP(110-111) -REDUCTIONS_ON_RECEIPTS(111-112) -OTHER_NON_SANCTION(112-113) -WAIVER_EVAL_CONTROL_GRPS(113-114) -FAMILY_EXEMPT_TIME_LIMITS(114-116) -FAMILY_NEW_CHILD(116-117) -BLANK(117-156) -''' # https://faker.readthedocs.io/en/stable/providers/baseprovider.html#faker.providers.BaseProvider """ class FieldFaker(faker.providers.BaseProvider): def record_type(self): @@ -159,21 +109,8 @@ def build_datafile(stt, year, quarter, original_filename, file_name, section, fi return d -def validValues(schemaMgr, field): +def validValues(schemaMgr, field, year): '''Takes in a field and returns a line of valid values.''' - #niave implementation will just zero or 'a' fill the field - '''field_len = field.endIndex - field.startIndex - if field.type is "number": - line += "0" * field_len - elif field.type is "string": - line += "A" * field_len - else: - raise ValueError("Field type 
not recognized") - return line''' - #brute implementation will use faker and we'll run it through the validators - #pass - - # elegant implementation will use the validators to generate the valid values field_len = field.endIndex - field.startIndex # check list of validators @@ -187,41 +124,41 @@ def validValues(schemaMgr, field): # TODO: reverse the TransformField logic to 'encrypt' a random number field_format = '?' * field_len elif field.name in ('RPT_MONTH_YEAR', 'CALENDAR_QUARTER'): - lower = 1 # TODO: get quarter and use acceptable range for month - upper = 3 + upper = 12 # need to generate a two-digit month with leading zero using format() and randit() month = '{}'.format(random.randint(lower, upper)).zfill(2) - field_format = '2024' + str(month) # fake.date_time_this_month(before_now=True, after_now=False).strftime('%m%Y') + field_format = '{}{}'.format(year, str(month)) # fake.date_time_this_month(before_now=True, after_now=False).strftime('%m%Y') else: field_format = '#' * field_len return fake.bothify(text=field_format) -from tdpservice.parsers.row_schema import RowSchema -def make_line(schemaMgr, section): + +def make_line(schemaMgr, section, year): '''Takes in a schema manager and returns a line of data.''' line = '' if type(schemaMgr) is RowSchema: if schemaMgr.record_type == 'HEADER': - line += 'HEADER20241{}01 TAN1 D'.format(section) + line += 'HEADER{}1{}01 TAN1 D'.format(year, section) # do I need to do that off-by-one thing on quarter elif schemaMgr.record_type == 'TRAILER': line += 'TRAILER' + '1' * 16 else: - row_schema = schemaMgr.schemas[0] - for field in row_schema.fields: - line += validValues(row_schema, field) - return line + ' \n' + #row_schema = schemaMgr.schemas[0] + for row_schema in schemaMgr.schemas: # this is to handle multi-schema like T6 + for field in row_schema.fields: + line += validValues(row_schema, field, year) + return line + '\n' -from tdpservice.data_files.models import DataFile def make_files(stt, year, quarter): '''Given a STT, parameterize calls to build_datafile and make_line.''' sections = stt.filenames.keys() - # {'Active Case Data': 'ADS.E2J.FTP1.TS05', 'Closed Case Data': 'ADS.E2J.FTP2.TS05', 'Aggregate Data': 'ADS.E2J.FTP3.TS05'}" + """ {'Active Case Data': 'ADS.E2J.FTP1.TS05', 'Closed Case Data': 'ADS.E2J.FTP2.TS05', 'Aggregate Data': 'ADS.E2J.FTP3.TS05'}" # "{'Active Case Data': 'ADS.E2J.NDM1.TS24', 'Closed Case Data': 'ADS.E2J.NDM2.TS24', 'Aggregate Data': 'ADS.E2J.NDM3.TS24', # 'Stratum Data': 'ADS.E2J.NDM4.TS24', 'SSP Active Case Data': 'ADS.E2J.NDM1.MS24', 'SSP Closed Case Data': 'ADS.E2J.NDM2.MS24', 'SSP Aggregate Data': # 'ADS.E2J.NDM3.MS24', 'SSP Stratum Data': 'ADS.E2J.NDM4.MS24'}" + """ files_for_quarter = {} @@ -241,8 +178,7 @@ def make_files(stt, year, quarter): models_in_section = get_program_models(prog_type, section) temp_file = '' - print("making file for section: ", section) - temp_file += make_line(header,section) + temp_file += make_line(header, section, year) # iterate over models and generate lines for _, model in models_in_section.items(): @@ -253,15 +189,15 @@ def make_files(stt, year, quarter): # we should generate hundreds, thousands, tens of thousands of records for i in range(random.randint(5, 9)): - temp_file += make_line(model,section) + temp_file += make_line(model,section, year) #elif section in ['Aggregate Data', 'Stratum Data']: # # we should generate a smaller count of lines...maybe leave this as a TODO # # shouldn't this be based on the active/closed case data? 
# pass # make trailer line - temp_file += make_line(trailer,section) - print(temp_file) + temp_file += make_line(trailer, section, year) + #print(temp_file) datafile = build_datafile( stt=stt, @@ -277,6 +213,14 @@ def make_files(stt, year, quarter): return files_for_quarter +def make_seed(): + """Invokes scheduling/management/commands/backup_db management command.""" + from tdpservice.scheduling.management.commands.backup_db import Command as BackupCommand + backup = BackupCommand() + backup.handle(file = '/tdpapp/tdrs_db_seed.pg') # /tmp/tdrs_db_backup.pg') + + + class Command(BaseCommand): """Command class.""" @@ -330,35 +274,23 @@ def get_schema(line, section, program_type): """ # TODO: allowed values per field, try manual and if commonalities exist, create a function to generate # TODO: can we utilize validators somehow to get a validValues(schemaMgr.fields[])? - from tdpservice.parsers.models import DataFileSummary - from tdpservice.parsers import parse - from tdpservice.parsers.test.factories import DataFileSummaryFactory - files_for_qtr = make_files(STT.objects.get(id=1), 2024, 2) - print(files_for_qtr) # file has no id, and no payload/content - for f in files_for_qtr.keys(): - df = files_for_qtr[f] - #dfs = DataFileSummary.objects.create(datafile=df, status=DataFileSummary.Status.PENDING) #maybe i need df.file_data? - dfs = DataFileSummaryFactory.build() - dfs.datafile = df - parse.parse_datafile(df, dfs) - - #files_for_qtr[0].save() - - # HALT - # stts = STT.objects.all() - # for stt in stts: - # print(stt) - # # for y in years[2024] - # for q in [1,2,3,4]: - # files_for_qtr = make_files(stt, 2024, q) - # print(files_for_qtr) - # save to db or upload endpoint - - #for f in files_for_qtr: - - # parser_task.parse(f.id, should_send_submission_email=False) - # parse the file? or use DFS factory? - # dump db in full + from tdpservice.scheduling.parser_task import parse as parse_task + for yr in range(2020, 2025): + for qtr in [1,2,3,4]: + files_for_qtr = make_files(STT.objects.get(id=1), yr, qtr) + print(files_for_qtr) + for f in files_for_qtr.keys(): + df = files_for_qtr[f] + print(df.id) + #dfs = DataFileSummary.objects.create(datafile=df, status=DataFileSummary.Status.PENDING) #maybe i need df.file_data? + dfs = DataFileSummaryFactory.build() + dfs.datafile = df + #parse.parse_datafile(df, dfs) + parse_task(df.id, False) # does this work too? 
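+                    # note: invoking the imported task function directly runs the
+                    # parse inline rather than queueing it through Celery; the False
+                    # arg is should_send_submission_email, per the earlier parser_task note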
+ + # dump db in full using `make_seed` func + make_seed() + '''TODO: try out parameterization like so: From e8db2f63e48b5f40e71ae9d14b21ef8b274af389 Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Fri, 30 Aug 2024 16:03:18 -0400 Subject: [PATCH 09/15] functional again after a header rework --- .../parsers/management/commands/seed_db.py | 165 +++++++++++++----- 1 file changed, 125 insertions(+), 40 deletions(-) diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py index 7cb49ae94d..fd7b55138e 100644 --- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -11,12 +11,13 @@ from tdpservice.parsers.schema_defs.header import header from tdpservice.parsers.schema_defs.trailer import trailer from tdpservice.parsers.schema_defs.utils import * # maybe need other utilities +from tdpservice.parsers.util import fiscal_to_calendar # all models should be referenced by using the utils.py get_schema_options wrappers from tdpservice.data_files.models import DataFile from tdpservice.parsers import parse from tdpservice.parsers.test.factories import DataFileSummaryFactory -from tdpservice.scheduling import parser_task # not using this, don't have datafile id +from tdpservice.scheduling.parser_task import parse as parse_task from tdpservice.stts.models import STT from tdpservice.users.models import User from tdpservice.parsers.row_schema import RowSchema @@ -138,20 +139,71 @@ def make_line(schemaMgr, section, year): '''Takes in a schema manager and returns a line of data.''' line = '' + #row_schema = schemaMgr.schemas[0] + for row_schema in schemaMgr.schemas: # this is to handle multi-schema like T6 + for field in row_schema.fields: + line += validValues(row_schema, field, year) + return line + '\n' + +def make_HT(schemaMgr, prog_type, section, year, quarter, stt): + line = '' + + ''' + The following fields are defined in the schema for the HEADER row of all submission types: + 1. title + 2. year + 3. quarter + 4. type + 5. state_fips + 6. tribe_code + 7. program_type + 8. edit + 9. encryption + 10. 
update + ''' if type(schemaMgr) is RowSchema: - if schemaMgr.record_type == 'HEADER': - line += 'HEADER{}1{}01 TAN1 D'.format(year, section) # do I need to do that off-by-one thing on quarter - elif schemaMgr.record_type == 'TRAILER': - line += 'TRAILER' + '1' * 16 + if schemaMgr.record_type == 'HEADER': + # HEADER2020Q1CAL000TAN1ED + for field in schemaMgr.fields: + if field.name == 'title': + line += 'HEADER' + elif field.name == 'year': + line += '{}'.format(year) + elif field.name == 'quarter': + line += quarter[1:] # remove the 'Q', e.g., 'Q1' -> '1' + elif field.name == 'type': + line += section + elif field.name == 'state_fips': + if stt.state is not None: # this is a tribe + my_stt = stt.state + else: + my_stt = stt + line += '{}'.format(my_stt.stt_code).zfill(2) + elif field.name == 'tribe_code': + if stt.type == 'tribe': + line += stt.stt_code + else: + line += '000' + elif field.name == 'program_type': + line += prog_type + elif field.name == 'edit': + line += '1' + elif field.name == 'encryption': + line += 'E' + elif field.name == 'update': + line += 'D' + + + #line += 'HEADER{}1{}01 TAN1 D'.format(year, section) # do I need to do that off-by-one thing on quarter + elif schemaMgr.record_type == 'TRAILER': + line += 'TRAILER' + '1' * 16 else: - #row_schema = schemaMgr.schemas[0] - for row_schema in schemaMgr.schemas: # this is to handle multi-schema like T6 - for field in row_schema.fields: - line += validValues(row_schema, field, year) - return line + '\n' + print('Invalid record type') + return None + return line + '\n' -def make_files(stt, year, quarter): +def make_files(stt, sub_year, sub_quarter): '''Given a STT, parameterize calls to build_datafile and make_line.''' sections = stt.filenames.keys() """ {'Active Case Data': 'ADS.E2J.FTP1.TS05', 'Closed Case Data': 'ADS.E2J.FTP2.TS05', 'Aggregate Data': 'ADS.E2J.FTP3.TS05'}" @@ -162,7 +214,7 @@ def make_files(stt, year, quarter): files_for_quarter = {} - for s in sections: + for long_section in sections: # based on section, get models from schema_options #if stt.ssp is True: # we can match section to the schema_options @@ -172,40 +224,71 @@ def make_files(stt, year, quarter): # given a leaf of 'section', get 'TAN' or 'SSP' or 'Tribal TAN' from schema_options # match schema_options[_]['section'] to our section - text_dict = get_schema_options("", section=s, query='text') + text_dict = get_schema_options("", section=long_section, query='text') prog_type = text_dict['program_type'] # TAN section = text_dict['section'] # A models_in_section = get_program_models(prog_type, section) temp_file = '' - temp_file += make_line(header, section, year) + ''' + def fiscal_to_calendar(year, fiscal_quarter): + """Decrement the input quarter text by one.""" + array = [1, 2, 3, 4] # wrapping around an array + int_qtr = int(fiscal_quarter[1:]) # remove the 'Q', e.g., 'Q1' -> '1' + if int_qtr == 1: + year = year - 1 + + ind_qtr = array.index(int_qtr) # get the index so we can easily wrap-around end of array + return year, "Q{}".format(array[ind_qtr - 1]) # return the previous quarter + + + def calendar_to_fiscal(calendar_year, fiscal_quarter): + """Decrement the calendar year if in Q1.""" + return calendar_year - 1 if fiscal_quarter == 'Q1' else calendar_year + + + def transform_to_months(quarter): + """Return a list of months in a quarter depending the quarter's format.""" + match quarter: + case "Q1": + return ["Jan", "Feb", "Mar"] + .... 
+ + def month_to_int(month): + """Return the integer value of a month.""" + return datetime.strptime(month, '%b').strftime('%m') + calendar_year, calendar_quarter = get_calendar_quarter(year, quarter) + ''' + + + + cal_year, cal_quarter = fiscal_to_calendar(sub_year, 'Q{}'.format(sub_quarter)) + + + + temp_file += make_HT(header, prog_type, section, cal_year, cal_quarter, stt) # iterate over models and generate lines for _, model in models_in_section.items(): - if s in ['Active Case Data', 'Closed Case Data','Aggregate Data', 'Stratum Data']: - # obviously, this first approach can't prevent duplicates (unlikely), - # nor can it ensure that the case data is internally consistent - # (e.g. a case with a child but no adult) - - # we should generate hundreds, thousands, tens of thousands of records + if long_section in ['Active Case Data', 'Closed Case Data','Aggregate Data', 'Stratum Data']: for i in range(random.randint(5, 9)): - temp_file += make_line(model,section, year) + temp_file += make_line(model,section, cal_year) #elif section in ['Aggregate Data', 'Stratum Data']: # # we should generate a smaller count of lines...maybe leave this as a TODO # # shouldn't this be based on the active/closed case data? # pass # make trailer line - temp_file += make_line(trailer, section, year) + temp_file += make_HT(trailer, prog_type, section, cal_year, cal_quarter, stt) #print(temp_file) datafile = build_datafile( stt=stt, - year=year, - quarter=f"Q{quarter}", - original_filename=f'{stt}-{section}-{year}Q{quarter}.txt', - file_name=f'{stt}-{section}-{year}Q{quarter}', - section=s, + year=sub_year, # fiscal submission year + quarter=f"Q{sub_quarter}", # fiscal submission quarter + original_filename=f'{stt}-{section}-{sub_year}Q{sub_quarter}.txt', + file_name=f'{stt}-{section}-{sub_year}Q{sub_quarter}', + section=long_section, file_data=bytes(temp_file.rstrip(), 'utf-8'), ) datafile.save() @@ -274,19 +357,21 @@ def get_schema(line, section, program_type): """ # TODO: allowed values per field, try manual and if commonalities exist, create a function to generate # TODO: can we utilize validators somehow to get a validValues(schemaMgr.fields[])? - from tdpservice.scheduling.parser_task import parse as parse_task - for yr in range(2020, 2025): - for qtr in [1,2,3,4]: - files_for_qtr = make_files(STT.objects.get(id=1), yr, qtr) - print(files_for_qtr) - for f in files_for_qtr.keys(): - df = files_for_qtr[f] - print(df.id) - #dfs = DataFileSummary.objects.create(datafile=df, status=DataFileSummary.Status.PENDING) #maybe i need df.file_data? - dfs = DataFileSummaryFactory.build() - dfs.datafile = df - #parse.parse_datafile(df, dfs) - parse_task(df.id, False) # does this work too? + + for stt in STT.objects.filter(id__in=range(1,2)): + #get(id=1): # all(): + for yr in range(2020, 2025): + for qtr in [1,2,3,4]: + files_for_qtr = make_files(stt, yr, qtr) + print(files_for_qtr) + for f in files_for_qtr.keys(): + df = files_for_qtr[f] + print(df.id) + #dfs = DataFileSummary.objects.create(datafile=df, status=DataFileSummary.Status.PENDING) #maybe i need df.file_data? + dfs = DataFileSummaryFactory.build() + dfs.datafile = df + #parse.parse_datafile(df, dfs) + parse_task(df.id, False) # does this work too? 
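+                    # note: parse_task is assumed here to be the scheduling app's
+                    # Celery entry point taking (datafile_id, should_send_email);
+                    # calling it directly keeps seeding synchronous, and the second
+                    # argument is assumed to suppress notification emails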
# dump db in full using `make_seed` func make_seed() From d6cf1e197b4d2ef3a3e6f5dcc173532b29c5e485 Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Tue, 3 Sep 2024 09:17:01 -0400 Subject: [PATCH 10/15] Cleaning up comments/prints for PR --- .gitignore | 2 +- Taskfile.yml | 2 +- .../parsers/management/commands/seed_db.py | 226 +----------------- tdrs-backend/tdpservice/parsers/parse.py | 5 - tdrs-backend/tdpservice/parsers/row_schema.py | 2 - 5 files changed, 12 insertions(+), 225 deletions(-) diff --git a/.gitignore b/.gitignore index 2974197b1d..6be3a50173 100644 --- a/.gitignore +++ b/.gitignore @@ -109,4 +109,4 @@ cypress.env.json # Patches *.patch -tdrs-backend/tdpservice/parsers/management/commands/*.txt +tdrs-backend/*.pg diff --git a/Taskfile.yml b/Taskfile.yml index a6ea63e7a8..7e0c48e48b 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -59,7 +59,7 @@ tasks: cmds: - docker-compose -f docker-compose.yml exec web sh -c "python ./manage.py shell" - backend-exec: + backend-exec-seed-db: desc: Execute a command in the backend container dir: tdrs-backend vars: diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py index fd7b55138e..a81b4cdd36 100644 --- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -15,7 +15,7 @@ # all models should be referenced by using the utils.py get_schema_options wrappers from tdpservice.data_files.models import DataFile -from tdpservice.parsers import parse +#from tdpservice.parsers import parse from tdpservice.parsers.test.factories import DataFileSummaryFactory from tdpservice.scheduling.parser_task import parse as parse_task from tdpservice.stts.models import STT @@ -24,70 +24,8 @@ logger = logging.getLogger(__name__) -''' -Create a tool/mechanism for generating "random" and internally consistent data - implement faker for test factories - create datafile generator from factories - - class ParsingFileFactory(factory.django.DjangoModelFactory): - # class DataFileSummaryFactory(factory.django.DjangoModelFactory): - # class ParserErrorFactory(factory.django.DjangoModelFactory): - class TanfT1Factory(factory.django.DjangoModelFactory): - ... 
- - parse generated datafiles - -Ensure tool is version/schema aware (we should be able to use django's migration table) -Tool will "fuzz" or generate out of range values to intentionally create issues [ stretch ] - utilize parsers/schema_defs/utils.py as a reference for generating the `file__data` line by line - have to start with header - - given a ParsingFileFactory, generate a file__data line by line - -Create django command such that tool can be pointed at deployed environments -''' - - - # https://faker.readthedocs.io/en/stable/providers/baseprovider.html#faker.providers.BaseProvider -""" class FieldFaker(faker.providers.BaseProvider): - def record_type(self): - return self.random_element(elements=('00', '01', '02')) - - def rpt_month_year(self): - return self.date_time_this_month(before_now=True, after_now=False).strftime('%m%Y') - - def case_number(self): - return self.random_int(min=1000000000, max=9999999999) - - def county_fips_code(self): - return self.random_int(min=0, max=999) - - def stratum(self): - return self.random_int(min=0, max=99) - - def zip_code(self): - return self.random_int(min=0, max=99999) - - def funding_stream(self): - return self.random_element(elements=('A', 'B', 'C')) - - def disposition(self): - return self.random_element(elements=('A', 'B', 'C')) - - def new_applicant(self): - return self.random_element(elements=('Y', 'N')) - - def nbr_family_members(self): - return self.random_int(min=0, max=99) - - def family_type(self): - return self.random_element(elements=('A', 'B', 'C')) - - def receives_sub_housing(self): - return self.random_element(elements=('Y', 'N')) - """ - +""" class FieldFaker(faker.providers.BaseProvider):...""" def build_datafile(stt, year, quarter, original_filename, file_name, section, file_data): """Build a datafile.""" @@ -114,10 +52,7 @@ def validValues(schemaMgr, field, year): '''Takes in a field and returns a line of valid values.''' field_len = field.endIndex - field.startIndex - # check list of validators - # treat header/trailer special, it actually checks for string values - # check for zero or pad fill - # transformField might be tricky + if field.name == 'RecordType': return schemaMgr.record_type if field.name == 'SSN': @@ -127,9 +62,9 @@ def validValues(schemaMgr, field, year): elif field.name in ('RPT_MONTH_YEAR', 'CALENDAR_QUARTER'): lower = 1 # TODO: get quarter and use acceptable range for month upper = 12 - # need to generate a two-digit month with leading zero using format() and randit() + month = '{}'.format(random.randint(lower, upper)).zfill(2) - field_format = '{}{}'.format(year, str(month)) # fake.date_time_this_month(before_now=True, after_now=False).strftime('%m%Y') + field_format = '{}{}'.format(year, str(month)) else: field_format = '#' * field_len return fake.bothify(text=field_format) @@ -139,7 +74,6 @@ def make_line(schemaMgr, section, year): '''Takes in a schema manager and returns a line of data.''' line = '' - #row_schema = schemaMgr.schemas[0] for row_schema in schemaMgr.schemas: # this is to handle multi-schema like T6 for field in row_schema.fields: line += validValues(row_schema, field, year) @@ -148,22 +82,9 @@ def make_line(schemaMgr, section, year): def make_HT(schemaMgr, prog_type, section, year, quarter, stt): line = '' - ''' - The following fields are defined in the schema for the HEADER row of all submission types: - 1. title - 2. year - 3. quarter - 4. type - 5. state_fips - 6. tribe_code - 7. program_type - 8. edit - 9. encryption - 10. 
update - ''' if type(schemaMgr) is RowSchema: if schemaMgr.record_type == 'HEADER': - # HEADER2020Q1CAL000TAN1ED + # e.g. HEADER20201CAL000TAN1ED for field in schemaMgr.fields: if field.name == 'title': line += 'HEADER' @@ -193,8 +114,6 @@ def make_HT(schemaMgr, prog_type, section, year, quarter, stt): elif field.name == 'update': line += 'D' - - #line += 'HEADER{}1{}01 TAN1 D'.format(year, section) # do I need to do that off-by-one thing on quarter elif schemaMgr.record_type == 'TRAILER': line += 'TRAILER' + '1' * 16 else: @@ -206,62 +125,16 @@ def make_HT(schemaMgr, prog_type, section, year, quarter, stt): def make_files(stt, sub_year, sub_quarter): '''Given a STT, parameterize calls to build_datafile and make_line.''' sections = stt.filenames.keys() - """ {'Active Case Data': 'ADS.E2J.FTP1.TS05', 'Closed Case Data': 'ADS.E2J.FTP2.TS05', 'Aggregate Data': 'ADS.E2J.FTP3.TS05'}" - # "{'Active Case Data': 'ADS.E2J.NDM1.TS24', 'Closed Case Data': 'ADS.E2J.NDM2.TS24', 'Aggregate Data': 'ADS.E2J.NDM3.TS24', - # 'Stratum Data': 'ADS.E2J.NDM4.TS24', 'SSP Active Case Data': 'ADS.E2J.NDM1.MS24', 'SSP Closed Case Data': 'ADS.E2J.NDM2.MS24', 'SSP Aggregate Data': - # 'ADS.E2J.NDM3.MS24', 'SSP Stratum Data': 'ADS.E2J.NDM4.MS24'}" - """ files_for_quarter = {} for long_section in sections: - # based on section, get models from schema_options - #if stt.ssp is True: - # we can match section to the schema_options - # elif stt.state is not None: - # we can declare prog_type to Tribal - - # given a leaf of 'section', get 'TAN' or 'SSP' or 'Tribal TAN' from schema_options - - # match schema_options[_]['section'] to our section text_dict = get_schema_options("", section=long_section, query='text') prog_type = text_dict['program_type'] # TAN section = text_dict['section'] # A models_in_section = get_program_models(prog_type, section) temp_file = '' - ''' - def fiscal_to_calendar(year, fiscal_quarter): - """Decrement the input quarter text by one.""" - array = [1, 2, 3, 4] # wrapping around an array - int_qtr = int(fiscal_quarter[1:]) # remove the 'Q', e.g., 'Q1' -> '1' - if int_qtr == 1: - year = year - 1 - - ind_qtr = array.index(int_qtr) # get the index so we can easily wrap-around end of array - return year, "Q{}".format(array[ind_qtr - 1]) # return the previous quarter - - - def calendar_to_fiscal(calendar_year, fiscal_quarter): - """Decrement the calendar year if in Q1.""" - return calendar_year - 1 if fiscal_quarter == 'Q1' else calendar_year - - - def transform_to_months(quarter): - """Return a list of months in a quarter depending the quarter's format.""" - match quarter: - case "Q1": - return ["Jan", "Feb", "Mar"] - .... 
- - def month_to_int(month): - """Return the integer value of a month.""" - return datetime.strptime(month, '%b').strftime('%m') - calendar_year, calendar_quarter = get_calendar_quarter(year, quarter) - ''' - - - cal_year, cal_quarter = fiscal_to_calendar(sub_year, 'Q{}'.format(sub_quarter)) @@ -271,7 +144,7 @@ def month_to_int(month): # iterate over models and generate lines for _, model in models_in_section.items(): if long_section in ['Active Case Data', 'Closed Case Data','Aggregate Data', 'Stratum Data']: - for i in range(random.randint(5, 9)): + for i in range(random.randint(5, 999)): temp_file += make_line(model,section, cal_year) #elif section in ['Aggregate Data', 'Stratum Data']: # # we should generate a smaller count of lines...maybe leave this as a TODO @@ -300,9 +173,7 @@ def make_seed(): """Invokes scheduling/management/commands/backup_db management command.""" from tdpservice.scheduling.management.commands.backup_db import Command as BackupCommand backup = BackupCommand() - backup.handle(file = '/tdpapp/tdrs_db_seed.pg') # /tmp/tdrs_db_backup.pg') - - + backup.handle(file = '/tdpapp/tdrs_db_seed.pg') class Command(BaseCommand): """Command class.""" @@ -312,95 +183,18 @@ class Command(BaseCommand): def handle(self, *args, **options): """Populate datafiles, records, summaries, and errors for all STTs.""" - - # from file__prog_type -> get me the prog type for referencing the schema_def - """ def get_program_models(str_prog, str_section): - def get_program_model(str_prog, str_section, str_model): - def get_section_reference(str_prog, str_section): - def get_text_from_df(df): - def get_schema(line, section, program_type): """ - - - """ parsing_file = ParsingFileFactory.build(year=2021, - quarter='Q2', - original_filename='t3_file.txt', - file__name='t3_file.txt', - section=DataFile.Section.CLOSED_CASE_DATA, - #**how do we make this** - file__data=b'', - ) """ - #parsing_file.save() - - #x = get_text_from_df(parsing_file) - #print(x) - - # t1 = schema_options.get('TAN').get('A').get('models').get('T1') - #T1_fields = t1.schemas[0].fields - - #[print(i) for i in T1_fields] - - """ t1_line = '' - for field in T1_fields: - field_len = field.endIndex - field.startIndex - # check list of validators - # treat header/trailer special, it actually checks for string values - # check for zero or pad fill - # transformField might be tricky - if field.name == 'SSN': - # only used by recordtypes 2,3,5 - # TODO: reverse the TransformField logic to 'encrypt' a random number - field_format = '?' * field_len - else: - field_format = '#' * field_len - t1_line += faker.bothify(text=field_format) - print(t1_line) """ - - # TODO: allowed values per field, try manual and if commonalities exist, create a function to generate - # TODO: can we utilize validators somehow to get a validValues(schemaMgr.fields[])? - - for stt in STT.objects.filter(id__in=range(1,2)): - #get(id=1): # all(): + for stt in STT.objects.all(): # filter(id__in=range(1,2)): for yr in range(2020, 2025): for qtr in [1,2,3,4]: files_for_qtr = make_files(stt, yr, qtr) print(files_for_qtr) for f in files_for_qtr.keys(): df = files_for_qtr[f] - print(df.id) - #dfs = DataFileSummary.objects.create(datafile=df, status=DataFileSummary.Status.PENDING) #maybe i need df.file_data? dfs = DataFileSummaryFactory.build() dfs.datafile = df #parse.parse_datafile(df, dfs) - parse_task(df.id, False) # does this work too? 
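+            # # one possible shape for that TODO (a sketch, not verified against
+            # # the T6/T7 schemas): tally the case records generated above per
+            # # month and write those totals into the aggregate lines, rather
+            # # than fabricating them at random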
+ parse_task(df.id, False) # dump db in full using `make_seed` func make_seed() - - - - '''TODO: try out parameterization like so: - T2Factory.create( - RPT_MONTH_YEAR=202010, - CASE_NUMBER='123', - FAMILY_AFFILIATION=1, - ), - ''' - - ''' - # utilize parsers/schema_defs/utils.py as a reference for getting the lists of STT/years/quarters/sections - for i in STT[]: - for y in years[] # 2020-2022 - for q in quarters[] # 1-4 - for p in programs[] # TAN, SSP, Tribal TAN - for s in sections[] # [x for x['section'] in schema_options[p].keys()] - #need the DF FK reference? - for m in models[] - for f in rowSchemas.Fields[] - for v in f.validValues[] - (look at seed_records.py for how to generate random data) - # write a temp file - # ok now parse this thing (but turn off emails) - # generate a DFS - # dump db in full to a seed file - ''' diff --git a/tdrs-backend/tdpservice/parsers/parse.py b/tdrs-backend/tdpservice/parsers/parse.py index b4f5d2dfe7..8c17a70878 100644 --- a/tdrs-backend/tdpservice/parsers/parse.py +++ b/tdrs-backend/tdpservice/parsers/parse.py @@ -33,7 +33,6 @@ def parse_datafile(datafile, dfs): logger.info(f"Preparser Error: {len(header_errors)} header errors encountered.") errors['header'] = header_errors bulk_create_errors({1: header_errors}, 1, flush=True) - print("ERRORS: ", errors) return errors field_values = schema_defs.header.get_field_values_by_names(header_line, @@ -470,10 +469,6 @@ def manager_parse_line(line, schema_manager, generate_error, datafile, is_encryp if type(schema_manager) is row_schema.SchemaManager: schema_manager.datafile = datafile - elif type(schema_manager) is list: - print(line) - print("sMgr(" +str(type(schema_manager))+"): " +str(schema_manager)) - print("datafile: " +str(type(datafile))) try: schema_manager.update_encrypted_fields(is_encrypted) records = schema_manager.parse_and_validate(line, generate_error) diff --git a/tdrs-backend/tdpservice/parsers/row_schema.py b/tdrs-backend/tdpservice/parsers/row_schema.py index 8b27fd69e0..7dd01556fe 100644 --- a/tdrs-backend/tdpservice/parsers/row_schema.py +++ b/tdrs-backend/tdpservice/parsers/row_schema.py @@ -66,8 +66,6 @@ def parse_and_validate(self, line, generate_error): if is_quiet_preparser_errors: return None, True, [] logger.info(f"{len(preparsing_errors)} preparser error(s) encountered.") - logger.info(line) - logger.info(preparsing_errors) return None, False, preparsing_errors # parse line to model From c10f7965356f6513c339582f78d7289dcb4a22de Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Tue, 3 Sep 2024 11:01:40 -0400 Subject: [PATCH 11/15] Linter clean-up --- .../parsers/management/commands/seed_db.py | 60 ++++++++----------- tdrs-backend/tdpservice/parsers/parse.py | 1 - 2 files changed, 25 insertions(+), 36 deletions(-) diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py index a81b4cdd36..c00db0ac68 100644 --- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -1,35 +1,31 @@ """`seed_db` command.""" - -import random -import logging - from django.core.management import BaseCommand -from faker import Faker -fake = Faker() from django.core.files.base import ContentFile from tdpservice.parsers.schema_defs.header import header from tdpservice.parsers.schema_defs.trailer import trailer -from tdpservice.parsers.schema_defs.utils import * # maybe need other utilities +from tdpservice.parsers.schema_defs.utils import 
get_schema_options, get_program_models from tdpservice.parsers.util import fiscal_to_calendar # all models should be referenced by using the utils.py get_schema_options wrappers - from tdpservice.data_files.models import DataFile -#from tdpservice.parsers import parse +# from tdpservice.parsers import parse from tdpservice.parsers.test.factories import DataFileSummaryFactory from tdpservice.scheduling.parser_task import parse as parse_task from tdpservice.stts.models import STT from tdpservice.users.models import User from tdpservice.parsers.row_schema import RowSchema +from faker import Faker +import logging +import random +fake = Faker() logger = logging.getLogger(__name__) # https://faker.readthedocs.io/en/stable/providers/baseprovider.html#faker.providers.BaseProvider -""" class FieldFaker(faker.providers.BaseProvider):...""" +# """ class FieldFaker(faker.providers.BaseProvider):...""" def build_datafile(stt, year, quarter, original_filename, file_name, section, file_data): """Build a datafile.""" - try: d = DataFile.objects.create( user=User.objects.get_or_create(username='system')[0], @@ -41,26 +37,25 @@ def build_datafile(stt, year, quarter, original_filename, file_name, section, fi version=random.randint(1, 1993415), ) - d.file.save(file_name, ContentFile(file_data)) except django.db.utils.IntegrityError as e: + logger.error(f"Error creating datafile: {e}") pass return d def validValues(schemaMgr, field, year): - '''Takes in a field and returns a line of valid values.''' - + """Take in a field and returns a line of valid values.""" field_len = field.endIndex - field.startIndex if field.name == 'RecordType': return schemaMgr.record_type if field.name == 'SSN': - # only used by recordtypes 2,3,5 + # only used by recordtypes 2,3,5 # TODO: reverse the TransformField logic to 'encrypt' a random number field_format = '?' 
* field_len elif field.name in ('RPT_MONTH_YEAR', 'CALENDAR_QUARTER'): - lower = 1 # TODO: get quarter and use acceptable range for month + lower = 1 # TODO: get quarter and use acceptable range for month upper = 12 month = '{}'.format(random.randint(lower, upper)).zfill(2) @@ -71,7 +66,7 @@ def validValues(schemaMgr, field, year): def make_line(schemaMgr, section, year): - '''Takes in a schema manager and returns a line of data.''' + """Take in a schema manager and returns a line of data.""" line = '' for row_schema in schemaMgr.schemas: # this is to handle multi-schema like T6 @@ -80,6 +75,7 @@ def make_line(schemaMgr, section, year): return line + '\n' def make_HT(schemaMgr, prog_type, section, year, quarter, stt): + """Handle special case of header/trailer lines.""" line = '' if type(schemaMgr) is RowSchema: @@ -113,7 +109,7 @@ def make_HT(schemaMgr, prog_type, section, year, quarter, stt): line += 'E' elif field.name == 'update': line += 'D' - + elif schemaMgr.record_type == 'TRAILER': line += 'TRAILER' + '1' * 16 else: @@ -123,38 +119,34 @@ def make_HT(schemaMgr, prog_type, section, year, quarter, stt): return line + '\n' def make_files(stt, sub_year, sub_quarter): - '''Given a STT, parameterize calls to build_datafile and make_line.''' + """Given a STT, parameterize calls to build_datafile and make_line.""" sections = stt.filenames.keys() files_for_quarter = {} - for long_section in sections: text_dict = get_schema_options("", section=long_section, query='text') - prog_type = text_dict['program_type'] # TAN + prog_type = text_dict['program_type'] # TAN section = text_dict['section'] # A models_in_section = get_program_models(prog_type, section) temp_file = '' cal_year, cal_quarter = fiscal_to_calendar(sub_year, 'Q{}'.format(sub_quarter)) - - - temp_file += make_HT(header, prog_type, section, cal_year, cal_quarter, stt) # iterate over models and generate lines for _, model in models_in_section.items(): - if long_section in ['Active Case Data', 'Closed Case Data','Aggregate Data', 'Stratum Data']: + if long_section in ['Active Case Data', 'Closed Case Data', 'Aggregate Data', 'Stratum Data']: for i in range(random.randint(5, 999)): - temp_file += make_line(model,section, cal_year) - #elif section in ['Aggregate Data', 'Stratum Data']: + temp_file += make_line(model, section, cal_year) + # elif section in ['Aggregate Data', 'Stratum Data']: # # we should generate a smaller count of lines...maybe leave this as a TODO # # shouldn't this be based on the active/closed case data? 
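             # # one possible shape for that TODO (a sketch, not verified against
             # # the T6/T7 schemas): tally the case records generated above per
             # # month and write those totals into the aggregate lines, rather
             # # than fabricating them at random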
# pass # make trailer line temp_file += make_HT(trailer, prog_type, section, cal_year, cal_quarter, stt) - #print(temp_file) - + # print(temp_file) + datafile = build_datafile( stt=stt, year=sub_year, # fiscal submission year @@ -170,10 +162,10 @@ def make_files(stt, sub_year, sub_quarter): return files_for_quarter def make_seed(): - """Invokes scheduling/management/commands/backup_db management command.""" + """Invoke scheduling/management/commands/backup_db management command.""" from tdpservice.scheduling.management.commands.backup_db import Command as BackupCommand backup = BackupCommand() - backup.handle(file = '/tdpapp/tdrs_db_seed.pg') + backup.handle(file='/tdpapp/tdrs_db_seed.pg') class Command(BaseCommand): """Command class.""" @@ -182,19 +174,17 @@ class Command(BaseCommand): def handle(self, *args, **options): """Populate datafiles, records, summaries, and errors for all STTs.""" - for stt in STT.objects.all(): # filter(id__in=range(1,2)): for yr in range(2020, 2025): - for qtr in [1,2,3,4]: + for qtr in [1, 2, 3, 4]: files_for_qtr = make_files(stt, yr, qtr) print(files_for_qtr) for f in files_for_qtr.keys(): df = files_for_qtr[f] dfs = DataFileSummaryFactory.build() dfs.datafile = df - #parse.parse_datafile(df, dfs) + # parse.parse_datafile(df, dfs) parse_task(df.id, False) # dump db in full using `make_seed` func make_seed() - diff --git a/tdrs-backend/tdpservice/parsers/parse.py b/tdrs-backend/tdpservice/parsers/parse.py index a81fc397cc..1f14b6557b 100644 --- a/tdrs-backend/tdpservice/parsers/parse.py +++ b/tdrs-backend/tdpservice/parsers/parse.py @@ -494,7 +494,6 @@ def parse_datafile_lines(datafile, dfs, program_type, section, is_encrypted, cas def manager_parse_line(line, schema_manager, generate_error, datafile, is_encrypted=False): """Parse and validate a datafile line using SchemaManager.""" - if type(schema_manager) is row_schema.SchemaManager: schema_manager.datafile = datafile try: From 7b44e63e0eb8cc5830b748b811f401b31fdc7b9d Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Tue, 3 Sep 2024 11:17:34 -0400 Subject: [PATCH 12/15] linter pt2 --- tdrs-backend/tdpservice/parsers/management/commands/seed_db.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py index c00db0ac68..733bb5a403 100644 --- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -2,6 +2,7 @@ from django.core.management import BaseCommand from django.core.files.base import ContentFile +from django.db.utils import IntegrityError from tdpservice.parsers.schema_defs.header import header from tdpservice.parsers.schema_defs.trailer import trailer from tdpservice.parsers.schema_defs.utils import get_schema_options, get_program_models @@ -38,7 +39,7 @@ def build_datafile(stt, year, quarter, original_filename, file_name, section, fi ) d.file.save(file_name, ContentFile(file_data)) - except django.db.utils.IntegrityError as e: + except IntegrityError as e: logger.error(f"Error creating datafile: {e}") pass return d From 5fcf42331d0c8cee0cee636b22e1f11bf17166a3 Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Mon, 9 Sep 2024 10:15:03 -0400 Subject: [PATCH 13/15] Improvements to header, linting, and Eric's PR feedback --- Taskfile.yml | 10 ++++- .../parsers/management/commands/seed_db.py | 43 ++++++------------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git 
a/Taskfile.yml b/Taskfile.yml index 525ea16fae..fa152a53f4 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -109,9 +109,17 @@ tasks: cmds: - docker-compose -f docker-compose.yml exec web sh -c "python ./manage.py shell" - backend-exec-seed-db: + backend-exec: desc: Execute a command in the backend container dir: tdrs-backend + vars: + CMD: '{{.CMD}}' + cmds: + - docker-compose -f docker-compose.yml exec web sh -c "python manage.py {{.CMD}}" + + backend-exec-seed-db: + desc: Execute seed_db command in the backend container + dir: tdrs-backend vars: CMD: '{{.CMD}}' cmds: diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py index 733bb5a403..5d34ae90c7 100644 --- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py +++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py @@ -82,34 +82,16 @@ def make_HT(schemaMgr, prog_type, section, year, quarter, stt): if type(schemaMgr) is RowSchema: if schemaMgr.record_type == 'HEADER': # e.g. HEADER20201CAL000TAN1ED - for field in schemaMgr.fields: - if field.name == 'title': - line += 'HEADER' - elif field.name == 'year': - line += '{}'.format(year) - elif field.name == 'quarter': - line += quarter[1:] # remove the 'Q', e.g., 'Q1' -> '1' - elif field.name == 'type': - line += section - elif field.name == 'state_fips': - if stt.state is not None: # this is a tribe - my_stt = stt.state - else: - my_stt = stt - line += '{}'.format(my_stt.stt_code).zfill(2) - elif field.name == 'tribe_code': - if stt.type == 'tribe': - line += stt.stt_code - else: - line += '000' - elif field.name == 'program_type': - line += prog_type - elif field.name == 'edit': - line += '1' - elif field.name == 'encryption': - line += 'E' - elif field.name == 'update': - line += 'D' + + if stt.state is not None: # this is a tribe + my_stt = stt.state + else: + my_stt = stt + state_fips = '{}'.format(my_stt.stt_code).zfill(2) + # state_fips = stt.state.stt_code if stt.state is not None else stt.stt_code + tribe_code = '{}'.format(stt.stt_code) if stt.type == 'tribe' else '000' + + line = f"HEADER{year}{quarter[1:]}{section}{state_fips}{tribe_code}{prog_type}1ED" elif schemaMgr.record_type == 'TRAILER': line += 'TRAILER' + '1' * 16 @@ -136,7 +118,8 @@ def make_files(stt, sub_year, sub_quarter): # iterate over models and generate lines for _, model in models_in_section.items(): - if long_section in ['Active Case Data', 'Closed Case Data', 'Aggregate Data', 'Stratum Data']: + # below is equivalent to 'contains' for the tuple + if any(section in long_section for section in ('Active Case', 'Closed Case', 'Aggregate', 'Stratum')): for i in range(random.randint(5, 999)): temp_file += make_line(model, section, cal_year) # elif section in ['Aggregate Data', 'Stratum Data']: @@ -175,7 +158,7 @@ class Command(BaseCommand): def handle(self, *args, **options): """Populate datafiles, records, summaries, and errors for all STTs.""" - for stt in STT.objects.all(): # filter(id__in=range(1,2)): + for stt in STT.objects.all(): # .filter(id__in=range(1,25)) for yr in range(2020, 2025): for qtr in [1, 2, 3, 4]: files_for_qtr = make_files(stt, yr, qtr) From 8a6f87532da002d6e8f2a7c96ab09e9c87453eea Mon Sep 17 00:00:00 2001 From: andrew-jameson Date: Mon, 23 Sep 2024 12:11:24 -0400 Subject: [PATCH 14/15] Fixed quarter preparing issue --- .../parsers/management/commands/seed_db.py | 77 +++++++++++++++---- 1 file changed, 63 insertions(+), 14 deletions(-) diff --git 
a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py
index 5d34ae90c7..bc105e9987 100644
--- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py
+++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py
@@ -45,7 +45,7 @@ def build_datafile(stt, year, quarter, original_filename, file_name, section, fi
     return d
 
 
-def validValues(schemaMgr, field, year):
+def validValues(schemaMgr, field, year, qtr):
     """Take in a field and returns a line of valid values."""
     field_len = field.endIndex - field.startIndex
 
@@ -55,24 +55,43 @@ def validValues(schemaMgr, field, year):
         # only used by recordtypes 2,3,5
         # TODO: reverse the TransformField logic to 'encrypt' a random number
         field_format = '?' * field_len
-    elif field.name in ('RPT_MONTH_YEAR', 'CALENDAR_QUARTER'):
-        lower = 1  # TODO: get quarter and use acceptable range for month
-        upper = 12
+    elif field.name == 'RPT_MONTH_YEAR':  # previously had CALENDAR_QUARTER
+        # given a quarter, set upper lower bounds for month
+        qtr = qtr[1:]
+        match qtr:
+            case '1':
+                lower = 1
+                upper = 3
+            case '2':
+                lower = 4
+                upper = 6
+            case '3':
+                lower = 7
+                upper = 9
+            case '4':
+                lower = 10
+                upper = 12
 
         month = '{}'.format(random.randint(lower, upper)).zfill(2)
         field_format = '{}{}'.format(year, str(month))
     else:
+        if field.friendly_name == 'Family Affiliation':
+            print('Family Affiliation')
         field_format = '#' * field_len
     return fake.bothify(text=field_format)
 
 
-def make_line(schemaMgr, section, year):
+def make_line(schemaMgr, section, year, qtr):
     """Take in a schema manager and returns a line of data."""
     line = ''
 
-    for row_schema in schemaMgr.schemas:  # this is to handle multi-schema like T6
-        for field in row_schema.fields:
-            line += validValues(row_schema, field, year)
+    #for row_schema in schemaMgr.schemas: # this is to handle multi-schema like T6
+    #if len(schemaMgr.schemas) > 1:
+    row_schema = schemaMgr.schemas[0]
+
+    for field in row_schema.fields:
+        line += validValues(row_schema, field, year, qtr)
+        print(f"Field: {field.name}, field length {field.endIndex - field.startIndex}, line so far: {line}")
     return line + '\n'
 
 def make_HT(schemaMgr, prog_type, section, year, quarter, stt):
@@ -120,8 +139,8 @@ def make_files(stt, sub_year, sub_quarter):
     for _, model in models_in_section.items():
         # below is equivalent to 'contains' for the tuple
         if any(section in long_section for section in ('Active Case', 'Closed Case', 'Aggregate', 'Stratum')):
-            for i in range(random.randint(5, 999)):
-                temp_file += make_line(model, section, cal_year)
+            for i in range(random.randint(1, 3)):
+                temp_file += make_line(model, section, cal_year, cal_quarter)
             # elif section in ['Aggregate Data', 'Stratum Data']:
             # # we should generate a smaller count of lines...maybe leave this as a TODO
             # # shouldn't this be based on the active/closed case data? 
@@ -158,9 +177,11 @@ class Command(BaseCommand):
 
     def handle(self, *args, **options):
         """Populate datafiles, records, summaries, and errors for all STTs."""
-        for stt in STT.objects.all():  # .filter(id__in=range(1,25))
-            for yr in range(2020, 2025):
-                for qtr in [1, 2, 3, 4]:
+
+
+        for stt in STT.objects.filter(id__in=range(1,2)): # .all():
+            for yr in range(2020, 2021):
+                for qtr in [1, 2]: #, 3, 4]:
                     files_for_qtr = make_files(stt, yr, qtr)
                     print(files_for_qtr)
                     for f in files_for_qtr.keys():
@@ -169,6 +190,34 @@ def handle(self, *args, **options):
                         dfs.datafile = df
                         # parse.parse_datafile(df, dfs)
                         parse_task(df.id, False)
+                    """
+
+                    # run validValues() and make_line() but only for T3 types
+                    from tdpservice.parsers.row_schema import SchemaManager
+                    from tdpservice.parsers.parse import manager_parse_line
+                    from tdpservice.parsers.util import make_generate_parser_error
+                    t3_model = get_schema_options(program='TAN', section='A', model_name='T3', query='models')
+
+                    quarter = 'Q2'
+
+                    datafile = DataFile.objects.create(
+                        user=User.objects.get_or_create(username='system')[0],
+                        stt=STT.objects.get(id=1),
+                        year=2021,
+                        quarter='Q3',
+                        original_filename='TAN-A-2021Q1.txt',
+                        section='Active Case',
+                        version=random.randint(1, 415),
+                    )
+                    generate_error = make_generate_parser_error(datafile, 1)
+                    print(type(t3_model))
+                    schemaMgr = t3_model
+
+                    for i in range(5):
+                        t3_line = make_line(schemaMgr, 'A', 2021, quarter)
+                        manager_parse_line(t3_line, schemaMgr, generate_error, datafile, is_encrypted=False)
+
+                    """
 
         # dump db in full using `make_seed` func
-        make_seed()
+        # make_seed()

From 014fddb1bc6fb0956c2703f7429ec491e99b7eab Mon Sep 17 00:00:00 2001
From: andrew-jameson
Date: Mon, 23 Sep 2024 13:54:44 -0400
Subject: [PATCH 15/15] Comment clean up, linting fix

---
 .../parsers/management/commands/seed_db.py    | 56 +++----------------
 1 file changed, 7 insertions(+), 49 deletions(-)

diff --git a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py
index bc105e9987..e8b8f61365 100644
--- a/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py
+++ b/tdrs-backend/tdpservice/parsers/management/commands/seed_db.py
@@ -58,19 +58,8 @@ def validValues(schemaMgr, field, year, qtr):
     elif field.name == 'RPT_MONTH_YEAR':  # previously had CALENDAR_QUARTER
         # given a quarter, set upper lower bounds for month
         qtr = qtr[1:]
-        match qtr:
-            case '1':
-                lower = 1
-                upper = 3
-            case '2':
-                lower = 4
-                upper = 6
-            case '3':
-                lower = 7
-                upper = 9
-            case '4':
-                lower = 10
-                upper = 12
+        upper = int(qtr) * 3  # 'Q1' -> 3, 'Q2' -> 6, 'Q3' -> 9, 'Q4' -> 12
+        lower = upper - 2  # e.g. 'Q3' yields months 07-09
 
         month = '{}'.format(random.randint(lower, upper)).zfill(2)
         field_format = '{}{}'.format(year, str(month))
@@ -85,8 +74,8 @@ def make_line(schemaMgr, section, year, qtr):
     """Take in a schema manager and returns a line of data."""
     line = ''
 
-    #for row_schema in schemaMgr.schemas: # this is to handle multi-schema like T6
-    #if len(schemaMgr.schemas) > 1:
+    # for row_schema in schemaMgr.schemas: # this is to handle multi-schema like T6
+    # if len(schemaMgr.schemas) > 1:
     row_schema = schemaMgr.schemas[0]
 
     for field in row_schema.fields:
@@ -177,47 +166,16 @@ class Command(BaseCommand):
 
     def handle(self, *args, **options):
         """Populate datafiles, records, summaries, and errors for all STTs."""
-
-
-        for stt in STT.objects.filter(id__in=range(1,2)): # .all():
+        for stt in STT.objects.filter(id__in=range(1, 2)):  # .all():
             for yr in range(2020, 2021):
-                for qtr in [1, 2]: #, 3, 4]:
+                for qtr in [1, 2]:  # , 3, 4]:
                    files_for_qtr = 
make_files(stt, yr, qtr) print(files_for_qtr) for f in files_for_qtr.keys(): df = files_for_qtr[f] dfs = DataFileSummaryFactory.build() dfs.datafile = df - # parse.parse_datafile(df, dfs) parse_task(df.id, False) - """ - - # run validValues() and make_line() but only for T3 types - from tdpservice.parsers.row_schema import SchemaManager - from tdpservice.parsers.parse import manager_parse_line - from tdpservice.parsers.util import make_generate_parser_error - t3_model = get_schema_options(program='TAN', section='A', model_name='T3', query='models') - - quarter = 'Q2' - - datafile = DataFile.objects.create( - user=User.objects.get_or_create(username='system')[0], - stt=STT.objects.get(id=1), - year=2021, - quarter='Q3', - original_filename='TAN-A-2021Q1.txt', - section='Active Case', - version=random.randint(1, 415), - ) - generate_error = make_generate_parser_error(datafile, 1) - print(type(t3_model)) - schemaMgr = t3_model - - for i in range(5): - t3_line = make_line(schemaMgr, 'A', 2021, quarter) - manager_parse_line(t3_line, schemaMgr, generate_error, datafile, is_encrypted=False) - - """ # dump db in full using `make_seed` func - # make_seed() + make_seed()
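
Usage sketch (assuming only what the series above sets up: the Taskfile targets
and the docker-compose web service): the seeder can be run end-to-end with

    task backend-exec-seed-db CMD="seed_db"

or directly inside the web container:

    docker-compose -f docker-compose.yml exec web sh -c "python ./manage.py seed_db"

For each STT in the filtered range this builds one file per section and fiscal
quarter (header, one to three records per record type, trailer), parses each
built DataFile through parse_task, and finally dumps the database to
/tdpapp/tdrs_db_seed.pg via make_seed().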