diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
index 5df6ef8de..11a98395b 100644
--- a/lhotse/bin/modes/recipes/__init__.py
+++ b/lhotse/bin/modes/recipes/__init__.py
@@ -62,6 +62,7 @@
 from .peoples_speech import *
 from .primewords import *
 from .rir_noise import *
+from .sbcsae import *
 from .speechcommands import *
 from .spgispeech import *
 from .stcmds import *
diff --git a/lhotse/bin/modes/recipes/sbcsae.py b/lhotse/bin/modes/recipes/sbcsae.py
new file mode 100644
index 000000000..23f36e09b
--- /dev/null
+++ b/lhotse/bin/modes/recipes/sbcsae.py
@@ -0,0 +1,37 @@
+from typing import Optional
+
+import click
+
+from lhotse.bin.modes import download, prepare
+from lhotse.recipes.sbcsae import download_sbcsae, prepare_sbcsae
+from lhotse.utils import Pathlike
+
+__all__ = ["sbcsae"]
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+def sbcsae(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+):
+    """SBCSAE data preparation."""
+    prepare_sbcsae(corpus_dir, output_dir=output_dir)
+
+
+@download.command(context_settings=dict(show_default=True))
+@click.argument("target_dir", type=click.Path())
+@click.option(
+    "--download-mp3",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help="Download the mp3 copy of the audio as well as wav.",
+)
+def sbcsae(
+    target_dir: Pathlike,
+    download_mp3: Optional[bool] = False,
+):
+    """SBCSAE download."""
+    download_sbcsae(target_dir, download_mp3=download_mp3)
diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py
index 57e129ea1..2fc7a6f0b 100644
--- a/lhotse/recipes/__init__.py
+++ b/lhotse/recipes/__init__.py
@@ -63,6 +63,7 @@
 from .nsc import prepare_nsc
 from .peoples_speech import prepare_peoples_speech
 from .rir_noise import download_rir_noise, prepare_rir_noise
+from .sbcsae import prepare_sbcsae
 from .speechcommands import download_speechcommands, prepare_speechcommands
 from .spgispeech import download_spgispeech, prepare_spgispeech
 from .stcmds import download_stcmds, prepare_stcmds
diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py
new file mode 100644
index 000000000..55883cfac
--- /dev/null
+++ b/lhotse/recipes/sbcsae.py
@@ -0,0 +1,904 @@
+"""
+This script downloads and prepares the data directory for the Santa Barbara
+Corpus of Spoken American English.
+
+The Santa Barbara Corpus of Spoken American English is based on a large body of
+recordings of naturally occurring spoken interaction from all over the United
+States. The Santa Barbara Corpus represents a wide variety of people of
+different regional origins, ages, occupations, genders, and ethnic and social
+backgrounds. The predominant form of language use represented is face-to-face
+conversation, but the corpus also documents many other ways that people use
+language in their everyday lives: telephone conversations, card games, food
+preparation, on-the-job talk, classroom lectures, sermons, story-telling, town
+hall meetings, tour-guide spiels, and more.
+
+The Santa Barbara Corpus was compiled by researchers in the Linguistics
+Department of the University of California, Santa Barbara. The Director of the
+Santa Barbara Corpus is John W. Du Bois, working with Associate Editors Wallace
+L. Chafe and Sandra A. Thompson (all of UC Santa Barbara), and Charles Meyer
+(UMass, Boston). For the publication of Parts 3 and 4, the authors are John W.
+Du Bois and Robert Englebretson.
+
+TODO: detail on splits and such
+"""
+import logging
+import re
+from pathlib import Path
+from typing import Dict, Optional, Union
+
+from tqdm import tqdm
+
+from lhotse import Recording, RecordingSet, SupervisionSegment, SupervisionSet
+from lhotse.qa import fix_manifests
+from lhotse.utils import Pathlike, fastcopy, is_module_available, resumable_download
+
+TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/"
+TALKBANK_WAV_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/0wav/"
+
+lang_iterators = {
+    "SBC004": iter(["Spanish"] * 17),
+    "SBC006": iter(["French"] * 2),
+    "SBC010": iter(["Spanish"]),
+    "SBC012": iter(["Greek"] * 2),
+    "SBC015": iter(["Spanish"] * 10),
+    "SBC025": iter(["German"] * 2 + ["Latin"]),
+    "SBC027": iter(["Spanish"] * 6 + ["French"] * 2),
+    "SBC031": iter(["French"] * 2),
+    "SBC033": iter(["French"]),
+    "SBC034": iter(["French"] * 3),
+    "SBC036": iter(["Spanish"] * 36),
+    "SBC037": iter(["Spanish"] * 60),
+    "SBC047": iter(["Spanish"]),
+    "SBC057": iter(["Japanese"] * 62),
+    "SBC058": iter(["Spanish"] + ["Italian"] * 2),
+}
+
+# These corrections to the participant metadata were needed to get geolocations
+# from the geopy package.
+annotation_corrections = {
+    "metro St.L. IL": "Saint Louis MO",  # Use the MO side of the city
+    "middle Wes MO": "Missouri",  # Just use the state location
+    "S.E.Texas TX": "South East Texas",  # The geo package seems to parse this
+    "South Alabama mostly AL": "Andalusia Alabama",  # Arbitrarily chosen nearby town
+    "South FL": "South Bay Florida",  # Arbitrarily chosen nearby town
+    "Walnut Cre CA": "Walnut Creek CA",  # Spelling error
+    "San Leandr CA": "San Leandro CA",
+    "Boston/Santa Fe MA/NM": "Boston/Santa Fe\tMA/NM",  # Handle this specially
+    "Boston/New Mexico MA/NM": "Boston/Santa Fe\tMA/NM",
+    "Millstad IL": "Millstadt IL",  # Spelling error
+    "Cleveland/San Francisco OH/CA": "Cleveland/San Fransisco\tOH/CA",  # Handle specially
+    "Jamesville WI": "Janesville WI",  # Spelling error
+    "Falls Church/Albuquerque VA/NM": "Falls Church/Albuquerque\tVA/NM",  # Handle specially
+    "Southern Florida": "South Bay Florida",  # Arbitrarily chosen nearby town
+    "Massachusetts MA": "Massachusetts",
+    "New Zealand n/a": "New Zealand",
+    "French n/a": "France",
+}
+
+bad_stereo = ["SBC020", "SBC021", "SBC027", "SBC028"]
+
+
+class Dummy_Spk_Iterator:
+    def __init__(self):
+        self.ind = 213
+
+    def next(self, spk="SBCXXX_X"):
+        self.ind = self.ind + 1
+        name = "_".join(spk.split("_")[1:])
+        if name.startswith("X") or name.startswith("AUD"):
+            name = "UNK"
+        return f"{self.ind:04d}_{name}"
+
+
+dummy_spk_iterator = Dummy_Spk_Iterator()
+
+
+def download_sbcsae(
+    target_dir: Pathlike = ".",
+    download_mp3: Optional[bool] = False,
+) -> Path:
+    """
+    Download the dataset. Due to availability/broken link issues, this downloads
+    from multiple sources.
+
+    :param target_dir: Pathlike, the path of the directory where the SBCSAE
+        dataset will be downloaded.
+    :param download_mp3: bool, if True download the mp3 files as well as wav.
+    :return: The path to the directory with the data.
+ """ + target_dir = Path(target_dir) + corpus_dir = target_dir / "SBCSAE" + corpus_dir.mkdir(parents=True, exist_ok=True) + + completed_detector = target_dir / ".sbcsae_completed" + if completed_detector.is_fil(): + logging.info(f"Skipping download because {completed_detector} exists.") + return corpus_dir + return "FALSE" + + +def prepare_sbcsae( + corpus_dir: Pathlike, + output_dir: Optional[Pathlike] = None, +) -> Dict[str, Union[RecordingSet, SupervisionSet]]: + """ + Prepares manifest for SBCSAE dataset. + + :param: corpus_dir: Path to the root where SBCSAE data was downloaded. It + should be called SBCSAE. There is no consistent formatting between + releases of the data. Check script comments for details if using an + existing corpus download rather than Lhotse's download script. + :param: output_dir: Root directory where .json manifests are stored. + :return: + """ + # Resolve corpus_dir type + if isinstance(corpus_dir, str): + corpus_dir = Path(corpus_dir) + + # Resolve output_dir type + if isinstance(output_dir, str): + output_dir = Path(output_dir) + + audio_dir = corpus_dir / "WAV" + recordings = RecordingSet.from_recordings( + Recording.from_file(p) for p in audio_dir.glob("*.wav") + ) + if len(recordings) == 0: + logging.warning(f"No .wav files found in {audio_dir}") + + doc_dir = corpus_dir / "documentation" + spk2gen_dict, spk2glob_dict = generate_speaker_map_dicts(doc_dir) + spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) + supervisions = [] + trn_dir = corpus_dir / "TRN" + for p in tqdm(list(trn_dir.glob("*.trn")), "Collecting and normalizing transcripts ..."): + for supervision in _filename_to_supervisions(p, spk2gen_dict, spk2glob_dict): + supervisions.append(supervision) + + if len(supervisions) == 0: + logging.warning(f"No supervisions found in {trn_dir}") + + + supervisions_ = [] + for s in supervisions: + if s.duration < 0.02: + s_ = s.pad(pad=0.02) + else: + s_ = s + if s_.speaker in spk_coords: + # Just use the first location if there is more than one + s_.custom = { + 'lat': spk_coords[s.speaker][0][0], + 'lon': spk_coords[s.speaker][0][1] + } + + if ( + not isinstance(recordings[s.recording_id].channel_ids, list) or + len(recordings[s.recording_id].channel_ids) < 2 or + s.recording_id in bad_stereo + ): + s_.channel = recordings[s.recording_id].channel_ids[0] + supervisions_.append(s_) + + + supervisions = SupervisionSet.from_segments(supervisions_) + recordings, supervisions = fix_manifests(recordings, supervisions) + + if output_dir is not None: + if isinstance(output_dir, str): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + recordings.to_file(output_dir / "sbcsae_recordings.jsonl.gz") + supervisions.to_file(output_dir / "sbcsae_supervisions.jsonl.gz") + + manifests = {"recordings": recordings, "supervisions": supervisions} + + return manifests + + +def generate_geolocations(corpus: Path, spk2glob_dict: dict): + if not is_module_available("geopy"): + raise ImportError( + "geopy package not found. Please install..." 
" (pip install geopy)" + ) + else: + from geopy.geocoders import Nominatim + from geopy import geocoders + + speakers = corpus.rglob("documentation/LDC*/speaker.tbl") + # This geolocator object is repsonsible for generating a + # latitiude and longitude from a textual description of a location, i.e., + # CHICAGO IL --> (41,-87) + geolocator = Nominatim(user_agent='myapplication') + spk_coords = {} + for spk in tqdm(list(speakers), "Generating speaker geolocations..."): + with open(spk) as f: + for l in f: + vals = l.strip().split(",") + if len(vals) < 5: + continue + # Check non-empty + empty_hometown = vals[4] in ("", "?") + empty_state = vals[5] in ("", "?") + if empty_hometown and not empty_state: + loc = vals[5] + ", United States" + elif not empty_hometown: + orig_loc = vals[4] + " " + vals[5] + loc = annotation_corrections.get(orig_loc, orig_loc) + else: + continue + if "/" in loc: + try: + hometowns, states = loc.split("\t", 1) + hometowns = hometowns.split("/") + states = states.split("/") + coords = [] + for h, s in zip(hometowns, states): + coords.append(geolocator.geocode(f"{h} {s}", timeout=None)[1]) + except ValueError: + states, country = loc.split(",", 1) + coords = [] + for s in states.split("/"): + coords.append(geolocator.geocode(f"{s}, {country}", timeout=None)[1]) + else: + coords = [geolocator.geocode(loc, timeout=None)[1]] + spk_coords[vals[0]] = coords + spknum2spk_name = {n.split("_")[0]: n for s, n in spk2glob_dict.items()} + spk_coords_ = {} + for s in spk_coords: + if s in spknum2spk_name: + spk_coords_[spknum2spk_name[s]] = spk_coords[s] + return spk_coords_ + + +def generate_speaker_map_dicts(doc_dir: Path): + spk2gen_dict = dict() + spk2glob_dict = dict() + + spk_num_to_reco_ids = dict() + for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: + filename = doc_dir / LDC_split / "segment.tbl" + for line in filename.read_text().split("\n"): + if "speaker:" in line: + line = line.replace(" 0", "\t0") + reco_id = re.sub(r"sbc0?([0-9]{3})\s.*", r"SBC\1", line) + spk_num = line.split("\t")[-1][:4] + if spk_num not in spk_num_to_reco_ids: + spk_num_to_reco_ids[spk_num] = [] + if reco_id not in spk_num_to_reco_ids[spk_num]: + spk_num_to_reco_ids[spk_num].append(reco_id) + + for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: + filename = doc_dir / LDC_split / "speaker.tbl" + for line in filename.read_text().split("\n"): + if "," not in line: + continue + line = line.replace("0163,Dan,m", "0166,Dan,M") + spk_num, name, gen = line.split(",")[:3] + name = ( + name.replace(" (extra-corpus)", "").upper().split(" ")[-1].split("/")[0] + ) + gen = gen.upper() + if not gen: + gen = None + + if spk_num in ["0069", "0091", "0092", "0097"]: + continue + for reco in spk_num_to_reco_ids[spk_num]: + spk2gen_dict[reco + "_" + name] = gen + spk2glob_dict[reco + "_" + name] = spk_num + "_" + name + + for LDC_split in ["LDC2004S10"]: + seg_list = [] + filename = doc_dir / LDC_split / "segment.tbl" + for line in filename.read_text().split("\n"): + if "speaker:" in line: + reco_id = re.sub(r"sbc0?([0-9]{3})\s.*", r"SBC\1", line) + name = line.split(" ")[-1].upper().split("/")[0] + seg_list.append([name, reco_id]) + + spk_list = [] + filename = doc_dir / LDC_split / "speaker.tbl" + for line in filename.read_text().split("\n"): + if "," not in line: + continue + spk_num, name, gen = line.split(",")[:3] + name = name.upper().split("/")[0] + spk_list.append([name, spk_num, gen]) + + for seg_info, spk_info in zip(seg_list, spk_list): + assert seg_info[0] == spk_info[0], 
f"{seg_info[0]} != {spk_info[0]}" + spk2gen_dict[seg_info[1] + "_" + seg_info[0]] = spk_info[2] + spk2glob_dict[seg_info[1] + "_" + seg_info[0]] = ( + spk_info[1] + "_" + spk_info[0] + ) + + for spk_key in [ + "SBC006_ALL", + "SBC008_ALL", + "SBC012_MANY", + "SBC020_AUD", + "SBC021_MANY", + "SBC023_MANY", + "SBC025_AUD", + "SBC026_AUD", + "SBC027_MANY", + "SBC027_AUD", + "SBC028_BOTH", + "SBC030_AUD", + "SBC038_AUD", + "SBC053_RADIO", + "SBC054_AUD", + "SBC054_MANY", + "SBC055_AUD", + ]: + spk2gen_dict[spk_key] = None + spk2glob_dict[spk_key] = spk_key + + return spk2gen_dict, spk2glob_dict + + +def _filename_to_supervisions(filename: Path, spk2gen_dict: dict, spk2glob_dict: dict): + reco_id = filename.stem.split(".")[0] + lines = filename.read_text(encoding="latin1") + supervisions = [] + + #### Transcript fix + lines = lines.replace("\x92", "'") + lines = lines.replace("\u007f", "") + lines = lines.replace("\u0000", "c") + + if reco_id == "SBC002": + lines = lines.replace("(TSK ", "(TSK) ") + elif reco_id == "SBC004": + lines = lines.replace("KATE", "KATHY") + lines = lines.replace("sen~orita", "se\xf1orita") + elif reco_id == "SBC005": + lines = lines.replace("good_/god/", "good") + lines = lines.replace("(H)@>", "(H) @>") + lines = lines.replace("[@@ <@Mm@>]", "[@@ <@ Mm @>]") + elif reco_id == "SBC006": + lines = lines.replace("/pub/", "pub") + lines = lines.replace("", "") + lines = lines.replace("[2(H)2]1", "[2(H)2]") + elif reco_id == "SBC007": + lines = lines.replace( + "\\000000000 000000000 MARY: 1182.90 1186.92\t ", + "\n1182.90 1186.92\tMARY: ", + ) + lines = lines.replace("(YAWN0", "(YAWN)") + elif reco_id == "SBC008": + lines = lines.replace("[", "[") + elif reco_id == "SBC012": + lines = lines.replace( + "\n".join(["807.02 807.92\tFRANK: \t.. Mhm."] * 2), + "807.02 807.92\tFRANK: \t.. 
Mhm.", + ) + lines = lines.replace("MONTOYA", "MONTOYO") + elif reco_id == "SBC013": + lines = lines.replace("[8<@She8]", "[8<@ She8]") + lines = lines.replace("[2(H) cou_ couch@>2]", "[2(H) cou_ couch @>2]") + lines = lines.replace("[4<@No=4]", "[4<@ No=4]") + lines = lines.replace("VOX2]", "VOX>2]") + elif reco_id == "SBC014": + lines = lines.replace("\\000000000 000000000 ", "\n") + lines = lines.replace("<@he thought", "<@ he thought") + elif reco_id == "SBC015": + lines = lines.replace( + "243.055\t244.080\tKEN:\t(H)] the little,", + "243.465\t244.670\tKEN:\t(H)] the little,", + ) + lines = lines.replace("\u0000urch things.", "church things.") + lines = lines.replace("2(H]=2", "2(H)=2") + lines = lines.replace(" 0.000000e+00", "e") + lines = lines.replace("0m=,", "um=,") + lines = lines.replace("0eople", "people") + lines = lines.replace("0id", "did") + lines = lines.replace("X 0ne %tho", "X uh line %tho") + lines = lines.replace("and 0t [was]", "and it [was]") + lines = lines.replace("0t was like", "it was like") + elif reco_id == "SBC016": + lines = lines.replace("/sed ai/", "sed ai") + elif reco_id == "SBC017": + lines = lines.replace("a\tand names the] na=me,", "and names the] na=me,") + lines = lines.replace(" 0.000000e+00", "e") + lines = lines.replace("[2I mean2", "[2I mean2]") + lines = lines.replace("no2.", "no.") + lines = lines.replace("0rganisms", "organisms") + lines = lines.replace("0ttle", "little") + elif reco_id == "SBC018": + lines = lines.replace("0f", "if") + elif reco_id == "SBC019": + lines = lines.replace("cello_(/cheller/)", "cheller") + lines = lines.replace("(sigh)", "(SIGH)") + lines = lines.replace(" Mo=m", "]", "[]") + lines = lines.replace("5]", "X>5]") + lines = lines.replace("0nly", "uh only") + lines = lines.replace("[50r5]", "[5Or5]") + elif reco_id == "SBC024": + lines = lines.replace(" >ENV: ", ">ENV:\t") + lines = lines.replace(" 0.000000irst", "First") + lines = lines.replace("2[cause", "[2cause") + lines = lines.replace(" 0oes", "does") + lines = lines.replace("0id]", "did]") + elif reco_id == "SBC025": + lines = lines.replace("", "<@ Oh[2= @>") + lines = lines.replace(" 0.000000", " ") + lines = lines.replace("i 0f", "i- if") + lines = lines.replace("0f we", "if we") + lines = lines.replace("th- 0t's", "th- that's") + lines = lines.replace("0t's", "it's") + lines = lines.replace("0f", "if") + elif reco_id == "SBC029": + lines = lines.replace("96.230\t98.240\t>ENV: ", "96.230\t98.240\t>ENV:\t") + lines = lines.replace("(H )", "(H)") + lines = lines.replace("<0h=,", "<% Oh=,") + lines = lines.replace("knowX>]", "know X>]") + lines = lines.replace("0verheating", "overheating") + elif reco_id == "SBC030": + lines = lines.replace("DANNY", "BRADLEY") + lines = lines.replace("AUD:\tYes", "X:\tYes") + elif reco_id == "SBC034": + lines = lines.replace("13548.02 ", "1354.802") + elif reco_id == "SBC036": + lines = lines.replace( + "1558.463\t1558.906\t\t[thought he was,", + "1558.906\t1558.923\t\t[thought he was,", + ) + elif reco_id == "SBC038": + lines = lines.replace("AUD:\t... What's", "X_2:\t... What's") + lines = lines.replace("AUD:\t... U", "X_3:\t... U") + lines = lines.replace("AUD:\t... How far", "X_2:\t... 
How far") + lines = lines.replace("AUD:\t", "") + lines = lines.replace("ANNETTE", "ANETTE") + elif reco_id == "SBC048": + lines = lines.replace("<@in San[2ta", "<@ in San[2ta") + elif reco_id == "SBC052": + lines = lines.replace("~Janine\t said", "~Janine said") + elif reco_id == "SBC054": + lines = lines.replace("", "") + lines = lines.replace("AUD:\tX", "X:\tX") + lines = lines.replace("AUD:\t") + lines = lines.replace("sensei", "") + lines = lines.replace("ippon", "Ippon") + lines = lines.replace("Ippon", "") + lines = re.sub(r"gi([^a-z])", r"\1", lines) + lines = re.sub(r"Makikomi([^-])", r"\1", lines) + lines = lines.replace("Hane-goshi", "") + lines = lines.replace("Sode-makikomi", "") + lines = lines.replace("shiai", "") + lines = lines.replace("randori", "") + lines = re.sub(r"Sode([^-])", r"\1", lines) + lines = lines.replace("Ukemi", "") + lines = lines.replace("Ha-jime", "") + lines = lines.replace("Ude-garami", "") + lines = lines.replace("Hane-uchi-mata", "") + lines = lines.replace("Uchi-", "Uchi-mata") + lines = lines.replace("Uchi-mata", "") + lines = lines.replace("Hande-maki- \1", lines) + lines = lines.replace("%Sode-maki[komi]", "") + lines = lines.replace("Tsuri-komi", "") + lines = lines.replace("Uchi-komi", "") + lines = lines.replace("O-uchi", "") + lines = lines.replace("Goshi", "") + lines = lines.replace("Uchi]-mata", "") + lines = lines.replace("Komi", "") + lines = lines.replace("Tani-otoshi", "") + lines = lines.replace("Hane-maki][2komi=", "") + lines = lines.replace("Makikomi-waza", "") + lines = lines.replace("Seoi", "") + lines = lines.replace("uke", "") + elif reco_id == "SBC059": + lines = lines.replace("[]", "hour[6=6] F>") + + spk_buffer = "" + lang_buffer = "English" + for line in lines.split("\n"): + #### Transcript fixes + if line == "77.200\t77.540 :\t(H)": + continue + if line.startswith("000000000 000000000 ") or line.startswith("0.00 0.00"): + continue + if line.startswith("\t"): + line.lstrip("\t") + if "and in his pamphlet the Liber Arbetrio" in line: + continue + + line = line.strip() + line = re.sub(r" +", " ", line) + line = re.sub(r"\t+", "\t", line) + fields = line.strip().split("\t") + if len(fields) == 4: + spk_field, raw_trans = fields[2:] + start, end = [float(time.rstrip()) for time in fields[:2]] + elif len(fields) == 3: + if len(fields[0].rstrip().split(" ")) > 1: + spk_field, raw_trans = fields[1:] + start, end = [float(time) for time in fields[0].split(" ")[:2]] + raw_trans = fields[-1] + else: + start, end = [float(time.rstrip()) for time in fields[:2]] + spk_field_candidate = fields[2].split(" ")[0] + if re.fullmatch(r"[A-Z]+:", spk_field_candidate): + spk_field = spk_field_candidate + raw_trans = " ".join(fields[2].split(" ")[1:]) + else: + spk_field = "" + raw_trans = fields[2] + elif len(fields) == 2: + timesish = fields[0].rstrip().split(" ") + if len(timesish) == 1: + continue + start, end = [float(time) for time in timesish[:2]] + if len(timesish) > 2: + spk_field = timesish[2] + raw_trans = fields[1] + else: + spk_field_candidate = fields[1].split(" ")[0] + if re.fullmatch(r"[A-Z]+:", spk_field_candidate): + spk_field = spk_field_candidate + raw_trans = " ".join(fields[1].split(" ")[1:]) + else: + spk_field = "" + raw_trans = fields[1] + else: + split = line.split(" ") + if re.fullmatch(r"[0-9]+\.[0-9]+", split[0]) and re.fullmatch( + r"[0-9]+\.[0-9]+", split[1] + ): + start, end = [float(time.rstrip()) for time in split[:2]] + if re.fullmatch(r"[A-Z]+:", split[2]): + spk_field = split[2] + raw_trans = " 
".join(split[3:]) + else: + spk_field = "" + raw_trans = " ".join(split[2:]) + else: + continue + + #### Transcript fixes + if raw_trans == "[2ENV", "ENV", ">MAC", ">DOG", ">HORSE", ">CAT", ">BABY"]: + continue + elif spk_field == "#READ": + spk_field = "WALT" + + if spk_field: + spk_field = re.sub(r"^[^A-Z]", "", spk_field) + spk_buffer = spk_field + + utt_id = f"{reco_id}_{int(start*1000):07}_{int(end*1000):07}_{spk_buffer}" + + text, lang_tag = _parse_raw_transcript(raw_trans) + + if "l" in lang_tag: + for _ in range(lang_tag.count("l")): + new_lang = next(lang_iterators[reco_id]) + if "c" in lang_tag: + lang_buffer = f"English-{new_lang}" + else: + lang_buffer = new_lang + elif "c" in lang_tag: + lang_buffer = f"English-{lang_buffer.split('-')[-1]}" + + spk_key = reco_id + "_" + spk_buffer + if spk_key not in spk2glob_dict and reco_id != "SBC021": + spk2gen_dict[spk_key] = None + spk2glob_dict[spk_key] = dummy_spk_iterator.next(spk_key) + + if spk_key in spk2glob_dict: + speaker = spk2glob_dict[spk_key] + gender = spk2gen_dict[spk_key] + else: + speaker = dummy_spk_iterator.next(spk_key) + gender = None + + if re.search(r"[A-Za-z]", text): + supervisions.append( + SupervisionSegment( + id=utt_id, + recording_id=reco_id, + start=start, + duration=end - start, + channel=[0, 1], + text=text, + language=lang_buffer, + speaker=speaker, + gender=gender, + ) + ) + + if lang_tag: + if lang_tag[-1] == "r": + lang_buffer = "English" + if lang_tag[-1] == "l": + lang_buffer = lang_buffer.split("-")[-1] + + return supervisions + + +def _parse_raw_transcript(transcript: str): + + transcript = transcript.replace("0h", "oh") + transcript = transcript.replace("s@so", "s- so") + transcript = transcript.replace("la@ter", "later") + transcript = transcript.replace("you@.", "you @.") + transcript = transcript.replace("[N=]", "N") + transcript = transcript.replace("[2C2]=", "C") + transcript = transcript.replace("[MM=]", "MM") + transcript = transcript.replace("[I=]", "I") + + transcript = transcript.replace("(YELL)", "") + + transcript = transcript.replace("_", "-") + + transcript = transcript.replace("=", "") + transcript = transcript.replace("%", "") + + # Process overlapped UNKs before they get removed by the following step + transcript = re.sub(r"\[([2-9]?)([A-Z])+\1\]", r"\2", transcript) + + # Paired parenthetical/bracket annotation remover + paren_matches = re.findall(r"\([^a-z@ ]*\)", transcript) + for paren_match in paren_matches: + transcript = transcript.replace( + paren_match, re.sub(r"[^\[\]]", "", paren_match) + ) + brack_matches = re.findall(r"\[[^a-z@ ]+\]", transcript) + for brack_match in brack_matches: + transcript = transcript.replace( + brack_match, re.sub(r"[^\(\)]", "", brack_match) + ) + + transcript = re.sub(r"<<[^a-z@ ]+>>", "", transcript) + transcript = re.sub(r"<<[^a-z@ ]+", "", transcript) + transcript = re.sub(r"[^a-z@ ]+>>", "", transcript) + + transcript = re.sub(r"<[^a-z@ ]+>", "", transcript) + transcript = re.sub(r"<[^a-z2 ]*[^2 ]([ <])", r"\1", transcript) + transcript = re.sub(r"([ >])[^a-z2 ]*[^a-z 2]>", r"\1", transcript) + + transcript = re.sub(r"\[[2-9]?", "", transcript) + transcript = re.sub(r"[2-9]?\]", "", transcript) + + transcript = transcript.replace("(Hx)", " ") + transcript = transcript.replace("(hx)", " ") + transcript = transcript.replace("(@Hx)", "@") + + transcript = transcript.replace("(COUGH COUGH)", " ") + transcript = transcript.replace("(SNIFF", "") + + transcript = transcript.replace("(", "") + transcript = transcript.replace(")", "") + + 
transcript = transcript.replace("< ", " ") + transcript = transcript.replace(" >", " ") + + transcript = re.sub(r"[^A-Za-z-]-+", "", transcript) + transcript = re.sub(r"\.\.+", "", transcript) + + transcript = transcript.replace("+", "") + transcript = transcript.replace("&", "") + transcript = transcript.replace("#", "") + transcript = transcript.replace("*", "") + + transcript = re.sub(r"!([A-Za-z])", r"\1", transcript) + + # Deal with extra white space + transcript = re.sub(r" +", " ", transcript) + + # Merge X's + transcript = re.sub(r"X+", "X", transcript) + + # Parse laughter + transcript = transcript.replace("on@,", "on @,") + transcript = re.sub(r"([a-z-])@([a-z])", r"\1\2", transcript) + transcript = re.sub(r"@+", "@", transcript) + transcript = re.sub(r"(^| )@([^ ])", r" @ \2", transcript) + transcript = re.sub(r"([^ ])@( |$)", r"\1 @ ", transcript) + transcript = transcript.replace("@ @", "@").replace("@ @", "@") + + transcript = re.sub(r"(^| )X([ ,.?']|$)", r"\1\2", transcript) + transcript = re.sub(r"(^| )X([ ,.?']|$)", r"\1\2", transcript) + transcript = re.sub(r"X-($| )", r"\1", transcript) + + transcript = re.sub(r"^ ", "", transcript) + transcript = re.sub(r" $", "", transcript) + + transcript = transcript.replace(" .", ".") + transcript = transcript.replace(" ,", ",") + transcript = transcript.replace(" ?", "?") + + transcript = re.sub(r"^\. ", "", transcript) + transcript = re.sub(r"^\.$", "", transcript) + + if ( + len(transcript.split(" 1 + and re.search(r"[A-Za-z]", transcript.split("")) > 1 + and re.search(r"[A-Za-z]", transcript.split("L2>")[-1]) + ): + lang_tag = "c" + else: + lang_tag = "" + + transcript = transcript.replace("@", "") + transcript = transcript.replace("", "") + + if "L2" in transcript: + lang_tag = lang_tag + re.sub( + r"()(?!.*()).*$", + r"\1", + re.sub(r".*?()", r"\1", transcript), + ) + lang_tag = lang_tag.replace("", "r") + + # We choose to leave the language tags in, but uncommenting this would remove them. + # transcript = transcript.replace("", "") + + return transcript, lang_tag