diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
index 5df6ef8de..11a98395b 100644
--- a/lhotse/bin/modes/recipes/__init__.py
+++ b/lhotse/bin/modes/recipes/__init__.py
@@ -62,6 +62,7 @@
 from .peoples_speech import *
 from .primewords import *
 from .rir_noise import *
+from .sbcsae import *
 from .speechcommands import *
 from .spgispeech import *
 from .stcmds import *
diff --git a/lhotse/bin/modes/recipes/sbcsae.py b/lhotse/bin/modes/recipes/sbcsae.py
new file mode 100644
index 000000000..23f36e09b
--- /dev/null
+++ b/lhotse/bin/modes/recipes/sbcsae.py
@@ -0,0 +1,37 @@
+from typing import Optional
+
+import click
+
+from lhotse.bin.modes import download, prepare
+from lhotse.recipes.sbcsae import download_sbcsae, prepare_sbcsae
+from lhotse.utils import Pathlike
+
+__all__ = ["sbcsae"]
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+def sbcsae(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+):
+    """SBCSAE data preparation."""
+    prepare_sbcsae(corpus_dir, output_dir=output_dir)
+
+
+@download.command(context_settings=dict(show_default=True))
+@click.argument("target_dir", type=click.Path())
+@click.option(
+    "--download-mp3",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help="Download the mp3 copy of the audio as well as wav.",
+)
+def sbcsae(
+    target_dir: Pathlike,
+    download_mp3: Optional[bool] = False,
+):
+    """SBCSAE download."""
+    download_sbcsae(target_dir, download_mp3=download_mp3)
diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py
index 57e129ea1..2fc7a6f0b 100644
--- a/lhotse/recipes/__init__.py
+++ b/lhotse/recipes/__init__.py
@@ -63,6 +63,7 @@
 from .nsc import prepare_nsc
 from .peoples_speech import prepare_peoples_speech
 from .rir_noise import download_rir_noise, prepare_rir_noise
+from .sbcsae import prepare_sbcsae
 from .speechcommands import download_speechcommands, prepare_speechcommands
 from .spgispeech import download_spgispeech, prepare_spgispeech
 from .stcmds import download_stcmds, prepare_stcmds
diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py
new file mode 100644
index 000000000..55883cfac
--- /dev/null
+++ b/lhotse/recipes/sbcsae.py
@@ -0,0 +1,904 @@
+"""
+This script downloads and prepares the data directory for the Santa Barbara
+Corpus of Spoken American English.
+
+The Santa Barbara Corpus of Spoken American English is based on a large body of
+recordings of naturally occurring spoken interaction from all over the United
+States. The Santa Barbara Corpus represents a wide variety of people of
+different regional origins, ages, occupations, genders, and ethnic and social
+backgrounds. The predominant form of language use represented is face-to-face
+conversation, but the corpus also documents many other ways that people use
+language in their everyday lives: telephone conversations, card games, food
+preparation, on-the-job talk, classroom lectures, sermons, story-telling, town
+hall meetings, tour-guide spiels, and more.
+
+The Santa Barbara Corpus was compiled by researchers in the Linguistics
+Department of the University of California, Santa Barbara. The Director of the
+Santa Barbara Corpus is John W. Du Bois, working with Associate Editors Wallace
+L. Chafe and Sandra A. Thompson (all of UC Santa Barbara), and Charles Meyer
+(UMass, Boston). For the publication of Parts 3 and 4, the authors are John W.
+Du Bois and Robert Englebretson.
+
+TODO: detail on splits and such
+"""
+import logging
+import re
+from pathlib import Path
+from typing import Dict, Optional, Union
+
+from tqdm import tqdm
+
+from lhotse import Recording, RecordingSet, SupervisionSegment, SupervisionSet
+from lhotse.qa import fix_manifests
+from lhotse.utils import Pathlike, fastcopy, is_module_available, resumable_download
+
+TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/"
+TALKBANK_WAV_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/0wav/"
+
+lang_iterators = {
+    "SBC004": iter(["Spanish"] * 17),
+    "SBC006": iter(["French"] * 2),
+    "SBC010": iter(["Spanish"]),
+    "SBC012": iter(["Greek"] * 2),
+    "SBC015": iter(["Spanish"] * 10),
+    "SBC025": iter(["German"] * 2 + ["Latin"]),
+    "SBC027": iter(["Spanish"] * 6 + ["French"] * 2),
+    "SBC031": iter(["French"] * 2),
+    "SBC033": iter(["French"]),
+    "SBC034": iter(["French"] * 3),
+    "SBC036": iter(["Spanish"] * 36),
+    "SBC037": iter(["Spanish"] * 60),
+    "SBC047": iter(["Spanish"]),
+    "SBC057": iter(["Japanese"] * 62),
+    "SBC058": iter(["Spanish"] + ["Italian"] * 2),
+}
+
+# These corrections to the participant metadata were needed to get geolocations
+# from the geopy package.
+annotation_corrections = {
+    "metro St.L. IL": "Saint Louis MO",  # Use the MO side of the city
+    "middle Wes MO": "Missouri",  # Just use the state location
+    "S.E.Texas TX": "South East Texas",  # The geo package seems to parse this
+    "South Alabama mostly AL": "Andalusia Alabama",  # Arbitrarily chosen nearby town
+    "South FL": "South Bay Florida",  # Arbitrarily chosen nearby town
+    "Walnut Cre CA": "Walnut Creek CA",  # Spelling error
+    "San Leandr CA": "San Leandro CA",
+    "Boston/Santa Fe MA/NM": "Boston/Santa Fe\tMA/NM",  # Handle this specially
+    "Boston/New Mexico MA/NM": "Boston/Santa Fe\tMA/NM",
+    "Millstad IL": "Millstadt IL",  # Spelling error
+    "Cleveland/San Francisco OH/CA": "Cleveland/San Fransisco\tOH/CA",  # Handle specially
+    "Jamesville WI": "Janesville WI",  # Spelling error
+    "Falls Church/Albuquerque VA/NM": "Falls Church/Albuquerque\tVA/NM",  # Handle specially
+    "Southern Florida": "South Bay Florida",  # Arbitrarily chosen nearby town
+    "Massachusetts MA": "Massachusetts",
+    "New Zealand n/a": "New Zealand",
+    "French n/a": "France",
+}
+
+bad_stereo = ["SBC020", "SBC021", "SBC027", "SBC028"]
+
+
+class Dummy_Spk_Iterator:
+    def __init__(self):
+        self.ind = 213
+
+    def next(self, spk="SBCXXX_X"):
+        self.ind = self.ind + 1
+        name = "_".join(spk.split("_")[1:])
+        if name.startswith("X") or name.startswith("AUD"):
+            name = "UNK"
+        return f"{self.ind:04d}_{name}"
+
+
+dummy_spk_iterator = Dummy_Spk_Iterator()
+
+
+def download_sbcsae(
+    target_dir: Pathlike = ".",
+    download_mp3: Optional[bool] = False,
+) -> Path:
+    """
+    Download the dataset. Due to availability/broken link issues, this downloads
+    from multiple sources.
+
+    :param target_dir: Pathlike, the path of the directory where the SBCSAE
+        dataset will be downloaded.
+    :param download_mp3: bool, if True download the mp3 files as well as wav.
+    :return: The path to the directory with the data.
+ """ + target_dir = Path(target_dir) + corpus_dir = target_dir / "SBCSAE" + corpus_dir.mkdir(parents=True, exist_ok=True) + + completed_detector = target_dir / ".sbcsae_completed" + if completed_detector.is_fil(): + logging.info(f"Skipping download because {completed_detector} exists.") + return corpus_dir + return "FALSE" + + +def prepare_sbcsae( + corpus_dir: Pathlike, + output_dir: Optional[Pathlike] = None, +) -> Dict[str, Union[RecordingSet, SupervisionSet]]: + """ + Prepares manifest for SBCSAE dataset. + + :param: corpus_dir: Path to the root where SBCSAE data was downloaded. It + should be called SBCSAE. There is no consistent formatting between + releases of the data. Check script comments for details if using an + existing corpus download rather than Lhotse's download script. + :param: output_dir: Root directory where .json manifests are stored. + :return: + """ + # Resolve corpus_dir type + if isinstance(corpus_dir, str): + corpus_dir = Path(corpus_dir) + + # Resolve output_dir type + if isinstance(output_dir, str): + output_dir = Path(output_dir) + + audio_dir = corpus_dir / "WAV" + recordings = RecordingSet.from_recordings( + Recording.from_file(p) for p in audio_dir.glob("*.wav") + ) + if len(recordings) == 0: + logging.warning(f"No .wav files found in {audio_dir}") + + doc_dir = corpus_dir / "documentation" + spk2gen_dict, spk2glob_dict = generate_speaker_map_dicts(doc_dir) + spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) + supervisions = [] + trn_dir = corpus_dir / "TRN" + for p in tqdm(list(trn_dir.glob("*.trn")), "Collecting and normalizing transcripts ..."): + for supervision in _filename_to_supervisions(p, spk2gen_dict, spk2glob_dict): + supervisions.append(supervision) + + if len(supervisions) == 0: + logging.warning(f"No supervisions found in {trn_dir}") + + + supervisions_ = [] + for s in supervisions: + if s.duration < 0.02: + s_ = s.pad(pad=0.02) + else: + s_ = s + if s_.speaker in spk_coords: + # Just use the first location if there is more than one + s_.custom = { + 'lat': spk_coords[s.speaker][0][0], + 'lon': spk_coords[s.speaker][0][1] + } + + if ( + not isinstance(recordings[s.recording_id].channel_ids, list) or + len(recordings[s.recording_id].channel_ids) < 2 or + s.recording_id in bad_stereo + ): + s_.channel = recordings[s.recording_id].channel_ids[0] + supervisions_.append(s_) + + + supervisions = SupervisionSet.from_segments(supervisions_) + recordings, supervisions = fix_manifests(recordings, supervisions) + + if output_dir is not None: + if isinstance(output_dir, str): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + recordings.to_file(output_dir / "sbcsae_recordings.jsonl.gz") + supervisions.to_file(output_dir / "sbcsae_supervisions.jsonl.gz") + + manifests = {"recordings": recordings, "supervisions": supervisions} + + return manifests + + +def generate_geolocations(corpus: Path, spk2glob_dict: dict): + if not is_module_available("geopy"): + raise ImportError( + "geopy package not found. Please install..." 
" (pip install geopy)" + ) + else: + from geopy.geocoders import Nominatim + from geopy import geocoders + + speakers = corpus.rglob("documentation/LDC*/speaker.tbl") + # This geolocator object is repsonsible for generating a + # latitiude and longitude from a textual description of a location, i.e., + # CHICAGO IL --> (41,-87) + geolocator = Nominatim(user_agent='myapplication') + spk_coords = {} + for spk in tqdm(list(speakers), "Generating speaker geolocations..."): + with open(spk) as f: + for l in f: + vals = l.strip().split(",") + if len(vals) < 5: + continue + # Check non-empty + empty_hometown = vals[4] in ("", "?") + empty_state = vals[5] in ("", "?") + if empty_hometown and not empty_state: + loc = vals[5] + ", United States" + elif not empty_hometown: + orig_loc = vals[4] + " " + vals[5] + loc = annotation_corrections.get(orig_loc, orig_loc) + else: + continue + if "/" in loc: + try: + hometowns, states = loc.split("\t", 1) + hometowns = hometowns.split("/") + states = states.split("/") + coords = [] + for h, s in zip(hometowns, states): + coords.append(geolocator.geocode(f"{h} {s}", timeout=None)[1]) + except ValueError: + states, country = loc.split(",", 1) + coords = [] + for s in states.split("/"): + coords.append(geolocator.geocode(f"{s}, {country}", timeout=None)[1]) + else: + coords = [geolocator.geocode(loc, timeout=None)[1]] + spk_coords[vals[0]] = coords + spknum2spk_name = {n.split("_")[0]: n for s, n in spk2glob_dict.items()} + spk_coords_ = {} + for s in spk_coords: + if s in spknum2spk_name: + spk_coords_[spknum2spk_name[s]] = spk_coords[s] + return spk_coords_ + + +def generate_speaker_map_dicts(doc_dir: Path): + spk2gen_dict = dict() + spk2glob_dict = dict() + + spk_num_to_reco_ids = dict() + for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: + filename = doc_dir / LDC_split / "segment.tbl" + for line in filename.read_text().split("\n"): + if "speaker:" in line: + line = line.replace(" 0", "\t0") + reco_id = re.sub(r"sbc0?([0-9]{3})\s.*", r"SBC\1", line) + spk_num = line.split("\t")[-1][:4] + if spk_num not in spk_num_to_reco_ids: + spk_num_to_reco_ids[spk_num] = [] + if reco_id not in spk_num_to_reco_ids[spk_num]: + spk_num_to_reco_ids[spk_num].append(reco_id) + + for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: + filename = doc_dir / LDC_split / "speaker.tbl" + for line in filename.read_text().split("\n"): + if "," not in line: + continue + line = line.replace("0163,Dan,m", "0166,Dan,M") + spk_num, name, gen = line.split(",")[:3] + name = ( + name.replace(" (extra-corpus)", "").upper().split(" ")[-1].split("/")[0] + ) + gen = gen.upper() + if not gen: + gen = None + + if spk_num in ["0069", "0091", "0092", "0097"]: + continue + for reco in spk_num_to_reco_ids[spk_num]: + spk2gen_dict[reco + "_" + name] = gen + spk2glob_dict[reco + "_" + name] = spk_num + "_" + name + + for LDC_split in ["LDC2004S10"]: + seg_list = [] + filename = doc_dir / LDC_split / "segment.tbl" + for line in filename.read_text().split("\n"): + if "speaker:" in line: + reco_id = re.sub(r"sbc0?([0-9]{3})\s.*", r"SBC\1", line) + name = line.split(" ")[-1].upper().split("/")[0] + seg_list.append([name, reco_id]) + + spk_list = [] + filename = doc_dir / LDC_split / "speaker.tbl" + for line in filename.read_text().split("\n"): + if "," not in line: + continue + spk_num, name, gen = line.split(",")[:3] + name = name.upper().split("/")[0] + spk_list.append([name, spk_num, gen]) + + for seg_info, spk_info in zip(seg_list, spk_list): + assert seg_info[0] == spk_info[0], 
f"{seg_info[0]} != {spk_info[0]}" + spk2gen_dict[seg_info[1] + "_" + seg_info[0]] = spk_info[2] + spk2glob_dict[seg_info[1] + "_" + seg_info[0]] = ( + spk_info[1] + "_" + spk_info[0] + ) + + for spk_key in [ + "SBC006_ALL", + "SBC008_ALL", + "SBC012_MANY", + "SBC020_AUD", + "SBC021_MANY", + "SBC023_MANY", + "SBC025_AUD", + "SBC026_AUD", + "SBC027_MANY", + "SBC027_AUD", + "SBC028_BOTH", + "SBC030_AUD", + "SBC038_AUD", + "SBC053_RADIO", + "SBC054_AUD", + "SBC054_MANY", + "SBC055_AUD", + ]: + spk2gen_dict[spk_key] = None + spk2glob_dict[spk_key] = spk_key + + return spk2gen_dict, spk2glob_dict + + +def _filename_to_supervisions(filename: Path, spk2gen_dict: dict, spk2glob_dict: dict): + reco_id = filename.stem.split(".")[0] + lines = filename.read_text(encoding="latin1") + supervisions = [] + + #### Transcript fix + lines = lines.replace("\x92", "'") + lines = lines.replace("\u007f", "") + lines = lines.replace("\u0000", "c") + + if reco_id == "SBC002": + lines = lines.replace("(TSK ", "(TSK) ") + elif reco_id == "SBC004": + lines = lines.replace("KATE", "KATHY") + lines = lines.replace("sen~orita", "se\xf1orita") + elif reco_id == "SBC005": + lines = lines.replace("good_/god/", "good") + lines = lines.replace("(H)@>", "(H) @>") + lines = lines.replace("[@@ <@Mm@>]", "[@@ <@ Mm @>]") + elif reco_id == "SBC006": + lines = lines.replace("/pub/", "pub") + lines = lines.replace("", "") + lines = lines.replace("[2(H)2]1", "[2(H)2]") + elif reco_id == "SBC007": + lines = lines.replace( + "\\000000000 000000000 MARY: 1182.90 1186.92\t ", + "\n1182.90 1186.92\tMARY: ", + ) + lines = lines.replace("(YAWN0", "(YAWN)") + elif reco_id == "SBC008": + lines = lines.replace("[", "[") + elif reco_id == "SBC012": + lines = lines.replace( + "\n".join(["807.02 807.92\tFRANK: \t.. Mhm."] * 2), + "807.02 807.92\tFRANK: \t.. 
Mhm.", + ) + lines = lines.replace("MONTOYA", "MONTOYO") + elif reco_id == "SBC013": + lines = lines.replace("[8<@She8]", "[8<@ She8]") + lines = lines.replace("[2(H) cou_ couch@>2]", "[2(H) cou_ couch @>2]") + lines = lines.replace("[4<@No=4]", "[4<@ No=4]") + lines = lines.replace("VOX2]", "VOX>2]") + elif reco_id == "SBC014": + lines = lines.replace("\\000000000 000000000 ", "\n") + lines = lines.replace("<@he thought", "<@ he thought") + elif reco_id == "SBC015": + lines = lines.replace( + "243.055\t244.080\tKEN:\t(H)] the little,", + "243.465\t244.670\tKEN:\t(H)] the little,", + ) + lines = lines.replace("\u0000urch things.", "church things.") + lines = lines.replace("2(H]=2", "2(H)=2") + lines = lines.replace(" 0.000000e+00", "e") + lines = lines.replace("0m=,", "um=,") + lines = lines.replace("0eople", "people") + lines = lines.replace("0id", "did") + lines = lines.replace("X 0ne %tho", "X uh line %tho") + lines = lines.replace("and 0t [was]", "and it [was]") + lines = lines.replace("0t was like", "it was like") + elif reco_id == "SBC016": + lines = lines.replace("/sed ai/", "sed ai") + elif reco_id == "SBC017": + lines = lines.replace("a\tand names the] na=me,", "and names the] na=me,") + lines = lines.replace(" 0.000000e+00", "e") + lines = lines.replace("[2I mean2", "[2I mean2]") + lines = lines.replace("no2.", "no.") + lines = lines.replace("0rganisms", "organisms") + lines = lines.replace("0ttle", "little") + elif reco_id == "SBC018": + lines = lines.replace("0f", "if") + elif reco_id == "SBC019": + lines = lines.replace("cello_(/cheller/)", "cheller") + lines = lines.replace("(sigh)", "(SIGH)") + lines = lines.replace(" Mo=m", "]", "[]") + lines = lines.replace("5]", "X>5]") + lines = lines.replace("0nly", "uh only") + lines = lines.replace("[50r5]", "[5Or5]") + elif reco_id == "SBC024": + lines = lines.replace(" >ENV: ", ">ENV:\t") + lines = lines.replace(" 0.000000irst", "First") + lines = lines.replace("2[cause", "[2cause") + lines = lines.replace(" 0oes", "does") + lines = lines.replace("0id]", "did]") + elif reco_id == "SBC025": + lines = lines.replace("", "<@ Oh[2= @>") + lines = lines.replace(" 0.000000", " ") + lines = lines.replace("i 0f", "i- if") + lines = lines.replace("0f we", "if we") + lines = lines.replace("th- 0t's", "th- that's") + lines = lines.replace("0t's", "it's") + lines = lines.replace("0f", "if") + elif reco_id == "SBC029": + lines = lines.replace("96.230\t98.240\t>ENV: ", "96.230\t98.240\t>ENV:\t") + lines = lines.replace("(H )", "(H)") + lines = lines.replace("<0h=,", "<% Oh=,") + lines = lines.replace("knowX>]", "know X>]") + lines = lines.replace("0verheating", "overheating") + elif reco_id == "SBC030": + lines = lines.replace("DANNY", "BRADLEY") + lines = lines.replace("AUD:\tYes", "X:\tYes") + elif reco_id == "SBC034": + lines = lines.replace("13548.02 ", "1354.802") + elif reco_id == "SBC036": + lines = lines.replace( + "1558.463\t1558.906\t\t[thought he was,", + "1558.906\t1558.923\t\t[thought he was,", + ) + elif reco_id == "SBC038": + lines = lines.replace("AUD:\t... What's", "X_2:\t... What's") + lines = lines.replace("AUD:\t... U", "X_3:\t... U") + lines = lines.replace("AUD:\t... How far", "X_2:\t... 
How far") + lines = lines.replace("AUD:\t", "") + lines = lines.replace("ANNETTE", "ANETTE") + elif reco_id == "SBC048": + lines = lines.replace("<@in San[2ta", "<@ in San[2ta") + elif reco_id == "SBC052": + lines = lines.replace("~Janine\t said", "~Janine said") + elif reco_id == "SBC054": + lines = lines.replace("", "") + lines = lines.replace("AUD:\tX", "X:\tX") + lines = lines.replace("AUD:\t") + lines = lines.replace("sensei", "") + lines = lines.replace("ippon", "Ippon") + lines = lines.replace("Ippon", "") + lines = re.sub(r"gi([^a-z])", r"\1", lines) + lines = re.sub(r"Makikomi([^-])", r"\1", lines) + lines = lines.replace("Hane-goshi", "") + lines = lines.replace("Sode-makikomi", "") + lines = lines.replace("shiai", "") + lines = lines.replace("randori", "") + lines = re.sub(r"Sode([^-])", r"\1", lines) + lines = lines.replace("Ukemi", "") + lines = lines.replace("Ha-jime", "") + lines = lines.replace("Ude-garami", "") + lines = lines.replace("Hane-uchi-mata", "") + lines = lines.replace("Uchi-", "Uchi-mata") + lines = lines.replace("Uchi-mata", "") + lines = lines.replace("Hande-maki- \1", lines) + lines = lines.replace("%Sode-maki[komi]", "") + lines = lines.replace("Tsuri-komi", "") + lines = lines.replace("Uchi-komi", "") + lines = lines.replace("O-uchi", "") + lines = lines.replace("Goshi", "") + lines = lines.replace("Uchi]-mata", "") + lines = lines.replace("Komi", "") + lines = lines.replace("Tani-otoshi", "") + lines = lines.replace("Hane-maki][2komi=", "") + lines = lines.replace("Makikomi-waza", "") + lines = lines.replace("Seoi", "") + lines = lines.replace("uke", "") + elif reco_id == "SBC059": + lines = lines.replace("[]", "hour[6=6] F>") + + spk_buffer = "" + lang_buffer = "English" + for line in lines.split("\n"): + #### Transcript fixes + if line == "77.200\t77.540 :\t(H)": + continue + if line.startswith("000000000 000000000 ") or line.startswith("0.00 0.00"): + continue + if line.startswith("\t"): + line.lstrip("\t") + if "and in his pamphlet the Liber Arbetrio" in line: + continue + + line = line.strip() + line = re.sub(r" +", " ", line) + line = re.sub(r"\t+", "\t", line) + fields = line.strip().split("\t") + if len(fields) == 4: + spk_field, raw_trans = fields[2:] + start, end = [float(time.rstrip()) for time in fields[:2]] + elif len(fields) == 3: + if len(fields[0].rstrip().split(" ")) > 1: + spk_field, raw_trans = fields[1:] + start, end = [float(time) for time in fields[0].split(" ")[:2]] + raw_trans = fields[-1] + else: + start, end = [float(time.rstrip()) for time in fields[:2]] + spk_field_candidate = fields[2].split(" ")[0] + if re.fullmatch(r"[A-Z]+:", spk_field_candidate): + spk_field = spk_field_candidate + raw_trans = " ".join(fields[2].split(" ")[1:]) + else: + spk_field = "" + raw_trans = fields[2] + elif len(fields) == 2: + timesish = fields[0].rstrip().split(" ") + if len(timesish) == 1: + continue + start, end = [float(time) for time in timesish[:2]] + if len(timesish) > 2: + spk_field = timesish[2] + raw_trans = fields[1] + else: + spk_field_candidate = fields[1].split(" ")[0] + if re.fullmatch(r"[A-Z]+:", spk_field_candidate): + spk_field = spk_field_candidate + raw_trans = " ".join(fields[1].split(" ")[1:]) + else: + spk_field = "" + raw_trans = fields[1] + else: + split = line.split(" ") + if re.fullmatch(r"[0-9]+\.[0-9]+", split[0]) and re.fullmatch( + r"[0-9]+\.[0-9]+", split[1] + ): + start, end = [float(time.rstrip()) for time in split[:2]] + if re.fullmatch(r"[A-Z]+:", split[2]): + spk_field = split[2] + raw_trans = " 
".join(split[3:]) + else: + spk_field = "" + raw_trans = " ".join(split[2:]) + else: + continue + + #### Transcript fixes + if raw_trans == "[2ENV", "ENV", ">MAC", ">DOG", ">HORSE", ">CAT", ">BABY"]: + continue + elif spk_field == "#READ": + spk_field = "WALT" + + if spk_field: + spk_field = re.sub(r"^[^A-Z]", "", spk_field) + spk_buffer = spk_field + + utt_id = f"{reco_id}_{int(start*1000):07}_{int(end*1000):07}_{spk_buffer}" + + text, lang_tag = _parse_raw_transcript(raw_trans) + + if "l" in lang_tag: + for _ in range(lang_tag.count("l")): + new_lang = next(lang_iterators[reco_id]) + if "c" in lang_tag: + lang_buffer = f"English-{new_lang}" + else: + lang_buffer = new_lang + elif "c" in lang_tag: + lang_buffer = f"English-{lang_buffer.split('-')[-1]}" + + spk_key = reco_id + "_" + spk_buffer + if spk_key not in spk2glob_dict and reco_id != "SBC021": + spk2gen_dict[spk_key] = None + spk2glob_dict[spk_key] = dummy_spk_iterator.next(spk_key) + + if spk_key in spk2glob_dict: + speaker = spk2glob_dict[spk_key] + gender = spk2gen_dict[spk_key] + else: + speaker = dummy_spk_iterator.next(spk_key) + gender = None + + if re.search(r"[A-Za-z]", text): + supervisions.append( + SupervisionSegment( + id=utt_id, + recording_id=reco_id, + start=start, + duration=end - start, + channel=[0, 1], + text=text, + language=lang_buffer, + speaker=speaker, + gender=gender, + ) + ) + + if lang_tag: + if lang_tag[-1] == "r": + lang_buffer = "English" + if lang_tag[-1] == "l": + lang_buffer = lang_buffer.split("-")[-1] + + return supervisions + + +def _parse_raw_transcript(transcript: str): + + transcript = transcript.replace("0h", "oh") + transcript = transcript.replace("s@so", "s- so") + transcript = transcript.replace("la@ter", "later") + transcript = transcript.replace("you@.", "you @.") + transcript = transcript.replace("[N=]", "N") + transcript = transcript.replace("[2C2]=", "C") + transcript = transcript.replace("[MM=]", "MM") + transcript = transcript.replace("[I=]", "I") + + transcript = transcript.replace("(YELL)", "") + + transcript = transcript.replace("_", "-") + + transcript = transcript.replace("=", "") + transcript = transcript.replace("%", "") + + # Process overlapped UNKs before they get removed by the following step + transcript = re.sub(r"\[([2-9]?)([A-Z])+\1\]", r"\2", transcript) + + # Paired parenthetical/bracket annotation remover + paren_matches = re.findall(r"\([^a-z@ ]*\)", transcript) + for paren_match in paren_matches: + transcript = transcript.replace( + paren_match, re.sub(r"[^\[\]]", "", paren_match) + ) + brack_matches = re.findall(r"\[[^a-z@ ]+\]", transcript) + for brack_match in brack_matches: + transcript = transcript.replace( + brack_match, re.sub(r"[^\(\)]", "", brack_match) + ) + + transcript = re.sub(r"<<[^a-z@ ]+>>", "", transcript) + transcript = re.sub(r"<<[^a-z@ ]+", "", transcript) + transcript = re.sub(r"[^a-z@ ]+>>", "", transcript) + + transcript = re.sub(r"<[^a-z@ ]+>", "", transcript) + transcript = re.sub(r"<[^a-z2 ]*[^2 ]([ <])", r"\1", transcript) + transcript = re.sub(r"([ >])[^a-z2 ]*[^a-z 2]>", r"\1", transcript) + + transcript = re.sub(r"\[[2-9]?", "", transcript) + transcript = re.sub(r"[2-9]?\]", "", transcript) + + transcript = transcript.replace("(Hx)", " ") + transcript = transcript.replace("(hx)", " ") + transcript = transcript.replace("(@Hx)", "@") + + transcript = transcript.replace("(COUGH COUGH)", " ") + transcript = transcript.replace("(SNIFF", "") + + transcript = transcript.replace("(", "") + transcript = transcript.replace(")", "") + + 
transcript = transcript.replace("< ", " ") + transcript = transcript.replace(" >", " ") + + transcript = re.sub(r"[^A-Za-z-]-+", "", transcript) + transcript = re.sub(r"\.\.+", "", transcript) + + transcript = transcript.replace("+", "") + transcript = transcript.replace("&", "") + transcript = transcript.replace("#", "") + transcript = transcript.replace("*", "") + + transcript = re.sub(r"!([A-Za-z])", r"\1", transcript) + + # Deal with extra white space + transcript = re.sub(r" +", " ", transcript) + + # Merge X's + transcript = re.sub(r"X+", "X", transcript) + + # Parse laughter + transcript = transcript.replace("on@,", "on @,") + transcript = re.sub(r"([a-z-])@([a-z])", r"\1\2", transcript) + transcript = re.sub(r"@+", "@", transcript) + transcript = re.sub(r"(^| )@([^ ])", r" @ \2", transcript) + transcript = re.sub(r"([^ ])@( |$)", r"\1 @ ", transcript) + transcript = transcript.replace("@ @", "@").replace("@ @", "@") + + transcript = re.sub(r"(^| )X([ ,.?']|$)", r"\1\2", transcript) + transcript = re.sub(r"(^| )X([ ,.?']|$)", r"\1\2", transcript) + transcript = re.sub(r"X-($| )", r"\1", transcript) + + transcript = re.sub(r"^ ", "", transcript) + transcript = re.sub(r" $", "", transcript) + + transcript = transcript.replace(" .", ".") + transcript = transcript.replace(" ,", ",") + transcript = transcript.replace(" ?", "?") + + transcript = re.sub(r"^\. ", "", transcript) + transcript = re.sub(r"^\.$", "", transcript) + + if ( + len(transcript.split(" 1 + and re.search(r"[A-Za-z]", transcript.split("")) > 1 + and re.search(r"[A-Za-z]", transcript.split("L2>")[-1]) + ): + lang_tag = "c" + else: + lang_tag = "" + + transcript = transcript.replace("@", "") + transcript = transcript.replace("", "") + + if "L2" in transcript: + lang_tag = lang_tag + re.sub( + r"()(?!.*()).*$", + r"\1", + re.sub(r".*?()", r"\1", transcript), + ) + lang_tag = lang_tag.replace("", "r") + + # We choose to leave the language tags in, but uncommenting this would remove them. + # transcript = transcript.replace("", "") + + return transcript, lang_tag