From 28ada09ded30382912f18bba2d0879269cbc7712 Mon Sep 17 00:00:00 2001 From: Jindra Helcl Date: Tue, 31 Oct 2023 13:24:44 +0100 Subject: [PATCH] safer stripping, fail on too many tabs --- opuscleaner/filters/normalize_whitespace.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/opuscleaner/filters/normalize_whitespace.py b/opuscleaner/filters/normalize_whitespace.py index 880095a..85baaf2 100755 --- a/opuscleaner/filters/normalize_whitespace.py +++ b/opuscleaner/filters/normalize_whitespace.py @@ -17,17 +17,17 @@ def collapse_whitespace(s): def clean(collapse): """Runs the filter.""" - for line in sys.stdin: - fields = line.strip().split("\t") + for i, line in enumerate(sys.stdin): + fields = line.split("\t") if len(fields) == 1: src = fields[0].strip() trg = None - else: - # Similar to max_length filter, here we throw away potential - # newlines. + elif len(fields) == 2: src = fields[0].strip() trg = fields[1].strip() + else: + raise ValueError(f"Too many tabs on input line {i + 1}") if collapse: src = collapse_whitespace(src)