diff --git a/wizard/parsers/gps/jm.py b/wizard/parsers/gps/jm.py
index 72105a7..7e68f55 100644
--- a/wizard/parsers/gps/jm.py
+++ b/wizard/parsers/gps/jm.py
@@ -1,18 +1,20 @@
 import io
 import csv
+import re
 import pandas as pd
 from parsers.parser_base import Parser, Parsable
 from parsers.helpers import stream_chunk_contains
 
 
-class GPS2JMParser(Parser):
+class GPS2JMParser7_5(Parser):
     '''
-    Parser for 2Jm format
+    Parser for 2Jm format v 7.5
     '''
 
     DATATYPE = "gps_2jm"
     # TODO: define fields
     FIELDS = [str(x) for x in range(0,13)]
+    VERSION = "v7.5"
     SEPARATOR = " "
     ENDINGS = [
         "[EOF]",
@@ -42,24 +44,42 @@ class GPS2JMParser(Parser):
     #     "ring_nr": None,
     #     "trip_nr": None,
     # }
+
+    def _fix_content(self, data):
+        '''
+        Hook to normalise the raw data group before it is parsed.
+        The base implementation returns the data unchanged.
+        '''
+        return data
 
 
     def __init__(self, parsable: Parsable):
         super().__init__(parsable)
 
-        with self.file.get_stream(binary=False) as stream:
+        with self.file.get_stream(binary=False, errors='backslashreplace') as stream:
+            # TODO: check the first byte instead of the whole stream chunk
             if not stream.seekable():
                 self._raise_not_supported('Stream not seekable')
 
             if not stream_chunk_contains(stream, 30, "2JmGPS-LOG"):
                 self._raise_not_supported(f"Stream must start with 2JmGPS-LOG")
 
-            groups = stream.read().split('\n\n')[1:]
+            groups = stream.read().split('\n\n')
+            head = groups.pop(0)
+
+            if self.VERSION not in head:
+                self._raise_not_supported("Version not supported")
+
             data = None
             for group in groups:
                 if group in self.ENDINGS:
                     break
                 data = group
+
+            # Only normalise when a data group was found; data stays None otherwise
+            if data is not None:
+                data = self._fix_content(data)
+
             content = io.StringIO(data)
             reader = csv.reader(content, delimiter=self.SEPARATOR, skipinitialspace=True)
 
@@ -69,6 +89,26 @@ def __init__(self, parsable: Parsable):
 
             self.data = pd.read_csv(content, header=0, names=self.FIELDS, sep=self.SEPARATOR, index_col=False)
 
+
+# Two or more consecutive spaces/tabs; newlines are deliberately excluded
+# so that collapsing the padding can never merge two rows.
+regex = re.compile(r'[ \t]{2,}')
+
+class GPS2JMParser8(GPS2JMParser7_5):
+    '''
+    Parser for 2Jm format v 8
+    '''
+    VERSION = "v8"
+
+    def _fix_content(self, data: str):
+        '''
+        In version 8 a specific column is right-aligned by padding it
+        with extra whitespace. Collapse every run of two or more spaces
+        or tabs into a single space; line breaks are left untouched.
+        '''
+        return regex.sub(' ', data)
+
 PARSERS = [
-    GPS2JMParser,
+    GPS2JMParser7_5,
+    GPS2JMParser8,
 ]
diff --git a/wizard/parsers/helpers.py b/wizard/parsers/helpers.py
index acf832a..8a46ea6 100644
--- a/wizard/parsers/helpers.py
+++ b/wizard/parsers/helpers.py
@@ -18,7 +18,6 @@ def stream_chunk_match(stream, length, text):
     position = stream.tell()
     chunk = stream.read(length)
     stream.seek(position)
-    print(chunk)
     return re.search(text, chunk)
 
 
diff --git a/wizard/parsers/parser_base.py b/wizard/parsers/parser_base.py
index 8d743df..75223d3 100644
--- a/wizard/parsers/parser_base.py
+++ b/wizard/parsers/parser_base.py
@@ -25,10 +25,15 @@ def __init__(self, file_path: pathlib.Path) -> None:
         self.encoding = self._detect_encoding()
 
     @contextmanager
-    def get_stream(self, binary=False):
+    def get_stream(self, binary=False, errors="strict"):
+        '''
+        Open the file at self._file_path and yield the stream.
+        errors is passed to open() in text mode; binary mode passes None.
+        '''
         params = {
             'mode': 'rb' if binary else 'r',
-            'encoding': None if binary else self.encoding
+            'encoding': None if binary else self.encoding,
+            'errors': errors if not binary else None,
         }
         stream = open(self._file_path, **params)
         yield stream