From a3755a8cfcae4d2d86ee41aad238cbeccd14fd89 Mon Sep 17 00:00:00 2001 From: Xi Bai Date: Tue, 23 Aug 2022 10:25:30 +0100 Subject: [PATCH 01/20] prepare for the new release (#74) * support py310 and tensorflow 2.8 --- Pipfile | 14 +++++++------- README.md | 5 +++-- docker/docker-compose.yml | 6 ------ requirements-dev.txt | 2 +- requirements-translation.txt | 2 +- requirements.txt | 10 +++++----- subaligner/__init__.py | 2 ++ subaligner/_version.py | 2 +- subaligner/trainer.py | 4 ++-- subaligner/translator.py | 4 ++-- tests/integration/feature/subaligner_train.feature | 2 +- 11 files changed, 25 insertions(+), 28 deletions(-) diff --git a/Pipfile b/Pipfile index 871c0b0..55c4e39 100644 --- a/Pipfile +++ b/Pipfile @@ -13,7 +13,7 @@ snakeviz = "==2.1.0" line-profiler = "==3.0.2" scikit-build = "==0.11.1" radish-bdd = "~=0.13.3" -pex = "==2.1.15" +pex = "<=2.1.80" mypy = "==0.790" parameterized = "==0.8.1" pylint = "~=2.8.2" @@ -45,7 +45,7 @@ google-auth-oauthlib = "==0.4.2" google-pasta = "~=0.2" graphviz = "==0.8.3" HeapDict = "==1.0.0" -h5py = "~=2.10.0" +h5py = "<=3.6.0" html5lib = "==1.0b9" hyperopt = "==0.2.4" idna = "==2.8" @@ -62,7 +62,7 @@ Markdown = "==2.6.11" mccabe = "==0.6.1" msgpack-python = "==0.5.6" numba = ">=0.50.0" -numpy = "<1.23.0" +numpy = "<1.24.0" oauthlib = "==3.1.0" pbr = "==4.0.2" pluggy = "==0.13.1" @@ -77,7 +77,7 @@ pylint = "==2.5.0" pyparsing = "==2.2.0" pyprof2calltree = "==1.4.3" pysrt = "==1.1.1" -pysubs2 = "==0.2.4" +pysubs2 = "<=1.4.2" pystack-debugger = "==0.8.0" python-dateutil = "==2.7.2" pytz = "==2018.4" @@ -85,17 +85,17 @@ PyYAML = ">=4.2b1" requests = "~=2.25.1" requests-oauthlib = "==1.3.0" rsa = "==4.7" -scipy = "~=1.5.4" +scipy = "<=1.8.1" scikit-learn = ">=0.19.1" sentencepiece = "~=0.1.95" setuptools = ">=41.0.0" six = "~=1.15.0" tblib = "==1.3.2" -tensorflow = ">=1.15.5,<2.8" +tensorflow = ">=1.15.5,<2.9" termcolor = "==1.1.0" toml = "==0.10.0" toolz = "==0.9.0" -torch = "~=1.8.1" +torch = "<=1.12.0" tornado = "==5.1.0" transformers = "~=4.5.1" typing-extensions = "~=3.7.0" diff --git a/README.md b/README.md index 6cc2432..634adac 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,12 @@ [![Build Status](https://github.com/baxtree/subaligner/actions/workflows/ci-pipeline.yml/badge.svg?branch=master)](https://github.com/baxtree/subaligner/actions/workflows/ci-pipeline.yml?query=branch%3Amaster) ![Codecov](https://img.shields.io/codecov/c/github/baxtree/subaligner) -[![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/) [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-380/) [![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/) +[![Python 3.10](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org/downloads/release/python-3100/) [![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/) [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-380/) [![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/) [![Documentation Status](https://readthedocs.org/projects/subaligner/badge/?version=latest)](https://subaligner.readthedocs.io/en/latest/?badge=latest) [![GitHub 
license](https://img.shields.io/github/license/baxtree/subaligner)](https://github.com/baxtree/subaligner/blob/master/LICENSE) [![PyPI](https://badge.fury.io/py/subaligner.svg)](https://badge.fury.io/py/subaligner) -[![Docker](https://img.shields.io/docker/cloud/build/baxtree/subaligner?label=Docker&style=flat)](https://hub.docker.com/r/baxtree/subaligner/builds) +[![Docker Build](https://img.shields.io/docker/cloud/build/baxtree/subaligner?label=Docker&style=flat)](https://hub.docker.com/r/baxtree/subaligner/builds) +[![Docker Pulls](https://img.shields.io/docker/pulls/baxtree/subaligner)](https://hub.docker.com/r/baxtree/subaligner) [![Citation](https://zenodo.org/badge/228440472.svg)](https://doi.org/10.5281/zenodo.5603083) ## Supported Formats diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index b127ea5..424665b 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -7,12 +7,6 @@ services: dockerfile: Dockerfile-CentOS7 image: baxtree/subaligner:${SUBALIGNER_VERSION}.el7 - subaligner-centos8: - build: - context: ./ - dockerfile: Dockerfile-CentOS8 - image: baxtree/subaligner:${SUBALIGNER_VERSION}.el8 - subaligner-ubuntu18: build: context: ./ diff --git a/requirements-dev.txt b/requirements-dev.txt index 55e0085..d6b5f10 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,7 +7,7 @@ snakeviz==2.1.0 line-profiler==3.1.0 scikit-build==0.11.1 radish-bdd~=0.13.3 -pex==2.1.34 +pex<=2.1.80 mypy==0.931 types-requests==2.27.9 types-setuptools==57.4.9 diff --git a/requirements-translation.txt b/requirements-translation.txt index 1fa5e3c..a400733 100644 --- a/requirements-translation.txt +++ b/requirements-translation.txt @@ -1,4 +1,4 @@ pycountry~=20.7.3 sentencepiece~=0.1.95 -torch~=1.8.1 +torch<1.13.0 transformers~=4.5.1 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ff3a994..ee3ba41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,7 +21,7 @@ google-auth-oauthlib==0.4.2 google-pasta~=0.2 graphviz==0.8.3 HeapDict==1.0.0 -h5py~=3.1.0 +h5py<=3.6.0 html5lib==1.0b9 hyperopt==0.2.4 idna==2.8 @@ -38,7 +38,7 @@ mccabe==0.6.1 networkx>=2.5.1 msgpack-python==0.5.6 numba>=0.50.0 -numpy<1.23.0 +numpy<1.24.0 oauthlib==3.1.0 pbr==4.0.2 pluggy==0.13.1 @@ -51,19 +51,19 @@ pydot-ng==1.0.0 pydotplus==2.0.2 pyprof2calltree==1.4.3 pysrt==1.1.1 -pysubs2==0.2.4 +pysubs2<=1.4.2 pystack-debugger==0.8.0 pytz==2018.4 PyYAML>=4.2b1 requests~=2.25.1 requests-oauthlib==1.3.0 rsa==4.7 -scipy~=1.5.4 +scipy<=1.8.1 scikit-learn~=0.24.2 setuptools>=41.0.0 six~=1.15.0 tblib==1.3.2 -tensorflow>=1.15.5,<2.8 +tensorflow>=1.15.5,<2.9 termcolor==1.1.0 toml==0.10.0 toolz==0.9.0 diff --git a/subaligner/__init__.py b/subaligner/__init__.py index f748777..e3206b2 100644 --- a/subaligner/__init__.py +++ b/subaligner/__init__.py @@ -1,5 +1,7 @@ +import os import multiprocessing as mp from ._version import __version__ __all__ = ["__version__"] mp.set_start_method("spawn", force=True) +os.environ["KMP_WARNINGS"] = "0" diff --git a/subaligner/_version.py b/subaligner/_version.py index cdd5ff7..0f6fa4c 100644 --- a/subaligner/_version.py +++ b/subaligner/_version.py @@ -1,2 +1,2 @@ """The semver for the current release.""" -__version__ = "0.2.4" +__version__ = "0.2.5" diff --git a/subaligner/trainer.py b/subaligner/trainer.py index 95f5a6b..eb0c25e 100644 --- a/subaligner/trainer.py +++ b/subaligner/trainer.py @@ -315,8 +315,8 @@ def __extract_data_and_label_from_avs( train_data = [x for x in train_data if x is not None] labels = [x for x in 
labels if x is not None] - train_data = np.concatenate(train_data) - labels = np.concatenate(labels) + train_data: np.ndarray = np.concatenate(train_data) # type: ignore + labels: np.ndarray = np.concatenate(labels) # type: ignore self.__LOGGER.debug( "Data and labels extracted after {} seconds".format( str(datetime.datetime.now() - extraction_start) diff --git a/subaligner/translator.py b/subaligner/translator.py index 78c548c..0cc504e 100644 --- a/subaligner/translator.py +++ b/subaligner/translator.py @@ -128,8 +128,8 @@ def translate(self, subs: List[SubRipItem]) -> List[SubRipItem]: num_of_batches = math.ceil(len(src_texts) / Translator.__TRANSLATING_BATCH_SIZE) self.__LOGGER.info("Translating %s subtitle cue(s)..." % len(src_texts)) for batch in tqdm(Translator.__batch(src_texts, Translator.__TRANSLATING_BATCH_SIZE), total=num_of_batches): - tokenizer = self.tokenizer(batch, return_tensors=Translator.__TENSOR_TYPE, padding=True) - translated = self.lang_model.generate(**tokenizer) + input_ids = self.tokenizer(batch, return_tensors=Translator.__TENSOR_TYPE, padding=True) + translated = self.lang_model.generate(**input_ids) translated_texts.extend([self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]) for index in range(len(new_subs)): new_subs[index].text = translated_texts[index] diff --git a/tests/integration/feature/subaligner_train.feature b/tests/integration/feature/subaligner_train.feature index 68cb446..2fcda5a 100644 --- a/tests/integration/feature/subaligner_train.feature +++ b/tests/integration/feature/subaligner_train.feature @@ -53,7 +53,7 @@ Feature: Subaligner CLI Then it exits with code "21" @train @embedded-subtitle - Scenario: Test training on video and embedded subtitles + Scenario: Test training on video with embedded subtitles Given I have an audiovisual file directory "av_embedded" And I want to save the training output in directory "output" When I run the subaligner_train with subtitle selector "embedded:stream_index=0,file_extension=srt" and the following options From 845878ee97b062ade2bbacdb6dfd457bd5893e8f Mon Sep 17 00:00:00 2001 From: baxtree Date: Tue, 23 Aug 2022 10:36:23 +0100 Subject: [PATCH 02/20] update dependencies --- Pipfile | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Pipfile b/Pipfile index 55c4e39..5f09cf8 100644 --- a/Pipfile +++ b/Pipfile @@ -39,7 +39,7 @@ Cython = "~=0.29.22" dask = "<2022.1.0" decorator = "==4.3.0" distributed = "==1.13.0" -filelock = "==3.0.12" +filelock = "<4.0.0" google-auth = "==1.27.0" google-auth-oauthlib = "==0.4.2" google-pasta = "~=0.2" diff --git a/requirements.txt b/requirements.txt index ee3ba41..5d2a4a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ Cython~=0.29.22 dask<2022.1.0 decorator==4.3.0 distributed==1.13.0 -filelock==3.0.12 +filelock<4.0.0 google-auth==1.27.0 google-auth-oauthlib==0.4.2 google-pasta~=0.2 From db44020e3989167eba8031730ce123f9517b9367 Mon Sep 17 00:00:00 2001 From: baxtree Date: Mon, 27 Feb 2023 09:43:49 +0000 Subject: [PATCH 03/20] issue-75 utilise opus tc big models for translation --- Pipfile | 2 +- requirements-translation.txt | 2 +- subaligner/translator.py | 84 +++++++++++++++----- tests/integration/feature/subaligner.feature | 1 + 4 files changed, 66 insertions(+), 23 deletions(-) diff --git a/Pipfile b/Pipfile index 5f09cf8..4abbaad 100644 --- a/Pipfile +++ b/Pipfile @@ -97,7 +97,7 @@ toml = "==0.10.0" toolz = "==0.9.0" torch = "<=1.12.0" tornado = "==5.1.0" -transformers = "~=4.5.1" 
+transformers = "<4.27.0" typing-extensions = "~=3.7.0" urllib3 = "~=1.26.5" Werkzeug = ">=0.15.3" diff --git a/requirements-translation.txt b/requirements-translation.txt index a400733..e12de75 100644 --- a/requirements-translation.txt +++ b/requirements-translation.txt @@ -1,4 +1,4 @@ pycountry~=20.7.3 sentencepiece~=0.1.95 torch<1.13.0 -transformers~=4.5.1 \ No newline at end of file +transformers<4.27.0 \ No newline at end of file diff --git a/subaligner/translator.py b/subaligner/translator.py index 0cc504e..28277fa 100644 --- a/subaligner/translator.py +++ b/subaligner/translator.py @@ -16,6 +16,7 @@ class Translator(metaclass=Singleton): __TENSOR_TYPE = "pt" __OPUS_MT = "Helsinki-NLP/opus-mt-{}-{}" + __OPUS_MT_TC_BIG = "Helsinki-NLP/opus-mt-tc-big-{}-{}" __OPUS_TATOEBA = "Helsinki-NLP/opus-tatoeba-{}-{}" __TRANSLATING_BATCH_SIZE = 10 __LANGUAGE_CODE_MAPPER = { @@ -140,59 +141,100 @@ def __initialise_model(self, src_lang: str, tgt_lang: str) -> None: src_lang = Translator.normalise_single(src_lang) tgt_lang = Translator.normalise_single(tgt_lang) src_lang, tgt_lang = Translator.normalise_pair(src_lang, tgt_lang) + + if self.__download_mt_model(src_lang, tgt_lang): + return + elif self.__download_mt_tc_big_model(src_lang, tgt_lang): + return + elif self.__download_tatoeba_model(src_lang, tgt_lang): + return + else: + message = 'Cannot find the MT model for source language "{}" and destination language "{}"'.format(src_lang, tgt_lang) + self.__LOGGER.error(message) + raise NotImplementedError(message) + + def __download_mt_model(self, src_lang: str, tgt_lang: str) -> bool: try: mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(tgt_lang)) - self.__download_mt_model(mt_model_name) - return + self.__download(mt_model_name) + return True except OSError: self.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_MT.format(src_lang, Translator.get_iso_639_alpha_2(tgt_lang)) - self.__download_mt_model(mt_model_name) - return + self.__download(mt_model_name) + return True except OSError: self.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), tgt_lang) - self.__download_mt_model(mt_model_name) - return + self.__download(mt_model_name) + return True except OSError: self.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_MT.format(src_lang, tgt_lang) - self.__download_mt_model(mt_model_name) - return + self.__download(mt_model_name) + return True except OSError: self.__log_and_back_off(mt_model_name) + return False + + def __download_mt_tc_big_model(self, src_lang: str, tgt_lang: str) -> bool: + try: + mt_tc_model_name = Translator.__OPUS_MT_TC_BIG.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(tgt_lang)) + self.__download(mt_tc_model_name) + return True + except OSError: + self.__log_and_back_off(mt_tc_model_name) + try: + mt_tc_model_name = Translator.__OPUS_MT_TC_BIG.format(src_lang, Translator.get_iso_639_alpha_2(tgt_lang)) + self.__download(mt_tc_model_name) + return True + except OSError: + self.__log_and_back_off(mt_tc_model_name) + try: + mt_tc_model_name = Translator.__OPUS_MT_TC_BIG.format(Translator.get_iso_639_alpha_2(src_lang), tgt_lang) + self.__download(mt_tc_model_name) + return True + except OSError: + self.__log_and_back_off(mt_tc_model_name) + try: + mt_tc_model_name = Translator.__OPUS_MT_TC_BIG.format(src_lang, tgt_lang) + self.__download(mt_tc_model_name) + 
return True + except OSError: + self.__log_and_back_off(mt_tc_model_name) + return False + + def __download_tatoeba_model(self, src_lang: str, tgt_lang: str) -> bool: try: mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(tgt_lang)) - self.__download_mt_model(mt_model_name) - return + self.__download(mt_model_name) + return True except OSError: self.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, Translator.get_iso_639_alpha_2(tgt_lang)) - self.__download_mt_model(mt_model_name) - return + self.__download(mt_model_name) + return True except OSError: self.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), tgt_lang) - self.__download_mt_model(mt_model_name) - return + self.__download(mt_model_name) + return True except OSError: self.__log_and_back_off(mt_model_name) try: mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, tgt_lang) - self.__download_mt_model(mt_model_name) - return + self.__download(mt_model_name) + return True except OSError: - self.__LOGGER.debug("Cannot download the MT model %s" % mt_model_name) - message = 'Cannot find the MT model for source language "{}" and destination language "{}"'.format(src_lang, tgt_lang) - self.__LOGGER.error(message) - raise NotImplementedError(message) + self.__log_and_back_off(mt_model_name) + return False - def __download_mt_model(self, mt_model_name: str) -> None: + def __download(self, mt_model_name: str) -> None: self.__LOGGER.debug("Trying to download the MT model %s" % mt_model_name) self.tokenizer = MarianTokenizer.from_pretrained(mt_model_name) self.lang_model = MarianMTModel.from_pretrained(mt_model_name) diff --git a/tests/integration/feature/subaligner.feature b/tests/integration/feature/subaligner.feature index acefc16..2858b79 100644 --- a/tests/integration/feature/subaligner.feature +++ b/tests/integration/feature/subaligner.feature @@ -230,6 +230,7 @@ Feature: Subaligner CLI | subaligner | single | "test.srt" | eng,zho | "test_aligned.srt" | | subaligner | dual | "test.srt" | eng,spa | "test_aligned.srt" | | subaligner | script | "test_plain.txt" | eng,ita | "test_aligned.srt" | + | subaligner | script | "test_plain.txt" | eng,por | "test_aligned.srt" | | subaligner_1pass | | "test.srt" | eng,fra | "test_aligned.srt" | | subaligner_2pass | | "test.srt" | eng,deu | "test_aligned.srt" | From 270e92df67437a008e66ff0f3973a103d6e19d4b Mon Sep 17 00:00:00 2001 From: baxtree Date: Mon, 27 Feb 2023 12:25:51 +0000 Subject: [PATCH 04/20] specify workflow target branches --- .github/workflows/ci-pipeline.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-pipeline.yml b/.github/workflows/ci-pipeline.yml index 8a6fe6e..b0a3227 100644 --- a/.github/workflows/ci-pipeline.yml +++ b/.github/workflows/ci-pipeline.yml @@ -1,9 +1,11 @@ name: ci pipeline on: - - push - - pull_request - - workflow_dispatch + push: + branches: [ master, development ] + pull_request: + branches: [ master, development ] + workflow_dispatch: jobs: main: From 1baa9a9f9c3b6b7ac6527b187da6749c4638cdb5 Mon Sep 17 00:00:00 2001 From: baxtree Date: Fri, 10 Mar 2023 09:43:42 +0000 Subject: [PATCH 05/20] generate subtitle via transcription --- .github/workflows/ci-pipeline.yml | 2 +- Makefile | 16 +-- Pipfile | 3 +- README.md | 17 +-- docker/Dockerfile-ArchLinux | 2 +- ...ts-translation.txt => requirements-llm.txt | 3 +- 
setup.py | 11 +- site/source/acknowledgement.rst | 1 + site/source/index.rst | 4 + site/source/installation.rst | 2 +- site/source/usage.rst | 12 +- subaligner/__main__.py | 110 +++++++++++----- subaligner/exception.py | 4 + subaligner/predictor.py | 2 +- subaligner/subaligner_1pass/__main__.py | 2 +- subaligner/subaligner_2pass/__main__.py | 2 +- subaligner/subaligner_batch/__main__.py | 2 +- subaligner/subaligner_convert/__main__.py | 2 +- subaligner/subtitle.py | 15 +++ subaligner/transcriber.py | 118 ++++++++++++++++++ tests/integration/feature/subaligner.feature | 11 ++ tests/integration/radish/step.py | 14 +++ tests/subaligner/test_transcriber.py | 43 +++++++ 23 files changed, 335 insertions(+), 63 deletions(-) rename requirements-translation.txt => requirements-llm.txt (54%) create mode 100644 subaligner/transcriber.py create mode 100644 tests/subaligner/test_transcriber.py diff --git a/.github/workflows/ci-pipeline.yml b/.github/workflows/ci-pipeline.yml index b0a3227..169abcc 100644 --- a/.github/workflows/ci-pipeline.yml +++ b/.github/workflows/ci-pipeline.yml @@ -30,7 +30,7 @@ jobs: python -m pip install --upgrade pip cat requirements.txt | xargs -L 1 pip install cat requirements-stretch.txt | xargs -L 1 pip install - cat requirements-translation.txt | xargs -L 1 pip install + cat requirements-llm.txt | xargs -L 1 pip install cat requirements-dev.txt | xargs -L 1 pip install cat requirements-site.txt | xargs -L 1 pip install pip install -e . --ignore-installed diff --git a/Makefile b/Makefile index 3bf1626..4c414da 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ install: .$(PYTHON)/bin/pip install --upgrade pip setuptools wheel; \ cat requirements.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ cat requirements-stretch.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ - cat requirements-translation.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ + cat requirements-llm.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ cat requirements-dev.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ .$(PYTHON)/bin/pip install -e . --ignore-installed cp ./bin/subaligner_1pass .$(PYTHON)/bin/subaligner_1pass @@ -55,7 +55,7 @@ install-basic: .$(PYTHON)/bin/pip install -e '.' --no-cache-dir install-translation: - .$(PYTHON)/bin/pip install -e '.[translation]' --no-cache-dir + .$(PYTHON)/bin/pip install -e '.[llm]' --no-cache-dir install-stretch: .$(PYTHON)/bin/pip install -e '.[stretch]' --no-cache-dir @@ -82,7 +82,7 @@ test: .$(PYTHON)/bin/pip install --upgrade pip setuptools wheel; \ cat requirements.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ cat requirements-stretch.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ - cat requirements-translation.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ + cat requirements-llm.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ cat requirements-dev.txt | xargs -L 1 .$(PYTHON)/bin/pip install PYTHONPATH=. 
.$(PYTHON)/bin/python -m unittest discover -.$(PYTHON)/bin/pycodestyle subaligner tests examples misc bin/subaligner bin/subaligner_1pass bin/subaligner_2pass bin/subaligner_batch bin/subaligner_convert bin/subaligner_train bin/subaligner_tune setup.py --ignore=E203,E501,W503 --exclude="subaligner/lib" @@ -95,7 +95,7 @@ test-int: ## integration test .$(PYTHON)/bin/pip install --upgrade pip setuptools wheel; \ cat requirements.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ cat requirements-stretch.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ - cat requirements-translation.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ + cat requirements-llm.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ cat requirements-dev.txt | xargs -L 1 .$(PYTHON)/bin/pip install .$(PYTHON)/bin/pip install -e . --ignore-installed ( \ @@ -108,7 +108,7 @@ pydoc: clean-doc ## generate pydoc HTML documentation based on docstrings .$(PYTHON)/bin/pip install --upgrade pip setuptools wheel; \ cat requirements.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ cat requirements-stretch.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ - cat requirements-translation.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ + cat requirements-llm.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ .$(PYTHON)/bin/python -m pydoc -w subaligner; mv subaligner.html docs/index.html .$(PYTHON)/bin/python -m pydoc -w subaligner.embedder; mv subaligner.embedder.html docs .$(PYTHON)/bin/python -m pydoc -w subaligner.exception; mv subaligner.exception.html docs @@ -131,7 +131,7 @@ coverage: ## check code coverage quickly with the default Python .$(PYTHON)/bin/pip install --upgrade pip setuptools wheel; \ cat requirements.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ cat requirements-stretch.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ - cat requirements-translation.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ + cat requirements-llm.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ cat requirements-dev.txt | xargs -L 1 .$(PYTHON)/bin/pip install .$(PYTHON)/bin/coverage run --source subaligner -m unittest discover .$(PYTHON)/bin/coverage report @@ -167,7 +167,7 @@ profile: .$(PYTHON)/bin/pip install --upgrade pip setuptools wheel; \ cat requirements.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ cat requirements-stretch.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ - cat requirements-translation.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ + cat requirements-llm.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ cat requirements-dev.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ .$(PYTHON)/bin/python -c "import misc.profiler; misc.profiler.generate_profiles()" .$(PYTHON)/bin/kernprof -v -l ./misc/profiler.py @@ -176,7 +176,7 @@ app: clean-wheels if [ ! 
-e ".$(PYTHON)" ]; then ~/.pyenv/versions/$(PYTHON)/bin/python3 -m venv .$(PYTHON); fi .$(PYTHON)/bin/pip install --upgrade pip setuptools wheel; \ cat requirements-dev.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ - .$(PYTHON)/bin/pip wheel --no-cache-dir --wheel-dir=./wheels -r requirements.txt -r requirements-stretch.txt -r requirements-translation.txt; \ + .$(PYTHON)/bin/pip wheel --no-cache-dir --wheel-dir=./wheels -r requirements.txt -r requirements-stretch.txt -r requirements-llm.txt; \ STRETCH_OFF=True .$(PYTHON)/bin/python setup.py bdist_wheel -d ./wheels; \ .$(PYTHON)/bin/pex subaligner==$(SUBALIGNER_VERSION) --repo=./wheels --platform $(PLATFORM) --no-pypi --no-build --python-shebang="/usr/bin/env python3" -e subaligner -o subaligner-$(PLATFORM).app; \ diff --git a/Pipfile b/Pipfile index 4abbaad..934cca0 100644 --- a/Pipfile +++ b/Pipfile @@ -64,6 +64,7 @@ msgpack-python = "==0.5.6" numba = ">=0.50.0" numpy = "<1.24.0" oauthlib = "==3.1.0" +openai-whisper = "==20230124" pbr = "==4.0.2" pluggy = "==0.13.1" psutil = "==5.6.7" @@ -95,7 +96,7 @@ tensorflow = ">=1.15.5,<2.9" termcolor = "==1.1.0" toml = "==0.10.0" toolz = "==0.9.0" -torch = "<=1.12.0" +torch = "<1.13.0" tornado = "==5.1.0" transformers = "<4.27.0" typing-extensions = "~=3.7.0" diff --git a/README.md b/README.md index 634adac..2ed868a 100644 --- a/README.md +++ b/README.md @@ -34,9 +34,9 @@ $ pip install subaligner ## Installation with Optional Packages Supporting Additional Features ``` -# Install dependencies for enabling translation +# Install dependencies for enabling translation and transcription -$ pip install 'subaligner[translation]' +$ pip install 'subaligner[llm]' ``` ``` # Install dependencies for enabling forced alignment @@ -118,6 +118,10 @@ $ subaligner -m single -v https://example.com/video.mp4 -s https://example.com/s $ subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt ``` ``` +# Generate subtitles by transcribing audiovisual files +$ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt +``` +``` # Alignment on segmented plain texts (double newlines as the delimiter) $ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt @@ -137,15 +141,11 @@ $ subaligner -m dual -v video.mkv -s embedded:stream_index=0 -o subtitle_aligned ``` ``` # Translative alignment with the ISO 639-3 language code pair (src,tgt) - -$ subaligner_1pass --languages -$ subaligner_1pass -v video.mp4 -s subtitle.srt -t src,tgt -$ subaligner_2pass --languages -$ subaligner_2pass -v video.mp4 -s subtitle.srt -t src,tgt $ subaligner --languages $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt $ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt -t src,tgt +$ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt ``` ``` # Shift subtitle manually by offset in seconds @@ -214,6 +214,7 @@ This tool wouldn't be possible without the following packages: [pysrt](https://github.com/byroot/pysrt) [pysubs2](https://github.com/tkarabela/pysubs2) [aeneas](https://www.readbeyond.it/aeneas/) -[transformers](https://huggingface.co/transformers/). +[transformers](https://huggingface.co/transformers/) +[openai-whisper](https://github.com/openai/whisper). Thanks to Alan Robinson and Nigel Megitt for their invaluable feedback. 
diff --git a/docker/Dockerfile-ArchLinux b/docker/Dockerfile-ArchLinux index e927e20..8f1c49b 100644 --- a/docker/Dockerfile-ArchLinux +++ b/docker/Dockerfile-ArchLinux @@ -20,7 +20,7 @@ RUN ["/bin/bash", "-c", "pacman --noconfirm -Syu &&\ python -m pip install --upgrade pip &&\ python -m pip install wheel &&\ python -m pip install \"subaligner==${RELEASE_VERSION}\" &&\ - python -m pip install \"subaligner[translation]==${RELEASE_VERSION}\""] + python -m pip install \"subaligner[llm]==${RELEASE_VERSION}\""] COPY ./scripts/entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh diff --git a/requirements-translation.txt b/requirements-llm.txt similarity index 54% rename from requirements-translation.txt rename to requirements-llm.txt index e12de75..fbe39c8 100644 --- a/requirements-translation.txt +++ b/requirements-llm.txt @@ -1,4 +1,5 @@ pycountry~=20.7.3 sentencepiece~=0.1.95 torch<1.13.0 -transformers<4.27.0 \ No newline at end of file +transformers<4.27.0 +openai-whisper==20230124 \ No newline at end of file diff --git a/setup.py b/setup.py index dd82aea..909ee15 100644 --- a/setup.py +++ b/setup.py @@ -20,18 +20,19 @@ with open("requirements-site.txt") as docs_requirements_file: docs_requirements = docs_requirements_file.read().splitlines()[::-1] -with open("requirements-translation.txt") as translate_requirements_file: - translate_requirements = translate_requirements_file.read().splitlines()[::-1] +with open("requirements-llm.txt") as llm_requirements_file: + llm_requirements = llm_requirements_file.read().splitlines()[::-1] with open("requirements-dev.txt") as dev_requirements_file: dev_requirements = dev_requirements_file.read().splitlines()[::-1] EXTRA_DEPENDENCIES = { - "harmony": stretch_requirements + translate_requirements, - "dev": dev_requirements + stretch_requirements + translate_requirements + docs_requirements, + "harmony": stretch_requirements + llm_requirements, + "dev": dev_requirements + stretch_requirements + llm_requirements + docs_requirements, "docs": docs_requirements, "stretch": stretch_requirements, - "translation": translate_requirements, + "translation": llm_requirements, # for backward compatibility and will be deprecated with "llm" + "llm": llm_requirements, } setup(name="subaligner", diff --git a/site/source/acknowledgement.rst b/site/source/acknowledgement.rst index 437da30..108c6c3 100644 --- a/site/source/acknowledgement.rst +++ b/site/source/acknowledgement.rst @@ -12,5 +12,6 @@ Acknowledgement - `pysubs2 `_ - `aeneas `_ - `transformers `_ + - `openai-whisper `_ Thanks to Alan Robinson and Nigel Megitt for their invaluable feedback. diff --git a/site/source/index.rst b/site/source/index.rst index ed870ca..7c0feb1 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -25,6 +25,10 @@ to developers wanting to perform those tasks programmatically. Moreover, with ex hand, advanced users can train their own synchronisers with a single command and zero setup. A handful of subtitle formats are supported and can be converted from one to another either during synchronisation and translation or on on-demand. +Even without any subtitles available beforehand, Subaligner provides transcription by utilising SOTA Large Language +models. This pipeline, combined with translation, can generate near ready-to-use subtitles of increasingly higher quality in +various languages and formats which cater to your preferences, thanks to those models continually advancing over time. 
+ Subligner supports the following subtitle formats: SubRip, TTML, WebVTT, (Advanced) SubStation Alpha, MicroDVD, MPL2, TMP, EBU STL, SAMI, SCC and SBV. The source code can be found on GitHub: `subaligner `_. diff --git a/site/source/installation.rst b/site/source/installation.rst index c9672ba..1568f23 100644 --- a/site/source/installation.rst +++ b/site/source/installation.rst @@ -19,7 +19,7 @@ Installation **Install dependencies for enabling translation**:: - $ pip install 'subaligner[translation]' + $ pip install 'subaligner[llm]' **Pre-install additional dependencies before installing subaligner[stretch] or subaligner[dev]**:: diff --git a/site/source/usage.rst b/site/source/usage.rst index 93688c8..0ecfbea 100644 --- a/site/source/usage.rst +++ b/site/source/usage.rst @@ -7,6 +7,9 @@ lower latency and shifts all subtitle segments globally. The latter way has high segments individually with an option of stretching each segment. Multilingual translation on subtitles can be achieved together with the alignment in one go or separately (see in :ref:`Advanced Usage`). +With no subtitles in your hand beforehand, Subligner's transcribe mode utilises Large Language Models (LLMs) to transcribe +audiovisual content and generates subtitles in various formats which suit your needs. + Make sure you have got the virtual environment activated upfront. **Single-stage alignment (high-level shift with lower latency)**:: @@ -26,6 +29,10 @@ Make sure you have got the virtual environment activated upfront. (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt (.venv) $ subaligner -m dual -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt +**Generate subtitles by transcribing audiovisual files**:: + + (.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt + **Alignment on segmented plain texts (double newlines as the delimiter)**:: (.venv) $ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt @@ -44,14 +51,11 @@ Make sure you have got the virtual environment activated upfront. 
**Translative alignment with the ISO 639-3 language code pair (src,tgt)**:: - (.venv) $ subaligner_1pass --languages - (.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt -t src,tgt - (.venv) $ subaligner_2pass --languages - (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt -t src,tgt (.venv) $ subaligner --languages (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt (.venv) $ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt -t src,tgt + (.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt **Shift subtitle manually by offset in seconds**:: diff --git a/subaligner/__main__.py b/subaligner/__main__.py index 774fbbb..4afa02f 100755 --- a/subaligner/__main__.py +++ b/subaligner/__main__.py @@ -1,13 +1,17 @@ #!/usr/bin/env python """ -usage: subaligner [-h] [-m {single,dual,script,shift}] [-v VIDEO_PATH] [-s SUBTITLE_PATH [SUBTITLE_PATH ...]] [-l MAX_LOGLOSS] [-so] +usage: subaligner [-h] [-m {single,dual,script,shift,transcribe}] [-v VIDEO_PATH] [-s SUBTITLE_PATH [SUBTITLE_PATH ...]] [-l MAX_LOGLOSS] [-so] [-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] - [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-os OFFSET_SECONDS] [-lgs] [-d] [-q] [-ver] + [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-os OFFSET_SECONDS] + [-ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] + [-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}] [-lgs] [-d] [-q] [-ver] Subaligner command line interface optional arguments: -h, --help show this help message and exit + -s SUBTITLE_PATH [SUBTITLE_PATH ...], --subtitle_path SUBTITLE_PATH [SUBTITLE_PATH ...] 
+ File path or URL to the subtitle file (Extensions of supported subtitles: .ssa, .vtt, .srt, .txt, .smi, .ytt, .sub, .xml, .sbv, .ass, .sami, .scc, .tmp, .stl, .ttml, .dfxp) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) -l MAX_LOGLOSS, --max_logloss MAX_LOGLOSS Max global log loss for alignment -so, --stretch_on Switch on stretch on subtitles) @@ -23,18 +27,22 @@ Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) -os OFFSET_SECONDS, --offset_seconds OFFSET_SECONDS Offset by which the subtitle will be shifted + -ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --main_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho} + Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes] + -mr {whisper}, --llm_recipe {whisper} + LLM recipe used for transcribing video files + -mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}, --llm_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large} + Flavour variation for a specific LLM recipe -lgs, --languages Print out language codes used for stretch and translation -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit required arguments: - -m {single,dual,script,shift}, --mode {single,dual,script,shift} - Alignment mode: either single or dual + -m {single,dual,script,shift,transcribe}, --mode {single,dual,script,shift,transcribe} + Alignment mode: single, dual, script, shift or transcribe -v VIDEO_PATH, --video_path VIDEO_PATH File path or URL to the video file - -s SUBTITLE_PATH [SUBTITLE_PATH ...], --subtitle_path SUBTITLE_PATH [SUBTITLE_PATH ...] 
- File path or URL to the subtitle file (Extensions of supported subtitles: .sami, .ssa, .vtt, .xml, .sub, .smi, .ass, .srt, .tmp, .dfxp, .stl, .ttml, .sbv, .txt, .ytt, .scc) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) """ import argparse @@ -61,10 +69,10 @@ def main(): required_args.add_argument( "-m", "--mode", - type=str, + type=str.lower, default="", - choices=["single", "dual", "script", "shift"], - help="Alignment mode: either single or dual", + choices=["single", "dual", "script", "shift", "transcribe"], + help="Alignment mode: single, dual, script, shift or transcribe", ) required_args.add_argument( "-v", @@ -74,7 +82,7 @@ def main(): help="File path or URL to the video file", ) from subaligner.subtitle import Subtitle - required_args.add_argument( + parser.add_argument( "-s", "--subtitle_path", type=str, @@ -100,7 +108,7 @@ def main(): parser.add_argument( "-sil", "--stretch_in_language", - type=str, + type=str.lower, choices=Utils.get_stretch_language_codes(), default="eng", help="Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes].\nNB: This will be ignored if neither -so nor --stretch_on is present", @@ -137,6 +145,29 @@ def main(): type=float, help="Offset by which the subtitle will be shifted" ) + parser.add_argument( + "-ml", + "--main_language", + type=str.lower, + choices=Utils.get_stretch_language_codes(), + help="Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]", + ) + parser.add_argument( + "-mr", + "--llm_recipe", + type=str.lower, + default="whisper", + choices=["whisper"], + help="LLM recipe used for transcribing video files" + ) + parser.add_argument( + "-mf", + "--llm_flavour", + type=str.lower, + default="small", + choices=["tiny", "tiny.en", "small", "medium", "medium.en", "base", "base.en", "large-v1", "large-v2", "large"], + help="Flavour variation for a specific LLM recipe" + ) parser.add_argument("-lgs", "--languages", action="store_true", help="Print out language codes used for stretch and translation") parser.add_argument("-d", "--debug", action="store_true", @@ -153,33 +184,45 @@ def main(): print("ERROR: --mode was not passed in") parser.print_usage() sys.exit(21) + FLAGS.subtitle_path = [path for paths in FLAGS.subtitle_path for path in paths] - if not FLAGS.subtitle_path: + if not FLAGS.subtitle_path and FLAGS.mode != "transcribe": print("ERROR: --subtitle_path was not passed in") parser.print_usage() sys.exit(21) - if FLAGS.mode != "shift": + elif FLAGS.mode == "transcribe": + FLAGS.subtitle_path = ["{}.srt".format(tempfile.mkstemp()[1])] + if FLAGS.mode in ["single", "dual", "script", "transcribe"]: for subtitle_path in FLAGS.subtitle_path: if FLAGS.video_path == "": print("ERROR: --video_path was not passed in") parser.print_usage() sys.exit(21) if subtitle_path.lower().startswith("http") and FLAGS.output == "": - print("ERROR: --output was not passed in for alignment on a remote subtitle file") + print("ERROR: --output was not passed in but required by alignment on a remote subtitle file") parser.print_usage() sys.exit(21) if subtitle_path.lower().startswith("embedded:") and FLAGS.output == "": - print("ERROR: --output was not passed in for alignment on embedded subtitles") + print("ERROR: --output was not passed in but required by alignment on embedded subtitles") parser.print_usage() sys.exit(21) if FLAGS.mode == "script" and FLAGS.output == "": - print("ERROR: 
--output was not passed in for alignment on plain texts") + print("ERROR: --output was not passed in but required by alignment on plain texts") parser.print_usage() sys.exit(21) - if FLAGS.translate is not None: + if FLAGS.mode == "transcribe": + if FLAGS.output == "": + print("ERROR: --output was not passed in but required by mode 'transcribe'") + parser.print_usage() + sys.exit(21) + if FLAGS.main_language is None: + print("ERROR: --main_language was not passed in but required by mode 'transcribe'") + parser.print_usage() + sys.exit(21) + if FLAGS.translate is not None or FLAGS.mode == "transcribe": if "transformers" not in {pkg.key for pkg in pkg_resources.working_set}: - print('ERROR: Alignment has been configured to perform translation. Please install "subaligner[translation]" and run your command again.') + print('ERROR: Alignment has been configured to use language models. Please install "subaligner[llm]" and run your command again.') sys.exit(21) if FLAGS.stretch_on or FLAGS.mode == "script": if "aeneas" not in {pkg.key for pkg in pkg_resources.working_set}: @@ -190,13 +233,13 @@ def main(): local_subtitle_path = subtitle_path exit_segfail = FLAGS.exit_segfail stretch = FLAGS.stretch_on - stretch_in_lang = FLAGS.stretch_in_language + stretch_in_lang = FLAGS.main_language or FLAGS.stretch_in_language from subaligner.logger import Logger Logger.VERBOSE = FLAGS.debug Logger.QUIET = FLAGS.quiet from subaligner.predictor import Predictor - from subaligner.exception import UnsupportedFormatException + from subaligner.exception import UnsupportedFormatException, TranscriptionException from subaligner.exception import TerminalException try: @@ -230,6 +273,7 @@ def main(): parser.print_usage() sys.exit(21) + voice_probabilities = None predictor = Predictor() if FLAGS.mode == "single": aligned_subs, audio_file_path, voice_probabilities, frame_rate = predictor.predict_single_pass( @@ -252,6 +296,11 @@ def main(): subtitle_file_path=local_subtitle_path, stretch_in_lang=stretch_in_lang, ) + elif FLAGS.mode == "transcribe": + from subaligner.transcriber import Transcriber + transcriber = Transcriber(recipe=FLAGS.llm_recipe, flavour=FLAGS.llm_flavour) + subtitle, frame_rate = transcriber.transcribe(local_video_path, stretch_in_lang) + aligned_subs = subtitle.subs else: print("ERROR: Unknown mode {}".format(FLAGS.mode)) parser.print_usage() @@ -267,6 +316,9 @@ def main(): aligned_subs = translator.translate(aligned_subs) Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate, "utf-8") + elif FLAGS.mode == "transcribe": + Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, + frame_rate, "utf-8") else: Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate) @@ -277,35 +329,35 @@ def main(): print( "ERROR: Alignment failed with a too high loss value: {}".format(log_loss) ) - _remove_tmp_files(FLAGS.video_path, subtitle_path, local_video_path, local_subtitle_path) + _remove_tmp_files(FLAGS.video_path, subtitle_path, local_video_path, local_subtitle_path, FLAGS.mode) sys.exit(22) print("Aligned subtitle saved to: {}".format(aligned_subtitle_path)) - except UnsupportedFormatException as e: + except (UnsupportedFormatException, TranscriptionException) as e: print( "ERROR: {}\n{}".format(str(e), "".join(traceback.format_stack()) if FLAGS.debug else "") ) traceback.print_tb(e.__traceback__) - _remove_tmp_files(FLAGS.video_path, subtitle_path, local_video_path, 
local_subtitle_path) + _remove_tmp_files(FLAGS.video_path, subtitle_path, local_video_path, local_subtitle_path, FLAGS.mode) sys.exit(23) except TerminalException as e: print( "ERROR: {}\n{}".format(str(e), "".join(traceback.format_stack()) if FLAGS.debug else "") ) traceback.print_tb(e.__traceback__) - _remove_tmp_files(FLAGS.video_path, subtitle_path, local_video_path, local_subtitle_path) + _remove_tmp_files(FLAGS.video_path, subtitle_path, local_video_path, local_subtitle_path, FLAGS.mode) sys.exit(24) except Exception as e: print( "ERROR: {}\n{}".format(str(e), "".join(traceback.format_stack()) if FLAGS.debug else "") ) traceback.print_tb(e.__traceback__) - _remove_tmp_files(FLAGS.video_path, subtitle_path, local_video_path, local_subtitle_path) + _remove_tmp_files(FLAGS.video_path, subtitle_path, local_video_path, local_subtitle_path, FLAGS.mode) sys.exit(1) else: - _remove_tmp_files(FLAGS.video_path, subtitle_path, local_video_path, local_subtitle_path) + _remove_tmp_files(FLAGS.video_path, subtitle_path, local_video_path, local_subtitle_path, FLAGS.mode) sys.exit(0) - else: + elif FLAGS.mode == "shift": if FLAGS.offset_seconds is None: print("ERROR: --offset_seconds was not passed in during subtitle shifting") sys.exit(21) @@ -319,11 +371,13 @@ def main(): sys.exit(0) -def _remove_tmp_files(video_path, subtitle_path, local_video_path, local_subtitle_path): +def _remove_tmp_files(video_path, subtitle_path, local_video_path, local_subtitle_path, mode): if video_path.lower().startswith("http") and os.path.exists(local_video_path): os.remove(local_video_path) if subtitle_path.lower().startswith("http") and os.path.exists(local_subtitle_path): os.remove(local_subtitle_path) + if mode == "transcribe" and os.path.exists(local_subtitle_path): + os.remove(local_subtitle_path) if __name__ == "__main__": diff --git a/subaligner/exception.py b/subaligner/exception.py index 7e0acc3..cb35725 100644 --- a/subaligner/exception.py +++ b/subaligner/exception.py @@ -8,3 +8,7 @@ class TerminalException(Exception): class NoFrameRateException(Exception): """ An exception raised due to frame rate not found.""" + + +class TranscriptionException(Exception): + """ An exception raised due to transcription failures.""" diff --git a/subaligner/predictor.py b/subaligner/predictor.py index ab69177..bede1d2 100644 --- a/subaligner/predictor.py +++ b/subaligner/predictor.py @@ -37,7 +37,7 @@ class Predictor(metaclass=Singleton): __SEGMENT_PREDICTION_TIMEOUT = 60 # Maximum waiting time in seconds when predicting each segment __THREAD_QUEUE_SIZE = 8 - __THREAD_NUMBER = 4 + __THREAD_NUMBER = 1 # Do not change def __init__(self, **kwargs) -> None: """Feature predictor initialiser. diff --git a/subaligner/subaligner_1pass/__main__.py b/subaligner/subaligner_1pass/__main__.py index c485c77..73465a0 100755 --- a/subaligner/subaligner_1pass/__main__.py +++ b/subaligner/subaligner_1pass/__main__.py @@ -120,7 +120,7 @@ def main(): sys.exit(21) if FLAGS.translate is not None: if "transformers" not in {pkg.key for pkg in pkg_resources.working_set}: - print('ERROR: Alignment has been configured to perform translation. Please install "subaligner[translation]" and run your command again.') + print('ERROR: Alignment has been configured to perform translation. 
Please install "subaligner[llm]" and run your command again.') sys.exit(21) local_video_path = FLAGS.video_path diff --git a/subaligner/subaligner_2pass/__main__.py b/subaligner/subaligner_2pass/__main__.py index dd21e2f..b2c5ffa 100755 --- a/subaligner/subaligner_2pass/__main__.py +++ b/subaligner/subaligner_2pass/__main__.py @@ -147,7 +147,7 @@ def main(): sys.exit(21) if FLAGS.translate is not None: if "transformers" not in {pkg.key for pkg in pkg_resources.working_set}: - print('ERROR: Alignment has been configured to perform translation. Please install "subaligner[translation]" and run your command again.') + print('ERROR: Alignment has been configured to perform translation. Please install "subaligner[llm]" and run your command again.') sys.exit(21) if FLAGS.stretch_on: if "aeneas" not in {pkg.key for pkg in pkg_resources.working_set}: diff --git a/subaligner/subaligner_batch/__main__.py b/subaligner/subaligner_batch/__main__.py index 255630a..002688e 100755 --- a/subaligner/subaligner_batch/__main__.py +++ b/subaligner/subaligner_batch/__main__.py @@ -173,7 +173,7 @@ def main(): sys.exit(21) if FLAGS.translate is not None: if "transformers" not in {pkg.key for pkg in pkg_resources.working_set}: - print('ERROR: Alignment has been configured to perform translation. Please install "subaligner[translation]" and run your command again.') + print('ERROR: Alignment has been configured to perform translation. Please install "subaligner[llm]" and run your command again.') sys.exit(21) video_file_paths = [os.path.abspath(os.path.join(path, p)) for path, _, files in diff --git a/subaligner/subaligner_convert/__main__.py b/subaligner/subaligner_convert/__main__.py index 68b7220..ee521a4 100755 --- a/subaligner/subaligner_convert/__main__.py +++ b/subaligner/subaligner_convert/__main__.py @@ -99,7 +99,7 @@ def main(): sys.exit(21) if FLAGS.translate is not None: if "transformers" not in {pkg.key for pkg in pkg_resources.working_set}: - print('ERROR: Alignment has been configured to perform translation. Please install "subaligner[translation]" and run your command again.') + print('ERROR: Alignment has been configured to perform translation. Please install "subaligner[llm]" and run your command again.') sys.exit(21) local_subtitle_path = FLAGS.input_subtitle_path diff --git a/subaligner/subtitle.py b/subaligner/subtitle.py index 81541c7..105641e 100644 --- a/subaligner/subtitle.py +++ b/subaligner/subtitle.py @@ -59,6 +59,8 @@ def __init__(self, secret: object, subtitle_file_path: str, subtitle_format: str if subtitle_format == "subrip": self.__subs = self.__load_subrip(subtitle_file_path) + elif subtitle_format == "subrip_raw": + self.__subs = pysrt.SubRipFile().from_string(subtitle_file_path) elif subtitle_format == "ttml": self.__subs = self.__convert_ttml_to_subs(subtitle_file_path) elif subtitle_format == "webvtt": @@ -105,6 +107,19 @@ def load_subrip(cls, subtitle_file_path: str) -> "Subtitle": return cls(cls.__secret, subtitle_file_path, "subrip") + @classmethod + def load_subrip_str(cls, subrip_raw: str) -> "Subtitle": + """Load a SubRip subtitle string. + + Arguments: + subrip_str {string} -- The string representation of the SubRip content. + + Returns: + Subtitle -- Subtitle object. + """ + + return cls(cls.__secret, subrip_raw, "subrip_raw") + @classmethod def load_ttml(cls, subtitle_file_path: str) -> "Subtitle": """Load a TTML subtitle file. 
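The `load_subrip_str` helper added above is what the new transcriber (introduced next in this patch) uses to turn its generated SubRip text into a `Subtitle` object. A minimal sketch with a made-up cue:

```python
from subaligner.subtitle import Subtitle

# load_subrip_str parses raw SubRip content rather than a file path.
srt_str = "1\n00:00:00,000 --> 00:00:02,000\nHello there\n\n"
subtitle = Subtitle.load_subrip_str(srt_str)
print(len(subtitle.subs))  # expected: 1
```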
diff --git a/subaligner/transcriber.py b/subaligner/transcriber.py new file mode 100644 index 0000000..3dc2b77 --- /dev/null +++ b/subaligner/transcriber.py @@ -0,0 +1,118 @@ +import os +import whisper +from enum import Enum +from typing import Tuple, Optional +from pysrt import SubRipTime +from whisper.tokenizer import LANGUAGES +from .translator import Translator +from .subtitle import Subtitle +from .media_helper import MediaHelper +from .logger import Logger +from .exception import NoFrameRateException, TranscriptionException + + +class Transcriber(object): + """Transcribe audiovisual content for subtitle generation. + """ + + def __init__(self, recipe: str = "whisper", flavour: str = "small") -> None: + """Initialiser for the transcribing process. + + Arguments: + recipe {string} -- the LLM recipe used for transcribing video files (default: "whisper"). + flavour {string} -- the flavour variation for a specific LLM recipe (default: "small"). + Raises: + NotImplementedError -- Thrown when the LLM recipe is unknown. + """ + if recipe not in [r.value for r in Recipe]: + raise NotImplementedError(f"Unknown recipe: {recipe}") + if recipe == Recipe.whisper.value: + if flavour not in [f.value for f in WhisperFlavour]: + raise NotImplementedError(f"Unknown {recipe} flavour: {flavour}") + self.__model = whisper.load_model(flavour) + self.recipe = recipe + self.flavour = flavour + self.__media_helper = MediaHelper() + self.__LOGGER = Logger().get_logger(__name__) + + def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle, Optional[float]]: + """Transcribe an audiovisual file and generate subtitles. + + Arguments: + video_file_path {string} -- The input video file path. + language_code {string} -- An alpha 3 language code derived from ISO 639-3. + Raises: + TranscriptionException -- Thrown when transcription is failed. + NotImplementedError -- Thrown when the LLM recipe is not supported. 
+ """ + if self.recipe == "whisper": + lang = Translator.get_iso_639_alpha_2(language_code) + if lang not in LANGUAGES: + raise TranscriptionException(f'"{language_code}" is not supported by {self.recipe} ({self.flavour})') + audio_file_path = self.__media_helper.extract_audio(video_file_path, True, 16000) + try: + audio = whisper.load_audio(audio_file_path) + self.__LOGGER.debug("Start transcribing the audio...") + result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang]) + self.__LOGGER.info("Finished transcribing the audio") + srt_str = "" + for i, segment in enumerate(result["segments"], start=1): + srt_str += f"{i}\n" \ + f"{self.__format_timestamp(segment['start'])} --> {self.__format_timestamp(segment['end'])}\n" \ + f"{segment['text'].strip().replace('-->', '->')}\n" \ + "\n" + subtitle = Subtitle.load_subrip_str(srt_str) + subtitle, frame_rate = self.__on_frame_timecodes(subtitle, video_file_path) + self.__LOGGER.debug("Generated the raw subtitle") + return subtitle, frame_rate + finally: + if os.path.exists(audio_file_path): + os.remove(audio_file_path) + else: + raise NotImplementedError(f"{self.recipe} ({self.flavour}) is not supported") + + @staticmethod + def __format_timestamp(seconds: float) -> str: + assert seconds >= 0, "non-negative timestamp expected" + milliseconds = round(seconds * 1000.0) + hours = milliseconds // 3_600_000 + milliseconds -= hours * 3_600_000 + minutes = milliseconds // 60_000 + milliseconds -= minutes * 60_000 + seconds = milliseconds // 1_000 + milliseconds -= seconds * 1_000 + hours_marker = f"{hours:02d}:" + return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}" + + def __on_frame_timecodes(self, subtitle: Subtitle, video_file_path: str) -> Tuple[Subtitle, Optional[float]]: + frame_rate = None + try: + frame_rate = self.__media_helper.get_frame_rate(video_file_path) + frame_duration = 1.0 / frame_rate + for sub in subtitle.subs: + start_seconds = sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds + sub.start.milliseconds / 1000.0 + end_seconds = sub.end.hours * 3600 + sub.end.minutes * 60 + sub.end.seconds + sub.end.milliseconds / 1000.0 + start_frames = int(start_seconds / frame_duration) + end_frames = int(end_seconds / frame_duration) + sub.start = SubRipTime(seconds=start_frames * frame_duration) + sub.end = SubRipTime(seconds=end_frames * frame_duration) + except NoFrameRateException: + self.__LOGGER.warning("Cannot detect the frame rate for %s" % video_file_path) + return subtitle, frame_rate + + +class Recipe(str, Enum): + whisper = "whisper" + + +class WhisperFlavour(str, Enum): + tiny = "tiny" + tiny_en = "tiny.en" + small = "small" + medium = "medium" + medium_en = "medium.en" + base = "base" + base_en = "base.en" + large_v1 = "large-v1" + large_v2 = "large-v2" + large = "large" diff --git a/tests/integration/feature/subaligner.feature b/tests/integration/feature/subaligner.feature index 2858b79..d68f56f 100644 --- a/tests/integration/feature/subaligner.feature +++ b/tests/integration/feature/subaligner.feature @@ -234,6 +234,17 @@ Feature: Subaligner CLI | subaligner_1pass | | "test.srt" | eng,fra | "test_aligned.srt" | | subaligner_2pass | | "test.srt" | eng,deu | "test_aligned.srt" | + @transcription + Scenario Outline: Test transcription on audiovisual input and subtitle generation + Given I have a video file + And I have a subtitle file + When I run the alignment with on them with stage with language, recipe and flavour + Then a new subtitle file is generated + Examples: + 
| video-in | aligner | mode | subtitle-in | language | recipe | flavour | subtitle-out | + | "test.mp4" | subaligner | transcribe | "test.srt" | eng | whisper | tiny | "test_aligned.srt" | + | "test.wav" | subaligner | transcribe | "test.srt" | eng | whisper | tiny | "test_aligned.srt" | + @batch Scenario Outline: Test batch alignment Given I have an audiovisual file directory "av" diff --git a/tests/integration/radish/step.py b/tests/integration/radish/step.py index 9da52cb..ddc52c9 100644 --- a/tests/integration/radish/step.py +++ b/tests/integration/radish/step.py @@ -105,6 +105,20 @@ def run_subaligner_with_translation(step, aligner, mode, language_pair): step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS) +@when('I run the alignment with {aligner:S} on them with {mode:S} stage with {language:S} language, {recipe:S} recipe and {flavour:S} flavour') +def run_subaligner_with_transcription(step, aligner, mode, language, recipe, flavour): + process = subprocess.Popen([ + os.path.join(PWD, "..", "..", "..", "bin", aligner), + "-m", mode, + "-v", step.context.video_file_path, + "-ml", language, + "-mr", recipe, + "-mf", flavour, + "-o", os.path.join(PWD, "..", "..", "subaligner", "resource", "test_aligned.srt"), + "-q"], shell=False) + step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS) + + @when('I run the alignment with {aligner:S} on them with {mode:S} stage and output "{file_name:S}"') def run_subaligner_with_output(step, aligner, mode, file_name): if mode == "": diff --git a/tests/subaligner/test_transcriber.py b/tests/subaligner/test_transcriber.py new file mode 100644 index 0000000..95d2de7 --- /dev/null +++ b/tests/subaligner/test_transcriber.py @@ -0,0 +1,43 @@ +import os +import unittest +from subaligner.transcriber import Transcriber as Undertest +from subaligner.exception import TranscriptionException + + +class TranscriberTest(unittest.TestCase): + + def setUp(self) -> None: + self.video_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resource", "test.mp4") + self.undertest = Undertest(recipe="whisper", flavour="tiny") + + def test_transcribe(self): + subtitle, frame_rate = self.undertest.transcribe(self.video_file_path, "eng") + assert len(subtitle.subs) > 0 + assert frame_rate == 24 + + def test_throw_exception_on_unknown_recipe(self): + try: + Undertest(recipe="unknown") + except Exception as e: + self.assertTrue(isinstance(e, NotImplementedError)) + self.assertEqual(str(e), "Unknown recipe: unknown") + else: + self.fail("Should have thrown exception") + + def test_throw_exception_on_unknown_flavour(self): + try: + Undertest(recipe="whisper", flavour="unknown") + except Exception as e: + self.assertTrue(isinstance(e, NotImplementedError)) + self.assertEqual(str(e), "Unknown whisper flavour: unknown") + else: + self.fail("Should have thrown exception") + + def test_throw_exception_on_unsupported_language(self): + try: + self.undertest.transcribe(self.video_file_path, "abc") + except Exception as e: + self.assertTrue(isinstance(e, TranscriptionException)) + self.assertEqual(str(e), '"abc" is not supported by whisper (tiny)') + else: + self.fail("Should have thrown exception") From 3eed835e9481f1fa0a05be7b03c16607dd6e8543 Mon Sep 17 00:00:00 2001 From: baxtree Date: Fri, 10 Mar 2023 19:11:04 +0000 Subject: [PATCH 06/20] bump up the version for the new release --- Makefile | 2 +- subaligner/_version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 4c414da..c756287 100644 
--- a/Makefile +++ b/Makefile @@ -149,7 +149,7 @@ manual: clean-manual ## generate manual pages test-dist: if [ ! -e ".$(PYTHON)" ]; then ~/.pyenv/versions/$(PYTHON)/bin/python3 -m venv .$(PYTHON); fi .$(PYTHON)/bin/pip install --upgrade pip setuptools wheel; \ - .$(PYTHON)/bin/pip install -e . --use-feature=2020-resolver; \ + .$(PYTHON)/bin/pip install -e . dist: clean-dist test-dist cat requirements-dev.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ diff --git a/subaligner/_version.py b/subaligner/_version.py index 0f6fa4c..98c74dc 100644 --- a/subaligner/_version.py +++ b/subaligner/_version.py @@ -1,2 +1,2 @@ """The semver for the current release.""" -__version__ = "0.2.5" +__version__ = "0.3.0" From 197495fe528a7b276b6ad8af5319e368f490aaf0 Mon Sep 17 00:00:00 2001 From: baxtree Date: Fri, 10 Mar 2023 19:52:11 +0000 Subject: [PATCH 07/20] retire ubuntu-18 image --- .github/workflows/dockerhub.yml | 13 ------------- README.md | 3 ++- docker/Dockerfile-Ubuntu18 | 19 ------------------- docker/Dockerfile-Ubuntu20 | 1 - site/source/index.rst | 2 +- site/source/usage.rst | 3 ++- 6 files changed, 5 insertions(+), 36 deletions(-) delete mode 100644 docker/Dockerfile-Ubuntu18 diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml index 7a41c85..c4c6ccd 100644 --- a/.github/workflows/dockerhub.yml +++ b/.github/workflows/dockerhub.yml @@ -60,19 +60,6 @@ jobs: tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.u20 push: true - - name: Build and push the Ubuntu 18 image - id: docker_build_u18 - uses: docker/build-push-action@v2 - with: - context: ./docker - file: "./docker/Dockerfile-Ubuntu18" - build-args: | - "RELEASE_VERSION=${{ steps.tag.outputs.TAG }}" - allow: network.host - github-token: ${{ github.token }} - tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.u18 - push: true - - name: Build and push the Fedora 31 image id: docker_build_fed31 uses: docker/build-push-action@v2 diff --git a/README.md b/README.md index 2ed868a..6d0f51d 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,7 @@ $ subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/sub ``` # Generate subtitles by transcribing audiovisual files $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt +$ subaligner -m transcribe -v video.mp4 -ml zho -mr whisper -mf medium -o subtitle_aligned.srt ``` ``` # Alignment on segmented plain texts (double newlines as the delimiter) @@ -145,7 +146,7 @@ $ subaligner --languages $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt $ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt -t src,tgt -$ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt +$ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt ``` ``` # Shift subtitle manually by offset in seconds diff --git a/docker/Dockerfile-Ubuntu18 b/docker/Dockerfile-Ubuntu18 deleted file mode 100644 index 997b85b..0000000 --- a/docker/Dockerfile-Ubuntu18 +++ /dev/null @@ -1,19 +0,0 @@ -# Subaligner Ubuntu 18 Docker Image -FROM ubuntu:18.04 - -ARG RELEASE_VERSION - -ENV DEBIAN_FRONTEND=noninteractive -ENV RELEASE_VERSION=${RELEASE_VERSION} -ENV TZ=Europe/London - -RUN ["/bin/bash", "-c", "apt-get -y update &&\ - apt-get -y install ffmpeg &&\ - apt-get -y install espeak libespeak1 libespeak-dev espeak-data &&\ - apt-get -y install libsndfile-dev &&\ - apt-get -y 
install python3-dev &&\ - apt-get -y install python3-tk &&\ - apt-get -y install python3-pip &&\ - python3 -m pip install --upgrade pip &&\ - python3 -m pip install \"subaligner==${RELEASE_VERSION}\" &&\ - python3 -m pip install \"subaligner[harmony]==${RELEASE_VERSION}\""] diff --git a/docker/Dockerfile-Ubuntu20 b/docker/Dockerfile-Ubuntu20 index b2a64ac..297bb4e 100644 --- a/docker/Dockerfile-Ubuntu20 +++ b/docker/Dockerfile-Ubuntu20 @@ -7,7 +7,6 @@ ENV RELEASE_VERSION=${RELEASE_VERSION} ENV DEBIAN_FRONTEND=noninteractive ENV TZ=Europe/London -RUN echo "$RELEASE_VERSION" RUN ["/bin/bash", "-c", "apt-get -y update &&\ apt-get -y install ffmpeg &&\ apt-get -y install espeak libespeak1 libespeak-dev espeak-data &&\ diff --git a/site/source/index.rst b/site/source/index.rst index 7c0feb1..6d34def 100644 --- a/site/source/index.rst +++ b/site/source/index.rst @@ -26,7 +26,7 @@ hand, advanced users can train their own synchronisers with a single command and and can be converted from one to another either during synchronisation and translation or on on-demand. Even without any subtitles available beforehand, Subaligner provides transcription by utilising SOTA Large Language -models. This pipeline, combined with translation, can generate near ready-to-use subtitles of increasingly higher quality in +Models (LLMs). This pipeline, combined with translation, can generate near ready-to-use subtitles of increasingly higher quality in various languages and formats which cater to your preferences, thanks to those models continually advancing over time. Subligner supports the following subtitle formats: SubRip, TTML, WebVTT, (Advanced) SubStation Alpha, MicroDVD, MPL2, TMP, diff --git a/site/source/usage.rst b/site/source/usage.rst index 0ecfbea..b0cbf5f 100644 --- a/site/source/usage.rst +++ b/site/source/usage.rst @@ -32,6 +32,7 @@ Make sure you have got the virtual environment activated upfront. **Generate subtitles by transcribing audiovisual files**:: (.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt + (.venv) $ subaligner -m transcribe -v video.mp4 -ml zho -mr whisper -mf medium -o subtitle_aligned.srt **Alignment on segmented plain texts (double newlines as the delimiter)**:: @@ -55,7 +56,7 @@ Make sure you have got the virtual environment activated upfront. 
(.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt (.venv) $ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt -t src,tgt - (.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt + (.venv) $ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt **Shift subtitle manually by offset in seconds**:: From f05f4d7a0f25cc205b14cf6086b0c472a4f26c62 Mon Sep 17 00:00:00 2001 From: baxtree Date: Tue, 14 Mar 2023 09:23:06 +0000 Subject: [PATCH 08/20] post relase tidy up --- CITATION.cff | 1 - README.md | 58 ++------ site/source/advanced_usage.rst | 2 - site/source/installation.rst | 8 +- site/source/usage.rst | 25 +--- subaligner/__init__.py | 5 + subaligner/__main__.py | 44 ++++-- subaligner/_version.py | 2 +- subaligner/exception.py | 4 + subaligner/llm.py | 29 ++++ subaligner/transcriber.py | 54 ++----- subaligner/translator.py | 215 ++++++++++++--------------- subaligner/utils.py | 36 +++++ tests/subaligner/test_transcriber.py | 5 +- tests/subaligner/test_translator.py | 5 - tests/subaligner/test_utils.py | 14 ++ 16 files changed, 256 insertions(+), 251 deletions(-) create mode 100644 subaligner/llm.py diff --git a/CITATION.cff b/CITATION.cff index 2fda752..2eb84f4 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -5,7 +5,6 @@ authors: given-names: Xi orcid: https://orcid.org/0000-0002-2177-8458 title: "Subaligner: Towards Automated Subtitle Alignment" -version: 0.2.1 doi: 10.5281/zenodo.5603083 date-released: 2021-10-28 url: "https://github.com/baxtree/subaligner" \ No newline at end of file diff --git a/README.md b/README.md index 6d0f51d..9713282 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,12 @@ $ brew install ffmpeg $ pip install -U pip $ pip install subaligner ``` +or install from source: +``` +$ git clone git@github.com:baxtree/subaligner.git +$ cd subaligner +$ python setup.py install +``` ## Installation with Optional Packages Supporting Additional Features ``` @@ -61,31 +67,10 @@ To install all supported features: $ pip install 'subaligner[harmony]' ``` -## Alternative Installations -``` -# Install via pipx -$ pip install -U pip pipx -$ pipx install subaligner -``` -or -``` -# Install from GitHub via Pipenv -$ pipenv install subaligner -$ pipenv install 'subaligner[stretch]' -$ pipenv install 'subaligner[dev]' -``` -or -``` -# Install from source +## Container Support +If you prefer using a containerised environment over installing everything locally, run: -$ git clone git@github.com:baxtree/subaligner.git -$ cd subaligner -$ python setup.py install -``` -or ``` -# Use dockerised installation - $ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner bash ``` For users on Windows 10: [Docker Desktop](https://docs.docker.com/docker-for-windows/install/) is the only option at present. 
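Besides the CLI and container workflows documented above, this patch also reworks the programmatic surface: the new `subaligner/llm.py` enums and the updated `Transcriber` and `Translator` constructors that appear later in this diff. Below is a minimal sketch (not part of the patch itself) of how those pieces could be combined from Python; it only uses constructors and methods visible in this patch series, and the video path and language codes are placeholder values.

```python
from subaligner.llm import TranscriptionRecipe, TranslationRecipe, WhisperFlavour
from subaligner.transcriber import Transcriber
from subaligner.translator import Translator

video_path = "video.mp4"  # placeholder input

# Generate a subtitle from the audio track, mirroring tests/subaligner/test_transcriber.py.
transcriber = Transcriber(recipe=TranscriptionRecipe.WHISPER.value, flavour=WhisperFlavour.TINY.value)
subtitle, frame_rate = transcriber.transcribe(video_path, "eng")

# Translate the generated cues, mirroring the call added to subaligner/__main__.py.
translator = Translator(src_language="eng", tgt_language="fra",
                        recipe=TranslationRecipe.HELSINKI_NLP.value)
translated_subs = translator.translate(subtitle.subs, video_path)
print(f"{len(translated_subs)} cue(s) translated at {frame_rate} fps")
```

Note that `translate` only needs the video path when the whisper translation recipe is selected (it re-listens to the audio); with the default helsinki-nlp recipe the argument is ignored.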
@@ -99,22 +84,13 @@ docker run -v "/d/media":/media -w "/media" -it baxtree/subaligner bash ``` # Single-stage alignment (high-level shift with lower latency) -$ subaligner_1pass -v video.mp4 -s subtitle.srt -$ subaligner_1pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt +$ subaligner -m single -v video.mp4 -s subtitle.srt +$ subaligner -m single -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt ``` ``` # Dual-stage alignment (low-level shift with higher latency) -$ subaligner_2pass -v video.mp4 -s subtitle.srt -$ subaligner_2pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt -``` -or -``` -# Pass in single-stage or dual-stage as the alignment mode - -$ subaligner -m single -v video.mp4 -s subtitle.srt $ subaligner -m dual -v video.mp4 -s subtitle.srt -$ subaligner -m single -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt $ subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt ``` ``` @@ -142,6 +118,7 @@ $ subaligner -m dual -v video.mkv -s embedded:stream_index=0 -o subtitle_aligned ``` ``` # Translative alignment with the ISO 639-3 language code pair (src,tgt) + $ subaligner --languages $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt @@ -171,20 +148,17 @@ $ pipx run subaligner -m dual -v video.mp4 -s subtitle.srt # Run the module as a script $ python -m subaligner -m single -v video.mp4 -s subtitle.srt $ python -m subaligner -m dual -v video.mp4 -s subtitle.srt -$ python -m subaligner.subaligner_1pass -v video.mp4 -s subtitle.srt -$ python -m subaligner.subaligner_2pass -v video.mp4 -s subtitle.srt ``` ``` # Run alignments with the docker image $ docker pull baxtree/subaligner -$ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner_1pass -v video.mp4 -s subtitle.srt -$ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner_2pass -v video.mp4 -s subtitle.srt -$ docker run -it baxtree/subaligner subaligner_1pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt -$ docker run -it baxtree/subaligner subaligner_2pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt +$ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner -m single -v video.mp4 -s subtitle.srt +$ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner -m dual -v video.mp4 -s subtitle.srt +$ docker run -it baxtree/subaligner subaligner -m single -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt +$ docker run -it baxtree/subaligner subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt ``` -The aligned subtitle will be saved at `subtitle_aligned.srt`. For details on CLI, run `subaligner_1pass -h`, `subaligner_2pass -h` or `subaligner -h`. -Additional utilities can be used after consulting `subaligner_batch -h`, `subaligner_convert -h`, `subaligner_train -h` and `subaligner_tune -h`. +The aligned subtitle will be saved at `subtitle_aligned.srt`. For details on CLIs, run `subaligner -h` or `subaligner_batch -h`, `subaligner_convert -h`, `subaligner_train -h` and `subaligner_tune -h` for additional utilities. 
`subaligner_1pass` and `subaligner_2pass` are shortcuts for running `subaligner` with `-m single` and `-m dual` options, respectively. ![](figures/screencast.gif) diff --git a/site/source/advanced_usage.rst b/site/source/advanced_usage.rst index 33c8716..952ed59 100644 --- a/site/source/advanced_usage.rst +++ b/site/source/advanced_usage.rst @@ -64,8 +64,6 @@ is present, make sure the folder passed in is empty. (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -tod training_output_directory (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -tod training_output_directory - (.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt -tod training_output_directory - (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt -tod training_output_directory To apply your trained model to subtitle alignment, pass in the training_output_directory containing training results as shown above with `-tod` or `--training_output_directory`. diff --git a/site/source/installation.rst b/site/source/installation.rst index 1568f23..8e8da43 100644 --- a/site/source/installation.rst +++ b/site/source/installation.rst @@ -53,11 +53,11 @@ Installation $ pipenv install 'subaligner[stretch]' $ pipenv install 'subaligner[dev]' -**Use dockerised installation**:: +**Container Support**:: $ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner bash -The following builds are available on dockerhub for several Linux distributions: CentOS 7 (latest and VERSION.el7), CentOS 8 (VERSION.el8), Ubuntu 18 (VERSION.u18), Ubuntu 20 (VERSION.u20), Debian 10 (VERSION.deb10), Fedora 31 (VERSION.fed31) and ArchLinux (VERSION.arch). +Users may prefer using a containerised environment over installing everything locally. The following builds are available on dockerhub for several Linux distributions: CentOS 7 (latest and VERSION.el7), CentOS 8 (VERSION.el8), Ubuntu 18 (VERSION.u18), Ubuntu 20 (VERSION.u20), Debian 10 (VERSION.deb10), Fedora 31 (VERSION.fed31) and ArchLinux (VERSION.arch). You can also download the latest release on `GitHub `_ and follow the steps down below @@ -72,8 +72,8 @@ to create a virtual environment and set up all the dependencies: **Subaligner CLI should be on your PATH now**:: (.venv) $ subaligner --help - (.venv) $ subaligner_1pass --help - (.venv) $ subaligner_2pass --help + (.venv) $ subaligner_1pass --help # shortcut for "subaligner -m single" + (.venv) $ subaligner_2pass --help # shortcut for "subaligner -m dual" (.venv) $ subaligner_batch --help (.venv) $ subaligner_convert --help (.venv) $ subaligner_train --help diff --git a/site/source/usage.rst b/site/source/usage.rst index b0cbf5f..b67ab88 100644 --- a/site/source/usage.rst +++ b/site/source/usage.rst @@ -14,18 +14,11 @@ Make sure you have got the virtual environment activated upfront. 
**Single-stage alignment (high-level shift with lower latency)**:: - (.venv) $ subaligner_1pass -v video.mp4 -s subtitle.srt - (.venv) $ subaligner_1pass -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt + (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt + (.venv) $ subaligner -m single -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt **Dual-stage alignment (low-level shift with higher latency)**:: - (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt - (.venv) $ subaligner_2pass -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt - -**Pass in single-stage or dual-stage as the alignment mode**:: - - (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt - (.venv) $ subaligner -m single -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt (.venv) $ subaligner -m dual -v https://example.org/video.mp4 -s https://example.org/subtitle.srt -o subtitle_aligned.srt @@ -72,10 +65,10 @@ Make sure you have got the virtual environment activated upfront. **Run alignments with the docker image**:: $ docker pull baxtree/subaligner - $ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner_1pass -v video.mp4 -s subtitle.srt - $ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner_2pass -v video.mp4 -s subtitle.srt - $ docker run -it baxtree/subaligner subaligner_1pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt - $ docker run -it baxtree/subaligner subaligner_2pass -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt + $ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner -m single -v video.mp4 -s subtitle.srt + $ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner -m dual -v video.mp4 -s subtitle.srt + $ docker run -it baxtree/subaligner subaligner -m single -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt + $ docker run -it baxtree/subaligner subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt **Run alignments with pipx**:: @@ -86,22 +79,16 @@ Make sure you have got the virtual environment activated upfront. $ python -m subaligner -m single -v video.mp4 -s subtitle.srt $ python -m subaligner -m dual -v video.mp4 -s subtitle.srt - $ python -m subaligner.subaligner_1pass -v video.mp4 -s subtitle.srt - $ python -m subaligner.subaligner_2pass -v video.mp4 -s subtitle.srt Currently the stretching is experimental and make sure subaligner[stretch] is installed before switching it on with `-so` or `--stretch_on` as shown below. 
**Switch on stretching when aligning subtitles**:: - (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt -so - or (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -so **Save the aligned subtitle to a specific location**:: - (.venv) $ subaligner_2pass -v video.mp4 -s subtitle.srt -o /path/to/the/output/subtitle.srt - or (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -o /path/to/the/output/subtitle.srt **On Windows**:: diff --git a/subaligner/__init__.py b/subaligner/__init__.py index e3206b2..ae79fcb 100644 --- a/subaligner/__init__.py +++ b/subaligner/__init__.py @@ -1,7 +1,12 @@ import os +import warnings import multiprocessing as mp from ._version import __version__ __all__ = ["__version__"] + +warnings.filterwarnings("ignore") +warnings.simplefilter("ignore") + mp.set_start_method("spawn", force=True) os.environ["KMP_WARNINGS"] = "0" diff --git a/subaligner/__main__.py b/subaligner/__main__.py index 4afa02f..63cc178 100755 --- a/subaligner/__main__.py +++ b/subaligner/__main__.py @@ -4,14 +4,15 @@ [-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-os OFFSET_SECONDS] [-ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] - [-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}] [-lgs] [-d] [-q] [-ver] + [-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}] [-tr {helsinki-nlp,whisper}] [-tf TRANSLATION_FLAVOUR] [-lgs] + [-d] [-q] [-ver] Subaligner command line interface optional arguments: -h, --help show this help message and exit -s SUBTITLE_PATH [SUBTITLE_PATH ...], --subtitle_path SUBTITLE_PATH [SUBTITLE_PATH ...] 
- File path or URL to the subtitle file (Extensions of supported subtitles: .ssa, .vtt, .srt, .txt, .smi, .ytt, .sub, .xml, .sbv, .ass, .sami, .scc, .tmp, .stl, .ttml, .dfxp) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) + File path or URL to the subtitle file (Extensions of supported subtitles: .ttml, .sub, .ytt, .smi, .sami, .tmp, .txt, .ssa, .vtt, .stl, .xml, .ass, .scc, .dfxp, .sbv, .srt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) -l MAX_LOGLOSS, --max_logloss MAX_LOGLOSS Max global log loss for alignment -so, --stretch_on Switch on stretch on subtitles) @@ -32,7 +33,11 @@ -mr {whisper}, --llm_recipe {whisper} LLM recipe used for transcribing video files -mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}, --llm_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large} - Flavour variation for a specific LLM recipe + Flavour variation for a specific LLM recipe supporting transcription + -tr {helsinki-nlp,whisper}, --translation_recipe {helsinki-nlp,whisper} + LLM recipe used for translating subtitles + -tf TRANSLATION_FLAVOUR, --translation_flavour TRANSLATION_FLAVOUR + Flavour variation for a specific LLM recipe supporting translation -lgs, --languages Print out language codes used for stretch and translation -d, --debug Print out debugging information -q, --quiet Switch off logging information @@ -152,21 +157,40 @@ def main(): choices=Utils.get_stretch_language_codes(), help="Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]", ) + from subaligner.llm import TranscriptionRecipe + from subaligner.llm import WhisperFlavour parser.add_argument( "-mr", "--llm_recipe", type=str.lower, - default="whisper", - choices=["whisper"], + default=TranscriptionRecipe.WHISPER.value, + choices=[r.value for r in TranscriptionRecipe], help="LLM recipe used for transcribing video files" ) parser.add_argument( "-mf", "--llm_flavour", type=str.lower, - default="small", - choices=["tiny", "tiny.en", "small", "medium", "medium.en", "base", "base.en", "large-v1", "large-v2", "large"], - help="Flavour variation for a specific LLM recipe" + default=WhisperFlavour.SMALL.value, + choices=[wf.value for wf in WhisperFlavour], + help="Flavour variation for a specific LLM recipe supporting transcription" + ) + from subaligner.llm import TranslationRecipe + from subaligner.llm import HelsinkiNLPFlavour + parser.add_argument( + "-tr", + "--translation_recipe", + type=str.lower, + default=TranslationRecipe.HELSINKI_NLP.value, + choices=[r.value for r in TranslationRecipe], + help="LLM recipe used for translating subtitles" + ) + parser.add_argument( + "-tf", + "--translation_flavour", + type=str.lower, + default=None, + help="Flavour variation for a specific LLM recipe supporting translation" ) parser.add_argument("-lgs", "--languages", action="store_true", help="Print out language codes used for stretch and translation") @@ -312,8 +336,8 @@ def main(): if FLAGS.translate is not None: from subaligner.translator import Translator source, target = FLAGS.translate.split(",") - translator = Translator(source, target) - aligned_subs = translator.translate(aligned_subs) + translator = Translator(src_language=source, tgt_language=target, recipe=FLAGS.translation_recipe, flavour=FLAGS.translation_flavour) + aligned_subs = translator.translate(aligned_subs, local_video_path) 
Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate, "utf-8") elif FLAGS.mode == "transcribe": diff --git a/subaligner/_version.py b/subaligner/_version.py index 98c74dc..7945cf2 100644 --- a/subaligner/_version.py +++ b/subaligner/_version.py @@ -1,2 +1,2 @@ """The semver for the current release.""" -__version__ = "0.3.0" +__version__ = "0.3.1" diff --git a/subaligner/exception.py b/subaligner/exception.py index cb35725..c686b4d 100644 --- a/subaligner/exception.py +++ b/subaligner/exception.py @@ -10,5 +10,9 @@ class NoFrameRateException(Exception): """ An exception raised due to frame rate not found.""" +class TranslationException(Exception): + """ An exception raised due to translation failures.""" + + class TranscriptionException(Exception): """ An exception raised due to transcription failures.""" diff --git a/subaligner/llm.py b/subaligner/llm.py new file mode 100644 index 0000000..7d7aecf --- /dev/null +++ b/subaligner/llm.py @@ -0,0 +1,29 @@ +from enum import Enum + + +class TranscriptionRecipe(Enum): + WHISPER = "whisper" + + +class TranslationRecipe(Enum): + HELSINKI_NLP = "helsinki-nlp" + WHISPER = "whisper" + + +class WhisperFlavour(Enum): + TINY = "tiny" + TINY_EN = "tiny.en" + SMALL = "small" + MEDIUM = "medium" + MEDIUM_EN = "medium.en" + BASE = "base" + BASE_EN = "base.en" + LARGE_V1 = "large-v1" + LARGE_V2 = "large-v2" + LARGE = "large" + + +class HelsinkiNLPFlavour(Enum): + OPUS_MT = "Helsinki-NLP/opus-mt-{}-{}" + OPUS_MT_TC_BIG = "Helsinki-NLP/opus-mt-tc-big-{}-{}" + OPUS_TATOEBA = "Helsinki-NLP/opus-tatoeba-{}-{}" diff --git a/subaligner/transcriber.py b/subaligner/transcriber.py index 3dc2b77..0a0481a 100644 --- a/subaligner/transcriber.py +++ b/subaligner/transcriber.py @@ -1,13 +1,15 @@ import os import whisper -from enum import Enum from typing import Tuple, Optional from pysrt import SubRipTime from whisper.tokenizer import LANGUAGES from .translator import Translator from .subtitle import Subtitle from .media_helper import MediaHelper +from .llm import TranscriptionRecipe, WhisperFlavour +from .singleton import Singleton from .logger import Logger +from .utils import Utils from .exception import NoFrameRateException, TranscriptionException @@ -15,7 +17,7 @@ class Transcriber(object): """Transcribe audiovisual content for subtitle generation. """ - def __init__(self, recipe: str = "whisper", flavour: str = "small") -> None: + def __init__(self, recipe: str = TranscriptionRecipe.WHISPER.value, flavour: str = WhisperFlavour.SMALL.value) -> None: """Initialiser for the transcribing process. Arguments: @@ -24,14 +26,14 @@ def __init__(self, recipe: str = "whisper", flavour: str = "small") -> None: Raises: NotImplementedError -- Thrown when the LLM recipe is unknown. """ - if recipe not in [r.value for r in Recipe]: + if recipe not in [r.value for r in TranscriptionRecipe]: raise NotImplementedError(f"Unknown recipe: {recipe}") - if recipe == Recipe.whisper.value: + if recipe == TranscriptionRecipe.WHISPER.value: if flavour not in [f.value for f in WhisperFlavour]: raise NotImplementedError(f"Unknown {recipe} flavour: {flavour}") self.__model = whisper.load_model(flavour) - self.recipe = recipe - self.flavour = flavour + self.__recipe = recipe + self.__flavour = flavour self.__media_helper = MediaHelper() self.__LOGGER = Logger().get_logger(__name__) @@ -45,10 +47,10 @@ def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle TranscriptionException -- Thrown when transcription is failed. 
NotImplementedError -- Thrown when the LLM recipe is not supported. """ - if self.recipe == "whisper": - lang = Translator.get_iso_639_alpha_2(language_code) + if self.__recipe == "whisper": + lang = Utils.get_iso_639_alpha_2(language_code) if lang not in LANGUAGES: - raise TranscriptionException(f'"{language_code}" is not supported by {self.recipe} ({self.flavour})') + raise TranscriptionException(f'"{language_code}" is not supported by {self.__recipe} ({self.__flavour})') audio_file_path = self.__media_helper.extract_audio(video_file_path, True, 16000) try: audio = whisper.load_audio(audio_file_path) @@ -58,7 +60,7 @@ def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle srt_str = "" for i, segment in enumerate(result["segments"], start=1): srt_str += f"{i}\n" \ - f"{self.__format_timestamp(segment['start'])} --> {self.__format_timestamp(segment['end'])}\n" \ + f"{Utils.format_timestamp(segment['start'])} --> {Utils.format_timestamp(segment['end'])}\n" \ f"{segment['text'].strip().replace('-->', '->')}\n" \ "\n" subtitle = Subtitle.load_subrip_str(srt_str) @@ -69,20 +71,7 @@ def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle if os.path.exists(audio_file_path): os.remove(audio_file_path) else: - raise NotImplementedError(f"{self.recipe} ({self.flavour}) is not supported") - - @staticmethod - def __format_timestamp(seconds: float) -> str: - assert seconds >= 0, "non-negative timestamp expected" - milliseconds = round(seconds * 1000.0) - hours = milliseconds // 3_600_000 - milliseconds -= hours * 3_600_000 - minutes = milliseconds // 60_000 - milliseconds -= minutes * 60_000 - seconds = milliseconds // 1_000 - milliseconds -= seconds * 1_000 - hours_marker = f"{hours:02d}:" - return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}" + raise NotImplementedError(f"{self.__recipe} ({self.__flavour}) is not supported") def __on_frame_timecodes(self, subtitle: Subtitle, video_file_path: str) -> Tuple[Subtitle, Optional[float]]: frame_rate = None @@ -99,20 +88,3 @@ def __on_frame_timecodes(self, subtitle: Subtitle, video_file_path: str) -> Tupl except NoFrameRateException: self.__LOGGER.warning("Cannot detect the frame rate for %s" % video_file_path) return subtitle, frame_rate - - -class Recipe(str, Enum): - whisper = "whisper" - - -class WhisperFlavour(str, Enum): - tiny = "tiny" - tiny_en = "tiny.en" - small = "small" - medium = "medium" - medium_en = "medium.en" - base = "base" - base_en = "base.en" - large_v1 = "large-v1" - large_v2 = "large-v2" - large = "large" diff --git a/subaligner/translator.py b/subaligner/translator.py index 28277fa..723183e 100644 --- a/subaligner/translator.py +++ b/subaligner/translator.py @@ -1,23 +1,25 @@ import math -import pycountry import time +import whisper from copy import deepcopy +from typing import List, Generator, Optional from pysrt import SubRipItem from tqdm import tqdm from transformers import MarianMTModel, MarianTokenizer -from typing import List, Generator +from whisper.tokenizer import LANGUAGES from .singleton import Singleton +from .llm import TranslationRecipe, HelsinkiNLPFlavour, WhisperFlavour +from .utils import Utils +from .subtitle import Subtitle from .logger import Logger +from .exception import TranslationException -class Translator(metaclass=Singleton): +class Translator(object): """Translate subtitles. 
""" __TENSOR_TYPE = "pt" - __OPUS_MT = "Helsinki-NLP/opus-mt-{}-{}" - __OPUS_MT_TC_BIG = "Helsinki-NLP/opus-mt-tc-big-{}-{}" - __OPUS_TATOEBA = "Helsinki-NLP/opus-tatoeba-{}-{}" __TRANSLATING_BATCH_SIZE = 10 __LANGUAGE_CODE_MAPPER = { "bos": "zls", @@ -46,41 +48,29 @@ class Translator(metaclass=Singleton): "jpn-eng": "jap-eng" } - def __init__(self, src_language, tgt_language) -> None: + def __init__(self, + src_language: str, + tgt_language: str, + recipe: str = TranslationRecipe.HELSINKI_NLP.value, + flavour: Optional[str] = None) -> None: """Initialiser for the subtitle translation. Arguments: src_language {string} -- The source language code derived from ISO 639-3. tgt_language {string} -- The target language code derived from ISO 639-3. + recipe {string} -- the LLM recipe used for transcribing video files (default: "helsinki-nlp"). + flavour {string} -- the flavour variation for a specific LLM recipe (default: None). Raises: NotImplementedError -- Thrown when the model of the specified language pair is not found. """ self.__LOGGER = Logger().get_logger(__name__) - self.__initialise_model(src_language, tgt_language) - - @staticmethod - def get_iso_639_alpha_2(language_code: str) -> str: - """Find the alpha 2 language code based on an alpha 3 one. - - Arguments: - language_code {string} -- An alpha 3 language code derived from ISO 639-3. - - Returns: - string -- The alpha 2 language code if exists otherwise the alpha 3 one. - - Raises: - ValueError -- Thrown when the input language code cannot be recognised. - """ - - lang = pycountry.languages.get(alpha_3=language_code) - if lang is None: - return language_code - elif hasattr(lang, "alpha_2"): - return lang.alpha_2 - else: - return lang.alpha_3 + if recipe not in [r.value for r in TranslationRecipe]: + raise NotImplementedError(f"Unknown recipe: {recipe}") + self.__recipe = recipe + self.__tgt_language = tgt_language + self.__initialise_model(src_language, tgt_language, recipe, flavour) @staticmethod def normalise_single(language_code: str) -> str: @@ -112,129 +102,106 @@ def normalise_pair(src_language: str, tgt_language: str) -> List[str]: else: return [src_language, tgt_language] - def translate(self, subs: List[SubRipItem]) -> List[SubRipItem]: + def translate(self, subs: List[SubRipItem], video_file_path: Optional[str] = None) -> List[SubRipItem]: """Translate a list of subtitle cues. Arguments: subs {list} -- A list of SubRipItems. + video_file_path {string} -- The input video file path (default: None).. Returns: {list} -- A list of new SubRipItems holding the translation results. """ - translated_texts = [] - self.lang_model.eval() - new_subs = deepcopy(subs) - src_texts = [sub.text for sub in new_subs] - num_of_batches = math.ceil(len(src_texts) / Translator.__TRANSLATING_BATCH_SIZE) - self.__LOGGER.info("Translating %s subtitle cue(s)..." 
% len(src_texts)) - for batch in tqdm(Translator.__batch(src_texts, Translator.__TRANSLATING_BATCH_SIZE), total=num_of_batches): - input_ids = self.tokenizer(batch, return_tensors=Translator.__TENSOR_TYPE, padding=True) - translated = self.lang_model.generate(**input_ids) - translated_texts.extend([self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]) - for index in range(len(new_subs)): - new_subs[index].text = translated_texts[index] - self.__LOGGER.info("Subtitle translated") - return new_subs - - def __initialise_model(self, src_lang: str, tgt_lang: str) -> None: - src_lang = Translator.normalise_single(src_lang) - tgt_lang = Translator.normalise_single(tgt_lang) - src_lang, tgt_lang = Translator.normalise_pair(src_lang, tgt_lang) - - if self.__download_mt_model(src_lang, tgt_lang): - return - elif self.__download_mt_tc_big_model(src_lang, tgt_lang): - return - elif self.__download_tatoeba_model(src_lang, tgt_lang): - return + if self.__recipe == TranslationRecipe.HELSINKI_NLP.value: + translated_texts = [] + self.lang_model.eval() + new_subs = deepcopy(subs) + src_texts = [sub.text for sub in new_subs] + num_of_batches = math.ceil(len(src_texts) / Translator.__TRANSLATING_BATCH_SIZE) + self.__LOGGER.info("Translating %s subtitle cue(s)..." % len(src_texts)) + for batch in tqdm(Translator.__batch(src_texts, Translator.__TRANSLATING_BATCH_SIZE), total=num_of_batches): + input_ids = self.tokenizer(batch, return_tensors=Translator.__TENSOR_TYPE, padding=True) + translated = self.lang_model.generate(**input_ids) + translated_texts.extend([self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]) + for index in range(len(new_subs)): + new_subs[index].text = translated_texts[index] + self.__LOGGER.info("Subtitle translated") + return new_subs + elif self.__recipe == TranslationRecipe.WHISPER.value: + assert video_file_path is not None + lang = Utils.get_iso_639_alpha_2(self.__tgt_language) + if lang not in LANGUAGES: + raise TranslationException(f'"{self.__tgt_language}" is not supported by {self.__recipe}') + audio = whisper.load_audio(video_file_path) + self.__LOGGER.debug("Start translating the audio...") + result = self.lang_model.transcribe(audio, task="translate", language=LANGUAGES[lang]) + self.__LOGGER.info("Finished translating the audio") + srt_str = "" + for i, segment in enumerate(result["segments"], start=1): + srt_str += f"{i}\n" \ + f"{Utils.format_timestamp(segment['start'])} --> {Utils.format_timestamp(segment['end'])}\n" \ + f"{segment['text'].strip().replace('-->', '->')}\n" \ + "\n" + subtitle = Subtitle.load_subrip_str(srt_str) + return subtitle.subs else: - message = 'Cannot find the MT model for source language "{}" and destination language "{}"'.format(src_lang, tgt_lang) - self.__LOGGER.error(message) - raise NotImplementedError(message) - - def __download_mt_model(self, src_lang: str, tgt_lang: str) -> bool: + return [] + + def __initialise_model(self, src_lang: str, tgt_lang: str, recipe: str, flavour: Optional[str]) -> None: + if recipe == TranslationRecipe.HELSINKI_NLP.value: + src_lang = Translator.normalise_single(src_lang) + tgt_lang = Translator.normalise_single(tgt_lang) + src_lang, tgt_lang = Translator.normalise_pair(src_lang, tgt_lang) + + if self.__download_mt_model(src_lang, tgt_lang, HelsinkiNLPFlavour.OPUS_MT.value): + return + elif self.__download_mt_model(src_lang, tgt_lang, HelsinkiNLPFlavour.OPUS_TATOEBA.value): + return + elif self.__download_mt_model(src_lang, tgt_lang, HelsinkiNLPFlavour.OPUS_MT_TC_BIG.value): 
+ return + else: + message = 'Cannot find the MT model for source language "{}" and destination language "{}"'.format(src_lang, tgt_lang) + self.__LOGGER.error(message) + raise NotImplementedError(message) + elif recipe == TranslationRecipe.WHISPER.value: + if flavour in [f.value for f in WhisperFlavour]: + # self.__download_whisper_model(flavour) + self.__download_whisper_model("medium") # works for translation target other than English + else: + raise NotImplementedError(f"Unknown {recipe} flavour: {flavour}") + + def __download_mt_model(self, src_lang: str, tgt_lang: str, flavour: str) -> bool: try: - mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(tgt_lang)) - self.__download(mt_model_name) + mt_model_name = flavour.format(Utils.get_iso_639_alpha_2(src_lang), Utils.get_iso_639_alpha_2(tgt_lang)) + self.__download_by_mt_name(mt_model_name) return True except OSError: self.__log_and_back_off(mt_model_name) try: - mt_model_name = Translator.__OPUS_MT.format(src_lang, Translator.get_iso_639_alpha_2(tgt_lang)) - self.__download(mt_model_name) + mt_model_name = flavour.format(src_lang, Utils.get_iso_639_alpha_2(tgt_lang)) + self.__download_by_mt_name(mt_model_name) return True except OSError: self.__log_and_back_off(mt_model_name) try: - mt_model_name = Translator.__OPUS_MT.format(Translator.get_iso_639_alpha_2(src_lang), tgt_lang) - self.__download(mt_model_name) + mt_model_name = flavour.format(Utils.get_iso_639_alpha_2(src_lang), tgt_lang) + self.__download_by_mt_name(mt_model_name) return True except OSError: self.__log_and_back_off(mt_model_name) try: - mt_model_name = Translator.__OPUS_MT.format(src_lang, tgt_lang) - self.__download(mt_model_name) + mt_model_name = flavour.format(src_lang, tgt_lang) + self.__download_by_mt_name(mt_model_name) return True except OSError: self.__log_and_back_off(mt_model_name) return False - def __download_mt_tc_big_model(self, src_lang: str, tgt_lang: str) -> bool: - try: - mt_tc_model_name = Translator.__OPUS_MT_TC_BIG.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(tgt_lang)) - self.__download(mt_tc_model_name) - return True - except OSError: - self.__log_and_back_off(mt_tc_model_name) - try: - mt_tc_model_name = Translator.__OPUS_MT_TC_BIG.format(src_lang, Translator.get_iso_639_alpha_2(tgt_lang)) - self.__download(mt_tc_model_name) - return True - except OSError: - self.__log_and_back_off(mt_tc_model_name) - try: - mt_tc_model_name = Translator.__OPUS_MT_TC_BIG.format(Translator.get_iso_639_alpha_2(src_lang), tgt_lang) - self.__download(mt_tc_model_name) - return True - except OSError: - self.__log_and_back_off(mt_tc_model_name) - try: - mt_tc_model_name = Translator.__OPUS_MT_TC_BIG.format(src_lang, tgt_lang) - self.__download(mt_tc_model_name) - return True - except OSError: - self.__log_and_back_off(mt_tc_model_name) - return False - - def __download_tatoeba_model(self, src_lang: str, tgt_lang: str) -> bool: - try: - mt_model_name = Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), Translator.get_iso_639_alpha_2(tgt_lang)) - self.__download(mt_model_name) - return True - except OSError: - self.__log_and_back_off(mt_model_name) - try: - mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, Translator.get_iso_639_alpha_2(tgt_lang)) - self.__download(mt_model_name) - return True - except OSError: - self.__log_and_back_off(mt_model_name) - try: - mt_model_name = 
Translator.__OPUS_TATOEBA.format(Translator.get_iso_639_alpha_2(src_lang), tgt_lang) - self.__download(mt_model_name) - return True - except OSError: - self.__log_and_back_off(mt_model_name) - try: - mt_model_name = Translator.__OPUS_TATOEBA.format(src_lang, tgt_lang) - self.__download(mt_model_name) - return True - except OSError: - self.__log_and_back_off(mt_model_name) - return False + def __download_whisper_model(self, flavour: str) -> None: + self.lang_model = whisper.load_model(flavour) - def __download(self, mt_model_name: str) -> None: + def __download_by_mt_name(self, mt_model_name: str) -> None: self.__LOGGER.debug("Trying to download the MT model %s" % mt_model_name) self.tokenizer = MarianTokenizer.from_pretrained(mt_model_name) self.lang_model = MarianMTModel.from_pretrained(mt_model_name) diff --git a/subaligner/utils.py b/subaligner/utils.py index 29aec66..9375b16 100644 --- a/subaligner/utils.py +++ b/subaligner/utils.py @@ -5,6 +5,7 @@ import shutil import cchardet import shlex +import pycountry from pycaption import ( CaptionConverter, @@ -652,6 +653,41 @@ def get_language_table() -> List[str]: 'sem', 'sit', 'sla', 'srn', 'ssp', 'swc', 'taw', 'tdt', 'tiv', 'tll', 'toi', 'tpi', 'trk', 'tum', 'tut', 'tvl', 'tzo', 'umb', 'urj', 'vsl', 'wal', 'war', 'wls', 'yap', 'yua', 'zai', 'zle', 'zls', 'zlw', 'zne'] + @staticmethod + def get_iso_639_alpha_2(language_code: str) -> str: + """Find the alpha 2 language code based on an alpha 3 one. + + Arguments: + language_code {string} -- An alpha 3 language code derived from ISO 639-3. + + Returns: + string -- The alpha 2 language code if exists otherwise the alpha 3 one. + + Raises: + ValueError -- Thrown when the input language code cannot be recognised. + """ + + lang = pycountry.languages.get(alpha_3=language_code) + if lang is None: + return language_code + elif hasattr(lang, "alpha_2"): + return lang.alpha_2 + else: + return lang.alpha_3 + + @staticmethod + def format_timestamp(seconds: float) -> str: + assert seconds >= 0, "non-negative timestamp expected" + milliseconds = round(seconds * 1000.0) + hours = milliseconds // 3_600_000 + milliseconds -= hours * 3_600_000 + minutes = milliseconds // 60_000 + milliseconds -= minutes * 60_000 + seconds = milliseconds // 1_000 + milliseconds -= seconds * 1_000 + hours_marker = f"{hours:02d}:" + return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}" + @staticmethod def __convert_subtitle(source_file_path: str, source_ext: str, target_file_path: Optional[str], target_ext: str, format: str, frame_rate: Optional[float] = None) -> Tuple[str, str]: encoding = Utils.detect_encoding(source_file_path) diff --git a/tests/subaligner/test_transcriber.py b/tests/subaligner/test_transcriber.py index 95d2de7..94bd124 100644 --- a/tests/subaligner/test_transcriber.py +++ b/tests/subaligner/test_transcriber.py @@ -1,5 +1,6 @@ import os import unittest +from subaligner.llm import TranscriptionRecipe, WhisperFlavour from subaligner.transcriber import Transcriber as Undertest from subaligner.exception import TranscriptionException @@ -8,7 +9,7 @@ class TranscriberTest(unittest.TestCase): def setUp(self) -> None: self.video_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resource", "test.mp4") - self.undertest = Undertest(recipe="whisper", flavour="tiny") + self.undertest = Undertest(recipe=TranscriptionRecipe.WHISPER.value, flavour=WhisperFlavour.TINY.value) def test_transcribe(self): subtitle, frame_rate = self.undertest.transcribe(self.video_file_path, "eng") @@ -26,7 
+27,7 @@ def test_throw_exception_on_unknown_recipe(self): def test_throw_exception_on_unknown_flavour(self): try: - Undertest(recipe="whisper", flavour="unknown") + Undertest(recipe=TranscriptionRecipe.WHISPER.value, flavour="unknown") except Exception as e: self.assertTrue(isinstance(e, NotImplementedError)) self.assertEqual(str(e), "Unknown whisper flavour: unknown") diff --git a/tests/subaligner/test_translator.py b/tests/subaligner/test_translator.py index 562ee97..a116f2f 100644 --- a/tests/subaligner/test_translator.py +++ b/tests/subaligner/test_translator.py @@ -13,11 +13,6 @@ def setUp(self): os.path.dirname(os.path.abspath(__file__)), "resource/test.srt" ) - def test_get_iso_639_alpha_2(self): - self.assertEqual("en", Undertest.get_iso_639_alpha_2("eng")) - self.assertEqual("ada", Undertest.get_iso_639_alpha_2("ada")) - self.assertEqual("xyz", Undertest.get_iso_639_alpha_2("xyz")) - @patch("transformers.MarianMTModel.from_pretrained") @patch("transformers.MarianTokenizer.from_pretrained") def test_translate(self, tokenizer_from_pretrained, model_from_pretrained): diff --git a/tests/subaligner/test_utils.py b/tests/subaligner/test_utils.py index 7b1f2a0..7581765 100644 --- a/tests/subaligner/test_utils.py +++ b/tests/subaligner/test_utils.py @@ -298,6 +298,20 @@ def test_get_misc_language_codes(self): def test_get_language_table(self): self.assertEqual(200, len(Undertest.get_language_table())) + def test_get_iso_639_alpha_2(self): + self.assertEqual("en", Undertest.get_iso_639_alpha_2("eng")) + self.assertEqual("ada", Undertest.get_iso_639_alpha_2("ada")) + self.assertEqual("xyz", Undertest.get_iso_639_alpha_2("xyz")) + + def test_format_timestamp(self): + test_cases = [ + (0, "00:00:00,000"), + (100, "00:01:40,000"), + (100.1, "00:01:40,100"), + ] + for seconds, time_code in test_cases: + self.assertEqual(time_code, Undertest.format_timestamp(seconds)) + @patch("subprocess.Popen.communicate", return_value=1) def test_throw_exception_on_srt2vtt_with_error_code(self, mock_communicate): self._assert_exception_on_subproces(lambda: Undertest.srt2vtt(self.real_srt_path, "output"), mock_communicate) From 307a325cb172082fd0da13a5b6985d88db806475 Mon Sep 17 00:00:00 2001 From: baxtree Date: Fri, 17 Mar 2023 09:38:34 +0000 Subject: [PATCH 09/20] issue-78 escape quotes in names of files fed into ffmpeg --- subaligner/media_helper.py | 20 +++++++++++--------- subaligner/utils.py | 14 +++++++++----- tests/subaligner/test_utils.py | 6 +++++- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/subaligner/media_helper.py b/subaligner/media_helper.py index cd91e54..97f669a 100644 --- a/subaligner/media_helper.py +++ b/subaligner/media_helper.py @@ -16,6 +16,7 @@ from .exception import TerminalException from .exception import NoFrameRateException from .logger import Logger +from .utils import Utils TEMP_DIR_PATH = tempfile.mkdtemp() @@ -73,14 +74,15 @@ def extract_audio(self, video_file_path, decompress: bool = False, freq: int = 1 ) command = ( - "{0} -y -xerror -i '{1}' -ac 2 -ar {2} -vn '{3}'".format( - self.FFMPEG_BIN, video_file_path, freq, audio_file_path + "{0} -y -xerror -i {1} -ac 2 -ar {2} -vn {3}".format( + self.FFMPEG_BIN, Utils.double_quoted(video_file_path), freq, Utils.double_quoted(audio_file_path) ) if decompress - else "{0} -y -xerror -i '{1}' -vn -acodec copy '{2}'".format( - self.FFMPEG_BIN, video_file_path, audio_file_path + else "{0} -y -xerror -i {1} -vn -acodec copy {2}".format( + self.FFMPEG_BIN, Utils.double_quoted(video_file_path), 
Utils.double_quoted(audio_file_path) ) ) + print(command) with subprocess.Popen( shlex.split(command), shell=False, @@ -182,12 +184,12 @@ def extract_audio_from_start_to_end(self, audio_file_path: str, start: str, end: if end is not None: duration = self.get_duration_in_seconds(start, end) - command = "{0} -y -xerror -i '{1}' -ss {2} -t {3} -acodec copy '{4}'".format( - self.FFMPEG_BIN, audio_file_path, start, duration, segment_path + command = "{0} -y -xerror -i {1} -ss {2} -t {3} -acodec copy {4}".format( + self.FFMPEG_BIN, Utils.double_quoted(audio_file_path), start, duration, Utils.double_quoted(segment_path) ) else: - command = "{0} -y -xerror -i '{1}' -ss {2} -acodec copy '{3}'".format( - self.FFMPEG_BIN, audio_file_path, start, segment_path + command = "{0} -y -xerror -i {1} -ss {2} -acodec copy {3}".format( + self.FFMPEG_BIN, Utils.double_quoted(audio_file_path), start, Utils.double_quoted(segment_path) ) with subprocess.Popen( shlex.split(command), @@ -316,7 +318,7 @@ def get_frame_rate(self, file_path: str) -> float: discarded = "NUL:" if os.name == "nt" else "/dev/null" with subprocess.Popen( - shlex.split("{0} -i '{1}' -t 00:00:10 -f null {2}".format(self.FFMPEG_BIN, file_path, discarded)), + shlex.split("{0} -i {1} -t 00:00:10 -f null {2}".format(self.FFMPEG_BIN, Utils.double_quoted(file_path), discarded)), shell=False, stderr=subprocess.PIPE, close_fds=True, diff --git a/subaligner/utils.py b/subaligner/utils.py index 9375b16..fa50d0f 100644 --- a/subaligner/utils.py +++ b/subaligner/utils.py @@ -88,7 +88,7 @@ def srt2vtt(srt_file_path: str, vtt_file_path: Optional[str] = None, timeout_sec _vtt_file_path = srt_file_path.replace(".srt", ".vtt") if vtt_file_path is None else vtt_file_path encoding = Utils.detect_encoding(srt_file_path) - command = "{0} -y -sub_charenc {1} -i '{2}' -f webvtt '{3}'".format(Utils.FFMPEG_BIN, encoding, srt_file_path, _vtt_file_path) + command = "{0} -y -sub_charenc {1} -i {2} -f webvtt {3}".format(Utils.FFMPEG_BIN, encoding, Utils.double_quoted(srt_file_path), Utils.double_quoted(_vtt_file_path)) timeout_msg = "Timeout on converting SubRip to WebVTT: {}".format(srt_file_path) error_msg = "Cannot convert SubRip to WebVTT: {}".format(srt_file_path) @@ -115,7 +115,7 @@ def vtt2srt(vtt_file_path: str, srt_file_path: Optional[str] = None, timeout_sec _srt_file_path = vtt_file_path.replace(".vtt", ".srt") if srt_file_path is None else srt_file_path encoding = Utils.detect_encoding(vtt_file_path) - command = "{0} -y -sub_charenc {1} -i '{2}' -f srt '{3}'".format(Utils.FFMPEG_BIN, encoding, vtt_file_path, _srt_file_path) + command = "{0} -y -sub_charenc {1} -i {2} -f srt {3}".format(Utils.FFMPEG_BIN, encoding, Utils.double_quoted(vtt_file_path), Utils.double_quoted(_srt_file_path)) timeout_msg = "Timeout on converting WebVTT to SubRip: {}".format(vtt_file_path) error_msg = "Cannot convert WebVTT to SubRip: {}".format(vtt_file_path) @@ -492,7 +492,7 @@ def extract_teletext_as_subtitle(ts_file_path: str, page_num: int, output_file_p timeout_secs {int} -- The timeout in seconds on extraction {default: 30}. 
""" - command = "{0} -y -fix_sub_duration -txt_page {1} -txt_format text -i '{2}' '{3}'".format(Utils.FFMPEG_BIN, page_num, ts_file_path, output_file_path) + command = "{0} -y -fix_sub_duration -txt_page {1} -txt_format text -i {2} {3}".format(Utils.FFMPEG_BIN, page_num, Utils.double_quoted(ts_file_path), Utils.double_quoted(output_file_path)) timeout_msg = "Timeout on extracting Teletext from transport stream: {} on page: {}".format(ts_file_path, page_num) error_msg = "Cannot extract Teletext from transport stream: {} on page: {}".format(ts_file_path, page_num) @@ -518,7 +518,7 @@ def extract_matroska_subtitle(mkv_file_path: str, stream_index: int, output_file timeout_secs {int} -- The timeout in seconds on extraction {default: 30}. """ - command = "{0} -y -i '{1}' -map 0:s:{2} '{3}'".format(Utils.FFMPEG_BIN, mkv_file_path, stream_index, output_file_path) + command = "{0} -y -i {1} -map 0:s:{2} {3}".format(Utils.FFMPEG_BIN, Utils.double_quoted(mkv_file_path), stream_index, Utils.double_quoted(output_file_path)) timeout_msg = "Timeout on extracting the subtitle from file: {} with stream index: {}".format(mkv_file_path, stream_index) error_msg = "Cannot extract the subtitle from file: {} with stream index: {}".format(mkv_file_path, stream_index) @@ -570,7 +570,7 @@ def contains_embedded_subtitles(video_file_path: str, timeout_secs: int = 30) -> bool -- True if the video contains embedded subtitles or False otherwise. """ - command = "{0} -y -i '{1}' -c copy -map 0:s -f null - -v 0 -hide_banner".format(Utils.FFMPEG_BIN, video_file_path) + command = "{0} -y -i {1} -c copy -map 0:s -f null - -v 0 -hide_banner".format(Utils.FFMPEG_BIN, Utils.double_quoted(video_file_path)) timeout_msg = "Timeout on detecting embedded subtitles from file: {}".format(video_file_path) error_msg = "Embedded subtitle detection failed for file: {}".format(video_file_path) @@ -688,6 +688,10 @@ def format_timestamp(seconds: float) -> str: hours_marker = f"{hours:02d}:" return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}" + @staticmethod + def double_quoted(s: str) -> str: + return "\"{}\"".format(s.replace('"', "\\\"")) + @staticmethod def __convert_subtitle(source_file_path: str, source_ext: str, target_file_path: Optional[str], target_ext: str, format: str, frame_rate: Optional[float] = None) -> Tuple[str, str]: encoding = Utils.detect_encoding(source_file_path) diff --git a/tests/subaligner/test_utils.py b/tests/subaligner/test_utils.py index 7581765..da6689c 100644 --- a/tests/subaligner/test_utils.py +++ b/tests/subaligner/test_utils.py @@ -245,7 +245,7 @@ def test_ytt2srt(self): def test_extract_teletext_as_srt(self, mocked_run_command): Undertest.extract_teletext_as_subtitle("ts_file_path", 888, "srt_file_path") - mocked_run_command.assert_called_once_with("ffmpeg -y -fix_sub_duration -txt_page 888 -txt_format text -i {} {}".format("'ts_file_path'", "'srt_file_path'"), ANY, ANY, ANY, ANY) + mocked_run_command.assert_called_once_with("ffmpeg -y -fix_sub_duration -txt_page 888 -txt_format text -i {} {}".format("\"ts_file_path\"", "\"srt_file_path\""), ANY, ANY, ANY, ANY) def test_extract_matroska_subtitle(self): output_file_path = os.path.join(self.resource_tmp, "extracted.matroska.srt") @@ -312,6 +312,10 @@ def test_format_timestamp(self): for seconds, time_code in test_cases: self.assertEqual(time_code, Undertest.format_timestamp(seconds)) + def test_double_quoted(self): + self.assertEqual("\"file'path\"", Undertest.double_quoted("file'path")) + self.assertEqual("\"file\\\"path\"", 
Undertest.double_quoted("file\"path")) + @patch("subprocess.Popen.communicate", return_value=1) def test_throw_exception_on_srt2vtt_with_error_code(self, mock_communicate): self._assert_exception_on_subproces(lambda: Undertest.srt2vtt(self.real_srt_path, "output"), mock_communicate) From 87fd6e3eb932d4c8f0746b09e40ba2176a31b62f Mon Sep 17 00:00:00 2001 From: baxtree Date: Mon, 20 Mar 2023 09:29:10 +0000 Subject: [PATCH 10/20] support tensorflow 2.11 and deprecate py37 --- .github/workflows/ci-pipeline.yml | 2 +- Pipfile | 9 +++------ README.md | 11 +++++++---- requirements.txt | 10 +++------- site/source/advanced_usage.rst | 28 ++++++++++++++-------------- site/source/installation.rst | 8 ++++---- 6 files changed, 32 insertions(+), 36 deletions(-) diff --git a/.github/workflows/ci-pipeline.yml b/.github/workflows/ci-pipeline.yml index 169abcc..65bfb67 100644 --- a/.github/workflows/ci-pipeline.yml +++ b/.github/workflows/ci-pipeline.yml @@ -45,7 +45,7 @@ jobs: coverage run -m unittest discover coverage combine coverage xml - bash <(curl -s https://codecov.io/bash) + bash <(curl -s https://codecov.io/bash) -n patch -F 90 - name: Integration tests run: | radish -b tests/integration/radish tests/integration/feature diff --git a/Pipfile b/Pipfile index 934cca0..791b29b 100644 --- a/Pipfile +++ b/Pipfile @@ -22,10 +22,8 @@ sphinx = "==3.3.1" sphinx-rtd-theme = "==0.5.0" [packages] -absl-py = "~=0.10" astor = "==0.7.1" astroid = "~=2.5.6" -audioread = "==2.1.5" beautifulsoup4 = "<4.9.0" bleach = "==3.3.0" cachetools = "==3.1.1" @@ -56,11 +54,10 @@ Keras-Preprocessing = ">=1.0.9" kiwisolver = "==1.0.1" lazy-object-proxy = "==1.4.3" le-pycaption = "==2.2.0a1" -librosa = ">=0.8.0" +librosa = "<0.10.0" locket = "==0.2.0" Markdown = "==2.6.11" mccabe = "==0.6.1" -msgpack-python = "==0.5.6" numba = ">=0.50.0" numpy = "<1.24.0" oauthlib = "==3.1.0" @@ -92,7 +89,7 @@ sentencepiece = "~=0.1.95" setuptools = ">=41.0.0" six = "~=1.15.0" tblib = "==1.3.2" -tensorflow = ">=1.15.5,<2.9" +tensorflow = ">=1.15.5,<2.12" termcolor = "==1.1.0" toml = "==0.10.0" toolz = "==0.9.0" @@ -107,4 +104,4 @@ zipp = "==0.6.0" aeneas = "==1.7.3.0" [requires] -python_version = "3.7" +python_version = "3.8" diff --git a/README.md b/README.md index 9713282..7229920 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://github.com/baxtree/subaligner/actions/workflows/ci-pipeline.yml/badge.svg?branch=master)](https://github.com/baxtree/subaligner/actions/workflows/ci-pipeline.yml?query=branch%3Amaster) ![Codecov](https://img.shields.io/codecov/c/github/baxtree/subaligner) -[![Python 3.10](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org/downloads/release/python-3100/) [![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/) [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-380/) [![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/) +[![Python 3.10](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org/downloads/release/python-3100/) [![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/) [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-380/) [![Documentation 
Status](https://readthedocs.org/projects/subaligner/badge/?version=latest)](https://subaligner.readthedocs.io/en/latest/?badge=latest) [![GitHub license](https://img.shields.io/github/license/baxtree/subaligner)](https://github.com/baxtree/subaligner/blob/master/LICENSE) [![PyPI](https://badge.fury.io/py/subaligner.svg)](https://badge.fury.io/py/subaligner)
@@ -16,6 +16,8 @@ Subtitle: SubRip, TTML, WebVTT, (Advanced) SubStation Alpha, MicroDVD, MPL2, TMP
 Video/Audio: MP4, WebM, Ogg, 3GP, FLV, MOV, Matroska, MPEG TS, WAV, MP3, AAC, FLAC, etc.
 
+:information_source: Subaligner relies on file extensions as default hints to process a wide range of audiovisual or subtitle formats. It is recommended to use extensions widely accepted by the community to ensure compatibility.
+
 ## Dependencies
 Required by basic: [FFmpeg](https://www.ffmpeg.org/)
 ```
@@ -28,15 +30,16 @@ $ brew install ffmpeg
 
 ## Basic Installation
 ```
-$ pip install -U pip
+$ pip install -U pip && pip install -U setuptools
 $ pip install subaligner
 ```
 or install from source:
 ```
-$ git clone git@github.com:baxtree/subaligner.git
-$ cd subaligner
+$ git clone git@github.com:baxtree/subaligner.git && cd subaligner
+$ pip install -U pip && pip install -U setuptools
 $ python setup.py install
 ```
+:information_source: It is highly recommended to create a virtual environment prior to installation.
 
 ## Installation with Optional Packages Supporting Additional Features
 ```
diff --git a/requirements.txt b/requirements.txt
index 5d2a4a0..92c23bb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,4 @@
-absl-py~=0.10
 astor==0.7.1
-audioread==2.1.5
 beautifulsoup4<4.9.0
 bleach==3.3.0
 cachetools==3.1.1
@@ -16,7 +14,6 @@ dask<2022.1.0
 decorator==4.3.0
 distributed==1.13.0
 filelock<4.0.0
-google-auth==1.27.0
 google-auth-oauthlib==0.4.2
 google-pasta~=0.2
 graphviz==0.8.3
@@ -31,12 +28,11 @@ Keras-Preprocessing>=1.0.9
 kiwisolver==1.0.1
 lazy-object-proxy==1.4.3
 le-pycaption==2.2.0a1
-librosa>=0.8.0
+librosa<0.10.0
 locket==0.2.0
 Markdown==2.6.11
 mccabe==0.6.1
 networkx>=2.5.1
-msgpack-python==0.5.6
 numba>=0.50.0
 numpy<1.24.0
 oauthlib==3.1.0
@@ -59,11 +55,11 @@ requests~=2.25.1
 requests-oauthlib==1.3.0
 rsa==4.7
 scipy<=1.8.1
-scikit-learn~=0.24.2
+scikit-learn<1.2.0
 setuptools>=41.0.0
 six~=1.15.0
 tblib==1.3.2
-tensorflow>=1.15.5,<2.9
+tensorflow>=1.15.5,<2.12
 termcolor==1.1.0
 toml==0.10.0
 toolz==0.9.0
diff --git a/site/source/advanced_usage.rst b/site/source/advanced_usage.rst
index 952ed59..c49f424 100644
--- a/site/source/advanced_usage.rst
+++ b/site/source/advanced_usage.rst
@@ -7,7 +7,7 @@ the model can be imported and used for synchronising out-of-sync subtitles.
 
 **Start fresh training**::
 
-    (.venv) $ subaligner_train -vd av_directory -sd subtitle_directory -tod training_output_directory
+    $ subaligner_train -vd av_directory -sd subtitle_directory -tod training_output_directory
 
 Make sure each subtitle file and its companion audiovisual file are sharing the same base filename, e.g., "awesome.mp4" and "awesome.srt" share the base filename "awesome". Then split them into two separate folders, e.g.,
 the results after training is finished and make sure it is writable to Subaligner.
@@ -17,7 +17,7 @@
 
 **Resume training**::
 
-    (.venv) $ subaligner_train -vd av_directory -sd subtitle_directory -tod training_output_directory -e 200 -r
+    $ subaligner_train -vd av_directory -sd subtitle_directory -tod training_output_directory -e 200 -r
 
 Training over a large dataset is usually an expensive process and time consuming. 
You can stop the training and resume it with `-r` or `--resume` at another convenient time to enhance an existing model stored in the aforementioned training output @@ -26,14 +26,14 @@ already completed in the past. If the number is forgotten, you can pass in `-dde **Display completed epochs**:: - (.venv) $ subaligner_train -dde -tod training_output_directory + $ subaligner_train -dde -tod training_output_directory Also note that on training resumption, av_directory and subtitle_directory will be ignored due to the reuse of feature embedding by default. **Reuse embeddings**:: - (.venv) $ subaligner_train -utd -tod training_output_directory + $ subaligner_train -utd -tod training_output_directory Embeddings extracted from your media files can be reused with `-utd` or `--use_training_dump`. With that flag on, you can train a new model of another kind (instead of re-using the same model on training resumption) without going through the feature embedding process, @@ -41,7 +41,7 @@ which could take quite long to finish for a large dataset so as to be unnecessar **Ignore sound effects**:: - (.venv) $ subaligner_train -vd av_directory -sd subtitle_directory -tod training_output_directory --sound_effect_start_marker "(" --sound_effect_end_marker ")" + $ subaligner_train -vd av_directory -sd subtitle_directory -tod training_output_directory --sound_effect_start_marker "(" --sound_effect_end_marker ")" It is not uncommon that subtitles sometimes contain sound effects (e.g., "BARK", "(applause)" and "[MUSIC]", etc.). For limited training data sets and not sophisticated enough network architectures, the model usually cannot capture all the sound effects very well. @@ -51,7 +51,7 @@ For example, the above exemplary command will treat any strings starting with "( **Train with embedded subtitles**:: - (.venv) $ subaligner_train -vd av_directory -ess embedded:stream_index=0,file_extension=srt -tod training_output_directory + $ subaligner_train -vd av_directory -ess embedded:stream_index=0,file_extension=srt -tod training_output_directory If your audiovisual files all contain embedded subtitles or teletexts of the same format and have been encoded in the same fashion, `-sd` or `--subtitle_directory` can be omitted and subtitles will be extracted based on the specified subtitle selector. For instance, "embedded:stream_index=0,file_extension=srt" @@ -62,8 +62,8 @@ is present, make sure the folder passed in is empty. **Run alignments after training**:: - (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -tod training_output_directory - (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -tod training_output_directory + $ subaligner -m single -v video.mp4 -s subtitle.srt -tod training_output_directory + $ subaligner -m dual -v video.mp4 -s subtitle.srt -tod training_output_directory To apply your trained model to subtitle alignment, pass in the training_output_directory containing training results as shown above with `-tod` or `--training_output_directory`. @@ -96,7 +96,7 @@ Subaligner tune hyperparameters automatically and the how-to is shown below. **Hyperparameters tuning**:: - (.venv) $ subaligner_tune -vd av_directory -sd subtitle_directory -tod training_output_directory + $ subaligner_tune -vd av_directory -sd subtitle_directory -tod training_output_directory Subaligner has used the `Tree-structured Parzen Estimator Approach (TPE) `_ to automatically run trails on different settings of hyper-parameter values and recommend the best one. 
You can pass in the following @@ -115,17 +115,17 @@ flags to customise the configuration on tuning: **Convert the subtitle to another format**:: - (.venv) $ subaligner_convert -i subtitle.srt -o subtitle.vtt + $ subaligner_convert -i subtitle.srt -o subtitle.vtt **Convert the subtitle to another format and translate**:: - (.venv) $ subaligner_convert --languages - (.venv) $ subaligner_convert -i subtitle_en.srt -o subtitle_zh.vtt -t eng,zho + $ subaligner_convert --languages + $ subaligner_convert -i subtitle_en.srt -o subtitle_zh.vtt -t eng,zho **Translate the subtitle without changing the format**:: - (.venv) $ subaligner_convert --languages - (.venv) $ subaligner_convert -i subtitle_en.srt -o subtitle_es.srt -t eng,spa + $ subaligner_convert --languages + $ subaligner_convert -i subtitle_en.srt -o subtitle_es.srt -t eng,spa For output subtitles like MicroDVD relying on the frame rate, its value needs to be passed in with `-fr` or `--frame_rate`. diff --git a/site/source/installation.rst b/site/source/installation.rst index 8e8da43..e85391f 100644 --- a/site/source/installation.rst +++ b/site/source/installation.rst @@ -14,7 +14,7 @@ Installation **Install Subaligner via PyPI (pre-emptive NumPy)**:: - $ pip install -U pip + $ pip install -U pip && pip install -U setuptools $ pip install subaligner **Install dependencies for enabling translation**:: @@ -65,9 +65,9 @@ to create a virtual environment and set up all the dependencies: **Install Subaligner from source**:: - $ git clone git@github.com:baxtree/subaligner.git - $ cd subaligner - $ make install && source .venv/bin/activate + $ git clone git@github.com:baxtree/subaligner.git && cd subaligner + $ pip install -U pip && pip install -U setuptools + $ python setup.py install **Subaligner CLI should be on your PATH now**:: From 35786e175bb28ae051cca049dcd3adf391aee3e8 Mon Sep 17 00:00:00 2001 From: baxtree Date: Wed, 22 Mar 2023 09:41:15 +0000 Subject: [PATCH 11/20] add fb mbart models for translation --- subaligner/__main__.py | 20 +++---- subaligner/llm.py | 5 ++ subaligner/translator.py | 86 ++++++++++++++++++++++++----- tests/subaligner/test_translator.py | 26 ++++++++- 4 files changed, 111 insertions(+), 26 deletions(-) diff --git a/subaligner/__main__.py b/subaligner/__main__.py index 63cc178..d2a2d6e 100755 --- a/subaligner/__main__.py +++ b/subaligner/__main__.py @@ -4,15 +4,15 @@ [-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-os OFFSET_SECONDS] [-ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] - [-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}] [-tr {helsinki-nlp,whisper}] [-tf TRANSLATION_FLAVOUR] [-lgs] - [-d] [-q] [-ver] + [-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}] [-tr {helsinki-nlp,whisper,facebook-mbart}] + [-tf TRANSLATION_FLAVOUR] [-lgs] [-d] [-q] 
[-ver] Subaligner command line interface optional arguments: -h, --help show this help message and exit -s SUBTITLE_PATH [SUBTITLE_PATH ...], --subtitle_path SUBTITLE_PATH [SUBTITLE_PATH ...] - File path or URL to the subtitle file (Extensions of supported subtitles: .ttml, .sub, .ytt, .smi, .sami, .tmp, .txt, .ssa, .vtt, .stl, .xml, .ass, .scc, .dfxp, .sbv, .srt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) + File path or URL to the subtitle file (Extensions of supported subtitles: .ttml, .ssa, .stl, .sbv, .dfxp, .srt, .txt, .ytt, .vtt, .sub, .sami, .xml, .scc, .ass, .smi, .tmp) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0) -l MAX_LOGLOSS, --max_logloss MAX_LOGLOSS Max global log loss for alignment -so, --stretch_on Switch on stretch on subtitles) @@ -30,11 +30,11 @@ Offset by which the subtitle will be shifted -ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --main_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho} Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes] - -mr {whisper}, --llm_recipe {whisper} + -mr {whisper}, --transcription_recipe {whisper} LLM recipe used for transcribing video files - -mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}, --llm_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large} + -mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}, --transcription_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large} Flavour variation for a specific LLM recipe supporting transcription - -tr {helsinki-nlp,whisper}, --translation_recipe {helsinki-nlp,whisper} + -tr {helsinki-nlp,whisper,facebook-mbart}, --translation_recipe {helsinki-nlp,whisper,facebook-mbart} LLM recipe used for translating subtitles -tf TRANSLATION_FLAVOUR, --translation_flavour TRANSLATION_FLAVOUR Flavour variation for a specific LLM recipe supporting translation @@ -161,7 +161,7 @@ def main(): from subaligner.llm import WhisperFlavour parser.add_argument( "-mr", - "--llm_recipe", + "--transcription_recipe", type=str.lower, default=TranscriptionRecipe.WHISPER.value, choices=[r.value for r in TranscriptionRecipe], @@ -169,7 +169,7 @@ def main(): ) parser.add_argument( "-mf", - "--llm_flavour", + "--transcription_flavour", type=str.lower, default=WhisperFlavour.SMALL.value, choices=[wf.value for wf in WhisperFlavour], @@ -322,7 +322,7 @@ def main(): ) elif FLAGS.mode == "transcribe": from subaligner.transcriber import Transcriber - transcriber = Transcriber(recipe=FLAGS.llm_recipe, flavour=FLAGS.llm_flavour) + transcriber = Transcriber(recipe=FLAGS.transcription_recipe, flavour=FLAGS.transcription_flavour) subtitle, frame_rate = transcriber.transcribe(local_video_path, stretch_in_lang) aligned_subs = subtitle.subs else: @@ -337,7 
+337,7 @@ def main(): from subaligner.translator import Translator source, target = FLAGS.translate.split(",") translator = Translator(src_language=source, tgt_language=target, recipe=FLAGS.translation_recipe, flavour=FLAGS.translation_flavour) - aligned_subs = translator.translate(aligned_subs, local_video_path) + aligned_subs = translator.translate(aligned_subs, local_video_path, (source, target)) Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate, "utf-8") elif FLAGS.mode == "transcribe": diff --git a/subaligner/llm.py b/subaligner/llm.py index 7d7aecf..3475ac0 100644 --- a/subaligner/llm.py +++ b/subaligner/llm.py @@ -8,6 +8,7 @@ class TranscriptionRecipe(Enum): class TranslationRecipe(Enum): HELSINKI_NLP = "helsinki-nlp" WHISPER = "whisper" + FACEBOOK_MBART = "facebook-mbart" class WhisperFlavour(Enum): @@ -27,3 +28,7 @@ class HelsinkiNLPFlavour(Enum): OPUS_MT = "Helsinki-NLP/opus-mt-{}-{}" OPUS_MT_TC_BIG = "Helsinki-NLP/opus-mt-tc-big-{}-{}" OPUS_TATOEBA = "Helsinki-NLP/opus-tatoeba-{}-{}" + + +class FacebookMbartFlavour(Enum): + LARGE = "large" diff --git a/subaligner/translator.py b/subaligner/translator.py index 723183e..d4b66bc 100644 --- a/subaligner/translator.py +++ b/subaligner/translator.py @@ -2,13 +2,20 @@ import time import whisper from copy import deepcopy -from typing import List, Generator, Optional +from typing import List, Generator, Optional, Tuple from pysrt import SubRipItem from tqdm import tqdm -from transformers import MarianMTModel, MarianTokenizer +from transformers import ( + PreTrainedModel, + PreTrainedTokenizer, + MarianMTModel, + MarianTokenizer, + MBart50TokenizerFast, + MBartForConditionalGeneration, +) from whisper.tokenizer import LANGUAGES from .singleton import Singleton -from .llm import TranslationRecipe, HelsinkiNLPFlavour, WhisperFlavour +from .llm import TranslationRecipe, HelsinkiNLPFlavour, WhisperFlavour, FacebookMbartFlavour from .utils import Utils from .subtitle import Subtitle from .logger import Logger @@ -47,6 +54,16 @@ class Translator(object): "eng-jpn": "eng-jap", "jpn-eng": "jap-eng" } + __MBART_LANGUAGE_CODE_MAPPER = { + "ara": "ar_AR", "ces": "cs_CZ", "deu": "de_DE", "eng": "en_XX", "spa": "es_XX", "est": "et_EE", "fin": "fi_FI", + "fra": "fr_XX", "guj": "gu_IN", "hin": "hi_IN", "ita": "it_IT", "jpn": "ja_XX", "kaz": "kk_KZ", "kor": "ko_KR", + "lit": "lt_LT", "lav": "lv_LV", "mya": "my_MM", "nep": "ne_NP", "nld": "nl_XX", "ron": "ro_RO", "rus": "ru_RU", + "sin": "si_LK", "tur": "tr_TR", "vie": "vi_VN", "zho": "zh_CN", "afr": "af_ZA", "aze": "az_AZ", "ben": "bn_IN", + "fas": "fa_IR", "heb": "he_IL", "hrv": "hr_HR", "ind": "id_ID", "kat": "ka_GE", "khm": "km_KH", "mkd": "mk_MK", + "mal": "ml_IN", "mon": "mn_MN", "mar": "mr_IN", "pol": "pl_PL", "pus": "ps_AF", "por": "pt_XX", "swe": "sv_SE", + "swa": "sw_KE", "tam": "ta_IN", "tel": "te_IN", "tha": "th_TH", "tgl": "tl_XX", "ukr": "uk_UA", "urd": "ur_PK", + "xho": "xh_ZA", "glg": "gl_ES", "slv": "sl_SI" + } def __init__(self, src_language: str, @@ -69,7 +86,10 @@ def __init__(self, if recipe not in [r.value for r in TranslationRecipe]: raise NotImplementedError(f"Unknown recipe: {recipe}") self.__recipe = recipe + self.__src_language = src_language self.__tgt_language = tgt_language + self.__tokenizer: PreTrainedTokenizer = None + self.__lang_model: PreTrainedModel = None self.__initialise_model(src_language, tgt_language, recipe, flavour) @staticmethod @@ -102,28 +122,36 @@ def normalise_pair(src_language: str, tgt_language: str) 
-> List[str]: else: return [src_language, tgt_language] - def translate(self, subs: List[SubRipItem], video_file_path: Optional[str] = None) -> List[SubRipItem]: + def translate(self, + subs: List[SubRipItem], + video_file_path: Optional[str] = None, + language_pair: Optional[Tuple[str, str]] = None) -> List[SubRipItem]: """Translate a list of subtitle cues. Arguments: subs {list} -- A list of SubRipItems. - video_file_path {string} -- The input video file path (default: None).. + + Keyword Arguments: + video_file_path {string} -- The input video file path (default: None). + language_pair {Tuple[str, str]} -- Used for overriding the default language pair (default: None). Returns: {list} -- A list of new SubRipItems holding the translation results. """ if self.__recipe == TranslationRecipe.HELSINKI_NLP.value: + if language_pair is not None: + self.__LOGGER.debug(f"Language pair ignored: {language_pair}") translated_texts = [] - self.lang_model.eval() + self.__lang_model.eval() new_subs = deepcopy(subs) src_texts = [sub.text for sub in new_subs] num_of_batches = math.ceil(len(src_texts) / Translator.__TRANSLATING_BATCH_SIZE) self.__LOGGER.info("Translating %s subtitle cue(s)..." % len(src_texts)) for batch in tqdm(Translator.__batch(src_texts, Translator.__TRANSLATING_BATCH_SIZE), total=num_of_batches): - input_ids = self.tokenizer(batch, return_tensors=Translator.__TENSOR_TYPE, padding=True) - translated = self.lang_model.generate(**input_ids) - translated_texts.extend([self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]) + input_ids = self.__tokenizer(batch, return_tensors=Translator.__TENSOR_TYPE, padding=True) + translated = self.__lang_model.generate(**input_ids) + translated_texts.extend([self.__tokenizer.decode(t, skip_special_tokens=True) for t in translated]) for index in range(len(new_subs)): new_subs[index].text = translated_texts[index] self.__LOGGER.info("Subtitle translated") @@ -135,7 +163,7 @@ def translate(self, subs: List[SubRipItem], video_file_path: Optional[str] = Non raise TranslationException(f'"{self.__tgt_language}" is not supported by {self.__recipe}') audio = whisper.load_audio(video_file_path) self.__LOGGER.debug("Start translating the audio...") - result = self.lang_model.transcribe(audio, task="translate", language=LANGUAGES[lang]) + result = self.__lang_model.transcribe(audio, task="translate", language=LANGUAGES[lang]) self.__LOGGER.info("Finished translating the audio") srt_str = "" for i, segment in enumerate(result["segments"], start=1): @@ -145,6 +173,26 @@ def translate(self, subs: List[SubRipItem], video_file_path: Optional[str] = Non "\n" subtitle = Subtitle.load_subrip_str(srt_str) return subtitle.subs + elif self.__recipe == TranslationRecipe.FACEBOOK_MBART.value: + src_lang, tgt_lang = language_pair if language_pair is not None else (self.__src_language, self.__tgt_language) + self.__tokenizer.src_lang = Translator.__MBART_LANGUAGE_CODE_MAPPER.get(src_lang, None) + lang_code = Translator.__MBART_LANGUAGE_CODE_MAPPER.get(tgt_lang, None) + if src_lang is None or tgt_lang is None: + raise NotImplementedError(f"Language pair of {src_lang} and {src_lang} is not supported") + translated_texts = [] + self.__lang_model.eval() + new_subs = deepcopy(subs) + src_texts = [sub.text for sub in new_subs] + num_of_batches = math.ceil(len(src_texts) / Translator.__TRANSLATING_BATCH_SIZE) + self.__LOGGER.info("Translating %s subtitle cue(s)..." 
% len(src_texts)) + for batch in tqdm(Translator.__batch(src_texts, Translator.__TRANSLATING_BATCH_SIZE), total=num_of_batches): + input_ids = self.__tokenizer(batch, return_tensors=Translator.__TENSOR_TYPE, padding=True) + translated = self.__lang_model.generate(**input_ids, forced_bos_token_id=self.__tokenizer.lang_code_to_id[lang_code]) + translated_texts.extend([self.__tokenizer.decode(t, skip_special_tokens=True) for t in translated]) + for index in range(len(new_subs)): + new_subs[index].text = translated_texts[index] + self.__LOGGER.info("Subtitle translated") + return new_subs else: return [] @@ -170,6 +218,11 @@ def __initialise_model(self, src_lang: str, tgt_lang: str, recipe: str, flavour: self.__download_whisper_model("medium") # works for translation target other than English else: raise NotImplementedError(f"Unknown {recipe} flavour: {flavour}") + elif recipe == TranslationRecipe.FACEBOOK_MBART.value: + if flavour in [f.value for f in FacebookMbartFlavour]: + self.__download_mbart_model(flavour) + else: + raise NotImplementedError(f"Unknown {recipe} flavour: {flavour}") def __download_mt_model(self, src_lang: str, tgt_lang: str, flavour: str) -> bool: try: @@ -199,12 +252,19 @@ def __download_mt_model(self, src_lang: str, tgt_lang: str, flavour: str) -> boo return False def __download_whisper_model(self, flavour: str) -> None: - self.lang_model = whisper.load_model(flavour) + self.__lang_model = whisper.load_model(flavour) + + def __download_mbart_model(self, flavour: str) -> None: + mbart_model_name = f"facebook/mbart-{flavour}-50-many-to-many-mmt" + self.__LOGGER.debug("Trying to download the mBART model %s" % mbart_model_name) + self.__tokenizer = MBart50TokenizerFast.from_pretrained(mbart_model_name) + self.__lang_model = MBartForConditionalGeneration.from_pretrained(mbart_model_name) + self.__LOGGER.debug("mBART model %s downloaded" % mbart_model_name) def __download_by_mt_name(self, mt_model_name: str) -> None: self.__LOGGER.debug("Trying to download the MT model %s" % mt_model_name) - self.tokenizer = MarianTokenizer.from_pretrained(mt_model_name) - self.lang_model = MarianMTModel.from_pretrained(mt_model_name) + self.__tokenizer = MarianTokenizer.from_pretrained(mt_model_name) + self.__lang_model = MarianMTModel.from_pretrained(mt_model_name) self.__LOGGER.debug("MT model %s downloaded" % mt_model_name) def __log_and_back_off(self, mt_model_name: str): diff --git a/tests/subaligner/test_translator.py b/tests/subaligner/test_translator.py index a116f2f..897b935 100644 --- a/tests/subaligner/test_translator.py +++ b/tests/subaligner/test_translator.py @@ -3,6 +3,7 @@ from mock import Mock, patch from parameterized import parameterized from subaligner.subtitle import Subtitle +from subaligner.llm import TranslationRecipe, HelsinkiNLPFlavour, FacebookMbartFlavour from subaligner.translator import Translator as Undertest @@ -13,19 +14,38 @@ def setUp(self): os.path.dirname(os.path.abspath(__file__)), "resource/test.srt" ) - @patch("transformers.MarianMTModel.from_pretrained") @patch("transformers.MarianTokenizer.from_pretrained") - def test_translate(self, tokenizer_from_pretrained, model_from_pretrained): + @patch("transformers.MarianMTModel.from_pretrained") + def test_translate_hel_nlp(self, model_from_pretrained, tokenizer_from_pretrained): + subs = Subtitle.load(self.srt_file_path).subs + mock_tokenizer = Mock() + mock_tokenizer.return_value = {"input_ids": None, "attention_mask": None} + mock_tokenizer.decode.return_value = "translated" + mock_model = Mock() + 
mock_model.generate.return_value = [None] * len(subs) + tokenizer_from_pretrained.return_value = mock_tokenizer + model_from_pretrained.return_value = mock_model + + undertest = Undertest("eng", "zho", recipe=TranslationRecipe.HELSINKI_NLP.value) + translated_subs = undertest.translate(subs) + + self.assertEqual(["translated"] * len(subs), [*map(lambda x: x.text, translated_subs)]) + + @patch("transformers.MBart50TokenizerFast.from_pretrained") + @patch("transformers.MBartForConditionalGeneration.from_pretrained") + def test_translate_fb_mbart(self, model_from_pretrained, tokenizer_from_pretrained): subs = Subtitle.load(self.srt_file_path).subs mock_tokenizer = Mock() mock_tokenizer.return_value = {"input_ids": None, "attention_mask": None} mock_tokenizer.decode.return_value = "translated" + mock_tokenizer.lang_code_to_id = {"zh_CN": 250025} mock_model = Mock() mock_model.generate.return_value = [None] * len(subs) tokenizer_from_pretrained.return_value = mock_tokenizer model_from_pretrained.return_value = mock_model - translated_subs = Undertest("eng", "zho").translate(subs) + undertest = Undertest("eng", "zho", recipe=TranslationRecipe.FACEBOOK_MBART.value, flavour=FacebookMbartFlavour.LARGE.value) + translated_subs = undertest.translate(subs) self.assertEqual(["translated"] * len(subs), [*map(lambda x: x.text, translated_subs)]) From 1228a9c39de9e103fc244cdfffa400cca4bc28dd Mon Sep 17 00:00:00 2001 From: baxtree Date: Wed, 29 Mar 2023 09:25:56 +0100 Subject: [PATCH 12/20] update dependencies --- Pipfile | 4 +- requirements.txt | 3 +- subaligner/translator.py | 64 ++++------------------------- tests/subaligner/test_translator.py | 47 +++++++-------------- 4 files changed, 27 insertions(+), 91 deletions(-) diff --git a/Pipfile b/Pipfile index 791b29b..f84d858 100644 --- a/Pipfile +++ b/Pipfile @@ -34,7 +34,7 @@ click = "==5.1" cloudpickle = "==0.5.3" cycler = "==0.10.0" Cython = "~=0.29.22" -dask = "<2022.1.0" +dask = ">=2021.10.0,<2022.1.0" decorator = "==4.3.0" distributed = "==1.13.0" filelock = "<4.0.0" @@ -48,7 +48,7 @@ html5lib = "==1.0b9" hyperopt = "==0.2.4" idna = "==2.8" isort = "==4.3.4" -joblib = "==0.11" +joblib = ">=1.2.0" Keras-Applications = ">=1.0.8" Keras-Preprocessing = ">=1.0.9" kiwisolver = "==1.0.1" diff --git a/requirements.txt b/requirements.txt index 92c23bb..1818ca4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ click==5.1 cloudpickle~=1.6.0 cycler==0.10.0 Cython~=0.29.22 -dask<2022.1.0 +dask>=2021.10.0,<2022.1.0 decorator==4.3.0 distributed==1.13.0 filelock<4.0.0 @@ -23,6 +23,7 @@ html5lib==1.0b9 hyperopt==0.2.4 idna==2.8 isort==4.3.4 +joblib>=1.2.0 Keras-Applications>=1.0.8 Keras-Preprocessing>=1.0.9 kiwisolver==1.0.1 diff --git a/subaligner/translator.py b/subaligner/translator.py index d4b66bc..2cd977d 100644 --- a/subaligner/translator.py +++ b/subaligner/translator.py @@ -28,29 +28,12 @@ class Translator(object): __TENSOR_TYPE = "pt" __TRANSLATING_BATCH_SIZE = 10 - __LANGUAGE_CODE_MAPPER = { - "bos": "zls", - "cmn": "zho", - "gla": "cel", - "grc": "grk", - "guj": "inc", - "ina": "art", - "jbo": "art", - "kan": "dra", - "kir": "trk", - "lat": "itc", - "lfn": "art", - "mya": "sit", - "nep": "inc", - "ori": "inc", - "sin": "inc", - "srp": "zls", - "tam": "dra", - "tat": "trk", - "tel": "dra", - "yue": "zho" + __HELSINKI_LANGUAGE_CODE_MAPPER = { + "bos": "zls", "cmn": "zho", "gla": "cel", "grc": "grk", "guj": "inc", "ina": "art", "jbo": "art", "kan": "dra", + "kir": "trk", "lat": "itc", "lfn": "art", "mya": "sit", "nep": "inc", "ori": 
"inc", "sin": "inc", "srp": "zls", + "tam": "dra", "tat": "trk", "tel": "dra", "yue": "zho" } - __LANGUAGE_PAIR_MAPPER = { + __HELSINKI_LANGUAGE_PAIR_MAPPER = { "eng-jpn": "eng-jap", "jpn-eng": "jap-eng" } @@ -92,36 +75,6 @@ def __init__(self, self.__lang_model: PreTrainedModel = None self.__initialise_model(src_language, tgt_language, recipe, flavour) - @staticmethod - def normalise_single(language_code: str) -> str: - """Normalise a single language code. - - Arguments: - language_code {string} -- A language code derived from ISO 639-3. - - Returns: - string -- The language code understood by the language model. - """ - - return Translator.__LANGUAGE_CODE_MAPPER[language_code] if language_code in Translator.__LANGUAGE_CODE_MAPPER else language_code - - @staticmethod - def normalise_pair(src_language: str, tgt_language: str) -> List[str]: - """Normalise a pair of language codes. - - Arguments: - src_language {string} -- The source language code derived from ISO 639-3. - tgt_language {string} -- The target language code derived from ISO 639-3. - - Returns: - list -- The language code pair understood by the language model. - """ - - if "{}-{}".format(src_language, tgt_language) in Translator.__LANGUAGE_PAIR_MAPPER: - return Translator.__LANGUAGE_PAIR_MAPPER["{}-{}".format(src_language, tgt_language)].split("-") - else: - return [src_language, tgt_language] - def translate(self, subs: List[SubRipItem], video_file_path: Optional[str] = None, @@ -198,9 +151,10 @@ def translate(self, def __initialise_model(self, src_lang: str, tgt_lang: str, recipe: str, flavour: Optional[str]) -> None: if recipe == TranslationRecipe.HELSINKI_NLP.value: - src_lang = Translator.normalise_single(src_lang) - tgt_lang = Translator.normalise_single(tgt_lang) - src_lang, tgt_lang = Translator.normalise_pair(src_lang, tgt_lang) + src_lang = Translator.__HELSINKI_LANGUAGE_CODE_MAPPER.get(src_lang, src_lang) + tgt_lang = Translator.__HELSINKI_LANGUAGE_CODE_MAPPER.get(tgt_lang, tgt_lang) + lang_pair = "{}-{}".format(src_lang, tgt_lang) + src_lang, tgt_lang = Translator.__HELSINKI_LANGUAGE_PAIR_MAPPER.get(lang_pair, lang_pair).split("-") if self.__download_mt_model(src_lang, tgt_lang, HelsinkiNLPFlavour.OPUS_MT.value): return diff --git a/tests/subaligner/test_translator.py b/tests/subaligner/test_translator.py index 897b935..6aa810c 100644 --- a/tests/subaligner/test_translator.py +++ b/tests/subaligner/test_translator.py @@ -3,7 +3,7 @@ from mock import Mock, patch from parameterized import parameterized from subaligner.subtitle import Subtitle -from subaligner.llm import TranslationRecipe, HelsinkiNLPFlavour, FacebookMbartFlavour +from subaligner.llm import TranslationRecipe, HelsinkiNLPFlavour, WhisperFlavour, FacebookMbartFlavour from subaligner.translator import Translator as Undertest @@ -31,6 +31,19 @@ def test_translate_hel_nlp(self, model_from_pretrained, tokenizer_from_pretraine self.assertEqual(["translated"] * len(subs), [*map(lambda x: x.text, translated_subs)]) + @patch("whisper.load_audio") + @patch("whisper.load_model") + def test_translate_whisper(self, load_model, load_audio): + subs = Subtitle.load(self.srt_file_path).subs + model = Mock() + load_model.return_value = model + model.transcribe.return_value = {"segments": [{"start": 0, "end": 1, "text": "translated"}]} + + undertest = Undertest("eng", "zho", recipe=TranslationRecipe.WHISPER.value, flavour=WhisperFlavour.TINY.value) + translated_subs = undertest.translate(subs, "video_path") + + self.assertEqual(["translated"], [*map(lambda x: x.text, 
translated_subs)]) + @patch("transformers.MBart50TokenizerFast.from_pretrained") @patch("transformers.MBartForConditionalGeneration.from_pretrained") def test_translate_fb_mbart(self, model_from_pretrained, tokenizer_from_pretrained): @@ -49,38 +62,6 @@ def test_translate_fb_mbart(self, model_from_pretrained, tokenizer_from_pretrain self.assertEqual(["translated"] * len(subs), [*map(lambda x: x.text, translated_subs)]) - @parameterized.expand([ - ["bos", "zls"], - ["cmn", "zho"], - ["gla", "cel"], - ["grc", "grk"], - ["guj", "inc"], - ["ina", "art"], - ["jbo", "art"], - ["kan", "dra"], - ["kir", "trk"], - ["lat", "itc"], - ["lfn", "art"], - ["mya", "sit"], - ["nep", "inc"], - ["ori", "inc"], - ["sin", "inc"], - ["srp", "zls"], - ["tam", "dra"], - ["tat", "trk"], - ["tel", "dra"], - ["yue", "zho"], - ]) - def test_normalise_single(self, original, normalised): - self.assertEqual(normalised, Undertest.normalise_single(original)) - - @parameterized.expand([ - ["eng-jpn", "eng-jap"], - ["jpn-eng", "jap-eng"], - ]) - def test_normalise_pair(self, original, normalised): - self.assertEqual(normalised, "-".join(Undertest.normalise_pair(*original.split("-")))) - @patch("transformers.MarianTokenizer.from_pretrained", side_effect=OSError) def test_throw_exception_on_translating_subs(self, mock_tokenizer_from_pretrained): subs = Subtitle.load(self.srt_file_path).subs From 2731fdfddbae6d2498d6b52986e154be8b745440 Mon Sep 17 00:00:00 2001 From: baxtree Date: Fri, 14 Apr 2023 18:06:05 +0100 Subject: [PATCH 13/20] support transcription during batch processing --- Makefile | 2 +- README.md | 10 +- site/source/usage.rst | 8 +- subaligner/subaligner_batch/__main__.py | 148 +++++++++++++++++------- 4 files changed, 125 insertions(+), 43 deletions(-) diff --git a/Makefile b/Makefile index c756287..ff2eedf 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ ifdef PYTHON PYTHON := $(PYTHON) else -PYTHON := 3.7.7 +PYTHON := 3.8.2 endif ifdef PLATFORM diff --git a/README.md b/README.md index 7229920..71d9207 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,6 @@ [![Documentation Status](https://readthedocs.org/projects/subaligner/badge/?version=latest)](https://subaligner.readthedocs.io/en/latest/?badge=latest) [![GitHub license](https://img.shields.io/github/license/baxtree/subaligner)](https://github.com/baxtree/subaligner/blob/master/LICENSE) [![PyPI](https://badge.fury.io/py/subaligner.svg)](https://badge.fury.io/py/subaligner) -[![Docker Build](https://img.shields.io/docker/cloud/build/baxtree/subaligner?label=Docker&style=flat)](https://hub.docker.com/r/baxtree/subaligner/builds) [![Docker Pulls](https://img.shields.io/docker/pulls/baxtree/subaligner)](https://hub.docker.com/r/baxtree/subaligner) [![Citation](https://zenodo.org/badge/228440472.svg)](https://doi.org/10.5281/zenodo.5603083) @@ -126,7 +125,14 @@ $ subaligner --languages $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt $ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt -t src,tgt -$ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt +$ subaligner -m dual -v video.mp4 -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt +$ subaligner -m dual -v video.mp4 -tr facebook-mbart -tf large -o subtitle_aligned.srt -t src,tgt +$ subaligner -m dual -v video.mp4 -tr whisper -tf small -o subtitle_aligned.srt -t src,eng +``` +``` +# Transcribe audiovisual files and generate translated subtitles + +$ subaligner 
-m transcribe -v video.mp4 -ml src -mr whisper -mf small -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt ``` ``` # Shift subtitle manually by offset in seconds diff --git a/site/source/usage.rst b/site/source/usage.rst index b67ab88..67fc637 100644 --- a/site/source/usage.rst +++ b/site/source/usage.rst @@ -49,7 +49,13 @@ Make sure you have got the virtual environment activated upfront. (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt (.venv) $ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt -t src,tgt - (.venv) $ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt + (.venv) $ subaligner -m dual -v video.mp4 -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt + (.venv) $ subaligner -m dual -v video.mp4 -tr facebook-mbart -tf large -o subtitle_aligned.srt -t src,tgt + (.venv) $ subaligner -m dual -v video.mp4 -tr whisper -tf small -o subtitle_aligned.srt -t src,eng + +**Transcribe audiovisual files and generate translated subtitles**:: + + (.venv) $ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt **Shift subtitle manually by offset in seconds**:: diff --git a/subaligner/subaligner_batch/__main__.py b/subaligner/subaligner_batch/__main__.py index 002688e..e79fb56 100755 --- a/subaligner/subaligner_batch/__main__.py +++ b/subaligner/subaligner_batch/__main__.py @@ -1,8 +1,10 @@ #!/usr/bin/env python """ -usage: subaligner_batch [-h] [-m {single,dual}] [-vd VIDEO_DIRECTORY] [-sd SUBTITLE_DIRECTORY] [-l MAX_LOGLOSS] [-so] +usage: subaligner_batch [-h] [-m {single,dual,script,transcribe}] [-sd SUBTITLE_DIRECTORY] [-vd VIDEO_DIRECTORY] [-l MAX_LOGLOSS] [-so] [-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] - [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-od OUTPUT_DIRECTORY] [-t TRANSLATE] [-lgs] [-d] [-q] [-ver] + [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-od OUTPUT_DIRECTORY] [-of {srt,ytt,ttml,txt,smi,xml,ssa,ass,dfxp,sub,scc,tmp,sami,vtt,stl,sbv}] [-t TRANSLATE] + [-ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}] + [-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}] [-lgs] [-d] [-q] [-ver] Batch align multiple subtitle files and audiovisual files @@ -11,13 +13,13 @@ optional arguments: -h, --help show this help message and exit - -vd VIDEO_DIRECTORY, --video_directory VIDEO_DIRECTORY - Path to the video directory -sd SUBTITLE_DIRECTORY, --subtitle_directory SUBTITLE_DIRECTORY Path to the subtitle directory + -vd VIDEO_DIRECTORY, --video_directory VIDEO_DIRECTORY + Path to the video directory -l MAX_LOGLOSS, --max_logloss MAX_LOGLOSS Max global log loss for alignment - -so, --stretch_on Switch on stretch on subtitles + -so, --stretch_on Switch on stretch on subtitles) -sil 
{afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --stretch_in_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho} Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]. NB: This will be ignored if neither -so nor --stretch_on is present @@ -26,15 +28,23 @@ Path to the output directory containing training results -od OUTPUT_DIRECTORY, --output_directory OUTPUT_DIRECTORY Path to the output subtitle directory + -of {srt,ytt,ttml,txt,smi,xml,ssa,ass,dfxp,sub,scc,tmp,sami,vtt,stl,sbv}, --output_format {srt,ytt,ttml,txt,smi,xml,ssa,ass,dfxp,sub,scc,tmp,sami,vtt,stl,sbv} + File format of the output subtitles -t TRANSLATE, --translate TRANSLATE Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho) + -ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --main_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho} + Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes] + -mr {whisper}, --transcription_recipe {whisper} + LLM recipe used for transcribing video files + -mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}, --transcription_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large} + Flavour variation for a specific LLM recipe supporting transcription -lgs, --languages Print out language codes used for stretch and translation -d, --debug Print out debugging information -q, --quiet Switch off logging information -ver, --version show program's version number and exit required arguments: - -m {single,dual}, --mode {single,dual} + -m {single,dual,script,transcribe}, --mode {single,dual,script,transcribe} Alignment mode: either single or dual """ @@ -43,6 +53,7 @@ import traceback import os import pkg_resources +import tempfile def main(): @@ -65,22 +76,22 @@ def main(): "--mode", type=str, default="", - choices=["single", "dual"], + choices=["single", "dual", "script", "transcribe"], help="Alignment mode: either single or dual", ) parser.add_argument( - "-vd", - "--video_directory", + "-sd", + "--subtitle_directory", type=str, default="", - help="Path to the video directory", + help="Path to the subtitle directory", ) parser.add_argument( - "-sd", - "--subtitle_directory", + "-vd", + 
"--video_directory", type=str, default="", - help="Path to the subtitle directory", + help="Path to the video directory", ) parser.add_argument( "-l", @@ -139,6 +150,31 @@ def main(): type=str, help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)", ) + parser.add_argument( + "-ml", + "--main_language", + type=str.lower, + choices=Utils.get_stretch_language_codes(), + help="Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]", + ) + from subaligner.llm import TranscriptionRecipe + from subaligner.llm import WhisperFlavour + parser.add_argument( + "-mr", + "--transcription_recipe", + type=str.lower, + default=TranscriptionRecipe.WHISPER.value, + choices=[r.value for r in TranscriptionRecipe], + help="LLM recipe used for transcribing video files" + ) + parser.add_argument( + "-mf", + "--transcription_flavour", + type=str.lower, + default=WhisperFlavour.SMALL.value, + choices=[wf.value for wf in WhisperFlavour], + help="Flavour variation for a specific LLM recipe supporting transcription" + ) parser.add_argument("-lgs", "--languages", action="store_true", help="Print out language codes used for stretch and translation") parser.add_argument("-d", "--debug", action="store_true", @@ -159,7 +195,7 @@ def main(): print("ERROR: --video_directory was not passed in") parser.print_usage() sys.exit(21) - if FLAGS.subtitle_directory == "": + if FLAGS.mode != "transcribe" and FLAGS.subtitle_directory == "": print("ERROR: --subtitle_directory was not passed in") parser.print_usage() sys.exit(21) @@ -167,28 +203,40 @@ def main(): print("ERROR: --output_directory was not passed in") parser.print_usage() sys.exit(21) - if os.path.abspath(FLAGS.subtitle_directory) == os.path.abspath(FLAGS.output_directory): + if FLAGS.mode != "transcribe" and os.path.abspath(FLAGS.subtitle_directory) == os.path.abspath(FLAGS.output_directory): print("ERROR: The output directory cannot be set to the same as the input subtitle directory") parser.print_usage() sys.exit(21) - if FLAGS.translate is not None: + if FLAGS.translate is not None or FLAGS.mode == "transcribe": if "transformers" not in {pkg.key for pkg in pkg_resources.working_set}: - print('ERROR: Alignment has been configured to perform translation. Please install "subaligner[llm]" and run your command again.') + print('ERROR: Alignment has been configured to use language models. Please install "subaligner[llm]" and run your command again.') + sys.exit(21) + if FLAGS.stretch_on or FLAGS.mode == "script": + if "aeneas" not in {pkg.key for pkg in pkg_resources.working_set}: + print('ERROR: Alignment has been configured to use extra features. 
Please install "subaligner[stretch]" and run your command again.') + sys.exit(21) + if FLAGS.mode == "transcribe": + if FLAGS.main_language is None: + print("ERROR: --main_language was not passed in but required by mode 'transcribe'") + parser.print_usage() sys.exit(21) video_file_paths = [os.path.abspath(os.path.join(path, p)) for path, _, files in os.walk(FLAGS.video_directory) for p in files if not p.startswith(".")] - subtitle_file_paths = [os.path.abspath(os.path.join(path, p)) for path, _, files in - os.walk(FLAGS.subtitle_directory) for p in files if not p.startswith(".")] - if len(video_file_paths) != len(subtitle_file_paths): - print("ERROR: The numbers of input videos and subtitles do not match") - parser.print_usage() - sys.exit(21) + + if FLAGS.mode != "transcribe": + subtitle_file_paths = [os.path.abspath(os.path.join(path, p)) for path, _, files in + os.walk(FLAGS.subtitle_directory) for p in files if not p.startswith(".")] + if len(video_file_paths) != len(subtitle_file_paths): + print("ERROR: The numbers of input videos and subtitles do not match") + parser.print_usage() + sys.exit(21) output_dir = os.path.abspath(FLAGS.output_directory) os.makedirs(output_dir, exist_ok=True) video_file_paths = sorted(video_file_paths, key=lambda x: os.path.splitext(os.path.basename(x))[0]) - subtitle_file_paths = sorted(subtitle_file_paths, key=lambda x: os.path.splitext(os.path.basename(x))[0]) + if FLAGS.mode != "transcribe": + subtitle_file_paths = sorted(subtitle_file_paths, key=lambda x: os.path.splitext(os.path.basename(x))[0]) exit_segfail = FLAGS.exit_segfail stretch = FLAGS.stretch_on stretch_in_lang = FLAGS.stretch_in_language @@ -205,15 +253,16 @@ def main(): failures = [] for index in range(len(video_file_paths)): local_video_path = video_file_paths[index] - local_subtitle_path = subtitle_file_paths[index] + local_subtitle_path = subtitle_file_paths[index] if FLAGS.mode != "transcribe" else "{}.srt".format(tempfile.mkstemp()[1]) try: + voice_probabilities = None if FLAGS.mode == "single": aligned_subs, audio_file_path, voice_probabilities, frame_rate = predictor.predict_single_pass( video_file_path=local_video_path, subtitle_file_path=local_subtitle_path, weights_dir=os.path.join(FLAGS.training_output_directory, "models", "training", "weights") ) - else: + elif FLAGS.mode == "dual": aligned_subs, subs, voice_probabilities, frame_rate = predictor.predict_dual_pass( video_file_path=local_video_path, subtitle_file_path=local_subtitle_path, @@ -222,12 +271,30 @@ def main(): stretch_in_lang=stretch_in_lang, exit_segfail=exit_segfail, ) + elif FLAGS.mode == "script": + aligned_subs, _, voice_probabilities, frame_rate = predictor.predict_plain_text( + video_file_path=local_video_path, + subtitle_file_path=local_subtitle_path, + stretch_in_lang=stretch_in_lang, + ) + elif FLAGS.mode == "transcribe": + from subaligner.transcriber import Transcriber + transcriber = Transcriber(recipe=FLAGS.transcription_recipe, flavour=FLAGS.transcription_flavour) + subtitle, frame_rate = transcriber.transcribe(local_video_path, stretch_in_lang) + aligned_subs = subtitle.subs - parent_dir = os.path.dirname(local_subtitle_path.replace(os.path.abspath(FLAGS.subtitle_directory), output_dir)) - os.makedirs(parent_dir, exist_ok=True) - file_parts = os.path.basename(local_subtitle_path).rsplit(".", 1) - file_parts[1] = FLAGS.output_format if FLAGS.output_format != "" else file_parts[1] - aligned_subtitle_path = os.path.abspath(os.path.join(parent_dir, ".".join(file_parts).replace(".stl", ".srt"))) + if 
FLAGS.mode == "transcribe": + parent_dir = os.path.dirname(video_file_paths[index].replace(os.path.abspath(FLAGS.video_directory), output_dir)) + os.makedirs(parent_dir, exist_ok=True) + file_parts = os.path.basename(video_file_paths[index]).rsplit(".", 1) + file_parts[1] = FLAGS.output_format if FLAGS.output_format != "" else "srt" + aligned_subtitle_path = os.path.abspath(os.path.join(parent_dir, ".".join(file_parts).replace(".stl", ".srt"))) + else: + parent_dir = os.path.dirname(local_subtitle_path.replace(os.path.abspath(FLAGS.subtitle_directory), output_dir)) + os.makedirs(parent_dir, exist_ok=True) + file_parts = os.path.basename(local_subtitle_path).rsplit(".", 1) + file_parts[1] = FLAGS.output_format if FLAGS.output_format != "" else file_parts[1] + aligned_subtitle_path = os.path.abspath(os.path.join(parent_dir, ".".join(file_parts).replace(".stl", ".srt"))) if FLAGS.translate is not None: from subaligner.translator import Translator @@ -235,16 +302,19 @@ def main(): translator = Translator(source, target) aligned_subs = translator.translate(aligned_subs) Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate, "utf-8") - else: + elif FLAGS.mode == "transcribe": Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate, "utf-8") + else: + Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate) - log_loss = predictor.get_log_loss(voice_probabilities, aligned_subs) - if log_loss is None or log_loss > FLAGS.max_logloss: - print( - "ERROR: Alignment failed with a too high loss value: {} for {} and {}".format(log_loss, local_video_path, local_subtitle_path) - ) - failures.append((local_video_path, local_subtitle_path)) - continue + if voice_probabilities is not None: + log_loss = predictor.get_log_loss(voice_probabilities, aligned_subs) + if log_loss is None or log_loss > FLAGS.max_logloss: + print( + "ERROR: Alignment failed with a too high loss value: {} for {} and {}".format(log_loss, local_video_path, local_subtitle_path) + ) + failures.append((local_video_path, local_subtitle_path)) + continue print("Aligned subtitle saved to: {}".format(aligned_subtitle_path)) except UnsupportedFormatException as e: From 5c71ccf9598a1d860387b586cdac1c60057f7f55 Mon Sep 17 00:00:00 2001 From: baxtree Date: Fri, 9 Jun 2023 09:52:19 +0100 Subject: [PATCH 14/20] reconcile dependencies and update project metadata --- Pipfile | 2 +- requirements-dev.txt | 4 ++-- requirements-llm.txt | 1 - requirements.txt | 6 +++--- setup.py | 8 ++++---- subaligner/_version.py | 2 +- subaligner/trainer.py | 6 +++--- subaligner/transcriber.py | 2 +- subaligner/translator.py | 10 +++++----- tests/subaligner/test_translator.py | 17 ++++++++++++++++- 10 files changed, 36 insertions(+), 22 deletions(-) diff --git a/Pipfile b/Pipfile index f84d858..5e4dffc 100644 --- a/Pipfile +++ b/Pipfile @@ -68,6 +68,7 @@ psutil = "==5.6.7" py = "==1.10.0" pyasn1 = "==0.4.8" pyasn1-modules = "==0.2.7" +pycountry = "~=20.7.3" pydot = "==1.2.4" pydot-ng = "==1.0.0" pydotplus = "==2.0.2" @@ -100,7 +101,6 @@ typing-extensions = "~=3.7.0" urllib3 = "~=1.26.5" Werkzeug = ">=0.15.3" zict = "==0.1.3" -zipp = "==0.6.0" aeneas = "==1.7.3.0" [requires] diff --git a/requirements-dev.txt b/requirements-dev.txt index d6b5f10..4007188 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,13 +2,13 @@ mock==4.0.3 coverage==5.5 tox~=3.23.0 pycodestyle==2.5.0 -twine>=3.1.1 +twine<4.0.0 
snakeviz==2.1.0 line-profiler==3.1.0 scikit-build==0.11.1 radish-bdd~=0.13.3 pex<=2.1.80 -mypy==0.931 +mypy==1.3.0 types-requests==2.27.9 types-setuptools==57.4.9 typing-extensions<4.0.0 diff --git a/requirements-llm.txt b/requirements-llm.txt index fbe39c8..ec609ca 100644 --- a/requirements-llm.txt +++ b/requirements-llm.txt @@ -1,4 +1,3 @@ -pycountry~=20.7.3 sentencepiece~=0.1.95 torch<1.13.0 transformers<4.27.0 diff --git a/requirements.txt b/requirements.txt index 1818ca4..489686f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -43,6 +43,7 @@ psutil==5.6.7 py==1.10.0 pyasn1==0.4.8 pyasn1-modules==0.2.7 +pycountry~=20.7.3 pydot==1.2.4 pydot-ng==1.0.0 pydotplus==2.0.2 @@ -52,7 +53,7 @@ pysubs2<=1.4.2 pystack-debugger==0.8.0 pytz==2018.4 PyYAML>=4.2b1 -requests~=2.25.1 +requests<3.0.0 requests-oauthlib==1.3.0 rsa==4.7 scipy<=1.8.1 @@ -60,7 +61,7 @@ scikit-learn<1.2.0 setuptools>=41.0.0 six~=1.15.0 tblib==1.3.2 -tensorflow>=1.15.5,<2.12 +tensorflow>=1.15.5,<2.13 termcolor==1.1.0 toml==0.10.0 toolz==0.9.0 @@ -68,4 +69,3 @@ tornado==5.1.0 urllib3~=1.26.5 Werkzeug>=0.15.3 zict==0.1.3 -zipp==0.6.0 diff --git a/setup.py b/setup.py index 909ee15..1220641 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ "dev": dev_requirements + stretch_requirements + llm_requirements + docs_requirements, "docs": docs_requirements, "stretch": stretch_requirements, - "translation": llm_requirements, # for backward compatibility and will be deprecated with "llm" + "translation": llm_requirements, # for backward compatibility and now deprecated with "llm" "llm": llm_requirements, } @@ -41,17 +41,17 @@ author_email="xi.bai.ed@gmail.com", classifiers=[ "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Intended Audience :: Developers", ], license="MIT", url="https://subaligner.readthedocs.io/en/latest/", - description="Automatically synchronize and translate subtitles with pretrained deep neural networks, forced alignments and transformers.", + description="Automatically synchronize and translate subtitles, or create new ones by transcribing, using pre-trained DNNs, Forced Alignments and Transformers.", long_description=readme + "\n\n", long_description_content_type='text/markdown', - python_requires=">=3.6", + python_requires=">=3.8", package_dir={"subaligner": "subaligner"}, packages=[ "subaligner", diff --git a/subaligner/_version.py b/subaligner/_version.py index 7945cf2..eba52d9 100644 --- a/subaligner/_version.py +++ b/subaligner/_version.py @@ -1,2 +1,2 @@ """The semver for the current release.""" -__version__ = "0.3.1" +__version__ = "0.3.2" diff --git a/subaligner/trainer.py b/subaligner/trainer.py index eb0c25e..815439b 100644 --- a/subaligner/trainer.py +++ b/subaligner/trainer.py @@ -8,7 +8,7 @@ import numpy as np import multiprocessing as mp -from typing import List, Tuple, Optional +from typing import List, Tuple, Optional, Union from .network import Network from .media_helper import MediaHelper from .hyperparameters import Hyperparameters @@ -330,8 +330,8 @@ def __extract_in_multithreads( index: int, av_file_path: str, subtitle_file_path: str, - train_data: np.ndarray, - labels: np.ndarray, + train_data: Union[np.ndarray, List], + labels: Union[np.ndarray, List], sound_effect_start_marker: Optional[str], sound_effect_end_marker: Optional[str] ) -> Tuple[str, str]: diff --git a/subaligner/transcriber.py b/subaligner/transcriber.py 
index 0a0481a..3122ff4 100644
--- a/subaligner/transcriber.py
+++ b/subaligner/transcriber.py
@@ -55,7 +55,7 @@ def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle
         try:
             audio = whisper.load_audio(audio_file_path)
             self.__LOGGER.debug("Start transcribing the audio...")
-            result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang])
+            result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang], logprob_threshold=-1.2, no_speech_threshold=0.16)
             self.__LOGGER.info("Finished transcribing the audio")
             srt_str = ""
             for i, segment in enumerate(result["segments"], start=1):
diff --git a/subaligner/translator.py b/subaligner/translator.py
index 2cd977d..89462c4 100644
--- a/subaligner/translator.py
+++ b/subaligner/translator.py
@@ -112,11 +112,11 @@ def translate(self,
         elif self.__recipe == TranslationRecipe.WHISPER.value:
             assert video_file_path is not None
             lang = Utils.get_iso_639_alpha_2(self.__tgt_language)
-            if lang not in LANGUAGES:
-                raise TranslationException(f'"{self.__tgt_language}" is not supported by {self.__recipe}')
+            if lang not in LANGUAGES or lang != "en":
+                raise TranslationException(f'"{self.__tgt_language}" is not supported as a translation target by {self.__recipe}')
             audio = whisper.load_audio(video_file_path)
             self.__LOGGER.debug("Start translating the audio...")
-            result = self.__lang_model.transcribe(audio, task="translate", language=LANGUAGES[lang])
+            result = self.__lang_model.transcribe(audio, task="translate")
             self.__LOGGER.info("Finished translating the audio")
             srt_str = ""
             for i, segment in enumerate(result["segments"], start=1):
@@ -131,7 +131,7 @@ def translate(self,
             self.__tokenizer.src_lang = Translator.__MBART_LANGUAGE_CODE_MAPPER.get(src_lang, None)
             lang_code = Translator.__MBART_LANGUAGE_CODE_MAPPER.get(tgt_lang, None)
             if src_lang is None or tgt_lang is None:
-                raise NotImplementedError(f"Language pair of {src_lang} and {src_lang} is not supported")
+                raise NotImplementedError(f"Language pair of {src_lang} and {tgt_lang} is not supported by {self.__recipe}")
             translated_texts = []
             self.__lang_model.eval()
             new_subs = deepcopy(subs)
@@ -163,7 +163,7 @@ def __initialise_model(self, src_lang: str, tgt_lang: str, recipe: str, flavour:
             elif self.__download_mt_model(src_lang, tgt_lang, HelsinkiNLPFlavour.OPUS_MT_TC_BIG.value):
                 return
             else:
-                message = 'Cannot find the MT model for source language "{}" and destination language "{}"'.format(src_lang, tgt_lang)
+                message = f'Cannot find the {recipe} MT model for source language "{src_lang}" and destination language "{tgt_lang}"'
                 self.__LOGGER.error(message)
                 raise NotImplementedError(message)
         elif recipe == TranslationRecipe.WHISPER.value:
diff --git a/tests/subaligner/test_translator.py b/tests/subaligner/test_translator.py
index 6aa810c..540323d 100644
--- a/tests/subaligner/test_translator.py
+++ b/tests/subaligner/test_translator.py
@@ -4,6 +4,7 @@
 from parameterized import parameterized
 from subaligner.subtitle import Subtitle
 from subaligner.llm import TranslationRecipe, HelsinkiNLPFlavour, WhisperFlavour, FacebookMbartFlavour
+from subaligner.exception import TranslationException
 from subaligner.translator import Translator as Undertest
 
 
@@ -39,7 +40,7 @@ def test_translate_whisper(self, load_model, load_audio):
         load_model.return_value = model
         model.transcribe.return_value = {"segments": [{"start": 0, "end": 1, "text": "translated"}]}
 
-        undertest = Undertest("eng", "zho", recipe=TranslationRecipe.WHISPER.value, 
flavour=WhisperFlavour.TINY.value) + undertest = Undertest("eng", "eng", recipe=TranslationRecipe.WHISPER.value, flavour=WhisperFlavour.TINY.value) translated_subs = undertest.translate(subs, "video_path") self.assertEqual(["translated"], [*map(lambda x: x.text, translated_subs)]) @@ -72,3 +73,17 @@ def test_throw_exception_on_translating_subs(self, mock_tokenizer_from_pretraine self.assertTrue(isinstance(e, NotImplementedError)) else: self.fail("Should have thrown exception") + + @patch("whisper.load_model") + def test_throw_exception_on_unsupported_whisper_translation_target(self, load_model): + subs = Subtitle.load(self.srt_file_path).subs + model = Mock() + load_model.return_value = model + model.transcribe.return_value = {"segments": [{"start": 0, "end": 1, "text": "translated"}]} + + try: + Undertest("eng", "unk", recipe=TranslationRecipe.WHISPER.value, flavour=WhisperFlavour.TINY.value).translate(subs, "video_path") + except Exception as e: + self.assertTrue(isinstance(e, TranslationException)) + else: + self.fail("Should have thrown exception") From 5e4c3f31c8543f6f0e814e968b4858e7586c6b2a Mon Sep 17 00:00:00 2001 From: baxtree Date: Fri, 9 Jun 2023 18:50:52 +0100 Subject: [PATCH 15/20] update docker base images --- .github/workflows/dockerhub.yml | 32 +++++++++---------- Makefile | 9 +++++- ...ockerfile-Debian10 => Dockerfile-Debian11} | 2 +- ...ockerfile-Fedora31 => Dockerfile-Fedora37} | 4 ++- docker/docker-compose.yml | 8 ++--- subaligner/_version.py | 2 +- subaligner/transcriber.py | 2 +- 7 files changed, 34 insertions(+), 25 deletions(-) rename docker/{Dockerfile-Debian10 => Dockerfile-Debian11} (94%) rename docker/{Dockerfile-Fedora31 => Dockerfile-Fedora37} (89%) diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml index c4c6ccd..93792dd 100644 --- a/.github/workflows/dockerhub.yml +++ b/.github/workflows/dockerhub.yml @@ -60,30 +60,30 @@ jobs: tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.u20 push: true - - name: Build and push the Fedora 31 image - id: docker_build_fed31 - uses: docker/build-push-action@v2 - with: - context: ./docker - file: "./docker/Dockerfile-Fedora31" - build-args: | - "RELEASE_VERSION=${{ steps.tag.outputs.TAG }}" - allow: network.host - github-token: ${{ github.token }} - tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.fed31 - push: true +# - name: Build and push the Fedora 37 image +# id: docker_build_fed37 +# uses: docker/build-push-action@v2 +# with: +# context: ./docker +# file: "./docker/Dockerfile-Fedora37" +# build-args: | +# "RELEASE_VERSION=${{ steps.tag.outputs.TAG }}" +# allow: network.host +# github-token: ${{ github.token }} +# tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.fed37 +# push: true - - name: Build and push the Debian 10 image - id: docker_build_deb10 + - name: Build and push the Debian 11 image + id: docker_build_deb11 uses: docker/build-push-action@v2 with: context: ./docker - file: "./docker/Dockerfile-Debian10" + file: "./docker/Dockerfile-Debian11" build-args: | "RELEASE_VERSION=${{ steps.tag.outputs.TAG }}" allow: network.host github-token: ${{ github.token }} - tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.deb10 + tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.deb11 push: true - name: Build and push the ArchLinux image diff --git a/Makefile b/Makefile index ff2eedf..1d3e7d1 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ endef export BROWSER_PYSCRIPT BROWSER := python -c "$$BROWSER_PYSCRIPT" -.PHONY: install uninstall build-gzip build-rpm test 
test-all pydoc coverage manual dist release clean clean-dist clean-doc clean-manual clean-build clean-pyc clean-test clean-rpm +.PHONY: install uninstall build-gzip build-rpm test test-all docker-build pydoc coverage manual dist release clean clean-dist clean-doc clean-manual clean-build clean-pyc clean-test clean-rpm install: if [ ! -e ".$(PYTHON)" ]; then ~/.pyenv/versions/$(PYTHON)/bin/python3 -m venv .$(PYTHON); fi @@ -180,6 +180,13 @@ app: clean-wheels STRETCH_OFF=True .$(PYTHON)/bin/python setup.py bdist_wheel -d ./wheels; \ .$(PYTHON)/bin/pex subaligner==$(SUBALIGNER_VERSION) --repo=./wheels --platform $(PLATFORM) --no-pypi --no-build --python-shebang="/usr/bin/env python3" -e subaligner -o subaligner-$(PLATFORM).app; \ +docker-build: + docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-Ubuntu20 . + docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-ArchLinux . + docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-CentOS7 . + docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-Debian11 . + docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-Fedora37 . + docker-images: SUBALIGNER_VERSION=$(SUBALIGNER_VERSION) docker-compose -f ./docker/docker-compose.yml build diff --git a/docker/Dockerfile-Debian10 b/docker/Dockerfile-Debian11 similarity index 94% rename from docker/Dockerfile-Debian10 rename to docker/Dockerfile-Debian11 index d3fe362..1d91f42 100644 --- a/docker/Dockerfile-Debian10 +++ b/docker/Dockerfile-Debian11 @@ -1,5 +1,5 @@ # Subaligner Debian docker image -FROM debian:stable-20211011-slim +FROM debian:stable-20230202-slim ARG RELEASE_VERSION diff --git a/docker/Dockerfile-Fedora31 b/docker/Dockerfile-Fedora37 similarity index 89% rename from docker/Dockerfile-Fedora31 rename to docker/Dockerfile-Fedora37 index 3e2323f..e4fb24e 100644 --- a/docker/Dockerfile-Fedora31 +++ b/docker/Dockerfile-Fedora37 @@ -1,5 +1,5 @@ # Subaligner Fedora Docker Image -From fedora:31 +From fedora:37 ARG RELEASE_VERSION @@ -15,6 +15,8 @@ RUN ["/bin/bash", "-c", "dnf install -y dnf-utils &&\ dnf install -y espeak-ng &&\ ln -s /usr/lib64/libespeak-ng.so.1 /usr/lib64/libespeak.so &&\ dnf install -y libsndfile-devel &&\ + dnf install -y blas lapack blas-devel lapack-devel &&\ + dnf install -y gcc-c++ &&\ dnf install -y python3 &&\ dnf install -y gcc &&\ dnf install -y python3-wheel &&\ diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 424665b..82f0474 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -28,11 +28,11 @@ services: subaligner-debian10: build: context: ./ - dockerfile: Dockerfile-Debian10 - image: baxtree/subaligner:${SUBALIGNER_VERSION}.deb10 + dockerfile: Dockerfile-Debian11 + image: baxtree/subaligner:${SUBALIGNER_VERSION}.deb11 subaligner-fedora31: build: context: ./ - dockerfile: Dockerfile-Fedora31 - image: baxtree/subaligner:${SUBALIGNER_VERSION}.fed31 + dockerfile: Dockerfile-Fedora37 + image: baxtree/subaligner:${SUBALIGNER_VERSION}.fed37 diff --git a/subaligner/_version.py b/subaligner/_version.py index eba52d9..c4206c9 100644 --- a/subaligner/_version.py +++ b/subaligner/_version.py @@ -1,2 +1,2 @@ """The semver for the current release.""" -__version__ = "0.3.2" +__version__ = "0.3.3" diff --git a/subaligner/transcriber.py b/subaligner/transcriber.py index 3122ff4..0a0481a 100644 --- a/subaligner/transcriber.py +++ b/subaligner/transcriber.py @@ -55,7 +55,7 @@ def 
transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle try: audio = whisper.load_audio(audio_file_path) self.__LOGGER.debug("Start transcribing the audio...") - result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang], logprob_threshold=-1.2, no_speech_threshold=0.16) + result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang]) self.__LOGGER.info("Finished transcribing the audio") srt_str = "" for i, segment in enumerate(result["segments"], start=1): From 8d941ddadfe97cc71bb22151802d3f7fd2bb93e8 Mon Sep 17 00:00:00 2001 From: baxtree Date: Mon, 12 Jun 2023 09:27:27 +0100 Subject: [PATCH 16/20] add the ubuntu 22 image --- .github/workflows/dockerhub.yml | 37 +++++++++++++------ Makefile | 3 +- ...ockerfile-Fedora37 => Dockerfile-Fedora34} | 2 +- docker/Dockerfile-Ubuntu22 | 19 ++++++++++ docker/docker-compose.yml | 14 +++++-- 5 files changed, 57 insertions(+), 18 deletions(-) rename docker/{Dockerfile-Fedora37 => Dockerfile-Fedora34} (98%) create mode 100644 docker/Dockerfile-Ubuntu22 diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml index 93792dd..6283144 100644 --- a/.github/workflows/dockerhub.yml +++ b/.github/workflows/dockerhub.yml @@ -60,18 +60,31 @@ jobs: tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.u20 push: true -# - name: Build and push the Fedora 37 image -# id: docker_build_fed37 -# uses: docker/build-push-action@v2 -# with: -# context: ./docker -# file: "./docker/Dockerfile-Fedora37" -# build-args: | -# "RELEASE_VERSION=${{ steps.tag.outputs.TAG }}" -# allow: network.host -# github-token: ${{ github.token }} -# tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.fed37 -# push: true + - name: Build and push the Ubuntu 22 image + id: docker_build_u22 + uses: docker/build-push-action@v2 + with: + context: ./docker + file: "./docker/Dockerfile-Ubuntu22" + build-args: | + "RELEASE_VERSION=${{ steps.tag.outputs.TAG }}" + allow: network.host + github-token: ${{ github.token }} + tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.u22 + push: true + + - name: Build and push the Fedora 34 image + id: docker_build_fed34 + uses: docker/build-push-action@v2 + with: + context: ./docker + file: "./docker/Dockerfile-Fedora34" + build-args: | + "RELEASE_VERSION=${{ steps.tag.outputs.TAG }}" + allow: network.host + github-token: ${{ github.token }} + tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.fed34 + push: true - name: Build and push the Debian 11 image id: docker_build_deb11 diff --git a/Makefile b/Makefile index 1d3e7d1..37329ff 100644 --- a/Makefile +++ b/Makefile @@ -182,10 +182,11 @@ app: clean-wheels docker-build: docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-Ubuntu20 . + docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-Ubuntu22 . docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-ArchLinux . docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-CentOS7 . docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-Debian11 . - docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-Fedora37 . + docker build --build-arg RELEASE_VERSION=$(SUBALIGNER_VERSION) -f docker/Dockerfile-Fedora34 . 
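The earlier transcriber revert above drops the hard-coded `logprob_threshold=-1.2` and `no_speech_threshold=0.16`, so transcription falls back to Whisper's stock decoding defaults. As a rough sketch of how those options are normally passed to openai-whisper (the model size and audio path below are placeholders, not values used by subaligner):

    import whisper

    model = whisper.load_model("tiny")
    audio = whisper.load_audio("sample.wav")  # placeholder path
    result = model.transcribe(
        audio,
        task="transcribe",
        language="en",
        logprob_threshold=-1.0,   # default: a lower average log-probability marks the decode as failed
        no_speech_threshold=0.6,  # default: a higher no-speech probability marks the segment as silent
    )
    print(result["text"])
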
docker-images: SUBALIGNER_VERSION=$(SUBALIGNER_VERSION) docker-compose -f ./docker/docker-compose.yml build diff --git a/docker/Dockerfile-Fedora37 b/docker/Dockerfile-Fedora34 similarity index 98% rename from docker/Dockerfile-Fedora37 rename to docker/Dockerfile-Fedora34 index e4fb24e..952058a 100644 --- a/docker/Dockerfile-Fedora37 +++ b/docker/Dockerfile-Fedora34 @@ -1,5 +1,5 @@ # Subaligner Fedora Docker Image -From fedora:37 +From fedora:34 ARG RELEASE_VERSION diff --git a/docker/Dockerfile-Ubuntu22 b/docker/Dockerfile-Ubuntu22 new file mode 100644 index 0000000..23974e5 --- /dev/null +++ b/docker/Dockerfile-Ubuntu22 @@ -0,0 +1,19 @@ +# Subaligner Ubuntu 22 Docker Image +FROM ubuntu:22.04 + +ARG RELEASE_VERSION + +ENV RELEASE_VERSION=${RELEASE_VERSION} +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Europe/London + +RUN ["/bin/bash", "-c", "apt-get -y update &&\ + apt-get -y install ffmpeg &&\ + apt-get -y install espeak libespeak1 libespeak-dev espeak-data &&\ + apt-get -y install libsndfile-dev &&\ + apt-get -y install python3-dev &&\ + apt-get -y install python3-tk &&\ + apt-get -y install python3-pip &&\ + python3 -m pip install --upgrade pip &&\ + python3 -m pip install \"subaligner==${RELEASE_VERSION}\" &&\ + python3 -m pip install \"subaligner[harmony]==${RELEASE_VERSION}\""] diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 82f0474..c68b25b 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -19,20 +19,26 @@ services: dockerfile: Dockerfile-Ubuntu20 image: baxtree/subaligner:${SUBALIGNER_VERSION}.u20 + subaligner-ubuntu22: + build: + context: ./ + dockerfile: Dockerfile-Ubuntu22 + image: baxtree/subaligner:${SUBALIGNER_VERSION}.u22 + subaligner-archlinux: build: context: ./ dockerfile: Dockerfile-ArchLinux image: baxtree/subaligner:${SUBALIGNER_VERSION}.arch - subaligner-debian10: + subaligner-debian11: build: context: ./ dockerfile: Dockerfile-Debian11 image: baxtree/subaligner:${SUBALIGNER_VERSION}.deb11 - subaligner-fedora31: + subaligner-fedora34: build: context: ./ - dockerfile: Dockerfile-Fedora37 - image: baxtree/subaligner:${SUBALIGNER_VERSION}.fed37 + dockerfile: Dockerfile-Fedora34 + image: baxtree/subaligner:${SUBALIGNER_VERSION}.fed34 From 5194dab978d327377a16dfeae18310c4f83ac8c0 Mon Sep 17 00:00:00 2001 From: baxtree Date: Mon, 19 Jun 2023 09:29:52 +0100 Subject: [PATCH 17/20] enable the progress bar during transcription --- subaligner/transcriber.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/subaligner/transcriber.py b/subaligner/transcriber.py index 0a0481a..658e823 100644 --- a/subaligner/transcriber.py +++ b/subaligner/transcriber.py @@ -54,8 +54,9 @@ def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle audio_file_path = self.__media_helper.extract_audio(video_file_path, True, 16000) try: audio = whisper.load_audio(audio_file_path) - self.__LOGGER.debug("Start transcribing the audio...") - result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang]) + self.__LOGGER.info("Start transcribing the audio...") + verbose = False if Logger.VERBOSE and not Logger.QUIET else None + result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang], verbose=verbose) self.__LOGGER.info("Finished transcribing the audio") srt_str = "" for i, segment in enumerate(result["segments"], start=1): From c0ad422edc21932abe885c84e7e82d0c238f5210 Mon Sep 17 00:00:00 2001 From: baxtree Date: Fri, 7 Jul 2023 18:32:30 +0100 Subject: [PATCH 18/20] 
upgrade whisper and add aarch64 requirements --- Makefile | 10 ++++-- Pipfile | 6 +--- requirements-aarch64.txt | 71 ++++++++++++++++++++++++++++++++++++++++ requirements-llm.txt | 2 +- requirements.txt | 3 -- setup.py | 12 ++++++- subaligner/_version.py | 2 +- 7 files changed, 93 insertions(+), 13 deletions(-) create mode 100644 requirements-aarch64.txt diff --git a/Makefile b/Makefile index 37329ff..8c6a7a6 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,13 @@ endif ifdef PLATFORM PLATFORM := $(PLATFORM) else -PLATFORM := linux-x86_64-cp-37-cp37 +PLATFORM := linux-x86_64-cp-38-cp38 +endif + +ifdef PYTHON_TAG +PYTHON_TAG := $(PYTHON_TAG) +else +PYTHON_TAG := py38 endif SUBALIGNER_VERSION := $(SUBALIGNER_VERSION) @@ -153,7 +159,7 @@ test-dist: dist: clean-dist test-dist cat requirements-dev.txt | xargs -L 1 .$(PYTHON)/bin/pip install; \ - .$(PYTHON)/bin/python setup.py sdist bdist_wheel bdist_egg + .$(PYTHON)/bin/python setup.py sdist bdist_wheel --python-tag=$(PYTHON_TAG) release: .$(PYTHON)/bin/twine upload dist/* diff --git a/Pipfile b/Pipfile index 5e4dffc..1c37619 100644 --- a/Pipfile +++ b/Pipfile @@ -38,8 +38,6 @@ dask = ">=2021.10.0,<2022.1.0" decorator = "==4.3.0" distributed = "==1.13.0" filelock = "<4.0.0" -google-auth = "==1.27.0" -google-auth-oauthlib = "==0.4.2" google-pasta = "~=0.2" graphviz = "==0.8.3" HeapDict = "==1.0.0" @@ -61,7 +59,7 @@ mccabe = "==0.6.1" numba = ">=0.50.0" numpy = "<1.24.0" oauthlib = "==3.1.0" -openai-whisper = "==20230124" +openai-whisper = "==20230314" pbr = "==4.0.2" pluggy = "==0.13.1" psutil = "==5.6.7" @@ -81,8 +79,6 @@ pystack-debugger = "==0.8.0" python-dateutil = "==2.7.2" pytz = "==2018.4" PyYAML = ">=4.2b1" -requests = "~=2.25.1" -requests-oauthlib = "==1.3.0" rsa = "==4.7" scipy = "<=1.8.1" scikit-learn = ">=0.19.1" diff --git a/requirements-aarch64.txt b/requirements-aarch64.txt new file mode 100644 index 0000000..71e5d49 --- /dev/null +++ b/requirements-aarch64.txt @@ -0,0 +1,71 @@ +astor==0.7.1 +beautifulsoup4<4.9.0 +bleach==3.3.0 +cachetools==3.1.1 +captionstransformer~=1.2.1 +cchardet==2.1.7 +certifi==2019.11.28 +chardet==3.0.4 +click==5.1 +cloudpickle~=1.6.0 +cycler==0.10.0 +Cython~=0.29.22 +dask>=2021.10.0,<2022.1.0 +decorator==4.3.0 +distributed==1.13.0 +filelock<4.0.0 +google-auth-oauthlib==0.4.2 +google-pasta~=0.2 +graphviz==0.8.3 +HeapDict==1.0.0 +h5py<=4.0.0 +html5lib==1.0b9 +hyperopt==0.2.4 +idna==2.8 +isort==4.3.4 +joblib>=1.2.0 +keras~=2.12.0 +kiwisolver==1.0.1 +lazy-object-proxy==1.4.3 +le-pycaption==2.2.0a1 +librosa<0.10.0 +locket==0.2.0 +Markdown==2.6.11 +mccabe==0.6.1 +networkx>=2.5.1 +numba>=0.50.0 +numpy<1.24.0 +oauthlib==3.1.0 +pbr==4.0.2 +pluggy==0.13.1 +protobuf<4.0 +psutil==5.6.7 +py==1.10.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.7 +pycountry~=20.7.3 +pydot==1.2.4 +pydot-ng==1.0.0 +pydotplus==2.0.2 +pyprof2calltree==1.4.3 +pysrt==1.1.1 +pysubs2<=1.4.2 +pystack-debugger==0.8.0 +pytz==2018.4 +PyYAML>=4.2b1 +rsa==4.7 +scipy<=1.8.1 +scikit-learn<1.2.0 +setuptools>=41.0.0 +six~=1.15.0 +tblib==1.3.2 +tensorflow-macos~=2.12.0 +tensorflow-metal~=0.8.0 +termcolor==1.1.0 +toml==0.10.0 +toolz==0.9.0 +tornado==5.1.0 +urllib3~=1.26.5 +Werkzeug>=0.15.3 +zict==0.1.3 +zipp==0.6.0 \ No newline at end of file diff --git a/requirements-llm.txt b/requirements-llm.txt index ec609ca..0e0de7e 100644 --- a/requirements-llm.txt +++ b/requirements-llm.txt @@ -1,4 +1,4 @@ sentencepiece~=0.1.95 torch<1.13.0 transformers<4.27.0 -openai-whisper==20230124 \ No newline at end of file +openai-whisper==20230314 diff --git a/requirements.txt 
b/requirements.txt index 489686f..01611cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,6 @@ dask>=2021.10.0,<2022.1.0 decorator==4.3.0 distributed==1.13.0 filelock<4.0.0 -google-auth-oauthlib==0.4.2 google-pasta~=0.2 graphviz==0.8.3 HeapDict==1.0.0 @@ -53,8 +52,6 @@ pysubs2<=1.4.2 pystack-debugger==0.8.0 pytz==2018.4 PyYAML>=4.2b1 -requests<3.0.0 -requests-oauthlib==1.3.0 rsa==4.7 scipy<=1.8.1 scikit-learn<1.2.0 diff --git a/setup.py b/setup.py index 1220641..3b6545d 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import os +import platform from setuptools import setup @@ -11,6 +12,13 @@ with open("README.md") as readme_file: readme = readme_file.read() +if platform.machine() == "arm64": + with open("requirements-aarch64.txt") as requirements_file: + requirements = requirements_file.read().splitlines()[::-1] +else: + with open("requirements.txt") as requirements_file: + requirements = requirements_file.read().splitlines()[::-1] + with open("requirements.txt") as requirements_file: requirements = requirements_file.read().splitlines()[::-1] @@ -52,6 +60,7 @@ long_description=readme + "\n\n", long_description_content_type='text/markdown', python_requires=">=3.8", + wheel=True, package_dir={"subaligner": "subaligner"}, packages=[ "subaligner", @@ -92,4 +101,5 @@ "subaligner_convert=subaligner.subaligner_convert.__main__:main", "subaligner_train=subaligner.subaligner_train.__main__:main", "subaligner_tune=subaligner.subaligner_tune.__main__:main", - ]}) + ] + }) diff --git a/subaligner/_version.py b/subaligner/_version.py index c4206c9..4103c8a 100644 --- a/subaligner/_version.py +++ b/subaligner/_version.py @@ -1,2 +1,2 @@ """The semver for the current release.""" -__version__ = "0.3.3" +__version__ = "0.3.4" From c400a14cfee9d1b6ff6a680cd7c3cce6c2d10bf6 Mon Sep 17 00:00:00 2001 From: baxtree Date: Mon, 10 Jul 2023 18:36:06 +0100 Subject: [PATCH 19/20] improve setup and deprecate dependiencies --- .github/workflows/ci-pipeline.yml | 3 +- Makefile | 8 +---- Pipfile | 8 ++--- README.md | 2 +- docker/Dockerfile-Debian11 | 4 +++ pyproject.toml | 2 ++ ...ents-aarch64.txt => requirements-arm64.txt | 4 --- requirements.txt | 9 ++--- setup.py | 35 ++++++++++++++----- subaligner/__init__.py | 5 +++ subaligner/utils.py | 6 ++-- tests/subaligner/test_utils.py | 4 +-- 12 files changed, 50 insertions(+), 40 deletions(-) create mode 100644 pyproject.toml rename requirements-aarch64.txt => requirements-arm64.txt (93%) diff --git a/.github/workflows/ci-pipeline.yml b/.github/workflows/ci-pipeline.yml index 65bfb67..147e736 100644 --- a/.github/workflows/ci-pipeline.yml +++ b/.github/workflows/ci-pipeline.yml @@ -1,4 +1,4 @@ -name: ci pipeline +name: CI Pipeline on: push: @@ -28,6 +28,7 @@ jobs: sudo apt-get -y install espeak libespeak1 libespeak-dev espeak-data sudo apt-get -y install libsndfile-dev python -m pip install --upgrade pip + python -m pip install --upgrade setuptools wheel cat requirements.txt | xargs -L 1 pip install cat requirements-stretch.txt | xargs -L 1 pip install cat requirements-llm.txt | xargs -L 1 pip install diff --git a/Makefile b/Makefile index 8c6a7a6..2463e21 100644 --- a/Makefile +++ b/Makefile @@ -10,12 +10,6 @@ else PLATFORM := linux-x86_64-cp-38-cp38 endif -ifdef PYTHON_TAG -PYTHON_TAG := $(PYTHON_TAG) -else -PYTHON_TAG := py38 -endif - SUBALIGNER_VERSION := $(SUBALIGNER_VERSION) TRIGGER_URL := ${TRIGGER_URL} @@ -159,7 +153,7 @@ test-dist: dist: clean-dist test-dist cat requirements-dev.txt | xargs -L 1 .$(PYTHON)/bin/pip 
install; \ - .$(PYTHON)/bin/python setup.py sdist bdist_wheel --python-tag=$(PYTHON_TAG) + .$(PYTHON)/bin/python setup.py sdist bdist_wheel release: .$(PYTHON)/bin/twine upload dist/* diff --git a/Pipfile b/Pipfile index 1c37619..1a4f1a7 100644 --- a/Pipfile +++ b/Pipfile @@ -29,11 +29,10 @@ bleach = "==3.3.0" cachetools = "==3.1.1" captionstransformer = "~=1.2.1" certifi = "==2019.11.28" -chardet = "==3.0.4" +chardet = "~=3.0.4" click = "==5.1" cloudpickle = "==0.5.3" cycler = "==0.10.0" -Cython = "~=0.29.22" dask = ">=2021.10.0,<2022.1.0" decorator = "==4.3.0" distributed = "==1.13.0" @@ -49,8 +48,6 @@ isort = "==4.3.4" joblib = ">=1.2.0" Keras-Applications = ">=1.0.8" Keras-Preprocessing = ">=1.0.9" -kiwisolver = "==1.0.1" -lazy-object-proxy = "==1.4.3" le-pycaption = "==2.2.0a1" librosa = "<0.10.0" locket = "==0.2.0" @@ -61,8 +58,8 @@ numpy = "<1.24.0" oauthlib = "==3.1.0" openai-whisper = "==20230314" pbr = "==4.0.2" +pkgconfig = "~=1.5.5" pluggy = "==0.13.1" -psutil = "==5.6.7" py = "==1.10.0" pyasn1 = "==0.4.8" pyasn1-modules = "==0.2.7" @@ -83,7 +80,6 @@ rsa = "==4.7" scipy = "<=1.8.1" scikit-learn = ">=0.19.1" sentencepiece = "~=0.1.95" -setuptools = ">=41.0.0" six = "~=1.15.0" tblib = "==1.3.2" tensorflow = ">=1.15.5,<2.12" diff --git a/README.md b/README.md index 71d9207..c34e715 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ $ brew install ffmpeg ## Basic Installation ``` -$ pip install -U pip && pip install -U setuptools +$ pip install -U pip && pip install -U setuptools wheel $ pip install subaligner ``` or install from source: diff --git a/docker/Dockerfile-Debian11 b/docker/Dockerfile-Debian11 index 1d91f42..475ff82 100644 --- a/docker/Dockerfile-Debian11 +++ b/docker/Dockerfile-Debian11 @@ -11,9 +11,13 @@ RUN ["/bin/bash", "-c", "apt -y update &&\ apt -y install ffmpeg &&\ apt -y install espeak libespeak1 libespeak-dev espeak-data &&\ apt -y install libsndfile-dev &&\ + apt -y install libblas-dev liblapack-dev &&\ apt -y install python3-dev &&\ apt -y install python3-tk &&\ apt -y install python3-pip &&\ + apt -y install python3-venv &&\ + python3 -m venv .venv &&\ + source .venv/bin/activate &&\ python3 -m pip install --upgrade pip &&\ python3 -m pip install \"subaligner==${RELEASE_VERSION}\" &&\ python3 -m pip install \"subaligner[harmony]==${RELEASE_VERSION}\""] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..eb56957 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[build-system] +requires = ["setuptools", "wheel", "Cython"] \ No newline at end of file diff --git a/requirements-aarch64.txt b/requirements-arm64.txt similarity index 93% rename from requirements-aarch64.txt rename to requirements-arm64.txt index 71e5d49..92ff604 100644 --- a/requirements-aarch64.txt +++ b/requirements-arm64.txt @@ -3,13 +3,11 @@ beautifulsoup4<4.9.0 bleach==3.3.0 cachetools==3.1.1 captionstransformer~=1.2.1 -cchardet==2.1.7 certifi==2019.11.28 chardet==3.0.4 click==5.1 cloudpickle~=1.6.0 cycler==0.10.0 -Cython~=0.29.22 dask>=2021.10.0,<2022.1.0 decorator==4.3.0 distributed==1.13.0 @@ -25,8 +23,6 @@ idna==2.8 isort==4.3.4 joblib>=1.2.0 keras~=2.12.0 -kiwisolver==1.0.1 -lazy-object-proxy==1.4.3 le-pycaption==2.2.0a1 librosa<0.10.0 locket==0.2.0 diff --git a/requirements.txt b/requirements.txt index 01611cd..167d439 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,13 +3,11 @@ beautifulsoup4<4.9.0 bleach==3.3.0 cachetools==3.1.1 captionstransformer~=1.2.1 -cchardet==2.1.7 certifi==2019.11.28 -chardet==3.0.4 +chardet~=3.0.4 click==5.1 
cloudpickle~=1.6.0 cycler==0.10.0 -Cython~=0.29.22 dask>=2021.10.0,<2022.1.0 decorator==4.3.0 distributed==1.13.0 @@ -25,8 +23,6 @@ isort==4.3.4 joblib>=1.2.0 Keras-Applications>=1.0.8 Keras-Preprocessing>=1.0.9 -kiwisolver==1.0.1 -lazy-object-proxy==1.4.3 le-pycaption==2.2.0a1 librosa<0.10.0 locket==0.2.0 @@ -37,8 +33,8 @@ numba>=0.50.0 numpy<1.24.0 oauthlib==3.1.0 pbr==4.0.2 +pkgconfig~=1.5.5 pluggy==0.13.1 -psutil==5.6.7 py==1.10.0 pyasn1==0.4.8 pyasn1-modules==0.2.7 @@ -55,7 +51,6 @@ PyYAML>=4.2b1 rsa==4.7 scipy<=1.8.1 scikit-learn<1.2.0 -setuptools>=41.0.0 six~=1.15.0 tblib==1.3.2 tensorflow>=1.15.5,<2.13 diff --git a/setup.py b/setup.py index 3b6545d..b84e1a4 100644 --- a/setup.py +++ b/setup.py @@ -2,9 +2,10 @@ # -*- coding: utf-8 -*- import os -import platform - +import sys +from platform import architecture, machine from setuptools import setup +from wheel.bdist_wheel import bdist_wheel with open(os.path.join(os.getcwd(), "subaligner", "_version.py")) as f: exec(f.read()) @@ -12,16 +13,13 @@ with open("README.md") as readme_file: readme = readme_file.read() -if platform.machine() == "arm64": - with open("requirements-aarch64.txt") as requirements_file: +if machine() == "arm64": + with open("requirements-arm64.txt") as requirements_file: requirements = requirements_file.read().splitlines()[::-1] else: with open("requirements.txt") as requirements_file: requirements = requirements_file.read().splitlines()[::-1] -with open("requirements.txt") as requirements_file: - requirements = requirements_file.read().splitlines()[::-1] - with open("requirements-stretch.txt") as stretch_requirements_file: stretch_requirements = stretch_requirements_file.read().splitlines()[::-1] @@ -43,6 +41,24 @@ "llm": llm_requirements, } +architecture = architecture()[0] if sys.platform == "win32" else machine() + + +class bdist_wheel_local(bdist_wheel): + + def get_tag(self): + python = f"py{sys.version_info.major}{sys.version_info.minor}" + if sys.platform == "darwin" and architecture == "arm64": + os_arch = "macosx_11_0_arm64" + elif sys.platform == "win32": + os_arch = "win32" if architecture == "32bit" else "win_amd64" + # elif sys.platform == "linux": + # os_arch = f"manylinux_2_17_{architecture}" + else: + os_arch = "any" + return python, "none", os_arch + + setup(name="subaligner", version=__version__, author="Xi Bai", @@ -58,7 +74,7 @@ url="https://subaligner.readthedocs.io/en/latest/", description="Automatically synchronize and translate subtitles, or create new ones by transcribing, using pre-trained DNNs, Forced Alignments and Transformers.", long_description=readme + "\n\n", - long_description_content_type='text/markdown', + long_description_content_type="text/markdown", python_requires=">=3.8", wheel=True, package_dir={"subaligner": "subaligner"}, @@ -102,4 +118,5 @@ "subaligner_train=subaligner.subaligner_train.__main__:main", "subaligner_tune=subaligner.subaligner_tune.__main__:main", ] - }) + }, + cmdclass={"bdist_wheel": bdist_wheel_local}) diff --git a/subaligner/__init__.py b/subaligner/__init__.py index ae79fcb..70d02fe 100644 --- a/subaligner/__init__.py +++ b/subaligner/__init__.py @@ -1,5 +1,6 @@ import os import warnings +import logging import multiprocessing as mp from ._version import __version__ @@ -10,3 +11,7 @@ mp.set_start_method("spawn", force=True) os.environ["KMP_WARNINGS"] = "0" + +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" +os.environ["TF_CPP_MIN_VLOG_LEVEL"] = "0" +logging.getLogger("tensorflow").disabled = True diff --git a/subaligner/utils.py b/subaligner/utils.py index 
fa50d0f..1a771dd 100644 --- a/subaligner/utils.py +++ b/subaligner/utils.py @@ -3,7 +3,7 @@ import pysubs2 import requests import shutil -import cchardet +import chardet import shlex import pycountry @@ -595,9 +595,9 @@ def detect_encoding(subtitle_file_path: str) -> str: # and hence this less memory-efficient solution: raw = b"".join(file.readlines()) - detected = cchardet.detect(raw) + detected = chardet.detect(raw) detected = detected or {} - return detected["encoding"] if "encoding" in detected else None + return detected["encoding"] if "encoding" in detected and detected["encoding"] is not None else "utf-8" @staticmethod def get_file_root_and_extension(file_path: str) -> Tuple[str, str]: diff --git a/tests/subaligner/test_utils.py b/tests/subaligner/test_utils.py index da6689c..b4f9cbd 100644 --- a/tests/subaligner/test_utils.py +++ b/tests/subaligner/test_utils.py @@ -281,8 +281,8 @@ def test_contains_embedded_subtitle(self): self.assertFalse(Undertest.contains_embedded_subtitles(self.mp4_file_path)) def test_detect_encoding(self): - self.assertEqual("ASCII", Undertest.detect_encoding(self.real_srt_path)) - self.assertEqual("UTF-8", Undertest.detect_encoding(self.mkv_file_path)) + self.assertEqual("ascii", Undertest.detect_encoding(self.real_srt_path)) + self.assertEqual("utf-8", Undertest.detect_encoding(self.mkv_file_path)) def test_get_file_root_and_extension(self): root, extension = Undertest.get_file_root_and_extension("/path/to/root.ext1.ext2") From cc5cd13c1c11ff12103a1d9e8aeeb10065c80d6a Mon Sep 17 00:00:00 2001 From: baxtree Date: Thu, 13 Jul 2023 09:45:15 +0100 Subject: [PATCH 20/20] retire the Debian image --- .github/workflows/dockerhub.yml | 13 ------------- docker/Dockerfile-Debian11 | 1 + 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml index 6283144..4a02ec9 100644 --- a/.github/workflows/dockerhub.yml +++ b/.github/workflows/dockerhub.yml @@ -86,19 +86,6 @@ jobs: tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.fed34 push: true - - name: Build and push the Debian 11 image - id: docker_build_deb11 - uses: docker/build-push-action@v2 - with: - context: ./docker - file: "./docker/Dockerfile-Debian11" - build-args: | - "RELEASE_VERSION=${{ steps.tag.outputs.TAG }}" - allow: network.host - github-token: ${{ github.token }} - tags: baxtree/subaligner:${{ steps.tag.outputs.TAG }}.deb11 - push: true - - name: Build and push the ArchLinux image id: docker_build_arch uses: docker/build-push-action@v2 diff --git a/docker/Dockerfile-Debian11 b/docker/Dockerfile-Debian11 index 475ff82..a1fdf84 100644 --- a/docker/Dockerfile-Debian11 +++ b/docker/Dockerfile-Debian11 @@ -8,6 +8,7 @@ ENV RELEASE_VERSION=${RELEASE_VERSION} ENV TZ=Europe/London RUN ["/bin/bash", "-c", "apt -y update &&\ + apt -y install build-essential &&\ apt -y install ffmpeg &&\ apt -y install espeak libespeak1 libespeak-dev espeak-data &&\ apt -y install libsndfile-dev &&\