From a8444811e890c9ddc5c44ede6c9e1bd3e783aab3 Mon Sep 17 00:00:00 2001 From: Derek Zen Date: Wed, 16 Dec 2020 20:43:25 +0800 Subject: [PATCH 01/14] implemented multi-label support --- .gitignore | 3 +- hanlp/common/transform.py | 2 +- hanlp/common/vocab.py | 5 +- .../classifiers/transformer_classifier.py | 47 +++++++++++++++---- hanlp/transform/table.py | 24 ++++++++-- hanlp/utils/tf_util.py | 4 +- 6 files changed, 66 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index 76b3c5739..3c7ce29ca 100644 --- a/.gitignore +++ b/.gitignore @@ -284,4 +284,5 @@ fabric.properties .idea/caches/build_file_checksums.ser .idea *.iml -data \ No newline at end of file +data +.vscode/settings.json diff --git a/hanlp/common/transform.py b/hanlp/common/transform.py index 70fe81b8e..a5352d990 100644 --- a/hanlp/common/transform.py +++ b/hanlp/common/transform.py @@ -196,7 +196,7 @@ def mapper(X, Y): Y = self.y_to_idx(Y) return X, Y - dataset = dataset.map(mapper, num_parallel_calls=tf.data.experimental.AUTOTUNE) +dataset = dataset.map(mapper, num_parallel_calls=tf.data.experimental.AUTOTUNE) return dataset @abstractmethod diff --git a/hanlp/common/vocab.py b/hanlp/common/vocab.py index 7dec92ed0..74a0fc11f 100644 --- a/hanlp/common/vocab.py +++ b/hanlp/common/vocab.py @@ -79,7 +79,10 @@ def update(self, tokens: Iterable[str]) -> None: self.add(token) def get_idx(self, token: str) -> int: - idx = self.token_to_idx.get(token, None) + if type(token) is list: + idx = [self.get_idx(t) for t in token] + else: + idx = self.token_to_idx.get(token, None) if idx is None: if self.mutable: idx = len(self.token_to_idx) diff --git a/hanlp/components/classifiers/transformer_classifier.py b/hanlp/components/classifiers/transformer_classifier.py index 85270eae5..9f586f4b4 100644 --- a/hanlp/components/classifiers/transformer_classifier.py +++ b/hanlp/components/classifiers/transformer_classifier.py @@ -15,13 +15,14 @@ from hanlp.transform.table import TableTransform from hanlp.utils.log_util import logger from hanlp.utils.util import merge_locals_kwargs +import numpy as np class TransformerTextTransform(TableTransform): def __init__(self, config: SerializableDict = None, map_x=False, map_y=True, x_columns=None, - y_column=-1, skip_header=True, delimiter='auto', **kwargs) -> None: - super().__init__(config, map_x, map_y, x_columns, y_column, skip_header, delimiter, **kwargs) + y_column=-1, skip_header=True, delimiter='auto', multi_label=False, **kwargs) -> None: + super().__init__(config, map_x, map_y, x_columns, y_column, multi_label, skip_header, delimiter, **kwargs) self.tokenizer: FullTokenizer = None def inputs_to_samples(self, inputs, gold=False): @@ -61,17 +62,17 @@ def inputs_to_samples(self, inputs, gold=False): segment_ids += [0] * diff assert len(token_ids) == max_length, "Error with input length {} vs {}".format(len(token_ids), max_length) - assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), - max_length) - assert len(segment_ids) == max_length, "Error with input length {} vs {}".format(len(segment_ids), - max_length) + assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) + assert len(segment_ids) == max_length, "Error with input length {} vs {}".format(len(segment_ids), max_length) + + label = Y yield (token_ids, attention_mask, segment_ids), label def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]: max_length = self.config.max_length types = (tf.int32, tf.int32, tf.int32), tf.string - shapes = ([max_length], [max_length], [max_length]), [] + shapes = ([max_length], [max_length], [max_length]), [None,] if self.config.multi_label else [] values = (0, 0, 0), self.label_vocab.safe_pad_token return types, shapes, values @@ -79,8 +80,22 @@ def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]: logger.fatal('map_x should always be set to True') exit(1) + def y_to_idx(self, y) -> tf.Tensor: + if self.config.multi_label: + #need to change index to binary vector + mapped = tf.map_fn(fn=lambda x: tf.cast(self.label_vocab.lookup(x), tf.int32), elems=y, fn_output_signature=tf.TensorSpec(dtype=tf.dtypes.int32, shape=[None,])) + one_hots = tf.one_hot(mapped, len(self.label_vocab)) + idx = tf.reduce_sum(one_hots, -2) + else: + idx = self.label_vocab.lookup(y) + return idx + def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None, batch=None) -> Iterable: - preds = tf.argmax(Y, axis=-1) + # Prediction to be Y > 0: + if self.config.multi_label: + preds = Y + else: + preds = tf.argmax(Y, axis=-1) for y in preds: yield self.label_vocab.idx_to_token[y] @@ -126,7 +141,14 @@ def _y_id_to_str(self, Y_pred) -> str: return self.transform.label_vocab.idx_to_token[Y_pred.numpy()] def build_loss(self, loss, **kwargs): - loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + if loss: + assert isinstance(loss, tf.keras.losses.loss), 'Must specify loss as an instance in tf.keras.losses' + return loss + elif self.config.multi_label: + #Loss to be BinaryCrossentropy for multi-label: + loss = tf.keras.losses.BinaryCrossentropy(from_logits=True) + else: + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) return loss # noinspection PyMethodOverriding @@ -158,3 +180,10 @@ def build_vocab(self, trn_data, logger): warmup_steps_per_epoch = math.ceil(train_examples * self.config.warmup_steps_ratio / self.config.batch_size) self.config.warmup_steps = warmup_steps_per_epoch * self.config.epochs return train_examples + + def build_metrics(self, metrics, logger, **kwargs): + if self.config.multi_label: + metric = tf.keras.metrics.BinaryCrossentropy() + else: + metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + return [metric]x \ No newline at end of file diff --git a/hanlp/transform/table.py b/hanlp/transform/table.py index 046be98f0..ad95fd8f7 100644 --- a/hanlp/transform/table.py +++ b/hanlp/transform/table.py @@ -3,7 +3,7 @@ # Date: 2019-11-10 21:00 from abc import ABC from typing import Tuple, Union - +import numpy as np import tensorflow as tf from hanlp.common.structure import SerializableDict @@ -16,9 +16,9 @@ class TableTransform(Transform, ABC): def __init__(self, config: SerializableDict = None, map_x=False, map_y=True, x_columns=None, - y_column=-1, + y_column=-1, multi_label=False, skip_header=True, delimiter='auto', **kwargs) -> None: - super().__init__(config, map_x, map_y, x_columns=x_columns, y_column=y_column, + super().__init__(config, map_x, map_y, x_columns=x_columns, y_column=y_column, multi_label=multi_label, skip_header=skip_header, delimiter=delimiter, **kwargs) self.label_vocab = create_label_vocab() @@ -28,6 +28,9 @@ def file_to_inputs(self, filepath: str, gold=True): y_column = self.config.y_column num_features = self.config.get('num_features', None) for cells in read_cells(filepath, skip_header=self.config.skip_header, delimiter=self.config.delimiter): + #multi-label: Dataset in .tsv format: x_columns: at most 2 columns being a sentence pair while in most + # cases just one column being the doc content. y_column being the single label, which shall be modified + # to load a list of labels. if x_columns: inputs = tuple(c for i, c in enumerate(cells) if i in x_columns), cells[y_column] else: @@ -37,6 +40,15 @@ def file_to_inputs(self, filepath: str, gold=True): if num_features is None: num_features = len(inputs[0]) self.config.num_features = num_features + # multi-label support + if self.config.multi_label: + assert type(inputs[1]) is str, 'Y value has to be string' + if inputs[1][0] == '[': + # multi-label is in literal form of a list + labels = eval(inputs[1]) + else: + labels = inputs[1].strip().split(',') + inputs = inputs[0], labels else: assert num_features == len(inputs[0]), f'Numbers of columns {num_features} ' \ f'inconsistent with current {len(inputs[0])}' @@ -56,7 +68,11 @@ def y_to_idx(self, y) -> tf.Tensor: def fit(self, trn_path: str, **kwargs): samples = 0 for t in self.file_to_samples(trn_path, gold=True): - self.label_vocab.add(t[1]) # the second one regardless of t is pair or triple + if self.config.multi_label: + for l in t[1]: + self.label_vocab.add(l) + else: + self.label_vocab.add(t[1]) # the second one regardless of t is pair or triple samples += 1 return samples diff --git a/hanlp/utils/tf_util.py b/hanlp/utils/tf_util.py index 465856cc7..1ea040ea9 100644 --- a/hanlp/utils/tf_util.py +++ b/hanlp/utils/tf_util.py @@ -11,9 +11,7 @@ def size_of_dataset(dataset: tf.data.Dataset) -> int: - count = 0 - for element in dataset.unbatch().batch(1): - count += 1 + count = len(list(dataset.unbatch().as_numpy_iterator())) return count From 62f7b3da720e7e517c44f01e49594ac065ed79a8 Mon Sep 17 00:00:00 2001 From: Derek Zen Date: Thu, 17 Dec 2020 13:27:21 +0800 Subject: [PATCH 02/14] multi-label support cherry picked to master --- hanlp/components/classifiers/transformer_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hanlp/components/classifiers/transformer_classifier.py b/hanlp/components/classifiers/transformer_classifier.py index 9f586f4b4..60ec82cbe 100644 --- a/hanlp/components/classifiers/transformer_classifier.py +++ b/hanlp/components/classifiers/transformer_classifier.py @@ -183,7 +183,7 @@ def build_vocab(self, trn_data, logger): def build_metrics(self, metrics, logger, **kwargs): if self.config.multi_label: - metric = tf.keras.metrics.BinaryCrossentropy() + metric = tf.keras.metrics.BinaryAccuracy('binary_accuracy') else: metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') return [metric]x \ No newline at end of file From d4104d7d57e1e08e24c659165422e43caaf017a9 Mon Sep 17 00:00:00 2001 From: Derek Zhang Date: Wed, 16 Dec 2020 21:49:57 +0800 Subject: [PATCH 03/14] minor fix --- hanlp/common/transform.py | 2 +- hanlp/components/classifiers/transformer_classifier.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hanlp/common/transform.py b/hanlp/common/transform.py index a5352d990..70fe81b8e 100644 --- a/hanlp/common/transform.py +++ b/hanlp/common/transform.py @@ -196,7 +196,7 @@ def mapper(X, Y): Y = self.y_to_idx(Y) return X, Y -dataset = dataset.map(mapper, num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.map(mapper, num_parallel_calls=tf.data.experimental.AUTOTUNE) return dataset @abstractmethod diff --git a/hanlp/components/classifiers/transformer_classifier.py b/hanlp/components/classifiers/transformer_classifier.py index 60ec82cbe..a70d9b65a 100644 --- a/hanlp/components/classifiers/transformer_classifier.py +++ b/hanlp/components/classifiers/transformer_classifier.py @@ -186,4 +186,4 @@ def build_metrics(self, metrics, logger, **kwargs): metric = tf.keras.metrics.BinaryAccuracy('binary_accuracy') else: metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') - return [metric]x \ No newline at end of file + return [metric] \ No newline at end of file From 91e9847dd941fcb728a2e031fc7d8beff30f73fd Mon Sep 17 00:00:00 2001 From: Derek Zen Date: Thu, 17 Dec 2020 01:31:15 +0800 Subject: [PATCH 04/14] minor fix --- hanlp/common/component.py | 8 +++++++- hanlp/layers/transformers/loader.py | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/hanlp/common/component.py b/hanlp/common/component.py index 44fd8408c..5a347a5c7 100644 --- a/hanlp/common/component.py +++ b/hanlp/common/component.py @@ -10,7 +10,7 @@ import numpy as np import tensorflow as tf - +from tensorflow.keras.mixed_precision import experimental as mixed_precision import hanlp import hanlp.version from hanlp.callbacks.fine_csv_logger import FineCSVLogger @@ -331,6 +331,12 @@ def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, run_eagerly=Fals logger.info('Building...') train_steps_per_epoch = math.ceil(num_examples / batch_size) if num_examples else None self.config.train_steps = train_steps_per_epoch * epochs if num_examples else None + # mixed precision + if self.config.use_amp: + policy = mixed_precision.Policy('mixed_float16') + mixed_precision.set_policy(policy) + print('Compute dtype: %s' % policy.compute_dtype) + print('Variable dtype: %s' % policy.variable_dtype) model, optimizer, loss, metrics = self.build(**merge_dict(self.config, logger=logger, training=True)) logger.info('Model built:\n' + summary_of_model(self.model)) self.save_config(save_dir) diff --git a/hanlp/layers/transformers/loader.py b/hanlp/layers/transformers/loader.py index 8cea1c08c..53ca44e91 100644 --- a/hanlp/layers/transformers/loader.py +++ b/hanlp/layers/transformers/loader.py @@ -12,6 +12,9 @@ from hanlp.layers.transformers import zh_albert_models_google, bert_models_google from hanlp.utils.io_util import get_resource, stdout_redirected, hanlp_home +gpu_devices = tf.config.experimental.list_physical_devices('GPU') +if len(gpu_devices)>0: + tf.config.experimental.set_memory_growth(gpu_devices[0], True) def build_transformer(transformer, max_seq_length, num_labels, tagging=True, tokenizer_only=False): spm_model_file = None From 7bae452a112f5ea9f36acb527957d832d9a1b115 Mon Sep 17 00:00:00 2001 From: Derek Zen Date: Thu, 17 Dec 2020 14:13:30 +0800 Subject: [PATCH 05/14] Revert "minor fix" This reverts commit 91e9847dd941fcb728a2e031fc7d8beff30f73fd. On branch master Your branch is up to date with 'origin/master'. Changes to be committed: modified: hanlp/common/component.py modified: hanlp/layers/transformers/loader.py --- hanlp/common/component.py | 8 +------- hanlp/layers/transformers/loader.py | 3 --- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/hanlp/common/component.py b/hanlp/common/component.py index 5a347a5c7..44fd8408c 100644 --- a/hanlp/common/component.py +++ b/hanlp/common/component.py @@ -10,7 +10,7 @@ import numpy as np import tensorflow as tf -from tensorflow.keras.mixed_precision import experimental as mixed_precision + import hanlp import hanlp.version from hanlp.callbacks.fine_csv_logger import FineCSVLogger @@ -331,12 +331,6 @@ def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, run_eagerly=Fals logger.info('Building...') train_steps_per_epoch = math.ceil(num_examples / batch_size) if num_examples else None self.config.train_steps = train_steps_per_epoch * epochs if num_examples else None - # mixed precision - if self.config.use_amp: - policy = mixed_precision.Policy('mixed_float16') - mixed_precision.set_policy(policy) - print('Compute dtype: %s' % policy.compute_dtype) - print('Variable dtype: %s' % policy.variable_dtype) model, optimizer, loss, metrics = self.build(**merge_dict(self.config, logger=logger, training=True)) logger.info('Model built:\n' + summary_of_model(self.model)) self.save_config(save_dir) diff --git a/hanlp/layers/transformers/loader.py b/hanlp/layers/transformers/loader.py index 53ca44e91..8cea1c08c 100644 --- a/hanlp/layers/transformers/loader.py +++ b/hanlp/layers/transformers/loader.py @@ -12,9 +12,6 @@ from hanlp.layers.transformers import zh_albert_models_google, bert_models_google from hanlp.utils.io_util import get_resource, stdout_redirected, hanlp_home -gpu_devices = tf.config.experimental.list_physical_devices('GPU') -if len(gpu_devices)>0: - tf.config.experimental.set_memory_growth(gpu_devices[0], True) def build_transformer(transformer, max_seq_length, num_labels, tagging=True, tokenizer_only=False): spm_model_file = None From 4a0dadc5bfb17ca0d90a703ba7ad3dc07319c910 Mon Sep 17 00:00:00 2001 From: Derek Zen Date: Sat, 19 Dec 2020 01:49:29 +0800 Subject: [PATCH 06/14] fixed safe pad issue for mult-label --- hanlp/common/vocab.py | 10 ++--- .../classifiers/transformer_classifier.py | 39 ++++++++++++------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/hanlp/common/vocab.py b/hanlp/common/vocab.py index 74a0fc11f..f242174b7 100644 --- a/hanlp/common/vocab.py +++ b/hanlp/common/vocab.py @@ -30,7 +30,7 @@ def __init__(self, idx_to_token: List[str] = None, token_to_idx: Dict = None, mu self.pad_token = pad_token self.unk_token = unk_token self.token_to_idx_table: tf.lookup.StaticHashTable = None - self.idx_to_token_table = None + # self.idx_to_token_table = None def __setitem__(self, token: str, idx: int): assert self.mutable, 'Update an immutable Vocab object is not allowed' @@ -239,8 +239,8 @@ def safe_pad_token(self) -> str: """ if self.pad_token: return self.pad_token - if self.first_token: - return self.first_token + # if self.first_token: + # return self.first_token return PAD @property @@ -259,8 +259,8 @@ def safe_unk_token(self) -> str: """ if self.unk_token: return self.unk_token - if self.first_token: - return self.first_token + # if self.first_token: + # return self.first_token return UNK diff --git a/hanlp/components/classifiers/transformer_classifier.py b/hanlp/components/classifiers/transformer_classifier.py index a70d9b65a..74477ac6d 100644 --- a/hanlp/components/classifiers/transformer_classifier.py +++ b/hanlp/components/classifiers/transformer_classifier.py @@ -82,9 +82,9 @@ def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]: def y_to_idx(self, y) -> tf.Tensor: if self.config.multi_label: - #need to change index to binary vector + #converrt index to binary vector mapped = tf.map_fn(fn=lambda x: tf.cast(self.label_vocab.lookup(x), tf.int32), elems=y, fn_output_signature=tf.TensorSpec(dtype=tf.dtypes.int32, shape=[None,])) - one_hots = tf.one_hot(mapped, len(self.label_vocab)) + one_hots = tf.one_hot(mapped, len(self.label_vocab), on_value=1, off_value=0) idx = tf.reduce_sum(one_hots, -2) else: idx = self.label_vocab.lookup(y) @@ -93,11 +93,13 @@ def y_to_idx(self, y) -> tf.Tensor: def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None, batch=None) -> Iterable: # Prediction to be Y > 0: if self.config.multi_label: - preds = Y + preds = [np.flatnonzero(y>0) for y in Y] + for p in preds: + yield [self.label_vocab.idx_to_token[i] for i in p] else: preds = tf.argmax(Y, axis=-1) - for y in preds: - yield self.label_vocab.idx_to_token[y] + for y in preds: + yield self.label_vocab.idx_to_token[y] def input_is_single_sample(self, input: Any) -> bool: return isinstance(input, (str, tuple)) @@ -122,23 +124,29 @@ def evaluate_output(self, tst_data, out, num_batches, metric): out.write('sentence\tpred\tgold\n') total, correct, score = 0, 0, 0 for idx, batch in enumerate(tst_data): - outputs = self.model.predict_on_batch(batch[0])[0] - outputs = tf.argmax(outputs, axis=1) - for X, Y_pred, Y_gold, in zip(batch[0][0], outputs, batch[1]): - feature = ' '.join(self.transform.tokenizer.convert_ids_to_tokens(X.numpy(), skip_special_tokens=True)) + outputs = self.model.predict_on_batch(batch[0]) + tokens = self.transform.Y_to_outputs(outputs) + Y_GT = self.transform.Y_to_outputs(batch[1]) + for X, Y_pred, Y_gold, in zip(batch[0][0], tokens, Y_GT):#batch[1]): + feature = ' '.join(self.transform.tokenizer.convert_ids_to_tokens(X.numpy()))#, skip_special_tokens=True)) feature = feature.replace(' ##', '') # fix sub-word generated by BERT tagger - out.write('{}\t{}\t{}\n'.format(feature, - self._y_id_to_str(Y_pred), - self._y_id_to_str(Y_gold))) + # Y_gold = self.transform.label_vocab.idx_to_token[Y_gold] + out.write('{}\t{}\t{}\n'.format(feature, Y_pred, Y_gold)) total += 1 - correct += int(tf.equal(Y_pred, Y_gold).numpy()) + correct += sum([1 for y1 in Y_gold for y2 in Y_pred if y1==y2])/len(Y_gold) if self.config.multi_label else int(Y_pred == Y_gold) score = correct / total print('\r{}/{} {}: {:.2f}'.format(idx + 1, num_batches, metric, score * 100), end='') print() return score - def _y_id_to_str(self, Y_pred) -> str: - return self.transform.label_vocab.idx_to_token[Y_pred.numpy()] + # def _y_id_to_str(self, Y_pred) -> str: + # logger.info(f'start to produce Y_pred: {Y_pred}') + # if self.config.multi_label: + # Y_pred = np.flatnonzero(Y_pred>0) + # return [self.transform.label_vocab.idx_to_token[y.numpy()] for y in Y_pred] + # else: + # Y_pred = tf.argmax(Y_pred, axis=1) + # return self.transform.label_vocab.idx_to_token[Y_pred.numpy()] def build_loss(self, loss, **kwargs): if loss: @@ -176,6 +184,7 @@ def build_model(self, transformer, max_length, **kwargs): return model def build_vocab(self, trn_data, logger): + self.transform.label_vocab.unlock() train_examples = super().build_vocab(trn_data, logger) warmup_steps_per_epoch = math.ceil(train_examples * self.config.warmup_steps_ratio / self.config.batch_size) self.config.warmup_steps = warmup_steps_per_epoch * self.config.epochs From a3ff10ab2952ff108d0d559aa56677a6db1d2549 Mon Sep 17 00:00:00 2001 From: Derek Zen Date: Sat, 19 Dec 2020 23:17:34 +0800 Subject: [PATCH 07/14] fixed loss and metrics for multi-label --- hanlp/common/component.py | 18 +++++++++++++----- .../classifiers/transformer_classifier.py | 14 ++++++++++---- hanlp/layers/transformers/loader.py | 4 ++++ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/hanlp/common/component.py b/hanlp/common/component.py index 44fd8408c..55fd57438 100644 --- a/hanlp/common/component.py +++ b/hanlp/common/component.py @@ -10,6 +10,7 @@ import numpy as np import tensorflow as tf +from tensorflow.keras.mixed_precision import experimental as mixed_precision import hanlp import hanlp.version @@ -326,6 +327,13 @@ def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, run_eagerly=Fals if not logger: logger = init_logger(name='train', root_dir=save_dir, level=logging.INFO if verbose else logging.WARN) logger.info('Hyperparameter:\n' + self.config.to_json()) + if self.config.use_amp: + policy = mixed_precision.Policy('mixed_float16') + mixed_precision.set_policy(policy) + logger.info(f'Global mixed precision policy has been set.') + logger.info('Compute dtype: %s' % policy.compute_dtype) + logger.info('Variable dtype: %s' % policy.variable_dtype) + num_examples = self.build_vocab(trn_data, logger) # assert num_examples, 'You forgot to return the number of training examples in your build_vocab' logger.info('Building...') @@ -397,16 +405,16 @@ def build_train_dataset(self, trn_data, batch_size, num_examples): return trn_data def build_callbacks(self, save_dir, logger, **kwargs): - metrics = kwargs.get('metrics', 'accuracy') - if isinstance(metrics, (list, tuple)): - metrics = metrics[-1] - monitor = f'val_{metrics}' + metrics_names = [m.name for m in kwargs.get('metrics', 'accuracy')] + if isinstance(metrics_names, (list, tuple)): + metrics_names = metrics_names[-1] + monitor = f'val_{metrics_names}' checkpoint = tf.keras.callbacks.ModelCheckpoint( os.path.join(save_dir, 'model.h5'), # verbose=1, monitor=monitor, save_best_only=True, mode='max', - save_weights_only=True) + save_weights_only=False) logger.debug(f'Monitor {checkpoint.monitor} for checkpoint') tensorboard_callback = tf.keras.callbacks.TensorBoard( log_dir=io_util.makedirs(io_util.path_join(save_dir, 'logs'))) diff --git a/hanlp/components/classifiers/transformer_classifier.py b/hanlp/components/classifiers/transformer_classifier.py index 74477ac6d..7c6407905 100644 --- a/hanlp/components/classifiers/transformer_classifier.py +++ b/hanlp/components/classifiers/transformer_classifier.py @@ -133,7 +133,7 @@ def evaluate_output(self, tst_data, out, num_batches, metric): # Y_gold = self.transform.label_vocab.idx_to_token[Y_gold] out.write('{}\t{}\t{}\n'.format(feature, Y_pred, Y_gold)) total += 1 - correct += sum([1 for y1 in Y_gold for y2 in Y_pred if y1==y2])/len(Y_gold) if self.config.multi_label else int(Y_pred == Y_gold) + correct += sum([1 for y1 in Y_gold for y2 in Y_pred if y1==y2])/max(len(Y_pred),len(Y_gold)) if self.config.multi_label else int(Y_pred == Y_gold) score = correct / total print('\r{}/{} {}: {:.2f}'.format(idx + 1, num_batches, metric, score * 100), end='') print() @@ -150,8 +150,9 @@ def evaluate_output(self, tst_data, out, num_batches, metric): def build_loss(self, loss, **kwargs): if loss: - assert isinstance(loss, tf.keras.losses.loss), 'Must specify loss as an instance in tf.keras.losses' - return loss + # assert isinstance(loss, tf.keras.losses.Loss), 'Must specify loss as an instance in tf.keras.losses.Loss' + if not isinstance(loss, tf.keras.losses.Loss): + logger.warn(f'loss function may not be compatible: {loss}') elif self.config.multi_label: #Loss to be BinaryCrossentropy for multi-label: loss = tf.keras.losses.BinaryCrossentropy(from_logits=True) @@ -191,8 +192,13 @@ def build_vocab(self, trn_data, logger): return train_examples def build_metrics(self, metrics, logger, **kwargs): + if metrics: + for metric in metrics: + assert isinstance(metric, tf.keras.metrics.Metric), f'Metrics defined may not be compatible: {metric}' + return metrics if self.config.multi_label: metric = tf.keras.metrics.BinaryAccuracy('binary_accuracy') else: metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') - return [metric] \ No newline at end of file + self.config['metrics'] = [metric] + return [metrics] \ No newline at end of file diff --git a/hanlp/layers/transformers/loader.py b/hanlp/layers/transformers/loader.py index 8cea1c08c..2d60d28ee 100644 --- a/hanlp/layers/transformers/loader.py +++ b/hanlp/layers/transformers/loader.py @@ -12,6 +12,10 @@ from hanlp.layers.transformers import zh_albert_models_google, bert_models_google from hanlp.utils.io_util import get_resource, stdout_redirected, hanlp_home +gpu_devices = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpu_devices: + tf.config.experimental.set_memory_growth(gpu, True) + def build_transformer(transformer, max_seq_length, num_labels, tagging=True, tokenizer_only=False): spm_model_file = None From c59fedccb19ec6c43f19bd74332f1ee55e00c1ec Mon Sep 17 00:00:00 2001 From: Derek Zhang Date: Sun, 20 Dec 2020 00:54:59 +0800 Subject: [PATCH 08/14] fix metrics passing --- hanlp/common/component.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hanlp/common/component.py b/hanlp/common/component.py index 55fd57438..ee52866cf 100644 --- a/hanlp/common/component.py +++ b/hanlp/common/component.py @@ -259,7 +259,7 @@ def build(self, logger, **kwargs): loss = self.build_loss( **self.config if 'loss' in self.config else dict(list(self.config.items()) + [('loss', None)])) # allow for different - metrics = self.build_metrics(**merge_dict(self.config, metrics=kwargs.get('metrics', 'accuracy'), + metrics = self.build_metrics(**merge_dict(self.config, metrics=kwargs.get('metrics', None), logger=logger, overwrite=True)) if not isinstance(metrics, list): if isinstance(metrics, tf.keras.metrics.Metric): From 6e71838be6af3f8fadc208a229154e1a2ed1377f Mon Sep 17 00:00:00 2001 From: Derek Zen Date: Sun, 20 Dec 2020 03:01:00 +0800 Subject: [PATCH 09/14] metrics fix --- hanlp/common/component.py | 12 +++++++----- .../components/classifiers/transformer_classifier.py | 5 +++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/hanlp/common/component.py b/hanlp/common/component.py index 55fd57438..95fedee9b 100644 --- a/hanlp/common/component.py +++ b/hanlp/common/component.py @@ -259,7 +259,7 @@ def build(self, logger, **kwargs): loss = self.build_loss( **self.config if 'loss' in self.config else dict(list(self.config.items()) + [('loss', None)])) # allow for different - metrics = self.build_metrics(**merge_dict(self.config, metrics=kwargs.get('metrics', 'accuracy'), + metrics = self.build_metrics(**merge_dict(self.config, metrics=kwargs.get('metrics', None), logger=logger, overwrite=True)) if not isinstance(metrics, list): if isinstance(metrics, tf.keras.metrics.Metric): @@ -346,7 +346,7 @@ def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, run_eagerly=Fals self.save_meta(save_dir) trn_data = self.build_train_dataset(trn_data, batch_size, num_examples) dev_data = self.build_valid_dataset(dev_data, batch_size) - callbacks = self.build_callbacks(save_dir, **merge_dict(self.config, overwrite=True, logger=logger)) + callbacks = self.build_callbacks(save_dir, **merge_dict(self.config, overwrite=True, logger=logger, metrics=metrics)) # need to know #batches, otherwise progbar crashes dev_steps = math.ceil(size_of_dataset(dev_data) / batch_size) checkpoint = get_callback_by_class(callbacks, tf.keras.callbacks.ModelCheckpoint) @@ -405,9 +405,11 @@ def build_train_dataset(self, trn_data, batch_size, num_examples): return trn_data def build_callbacks(self, save_dir, logger, **kwargs): - metrics_names = [m.name for m in kwargs.get('metrics', 'accuracy')] - if isinstance(metrics_names, (list, tuple)): - metrics_names = metrics_names[-1] + metrics = kwargs.get('metrics', 'accuracy') + if isinstance(metrics, str): + metrics_names = metrics + else: + metrics_names = [m.name for m in metrics][-1] monitor = f'val_{metrics_names}' checkpoint = tf.keras.callbacks.ModelCheckpoint( os.path.join(save_dir, 'model.h5'), diff --git a/hanlp/components/classifiers/transformer_classifier.py b/hanlp/components/classifiers/transformer_classifier.py index 7c6407905..6746e54f6 100644 --- a/hanlp/components/classifiers/transformer_classifier.py +++ b/hanlp/components/classifiers/transformer_classifier.py @@ -194,11 +194,12 @@ def build_vocab(self, trn_data, logger): def build_metrics(self, metrics, logger, **kwargs): if metrics: for metric in metrics: - assert isinstance(metric, tf.keras.metrics.Metric), f'Metrics defined may not be compatible: {metric}' + # assert isinstance(metric, tf.keras.metrics.Metric), f'Metrics defined may not be compatible: {metric}' + if not isinstance(metric, tf.keras.metrics.Metric): logger.warn(f'metric may not be compatible: {metric}') return metrics if self.config.multi_label: metric = tf.keras.metrics.BinaryAccuracy('binary_accuracy') else: metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') self.config['metrics'] = [metric] - return [metrics] \ No newline at end of file + return [metric] \ No newline at end of file From 825a300edf9721d803c63d4b98dd17f6071f9626 Mon Sep 17 00:00:00 2001 From: Derek Zhang Date: Sun, 20 Dec 2020 20:28:41 +0800 Subject: [PATCH 10/14] fixed metrics on loading --- hanlp/common/component.py | 15 ++++++++------- .../classifiers/transformer_classifier.py | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/hanlp/common/component.py b/hanlp/common/component.py index 95fedee9b..6ab4409a4 100644 --- a/hanlp/common/component.py +++ b/hanlp/common/component.py @@ -236,6 +236,7 @@ def save(self, save_dir: str, **kwargs): self.save_config(save_dir) self.save_vocabs(save_dir) self.save_weights(save_dir) + self.model.save(save_dir) def load(self, save_dir: str, logger=hanlp.utils.log_util.logger, **kwargs): self.meta['load_path'] = save_dir @@ -244,6 +245,7 @@ def load(self, save_dir: str, logger=hanlp.utils.log_util.logger, **kwargs): self.load_vocabs(save_dir) self.build(**merge_dict(self.config, training=False, logger=logger, **kwargs, overwrite=True, inplace=True)) self.load_weights(save_dir, **kwargs) + # tf.keras.models.load_model(save_dir) self.load_meta(save_dir) @property @@ -341,9 +343,7 @@ def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, run_eagerly=Fals self.config.train_steps = train_steps_per_epoch * epochs if num_examples else None model, optimizer, loss, metrics = self.build(**merge_dict(self.config, logger=logger, training=True)) logger.info('Model built:\n' + summary_of_model(self.model)) - self.save_config(save_dir) - self.save_vocabs(save_dir) - self.save_meta(save_dir) + self.save(save_dir) trn_data = self.build_train_dataset(trn_data, batch_size, num_examples) dev_data = self.build_valid_dataset(dev_data, batch_size) callbacks = self.build_callbacks(save_dir, **merge_dict(self.config, overwrite=True, logger=logger, metrics=metrics)) @@ -361,7 +361,8 @@ def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, run_eagerly=Fals except KeyboardInterrupt: print() if not checkpoint or checkpoint.best in (np.Inf, -np.Inf): - self.save_weights(save_dir) + # self.save_weights(save_dir) + self.save(save_dir) logger.info('Aborted with model saved') else: logger.info(f'Aborted with model saved with best {checkpoint.monitor} = {checkpoint.best:.4f}') @@ -413,10 +414,10 @@ def build_callbacks(self, save_dir, logger, **kwargs): monitor = f'val_{metrics_names}' checkpoint = tf.keras.callbacks.ModelCheckpoint( os.path.join(save_dir, 'model.h5'), - # verbose=1, + verbose=1, monitor=monitor, save_best_only=True, - mode='max', - save_weights_only=False) + mode='auto', + save_weights_only=True) logger.debug(f'Monitor {checkpoint.monitor} for checkpoint') tensorboard_callback = tf.keras.callbacks.TensorBoard( log_dir=io_util.makedirs(io_util.path_join(save_dir, 'logs'))) diff --git a/hanlp/components/classifiers/transformer_classifier.py b/hanlp/components/classifiers/transformer_classifier.py index 6746e54f6..ed1dc61c5 100644 --- a/hanlp/components/classifiers/transformer_classifier.py +++ b/hanlp/components/classifiers/transformer_classifier.py @@ -192,7 +192,7 @@ def build_vocab(self, trn_data, logger): return train_examples def build_metrics(self, metrics, logger, **kwargs): - if metrics: + if metrics and type(metrics[0]) is not str: for metric in metrics: # assert isinstance(metric, tf.keras.metrics.Metric), f'Metrics defined may not be compatible: {metric}' if not isinstance(metric, tf.keras.metrics.Metric): logger.warn(f'metric may not be compatible: {metric}') From 35858918baf4c300e5cd421ce249addf75a8280c Mon Sep 17 00:00:00 2001 From: Derek Zen Date: Sun, 20 Dec 2020 20:29:40 +0800 Subject: [PATCH 11/14] fix evaluation --- .../classifiers/transformer_classifier.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hanlp/components/classifiers/transformer_classifier.py b/hanlp/components/classifiers/transformer_classifier.py index 6746e54f6..b52b36437 100644 --- a/hanlp/components/classifiers/transformer_classifier.py +++ b/hanlp/components/classifiers/transformer_classifier.py @@ -126,15 +126,15 @@ def evaluate_output(self, tst_data, out, num_batches, metric): for idx, batch in enumerate(tst_data): outputs = self.model.predict_on_batch(batch[0]) tokens = self.transform.Y_to_outputs(outputs) - Y_GT = self.transform.Y_to_outputs(batch[1]) - for X, Y_pred, Y_gold, in zip(batch[0][0], tokens, Y_GT):#batch[1]): - feature = ' '.join(self.transform.tokenizer.convert_ids_to_tokens(X.numpy()))#, skip_special_tokens=True)) + Y_gold = self.transform.Y_to_outputs(batch[1]) if self.config.multi_label else batch[1] + for x, y_pred, y_gold, in zip(batch[0][0], tokens, Y_gold):#batch[1]): + feature = ''.join(self.transform.tokenizer.convert_ids_to_tokens(x.numpy()))#, skip_special_tokens=True)) feature = feature.replace(' ##', '') # fix sub-word generated by BERT tagger # Y_gold = self.transform.label_vocab.idx_to_token[Y_gold] - out.write('{}\t{}\t{}\n'.format(feature, Y_pred, Y_gold)) - total += 1 - correct += sum([1 for y1 in Y_gold for y2 in Y_pred if y1==y2])/max(len(Y_pred),len(Y_gold)) if self.config.multi_label else int(Y_pred == Y_gold) - score = correct / total + out.write('{}\t{}\t{}\n'.format(feature, y_pred, y_gold)) + # total += 1 + # correct += sum([1 for y1 in y_gold for y2 in y_pred if y1==y2])/max(len(y_pred),len(y_gold)) if self.config.multi_label else int(y_pred == y_gold) + score = metric[-1](Y_gold, list(tokens)) print('\r{}/{} {}: {:.2f}'.format(idx + 1, num_batches, metric, score * 100), end='') print() return score From ff092adb7cadf9c2b3b2b3c15d715f2fa7a0affb Mon Sep 17 00:00:00 2001 From: Derek Zen Date: Mon, 21 Dec 2020 16:24:19 +0800 Subject: [PATCH 12/14] fixed evaluation output on classification --- hanlp/common/component.py | 4 +- .../classifiers/transformer_classifier.py | 52 +++++++++---------- hanlp/transform/table.py | 4 +- 3 files changed, 29 insertions(+), 31 deletions(-) diff --git a/hanlp/common/component.py b/hanlp/common/component.py index 6ab4409a4..df951f7b0 100644 --- a/hanlp/common/component.py +++ b/hanlp/common/component.py @@ -126,9 +126,7 @@ def evaluate(self, input_path: str, save_dir=None, output=False, batch_size=128, format_scores(score) if isinstance(score, dict) else format_metrics(self.model.metrics), speed, extra_report)) if output: - logger.info('Saving output to {}'.format(output)) - with open(output, 'w', encoding='utf-8') as out: - self.evaluate_output(tst_data, out, num_batches, self.model.metrics) + self.evaluate_output(tst_data, output, num_batches, self.model.metrics) return loss, score, speed diff --git a/hanlp/components/classifiers/transformer_classifier.py b/hanlp/components/classifiers/transformer_classifier.py index daba371c6..7bb08e1ef 100644 --- a/hanlp/components/classifiers/transformer_classifier.py +++ b/hanlp/components/classifiers/transformer_classifier.py @@ -2,7 +2,7 @@ # Author: hankcs # Date: 2019-11-10 13:19 -import math +import math, re from typing import Union, Tuple, List, Any, Iterable import tensorflow as tf @@ -120,34 +120,32 @@ def fit(self, trn_data: Any, dev_data: Any, save_dir: str, transformer: str, max epochs=3, logger=None, verbose=1, **kwargs): return super().fit(**merge_locals_kwargs(locals(), kwargs)) - def evaluate_output(self, tst_data, out, num_batches, metric): - out.write('sentence\tpred\tgold\n') - total, correct, score = 0, 0, 0 - for idx, batch in enumerate(tst_data): - outputs = self.model.predict_on_batch(batch[0]) - tokens = self.transform.Y_to_outputs(outputs) - Y_gold = self.transform.Y_to_outputs(batch[1]) if self.config.multi_label else batch[1] - for x, y_pred, y_gold, in zip(batch[0][0], tokens, Y_gold):#batch[1]): - feature = ''.join(self.transform.tokenizer.convert_ids_to_tokens(x.numpy()))#, skip_special_tokens=True)) - feature = feature.replace(' ##', '') # fix sub-word generated by BERT tagger - # Y_gold = self.transform.label_vocab.idx_to_token[Y_gold] - out.write('{}\t{}\t{}\n'.format(feature, y_pred, y_gold)) - # total += 1 - # correct += sum([1 for y1 in y_gold for y2 in y_pred if y1==y2])/max(len(y_pred),len(y_gold)) if self.config.multi_label else int(y_pred == y_gold) - score = metric[-1](Y_gold, list(tokens)) - print('\r{}/{} {}: {:.2f}'.format(idx + 1, num_batches, metric, score * 100), end='') - print() + def evaluate_output(self, tst_data, output, num_batches, metrics): + metric = metrics[-1] + try: + metric_name = metric.name + except: + metric_name = 'accuracy' + logger.info('Saving output to {}'.format(output)) + with open(output, 'w', encoding='utf-8') as out: + out.write('sentence\tpred\tgold\n') + total, correct, score = 0, 0, 0 + for idx, batch in enumerate(tst_data): + Y_pred = self.model.predict_on_batch(batch[0]) + for x, y_pred, y_gold, in zip(batch[0][0], Y_pred, batch[1]): + feature = ''.join(self.transform.tokenizer.convert_ids_to_tokens(x.numpy()))#, skip_special_tokens=True)) + feature = feature.replace('[CLS]', '') + feature = feature.replace('[PAD]', '') + feature = feature.replace(' ##', '') # fix sub-word generated by BERT tagger + y_pred_str = self.transform.Y_to_outputs([y_pred]) + y_gold_str = self.transform.Y_to_outputs([y_gold]) + out.write('{}\t{}\t{}\n'.format(feature, y_pred_str, y_gold_str)) + total += 1 + correct += metric(y_gold, y_pred) + score = correct/total + logger.info(f'{idx + 1}/{num_batches} {metric_name}: {score * 100}') return score - # def _y_id_to_str(self, Y_pred) -> str: - # logger.info(f'start to produce Y_pred: {Y_pred}') - # if self.config.multi_label: - # Y_pred = np.flatnonzero(Y_pred>0) - # return [self.transform.label_vocab.idx_to_token[y.numpy()] for y in Y_pred] - # else: - # Y_pred = tf.argmax(Y_pred, axis=1) - # return self.transform.label_vocab.idx_to_token[Y_pred.numpy()] - def build_loss(self, loss, **kwargs): if loss: # assert isinstance(loss, tf.keras.losses.Loss), 'Must specify loss as an instance in tf.keras.losses.Loss' diff --git a/hanlp/transform/table.py b/hanlp/transform/table.py index ad95fd8f7..f86ef5316 100644 --- a/hanlp/transform/table.py +++ b/hanlp/transform/table.py @@ -31,7 +31,9 @@ def file_to_inputs(self, filepath: str, gold=True): #multi-label: Dataset in .tsv format: x_columns: at most 2 columns being a sentence pair while in most # cases just one column being the doc content. y_column being the single label, which shall be modified # to load a list of labels. - if x_columns: + if type(x_columns) is int: + inputs = [cells[x_columns]], cells[y_column] + elif type(x_columns) is list: inputs = tuple(c for i, c in enumerate(cells) if i in x_columns), cells[y_column] else: if y_column != -1: From 5d9bb8bb7ea033ad2672e79e82ccf8813ad754ac Mon Sep 17 00:00:00 2001 From: Derek Zen Date: Mon, 21 Dec 2020 17:51:31 +0800 Subject: [PATCH 13/14] fixed predict on multi-label --- hanlp/transform/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hanlp/transform/table.py b/hanlp/transform/table.py index f86ef5316..36350fbd2 100644 --- a/hanlp/transform/table.py +++ b/hanlp/transform/table.py @@ -62,7 +62,7 @@ def inputs_to_samples(self, inputs, gold=False): if gold: yield cells else: - yield cells, pad + yield cells, [pad] if self.config.multi_label else pad def y_to_idx(self, y) -> tf.Tensor: return self.label_vocab.lookup(y) From d2f1dc6a2dea1b50a3fd7a49ceadff1437f372f1 Mon Sep 17 00:00:00 2001 From: Derek Zen Date: Wed, 23 Dec 2020 03:35:40 +0800 Subject: [PATCH 14/14] fix on evaluation output --- hanlp/common/component.py | 14 ++++++---- .../classifiers/transformer_classifier.py | 26 +++++++++---------- hanlp/transform/table.py | 5 +--- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/hanlp/common/component.py b/hanlp/common/component.py index df951f7b0..b99b0b6f8 100644 --- a/hanlp/common/component.py +++ b/hanlp/common/component.py @@ -92,7 +92,7 @@ def evaluate(self, input_path: str, save_dir=None, output=False, batch_size=128, if save_dir and not logger: logger = init_logger(name=name, root_dir=save_dir, level=logging.INFO if verbose else logging.WARN, mode='w') - tst_data = self.transform.file_to_dataset(input_path, batch_size=batch_size) + tst_data = self.transform.file_to_dataset(input_path, batch_size=batch_size) samples = size_of_dataset(tst_data) num_batches = math.ceil(samples / batch_size) if warm_up: @@ -126,7 +126,7 @@ def evaluate(self, input_path: str, save_dir=None, output=False, batch_size=128, format_scores(score) if isinstance(score, dict) else format_metrics(self.model.metrics), speed, extra_report)) if output: - self.evaluate_output(tst_data, output, num_batches, self.model.metrics) + self.evaluate_output(tst_data, input=input_path, output=output, num_batches=num_batches, metrics=self.model.metrics) return loss, score, speed @@ -134,7 +134,7 @@ def evaluate_dataset(self, tst_data, callbacks, output, num_batches): loss, score = self.model.evaluate(tst_data, callbacks=callbacks, steps=num_batches) return loss, score, output - def evaluate_output(self, tst_data, out, num_batches, metrics: List[tf.keras.metrics.Metric]): + def evaluate_output(self, tst_data, input, output, num_batches, metrics: List[tf.keras.metrics.Metric]): # out.write('x\ty_true\ty_pred\n') for metric in metrics: metric.reset_states() @@ -231,14 +231,15 @@ def load_transform(self, save_dir) -> Transform: return self.transform def save(self, save_dir: str, **kwargs): + self.save_meta(save_dir) self.save_config(save_dir) self.save_vocabs(save_dir) self.save_weights(save_dir) self.model.save(save_dir) def load(self, save_dir: str, logger=hanlp.utils.log_util.logger, **kwargs): - self.meta['load_path'] = save_dir save_dir = get_resource(save_dir) + self.meta['load_path'] = save_dir self.load_config(save_dir) self.load_vocabs(save_dir) self.build(**merge_dict(self.config, training=False, logger=logger, **kwargs, overwrite=True, inplace=True)) @@ -408,7 +409,10 @@ def build_callbacks(self, save_dir, logger, **kwargs): if isinstance(metrics, str): metrics_names = metrics else: - metrics_names = [m.name for m in metrics][-1] + try: + metrics_names = metrics[-1].name + except: + metrics_names = 'accuracy' monitor = f'val_{metrics_names}' checkpoint = tf.keras.callbacks.ModelCheckpoint( os.path.join(save_dir, 'model.h5'), diff --git a/hanlp/components/classifiers/transformer_classifier.py b/hanlp/components/classifiers/transformer_classifier.py index 7bb08e1ef..bbcb9457c 100644 --- a/hanlp/components/classifiers/transformer_classifier.py +++ b/hanlp/components/classifiers/transformer_classifier.py @@ -93,11 +93,11 @@ def y_to_idx(self, y) -> tf.Tensor: def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None, batch=None) -> Iterable: # Prediction to be Y > 0: if self.config.multi_label: - preds = [np.flatnonzero(y>0) for y in Y] + preds = [np.flatnonzero(y>0) for y in Y] if not gold else Y for p in preds: yield [self.label_vocab.idx_to_token[i] for i in p] else: - preds = tf.argmax(Y, axis=-1) + preds = tf.argmax(Y, axis=-1) if not gold else Y for y in preds: yield self.label_vocab.idx_to_token[y] @@ -120,7 +120,7 @@ def fit(self, trn_data: Any, dev_data: Any, save_dir: str, transformer: str, max epochs=3, logger=None, verbose=1, **kwargs): return super().fit(**merge_locals_kwargs(locals(), kwargs)) - def evaluate_output(self, tst_data, output, num_batches, metrics): + def evaluate_output(self, tst_data, input, output, num_batches, metrics): metric = metrics[-1] try: metric_name = metric.name @@ -128,22 +128,22 @@ def evaluate_output(self, tst_data, output, num_batches, metrics): metric_name = 'accuracy' logger.info('Saving output to {}'.format(output)) with open(output, 'w', encoding='utf-8') as out: - out.write('sentence\tpred\tgold\n') total, correct, score = 0, 0, 0 + prediction = [] for idx, batch in enumerate(tst_data): Y_pred = self.model.predict_on_batch(batch[0]) - for x, y_pred, y_gold, in zip(batch[0][0], Y_pred, batch[1]): - feature = ''.join(self.transform.tokenizer.convert_ids_to_tokens(x.numpy()))#, skip_special_tokens=True)) - feature = feature.replace('[CLS]', '') - feature = feature.replace('[PAD]', '') - feature = feature.replace(' ##', '') # fix sub-word generated by BERT tagger - y_pred_str = self.transform.Y_to_outputs([y_pred]) - y_gold_str = self.transform.Y_to_outputs([y_gold]) - out.write('{}\t{}\t{}\n'.format(feature, y_pred_str, y_gold_str)) + Y_pred_str = self.transform.Y_to_outputs(Y_pred) + prediction += [y for y in Y_pred_str] + for y_pred, y_gold, in zip(Y_pred, batch[1]): total += 1 correct += metric(y_gold, y_pred) score = correct/total - logger.info(f'{idx + 1}/{num_batches} {metric_name}: {score * 100}') + logger.info(f'{idx + 1}/{num_batches} {metric_name}: {score * 100:.2f}%') + with open(input, 'r') as f: + out.write(f.readline().replace('\n', '')+'\tpred\n') + for i, y_pred in enumerate(prediction): + out.write(f.readline().replace('\n', '')+f'\t{y_pred}\n') + return score def build_loss(self, loss, **kwargs): diff --git a/hanlp/transform/table.py b/hanlp/transform/table.py index 36350fbd2..4bf625fb4 100644 --- a/hanlp/transform/table.py +++ b/hanlp/transform/table.py @@ -28,10 +28,7 @@ def file_to_inputs(self, filepath: str, gold=True): y_column = self.config.y_column num_features = self.config.get('num_features', None) for cells in read_cells(filepath, skip_header=self.config.skip_header, delimiter=self.config.delimiter): - #multi-label: Dataset in .tsv format: x_columns: at most 2 columns being a sentence pair while in most - # cases just one column being the doc content. y_column being the single label, which shall be modified - # to load a list of labels. - if type(x_columns) is int: + if type(x_columns) is int: inputs = [cells[x_columns]], cells[y_column] elif type(x_columns) is list: inputs = tuple(c for i, c in enumerate(cells) if i in x_columns), cells[y_column]