Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix evaluation and other minor issues to adapt to multi-label classification #1595

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -285,9 +285,10 @@ fabric.properties
.idea
*.iml
data
.vscode/settings.json
.vscode
*.pkl
*.pdf
_static/
_build/
_templates/
_templates/
4 changes: 3 additions & 1 deletion hanlp/common/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
import inspect
from abc import ABC, abstractmethod
from typing import Any

import numpy as np
import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as mixed_precision
from hanlp_common.configurable import Configurable


Expand Down
8 changes: 4 additions & 4 deletions hanlp/common/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,8 +327,8 @@ def safe_pad_token(self) -> str:
"""
if self.pad_token:
return self.pad_token
if self.first_token:
return self.first_token
# if self.first_token:
# return self.first_token
return PAD

@property
Expand All @@ -345,8 +345,8 @@ def safe_unk_token(self) -> str:
"""
if self.unk_token:
return self.unk_token
if self.first_token:
return self.first_token
# if self.first_token:
# return self.first_token
return UNK

def __repr__(self) -> str:
Expand Down
4 changes: 4 additions & 0 deletions hanlp/layers/transformers/loader_tf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
from hanlp.layers.transformers.tf_imports import zh_albert_models_google, bert_models_google
from hanlp.utils.io_util import get_resource, stdout_redirected, hanlp_home

gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpu_devices:
tf.config.experimental.set_memory_growth(gpu, True)


def build_transformer(transformer, max_seq_length, num_labels, tagging=True, tokenizer_only=False):
spm_model_file = None
Expand Down
9 changes: 4 additions & 5 deletions hanlp/transform/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,9 @@ def file_to_inputs(self, filepath: str, gold=True):
y_column = self.config.y_column
num_features = self.config.get('num_features', None)
for cells in read_cells(filepath, skip_header=self.config.skip_header, delimiter=self.config.delimiter):
# multi-label: The dataset is in .tsv format. x_columns covers at most 2 columns (a sentence pair), while in most
# cases it is just one column holding the doc content. y_column is the single label, which shall be modified
# to load a list of labels.
if x_columns:
if type(x_columns) is int:
inputs = [cells[x_columns]], cells[y_column]
elif type(x_columns) is list:
inputs = tuple(c for i, c in enumerate(cells) if i in x_columns), cells[y_column]
else:
if y_column != -1:
Expand Down Expand Up @@ -60,7 +59,7 @@ def inputs_to_samples(self, inputs, gold=False):
if gold:
yield cells
else:
yield cells, pad
yield cells, [pad] if self.config.multi_label else pad

def y_to_idx(self, y) -> tf.Tensor:
return self.label_vocab.lookup(y)
Expand Down