Skip to content

Commit

Permalink
Improve the merge rule for NER dict_whitelist
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Apr 29, 2021
1 parent 7e36bc4 commit 4eaf7ee
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 3 deletions.
3 changes: 1 addition & 2 deletions hanlp/components/ner/transformer_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,7 @@ def tag_to_span(self, batch_tags, batch):
for tags, tokens in zip(batch_tags, sents):
if dict_whitelist:
for start, end, label in dict_whitelist.tokenize(tokens):
if (tags[start].startswith('B') or tags[start].startswith('S')) and (
tags[end - 1].startswith('E') or tags[end - 1].startswith('S')):
if (not tags[start][0] in 'ME') and (not tags[end - 1][0] in 'BM'):
if end - start == 1:
tags[start] = 'S-' + label
else:
Expand Down
2 changes: 1 addition & 1 deletion hanlp/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
# Author: hankcs
# Date: 2019-12-28 19:26

__version__ = '2.1.0-alpha.36'
__version__ = '2.1.0-alpha.38'
"""HanLP version"""
12 changes: 12 additions & 0 deletions plugins/hanlp_demo/hanlp_demo/zh/demo_ner_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-04-29 11:06
# Demo: attach a dictionary whitelist to the 'ner/msra' task and run NER on one sentence.
import hanlp

# Load the pretrained multi-task model identified by this constant.
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)
# Whitelist entry mapping the phrase '午饭后' to the label 'TIME' for the 'ner/msra' task.
# NOTE(review): presumably whitelist matches are merged into the model's predicted tags — confirm
# against the task's tag_to_span implementation.
HanLP['ner/msra'].dict_whitelist = {'午饭后': 'TIME'}
# Call the model on a sample sentence containing the whitelisted phrase, passing tasks='ner/msra'.
doc = HanLP('2021年测试高血压是138,时间是午饭后2点45,低血压是44', tasks='ner/msra')
# Show the result twice: pretty-printed, then the raw 'ner/msra' entry of the returned document.
doc.pretty_print()
print(doc['ner/msra'])

# See https://hanlp.hankcs.com/docs/api/hanlp/components/mtl/tasks/ner/tag_ner.html

0 comments on commit 4eaf7ee

Please sign in to comment.