# Convert the generator returned by jieba.tokenize() to a list first,
# otherwise the debug print below would exhaust it before the Token objects are built.
tokenized = list(jieba.tokenize(text))
print('******')
print(f"{[t for t in tokenized]}")
print('******')
tokens = [Token(word, start) for (word, start, end) in tokenized]
return self._apply_token_pattern(tokens)
...
NLU model loaded. Type a message and press enter to parse it.
Next message:
我想找地方吃饭
******
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/cz/kq5sssg12jx887hj62hwczrr0000gn/T/jieba.cache
Loading model cost 0.729 seconds.
Prefix dict has been built successfully.
[('我', 0, 1), ('想', 1, 2), ('找', 2, 3), ('地方', 3, 5), ('吃饭', 5, 7)]
******
{
  "text": "我想找地方吃饭",
  "intent": {
    "name": "eat_search",
    "confidence": 1.0
  },
  "entities": []
}
Next message:
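For reference, the same segmentation can be reproduced with jieba alone: jieba.tokenize yields (word, start, end) tuples with character offsets, which is exactly what the Token(word, start) construction above consumes.

import jieba

# jieba.tokenize yields (word, start, end) tuples over the input string
for word, start, end in jieba.tokenize("我想找地方吃饭"):
    print(word, start, end)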
Building the TF-Text Tokenizer
Note: Rasa currently only supports TensorFlow 2.3, while the latest TensorFlow-Text requires TensorFlow 2.4. For compatibility, we therefore download the Rasa source code and change the version pins of TensorFlow and its related add-on packages in that source, so that we can use TensorFlow-Text's Chinese word segmentation.
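Before modifying any code, it is worth confirming that the two packages in your environment actually line up (tensorflow_text generally requires the matching minor version of TensorFlow); a quick check:

import tensorflow as tf
import tensorflow_text as tftext

# tensorflow_text X.Y generally requires tensorflow X.Y
print("tensorflow:", tf.__version__)          # expect 2.4.x after the version bump
print("tensorflow_text:", tftext.__version__)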
import glob
import logging
import os
import shutil
import typing
from typing import Any, Dict, List, Optional, Text
from rasa.nlu.components import Component
from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.shared.nlu.training_data.message import Message
logger = logging.getLogger(__name__)
if typing.TYPE_CHECKING:
    from rasa.nlu.model import Metadata
class TensorFlowTextTokenizer(Tokenizer):
    """This tokenizer is a wrapper for tensorflow_text
    (https://www.tensorflow.org/tutorials/tensorflow_text/intro)."""
    supported_language_list = ["zh"]
    defaults = {
        # TF Hub handle of the Chinese segmentation model
        "model_handle": "https://hub.tensorflow.google.cn/google/zh_segmentation/1",
        # Flag to check whether to split intents
        "intent_tokenization_flag": False,
        # Symbol on which intent should be split
        "intent_split_symbol": "_",
        # Regular expression to detect tokens
        "token_pattern": None,
    }
    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        """Construct a new tokenizer backed by TensorFlow Text."""
        super().__init__(component_config)
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import tensorflow_text as tftext
        import tensorflow as tf
        # Set the handle (URL) of the segmentation model
        self.model_handle = self.component_config.get("model_handle")
        segmenter = tftext.HubModuleTokenizer(self.model_handle)
        text = message.get(attribute)
        print(text)
        tokens, starts, ends = segmenter.tokenize_with_offsets(text)
        tokens_list = tokens.numpy()
        starts_list = starts.numpy()
        print('******')
        print(f"{[t.decode('utf-8') for t in tokens_list]}")
        print(f"{[t for t in starts_list]}")
        print('******')
        # Decode the byte tokens back to str before building the Token objects
        tokensData = [
            Token(tokens_list[i].decode("utf-8"), starts_list[i])
            for i in range(len(tokens_list))
        ]
        return self._apply_token_pattern(tokensData)
As a first pass, we simply mimic the jieba tokenizer code and print the log directly to see how the segmentation performs.
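To sanity-check the segmentation model on its own, outside of Rasa, the hub module can also be exercised directly; a minimal sketch, assuming tensorflow_text 2.4+ and network access to the hub mirror used above:

import tensorflow_text as tftext

# Load the Chinese segmentation model from TF Hub (same handle as in `defaults`)
segmenter = tftext.HubModuleTokenizer(
    "https://hub.tensorflow.google.cn/google/zh_segmentation/1"
)

# tokenize_with_offsets returns the tokens together with their start/end offsets
tokens, starts, ends = segmenter.tokenize_with_offsets("我想找地方吃饭")
print([t.decode("utf-8") for t in tokens.numpy()])
print(starts.numpy(), ends.numpy())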
Register the component we just wrote in registry.py:
from rasa.nlu.tokenizers.tensorflow_text_tokenizer import TensorFlowTextTokenizer
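The import alone is not enough: the class also needs to be added to the component_classes list further down in registry.py so that it can be referenced by name from config.yml. A shortened sketch (keep the existing entries; only the last line is new):

component_classes = [
    # ... existing utils, tokenizers, featurizers, and classifiers ...
    TensorFlowTextTokenizer,
]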