tokenized = list(jieba.tokenize(text))  # materialize the generator so it can be iterated twice
print('******')
print(f"{[t for t in tokenized]}")
print('******')
tokens = [Token(word, start) for (word, start, end) in tokenized]
return self._apply_token_pattern(tokens)
...
NLU model loaded. Type a message and press enter to parse it.
Next message:
我想找地方吃饭
******
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/cz/kq5sssg12jx887hj62hwczrr0000gn/T/jieba.cache
Loading model cost 0.729 seconds.
Prefix dict has been built successfully.
[('我', 0, 1), ('想', 1, 2), ('找', 2, 3), ('地方', 3, 5), ('吃饭', 5, 7)]
******
{
  "text": "我想找地方吃饭",
  "intent": {
    "name": "eat_search",
    "confidence": 1.0
  },
  "entities": []
}
Next message:
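For reference, the segmentation can be reproduced outside Rasa as well. A minimal sketch (it only assumes jieba is installed) that prints the same (word, start, end) tuples seen in the log above:

import jieba

# jieba.tokenize() yields (word, start_offset, end_offset) tuples over the raw string
for word, start, end in jieba.tokenize("我想找地方吃饭"):
    print(word, start, end)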
Building a TF-Text Tokenizer
Note: Rasa currently only supports TensorFlow 2.3, while the latest TensorFlow-Text requires TensorFlow 2.4. For compatibility, we download the Rasa source code and modify the version pins of TensorFlow and its related plugins, so that we can use TensorFlow-Text's Chinese word segmentation.
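Once the rebuilt Rasa is installed, a quick sanity check helps confirm the versions line up. This is only a sketch; it assumes both packages import cleanly and that tensorflow_text exposes a __version__ attribute like other TF add-ons:

import tensorflow as tf
import tensorflow_text as tftext

# The latest tensorflow_text builds expect a matching TensorFlow 2.4.x
print(tf.__version__)
print(tftext.__version__)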
import glob
import logging
import os
import shutil
import typing
from typing import Any, Dict, List, Optional, Text
from rasa.nlu.components import Component
from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.shared.nlu.training_data.message import Message
logger = logging.getLogger(__name__)
if typing.TYPE_CHECKING:
    from rasa.nlu.model import Metadata
class TensorFlowTextTokenizer(Tokenizer):
    """This tokenizer is a wrapper for tensorflow_text
    (https://www.tensorflow.org/tutorials/tensorflow_text/intro)."""
    supported_language_list = ["zh"]
    defaults = {
        # URL of the segmentation model on TF Hub
        "model_handle": "https://hub.tensorflow.google.cn/google/zh_segmentation/1",
        # Flag to check whether to split intents
        "intent_tokenization_flag": False,
        # Symbol on which intent should be split
        "intent_split_symbol": "_",
        # Regular expression to detect tokens
        "token_pattern": None,
    }
    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        """Construct a new tokenizer using the TensorFlow Text framework."""
        super().__init__(component_config)
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import tensorflow_text as tftext
        import tensorflow as tf

        # Set the model URL and build the TF Hub based segmenter
        self.model_handle = self.component_config.get("model_handle")
        segmenter = tftext.HubModuleTokenizer(self.model_handle)

        text = message.get(attribute)
        print(text)
        tokens, starts, ends = segmenter.tokenize_with_offsets(text)
        tokens_list = tokens.numpy()
        starts_list = starts.numpy()
        print('******')
        print(f"{[t.decode('utf-8') for t in tokens_list]}")
        print(f"{[t for t in starts_list]}")
        print('******')
        # tokens come back as bytes, so decode them before building Token objects
        tokens_data = [
            Token(tokens_list[i].decode('utf-8'), int(starts_list[i]))
            for i in range(len(tokens_list))
        ]
        return self._apply_token_pattern(tokens_data)
As a first pass, we mirror the jieba tokenizer code and print the log directly so we can inspect the segmentation results.
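The segmenter can also be exercised on its own before wiring it into Rasa. A minimal sketch, assuming TensorFlow 2.4 with tensorflow_text is installed and the TF Hub model handle is reachable:

import tensorflow_text as tftext

# BERT-based Chinese segmentation model hosted on TF Hub
MODEL_HANDLE = "https://hub.tensorflow.google.cn/google/zh_segmentation/1"

segmenter = tftext.HubModuleTokenizer(MODEL_HANDLE)
tokens, starts, ends = segmenter.tokenize_with_offsets("我想找地方吃饭")
print([t.decode("utf-8") for t in tokens.numpy()])  # tokens are returned as bytes
print(starts.numpy(), ends.numpy())                 # start/end offsets into the input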
Register the component we wrote in registry.py:
from rasa.nlu.tokenizers.tensorflow_text_tokenizer import TensorFlowTextTokenizer
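The import alone does not register anything; the class also has to be appended to the component list defined in the same file. In Rasa 2.x that list is named component_classes (surrounding entries are abbreviated here and differ by release):

from rasa.nlu.tokenizers.tensorflow_text_tokenizer import TensorFlowTextTokenizer

component_classes = [
    # ... existing tokenizers, featurizers, classifiers ...
    TensorFlowTextTokenizer,
]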
print("Before optimization, ", terminator: "") print("x: \(x) and f(x): \(f(x))")
// Optimization loop
for _ in 1...maxIterations {
    /// Derivative of `f` w.r.t. `x`.
    let 𝛁xF = gradient(at: x) { x -> Float in
        return f(x)
    }
    // Optimization step: update `x` to maximize `f`
    x += η * 𝛁xF
}
for _ in 1...maxIterations {
    let 𝛁xF = gradient(at: x) { x -> Float in
        return f(x)
    }
    // Optimization step: update `x` to minimize `f`
    x.move(along: 𝛁xF.scaled(by: -η))
}
print("After gradient descent, ", terminator: "")
print("input: \(x) and output: \(f(x))")
Explanation
The idea is easy to understand. Around a local maximum, the derivative is positive to the left of the peak and negative to the right, so the gradient-ascent update x += η * 𝛁xF always pushes x toward the maximum.
Likewise, around a local minimum the signs flip: the derivative is negative to the left and positive to the right, so stepping against the gradient with x.move(along: 𝛁xF.scaled(by: -η)) moves x toward the minimum.
In this article, the main function we use is gradient(at:in:):
@inlinable
public func gradient<T, R>(
    at x: T,
    in f: @differentiable (T) -> Tensor<R>
) -> T.TangentVector where T: Differentiable, R: TensorFlowFloatingPoint
# Ratings data.
ratings = tfds.load('movielens/100k-ratings', split="train",
                    data_dir=os.path.join(os.getcwd(), "data"))

# Features of all the available movies.
movies = tfds.load('movielens/100k-movies', split="train",
                   data_dir=os.path.join(os.getcwd(), "data"))
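To check what was loaded, print a single example from each dataset; the feature keys come from the public movielens/100k schema in TFDS:

# Inspect one record from each dataset as a plain Python dict
for example in ratings.take(1):
    print({key: value.numpy() for key, value in example.items()})
for example in movies.take(1):
    print({key: value.numpy() for key, value in example.items()})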
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
import tensorflow_datasets as tfds
import os
import ssl
Returns a tuple of NumPy arrays: (x_train, y_train), (x_test, y_test).

x_train, x_test: uint8 arrays of RGB image data with shape (num_samples, 3, 32, 32) if tf.keras.backend.image_data_format() is 'channels_first', or (num_samples, 32, 32, 3) if the data format is 'channels_last'.

y_train, y_test: uint8 arrays of category labels (integers in range 0-9), each with shape (num_samples, 1).
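A quick load confirms those shapes; this uses only the standard tf.keras datasets API:

import tensorflow as tf

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
print(x_train.shape, y_train.shape)  # (50000, 32, 32, 3) (50000, 1)
print(x_test.shape, y_test.shape)    # (10000, 32, 32, 3) (10000, 1)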
class StepsInteractor {
    let healthStore = HKHealthStore()
    let stepCountType = HKObjectType.quantityType(forIdentifier: HKQuantityTypeIdentifier.stepCount)!

    // Access Step Count
    let healthKitTypes: Set = [
        HKObjectType.quantityType(forIdentifier: HKQuantityTypeIdentifier.stepCount)!
    ]

    func retrieveStepsWithAuth(completion: @escaping (Double) -> Void) {
        // Check for Authorization
        if (healthStore.authorizationStatus(for: stepCountType) != HKAuthorizationStatus.sharingAuthorized) {
            healthStore.requestAuthorization(toShare: healthKitTypes, read: healthKitTypes) { (success, error) in
                if (success) {
                    // Authorization Successful
                    self.getSteps { (result) in
                        completion(result)
                    }
                } else {
                    completion(-1)
                }
            }
        } else {
            self.getSteps { (result) in
                completion(result)
            }
        }
    }

    func getSteps(completion: @escaping (Double) -> Void) {
        let stepsQuantityType = HKQuantityType.quantityType(forIdentifier: .stepCount)!
        let now = Date()
        // The original `now - 2.days` relies on a Date helper extension;
        // Calendar is used here so the code compiles without it
        let startDate = Calendar.current.date(byAdding: .day, value: -2, to: now)!

        var interval = DateComponents()
        interval.day = 1

        let query = HKStatisticsCollectionQuery(
            quantityType: stepsQuantityType,
            quantitySamplePredicate: nil,
            options: [.cumulativeSum],
            anchorDate: startDate,
            intervalComponents: interval)

        query.initialResultsHandler = { _, result, error in
            var resultCount = 0.0
            result!.enumerateStatistics(from: startDate, to: now) { statistics, _ in
                if let sum = statistics.sumQuantity() {
                    // Get steps (they are of double type)
                    resultCount = sum.doubleValue(for: HKUnit.count())
                }
                // Return
                completion(resultCount)
            }
        }

        query.statisticsUpdateHandler = { query, statistics, statisticsCollection, error in
            // If new statistics are available
            if let sum = statistics?.sumQuantity() {
                let resultCount = sum.doubleValue(for: HKUnit.count())
                // Return
                completion(resultCount)
            }
        }

        healthStore.execute(query)
    }
}