File size: 1,086 Bytes
fda57dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from unittest import TestCase

from tokenizers.pre_tokenizers import Whitespace

from recognizers.utils import DifferenceSample


class DifferenceSampleTestCase(TestCase):
    """Check ``DifferenceSample.add_whitespace`` on an English/French sentence pair."""

    def setUp(self):
        """Pre-tokenize both sentences and build a sample with one 0.1 label per token."""
        self.tokenizer = Whitespace()
        self.text_a = "Chinese shares close higher Friday."
        self.text_b = "Les actions chinoises clôturent en baisse mercredi."
        self.encoding_a = self.tokenizer.pre_tokenize_str(self.text_a)
        self.encoding_b = self.tokenizer.pre_tokenize_str(self.text_b)

        def surface_forms(encoding):
            # Element 0 of every pre-tokenized entry is used as the token text.
            return tuple(entry[0] for entry in encoding)

        def constant_labels(encoding):
            # Same placeholder label for every pre-tokenized entry.
            return (0.1,) * len(encoding)

        self.result = DifferenceSample(
            tokens_a=surface_forms(self.encoding_a),
            tokens_b=surface_forms(self.encoding_b),
            labels_a=constant_labels(self.encoding_a),
            labels_b=constant_labels(self.encoding_b),
        )

    def test_add_whitespace(self):
        """Tokens joined after ``add_whitespace`` must equal each original sentence."""
        sample = self.result
        sample.add_whitespace(self.encoding_a, self.encoding_b)

        rebuilt_a = "".join(sample.tokens_a)
        self.assertEqual(rebuilt_a, self.text_a)

        rebuilt_b = "".join(sample.tokens_b)
        self.assertEqual(rebuilt_b, self.text_b)