1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include <vector>
16 
17 #include "tensorflow/core/framework/fake_input.h"
18 #include "tensorflow/core/framework/node_def_builder.h"
19 #include "tensorflow/core/framework/shape_inference.h"
20 #include "tensorflow/core/framework/shape_inference_testutil.h"
21 #include "tensorflow/core/framework/tensor.h"
22 #include "tensorflow/core/framework/tensor_shape.h"
23 #include "tensorflow/core/framework/tensor_testutil.h"
24 #include "tensorflow/core/framework/types.pb.h"
25 #include "tensorflow/core/kernels/ops_testutil.h"
26 #include "tensorflow/core/lib/core/status.h"
27 #include "tensorflow/core/lib/core/status_test_util.h"
28 
29 namespace tensorflow {
30 namespace text {
31 
32 using tensorflow::FakeInput;
33 using tensorflow::NodeDefBuilder;
34 using tensorflow::Status;
35 using tensorflow::TensorShape;
36 
37 class NgramKernelTest : public tensorflow::OpsTestBase {
38  public:
MakeOp(string separator,std::vector<int> ngram_width,string left_pad,string right_pad,int pad_width,bool preserve)39   void MakeOp(string separator, std::vector<int> ngram_width, string left_pad,
40               string right_pad, int pad_width, bool preserve) {
41     TF_ASSERT_OK(NodeDefBuilder("tested_op", "StringNGrams")
42                      .Attr("separator", separator)
43                      .Attr("ngram_widths", ngram_width)
44                      .Attr("left_pad", left_pad)
45                      .Attr("right_pad", right_pad)
46                      .Attr("pad_width", pad_width)
47                      .Attr("preserve_short_sequences", preserve)
48                      .Input(FakeInput())
49                      .Input(FakeInput())
50                      .Finalize(node_def()));
51     TF_ASSERT_OK(InitOp());
52   }
53 
assert_string_equal(const std::vector<tstring> & expected,const Tensor & value)54   void assert_string_equal(const std::vector<tstring> &expected,
55                            const Tensor &value) {
56     Tensor expected_tensor(allocator(), DT_STRING,
57                            TensorShape({static_cast<int64>(expected.size())}));
58     test::FillValues<tstring>(&expected_tensor, expected);
59     test::ExpectTensorEqual<tstring>(expected_tensor, value);
60   }
assert_int64_equal(const std::vector<int64> & expected,const Tensor & value)61   void assert_int64_equal(const std::vector<int64> &expected,
62                           const Tensor &value) {
63     Tensor expected_tensor(allocator(), DT_INT64,
64                            TensorShape({static_cast<int64>(expected.size())}));
65     test::FillValues<int64>(&expected_tensor, expected);
66     test::ExpectTensorEqual<int64>(expected_tensor, value);
67   }
68 };
69 
TEST_F(NgramKernelTest, TestPaddedTrigrams) {
  // Trigrams with pad_width = -1: the expectations below carry two pad tokens
  // on each side of every row.
  MakeOp("|", {3}, "LP", "RP", -1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|LP|a", "LP|a|b", "a|b|c", "b|c|d", "c|d|RP", "d|RP|RP",
      // row 1
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP"};
  const std::vector<int64> want_splits = {0, 6, 10};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
87 
TEST_F(NgramKernelTest, TestPaddedBigramsAndTrigrams) {
  // Two widths at once: per row, all padded bigrams are expected first,
  // followed by all padded trigrams.
  MakeOp("|", {2, 3}, "LP", "RP", -1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then trigrams
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      "LP|LP|a", "LP|a|b", "a|b|c", "b|c|d", "c|d|RP", "d|RP|RP",
      // row 1: bigrams, then trigrams
      "LP|e", "e|f", "f|RP",
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP"};
  const std::vector<int64> want_splits = {0, 11, 18};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
106 
TEST_F(NgramKernelTest, TestPaddedBigrams) {
  // Bigrams with pad_width = -1: one pad token is expected on each side.
  MakeOp("|", {2}, "LP", "RP", -1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1
      "LP|e", "e|f", "f|RP"};
  const std::vector<int64> want_splits = {0, 5, 8};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
124 
TEST_F(NgramKernelTest, TestPaddingIsAtMostNGramSizeMinus1) {
  // pad_width (4) exceeds the bigram width; the expectations still contain
  // only a single pad token per side, i.e. the same output as padded bigrams.
  MakeOp("|", {2}, "LP", "RP", 4, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1
      "LP|e", "e|f", "f|RP"};
  const std::vector<int64> want_splits = {0, 5, 8};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
142 
TEST_F(NgramKernelTest, TestPaddedUnigramAndBigrams) {
  // Widths 1 and 2 together: the expected unigrams carry no padding, while
  // the bigrams are padded by one token per side.
  MakeOp("|", {1, 2}, "LP", "RP", -1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0: unigrams, then bigrams
      "a", "b", "c", "d", "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1: unigrams, then bigrams
      "e", "f", "LP|e", "e|f", "f|RP"};
  const std::vector<int64> want_splits = {0, 9, 14};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
160 
TEST_F(NgramKernelTest, TestOverlappingPaddedNGrams) {
  // Checks that an n-gram which needs both left and right padding inside a
  // single output token (e.g. "LP|a|RP") is produced correctly.
  MakeOp("|", {3}, "LP", "RP", -1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a
  //   row 1: b c d
  //   row 2: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|LP|a", "LP|a|RP", "a|RP|RP",
      // row 1
      "LP|LP|b", "LP|b|c", "b|c|d", "c|d|RP", "d|RP|RP",
      // row 2
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP"};
  const std::vector<int64> want_splits = {0, 3, 8, 12};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
182 
TEST_F(NgramKernelTest, TestOverlappingPaddedMultiCharNGrams) {
  // Same layout as the overlapping padded trigram case, but every token is
  // two characters long.
  MakeOp("|", {3}, "LP", "RP", -1, false);

  // Ragged input (flat values + row splits):
  //   row 0: aa
  //   row 1: bb cc dd
  //   row 2: ee ff
  AddInputFromArray<tstring>(TensorShape({6}),
                             {"aa", "bb", "cc", "dd", "ee", "ff"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|LP|aa", "LP|aa|RP", "aa|RP|RP",
      // row 1
      "LP|LP|bb", "LP|bb|cc", "bb|cc|dd", "cc|dd|RP", "dd|RP|RP",
      // row 2
      "LP|LP|ee", "LP|ee|ff", "ee|ff|RP", "ff|RP|RP"};
  const std::vector<int64> want_splits = {0, 3, 8, 12};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
203 
TEST_F(NgramKernelTest, TestMultiOverlappingPaddedNGrams) {
  // Checks n-grams that need more than one pad token on each side at the
  // same time (5-grams over a single-token row).
  MakeOp("|", {5}, "LP", "RP", -1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a
  AddInputFromArray<tstring>(TensorShape({1}), {"a"});
  AddInputFromArray<int64>(TensorShape({2}), {0, 1});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      "LP|LP|LP|LP|a",  //
      "LP|LP|LP|a|RP",  //
      "LP|LP|a|RP|RP",  //
      "LP|a|RP|RP|RP",  //
      "a|RP|RP|RP|RP"};
  const std::vector<int64> want_splits = {0, 5};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
222 
TEST_F(NgramKernelTest, TestUnpaddedTrigrams) {
  // No padding and no short-sequence preservation: the two-token row is
  // expected to contribute nothing.
  MakeOp("|", {3}, "", "", 0, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {"a|b|c", "b|c|d"};
  const std::vector<int64> want_splits = {0, 2, 2};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
238 
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithEmptySequence) {
  // Unpadded trigrams where the row splits describe an empty row between the
  // two non-empty ones.
  MakeOp("|", {3}, "", "", 0, false);

  // Ragged input (flat values + row splits {0, 4, 4, 6}):
  //   row 0: a b c d
  //   row 1: (empty)
  //   row 2: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 4, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Only row 0 is long enough for a trigram.
  const std::vector<tstring> want_ngrams = {"a|b|c", "b|c|d"};
  const std::vector<int64> want_splits = {0, 2, 2, 2};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
254 
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithPreserveShort) {
  // With preserve_short_sequences enabled, the two-token row is expected to
  // yield a single n-gram made of all its tokens ("e|f").
  MakeOp("|", {3}, "", "", 0, true);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {"a|b|c", "b|c|d", "e|f"};
  const std::vector<int64> want_splits = {0, 2, 3};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
270 
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithPreserveShortAndEmptySequence) {
  // preserve_short_sequences with an empty row in the batch: the empty row is
  // expected to produce no output, the short row a single "e|f".
  MakeOp("|", {3}, "", "", 0, true);

  // Ragged input (flat values + row splits {0, 4, 4, 6}):
  //   row 0: a b c d
  //   row 1: (empty)
  //   row 2: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 4, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {"a|b|c", "b|c|d", "e|f"};
  const std::vector<int64> want_splits = {0, 2, 2, 3};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
286 
TEST_F(NgramKernelTest, TestUnpaddedTrigramsAndQuadgramsWithPreserveShort) {
  // Widths are given as {4, 3}; the expected output lists the 4-gram before
  // the trigrams, and the short row is preserved as "e|f".
  MakeOp("|", {4, 3}, "", "", 0, true);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "a|b|c|d", "a|b|c", "b|c|d",
      // row 1
      "e|f"};
  const std::vector<int64> want_splits = {0, 3, 4};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
302 
TEST_F(NgramKernelTest, TestUnpaddedBigramsAndTrigrams) {
  // Unpadded bigrams and trigrams together; the two-token row only has a
  // bigram to offer.
  MakeOp("|", {2, 3}, "", "", 0, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then trigrams
      "a|b", "b|c", "c|d", "a|b|c", "b|c|d",
      // row 1
      "e|f"};
  const std::vector<int64> want_splits = {0, 5, 6};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
319 
TEST_F(NgramKernelTest, TestUnpaddedBigramsAndTrigramsWithPreserveShort) {
  MakeOp("|", {2, 3}, "", "", 0, true);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Row 1 already produced the regular bigram 'e|f', so no additional
  // preserve_short n-gram is expected for it.
  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then trigrams
      "a|b", "b|c", "c|d", "a|b|c", "b|c|d",
      // row 1
      "e|f"};
  const std::vector<int64> want_splits = {0, 5, 6};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
338 
TEST_F(NgramKernelTest, TestUnpaddedTrigramsAndBigramsWithPreserveShort) {
  // Same as the {2, 3} preserve-short case, but with the widths listed as
  // {3, 2}; the expected output lists trigrams before bigrams.
  MakeOp("|", {3, 2}, "", "", 0, true);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Row 1 already produced the regular bigram 'e|f', so no additional
  // preserve_short n-gram is expected for it.
  const std::vector<tstring> want_ngrams = {
      // row 0: trigrams, then bigrams
      "a|b|c", "b|c|d", "a|b", "b|c", "c|d",
      // row 1
      "e|f"};
  const std::vector<int64> want_splits = {0, 5, 6};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
357 
TEST_F(NgramKernelTest, TestUnpaddedBigrams) {
  // Plain bigrams: no padding, no short-sequence preservation.
  MakeOp("|", {2}, "", "", 0, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "a|b", "b|c", "c|d",
      // row 1
      "e|f"};
  const std::vector<int64> want_splits = {0, 3, 4};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
373 
TEST_F(NgramKernelTest, TestOverlappingUnpaddedNGrams) {
  // Unpadded trigrams over three rows; only the three-token row is expected
  // to produce an n-gram.
  MakeOp("|", {3}, "", "", 0, false);

  // Ragged input (flat values + row splits):
  //   row 0: a
  //   row 1: b c d
  //   row 2: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {"b|c|d"};
  const std::vector<int64> want_splits = {0, 0, 1, 1};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
390 
TEST_F(NgramKernelTest, TestOverlappingUnpaddedNGramsNoOutput) {
  // Unpadded 5-grams over rows that are all shorter than five tokens: no
  // n-grams are expected, and every output split is zero.
  MakeOp("|", {5}, "", "", 0, false);

  // Ragged input (flat values + row splits):
  //   row 0: a
  //   row 1: b c d
  //   row 2: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams;  // intentionally empty
  const std::vector<int64> want_splits = {0, 0, 0, 0};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
407 
TEST_F(NgramKernelTest, TestSinglyPaddedTrigrams) {
  // Trigrams with pad_width = 1: exactly one pad token per side is expected.
  MakeOp("|", {3}, "LP", "RP", 1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a|b", "a|b|c", "b|c|d", "c|d|RP",
      // row 1
      "LP|e|f", "e|f|RP"};
  const std::vector<int64> want_splits = {0, 4, 6};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
425 
TEST_F(NgramKernelTest, TestSinglyPaddedBigrams) {
  // Bigrams with pad_width = 1.
  MakeOp("|", {2}, "LP", "RP", 1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1
      "LP|e", "e|f", "f|RP"};
  const std::vector<int64> want_splits = {0, 5, 8};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
442 
TEST_F(NgramKernelTest, TestSinglyPaddedBigramsAnd5grams) {
  // Bigrams and 5-grams with pad_width = 1. The two-token row is expected to
  // yield bigrams only; no 5-gram appears for it.
  MakeOp("|", {2, 5}, "LP", "RP", 1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then 5-grams
      "LP|a", "a|b", "b|c", "c|d", "d|RP", "LP|a|b|c|d", "a|b|c|d|RP",
      // row 1: bigrams only
      "LP|e", "e|f", "f|RP"};
  const std::vector<int64> want_splits = {0, 7, 10};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
460 
TEST_F(NgramKernelTest, TestSinglyPadded5gramsWithPreserveShort) {
  // 5-grams with pad_width = 1 and preserve_short_sequences enabled: the
  // two-token row is expected to be kept as one padded n-gram, "LP|e|f|RP".
  MakeOp("|", {5}, "LP", "RP", 1, true);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a|b|c|d", "a|b|c|d|RP",
      // row 1
      "LP|e|f|RP"};
  const std::vector<int64> want_splits = {0, 2, 3};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
478 
TEST_F(NgramKernelTest, TestOverlappingSinglyPaddedNGrams) {
  // Trigrams with pad_width = 1 over three rows; the single-token row is
  // expected to produce one n-gram padded on both sides.
  MakeOp("|", {3}, "LP", "RP", 1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a
  //   row 1: b c d
  //   row 2: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a|RP",
      // row 1
      "LP|b|c", "b|c|d", "c|d|RP",
      // row 2
      "LP|e|f", "e|f|RP"};
  const std::vector<int64> want_splits = {0, 1, 4, 6};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
498 
TEST_F(NgramKernelTest, TestOverlappingSinglyPaddedNGramsNoOutput) {
  // 5-grams with pad_width = 1. Despite the test name, one n-gram is
  // expected: the three-token row plus one pad per side reaches width five.
  // Rows 0 and 2 are expected to produce nothing.
  MakeOp("|", {5}, "LP", "RP", 1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a
  //   row 1: b c d
  //   row 2: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {"LP|b|c|d|RP"};
  const std::vector<int64> want_splits = {0, 0, 1, 1};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
515 
TEST_F(NgramKernelTest, TestSinglyPaddedUnigrams) {
  // Unigrams with pad_width = 1: the expected output is the input unchanged,
  // with no pad tokens and identical row splits.
  MakeOp("|", {1}, "LP", "RP", 1, false);

  // Ragged input (flat values + row splits):
  //   row 0: a b c d
  //   row 1: e f
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "a", "b", "c", "d",
      // row 1
      "e", "f"};
  const std::vector<int64> want_splits = {0, 4, 6};

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
531 
TEST_F(NgramKernelTest, TestEmptyInput) {
  // Both the values and the row splits are zero-length tensors; the op is
  // expected to succeed and emit two zero-length outputs.
  MakeOp("|", {1}, "LP", "RP", 3, false);
  AddInputFromArray<tstring>(TensorShape({0}), {});
  AddInputFromArray<int64>(TensorShape({0}), {});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams;  // intentionally empty
  const std::vector<int64> want_splits;    // intentionally empty

  const Tensor& got_ngrams = *GetOutput(0);
  assert_string_equal(want_ngrams, got_ngrams);
  const Tensor& got_splits = *GetOutput(1);
  assert_int64_equal(want_splits, got_splits);
}
544 
TEST_F(NgramKernelTest, ShapeFn) {
  // Exercises the shape function registered for "StringNGrams"; the strings
  // are "input shapes" -> "expected output shapes", separated by ';'.
  ShapeInferenceTestOp ngram_op("StringNGrams");

  // Unknown or partially known inputs: both outputs are vectors of unknown
  // length.
  INFER_OK(ngram_op, "?;?", "[?];[?]");
  INFER_OK(ngram_op, "[1];?", "[?];[?]");

  // With a known splits shape, the second output is expected to be the
  // second input's shape ("in1").
  INFER_OK(ngram_op, "[1];[2]", "[?];in1");

  // A scalar in either input position is rejected.
  INFER_ERROR("Shape must be rank 1 but is rank 0", ngram_op, "[];?");
  INFER_ERROR("Shape must be rank 1 but is rank 0", ngram_op, "?;[]");
}
553 
554 }  // namespace text
555 }  // namespace tensorflow
556