1 /*
2  * Copyright (C) 2024 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MEDIAPROVIDER_PDF_JNI_PDFCLIENT_NORMALIZE_H_
18 #define MEDIAPROVIDER_PDF_JNI_PDFCLIENT_NORMALIZE_H_
19 
20 #include <string>
21 
22 namespace pdfClient {
23 
24 // Maps |codepoint| to the codepoint that is representative of the group it
25 // belongs to, so that searching can be case-insensitive and accent-insensitive.
26 // For example, 'a', 'A', 'ä', 'Ä' and other 'a' variants all yield 'a'.
27 char32_t NormalizeForSearch(char32_t codepoint);
28 
29 // Normalizes the whole of |search_str| in place for case/accent-insensitive searching.
30 void NormalizeStringForSearch(std::u32string* search_str);
31 
32 // Returns whether |codepoint| can be ignored when searching for matches.
33 // For example, the '\x2' character can be skipped because it is used to
34 // indicate that a word has been broken over two lines. Spaces can be skipped if
35 // they are repeated (presumably judged via |prev_codepoint|), so "  " is equivalent to " ".
36 bool IsSkippableForSearch(char32_t codepoint, char32_t prev_codepoint);
37 
38 // Returns whether |codepoint| is used by pdfClient to indicate the start of a new line.
39 bool IsLineBreak(char32_t codepoint);
40 
41 // Returns whether |codepoint| is considered to separate words when holding down
42 // on some text to select a single word. This classification is not very rigorous.
43 bool IsWordBreak(char32_t codepoint);
44 
45 // Appends |codepoint|, which came from pdfClient, to |output| as UTF-8.
46 // pdfClient gives certain codepoints special meaning, e.g. '\x2' for a broken
47 // word, so these codepoints are not appended verbatim.
48 void AppendpdfClientCodepointAsUtf8(char32_t codepoint, std::string* output);
49 
50 }  // namespace pdfClient
51 
52 #endif  // MEDIAPROVIDER_PDF_JNI_PDFCLIENT_NORMALIZE_H_