/*
 * Copyright (C) 2024 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MEDIAPROVIDER_PDF_JNI_PDFCLIENT_NORMALIZE_H_
#define MEDIAPROVIDER_PDF_JNI_PDFCLIENT_NORMALIZE_H_

#include <string>

// Text-normalization helpers used when searching and selecting text that was
// extracted by pdfClient. Only declarations live here; the definitions are in
// a separate translation unit.
namespace pdfClient {

// Maps `codepoint` to the codepoint that represents the group it belongs to,
// so that searches can be case-insensitive and accent-insensitive.
// For example, 'a' is returned for 'a', 'A', 'ä', 'Ä' and other 'a' variants.
char32_t NormalizeForSearch(char32_t codepoint);

// Normalizes the whole of `*search_str` for case/accent-insensitive searching.
// The function returns nothing, so the result is delivered through the
// pointer.
// NOTE(review): presumably rewrites the string in place and requires a
// non-null pointer - confirm against the implementation.
void NormalizeStringForSearch(std::u32string* search_str);

// Reports whether `codepoint` can be ignored when searching for matches.
// For example, the '\x2' character can be skipped because it is used to
// indicate that a word has been broken over two lines. Spaces can be skipped
// if they are repeated, so that a run of consecutive spaces is equivalent to a
// single space.
// NOTE(review): `prev_codepoint` is presumably the codepoint that immediately
// precedes `codepoint` in the text being searched - confirm against callers.
bool IsSkippableForSearch(char32_t codepoint, char32_t prev_codepoint);

// Reports whether `codepoint` is one that pdfClient uses to indicate the start
// of a new line.
bool IsLineBreak(char32_t codepoint);

// Reports whether `codepoint` separates words for the purpose of long-press
// selection: holding down on some text selects a single word, and these
// characters are considered to delimit it. Not very rigorous.
bool IsWordBreak(char32_t codepoint);

// Appends `codepoint`, which came from pdfClient, to `*output` as UTF-8.
// pdfClient gives certain codepoints special meaning (e.g. '\x2' for a broken
// word), so these codepoints are not appended verbatim.
// NOTE(review): `output` is presumably required to be non-null - confirm
// against the implementation.
void AppendpdfClientCodepointAsUtf8(char32_t codepoint, std::string* output);

}  // namespace pdfClient

#endif  // MEDIAPROVIDER_PDF_JNI_PDFCLIENT_NORMALIZE_H_