// Copyright 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "base/strings/string_util.h" #include #include #include "base/macros.h" #include "base/strings/utf_string_conversion_utils.h" #include "base/third_party/icu/icu_utf.h" namespace base { namespace { typedef uintptr_t MachineWord; const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1; inline bool IsAlignedToMachineWord(const void* pointer) { return !(reinterpret_cast(pointer) & kMachineWordAlignmentMask); } template inline T* AlignToMachineWord(T* pointer) { return reinterpret_cast(reinterpret_cast(pointer) & ~kMachineWordAlignmentMask); } template struct NonASCIIMask; template<> struct NonASCIIMask<4, char> { static inline uint32_t value() { return 0x80808080U; } }; template<> struct NonASCIIMask<8, char> { static inline uint64_t value() { return 0x8080808080808080ULL; } }; } // namespace namespace { template StringType ToLowerASCIIImpl(BasicStringPiece str) { StringType ret; ret.reserve(str.size()); for (size_t i = 0; i < str.size(); i++) ret.push_back(ToLowerASCII(str[i])); return ret; } template StringType ToUpperASCIIImpl(BasicStringPiece str) { StringType ret; ret.reserve(str.size()); for (size_t i = 0; i < str.size(); i++) ret.push_back(ToUpperASCII(str[i])); return ret; } } // namespace std::string ToLowerASCII(StringPiece str) { return ToLowerASCIIImpl(str); } std::string ToUpperASCII(StringPiece str) { return ToUpperASCIIImpl(str); } template int CompareCaseInsensitiveASCIIT(BasicStringPiece a, BasicStringPiece b) { // Find the first characters that aren't equal and compare them. If the end // of one of the strings is found before a nonequal character, the lengths // of the strings are compared. size_t i = 0; while (i < a.length() && i < b.length()) { typename StringType::value_type lower_a = ToLowerASCII(a[i]); typename StringType::value_type lower_b = ToLowerASCII(b[i]); if (lower_a < lower_b) return -1; if (lower_a > lower_b) return 1; i++; } // End of one string hit before finding a different character. Expect the // common case to be "strings equal" at this point so check that first. if (a.length() == b.length()) return 0; if (a.length() < b.length()) return -1; return 1; } int CompareCaseInsensitiveASCII(StringPiece a, StringPiece b) { return CompareCaseInsensitiveASCIIT(a, b); } bool EqualsCaseInsensitiveASCII(StringPiece a, StringPiece b) { if (a.length() != b.length()) return false; return CompareCaseInsensitiveASCIIT(a, b) == 0; } template bool ReplaceCharsT(const STR& input, const STR& replace_chars, const STR& replace_with, STR* output) { bool removed = false; size_t replace_length = replace_with.length(); *output = input; size_t found = output->find_first_of(replace_chars); while (found != STR::npos) { removed = true; output->replace(found, 1, replace_with); found = output->find_first_of(replace_chars, found + replace_length); } return removed; } bool ReplaceChars(const std::string& input, const StringPiece& replace_chars, const std::string& replace_with, std::string* output) { return ReplaceCharsT(input, replace_chars.as_string(), replace_with, output); } template TrimPositions TrimStringT(const Str& input, BasicStringPiece trim_chars, TrimPositions positions, Str* output) { // Find the edges of leading/trailing whitespace as desired. Need to use // a StringPiece version of input to be able to call find* on it with the // StringPiece version of trim_chars (normally the trim_chars will be a // constant so avoid making a copy). BasicStringPiece input_piece(input); const size_t last_char = input.length() - 1; const size_t first_good_char = (positions & TRIM_LEADING) ? input_piece.find_first_not_of(trim_chars) : 0; const size_t last_good_char = (positions & TRIM_TRAILING) ? input_piece.find_last_not_of(trim_chars) : last_char; // When the string was all trimmed, report that we stripped off characters // from whichever position the caller was interested in. For empty input, we // stripped no characters, but we still need to clear |output|. if (input.empty() || (first_good_char == Str::npos) || (last_good_char == Str::npos)) { bool input_was_empty = input.empty(); // in case output == &input output->clear(); return input_was_empty ? TRIM_NONE : positions; } // Trim. *output = input.substr(first_good_char, last_good_char - first_good_char + 1); // Return where we trimmed from. return static_cast( ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) | ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING)); } bool TrimString(const std::string& input, StringPiece trim_chars, std::string* output) { return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; } template BasicStringPiece TrimStringPieceT(BasicStringPiece input, BasicStringPiece trim_chars, TrimPositions positions) { size_t begin = (positions & TRIM_LEADING) ? input.find_first_not_of(trim_chars) : 0; size_t end = (positions & TRIM_TRAILING) ? input.find_last_not_of(trim_chars) + 1 : input.size(); return input.substr(begin, end - begin); } StringPiece TrimString(StringPiece input, const StringPiece& trim_chars, TrimPositions positions) { return TrimStringPieceT(input, trim_chars, positions); } TrimPositions TrimWhitespaceASCII(const std::string& input, TrimPositions positions, std::string* output) { return TrimStringT(input, StringPiece(kWhitespaceASCII), positions, output); } template inline bool DoIsStringASCII(const Char* characters, size_t length) { MachineWord all_char_bits = 0; const Char* end = characters + length; // Prologue: align the input. while (!IsAlignedToMachineWord(characters) && characters != end) { all_char_bits |= *characters; ++characters; } // Compare the values of CPU word size. const Char* word_end = AlignToMachineWord(end); const size_t loop_increment = sizeof(MachineWord) / sizeof(Char); while (characters < word_end) { all_char_bits |= *(reinterpret_cast(characters)); characters += loop_increment; } // Process the remaining bytes. while (characters != end) { all_char_bits |= *characters; ++characters; } MachineWord non_ascii_bit_mask = NonASCIIMask::value(); return !(all_char_bits & non_ascii_bit_mask); } bool IsStringASCII(const StringPiece& str) { return DoIsStringASCII(str.data(), str.length()); } bool IsStringUTF8(const StringPiece& str) { const char *src = str.data(); int32_t src_len = static_cast(str.length()); int32_t char_index = 0; while (char_index < src_len) { int32_t code_point; CBU8_NEXT(src, char_index, src_len, code_point); if (!IsValidCharacter(code_point)) return false; } return true; } } // namespace base