1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_SCHEMA_SCHEMA_UTIL_H_ 16 #define ICING_SCHEMA_SCHEMA_UTIL_H_ 17 18 #include <cstdint> 19 #include <string> 20 #include <string_view> 21 #include <unordered_map> 22 #include <unordered_set> 23 24 #include "icing/text_classifier/lib3/utils/base/status.h" 25 #include "icing/text_classifier/lib3/utils/base/statusor.h" 26 #include "icing/proto/schema.pb.h" 27 28 namespace icing { 29 namespace lib { 30 31 class SchemaUtil { 32 public: 33 using TypeConfigMap = 34 std::unordered_map<std::string, const SchemaTypeConfigProto>; 35 36 // Maps from a child type to the parent types that depend on it. 37 // Ex. type A has a single property of type B 38 // The dependency map will be { { "B", { "A" } } } 39 using DependencyMap = 40 std::unordered_map<std::string_view, 41 std::unordered_set<std::string_view>>; 42 43 struct SchemaDelta { 44 // Whether an indexing config has changed, requiring the index to be 45 // regenerated. We don't list out all the types that make the index 46 // incompatible because our index isn't optimized for that. It's much easier 47 // to reset the entire index and reindex every document. 48 bool index_incompatible = false; 49 50 // Which schema types were present in the old schema, but were deleted from 51 // the new schema. 52 std::unordered_set<std::string> schema_types_deleted; 53 54 // Which schema types had their SchemaTypeConfigProto changed in a way that 55 // could invalidate existing Documents of that schema type. 56 std::unordered_set<std::string> schema_types_incompatible; 57 58 bool operator==(const SchemaDelta& other) const { 59 return index_incompatible == other.index_incompatible && 60 schema_types_deleted == other.schema_types_deleted && 61 schema_types_incompatible == other.schema_types_incompatible; 62 } 63 }; 64 65 struct ParsedPropertyConfigs { 66 // Mapping of property name to PropertyConfigProto 67 std::unordered_map<std::string_view, const PropertyConfigProto*> 68 property_config_map; 69 70 // Total number of properties that have an indexing config 71 int32_t num_indexed_properties = 0; 72 73 // Total number of properties that were REQUIRED 74 int32_t num_required_properties = 0; 75 }; 76 77 // This function validates: 78 // 1. SchemaTypeConfigProto.schema_type's must be unique 79 // 2. Properties within one SchemaTypeConfigProto must be unique 80 // 3. SchemaTypeConfigProtos.schema_type must be non-empty 81 // 4. PropertyConfigProtos.property_name must be non-empty 82 // 5. PropertyConfigProtos.property_name's must be unique within one 83 // SchemaTypeConfigProto 84 // 6. PropertyConfigProtos.data_type cannot be UNKNOWN 85 // 7. PropertyConfigProtos.data_type of DOCUMENT must also have a 86 // schema_type 87 // 8. PropertyConfigProtos.cardinality cannot be UNKNOWN 88 // 9. PropertyConfigProtos.schema_type's must correspond to a 89 // SchemaTypeConfigProto.schema_type 90 // 10. Property names can only be alphanumeric. 91 // 11. Any STRING data types have a valid string_indexing_config 92 // 12. A SchemaTypeConfigProto cannot have a property whose schema_type is 93 // itself, thus creating an infinite loop. 94 // 13. Two SchemaTypeConfigProtos cannot have properties that reference each 95 // other's schema_type, thus creating an infinite loop. 96 // 97 // TODO(b/171996137): Clarify 12 and 13 are only for indexed properties, once 98 // document properties can be opted out of indexing. 99 // 100 // Returns: 101 // On success, a dependency map from each child types to all parent types 102 // that depend on it directly or indirectly. 103 // ALREADY_EXISTS for case 1 and 2 104 // INVALID_ARGUMENT for 3-13 105 static libtextclassifier3::StatusOr<DependencyMap> Validate( 106 const SchemaProto& schema); 107 108 // Creates a mapping of schema type -> schema type config proto. The 109 // type_config_map is cleared, and then each schema-type_config_proto pair is 110 // placed in the given type_config_map parameter. 111 static void BuildTypeConfigMap(const SchemaProto& schema, 112 TypeConfigMap* type_config_map); 113 114 // Parses the given type_config and returns a struct of easily-parseable 115 // information about the properties. 116 static ParsedPropertyConfigs ParsePropertyConfigs( 117 const SchemaTypeConfigProto& type_config); 118 119 // Computes the delta between the old and new schema. There are a few 120 // differences that'll be reported: 121 // 1. The derived index would be incompatible. This is held in 122 // `SchemaDelta.index_incompatible`. 123 // 2. Some schema types existed in the old schema, but have been deleted 124 // from the new schema. This is held in 125 // `SchemaDelta.schema_types_deleted` 126 // 3. A schema type's new definition would mean any existing data of the old 127 // definition is now incompatible. 128 // 129 // For case 1, the two schemas would result in an incompatible index if: 130 // 1.1. The new SchemaProto has a different set of indexed properties than 131 // the old SchemaProto. 132 // 133 // For case 3, the two schemas would result in incompatible data if: 134 // 3.1. A SchemaTypeConfig exists in the old SchemaProto, but is not in the 135 // new SchemaProto 136 // 3.2. A property exists in the old SchemaTypeConfig, but is not in the new 137 // SchemaTypeConfig 138 // 3.3. A property in the new SchemaTypeConfig and has a REQUIRED 139 // PropertyConfigProto.cardinality, but is not in the old 140 // SchemaTypeConfig 141 // 3.4. A property is in both the old and new SchemaTypeConfig, but its 142 // PropertyConfigProto.data_type is different 143 // 3.5. A property is in both the old and new SchemaTypeConfig, but its 144 // PropertyConfigProto.schema_type is different 145 // 3.6. A property is in both the old and new SchemaTypeConfig, but its new 146 // PropertyConfigProto.cardinality is more restrictive. Restrictive 147 // scale defined as: 148 // LEAST <REPEATED - OPTIONAL - REQUIRED> MOST 149 // 150 // A property is defined by the combination of the 151 // SchemaTypeConfig.schema_type and the PropertyConfigProto.property_name. 152 // 153 // Returns a SchemaDelta that captures the aforementioned differences. 154 static const SchemaDelta ComputeCompatibilityDelta( 155 const SchemaProto& old_schema, const SchemaProto& new_schema, 156 const DependencyMap& new_schema_dependency_map); 157 158 // Validates the 'property_name' field. 159 // 1. Can't be an empty string 160 // 2. Can only contain alphanumeric characters 161 // 162 // NOTE: schema_type is only used for logging. It is not necessary to populate 163 // it. 164 // 165 // RETURNS: 166 // - OK if property_name is valid 167 // - INVALID_ARGUMENT if property name is empty or contains an 168 // non-alphabetic character. 169 static libtextclassifier3::Status ValidatePropertyName( 170 std::string_view property_name, std::string_view schema_type = ""); 171 172 private: 173 // Validates the 'schema_type' field 174 // 175 // Returns: 176 // INVALID_ARGUMENT if 'schema_type' is an empty string. 177 // OK on success 178 static libtextclassifier3::Status ValidateSchemaType( 179 std::string_view schema_type); 180 181 // Validates the 'data_type' field. 182 // 183 // Returns: 184 // INVALID_ARGUMENT if it's UNKNOWN 185 // OK on success 186 static libtextclassifier3::Status ValidateDataType( 187 PropertyConfigProto::DataType::Code data_type, 188 std::string_view schema_type, std::string_view property_name); 189 190 // Validates the 'cardinality' field. 191 // 192 // Returns: 193 // INVALID_ARGUMENT if it's UNKNOWN 194 // OK on success 195 static libtextclassifier3::Status ValidateCardinality( 196 PropertyConfigProto::Cardinality::Code cardinality, 197 std::string_view schema_type, std::string_view property_name); 198 199 // Checks that the 'string_indexing_config' satisfies the following rules: 200 // 1. Only STRING data types can be indexed 201 // 2. An indexed property must have a valid tokenizer type 202 // 203 // Returns: 204 // INVALID_ARGUMENT if any of the rules are not followed 205 // OK on success 206 static libtextclassifier3::Status ValidateStringIndexingConfig( 207 const StringIndexingConfig& config, 208 PropertyConfigProto::DataType::Code data_type, 209 std::string_view schema_type, std::string_view property_name); 210 }; 211 212 } // namespace lib 213 } // namespace icing 214 215 #endif // ICING_SCHEMA_SCHEMA_UTIL_H_ 216