1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_SCHEMA_SCHEMA_UTIL_H_
16 #define ICING_SCHEMA_SCHEMA_UTIL_H_
17 
18 #include <cstdint>
19 #include <string>
20 #include <string_view>
21 #include <unordered_map>
22 #include <unordered_set>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/proto/schema.pb.h"
27 
28 namespace icing {
29 namespace lib {
30 
31 class SchemaUtil {
32  public:
33   using TypeConfigMap =
34       std::unordered_map<std::string, const SchemaTypeConfigProto>;
35 
36   // Maps from a child type to the parent types that depend on it.
37   // Ex. type A has a single property of type B
38   // The dependency map will be { { "B", { "A" } } }
39   using DependencyMap =
40       std::unordered_map<std::string_view,
41                          std::unordered_set<std::string_view>>;
42 
43   struct SchemaDelta {
44     // Whether an indexing config has changed, requiring the index to be
45     // regenerated. We don't list out all the types that make the index
46     // incompatible because our index isn't optimized for that. It's much easier
47     // to reset the entire index and reindex every document.
48     bool index_incompatible = false;
49 
50     // Which schema types were present in the old schema, but were deleted from
51     // the new schema.
52     std::unordered_set<std::string> schema_types_deleted;
53 
54     // Which schema types had their SchemaTypeConfigProto changed in a way that
55     // could invalidate existing Documents of that schema type.
56     std::unordered_set<std::string> schema_types_incompatible;
57 
58     bool operator==(const SchemaDelta& other) const {
59       return index_incompatible == other.index_incompatible &&
60              schema_types_deleted == other.schema_types_deleted &&
61              schema_types_incompatible == other.schema_types_incompatible;
62     }
63   };
64 
65   struct ParsedPropertyConfigs {
66     // Mapping of property name to PropertyConfigProto
67     std::unordered_map<std::string_view, const PropertyConfigProto*>
68         property_config_map;
69 
70     // Total number of properties that have an indexing config
71     int32_t num_indexed_properties = 0;
72 
73     // Total number of properties that were REQUIRED
74     int32_t num_required_properties = 0;
75   };
76 
77   // This function validates:
78   //   1. SchemaTypeConfigProto.schema_type's must be unique
79   //   2. Properties within one SchemaTypeConfigProto must be unique
80   //   3. SchemaTypeConfigProtos.schema_type must be non-empty
81   //   4. PropertyConfigProtos.property_name must be non-empty
82   //   5. PropertyConfigProtos.property_name's must be unique within one
83   //      SchemaTypeConfigProto
84   //   6. PropertyConfigProtos.data_type cannot be UNKNOWN
85   //   7. PropertyConfigProtos.data_type of DOCUMENT must also have a
86   //      schema_type
87   //   8. PropertyConfigProtos.cardinality cannot be UNKNOWN
88   //   9. PropertyConfigProtos.schema_type's must correspond to a
89   //      SchemaTypeConfigProto.schema_type
90   //  10. Property names can only be alphanumeric.
91   //  11. Any STRING data types have a valid string_indexing_config
92   //  12. A SchemaTypeConfigProto cannot have a property whose schema_type is
93   //      itself, thus creating an infinite loop.
94   //  13. Two SchemaTypeConfigProtos cannot have properties that reference each
95   //      other's schema_type, thus creating an infinite loop.
96   //
97   //  TODO(b/171996137): Clarify 12 and 13 are only for indexed properties, once
98   //  document properties can be opted out of indexing.
99   //
100   // Returns:
101   //   On success, a dependency map from each child types to all parent types
102   //   that depend on it directly or indirectly.
103   //   ALREADY_EXISTS for case 1 and 2
104   //   INVALID_ARGUMENT for 3-13
105   static libtextclassifier3::StatusOr<DependencyMap> Validate(
106       const SchemaProto& schema);
107 
108   // Creates a mapping of schema type -> schema type config proto. The
109   // type_config_map is cleared, and then each schema-type_config_proto pair is
110   // placed in the given type_config_map parameter.
111   static void BuildTypeConfigMap(const SchemaProto& schema,
112                                  TypeConfigMap* type_config_map);
113 
114   // Parses the given type_config and returns a struct of easily-parseable
115   // information about the properties.
116   static ParsedPropertyConfigs ParsePropertyConfigs(
117       const SchemaTypeConfigProto& type_config);
118 
119   // Computes the delta between the old and new schema. There are a few
120   // differences that'll be reported:
121   //   1. The derived index would be incompatible. This is held in
122   //      `SchemaDelta.index_incompatible`.
123   //   2. Some schema types existed in the old schema, but have been deleted
124   //      from the new schema. This is held in
125   //      `SchemaDelta.schema_types_deleted`
126   //   3. A schema type's new definition would mean any existing data of the old
127   //      definition is now incompatible.
128   //
129   // For case 1, the two schemas would result in an incompatible index if:
130   //   1.1. The new SchemaProto has a different set of indexed properties than
131   //        the old SchemaProto.
132   //
133   // For case 3, the two schemas would result in incompatible data if:
134   //   3.1. A SchemaTypeConfig exists in the old SchemaProto, but is not in the
135   //        new SchemaProto
136   //   3.2. A property exists in the old SchemaTypeConfig, but is not in the new
137   //        SchemaTypeConfig
138   //   3.3. A property in the new SchemaTypeConfig and has a REQUIRED
139   //        PropertyConfigProto.cardinality, but is not in the old
140   //        SchemaTypeConfig
141   //   3.4. A property is in both the old and new SchemaTypeConfig, but its
142   //        PropertyConfigProto.data_type is different
143   //   3.5. A property is in both the old and new SchemaTypeConfig, but its
144   //        PropertyConfigProto.schema_type is different
145   //   3.6. A property is in both the old and new SchemaTypeConfig, but its new
146   //        PropertyConfigProto.cardinality is more restrictive. Restrictive
147   //        scale defined as:
148   //          LEAST <REPEATED - OPTIONAL - REQUIRED> MOST
149   //
150   // A property is defined by the combination of the
151   // SchemaTypeConfig.schema_type and the PropertyConfigProto.property_name.
152   //
153   // Returns a SchemaDelta that captures the aforementioned differences.
154   static const SchemaDelta ComputeCompatibilityDelta(
155       const SchemaProto& old_schema, const SchemaProto& new_schema,
156       const DependencyMap& new_schema_dependency_map);
157 
158   // Validates the 'property_name' field.
159   //   1. Can't be an empty string
160   //   2. Can only contain alphanumeric characters
161   //
162   // NOTE: schema_type is only used for logging. It is not necessary to populate
163   // it.
164   //
165   // RETURNS:
166   //   - OK if property_name is valid
167   //   - INVALID_ARGUMENT if property name is empty or contains an
168   //     non-alphabetic character.
169   static libtextclassifier3::Status ValidatePropertyName(
170       std::string_view property_name, std::string_view schema_type = "");
171 
172  private:
173   // Validates the 'schema_type' field
174   //
175   // Returns:
176   //   INVALID_ARGUMENT if 'schema_type' is an empty string.
177   //   OK on success
178   static libtextclassifier3::Status ValidateSchemaType(
179       std::string_view schema_type);
180 
181   // Validates the 'data_type' field.
182   //
183   // Returns:
184   //   INVALID_ARGUMENT if it's UNKNOWN
185   //   OK on success
186   static libtextclassifier3::Status ValidateDataType(
187       PropertyConfigProto::DataType::Code data_type,
188       std::string_view schema_type, std::string_view property_name);
189 
190   // Validates the 'cardinality' field.
191   //
192   // Returns:
193   //   INVALID_ARGUMENT if it's UNKNOWN
194   //   OK on success
195   static libtextclassifier3::Status ValidateCardinality(
196       PropertyConfigProto::Cardinality::Code cardinality,
197       std::string_view schema_type, std::string_view property_name);
198 
199   // Checks that the 'string_indexing_config' satisfies the following rules:
200   //   1. Only STRING data types can be indexed
201   //   2. An indexed property must have a valid tokenizer type
202   //
203   // Returns:
204   //   INVALID_ARGUMENT if any of the rules are not followed
205   //   OK on success
206   static libtextclassifier3::Status ValidateStringIndexingConfig(
207       const StringIndexingConfig& config,
208       PropertyConfigProto::DataType::Code data_type,
209       std::string_view schema_type, std::string_view property_name);
210 };
211 
212 }  // namespace lib
213 }  // namespace icing
214 
215 #endif  // ICING_SCHEMA_SCHEMA_UTIL_H_
216