1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/schema/section-manager.h"
16 
17 #include <algorithm>
18 #include <cinttypes>
19 #include <cstddef>
20 #include <cstdint>
21 #include <iterator>
22 #include <memory>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <unordered_set>
27 #include <utility>
28 #include <vector>
29 
30 #include "icing/text_classifier/lib3/utils/base/status.h"
31 #include "icing/text_classifier/lib3/utils/base/statusor.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/absl_ports/str_cat.h"
34 #include "icing/legacy/core/icing-string-util.h"
35 #include "icing/proto/document.pb.h"
36 #include "icing/proto/schema.pb.h"
37 #include "icing/proto/term.pb.h"
38 #include "icing/schema/schema-util.h"
39 #include "icing/schema/section.h"
40 #include "icing/store/document-filter-data.h"
41 #include "icing/store/key-mapper.h"
42 #include "icing/util/status-macros.h"
43 
44 namespace icing {
45 namespace lib {
46 namespace {
47 
48 using TypeSectionMap =
49     std::unordered_map<std::string, const std::vector<SectionMetadata>>;
50 
51 // Helper function to concatenate a path and a property name
ConcatenatePath(const std::string & path,const std::string & next_property_name)52 std::string ConcatenatePath(const std::string& path,
53                             const std::string& next_property_name) {
54   if (path.empty()) {
55     return next_property_name;
56   }
57   return absl_ports::StrCat(path, kPropertySeparator, next_property_name);
58 }
59 
AssignSections(const SchemaTypeConfigProto & current_type_config,const std::string & current_section_path,const SchemaUtil::TypeConfigMap & type_config_map,std::vector<SectionMetadata> * metadata_list)60 libtextclassifier3::Status AssignSections(
61     const SchemaTypeConfigProto& current_type_config,
62     const std::string& current_section_path,
63     const SchemaUtil::TypeConfigMap& type_config_map,
64     std::vector<SectionMetadata>* metadata_list) {
65   // Sorts properties by name's alphabetical order so that order doesn't affect
66   // section assigning.
67   auto sorted_properties = current_type_config.properties();
68   std::sort(sorted_properties.pointer_begin(), sorted_properties.pointer_end(),
69             [](const PropertyConfigProto* p1, const PropertyConfigProto* p2) {
70               return p1->property_name() < p2->property_name();
71             });
72   for (const auto& property_config : sorted_properties) {
73     if (property_config.data_type() ==
74         PropertyConfigProto::DataType::DOCUMENT) {
75       auto nested_type_config_iter =
76           type_config_map.find(property_config.schema_type());
77       if (nested_type_config_iter == type_config_map.end()) {
78         // This should never happen because our schema should already be
79         // validated by this point.
80         return absl_ports::NotFoundError(absl_ports::StrCat(
81             "Type config not found: ", property_config.schema_type()));
82       }
83 
84       if (property_config.document_indexing_config()
85               .index_nested_properties()) {
86         // Assign any indexed sections recursively
87         const SchemaTypeConfigProto& nested_type_config =
88             nested_type_config_iter->second;
89         ICING_RETURN_IF_ERROR(
90             AssignSections(nested_type_config,
91                            ConcatenatePath(current_section_path,
92                                            property_config.property_name()),
93                            type_config_map, metadata_list));
94       }
95     }
96 
97     // Only index strings currently.
98     if (property_config.has_data_type() !=
99             PropertyConfigProto::DataType::STRING ||
100         property_config.string_indexing_config().term_match_type() ==
101             TermMatchType::UNKNOWN) {
102       // No need to create section for current property
103       continue;
104     }
105 
106     // Creates section metadata according to data type
107     // Validates next section id, makes sure that section id is the same as
108     // the list index so that we could find any section metadata by id in O(1)
109     // later.
110     auto new_section_id = static_cast<SectionId>(metadata_list->size());
111     if (!IsSectionIdValid(new_section_id)) {
112       // Max number of sections reached
113       return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
114           "Too many properties to be indexed, max number of properties "
115           "allowed: %d",
116           kMaxSectionId - kMinSectionId + 1));
117     }
118 
119     // Creates section metadata from property config
120     metadata_list->emplace_back(
121         new_section_id,
122         property_config.string_indexing_config().term_match_type(),
123         property_config.string_indexing_config().tokenizer_type(),
124         ConcatenatePath(current_section_path, property_config.property_name()));
125   }
126   return libtextclassifier3::Status::OK;
127 }
128 
129 // Builds a vector of vectors that holds SectionMetadatas for all the schema
130 // types. The outer vector's index corresponds with a type's SchemaTypeId. The
131 // inner vector's index corresponds to the section's SectionId.
132 libtextclassifier3::StatusOr<std::vector<std::vector<SectionMetadata>>>
BuildSectionMetadataCache(const SchemaUtil::TypeConfigMap & type_config_map,const KeyMapper<SchemaTypeId> & schema_type_mapper)133 BuildSectionMetadataCache(const SchemaUtil::TypeConfigMap& type_config_map,
134                           const KeyMapper<SchemaTypeId>& schema_type_mapper) {
135   // Create our vector and reserve the number of schema types we have
136   std::vector<std::vector<SectionMetadata>> section_metadata_cache(
137       schema_type_mapper.num_keys());
138 
139   for (const auto& name_and_type : type_config_map) {
140     // Assigns sections for each type config
141     const std::string& type_config_name = name_and_type.first;
142     const SchemaTypeConfigProto& type_config = name_and_type.second;
143     std::vector<SectionMetadata> metadata_list;
144     ICING_RETURN_IF_ERROR(AssignSections(type_config,
145                                          /*current_section_path*/ "",
146                                          type_config_map, &metadata_list));
147 
148     // Insert the section metadata list at the index of the type's SchemaTypeId
149     ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
150                            schema_type_mapper.Get(type_config_name));
151     section_metadata_cache[schema_type_id] = std::move(metadata_list);
152   }
153   return section_metadata_cache;
154 }
155 
156 // Helper function to get string content from a property. Repeated values are
157 // joined into one string. We only care about the STRING data type.
GetStringPropertyContent(const PropertyProto & property)158 std::vector<std::string_view> GetStringPropertyContent(
159     const PropertyProto& property) {
160   std::vector<std::string_view> values;
161   if (!property.string_values().empty()) {
162     std::copy(property.string_values().begin(), property.string_values().end(),
163               std::back_inserter(values));
164   }
165   return values;
166 }
167 
168 }  // namespace
169 
SectionManager(const KeyMapper<SchemaTypeId> * schema_type_mapper,std::vector<std::vector<SectionMetadata>> && section_metadata_cache)170 SectionManager::SectionManager(
171     const KeyMapper<SchemaTypeId>* schema_type_mapper,
172     std::vector<std::vector<SectionMetadata>>&& section_metadata_cache)
173     : schema_type_mapper_(*schema_type_mapper),
174       section_metadata_cache_(std::move(section_metadata_cache)) {}
175 
176 libtextclassifier3::StatusOr<std::unique_ptr<SectionManager>>
Create(const SchemaUtil::TypeConfigMap & type_config_map,const KeyMapper<SchemaTypeId> * schema_type_mapper)177 SectionManager::Create(const SchemaUtil::TypeConfigMap& type_config_map,
178                        const KeyMapper<SchemaTypeId>* schema_type_mapper) {
179   ICING_RETURN_ERROR_IF_NULL(schema_type_mapper);
180 
181   ICING_ASSIGN_OR_RETURN(
182       std::vector<std::vector<SectionMetadata>> section_metadata_cache,
183       BuildSectionMetadataCache(type_config_map, *schema_type_mapper));
184   return std::unique_ptr<SectionManager>(new SectionManager(
185       schema_type_mapper, std::move(section_metadata_cache)));
186 }
187 
188 libtextclassifier3::StatusOr<std::vector<std::string_view>>
GetStringSectionContent(const DocumentProto & document,std::string_view section_path) const189 SectionManager::GetStringSectionContent(const DocumentProto& document,
190                                         std::string_view section_path) const {
191   // Finds the first property name in section_path
192   size_t separator_position = section_path.find(kPropertySeparator);
193   std::string_view current_property_name =
194       (separator_position == std::string::npos)
195           ? section_path
196           : section_path.substr(0, separator_position);
197 
198   // Tries to match the property name with the ones in document
199   auto property_iterator =
200       std::find_if(document.properties().begin(), document.properties().end(),
201                    [current_property_name](const PropertyProto& property) {
202                      return property.name() == current_property_name;
203                    });
204 
205   if (property_iterator == document.properties().end()) {
206     // Property name not found, it could be one of the following 2 cases:
207     // 1. The property is optional and it's not in the document
208     // 2. The property name is invalid
209     return absl_ports::NotFoundError(absl_ports::StrCat(
210         "Section path '", section_path, "' not found in document."));
211   }
212 
213   if (separator_position == std::string::npos) {
214     // Current property name is the last one in section path
215     std::vector<std::string_view> content =
216         GetStringPropertyContent(*property_iterator);
217     if (content.empty()) {
218       // The content of property is explicitly set to empty, we'll treat it as
219       // NOT_FOUND because the index doesn't care about empty strings.
220       return absl_ports::NotFoundError(absl_ports::StrCat(
221           "Section path '", section_path, "' content was empty"));
222     }
223     return content;
224   }
225 
226   // Gets section content recursively
227   std::string_view sub_section_path =
228       section_path.substr(separator_position + 1);
229   std::vector<std::string_view> nested_document_content;
230   for (const auto& nested_document : property_iterator->document_values()) {
231     auto content_or =
232         GetStringSectionContent(nested_document, sub_section_path);
233     if (content_or.ok()) {
234       std::vector<std::string_view> content =
235           std::move(content_or).ValueOrDie();
236       std::move(content.begin(), content.end(),
237                 std::back_inserter(nested_document_content));
238     }
239   }
240   if (nested_document_content.empty()) {
241     return absl_ports::NotFoundError(
242         absl_ports::StrCat("Section path ", section_path,
243                            " not found in type config ", document.schema()));
244   }
245   return nested_document_content;
246 }
247 
248 libtextclassifier3::StatusOr<std::vector<std::string_view>>
GetStringSectionContent(const DocumentProto & document,SectionId section_id) const249 SectionManager::GetStringSectionContent(const DocumentProto& document,
250                                         SectionId section_id) const {
251   if (!IsSectionIdValid(section_id)) {
252     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
253         "Section id %d is greater than the max value %d", section_id,
254         kMaxSectionId));
255   }
256   ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list,
257                          GetMetadataList(document.schema()));
258   if (section_id >= metadata_list->size()) {
259     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
260         "Section with id %d doesn't exist in type config %s", section_id,
261         document.schema().c_str()));
262   }
263   // The index of metadata list is the same as the section id, so we can use
264   // section id as the index.
265   return GetStringSectionContent(document, metadata_list->at(section_id).path);
266 }
267 
268 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const269 SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id,
270                                    SectionId section_id) const {
271   if (!IsSectionIdValid(section_id)) {
272     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
273         "Section id %d is greater than the max value %d", section_id,
274         kMaxSectionId));
275   }
276   const std::vector<SectionMetadata>& section_metadatas =
277       section_metadata_cache_[schema_type_id];
278   if (section_id >= section_metadatas.size()) {
279     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
280         "Section with id %d doesn't exist in type config with id %d",
281         section_id, schema_type_id));
282   }
283 
284   // The index of metadata list is the same as the section id, so we can use
285   // section id as the index.
286   return &section_metadatas[section_id];
287 }
288 
289 libtextclassifier3::StatusOr<std::vector<Section>>
ExtractSections(const DocumentProto & document) const290 SectionManager::ExtractSections(const DocumentProto& document) const {
291   ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list,
292                          GetMetadataList(document.schema()));
293   std::vector<Section> sections;
294   for (const auto& section_metadata : *metadata_list) {
295     auto section_content_or =
296         GetStringSectionContent(document, section_metadata.path);
297     // Adds to result vector if section is found in document
298     if (section_content_or.ok()) {
299       sections.emplace_back(SectionMetadata(section_metadata),
300                             std::move(section_content_or).ValueOrDie());
301     }
302   }
303   return sections;
304 }
305 
306 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetMetadataList(const std::string & type_config_name) const307 SectionManager::GetMetadataList(const std::string& type_config_name) const {
308   ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
309                          schema_type_mapper_.Get(type_config_name));
310   return &section_metadata_cache_.at(schema_type_id);
311 }
312 
313 }  // namespace lib
314 }  // namespace icing
315