# Copyright (c) 2014 Amazon.com, Inc. or its affiliates.  All Rights Reserved
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish, dis-
# tribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the fol-
# lowing conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
from boto.compat import json
from boto.exception import JSONResponseError
from boto.connection import AWSAuthConnection
from boto.regioninfo import RegionInfo
from boto.cloudsearchdomain import exceptions


class CloudSearchDomainConnection(AWSAuthConnection):
    """
    You use the AmazonCloudSearch2013 API to upload documents to a
    search domain and search those documents.

    The endpoints for submitting `UploadDocuments`, `Search`, and
    `Suggest` requests are domain-specific. To get the endpoints for
    your domain, use the Amazon CloudSearch configuration service
    `DescribeDomains` action. The domain endpoints are also displayed
    on the domain dashboard in the Amazon CloudSearch console. You
    submit suggest requests to the search endpoint.

    For more information, see the `Amazon CloudSearch Developer
    Guide`_.
    """
    APIVersion = "2013-01-01"
    AuthServiceName = 'cloudsearch'
    DefaultRegionName = "us-east-1"
    DefaultRegionEndpoint = "cloudsearch.us-east-1.amazonaws.com"
    ResponseError = JSONResponseError

    # Service fault names mapped to their exception classes.  Note that
    # ``make_request`` below always raises ``JSONResponseError`` and does
    # not consult this table.
    _faults = {
        "SearchException": exceptions.SearchException,
        "DocumentServiceException": exceptions.DocumentServiceException,
    }

    def __init__(self, **kwargs):
        """
        Create a connection to a single CloudSearch domain endpoint.

        :type region: :class:`boto.regioninfo.RegionInfo`
        :param region: Optional region for the connection. When it is
            missing or falsy, a region is built from `DefaultRegionName`
            and `DefaultRegionEndpoint`.

        :type host: string
        :param host: The domain's endpoint. Required.

        All remaining keyword arguments are forwarded unchanged to
        :class:`boto.connection.AWSAuthConnection`.

        :raises ValueError: if `host` is missing or `None`.
        """
        # Always remove ``region`` from kwargs -- even when the caller
        # passed ``region=None`` explicitly -- so that it is never
        # forwarded to the parent constructor as an unexpected keyword.
        region = kwargs.pop('region', None)
        if not region:
            region = RegionInfo(self, self.DefaultRegionName,
                                self.DefaultRegionEndpoint)
        if kwargs.get('host') is None:
            raise ValueError(
                'The argument, host, must be provided when creating a '
                'CloudSearchDomainConnection because its methods require the '
                'specific domain\'s endpoint in order to successfully make '
                'requests to that CloudSearch Domain.'
            )
        super(CloudSearchDomainConnection, self).__init__(**kwargs)
        self.region = region

    def _required_auth_capability(self):
        """Return the auth capabilities this connection requires."""
        return ['hmac-v4']

    @staticmethod
    def _set_params(pairs):
        """Return a dict of the ``(name, value)`` pairs whose value is set.

        Pairs whose value is ``None`` are dropped; every other value
        (including ``0`` and the empty string) is kept.
        """
        return dict((name, value) for name, value in pairs
                    if value is not None)

    def search(self, query, cursor=None, expr=None, facet=None,
               filter_query=None, highlight=None, partial=None,
               query_options=None, query_parser=None, ret=None, size=None,
               sort=None, start=None):
        """
        Retrieves a list of documents that match the specified search
        criteria. How you specify the search criteria depends on which
        query parser you use. Amazon CloudSearch supports four query
        parsers:


        + `simple`: search all `text` and `text-array` fields for the
          specified string. Search for phrases, individual terms, and
          prefixes.
        + `structured`: search specific fields, construct compound
          queries using Boolean operators, and use advanced features
          such as term boosting and proximity searching.
        + `lucene`: specify search criteria using the Apache Lucene
          query parser syntax.
        + `dismax`: specify search criteria using the simplified
          subset of the Apache Lucene query parser syntax defined by the
          DisMax query parser.


        For more information, see `Searching Your Data`_ in the Amazon
        CloudSearch Developer Guide .

        The endpoint for submitting `Search` requests is domain-
        specific. You submit search requests to a domain's search
        endpoint. To get the search endpoint for your domain, use the
        Amazon CloudSearch configuration service `DescribeDomains`
        action. A domain's endpoints are also displayed on the domain
        dashboard in the Amazon CloudSearch console.

        :type cursor: string
        :param cursor: Retrieves a cursor value you can use to page through
            large result sets. Use the `size` parameter to control the number
            of hits to include in each response. You can specify either the
            `cursor` or `start` parameter in a request; they are mutually
            exclusive. To get the first cursor, set the cursor value to
            `initial`. In subsequent requests, specify the cursor value
            returned in the hits section of the response.
        For more information, see `Paginating Results`_ in the Amazon
            CloudSearch Developer Guide .

        :type expr: string
        :param expr: Defines one or more numeric expressions that can be used
            to sort results or specify search or filter criteria. You can also
            specify expressions as return fields.
        For more information about defining and using expressions, see
            `Configuring Expressions`_ in the Amazon CloudSearch Developer
            Guide .

        :type facet: string
        :param facet: Specifies one or more fields for which to get facet
            information, and options that control how the facet information is
            returned. Each specified field must be facet-enabled in the domain
            configuration. The fields and options are specified in JSON using
            the form `{"FIELD":{"OPTION":VALUE,"OPTION":"STRING"},"FIELD":{"OPTI
            ON":VALUE,"OPTION":"STRING"}}`.
        You can specify the following faceting options:


        + `buckets` specifies an array of the facet values or ranges to count.
              Ranges are specified using the same syntax that you use to search
              for a range of values. For more information, see `Searching for a
              Range of Values`_ in the Amazon CloudSearch Developer Guide .
              Buckets are returned in the order they are specified in the
              request. The `sort` and `size` options are not valid if you specify
              `buckets`.
        + `size` specifies the maximum number of facets to include in the
              results. By default, Amazon CloudSearch returns counts for the top
              10. The `size` parameter is only valid when you specify the `sort`
              option; it cannot be used in conjunction with `buckets`.
        + `sort` specifies how you want to sort the facets in the results:
              `bucket` or `count`. Specify `bucket` to sort alphabetically or
              numerically by facet value (in ascending order). Specify `count` to
              sort by the facet counts computed for each facet value (in
              descending order). To retrieve facet counts for particular values
              or ranges of values, use the `buckets` option instead of `sort`.


        If no facet options are specified, facet counts are computed for all
            field values, the facets are sorted by facet count, and the top 10
            facets are returned in the results.

        For more information, see `Getting and Using Facet Information`_ in the
            Amazon CloudSearch Developer Guide .

        :type filter_query: string
        :param filter_query: Specifies a structured query that filters the
            results of a search without affecting how the results are scored
            and sorted. You use `filterQuery` in conjunction with the `query`
            parameter to filter the documents that match the constraints
            specified in the `query` parameter. Specifying a filter controls
            only which matching documents are included in the results, it has
            no effect on how they are scored and sorted. The `filterQuery`
            parameter supports the full structured query syntax.
        For more information about using filters, see `Filtering Matching
            Documents`_ in the Amazon CloudSearch Developer Guide .

        :type highlight: string
        :param highlight: Retrieves highlights for matches in the specified
            `text` or `text-array` fields. Each specified field must be
            highlight enabled in the domain configuration. The fields and
            options are specified in JSON using the form `{"FIELD":{"OPTION":VA
            LUE,"OPTION":"STRING"},"FIELD":{"OPTION":VALUE,"OPTION":"STRING"}}`.
        You can specify the following highlight options:


        + `format`: specifies the format of the data in the text field: `text`
              or `html`. When data is returned as HTML, all non-alphanumeric
              characters are encoded. The default is `html`.
        + `max_phrases`: specifies the maximum number of occurrences of the
              search term(s) you want to highlight. By default, the first
              occurrence is highlighted.
        + `pre_tag`: specifies the string to prepend to an occurrence of a
              search term. The default for HTML highlights is `<em>`. The
              default for text highlights is `*`.
        + `post_tag`: specifies the string to append to an occurrence of a
              search term. The default for HTML highlights is `</em>`. The
              default for text highlights is `*`.


        If no highlight options are specified for a field, the returned field
            text is treated as HTML and the first match is highlighted with
            emphasis tags: `<em>search-term</em>`.

        :type partial: boolean
        :param partial: Enables partial results to be returned if one or more
            index partitions are unavailable. When your search index is
            partitioned across multiple search instances, by default Amazon
            CloudSearch only returns results if every partition can be queried.
            This means that the failure of a single search instance can result
            in 5xx (internal server) errors. When you enable partial results,
            Amazon CloudSearch returns whatever results are available and
            includes the percentage of documents searched in the search results
            (percent-searched). This enables you to more gracefully degrade
            your users' search experience. For example, rather than displaying
            no results, you could display the partial results and a message
            indicating that the results might be incomplete due to a temporary
            system outage. A Python `bool` is sent as the lowercase string
            `true` or `false`; any other value is sent as given.

        :type query: string
        :param query: Specifies the search criteria for the request. How you
            specify the search criteria depends on the query parser used for
            the request and the parser options specified in the `queryOptions`
            parameter. By default, the `simple` query parser is used to process
            requests. To use the `structured`, `lucene`, or `dismax` query
            parser, you must also specify the `queryParser` parameter.
        For more information about specifying search criteria, see `Searching
            Your Data`_ in the Amazon CloudSearch Developer Guide .

        :type query_options: string
        :param query_options:
        Configures options for the query parser specified in the `queryParser`
            parameter.

        The options you can configure vary according to which parser you use:


        + `defaultOperator`: The default operator used to combine individual
              terms in the search string. For example: `defaultOperator: 'or'`.
              For the `dismax` parser, you specify a percentage that represents
              the percentage of terms in the search string (rounded down) that
              must match, rather than a default operator. A value of `0%` is the
              equivalent to OR, and a value of `100%` is equivalent to AND. The
              percentage must be specified as a value in the range 0-100 followed
              by the percent (%) symbol. For example, `defaultOperator: 50%`.
              Valid values: `and`, `or`, a percentage in the range 0%-100% (
              `dismax`). Default: `and` ( `simple`, `structured`, `lucene`) or
              `100` ( `dismax`). Valid for: `simple`, `structured`, `lucene`, and
              `dismax`.
        + `fields`: An array of the fields to search when no fields are
              specified in a search. If no fields are specified in a search and
              this option is not specified, all text and text-array fields are
              searched. You can specify a weight for each field to control the
              relative importance of each field when Amazon CloudSearch
              calculates relevance scores. To specify a field weight, append a
              caret ( `^`) symbol and the weight to the field name. For example,
              to boost the importance of the `title` field over the `description`
              field you could specify: `"fields":["title^5","description"]`.
              Valid values: The name of any configured field and an optional
              numeric value greater than zero. Default: All `text` and `text-
              array` fields. Valid for: `simple`, `structured`, `lucene`, and
              `dismax`.
        + `operators`: An array of the operators or special characters you want
              to disable for the simple query parser. If you disable the `and`,
              `or`, or `not` operators, the corresponding operators ( `+`, `|`,
              `-`) have no special meaning and are dropped from the search
              string. Similarly, disabling `prefix` disables the wildcard
              operator ( `*`) and disabling `phrase` disables the ability to
              search for phrases by enclosing phrases in double quotes. Disabling
              precedence disables the ability to control order of precedence
              using parentheses. Disabling `near` disables the ability to use the
              ~ operator to perform a sloppy phrase search. Disabling the `fuzzy`
              operator disables the ability to use the ~ operator to perform a
              fuzzy search. `escape` disables the ability to use a backslash (
              `\\`) to escape special characters within the search string.
              Disabling whitespace is an advanced option that prevents the parser
              from tokenizing on whitespace, which can be useful for Vietnamese.
              (It prevents Vietnamese words from being split incorrectly.) For
              example, you could disable all operators other than the phrase
              operator to support just simple term and phrase queries:
              `"operators":["and","not","or", "prefix"]`. Valid values: `and`,
              `escape`, `fuzzy`, `near`, `not`, `or`, `phrase`, `precedence`,
              `prefix`, `whitespace`. Default: All operators and special
              characters are enabled. Valid for: `simple`.
        + `phraseFields`: An array of the `text` or `text-array` fields you
              want to use for phrase searches. When the terms in the search
              string appear in close proximity within a field, the field scores
              higher. You can specify a weight for each field to boost that
              score. The `phraseSlop` option controls how much the matches can
              deviate from the search string and still be boosted. To specify a
              field weight, append a caret ( `^`) symbol and the weight to the
              field name. For example, to boost phrase matches in the `title`
              field over the `abstract` field, you could specify:
              `"phraseFields":["title^3", "plot"]` Valid values: The name of any
              `text` or `text-array` field and an optional numeric value greater
              than zero. Default: No fields. If you don't specify any fields with
              `phraseFields`, proximity scoring is disabled even if `phraseSlop`
              is specified. Valid for: `dismax`.
        + `phraseSlop`: An integer value that specifies how much matches can
              deviate from the search phrase and still be boosted according to
              the weights specified in the `phraseFields` option; for example,
              `phraseSlop: 2`. You must also specify `phraseFields` to enable
              proximity scoring. Valid values: positive integers. Default: 0.
              Valid for: `dismax`.
        + `explicitPhraseSlop`: An integer value that specifies how much a
              match can deviate from the search phrase when the phrase is
              enclosed in double quotes in the search string. (Phrases that
              exceed this proximity distance are not considered a match.) For
              example, to specify a slop of three for dismax phrase queries, you
              would specify `"explicitPhraseSlop":3`. Valid values: positive
              integers. Default: 0. Valid for: `dismax`.
        + `tieBreaker`: When a term in the search string is found in a
              document's field, a score is calculated for that field based on how
              common the word is in that field compared to other documents. If
              the term occurs in multiple fields within a document, by default
              only the highest scoring field contributes to the document's
              overall score. You can specify a `tieBreaker` value to enable the
              matches in lower-scoring fields to contribute to the document's
              score. That way, if two documents have the same max field score for
              a particular term, the score for the document that has matches in
              more fields will be higher. The formula for calculating the score
              with a tieBreaker is `(max field score) + (tieBreaker) * (sum of
              the scores for the rest of the matching fields)`. Set `tieBreaker`
              to 0 to disregard all but the highest scoring field (pure max):
              `"tieBreaker":0`. Set to 1 to sum the scores from all fields (pure
              sum): `"tieBreaker":1`. Valid values: 0.0 to 1.0. Default: 0.0.
              Valid for: `dismax`.

        :type query_parser: string
        :param query_parser:
        Specifies which query parser to use to process the request. If
            `queryParser` is not specified, Amazon CloudSearch uses the
            `simple` query parser.

        Amazon CloudSearch supports four query parsers:


        + `simple`: perform simple searches of `text` and `text-array` fields.
              By default, the `simple` query parser searches all `text` and
              `text-array` fields. You can specify which fields to search by with
              the `queryOptions` parameter. If you prefix a search term with a
              plus sign (+) documents must contain the term to be considered a
              match. (This is the default, unless you configure the default
              operator with the `queryOptions` parameter.) You can use the `-`
              (NOT), `|` (OR), and `*` (wildcard) operators to exclude particular
              terms, find results that match any of the specified terms, or
              search for a prefix. To search for a phrase rather than individual
              terms, enclose the phrase in double quotes. For more information,
              see `Searching for Text`_ in the Amazon CloudSearch Developer Guide
              .
        + `structured`: perform advanced searches by combining multiple
              expressions to define the search criteria. You can also search
              within particular fields, search for values and ranges of values,
              and use advanced options such as term boosting, `matchall`, and
              `near`. For more information, see `Constructing Compound Queries`_
              in the Amazon CloudSearch Developer Guide .
        + `lucene`: search using the Apache Lucene query parser syntax. For
              more information, see `Apache Lucene Query Parser Syntax`_.
        + `dismax`: search using the simplified subset of the Apache Lucene
              query parser syntax defined by the DisMax query parser. For more
              information, see `DisMax Query Parser Syntax`_.

        :type ret: string
        :param ret: Specifies the field and expression values to include in
            the response. Multiple fields or expressions are specified as a
            comma-separated list. By default, a search response includes all
            return enabled fields ( `_all_fields`). To return only the document
            IDs for the matching documents, specify `_no_fields`. To retrieve
            the relevance score calculated for each document, specify `_score`.

        :type size: long
        :param size: Specifies the maximum number of search hits to include in
            the response.

        :type sort: string
        :param sort: Specifies the fields or custom expressions to use to sort
            the search results. Multiple fields or expressions are specified as
            a comma-separated list. You must specify the sort direction ( `asc`
            or `desc`) for each field; for example, `year desc,title asc`. To
            use a field to sort results, the field must be sort-enabled in the
            domain configuration. Array type fields cannot be used for sorting.
            If no `sort` parameter is specified, results are sorted by their
            default relevance scores in descending order: `_score desc`. You
            can also sort by document ID ( `_id asc`) and version ( `_version
            desc`).
        For more information, see `Sorting Results`_ in the Amazon CloudSearch
            Developer Guide .

        :type start: long
        :param start: Specifies the offset of the first search hit you want to
            return. Note that the result set is zero-based; the first result is
            at index 0. You can specify either the `start` or `cursor`
            parameter in a request, they are mutually exclusive.
        For more information, see `Paginating Results`_ in the Amazon
            CloudSearch Developer Guide .

        :return: The decoded JSON body of the response.
        :raises boto.exception.JSONResponseError: if the response status
            is not 200.

        """
        uri = '/2013-01-01/search'
        # The service documents ``partial`` as ``true|false``; a Python
        # bool would otherwise be stringified as ``True``/``False``.
        if isinstance(partial, bool):
            partial = 'true' if partial else 'false'
        # Only arguments that were actually supplied are sent.
        query_params = self._set_params((
            ('cursor', cursor),
            ('expr', expr),
            ('facet', facet),
            ('fq', filter_query),
            ('highlight', highlight),
            ('partial', partial),
            ('q', query),
            ('q.options', query_options),
            ('q.parser', query_parser),
            ('return', ret),
            ('size', size),
            ('sort', sort),
            ('start', start),
        ))
        # No JSON payload is defined for this call; an empty JSON object
        # is passed as the request data.
        return self.make_request('POST', uri, expected_status=200,
                                 data=json.dumps({}), headers={},
                                 params=query_params)

    def suggest(self, query, suggester, size=None):
        """
        Retrieves autocomplete suggestions for a partial query string.
        Suggestions enable you to display likely matches before users
        finish typing. In Amazon CloudSearch, suggestions
        are based on the contents of a particular text field. When you
        request suggestions, Amazon CloudSearch finds all of the
        documents whose values in the suggester field start with the
        specified query string. The beginning of the field must match
        the query string to be considered a match.

        For more information about configuring suggesters and
        retrieving suggestions, see `Getting Suggestions`_ in the
        Amazon CloudSearch Developer Guide .

        The endpoint for submitting `Suggest` requests is domain-
        specific. You submit suggest requests to a domain's search
        endpoint. To get the search endpoint for your domain, use the
        Amazon CloudSearch configuration service `DescribeDomains`
        action. A domain's endpoints are also displayed on the domain
        dashboard in the Amazon CloudSearch console.

        :type query: string
        :param query: Specifies the string for which you want to get
            suggestions.

        :type suggester: string
        :param suggester: Specifies the name of the suggester to use to find
            suggested matches.

        :type size: long
        :param size: Specifies the maximum number of suggestions to return.

        :return: The decoded JSON body of the response.
        :raises boto.exception.JSONResponseError: if the response status
            is not 200.

        """
        uri = '/2013-01-01/suggest'
        # Only arguments that were actually supplied are sent.
        query_params = self._set_params((
            ('q', query),
            ('suggester', suggester),
            ('size', size),
        ))
        # No JSON payload is defined for this call; an empty JSON object
        # is passed as the request data.
        return self.make_request('GET', uri, expected_status=200,
                                 data=json.dumps({}), headers={},
                                 params=query_params)

    def upload_documents(self, documents, content_type):
        """
        Posts a batch of documents to a search domain for indexing. A
        document batch is a collection of add and delete operations
        that represent the documents you want to add, update, or
        delete from your domain. Batches can be described in either
        JSON or XML. Each item that you want Amazon CloudSearch to
        return as a search result (such as a product) is represented
        as a document. Every document has a unique ID and one or more
        fields that contain the data that you want to search and
        return in results. Individual documents cannot contain more
        than 1 MB of data. The entire batch cannot exceed 5 MB. To get
        the best possible upload performance, group add and delete
        operations in batches that are close to the 5 MB limit.
        Submitting a large volume of single-document batches can
        overload a domain's document service.

        The endpoint for submitting `UploadDocuments` requests is
        domain-specific. To get the document endpoint for your domain,
        use the Amazon CloudSearch configuration service
        `DescribeDomains` action. A domain's endpoints are also
        displayed on the domain dashboard in the Amazon CloudSearch
        console.

        For more information about formatting your data for Amazon
        CloudSearch, see `Preparing Your Data`_ in the Amazon
        CloudSearch Developer Guide . For more information about
        uploading data for indexing, see `Uploading Data`_ in the
        Amazon CloudSearch Developer Guide .

        :type documents: blob
        :param documents: A batch of documents formatted in JSON or XML.

        :type content_type: string
        :param content_type:
        The format of the batch you are uploading. Amazon CloudSearch supports
            two document batch formats:


        + application/json
        + application/xml

        :return: The decoded JSON body of the response.
        :raises boto.exception.JSONResponseError: if the response status
            is not 200.

        """
        uri = '/2013-01-01/documents/batch'
        headers = {}
        if content_type is not None:
            headers['Content-Type'] = content_type
        # The batch is sent as the request body exactly as supplied.
        return self.make_request('POST', uri, expected_status=200,
                                 data=documents, headers=headers,
                                 params={})

    def make_request(self, verb, resource, headers=None, data='',
                     expected_status=None, params=None):
        """
        Send a request and decode the JSON response.

        :type verb: string
        :param verb: The HTTP method to use.

        :type resource: string
        :param resource: The request path.

        :type headers: dict
        :param headers: Optional request headers; an empty dict is used
            when omitted.

        :type data: string
        :param data: The request body.

        :type expected_status: int
        :param expected_status: The HTTP status that counts as success.

        :type params: dict
        :param params: Optional query parameters.

        :return: The decoded JSON body when the response status equals
            `expected_status`.
        :raises boto.exception.JSONResponseError: for any other status,
            including responses whose body is empty or not valid JSON.
        :raises ValueError: if a response with the expected status has a
            body that cannot be decoded as JSON.
        """
        if headers is None:
            headers = {}
        response = AWSAuthConnection.make_request(
            self, verb, resource, headers=headers, data=data, params=params)
        raw_body = response.read().decode('utf-8')
        succeeded = response.status == expected_status
        try:
            body = json.loads(raw_body)
        except ValueError:
            if succeeded:
                # A success response that is not JSON is unexpected;
                # surface the decoding problem unchanged.
                raise
            # Error responses may carry an empty or non-JSON body.  Keep
            # going so the caller sees the HTTP status and reason rather
            # than an unrelated decoding error.
            body = {'message': raw_body} if raw_body else None
        if succeeded:
            return body
        # NOTE(review): JSONResponseError appears to read its fields from
        # the body with ``.get``, so give it a mapping (or nothing) --
        # confirm against boto.exception.
        if body is not None and not isinstance(body, dict):
            body = {'message': raw_body}
        raise JSONResponseError(response.status, response.reason, body)