1# Copyright (c) 2014 Amazon.com, Inc. or its affiliates.  All Rights Reserved
2#
3# Permission is hereby granted, free of charge, to any person obtaining a
4# copy of this software and associated documentation files (the
5# "Software"), to deal in the Software without restriction, including
6# without limitation the rights to use, copy, modify, merge, publish, dis-
7# tribute, sublicense, and/or sell copies of the Software, and to permit
8# persons to whom the Software is furnished to do so, subject to the fol-
9# lowing conditions:
10#
11# The above copyright notice and this permission notice shall be included
12# in all copies or substantial portions of the Software.
13#
14# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
16# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
17# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20# IN THE SOFTWARE.
21#
22from boto.compat import json
23from boto.exception import JSONResponseError
24from boto.connection import AWSAuthConnection
25from boto.regioninfo import RegionInfo
26from boto.cloudsearchdomain import exceptions
27
28
class CloudSearchDomainConnection(AWSAuthConnection):
    """
    You use the AmazonCloudSearch2013 API to upload documents to a
    search domain and search those documents.

    The endpoints for submitting `UploadDocuments`, `Search`, and
    `Suggest` requests are domain-specific. To get the endpoints for
    your domain, use the Amazon CloudSearch configuration service
    `DescribeDomains` action. The domain endpoints are also displayed
    on the domain dashboard in the Amazon CloudSearch console. You
    submit suggest requests to the search endpoint.

    Because the endpoints are domain-specific, a connection can only
    be created with an explicit ``host`` keyword argument; the
    constructor raises ``ValueError`` without one.

    For more information, see the `Amazon CloudSearch Developer
    Guide`_.
    """
    # API version; used as the first path segment of every request URI.
    APIVersion = "2013-01-01"
    # NOTE(review): presumably read by the base connection / auth handler
    # when signing requests -- confirm against boto.connection.
    AuthServiceName = 'cloudsearch'
    # Region attached to the connection when the caller supplies none.
    DefaultRegionName = "us-east-1"
    DefaultRegionEndpoint = "cloudsearch.us-east-1.amazonaws.com"
    # Exception class raised by make_request for unexpected statuses.
    ResponseError = JSONResponseError

    # Service fault names mapped to their exception classes.
    # NOTE(review): make_request does not consult this table; every failure
    # is raised as ``ResponseError``.
    _faults = {
        "SearchException": exceptions.SearchException,
        "DocumentServiceException": exceptions.DocumentServiceException,
    }

    def __init__(self, **kwargs):
        """
        Creates a connection bound to one CloudSearch domain endpoint.

        :type host: string
        :param host: Required. The domain-specific endpoint to send
            requests to.

        :type region: :class:`boto.regioninfo.RegionInfo`
        :param region: Optional. The region to record on the connection.
            When omitted or falsy, a `RegionInfo` built from
            `DefaultRegionName` and `DefaultRegionEndpoint` is used.

        All remaining keyword arguments are passed unchanged to the
        `AWSAuthConnection` constructor.

        :raises ValueError: if `host` is missing or `None`.
        """
        # Always remove 'region' from kwargs: the base constructor is only
        # given the remaining arguments, so an explicit ``region=None`` must
        # not be forwarded to it.
        region = kwargs.pop('region', None)
        if kwargs.get('host') is None:
            raise ValueError(
                'The argument, host, must be provided when creating a '
                'CloudSearchDomainConnection because its methods require the '
                'specific domain\'s endpoint in order to successfully make '
                'requests to that CloudSearch Domain.'
            )
        if not region:
            region = RegionInfo(self, self.DefaultRegionName,
                                self.DefaultRegionEndpoint)
        super(CloudSearchDomainConnection, self).__init__(**kwargs)
        self.region = region

    def _required_auth_capability(self):
        """Returns the auth capabilities this connection requires."""
        return ['hmac-v4']

    @staticmethod
    def _query_params(pairs):
        """
        Builds a query-parameter dict from `(name, value)` pairs,
        leaving out every pair whose value is `None` so that unset
        optional arguments are not sent.
        """
        return dict((name, value) for name, value in pairs
                    if value is not None)

    def search(self, query, cursor=None, expr=None, facet=None,
               filter_query=None, highlight=None, partial=None,
               query_options=None, query_parser=None, ret=None, size=None,
               sort=None, start=None):
        """
        Retrieves a list of documents that match the specified search
        criteria. How you specify the search criteria depends on which
        query parser you use. Amazon CloudSearch supports four query
        parsers:

        + `simple`: search all `text` and `text-array` fields for the
          specified string. Search for phrases, individual terms, and
          prefixes.
        + `structured`: search specific fields, construct compound
          queries using Boolean operators, and use advanced features
          such as term boosting and proximity searching.
        + `lucene`: specify search criteria using the Apache Lucene
          query parser syntax.
        + `dismax`: specify search criteria using the simplified
          subset of the Apache Lucene query parser syntax defined by the
          DisMax query parser.

        For more information, see `Searching Your Data`_ in the Amazon
        CloudSearch Developer Guide.

        The endpoint for submitting `Search` requests is domain-
        specific. You submit search requests to a domain's search
        endpoint. To get the search endpoint for your domain, use the
        Amazon CloudSearch configuration service `DescribeDomains`
        action. A domain's endpoints are also displayed on the domain
        dashboard in the Amazon CloudSearch console.

        Arguments left as `None` are not sent with the request.

        :type query: string
        :param query: Specifies the search criteria for the request. How you
            specify the search criteria depends on the query parser used for
            the request and the parser options specified in the
            `queryOptions` parameter. By default, the `simple` query parser
            is used to process requests. To use the `structured`, `lucene`,
            or `dismax` query parser, you must also specify the
            `queryParser` parameter.
            For more information about specifying search criteria, see
            `Searching Your Data`_ in the Amazon CloudSearch Developer
            Guide.

        :type cursor: string
        :param cursor: Retrieves a cursor value you can use to page through
            large result sets. Use the `size` parameter to control the number
            of hits to include in each response. You can specify either the
            `cursor` or `start` parameter in a request; they are mutually
            exclusive. To get the first cursor, set the cursor value to
            `initial`. In subsequent requests, specify the cursor value
            returned in the hits section of the response.
            For more information, see `Paginating Results`_ in the Amazon
            CloudSearch Developer Guide.

        :type expr: string
        :param expr: Defines one or more numeric expressions that can be used
            to sort results or specify search or filter criteria. You can
            also specify expressions as return fields.
            For more information about defining and using expressions, see
            `Configuring Expressions`_ in the Amazon CloudSearch Developer
            Guide.

        :type facet: string
        :param facet: Specifies one or more fields for which to get facet
            information, and options that control how the facet information
            is returned. Each specified field must be facet-enabled in the
            domain configuration. The fields and options are specified in
            JSON using the form::

                {"FIELD":{"OPTION":VALUE,"OPTION":"STRING"},
                 "FIELD":{"OPTION":VALUE,"OPTION":"STRING"}}

            You can specify the following faceting options:

            + `buckets` specifies an array of the facet values or ranges to
              count. Ranges are specified using the same syntax that you use
              to search for a range of values. For more information, see
              `Searching for a Range of Values`_ in the Amazon CloudSearch
              Developer Guide. Buckets are returned in the order they are
              specified in the request. The `sort` and `size` options are
              not valid if you specify `buckets`.
            + `size` specifies the maximum number of facets to include in
              the results. By default, Amazon CloudSearch returns counts for
              the top 10. The `size` parameter is only valid when you
              specify the `sort` option; it cannot be used in conjunction
              with `buckets`.
            + `sort` specifies how you want to sort the facets in the
              results: `bucket` or `count`. Specify `bucket` to sort
              alphabetically or numerically by facet value (in ascending
              order). Specify `count` to sort by the facet counts computed
              for each facet value (in descending order). To retrieve facet
              counts for particular values or ranges of values, use the
              `buckets` option instead of `sort`.

            If no facet options are specified, facet counts are computed for
            all field values, the facets are sorted by facet count, and the
            top 10 facets are returned in the results.

            For more information, see `Getting and Using Facet
            Information`_ in the Amazon CloudSearch Developer Guide.

        :type filter_query: string
        :param filter_query: Specifies a structured query that filters the
            results of a search without affecting how the results are scored
            and sorted. You use `filterQuery` in conjunction with the `query`
            parameter to filter the documents that match the constraints
            specified in the `query` parameter. Specifying a filter controls
            only which matching documents are included in the results; it has
            no effect on how they are scored and sorted. The `filterQuery`
            parameter supports the full structured query syntax.
            For more information about using filters, see `Filtering Matching
            Documents`_ in the Amazon CloudSearch Developer Guide.

        :type highlight: string
        :param highlight: Retrieves highlights for matches in the specified
            `text` or `text-array` fields. Each specified field must be
            highlight enabled in the domain configuration. The fields and
            options are specified in JSON using the form::

                {"FIELD":{"OPTION":VALUE,"OPTION":"STRING"},
                 "FIELD":{"OPTION":VALUE,"OPTION":"STRING"}}

            You can specify the following highlight options:

            + `format`: specifies the format of the data in the text field:
              `text` or `html`. When data is returned as HTML, all
              non-alphanumeric characters are encoded. The default is
              `html`.
            + `max_phrases`: specifies the maximum number of occurrences of
              the search term(s) you want to highlight. By default, the
              first occurrence is highlighted.
            + `pre_tag`: specifies the string to prepend to an occurrence of
              a search term. The default for HTML highlights is `<em>`. The
              default for text highlights is `*`.
            + `post_tag`: specifies the string to append to an occurrence of
              a search term. The default for HTML highlights is `</em>`. The
              default for text highlights is `*`.

            If no highlight options are specified for a field, the returned
            field text is treated as HTML and the first match is highlighted
            with emphasis tags: `<em>search-term</em>`.

        :type partial: boolean
        :param partial: Enables partial results to be returned if one or more
            index partitions are unavailable. When your search index is
            partitioned across multiple search instances, by default Amazon
            CloudSearch only returns results if every partition can be
            queried. This means that the failure of a single search instance
            can result in 5xx (internal server) errors. When you enable
            partial results, Amazon CloudSearch returns whatever results are
            available and includes the percentage of documents searched in
            the search results (percent-searched). This enables you to more
            gracefully degrade your users' search experience. For example,
            rather than displaying no results, you could display the partial
            results and a message indicating that the results might be
            incomplete due to a temporary system outage.
            A Python `bool` is sent as the lowercase string `true` or
            `false`; any other value is sent unchanged.

        :type query_options: string
        :param query_options: Configures options for the query parser
            specified in the `queryParser` parameter.

            The options you can configure vary according to which parser you
            use:

            + `defaultOperator`: The default operator used to combine
              individual terms in the search string. For example:
              `defaultOperator: 'or'`. For the `dismax` parser, you specify
              a percentage that represents the percentage of terms in the
              search string (rounded down) that must match, rather than a
              default operator. A value of `0%` is the equivalent to OR, and
              a value of `100%` is equivalent to AND. The percentage must be
              specified as a value in the range 0-100 followed by the
              percent (%) symbol. For example, `defaultOperator: 50%`. Valid
              values: `and`, `or`, a percentage in the range 0%-100%
              (`dismax`). Default: `and` (`simple`, `structured`, `lucene`)
              or `100` (`dismax`). Valid for: `simple`, `structured`,
              `lucene`, and `dismax`.
            + `fields`: An array of the fields to search when no fields are
              specified in a search. If no fields are specified in a search
              and this option is not specified, all text and text-array
              fields are searched. You can specify a weight for each field
              to control the relative importance of each field when Amazon
              CloudSearch calculates relevance scores. To specify a field
              weight, append a caret (`^`) symbol and the weight to the
              field name. For example, to boost the importance of the
              `title` field over the `description` field you could specify:
              `"fields":["title^5","description"]`. Valid values: The name
              of any configured field and an optional numeric value greater
              than zero. Default: All `text` and `text-array` fields. Valid
              for: `simple`, `structured`, `lucene`, and `dismax`.
            + `operators`: An array of the operators or special characters
              you want to disable for the simple query parser. If you
              disable the `and`, `or`, or `not` operators, the corresponding
              operators (`+`, `|`, `-`) have no special meaning and are
              dropped from the search string. Similarly, disabling `prefix`
              disables the wildcard operator (`*`) and disabling `phrase`
              disables the ability to search for phrases by enclosing
              phrases in double quotes. Disabling precedence disables the
              ability to control order of precedence using parentheses.
              Disabling `near` disables the ability to use the ~ operator to
              perform a sloppy phrase search. Disabling the `fuzzy` operator
              disables the ability to use the ~ operator to perform a fuzzy
              search. `escape` disables the ability to use a backslash
              (`\\`) to escape special characters within the search string.
              Disabling whitespace is an advanced option that prevents the
              parser from tokenizing on whitespace, which can be useful for
              Vietnamese. (It prevents Vietnamese words from being split
              incorrectly.) For example, you could disable all operators
              other than the phrase operator to support just simple term and
              phrase queries: `"operators":["and","not","or", "prefix"]`.
              Valid values: `and`, `escape`, `fuzzy`, `near`, `not`, `or`,
              `phrase`, `precedence`, `prefix`, `whitespace`. Default: All
              operators and special characters are enabled. Valid for:
              `simple`.
            + `phraseFields`: An array of the `text` or `text-array` fields
              you want to use for phrase searches. When the terms in the
              search string appear in close proximity within a field, the
              field scores higher. You can specify a weight for each field
              to boost that score. The `phraseSlop` option controls how much
              the matches can deviate from the search string and still be
              boosted. To specify a field weight, append a caret (`^`)
              symbol and the weight to the field name. For example, to boost
              phrase matches in the `title` field over the `abstract` field,
              you could specify: `"phraseFields":["title^3", "abstract"]`.
              Valid values: The name of any `text` or `text-array` field and
              an optional numeric value greater than zero. Default: No
              fields. If you don't specify any fields with `phraseFields`,
              proximity scoring is disabled even if `phraseSlop` is
              specified. Valid for: `dismax`.
            + `phraseSlop`: An integer value that specifies how much matches
              can deviate from the search phrase and still be boosted
              according to the weights specified in the `phraseFields`
              option; for example, `phraseSlop: 2`. You must also specify
              `phraseFields` to enable proximity scoring. Valid values:
              positive integers. Default: 0. Valid for: `dismax`.
            + `explicitPhraseSlop`: An integer value that specifies how much
              a match can deviate from the search phrase when the phrase is
              enclosed in double quotes in the search string. (Phrases that
              exceed this proximity distance are not considered a match.)
              For example, to specify a slop of three for dismax phrase
              queries, you would specify `"explicitPhraseSlop":3`. Valid
              values: positive integers. Default: 0. Valid for: `dismax`.
            + `tieBreaker`: When a term in the search string is found in a
              document's field, a score is calculated for that field based
              on how common the word is in that field compared to other
              documents. If the term occurs in multiple fields within a
              document, by default only the highest scoring field
              contributes to the document's overall score. You can specify a
              `tieBreaker` value to enable the matches in lower-scoring
              fields to contribute to the document's score. That way, if two
              documents have the same max field score for a particular term,
              the score for the document that has matches in more fields
              will be higher. The formula for calculating the score with a
              tieBreaker is `(max field score) + (tieBreaker) * (sum of the
              scores for the rest of the matching fields)`. Set `tieBreaker`
              to 0 to disregard all but the highest scoring field (pure
              max): `"tieBreaker":0`. Set to 1 to sum the scores from all
              fields (pure sum): `"tieBreaker":1`. Valid values: 0.0 to 1.0.
              Default: 0.0. Valid for: `dismax`.

        :type query_parser: string
        :param query_parser: Specifies which query parser to use to process
            the request. If `queryParser` is not specified, Amazon
            CloudSearch uses the `simple` query parser.

            Amazon CloudSearch supports four query parsers:

            + `simple`: perform simple searches of `text` and `text-array`
              fields. By default, the `simple` query parser searches all
              `text` and `text-array` fields. You can specify which fields
              to search by with the `queryOptions` parameter. If you prefix
              a search term with a plus sign (+) documents must contain the
              term to be considered a match. (This is the default, unless
              you configure the default operator with the `queryOptions`
              parameter.) You can use the `-` (NOT), `|` (OR), and `*`
              (wildcard) operators to exclude particular terms, find results
              that match any of the specified terms, or search for a prefix.
              To search for a phrase rather than individual terms, enclose
              the phrase in double quotes. For more information, see
              `Searching for Text`_ in the Amazon CloudSearch Developer
              Guide.
            + `structured`: perform advanced searches by combining multiple
              expressions to define the search criteria. You can also search
              within particular fields, search for values and ranges of
              values, and use advanced options such as term boosting,
              `matchall`, and `near`. For more information, see
              `Constructing Compound Queries`_ in the Amazon CloudSearch
              Developer Guide.
            + `lucene`: search using the Apache Lucene query parser syntax.
              For more information, see `Apache Lucene Query Parser
              Syntax`_.
            + `dismax`: search using the simplified subset of the Apache
              Lucene query parser syntax defined by the DisMax query parser.
              For more information, see `DisMax Query Parser Syntax`_.

        :type ret: string
        :param ret: Specifies the field and expression values to include in
            the response. Multiple fields or expressions are specified as a
            comma-separated list. By default, a search response includes all
            return enabled fields (`_all_fields`). To return only the
            document IDs for the matching documents, specify `_no_fields`.
            To retrieve the relevance score calculated for each document,
            specify `_score`.

        :type size: long
        :param size: Specifies the maximum number of search hits to include
            in the response.

        :type sort: string
        :param sort: Specifies the fields or custom expressions to use to
            sort the search results. Multiple fields or expressions are
            specified as a comma-separated list. You must specify the sort
            direction (`asc` or `desc`) for each field; for example,
            `year desc,title asc`. To use a field to sort results, the field
            must be sort-enabled in the domain configuration. Array type
            fields cannot be used for sorting. If no `sort` parameter is
            specified, results are sorted by their default relevance scores
            in descending order: `_score desc`. You can also sort by
            document ID (`_id asc`) and version (`_version desc`).
            For more information, see `Sorting Results`_ in the Amazon
            CloudSearch Developer Guide.

        :type start: long
        :param start: Specifies the offset of the first search hit you want
            to return. Note that the result set is zero-based; the first
            result is at index 0. You can specify either the `start` or
            `cursor` parameter in a request; they are mutually exclusive.
            For more information, see `Paginating Results`_ in the Amazon
            CloudSearch Developer Guide.

        :return: The response body decoded from JSON, as returned by
            `make_request`.

        :raises: `ResponseError` (`JSONResponseError` by default) if the
            response status is not 200.
        """
        # The service documents ``partial`` as true|false; str(True) would
        # otherwise go out on the wire as "True".
        if isinstance(partial, bool):
            partial = 'true' if partial else 'false'
        query_params = self._query_params([
            ('cursor', cursor),
            ('expr', expr),
            ('facet', facet),
            ('fq', filter_query),
            ('highlight', highlight),
            ('partial', partial),
            ('q', query),
            ('q.options', query_options),
            ('q.parser', query_parser),
            ('return', ret),
            ('size', size),
            ('sort', sort),
            ('start', start),
        ])
        # All criteria travel in the query string; the request body is an
        # empty JSON object.
        return self.make_request('POST', '/%s/search' % self.APIVersion,
                                 expected_status=200,
                                 data=json.dumps({}), headers={},
                                 params=query_params)

    def suggest(self, query, suggester, size=None):
        """
        Retrieves autocomplete suggestions for a partial query string.
        Suggestions enable you to display likely matches before users
        finish typing. In Amazon CloudSearch, suggestions are based on
        the contents of a particular text field. When you request
        suggestions, Amazon CloudSearch finds all of the documents
        whose values in the suggester field start with the specified
        query string. The beginning of the field must match the query
        string to be considered a match.

        For more information about configuring suggesters and
        retrieving suggestions, see `Getting Suggestions`_ in the
        Amazon CloudSearch Developer Guide.

        The endpoint for submitting `Suggest` requests is domain-
        specific. You submit suggest requests to a domain's search
        endpoint. To get the search endpoint for your domain, use the
        Amazon CloudSearch configuration service `DescribeDomains`
        action. A domain's endpoints are also displayed on the domain
        dashboard in the Amazon CloudSearch console.

        Arguments passed as `None` are not sent with the request.

        :type query: string
        :param query: Specifies the string for which you want to get
            suggestions.

        :type suggester: string
        :param suggester: Specifies the name of the suggester to use to find
            suggested matches.

        :type size: long
        :param size: Specifies the maximum number of suggestions to return.

        :return: The response body decoded from JSON, as returned by
            `make_request`.

        :raises: `ResponseError` (`JSONResponseError` by default) if the
            response status is not 200.
        """
        query_params = self._query_params([
            ('q', query),
            ('suggester', suggester),
            ('size', size),
        ])
        return self.make_request('GET', '/%s/suggest' % self.APIVersion,
                                 expected_status=200,
                                 data=json.dumps({}), headers={},
                                 params=query_params)

    def upload_documents(self, documents, content_type):
        """
        Posts a batch of documents to a search domain for indexing. A
        document batch is a collection of add and delete operations
        that represent the documents you want to add, update, or
        delete from your domain. Batches can be described in either
        JSON or XML. Each item that you want Amazon CloudSearch to
        return as a search result (such as a product) is represented
        as a document. Every document has a unique ID and one or more
        fields that contain the data that you want to search and
        return in results. Individual documents cannot contain more
        than 1 MB of data. The entire batch cannot exceed 5 MB. To get
        the best possible upload performance, group add and delete
        operations in batches that are close to the 5 MB limit.
        Submitting a large volume of single-document batches can
        overload a domain's document service.

        The endpoint for submitting `UploadDocuments` requests is
        domain-specific. To get the document endpoint for your domain,
        use the Amazon CloudSearch configuration service
        `DescribeDomains` action. A domain's endpoints are also
        displayed on the domain dashboard in the Amazon CloudSearch
        console.

        For more information about formatting your data for Amazon
        CloudSearch, see `Preparing Your Data`_ in the Amazon
        CloudSearch Developer Guide. For more information about
        uploading data for indexing, see `Uploading Data`_ in the
        Amazon CloudSearch Developer Guide.

        :type documents: blob
        :param documents: A batch of documents formatted in JSON or XML.
            It is sent as the request body without modification.

        :type content_type: string
        :param content_type: The format of the batch you are uploading,
            sent as the `Content-Type` header (no header is set when this
            is `None`). Amazon CloudSearch supports two document batch
            formats:

            + application/json
            + application/xml

        :return: The response body decoded from JSON, as returned by
            `make_request`.

        :raises: `ResponseError` (`JSONResponseError` by default) if the
            response status is not 200.
        """
        headers = {}
        if content_type is not None:
            headers['Content-Type'] = content_type
        return self.make_request('POST',
                                 '/%s/documents/batch' % self.APIVersion,
                                 expected_status=200,
                                 data=documents, headers=headers,
                                 params={})

    def make_request(self, verb, resource, headers=None, data='',
                     expected_status=None, params=None):
        """
        Sends a request through `AWSAuthConnection.make_request` and
        decodes the JSON response.

        :type verb: string
        :param verb: The HTTP method to use.

        :type resource: string
        :param resource: The request path.

        :type headers: dict
        :param headers: Request headers; an empty dict is used when `None`.

        :type data: string
        :param data: The request body.

        :type expected_status: int
        :param expected_status: The response status that counts as
            success. Any other status raises an error; with the default
            of `None` no status matches.

        :type params: dict
        :param params: Query-string parameters.

        :return: The response body decoded from UTF-8 JSON.

        :raises: `ResponseError` (`JSONResponseError` by default) when the
            response status differs from `expected_status`.
        :raises ValueError: when a response with the expected status does
            not contain valid JSON.
        """
        if headers is None:
            headers = {}
        response = AWSAuthConnection.make_request(
            self, verb, resource, headers=headers, data=data, params=params)
        raw_body = response.read().decode('utf-8')
        succeeded = response.status == expected_status
        try:
            body = json.loads(raw_body)
        except ValueError:
            if succeeded:
                raise
            # An error response with an empty or non-JSON body must still
            # surface as a response error carrying the HTTP status, not as
            # a bare decoding failure.
            body = None
        if succeeded:
            return body
        # NOTE(review): assumes ResponseError reads the body as a mapping
        # (as boto's JSONResponseError appears to) -- confirm. Any other
        # payload is wrapped so its text is still reported.
        if not isinstance(body, dict):
            body = {'message': raw_body} if raw_body else None
        raise self.ResponseError(response.status, response.reason, body)