1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.example.android.wiktionary;
18 
19 import org.json.JSONArray;
20 import org.json.JSONException;
21 import org.json.JSONObject;
22 
23 import android.net.Uri;
24 import android.text.TextUtils;
25 import android.webkit.WebView;
26 
27 import java.util.ArrayList;
28 import java.util.HashSet;
29 import java.util.List;
30 import java.util.regex.Matcher;
31 import java.util.regex.Pattern;
32 
33 /**
34  * Extended version of {@link SimpleWikiHelper}. This version adds methods to
35  * pick a random word, and to format generic wiki-style text into HTML.
36  */
37 public class ExtendedWikiHelper extends SimpleWikiHelper {
38     /**
39      * HTML style sheet to include with any {@link #formatWikiText(String)} HTML
40      * results. It formats nicely for a mobile screen, and hides some content
41      * boxes to keep things tidy.
42      */
43     private static final String STYLE_SHEET = "<style>h2 {font-size:1.2em;font-weight:normal;} " +
44             "a {color:#6688cc;} ol {padding-left:1.5em;} blockquote {margin-left:0em;} " +
45             ".interProject, .noprint {display:none;} " +
46             "li, blockquote {margin-top:0.5em;margin-bottom:0.5em;}</style>";
47 
48     /**
49      * Pattern of section titles we're interested in showing. This trims out
50      * extra sections that can clutter things up on a mobile screen.
51      */
52     private static final Pattern sValidSections =
53         Pattern.compile("(verb|noun|adjective|pronoun|interjection)", Pattern.CASE_INSENSITIVE);
54 
55     /**
56      * Pattern that can be used to split a returned wiki page into its various
57      * sections. Doesn't treat children sections differently.
58      */
59     private static final Pattern sSectionSplit =
60         Pattern.compile("^=+(.+?)=+.+?(?=^=)", Pattern.MULTILINE | Pattern.DOTALL);
61 
62     /**
63      * When picking random words in {@link #getRandomWord()}, we sometimes
64      * encounter special articles or templates. This pattern ignores any words
65      * like those, usually because they have ":" or other punctuation.
66      */
67     private static final Pattern sInvalidWord = Pattern.compile("[^A-Za-z0-9 ]");
68 
69     /**
70      * {@link Uri} authority to use when creating internal links.
71      */
72     public static final String WIKI_AUTHORITY = "wiktionary";
73 
74     /**
75      * {@link Uri} host to use when creating internal links.
76      */
77     public static final String WIKI_LOOKUP_HOST = "lookup";
78 
79     /**
80      * Mime-type to use when showing parsed results in a {@link WebView}.
81      */
82     public static final String MIME_TYPE = "text/html";
83 
84     /**
85      * Encoding to use when showing parsed results in a {@link WebView}.
86      */
87     public static final String ENCODING = "utf-8";
88 
89     /**
90      * {@link Uri} to use when requesting a random page.
91      */
92     private static final String WIKTIONARY_RANDOM =
93         "http://en.wiktionary.org/w/api.php?action=query&list=random&format=json";
94 
95     /**
96      * Fake section to insert at the bottom of a wiki response before parsing.
97      * This ensures that {@link #sSectionSplit} will always catch the last
98      * section, as it uses section headers in its searching.
99      */
100     private static final String STUB_SECTION = "\n=Stub section=";
101 
102     /**
103      * Number of times to try finding a random word in {@link #getRandomWord()}.
104      * These failures are usually when the found word fails the
105      * {@link #sInvalidWord} test, or when a network error happens.
106      */
107     private static final int RANDOM_TRIES = 3;
108 
109     /**
110      * Internal class to hold a wiki formatting rule. It's mostly a wrapper to
111      * simplify {@link Matcher#replaceAll(String)}.
112      */
113     private static class FormatRule {
114         private Pattern mPattern;
115         private String mReplaceWith;
116 
117         /**
118          * Create a wiki formatting rule.
119          *
120          * @param pattern Search string to be compiled into a {@link Pattern}.
121          * @param replaceWith String to replace any found occurances with. This
122          *            string can also include back-references into the given
123          *            pattern.
124          * @param flags Any flags to compile the {@link Pattern} with.
125          */
FormatRule(String pattern, String replaceWith, int flags)126         public FormatRule(String pattern, String replaceWith, int flags) {
127             mPattern = Pattern.compile(pattern, flags);
128             mReplaceWith = replaceWith;
129         }
130 
131         /**
132          * Create a wiki formatting rule.
133          *
134          * @param pattern Search string to be compiled into a {@link Pattern}.
135          * @param replaceWith String to replace any found occurances with. This
136          *            string can also include back-references into the given
137          *            pattern.
138          */
FormatRule(String pattern, String replaceWith)139         public FormatRule(String pattern, String replaceWith) {
140             this(pattern, replaceWith, 0);
141         }
142 
143         /**
144          * Apply this formatting rule to the given input string, and return the
145          * resulting new string.
146          */
apply(String input)147         public String apply(String input) {
148             Matcher m = mPattern.matcher(input);
149             return m.replaceAll(mReplaceWith);
150         }
151 
152     }
153 
154     /**
155      * List of internal formatting rules to apply when parsing wiki text. These
156      * include indenting various bullets, apply italic and bold styles, and
157      * adding internal linking.
158      */
159     private static final List<FormatRule> sFormatRules = new ArrayList<FormatRule>();
160 
161     static {
162         // Format header blocks and wrap outside content in ordered list
sFormatRules.add(new FormatRule("^=+(.+?)=+", "</ol><h2>$1</h2><ol>", Pattern.MULTILINE))163         sFormatRules.add(new FormatRule("^=+(.+?)=+", "</ol><h2>$1</h2><ol>",
164                 Pattern.MULTILINE));
165 
166         // Indent quoted blocks, handle ordered and bullet lists
sFormatRules.add(new FormatRule("^#+\\\\*?:(.+?)$", "<blockquote>$1</blockquote>", Pattern.MULTILINE))167         sFormatRules.add(new FormatRule("^#+\\*?:(.+?)$", "<blockquote>$1</blockquote>",
168                 Pattern.MULTILINE));
sFormatRules.add(new FormatRule("^#+:?\\\\*(.+?)$", "<ul><li>$1</li></ul>", Pattern.MULTILINE))169         sFormatRules.add(new FormatRule("^#+:?\\*(.+?)$", "<ul><li>$1</li></ul>",
170                 Pattern.MULTILINE));
sFormatRules.add(new FormatRule("^#+(.+?)$", "<li>$1</li>", Pattern.MULTILINE))171         sFormatRules.add(new FormatRule("^#+(.+?)$", "<li>$1</li>",
172                 Pattern.MULTILINE));
173 
174         // Add internal links
sFormatRules.add(new FormatRule("\\\\[\\\\[([^:\\\\|\\\\]]+)\\\\]\\\\]", String.format("<a href=\\"%s://%s/$1\\">$1</a>", WIKI_AUTHORITY, WIKI_LOOKUP_HOST)))175         sFormatRules.add(new FormatRule("\\[\\[([^:\\|\\]]+)\\]\\]",
176                 String.format("<a href=\"%s://%s/$1\">$1</a>", WIKI_AUTHORITY, WIKI_LOOKUP_HOST)));
sFormatRules.add(new FormatRule("\\\\[\\\\[([^:\\\\|\\\\]]+)\\\\|([^\\\\]]+)\\\\]\\\\]", String.format("<a href=\\"%s://%s/$1\\">$2</a>", WIKI_AUTHORITY, WIKI_LOOKUP_HOST)))177         sFormatRules.add(new FormatRule("\\[\\[([^:\\|\\]]+)\\|([^\\]]+)\\]\\]",
178                 String.format("<a href=\"%s://%s/$1\">$2</a>", WIKI_AUTHORITY, WIKI_LOOKUP_HOST)));
179 
180         // Add bold and italic formatting
sFormatRules.add(new FormatRule("'''(.+?)'''", "<b>$1</b>"))181         sFormatRules.add(new FormatRule("'''(.+?)'''", "<b>$1</b>"));
sFormatRules.add(new FormatRule("([^'])''([^'].*?[^'])''([^'])", "$1<i>$2</i>$3"))182         sFormatRules.add(new FormatRule("([^'])''([^'].*?[^'])''([^'])", "$1<i>$2</i>$3"));
183 
184         // Remove odd category links and convert remaining links into flat text
sFormatRules.add(new FormatRule("(\\\\{+.+?\\\\}+|\\\\[\\\\[[^:]+:[^\\\\\\\\|\\\\]]+\\\\]\\\\]|" + "\\\\[http.+?\\\\]|\\\\[\\\\[Category:.+?\\\\]\\\\])", "", Pattern.MULTILINE | Pattern.DOTALL))185         sFormatRules.add(new FormatRule("(\\{+.+?\\}+|\\[\\[[^:]+:[^\\\\|\\]]+\\]\\]|" +
186                 "\\[http.+?\\]|\\[\\[Category:.+?\\]\\])", "", Pattern.MULTILINE | Pattern.DOTALL));
sFormatRules.add(new FormatRule("\\\\[\\\\[([^\\\\|\\\\]]+\\\\|)?(.+?)\\\\]\\\\]", "$2", Pattern.MULTILINE))187         sFormatRules.add(new FormatRule("\\[\\[([^\\|\\]]+\\|)?(.+?)\\]\\]", "$2",
188                 Pattern.MULTILINE));
189 
190     }
191 
192     /**
193      * Query the Wiktionary API to pick a random dictionary word. Will try
194      * multiple times to find a valid word before giving up.
195      *
196      * @return Random dictionary word, or null if no valid word was found.
197      * @throws ApiException If any connection or server error occurs.
198      * @throws ParseException If there are problems parsing the response.
199      */
getRandomWord()200     public static String getRandomWord() throws ApiException, ParseException {
201         // Keep trying a few times until we find a valid word
202         int tries = 0;
203         while (tries++ < RANDOM_TRIES) {
204             // Query the API for a random word
205             String content = getUrlContent(WIKTIONARY_RANDOM);
206             try {
207                 // Drill into the JSON response to find the returned word
208                 JSONObject response = new JSONObject(content);
209                 JSONObject query = response.getJSONObject("query");
210                 JSONArray random = query.getJSONArray("random");
211                 JSONObject word = random.getJSONObject(0);
212                 String foundWord = word.getString("title");
213 
214                 // If we found an actual word, and it wasn't rejected by our invalid
215                 // filter, then accept and return it.
216                 if (foundWord != null &&
217                         !sInvalidWord.matcher(foundWord).find()) {
218                     return foundWord;
219                 }
220             } catch (JSONException e) {
221                 throw new ParseException("Problem parsing API response", e);
222             }
223         }
224 
225         // No valid word found in number of tries, so return null
226         return null;
227     }
228 
229     /**
230      * Format the given wiki-style text into formatted HTML content. This will
231      * create headers, lists, internal links, and style formatting for any wiki
232      * markup found.
233      *
234      * @param wikiText The raw text to format, with wiki-markup included.
235      * @return HTML formatted content, ready for display in {@link WebView}.
236      */
formatWikiText(String wikiText)237     public static String formatWikiText(String wikiText) {
238         if (wikiText == null) {
239             return null;
240         }
241 
242         // Insert a fake last section into the document so our section splitter
243         // can correctly catch the last section.
244         wikiText = wikiText.concat(STUB_SECTION);
245 
246         // Read through all sections, keeping only those matching our filter,
247         // and only including the first entry for each title.
248         HashSet<String> foundSections = new HashSet<String>();
249         StringBuilder builder = new StringBuilder();
250 
251         Matcher sectionMatcher = sSectionSplit.matcher(wikiText);
252         while (sectionMatcher.find()) {
253             String title = sectionMatcher.group(1);
254             if (!foundSections.contains(title) &&
255                     sValidSections.matcher(title).matches()) {
256                 String sectionContent = sectionMatcher.group();
257                 foundSections.add(title);
258                 builder.append(sectionContent);
259             }
260         }
261 
262         // Our new wiki text is the selected sections only
263         wikiText = builder.toString();
264 
265         // Apply all formatting rules, in order, to the wiki text
266         for (FormatRule rule : sFormatRules) {
267             wikiText = rule.apply(wikiText);
268         }
269 
270         // Return the resulting HTML with style sheet, if we have content left
271         if (!TextUtils.isEmpty(wikiText)) {
272             return STYLE_SHEET + wikiText;
273         } else {
274             return null;
275         }
276     }
277 
278 }
279