1 /*
2  * Copyright (C) 2007 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.android.util;
18 
19 import java.util.ArrayList;
20 import java.util.HashMap;
21 import java.util.LinkedHashMap;
22 import java.util.regex.Matcher;
23 import java.util.regex.Pattern;
24 import java.util.Set;
25 import java.util.List;
26 
/**
 *
 * Logic for parsing a text message typed by the user, looking for smileys,
 * urls, acronyms, formatting (e.g., '*'s for bold), me commands
 * (e.g., "/me is asleep"), and punctuation.
 *
 * It constructs an array, which breaks the text up into its
 * constituent pieces, which we return to the client.
 *
 */
37 public abstract class AbstractMessageParser {
38 /**
39  * Interface representing the set of resources needed by a message parser
40  *
41  * @author jessan (Jessan Hutchison-Quillian)
42  */
43   public static interface Resources {
44 
45     /** Get the known set of URL schemes. */
getSchemes()46     public Set<String> getSchemes();
47 
48     /** Get the possible values for the last part of a domain name.
49      *  Values are expected to be reversed in the Trie.
50      */
getDomainSuffixes()51     public TrieNode getDomainSuffixes();
52 
53     /** Get the smileys accepted by the parser. */
getSmileys()54     public TrieNode getSmileys();
55 
56     /** Get the acronyms accepted by the parser. */
getAcronyms()57     public TrieNode getAcronyms();
58   }
59 
60   /**
61    * Subclasses must define the schemes, domains, smileys and acronyms
62    * that are necessary for parsing
63    */
getResources()64   protected abstract Resources getResources();
65 
66   /** Music note that indicates user is listening to a music track. */
67   public static final String musicNote = "\u266B ";
68 
69   private String text;
70   private int nextChar;
71   private int nextClass;
72   private ArrayList<Part> parts;
73   private ArrayList<Token> tokens;
74   private HashMap<Character,Format> formatStart;
75   private boolean parseSmilies;
76   private boolean parseAcronyms;
77   private boolean parseFormatting;
78   private boolean parseUrls;
79   private boolean parseMeText;
80   private boolean parseMusic;
81 
82   /**
83    * Create a message parser to parse urls, formatting, acronyms, smileys,
84    * /me text and  music
85    *
86    * @param text the text to parse
87    */
AbstractMessageParser(String text)88   public AbstractMessageParser(String text) {
89     this(text, true, true, true, true, true, true);
90   }
91 
92   /**
93    * Create a message parser, specifying the kinds of text to parse
94    *
95    * @param text the text to parse
96    *
97    */
AbstractMessageParser(String text, boolean parseSmilies, boolean parseAcronyms, boolean parseFormatting, boolean parseUrls, boolean parseMusic, boolean parseMeText)98   public AbstractMessageParser(String text, boolean parseSmilies,
99       boolean parseAcronyms, boolean parseFormatting, boolean parseUrls,
100       boolean parseMusic, boolean parseMeText) {
101     this.text = text;
102     this.nextChar = 0;
103     this.nextClass = 10;
104     this.parts = new ArrayList<Part>();
105     this.tokens = new ArrayList<Token>();
106     this.formatStart = new HashMap<Character,Format>();
107     this.parseSmilies = parseSmilies;
108     this.parseAcronyms = parseAcronyms;
109     this.parseFormatting = parseFormatting;
110     this.parseUrls = parseUrls;
111     this.parseMusic = parseMusic;
112     this.parseMeText = parseMeText;
113   }
114 
115   /** Returns the raw text being parsed. */
getRawText()116   public final String getRawText() { return text; }
117 
118   /** Return the number of parts. */
getPartCount()119   public final int getPartCount() { return parts.size(); }
120 
121   /** Return the part at the given index. */
getPart(int index)122   public final Part getPart(int index) { return parts.get(index); }
123 
124   /** Return the list of parts from the parsed text */
getParts()125   public final List<Part> getParts() { return parts; }
126 
127   /** Parses the text string into an internal representation. */
parse()128   public void parse() {
129     // Look for music track (of which there would be only one and it'll be the
130     // first token)
131     if (parseMusicTrack()) {
132       buildParts(null);
133       return;
134     }
135 
136     // Look for me commands.
137     String meText = null;
138     if (parseMeText && text.startsWith("/me") && (text.length() > 3) &&
139         Character.isWhitespace(text.charAt(3))) {
140       meText = text.substring(0, 4);
141       text = text.substring(4);
142     }
143 
144     // Break the text into tokens.
145     boolean wasSmiley = false;
146     while (nextChar < text.length()) {
147       if (!isWordBreak(nextChar)) {
148         if (!wasSmiley || !isSmileyBreak(nextChar)) {
149           throw new AssertionError("last chunk did not end at word break");
150         }
151       }
152 
153       if (parseSmiley()) {
154         wasSmiley = true;
155       } else {
156         wasSmiley = false;
157 
158         if (!parseAcronym() && !parseURL() && !parseFormatting()) {
159           parseText();
160         }
161       }
162     }
163 
164     // Trim the whitespace before and after media components.
165     for (int i = 0; i < tokens.size(); ++i) {
166       if (tokens.get(i).isMedia()) {
167         if ((i > 0) && (tokens.get(i - 1) instanceof Html)) {
168           ((Html)tokens.get(i - 1)).trimLeadingWhitespace();
169         }
170         if ((i + 1 < tokens.size()) && (tokens.get(i + 1) instanceof Html)) {
171           ((Html)tokens.get(i + 1)).trimTrailingWhitespace();
172         }
173       }
174     }
175 
176     // Remove any empty html tokens.
177     for (int i = 0; i < tokens.size(); ++i) {
178       if (tokens.get(i).isHtml() &&
179           (tokens.get(i).toHtml(true).length() == 0)) {
180         tokens.remove(i);
181         --i;  // visit this index again
182       }
183     }
184 
185     buildParts(meText);
186   }
187 
188   /**
189    * Get a the appropriate Token for a given URL
190    *
191    * @param text the anchor text
192    * @param url the url
193    *
194    */
tokenForUrl(String url, String text)195   public static Token tokenForUrl(String url, String text) {
196     if(url == null) {
197       return null;
198     }
199 
200     //Look for video links
201     Video video = Video.matchURL(url, text);
202     if (video != null) {
203       return video;
204     }
205 
206     // Look for video links.
207     YouTubeVideo ytVideo = YouTubeVideo.matchURL(url, text);
208     if (ytVideo != null) {
209       return ytVideo;
210     }
211 
212     // Look for photo links.
213     Photo photo = Photo.matchURL(url, text);
214     if (photo != null) {
215       return photo;
216     }
217 
218     // Look for photo links.
219     FlickrPhoto flickrPhoto = FlickrPhoto.matchURL(url, text);
220     if (flickrPhoto != null) {
221       return flickrPhoto;
222     }
223 
224     //Not media, so must be a regular URL
225     return new Link(url, text);
226   }
227 
228   /**
229    * Builds the parts list.
230    *
231    * @param meText any meText parsed from the message
232    */
buildParts(String meText)233   private void buildParts(String meText) {
234     for (int i = 0; i < tokens.size(); ++i) {
235       Token token = tokens.get(i);
236       if (token.isMedia() || (parts.size() == 0) || lastPart().isMedia()) {
237         parts.add(new Part());
238       }
239       lastPart().add(token);
240     }
241 
242     // The first part inherits the meText of the line.
243     if (parts.size() > 0) {
244       parts.get(0).setMeText(meText);
245     }
246   }
247 
248   /** Returns the last part in the list. */
lastPart()249   private Part lastPart() { return parts.get(parts.size() - 1); }
250 
251   /**
252    * Looks for a music track (\u266B is first character, everything else is
253    * track info).
254    */
parseMusicTrack()255   private boolean parseMusicTrack() {
256 
257     if (parseMusic && text.startsWith(musicNote)) {
258       addToken(new MusicTrack(text.substring(musicNote.length())));
259       nextChar = text.length();
260       return true;
261     }
262     return false;
263   }
264 
265   /** Consumes all of the text in the next word . */
parseText()266   private void parseText() {
267     StringBuilder buf = new StringBuilder();
268     int start = nextChar;
269     do {
270       char ch = text.charAt(nextChar++);
271       switch (ch) {
272         case '<':  buf.append("&lt;"); break;
273         case '>':  buf.append("&gt;"); break;
274         case '&':  buf.append("&amp;"); break;
275         case '"':  buf.append("&quot;"); break;
276         case '\'':  buf.append("&apos;"); break;
277         case '\n':  buf.append("<br>"); break;
278         default:  buf.append(ch); break;
279       }
280     } while (!isWordBreak(nextChar));
281 
282     addToken(new Html(text.substring(start, nextChar), buf.toString()));
283   }
284 
285   /**
286    * Looks for smileys (e.g., ":)") in the text.  The set of known smileys is
287    * loaded from a file into a trie at server start.
288    */
parseSmiley()289   private boolean parseSmiley() {
290     if(!parseSmilies) {
291       return false;
292     }
293     TrieNode match = longestMatch(getResources().getSmileys(), this, nextChar,
294                                   true);
295     if (match == null) {
296       return false;
297     } else {
298       int previousCharClass = getCharClass(nextChar - 1);
299       int nextCharClass = getCharClass(nextChar + match.getText().length());
300       if ((previousCharClass == 2 || previousCharClass == 3)
301           && (nextCharClass == 2 || nextCharClass == 3)) {
302         return false;
303       }
304       addToken(new Smiley(match.getText()));
305       nextChar += match.getText().length();
306       return true;
307     }
308   }
309 
310   /** Looks for acronyms (e.g., "lol") in the text.
311    */
parseAcronym()312   private boolean parseAcronym() {
313     if(!parseAcronyms) {
314       return false;
315     }
316     TrieNode match = longestMatch(getResources().getAcronyms(), this, nextChar);
317     if (match == null) {
318       return false;
319     } else {
320       addToken(new Acronym(match.getText(), match.getValue()));
321       nextChar += match.getText().length();
322       return true;
323     }
324   }
325 
326   /** Determines if this is an allowable domain character. */
isDomainChar(char c)327   private boolean isDomainChar(char c) {
328     return c == '-' || Character.isLetter(c) || Character.isDigit(c);
329   }
330 
331   /** Determines if the given string is a valid domain. */
isValidDomain(String domain)332   private boolean isValidDomain(String domain) {
333     // For hostnames, check that it ends with a known domain suffix
334     if (matches(getResources().getDomainSuffixes(), reverse(domain))) {
335       return true;
336     }
337     return false;
338   }
339 
340   /**
341    * Looks for a URL in two possible forms:  either a proper URL with a known
342    * scheme or a domain name optionally followed by a path, query, or query.
343    */
parseURL()344   private boolean parseURL() {
345     // Make sure this is a valid place to start a URL.
346     if (!parseUrls || !isURLBreak(nextChar)) {
347       return false;
348     }
349 
350     int start = nextChar;
351 
352     // Search for the first block of letters.
353     int index = start;
354     while ((index < text.length()) && isDomainChar(text.charAt(index))) {
355       index += 1;
356     }
357 
358     String url = "";
359     boolean done = false;
360 
361     if (index == text.length()) {
362       return false;
363     } else if (text.charAt(index) == ':') {
364       // Make sure this is a known scheme.
365       String scheme = text.substring(nextChar, index);
366       if (!getResources().getSchemes().contains(scheme)) {
367         return false;
368       }
369     } else if (text.charAt(index) == '.') {
370       // Search for the end of the domain name.
371       while (index < text.length()) {
372         char ch = text.charAt(index);
373         if ((ch != '.') && !isDomainChar(ch)) {
374           break;
375         } else {
376           index += 1;
377         }
378       }
379 
380       // Make sure the domain name has a valid suffix.  Since tries look for
381       // prefix matches, we reverse all the strings to get suffix comparisons.
382       String domain = text.substring(nextChar, index);
383       if (!isValidDomain(domain)) {
384         return false;
385       }
386 
387       // Search for a port.  We deal with this specially because a colon can
388       // also be a punctuation character.
389       if ((index + 1 < text.length()) && (text.charAt(index) == ':')) {
390         char ch = text.charAt(index + 1);
391         if (Character.isDigit(ch)) {
392           index += 1;
393           while ((index < text.length()) &&
394                  Character.isDigit(text.charAt(index))) {
395             index += 1;
396           }
397         }
398       }
399 
400       // The domain name should be followed by end of line, whitespace,
401       // punctuation, or a colon, slash, question, or hash character.  The
402       // tricky part here is that some URL characters are also punctuation, so
403       // we need to distinguish them.  Since we looked for ports above, a colon
404       // is always punctuation here.  To distinguish '?' cases, we look at the
405       // character that follows it.
406       if (index == text.length()) {
407         done = true;
408       } else {
409         char ch = text.charAt(index);
410         if (ch == '?') {
411           // If the next character is whitespace or punctuation (or missing),
412           // then this question mark looks like punctuation.
413           if (index + 1 == text.length()) {
414             done = true;
415           } else {
416             char ch2 = text.charAt(index + 1);
417             if (Character.isWhitespace(ch2) || isPunctuation(ch2)) {
418               done = true;
419             }
420           }
421         } else if (isPunctuation(ch)) {
422           done = true;
423         } else if (Character.isWhitespace(ch)) {
424           done = true;
425         } else if ((ch == '/') || (ch == '#')) {
426           // In this case, the URL is not done.  We will search for the end of
427           // it below.
428         } else {
429           return false;
430         }
431       }
432 
433       // We will assume the user meant HTTP.  (One weird case is where they
434       // type a port of 443.  That could mean HTTPS, but they might also want
435       // HTTP.  We'll let them specify if they don't want HTTP.)
436       url = "http://";
437     } else {
438       return false;
439     }
440 
441     // If the URL is not done, search for the end, which is just before the
442     // next whitespace character.
443     if (!done) {
444       while ((index < text.length()) &&
445              !Character.isWhitespace(text.charAt(index))) {
446         index += 1;
447       }
448     }
449 
450     String urlText = text.substring(start, index);
451     url += urlText;
452 
453     // Figure out the appropriate token type.
454     addURLToken(url, urlText);
455 
456     nextChar = index;
457     return true;
458   }
459 
460   /**
461    * Adds the appropriate token for the given URL.  This might be a simple
462    * link or it might be a recognized media type.
463    */
addURLToken(String url, String text)464   private void addURLToken(String url, String text) {
465      addToken(tokenForUrl(url, text));
466   }
467 
468   /**
469    * Deal with formatting characters.
470    *
471    * Parsing is as follows:
472    *  - Treat all contiguous strings of formatting characters as one block.
473    *    (This method processes one block.)
474    *  - Only a single instance of a particular format character within a block
475    *    is used to determine whether to turn on/off that type of formatting;
476    *    other instances simply print the character itself.
477    *  - If the format is to be turned on, we use the _first_ instance; if it
478    *    is to be turned off, we use the _last_ instance (by appending the
479    *    format.)
480    *
481    * Example:
482    *   **string** turns into <b>*string*</b>
483    */
parseFormatting()484   private boolean parseFormatting() {
485     if(!parseFormatting) {
486       return false;
487     }
488     int endChar = nextChar;
489     while ((endChar < text.length()) && isFormatChar(text.charAt(endChar))) {
490       endChar += 1;
491     }
492 
493     if ((endChar == nextChar) || !isWordBreak(endChar)) {
494       return false;
495     }
496 
497     // Keeps track of whether we've seen a character (in map if we've seen it)
498     // and whether we should append a closing format token (if value in
499     // map is TRUE).  Linked hashmap for consistent ordering.
500     LinkedHashMap<Character, Boolean> seenCharacters =
501         new LinkedHashMap<Character, Boolean>();
502 
503     for (int index = nextChar; index < endChar; ++index) {
504       char ch = text.charAt(index);
505       Character key = Character.valueOf(ch);
506       if (seenCharacters.containsKey(key)) {
507         // Already seen this character, just append an unmatched token, which
508         // will print plaintext character
509         addToken(new Format(ch, false));
510       } else {
511         Format start = formatStart.get(key);
512         if (start != null) {
513           // Match the start token, and ask an end token to be appended
514           start.setMatched(true);
515           formatStart.remove(key);
516           seenCharacters.put(key, Boolean.TRUE);
517         } else {
518           // Append start token
519           start = new Format(ch, true);
520           formatStart.put(key, start);
521           addToken(start);
522           seenCharacters.put(key, Boolean.FALSE);
523         }
524       }
525     }
526 
527     // Append any necessary end tokens
528     for (Character key : seenCharacters.keySet()) {
529       if (seenCharacters.get(key) == Boolean.TRUE) {
530         Format end = new Format(key.charValue(), false);
531         end.setMatched(true);
532         addToken(end);
533       }
534     }
535 
536     nextChar = endChar;
537     return true;
538   }
539 
540   /** Determines whether the given index could be a possible word break. */
isWordBreak(int index)541   private boolean isWordBreak(int index) {
542     return getCharClass(index - 1) != getCharClass(index);
543   }
544 
545   /** Determines whether the given index could be a possible smiley break. */
isSmileyBreak(int index)546   private boolean isSmileyBreak(int index) {
547     if (index > 0 && index < text.length()) {
548       if (isSmileyBreak(text.charAt(index - 1), text.charAt(index))) {
549         return true;
550       }
551     }
552 
553     return false;
554   }
555 
556   /**
557    * Verifies that the character before the given index is end of line,
558    * whitespace, or punctuation.
559    */
isURLBreak(int index)560   private boolean isURLBreak(int index) {
561     switch (getCharClass(index - 1)) {
562       case 2:
563       case 3:
564       case 4:
565         return false;
566 
567       case 0:
568       case 1:
569       default:
570         return true;
571     }
572   }
573 
574   /** Returns the class for the character at the given index. */
getCharClass(int index)575   private int getCharClass(int index) {
576     if ((index < 0) || (text.length() <= index)) {
577       return 0;
578     }
579 
580     char ch = text.charAt(index);
581     if (Character.isWhitespace(ch)) {
582       return 1;
583     } else if (Character.isLetter(ch)) {
584       return 2;
585     } else if (Character.isDigit(ch)) {
586       return 3;
587     } else if (isPunctuation(ch)) {
588       // For punctuation, we return a unique value every time so that they are
589       // always different from any other character.  Punctuation should always
590       // be considered a possible word break.
591       return ++nextClass;
592     } else {
593       return 4;
594     }
595   }
596 
597   /**
598    * Returns true if <code>c1</code> could be the last character of
599    * a smiley and <code>c2</code> could be the first character of
600    * a different smiley, if {@link #isWordBreak} would not already
601    * recognize that this is possible.
602    */
isSmileyBreak(char c1, char c2)603   private static boolean isSmileyBreak(char c1, char c2) {
604     switch (c1) {
605       /*
606        * These characters can end smileys, but don't normally end words.
607        */
608       case '$': case '&': case '*': case '+': case '-':
609       case '/': case '<': case '=': case '>': case '@':
610       case '[': case '\\': case ']': case '^': case '|':
611       case '}': case '~':
612         switch (c2) {
613           /*
614            * These characters can begin smileys, but don't normally
615            * begin words.
616            */
617           case '#': case '$': case '%': case '*': case '/':
618           case '<': case '=': case '>': case '@': case '[':
619           case '\\': case '^': case '~':
620             return true;
621         }
622     }
623 
624     return false;
625   }
626 
627   /** Determines whether the given character is punctuation. */
isPunctuation(char ch)628   private static boolean isPunctuation(char ch) {
629     switch (ch) {
630       case '.': case ',': case '"': case ':': case ';':
631       case '?': case '!': case '(': case ')':
632         return true;
633 
634       default:
635         return false;
636     }
637   }
638 
639   /**
640    * Determines whether the given character is the beginning or end of a
641    * section with special formatting.
642    */
isFormatChar(char ch)643   private static boolean isFormatChar(char ch) {
644     switch (ch) {
645       case '*': case '_': case '^':
646         return true;
647 
648       default:
649         return false;
650     }
651   }
652 
653   /** Represents a unit of parsed output. */
654   public static abstract class Token {
655     public enum Type {
656 
657       HTML ("html"),
658       FORMAT ("format"),  // subtype of HTML
659       LINK ("l"),
660       SMILEY ("e"),
661       ACRONYM ("a"),
662       MUSIC ("m"),
663       GOOGLE_VIDEO ("v"),
664       YOUTUBE_VIDEO ("yt"),
665       PHOTO ("p"),
666       FLICKR ("f");
667 
668       //stringreps for HTML and FORMAT don't really matter
669       //because they don't define getInfo(), which is where it is used
670       //For the other types, code depends on their stringreps
671       private String stringRep;
672 
Type(String stringRep)673       Type(String stringRep) {
674         this.stringRep = stringRep;
675       }
676 
677       /** {@inheritDoc} */
toString()678       public String toString() {
679         return this.stringRep;
680       }
681     }
682 
683     protected Type type;
684     protected String text;
685 
Token(Type type, String text)686     protected Token(Type type, String text) {
687       this.type = type;
688       this.text = text;
689     }
690 
691     /** Returns the type of the token. */
getType()692     public Type getType() { return type; }
693 
694     /**
695      * Get the relevant information about a token
696      *
697      * @return a list of strings representing the token, not null
698      *         The first item is always a string representation of the type
699      */
getInfo()700     public List<String> getInfo() {
701       List<String> info = new ArrayList<String>();
702       info.add(getType().toString());
703       return info;
704     }
705 
706     /** Returns the raw text of the token. */
getRawText()707     public String getRawText() { return text; }
708 
isMedia()709     public boolean isMedia() { return false; }
isHtml()710     public abstract boolean isHtml();
isArray()711     public boolean isArray() { return !isHtml(); }
712 
toHtml(boolean caps)713     public String toHtml(boolean caps) { throw new AssertionError("not html"); }
714 
715     // The token can change the caps of the text after that point.
controlCaps()716     public boolean controlCaps() { return false; }
setCaps()717     public boolean setCaps() { return false; }
718   }
719 
720   /** Represents a simple string of html text. */
721   public static class Html extends Token {
722     private String html;
723 
Html(String text, String html)724     public Html(String text, String html) {
725       super(Type.HTML, text);
726       this.html = html;
727     }
728 
isHtml()729     public boolean isHtml() { return true; }
toHtml(boolean caps)730     public String toHtml(boolean caps) {
731       return caps ? html.toUpperCase() : html;
732     }
733     /**
734      * Not supported. Info should not be needed for this type
735      */
getInfo()736     public List<String> getInfo() {
737       throw new UnsupportedOperationException();
738     }
739 
trimLeadingWhitespace()740     public void trimLeadingWhitespace() {
741       text = trimLeadingWhitespace(text);
742       html = trimLeadingWhitespace(html);
743     }
744 
trimTrailingWhitespace()745     public void trimTrailingWhitespace() {
746       text = trimTrailingWhitespace(text);
747       html = trimTrailingWhitespace(html);
748     }
749 
trimLeadingWhitespace(String text)750     private static String trimLeadingWhitespace(String text) {
751       int index = 0;
752       while ((index < text.length()) &&
753              Character.isWhitespace(text.charAt(index))) {
754         ++index;
755       }
756       return text.substring(index);
757     }
758 
trimTrailingWhitespace(String text)759     public static String trimTrailingWhitespace(String text) {
760       int index = text.length();
761       while ((index > 0) && Character.isWhitespace(text.charAt(index - 1))) {
762         --index;
763       }
764       return text.substring(0, index);
765     }
766   }
767 
768   /** Represents a music track token at the beginning. */
769   public static class MusicTrack extends Token {
770     private String track;
771 
MusicTrack(String track)772     public MusicTrack(String track) {
773       super(Type.MUSIC, track);
774       this.track = track;
775     }
776 
getTrack()777     public String getTrack() { return track; }
778 
isHtml()779     public boolean isHtml() { return false; }
780 
getInfo()781     public List<String> getInfo() {
782       List<String> info = super.getInfo();
783       info.add(getTrack());
784       return info;
785     }
786   }
787 
788   /** Represents a link that was found in the input. */
789   public static class Link extends Token {
790     private String url;
791 
Link(String url, String text)792     public Link(String url, String text) {
793       super(Type.LINK, text);
794       this.url = url;
795     }
796 
getURL()797     public String getURL() { return url; }
798 
isHtml()799     public boolean isHtml() { return false; }
800 
getInfo()801     public List<String> getInfo() {
802       List<String> info = super.getInfo();
803       info.add(getURL());
804       info.add(getRawText());
805       return info;
806     }
807   }
808 
809   /** Represents a link to a Google Video. */
810   public static class Video extends Token {
811     /** Pattern for a video URL. */
812     private static final Pattern URL_PATTERN = Pattern.compile(
813         "(?i)http://video\\.google\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/videoplay\\?"
814         + ".*?\\bdocid=(-?\\d+).*");
815 
816     private String docid;
817 
Video(String docid, String text)818     public Video(String docid, String text) {
819       super(Type.GOOGLE_VIDEO, text);
820       this.docid = docid;
821     }
822 
getDocID()823     public String getDocID() { return docid; }
824 
isHtml()825     public boolean isHtml() { return false; }
isMedia()826     public boolean isMedia() { return true; }
827 
828     /** Returns a Video object if the given url is to a video. */
matchURL(String url, String text)829     public static Video matchURL(String url, String text) {
830       Matcher m = URL_PATTERN.matcher(url);
831       if (m.matches()) {
832         return new Video(m.group(1), text);
833       } else {
834         return null;
835       }
836     }
837 
getInfo()838     public List<String> getInfo() {
839       List<String> info = super.getInfo();
840       info.add(getRssUrl(docid));
841       info.add(getURL(docid));
842       return info;
843     }
844 
845     /** Returns the URL for the RSS description of the given video. */
getRssUrl(String docid)846     public static String getRssUrl(String docid) {
847       return "http://video.google.com/videofeed"
848              + "?type=docid&output=rss&sourceid=gtalk&docid=" + docid;
849     }
850 
851     /** (For testing purposes:) Returns a video URL with the given parts.  */
getURL(String docid)852     public static String getURL(String docid) {
853       return getURL(docid, null);
854     }
855 
856     /** (For testing purposes:) Returns a video URL with the given parts.  */
getURL(String docid, String extraParams)857     public static String getURL(String docid, String extraParams) {
858       if (extraParams == null) {
859         extraParams = "";
860       } else if (extraParams.length() > 0) {
861         extraParams += "&";
862       }
863       return "http://video.google.com/videoplay?" + extraParams
864              + "docid=" + docid;
865     }
866   }
867 
868   /** Represents a link to a YouTube video. */
869   public static class YouTubeVideo extends Token {
870     /** Pattern for a video URL. */
871     private static final Pattern URL_PATTERN = Pattern.compile(
872         "(?i)http://(?:[a-z0-9]+\\.)?youtube\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/watch\\?"
873         + ".*\\bv=([-_a-zA-Z0-9=]+).*");
874 
875     private String docid;
876 
YouTubeVideo(String docid, String text)877     public YouTubeVideo(String docid, String text) {
878       super(Type.YOUTUBE_VIDEO, text);
879       this.docid = docid;
880     }
881 
getDocID()882     public String getDocID() { return docid; }
883 
isHtml()884     public boolean isHtml() { return false; }
isMedia()885     public boolean isMedia() { return true; }
886 
887     /** Returns a Video object if the given url is to a video. */
matchURL(String url, String text)888     public static YouTubeVideo matchURL(String url, String text) {
889       Matcher m = URL_PATTERN.matcher(url);
890       if (m.matches()) {
891         return new YouTubeVideo(m.group(1), text);
892       } else {
893         return null;
894       }
895     }
896 
getInfo()897     public List<String> getInfo() {
898       List<String> info = super.getInfo();
899       info.add(getRssUrl(docid));
900       info.add(getURL(docid));
901       return info;
902     }
903 
904     /** Returns the URL for the RSS description of the given video. */
getRssUrl(String docid)905     public static String getRssUrl(String docid) {
906       return "http://youtube.com/watch?v=" + docid;
907     }
908 
909     /** (For testing purposes:) Returns a video URL with the given parts.  */
getURL(String docid)910     public static String getURL(String docid) {
911       return getURL(docid, null);
912     }
913 
914     /** (For testing purposes:) Returns a video URL with the given parts.  */
getURL(String docid, String extraParams)915     public static String getURL(String docid, String extraParams) {
916       if (extraParams == null) {
917         extraParams = "";
918       } else if (extraParams.length() > 0) {
919         extraParams += "&";
920       }
921       return "http://youtube.com/watch?" + extraParams + "v=" + docid;
922     }
923 
924     /** (For testing purposes:) Returns a video URL with the given parts.
925       * @param http If true, includes http://
926       * @param prefix If non-null/non-blank, adds to URL before youtube.com.
927       *   (e.g., prefix="br." --> "br.youtube.com")
928       */
getPrefixedURL(boolean http, String prefix, String docid, String extraParams)929     public static String getPrefixedURL(boolean http, String prefix,
930                                         String docid, String extraParams) {
931       String protocol = "";
932 
933       if (http) {
934         protocol = "http://";
935       }
936 
937       if (prefix == null) {
938         prefix = "";
939       }
940 
941       if (extraParams == null) {
942         extraParams = "";
943       } else if (extraParams.length() > 0) {
944         extraParams += "&";
945       }
946 
947       return protocol + prefix + "youtube.com/watch?" + extraParams + "v=" +
948               docid;
949     }
950   }
951 
952   /** Represents a link to a Picasa photo or album. */
953   public static class Photo extends Token {
954     /** Pattern for an album or photo URL. */
955     // TODO (katyarogers) searchbrowse includes search lists and tags,
956     // it follows a different pattern than albums - would be nice to add later
957     private static final Pattern URL_PATTERN = Pattern.compile(
958         "http://picasaweb.google.com/([^/?#&]+)/+((?!searchbrowse)[^/?#&]+)(?:/|/photo)?(?:\\?[^#]*)?(?:#(.*))?");
959 
960     private String user;
961     private String album;
962     private String photo;  // null for albums
963 
Photo(String user, String album, String photo, String text)964     public Photo(String user, String album, String photo, String text) {
965       super(Type.PHOTO, text);
966       this.user = user;
967       this.album = album;
968       this.photo = photo;
969     }
970 
getUser()971     public String getUser() { return user; }
getAlbum()972     public String getAlbum() { return album; }
getPhoto()973     public String getPhoto() { return photo; }
974 
isHtml()975     public boolean isHtml() { return false; }
isMedia()976     public boolean isMedia() { return true; }
977 
978     /** Returns a Photo object if the given url is to a photo or album. */
matchURL(String url, String text)979     public static Photo matchURL(String url, String text) {
980       Matcher m = URL_PATTERN.matcher(url);
981       if (m.matches()) {
982         return new Photo(m.group(1), m.group(2), m.group(3), text);
983       } else {
984         return null;
985       }
986     }
987 
getInfo()988     public List<String> getInfo() {
989       List<String> info = super.getInfo();
990       info.add(getRssUrl(getUser()));
991       info.add(getAlbumURL(getUser(), getAlbum()));
992       if (getPhoto() != null) {
993         info.add(getPhotoURL(getUser(), getAlbum(), getPhoto()));
994       } else {
995         info.add((String)null);
996       }
997       return info;
998     }
999 
1000     /** Returns the URL for the RSS description of the user's albums. */
getRssUrl(String user)1001     public static String getRssUrl(String user) {
1002       return "http://picasaweb.google.com/data/feed/api/user/" + user +
1003         "?category=album&alt=rss";
1004     }
1005 
1006     /** Returns the URL for an album. */
getAlbumURL(String user, String album)1007     public static String getAlbumURL(String user, String album) {
1008       return "http://picasaweb.google.com/" + user + "/" + album;
1009     }
1010 
1011     /** Returns the URL for a particular photo. */
getPhotoURL(String user, String album, String photo)1012     public static String getPhotoURL(String user, String album, String photo) {
1013       return "http://picasaweb.google.com/" + user + "/" + album + "/photo#"
1014              + photo;
1015     }
1016   }
1017 
1018   /** Represents a link to a Flickr photo or album. */
1019   public static class FlickrPhoto extends Token {
1020     /** Pattern for a user album or photo URL. */
1021     private static final Pattern URL_PATTERN = Pattern.compile(
1022         "http://(?:www.)?flickr.com/photos/([^/?#&]+)/?([^/?#&]+)?/?.*");
1023     private static final Pattern GROUPING_PATTERN = Pattern.compile(
1024         "http://(?:www.)?flickr.com/photos/([^/?#&]+)/(tags|sets)/" +
1025         "([^/?#&]+)/?");
1026 
1027     private static final String SETS = "sets";
1028     private static final String TAGS = "tags";
1029 
1030     private String user;
1031     private String photo;      // null for user album
1032     private String grouping;   // either "tags" or "sets"
1033     private String groupingId; // sets or tags identifier
1034 
FlickrPhoto(String user, String photo, String grouping, String groupingId, String text)1035     public FlickrPhoto(String user, String photo, String grouping,
1036                        String groupingId, String text) {
1037       super(Type.FLICKR, text);
1038 
1039       /* System wide tags look like the URL to a Flickr user. */
1040       if (!TAGS.equals(user)) {
1041         this.user = user;
1042         // Don't consider slide show URL a photo
1043         this.photo = (!"show".equals(photo) ? photo : null);
1044         this.grouping = grouping;
1045         this.groupingId = groupingId;
1046       } else {
1047         this.user = null;
1048         this.photo = null;
1049         this.grouping = TAGS;
1050         this.groupingId = photo;
1051       }
1052     }
1053 
getUser()1054     public String getUser() { return user; }
getPhoto()1055     public String getPhoto() { return photo; }
getGrouping()1056     public String getGrouping() { return grouping; }
getGroupingId()1057     public String getGroupingId() { return groupingId; }
1058 
isHtml()1059     public boolean isHtml() { return false; }
isMedia()1060     public boolean isMedia() { return true; }
1061 
1062     /**
1063      * Returns a FlickrPhoto object if the given url is to a photo or Flickr
1064      * user.
1065      */
matchURL(String url, String text)1066     public static FlickrPhoto matchURL(String url, String text) {
1067       Matcher m = GROUPING_PATTERN.matcher(url);
1068       if (m.matches()) {
1069         return new FlickrPhoto(m.group(1), null, m.group(2), m.group(3), text);
1070       }
1071 
1072       m = URL_PATTERN.matcher(url);
1073       if (m.matches()) {
1074         return new FlickrPhoto(m.group(1), m.group(2), null, null, text);
1075       } else {
1076         return null;
1077       }
1078     }
1079 
getInfo()1080     public List<String> getInfo() {
1081       List<String> info = super.getInfo();
1082       info.add(getUrl());
1083       info.add(getUser() != null ? getUser() : "");
1084       info.add(getPhoto() != null ? getPhoto() : "");
1085       info.add(getGrouping() != null ? getGrouping() : "");
1086       info.add(getGroupingId() != null ? getGroupingId() : "");
1087       return info;
1088     }
1089 
getUrl()1090     public String getUrl() {
1091       if (SETS.equals(grouping)) {
1092         return getUserSetsURL(user, groupingId);
1093       } else if (TAGS.equals(grouping)) {
1094         if (user != null) {
1095           return getUserTagsURL(user, groupingId);
1096         } else {
1097           return getTagsURL(groupingId);
1098         }
1099       } else if (photo != null) {
1100         return getPhotoURL(user, photo);
1101       } else {
1102         return getUserURL(user);
1103       }
1104     }
1105 
1106     /** Returns the URL for the RSS description. */
getRssUrl(String user)1107     public static String getRssUrl(String user) {
1108       return null;
1109     }
1110 
1111     /** Returns the URL for a particular tag. */
getTagsURL(String tag)1112     public static String getTagsURL(String tag) {
1113       return "http://flickr.com/photos/tags/" + tag;
1114     }
1115 
1116     /** Returns the URL to the user's Flickr homepage. */
getUserURL(String user)1117     public static String getUserURL(String user) {
1118       return "http://flickr.com/photos/" + user;
1119     }
1120 
1121     /** Returns the URL for a particular photo. */
getPhotoURL(String user, String photo)1122     public static String getPhotoURL(String user, String photo) {
1123       return "http://flickr.com/photos/" + user + "/" + photo;
1124     }
1125 
1126     /** Returns the URL for a user tag photo set. */
getUserTagsURL(String user, String tagId)1127     public static String getUserTagsURL(String user, String tagId) {
1128       return "http://flickr.com/photos/" + user + "/tags/" + tagId;
1129     }
1130 
1131     /** Returns the URL for user set. */
getUserSetsURL(String user, String setId)1132     public static String getUserSetsURL(String user, String setId) {
1133       return "http://flickr.com/photos/" + user + "/sets/" + setId;
1134     }
1135   }
1136 
1137   /** Represents a smiley that was found in the input. */
1138   public static class Smiley extends Token {
1139     // TODO: Pass the SWF URL down to the client.
1140 
Smiley(String text)1141     public Smiley(String text) {
1142       super(Type.SMILEY, text);
1143     }
1144 
isHtml()1145     public boolean isHtml() { return false; }
1146 
getInfo()1147     public List<String> getInfo() {
1148       List<String> info = super.getInfo();
1149       info.add(getRawText());
1150       return info;
1151     }
1152   }
1153 
1154   /** Represents an acronym that was found in the input. */
1155   public static class Acronym extends Token {
1156     private String value;
1157     // TODO: SWF
1158 
Acronym(String text, String value)1159     public Acronym(String text, String value) {
1160       super(Type.ACRONYM, text);
1161       this.value = value;
1162     }
1163 
getValue()1164     public String getValue() { return value; }
1165 
isHtml()1166     public boolean isHtml() { return false; }
1167 
getInfo()1168     public List<String> getInfo() {
1169       List<String> info = super.getInfo();
1170       info.add(getRawText());
1171       info.add(getValue());
1172       return info;
1173     }
1174   }
1175 
1176   /** Represents a character that changes formatting. */
1177   public static class Format extends Token {
1178     private char ch;
1179     private boolean start;
1180     private boolean matched;
1181 
Format(char ch, boolean start)1182     public Format(char ch, boolean start) {
1183       super(Type.FORMAT, String.valueOf(ch));
1184       this.ch = ch;
1185       this.start = start;
1186     }
1187 
setMatched(boolean matched)1188     public void setMatched(boolean matched) { this.matched = matched; }
1189 
isHtml()1190     public boolean isHtml() { return true; }
1191 
toHtml(boolean caps)1192     public String toHtml(boolean caps) {
1193       // This character only implies special formatting if it was matched.
1194       // Otherwise, it was just a plain old character.
1195       if (matched) {
1196         return start ? getFormatStart(ch) : getFormatEnd(ch);
1197       } else {
1198         // We have to make sure we escape HTML characters as usual.
1199         return (ch == '"') ? "&quot;" : String.valueOf(ch);
1200       }
1201     }
1202 
1203     /**
1204      * Not supported. Info should not be needed for this type
1205      */
getInfo()1206     public List<String> getInfo() {
1207       throw new UnsupportedOperationException();
1208     }
1209 
controlCaps()1210     public boolean controlCaps() { return (ch == '^'); }
setCaps()1211     public boolean setCaps() { return start; }
1212 
getFormatStart(char ch)1213     private String getFormatStart(char ch) {
1214       switch (ch) {
1215         case '*': return "<b>";
1216         case '_': return "<i>";
1217         case '^': return "<b><font color=\"#005FFF\">"; // TODO: all caps
1218         case '"': return "<font color=\"#999999\">\u201c";
1219         default: throw new AssertionError("unknown format '" + ch + "'");
1220       }
1221     }
1222 
getFormatEnd(char ch)1223     private String getFormatEnd(char ch) {
1224       switch (ch) {
1225         case '*': return "</b>";
1226         case '_': return "</i>";
1227         case '^': return "</font></b>"; // TODO: all caps
1228         case '"': return "\u201d</font>";
1229         default: throw new AssertionError("unknown format '" + ch + "'");
1230       }
1231     }
1232   }
1233 
1234   /** Adds the given token to the parsed output. */
addToken(Token token)1235   private void addToken(Token token) {
1236     tokens.add(token);
1237   }
1238 
1239   /** Converts the entire message into a single HTML display string. */
toHtml()1240   public String toHtml() {
1241     StringBuilder html = new StringBuilder();
1242 
1243     for (Part part : parts) {
1244       boolean caps = false;
1245 
1246       html.append("<p>");
1247       for (Token token : part.getTokens()) {
1248         if (token.isHtml()) {
1249           html.append(token.toHtml(caps));
1250         } else {
1251           switch (token.getType()) {
1252           case LINK:
1253             html.append("<a href=\"");
1254             html.append(((Link)token).getURL());
1255             html.append("\">");
1256             html.append(token.getRawText());
1257             html.append("</a>");
1258             break;
1259 
1260           case SMILEY:
1261             // TODO: link to an appropriate image
1262             html.append(token.getRawText());
1263             break;
1264 
1265           case ACRONYM:
1266             html.append(token.getRawText());
1267             break;
1268 
1269           case MUSIC:
1270             // TODO: include a music glyph
1271             html.append(((MusicTrack)token).getTrack());
1272             break;
1273 
1274           case GOOGLE_VIDEO:
1275             // TODO: include a Google Video icon
1276             html.append("<a href=\"");
1277             html.append(((Video)token).getURL(((Video)token).getDocID()));
1278             html.append("\">");
1279             html.append(token.getRawText());
1280             html.append("</a>");
1281             break;
1282 
1283           case YOUTUBE_VIDEO:
1284             // TODO: include a YouTube icon
1285             html.append("<a href=\"");
1286             html.append(((YouTubeVideo)token).getURL(
1287                 ((YouTubeVideo)token).getDocID()));
1288             html.append("\">");
1289             html.append(token.getRawText());
1290             html.append("</a>");
1291             break;
1292 
1293           case PHOTO: {
1294             // TODO: include a Picasa Web icon
1295             html.append("<a href=\"");
1296             html.append(Photo.getAlbumURL(
1297                 ((Photo)token).getUser(), ((Photo)token).getAlbum()));
1298             html.append("\">");
1299             html.append(token.getRawText());
1300             html.append("</a>");
1301             break;
1302           }
1303 
1304           case FLICKR:
1305             // TODO: include a Flickr icon
1306             Photo p = (Photo) token;
1307             html.append("<a href=\"");
1308             html.append(((FlickrPhoto)token).getUrl());
1309             html.append("\">");
1310             html.append(token.getRawText());
1311             html.append("</a>");
1312             break;
1313 
1314           default:
1315             throw new AssertionError("unknown token type: " + token.getType());
1316           }
1317         }
1318 
1319         if (token.controlCaps()) {
1320           caps = token.setCaps();
1321         }
1322       }
1323       html.append("</p>\n");
1324     }
1325 
1326     return html.toString();
1327   }
1328 
1329   /** Returns the reverse of the given string. */
reverse(String str)1330   protected static String reverse(String str) {
1331     StringBuilder buf = new StringBuilder();
1332     for (int i = str.length() - 1; i >= 0; --i) {
1333       buf.append(str.charAt(i));
1334     }
1335     return buf.toString();
1336   }
1337 
1338   public static class TrieNode {
1339     private final HashMap<Character,TrieNode> children =
1340         new HashMap<Character,TrieNode>();
1341     private String text;
1342     private String value;
1343 
TrieNode()1344     public TrieNode() { this(""); }
TrieNode(String text)1345     public TrieNode(String text) {
1346       this.text = text;
1347     }
1348 
exists()1349     public final boolean exists() { return value != null; }
getText()1350     public final String getText() { return text; }
getValue()1351     public final String getValue() { return value; }
setValue(String value)1352     public void setValue(String value) { this.value = value; }
1353 
getChild(char ch)1354     public TrieNode getChild(char ch) {
1355       return children.get(Character.valueOf(ch));
1356     }
1357 
getOrCreateChild(char ch)1358     public TrieNode getOrCreateChild(char ch) {
1359       Character key = Character.valueOf(ch);
1360       TrieNode node = children.get(key);
1361       if (node == null) {
1362         node = new TrieNode(text + String.valueOf(ch));
1363         children.put(key, node);
1364       }
1365       return node;
1366     }
1367 
1368     /** Adds the given string into the trie. */
addToTrie(TrieNode root, String str, String value)1369     public static  void addToTrie(TrieNode root, String str, String value) {
1370       int index = 0;
1371       while (index < str.length()) {
1372         root = root.getOrCreateChild(str.charAt(index++));
1373       }
1374       root.setValue(value);
1375     }
1376   }
1377 
1378 
1379 
1380   /** Determines whether the given string is in the given trie. */
matches(TrieNode root, String str)1381   private static boolean matches(TrieNode root, String str) {
1382     int index = 0;
1383     while (index < str.length()) {
1384       root = root.getChild(str.charAt(index++));
1385       if (root == null) {
1386         break;
1387       } else if (root.exists()) {
1388         return true;
1389       }
1390     }
1391     return false;
1392   }
1393 
1394   /**
1395    * Returns the longest substring of the given string, starting at the given
1396    * index, that exists in the trie.
1397    */
longestMatch( TrieNode root, AbstractMessageParser p, int start)1398   private static TrieNode longestMatch(
1399       TrieNode root, AbstractMessageParser p, int start) {
1400     return longestMatch(root, p, start, false);
1401   }
1402 
1403   /**
1404    * Returns the longest substring of the given string, starting at the given
1405    * index, that exists in the trie, with a special tokenizing case for
1406    * smileys if specified.
1407    */
longestMatch( TrieNode root, AbstractMessageParser p, int start, boolean smiley)1408   private static TrieNode longestMatch(
1409       TrieNode root, AbstractMessageParser p, int start, boolean smiley) {
1410     int index = start;
1411     TrieNode bestMatch = null;
1412     while (index < p.getRawText().length()) {
1413       root = root.getChild(p.getRawText().charAt(index++));
1414       if (root == null) {
1415         break;
1416       } else if (root.exists()) {
1417         if (p.isWordBreak(index)) {
1418           bestMatch = root;
1419         } else if (smiley && p.isSmileyBreak(index)) {
1420           bestMatch = root;
1421         }
1422       }
1423     }
1424     return bestMatch;
1425   }
1426 
1427 
1428   /** Represents set of tokens that are delivered as a single message. */
1429   public static class Part {
1430     private String meText;
1431     private ArrayList<Token> tokens;
1432 
Part()1433     public Part() {
1434       this.tokens = new ArrayList<Token>();
1435     }
1436 
getType(boolean isSend)1437     public String getType(boolean isSend) {
1438       return (isSend ? "s" : "r") + getPartType();
1439     }
1440 
getPartType()1441     private String getPartType() {
1442       if (isMedia()) {
1443         return "d";
1444       } else if (meText != null) {
1445         return "m";
1446       } else {
1447         return "";
1448       }
1449     }
1450 
isMedia()1451     public boolean isMedia() {
1452       return (tokens.size() == 1) && tokens.get(0).isMedia();
1453     }
1454     /**
1455      * Convenience method for getting the Token of a Part that represents
1456      * a media Token. Parts of this kind will always only have a single Token
1457      *
1458      * @return if this.isMedia(),
1459      *         returns the Token representing the media contained in this Part,
1460      *         otherwise returns null;
1461      */
getMediaToken()1462     public Token getMediaToken() {
1463       if(isMedia()) {
1464         return tokens.get(0);
1465       }
1466       return null;
1467     }
1468 
1469     /** Adds the given token to this part. */
add(Token token)1470     public void add(Token token) {
1471       if (isMedia()) {
1472         throw new AssertionError("media ");
1473       }
1474        tokens.add(token);
1475     }
1476 
setMeText(String meText)1477     public void setMeText(String meText) {
1478       this.meText = meText;
1479     }
1480 
1481     /** Returns the original text of this part. */
getRawText()1482     public String getRawText() {
1483       StringBuilder buf = new StringBuilder();
1484       if (meText != null) {
1485         buf.append(meText);
1486       }
1487       for (int i = 0; i < tokens.size(); ++i) {
1488         buf.append(tokens.get(i).getRawText());
1489       }
1490       return buf.toString();
1491     }
1492 
1493     /** Returns the tokens in this part. */
getTokens()1494     public ArrayList<Token> getTokens() { return tokens; }
1495 
1496     /** Adds the tokens into the given builder as an array. */
1497 //    public void toArray(JSArrayBuilder array) {
1498 //      if (isMedia()) {
1499 //        // For media, we send its array (i.e., we don't wrap this in another
1500 //        // array as we do for non-media parts).
1501 //        tokens.get(0).toArray(array);
1502 //      } else {
1503 //        array.beginArray();
1504 //        addToArray(array);
1505 //        array.endArray();
1506 //      }
1507 //    }
1508   }
1509 }
1510