1 /*
2  * Copyright 2001-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package org.apache.commons.codec.language;
18 
19 import org.apache.commons.codec.EncoderException;
20 import org.apache.commons.codec.StringEncoder;
21 
22 /**
 * Encodes a string into a double metaphone value.
 * This implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>.
25  * <ul>
26  * <li>Original Article: <a
27  * href="http://www.cuj.com/documents/s=8038/cuj0006philips/">
28  * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li>
29  * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
30  * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li>
31  * </ul>
32  *
33  * @author Apache Software Foundation
34  * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $
35  *
36  * @deprecated Please use {@link java.net.URL#openConnection} instead.
37  *     Please visit <a href="http://android-developers.blogspot.com/2011/09/androids-http-clients.html">this webpage</a>
38  *     for further details.
39  */
40 @Deprecated
41 public class DoubleMetaphone implements StringEncoder {
42 
43     /**
44      * "Vowels" to test for
45      */
46     private static final String VOWELS = "AEIOUY";
47 
48     /**
49      * Prefixes when present which are not pronounced
50      */
51     private static final String[] SILENT_START =
52     { "GN", "KN", "PN", "WR", "PS" };
53     private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
54     { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
55     private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
56     { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
57     private static final String[] L_T_K_S_N_M_B_Z =
58     { "L", "T", "K", "S", "N", "M", "B", "Z" };
59 
60     /**
61      * Maximum length of an encoding, default is 4
62      */
63     protected int maxCodeLen = 4;
64 
65     /**
66      * Creates an instance of this DoubleMetaphone encoder
67      */
DoubleMetaphone()68     public DoubleMetaphone() {
69         super();
70     }
71 
72     /**
73      * Encode a value with Double Metaphone
74      *
75      * @param value String to encode
76      * @return an encoded string
77      */
doubleMetaphone(String value)78     public String doubleMetaphone(String value) {
79         return doubleMetaphone(value, false);
80     }
81 
82     /**
83      * Encode a value with Double Metaphone, optionally using the alternate
84      * encoding.
85      *
86      * @param value String to encode
87      * @param alternate use alternate encode
88      * @return an encoded string
89      */
doubleMetaphone(String value, boolean alternate)90     public String doubleMetaphone(String value, boolean alternate) {
91         value = cleanInput(value);
92         if (value == null) {
93             return null;
94         }
95 
96         boolean slavoGermanic = isSlavoGermanic(value);
97         int index = isSilentStart(value) ? 1 : 0;
98 
99         DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
100 
101         while (!result.isComplete() && index <= value.length() - 1) {
102             switch (value.charAt(index)) {
103             case 'A':
104             case 'E':
105             case 'I':
106             case 'O':
107             case 'U':
108             case 'Y':
109                 index = handleAEIOUY(value, result, index);
110                 break;
111             case 'B':
112                 result.append('P');
113                 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
114                 break;
115             case '\u00C7':
116                 // A C with a Cedilla
117                 result.append('S');
118                 index++;
119                 break;
120             case 'C':
121                 index = handleC(value, result, index);
122                 break;
123             case 'D':
124                 index = handleD(value, result, index);
125                 break;
126             case 'F':
127                 result.append('F');
128                 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
129                 break;
130             case 'G':
131                 index = handleG(value, result, index, slavoGermanic);
132                 break;
133             case 'H':
134                 index = handleH(value, result, index);
135                 break;
136             case 'J':
137                 index = handleJ(value, result, index, slavoGermanic);
138                 break;
139             case 'K':
140                 result.append('K');
141                 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
142                 break;
143             case 'L':
144                 index = handleL(value, result, index);
145                 break;
146             case 'M':
147                 result.append('M');
148                 index = conditionM0(value, index) ? index + 2 : index + 1;
149                 break;
150             case 'N':
151                 result.append('N');
152                 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
153                 break;
154             case '\u00D1':
155                 // N with a tilde (spanish ene)
156                 result.append('N');
157                 index++;
158                 break;
159             case 'P':
160                 index = handleP(value, result, index);
161                 break;
162             case 'Q':
163                 result.append('K');
164                 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
165                 break;
166             case 'R':
167                 index = handleR(value, result, index, slavoGermanic);
168                 break;
169             case 'S':
170                 index = handleS(value, result, index, slavoGermanic);
171                 break;
172             case 'T':
173                 index = handleT(value, result, index);
174                 break;
175             case 'V':
176                 result.append('F');
177                 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
178                 break;
179             case 'W':
180                 index = handleW(value, result, index);
181                 break;
182             case 'X':
183                 index = handleX(value, result, index);
184                 break;
185             case 'Z':
186                 index = handleZ(value, result, index, slavoGermanic);
187                 break;
188             default:
189                 index++;
190                 break;
191             }
192         }
193 
194         return alternate ? result.getAlternate() : result.getPrimary();
195     }
196 
197     /**
198      * Encode the value using DoubleMetaphone.  It will only work if
199      * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
200      *
201      * @param obj Object to encode (should be of type String)
202      * @return An encoded Object (will be of type String)
203      * @throws EncoderException encode parameter is not of type String
204      */
encode(Object obj)205     public Object encode(Object obj) throws EncoderException {
206         if (!(obj instanceof String)) {
207             throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
208         }
209         return doubleMetaphone((String) obj);
210     }
211 
212     /**
213      * Encode the value using DoubleMetaphone.
214      *
215      * @param value String to encode
216      * @return An encoded String
217      */
encode(String value)218     public String encode(String value) {
219         return doubleMetaphone(value);
220     }
221 
222     /**
223      * Check if the Double Metaphone values of two <code>String</code> values
224      * are equal.
225      *
226      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
227      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
228      * @return <code>true</code> if the encoded <code>String</code>s are equal;
229      *          <code>false</code> otherwise.
230      * @see #isDoubleMetaphoneEqual(String,String,boolean)
231      */
isDoubleMetaphoneEqual(String value1, String value2)232     public boolean isDoubleMetaphoneEqual(String value1, String value2) {
233         return isDoubleMetaphoneEqual(value1, value2, false);
234     }
235 
236     /**
237      * Check if the Double Metaphone values of two <code>String</code> values
238      * are equal, optionally using the alternate value.
239      *
240      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
241      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
242      * @param alternate use the alternate value if <code>true</code>.
243      * @return <code>true</code> if the encoded <code>String</code>s are equal;
244      *          <code>false</code> otherwise.
245      */
isDoubleMetaphoneEqual(String value1, String value2, boolean alternate)246     public boolean isDoubleMetaphoneEqual(String value1,
247                                           String value2,
248                                           boolean alternate) {
249         return doubleMetaphone(value1, alternate).equals(doubleMetaphone
250                                                          (value2, alternate));
251     }
252 
253     /**
254      * Returns the maxCodeLen.
255      * @return int
256      */
getMaxCodeLen()257     public int getMaxCodeLen() {
258         return this.maxCodeLen;
259     }
260 
261     /**
262      * Sets the maxCodeLen.
263      * @param maxCodeLen The maxCodeLen to set
264      */
setMaxCodeLen(int maxCodeLen)265     public void setMaxCodeLen(int maxCodeLen) {
266         this.maxCodeLen = maxCodeLen;
267     }
268 
269     //-- BEGIN HANDLERS --//
270 
271     /**
272      * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases
273      */
handleAEIOUY(String value, DoubleMetaphoneResult result, int index)274     private int handleAEIOUY(String value, DoubleMetaphoneResult result, int
275                              index) {
276         if (index == 0) {
277             result.append('A');
278         }
279         return index + 1;
280     }
281 
282     /**
283      * Handles 'C' cases
284      */
handleC(String value, DoubleMetaphoneResult result, int index)285     private int handleC(String value,
286                         DoubleMetaphoneResult result,
287                         int index) {
288         if (conditionC0(value, index)) {  // very confusing, moved out
289             result.append('K');
290             index += 2;
291         } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
292             result.append('S');
293             index += 2;
294         } else if (contains(value, index, 2, "CH")) {
295             index = handleCH(value, result, index);
296         } else if (contains(value, index, 2, "CZ") &&
297                    !contains(value, index - 2, 4, "WICZ")) {
298             //-- "Czerny" --//
299             result.append('S', 'X');
300             index += 2;
301         } else if (contains(value, index + 1, 3, "CIA")) {
302             //-- "focaccia" --//
303             result.append('X');
304             index += 3;
305         } else if (contains(value, index, 2, "CC") &&
306                    !(index == 1 && charAt(value, 0) == 'M')) {
307             //-- double "cc" but not "McClelland" --//
308             return handleCC(value, result, index);
309         } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
310             result.append('K');
311             index += 2;
312         } else if (contains(value, index, 2, "CI", "CE", "CY")) {
313             //-- Italian vs. English --//
314             if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
315                 result.append('S', 'X');
316             } else {
317                 result.append('S');
318             }
319             index += 2;
320         } else {
321             result.append('K');
322             if (contains(value, index + 1, 2, " C", " Q", " G")) {
323                 //-- Mac Caffrey, Mac Gregor --//
324                 index += 3;
325             } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
326                        !contains(value, index + 1, 2, "CE", "CI")) {
327                 index += 2;
328             } else {
329                 index++;
330             }
331         }
332 
333         return index;
334     }
335 
336     /**
337      * Handles 'CC' cases
338      */
handleCC(String value, DoubleMetaphoneResult result, int index)339     private int handleCC(String value,
340                          DoubleMetaphoneResult result,
341                          int index) {
342         if (contains(value, index + 2, 1, "I", "E", "H") &&
343             !contains(value, index + 2, 2, "HU")) {
344             //-- "bellocchio" but not "bacchus" --//
345             if ((index == 1 && charAt(value, index - 1) == 'A') ||
346                 contains(value, index - 1, 5, "UCCEE", "UCCES")) {
347                 //-- "accident", "accede", "succeed" --//
348                 result.append("KS");
349             } else {
350                 //-- "bacci", "bertucci", other Italian --//
351                 result.append('X');
352             }
353             index += 3;
354         } else {    // Pierce's rule
355             result.append('K');
356             index += 2;
357         }
358 
359         return index;
360     }
361 
362     /**
363      * Handles 'CH' cases
364      */
handleCH(String value, DoubleMetaphoneResult result, int index)365     private int handleCH(String value,
366                          DoubleMetaphoneResult result,
367                          int index) {
368         if (index > 0 && contains(value, index, 4, "CHAE")) {   // Michael
369             result.append('K', 'X');
370             return index + 2;
371         } else if (conditionCH0(value, index)) {
372             //-- Greek roots ("chemistry", "chorus", etc.) --//
373             result.append('K');
374             return index + 2;
375         } else if (conditionCH1(value, index)) {
376             //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
377             result.append('K');
378             return index + 2;
379         } else {
380             if (index > 0) {
381                 if (contains(value, 0, 2, "MC")) {
382                     result.append('K');
383                 } else {
384                     result.append('X', 'K');
385                 }
386             } else {
387                 result.append('X');
388             }
389             return index + 2;
390         }
391     }
392 
393     /**
394      * Handles 'D' cases
395      */
handleD(String value, DoubleMetaphoneResult result, int index)396     private int handleD(String value,
397                         DoubleMetaphoneResult result,
398                         int index) {
399         if (contains(value, index, 2, "DG")) {
400             //-- "Edge" --//
401             if (contains(value, index + 2, 1, "I", "E", "Y")) {
402                 result.append('J');
403                 index += 3;
404                 //-- "Edgar" --//
405             } else {
406                 result.append("TK");
407                 index += 2;
408             }
409         } else if (contains(value, index, 2, "DT", "DD")) {
410             result.append('T');
411             index += 2;
412         } else {
413             result.append('T');
414             index++;
415         }
416         return index;
417     }
418 
419     /**
420      * Handles 'G' cases
421      */
handleG(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)422     private int handleG(String value,
423                         DoubleMetaphoneResult result,
424                         int index,
425                         boolean slavoGermanic) {
426         if (charAt(value, index + 1) == 'H') {
427             index = handleGH(value, result, index);
428         } else if (charAt(value, index + 1) == 'N') {
429             if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
430                 result.append("KN", "N");
431             } else if (!contains(value, index + 2, 2, "EY") &&
432                        charAt(value, index + 1) != 'Y' && !slavoGermanic) {
433                 result.append("N", "KN");
434             } else {
435                 result.append("KN");
436             }
437             index = index + 2;
438         } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
439             result.append("KL", "L");
440             index += 2;
441         } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
442             //-- -ges-, -gep-, -gel-, -gie- at beginning --//
443             result.append('K', 'J');
444             index += 2;
445         } else if ((contains(value, index + 1, 2, "ER") ||
446                     charAt(value, index + 1) == 'Y') &&
447                    !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
448                    !contains(value, index - 1, 1, "E", "I") &&
449                    !contains(value, index - 1, 3, "RGY", "OGY")) {
450             //-- -ger-, -gy- --//
451             result.append('K', 'J');
452             index += 2;
453         } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
454                    contains(value, index - 1, 4, "AGGI", "OGGI")) {
455             //-- Italian "biaggi" --//
456             if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) {
457                 //-- obvious germanic --//
458                 result.append('K');
459             } else if (contains(value, index + 1, 4, "IER")) {
460                 result.append('J');
461             } else {
462                 result.append('J', 'K');
463             }
464             index += 2;
465         } else if (charAt(value, index + 1) == 'G') {
466             index += 2;
467             result.append('K');
468         } else {
469             index++;
470             result.append('K');
471         }
472         return index;
473     }
474 
475     /**
476      * Handles 'GH' cases
477      */
handleGH(String value, DoubleMetaphoneResult result, int index)478     private int handleGH(String value,
479                          DoubleMetaphoneResult result,
480                          int index) {
481         if (index > 0 && !isVowel(charAt(value, index - 1))) {
482             result.append('K');
483             index += 2;
484         } else if (index == 0) {
485             if (charAt(value, index + 2) == 'I') {
486                 result.append('J');
487             } else {
488                 result.append('K');
489             }
490             index += 2;
491         } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
492                    (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
493                    (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
494             //-- Parker's rule (with some further refinements) - "hugh"
495             index += 2;
496         } else {
497             if (index > 2 && charAt(value, index - 1) == 'U' &&
498                 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
499                 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
500                 result.append('F');
501             } else if (index > 0 && charAt(value, index - 1) != 'I') {
502                 result.append('K');
503             }
504             index += 2;
505         }
506         return index;
507     }
508 
509     /**
510      * Handles 'H' cases
511      */
handleH(String value, DoubleMetaphoneResult result, int index)512     private int handleH(String value,
513                         DoubleMetaphoneResult result,
514                         int index) {
515         //-- only keep if first & before vowel or between 2 vowels --//
516         if ((index == 0 || isVowel(charAt(value, index - 1))) &&
517             isVowel(charAt(value, index + 1))) {
518             result.append('H');
519             index += 2;
520             //-- also takes car of "HH" --//
521         } else {
522             index++;
523         }
524         return index;
525     }
526 
527     /**
528      * Handles 'J' cases
529      */
handleJ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)530     private int handleJ(String value, DoubleMetaphoneResult result, int index,
531                         boolean slavoGermanic) {
532         if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
533                 //-- obvious Spanish, "Jose", "San Jacinto" --//
534                 if ((index == 0 && (charAt(value, index + 4) == ' ') ||
535                      value.length() == 4) || contains(value, 0, 4, "SAN ")) {
536                     result.append('H');
537                 } else {
538                     result.append('J', 'H');
539                 }
540                 index++;
541             } else {
542                 if (index == 0 && !contains(value, index, 4, "JOSE")) {
543                     result.append('J', 'A');
544                 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
545                               (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
546                     result.append('J', 'H');
547                 } else if (index == value.length() - 1) {
548                     result.append('J', ' ');
549                 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) {
550                     result.append('J');
551                 }
552 
553                 if (charAt(value, index + 1) == 'J') {
554                     index += 2;
555                 } else {
556                     index++;
557                 }
558             }
559         return index;
560     }
561 
562     /**
563      * Handles 'L' cases
564      */
handleL(String value, DoubleMetaphoneResult result, int index)565     private int handleL(String value,
566                         DoubleMetaphoneResult result,
567                         int index) {
568         result.append('L');
569         if (charAt(value, index + 1) == 'L') {
570             if (conditionL0(value, index)) {
571                 result.appendAlternate(' ');
572             }
573             index += 2;
574         } else {
575             index++;
576         }
577         return index;
578     }
579 
580     /**
581      * Handles 'P' cases
582      */
handleP(String value, DoubleMetaphoneResult result, int index)583     private int handleP(String value,
584                         DoubleMetaphoneResult result,
585                         int index) {
586         if (charAt(value, index + 1) == 'H') {
587             result.append('F');
588             index += 2;
589         } else {
590             result.append('P');
591             index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
592         }
593         return index;
594     }
595 
596     /**
597      * Handles 'R' cases
598      */
handleR(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)599     private int handleR(String value,
600                         DoubleMetaphoneResult result,
601                         int index,
602                         boolean slavoGermanic) {
603         if (index == value.length() - 1 && !slavoGermanic &&
604             contains(value, index - 2, 2, "IE") &&
605             !contains(value, index - 4, 2, "ME", "MA")) {
606             result.appendAlternate('R');
607         } else {
608             result.append('R');
609         }
610         return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
611     }
612 
613     /**
614      * Handles 'S' cases
615      */
handleS(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)616     private int handleS(String value,
617                         DoubleMetaphoneResult result,
618                         int index,
619                         boolean slavoGermanic) {
620         if (contains(value, index - 1, 3, "ISL", "YSL")) {
621             //-- special cases "island", "isle", "carlisle", "carlysle" --//
622             index++;
623         } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
624             //-- special case "sugar-" --//
625             result.append('X', 'S');
626             index++;
627         } else if (contains(value, index, 2, "SH")) {
628             if (contains(value, index + 1, 4,
629                          "HEIM", "HOEK", "HOLM", "HOLZ")) {
630                 //-- germanic --//
631                 result.append('S');
632             } else {
633                 result.append('X');
634             }
635             index += 2;
636         } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
637             //-- Italian and Armenian --//
638             if (slavoGermanic) {
639                 result.append('S');
640             } else {
641                 result.append('S', 'X');
642             }
643             index += 3;
644         } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) {
645             //-- german & anglicisations, e.g. "smith" match "schmidt" //
646             // "snider" match "schneider" --//
647             //-- also, -sz- in slavic language altho in hungarian it //
648             //   is pronounced "s" --//
649             result.append('S', 'X');
650             index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
651         } else if (contains(value, index, 2, "SC")) {
652             index = handleSC(value, result, index);
653         } else {
654             if (index == value.length() - 1 && contains(value, index - 2,
655                                                         2, "AI", "OI")){
656                 //-- french e.g. "resnais", "artois" --//
657                 result.appendAlternate('S');
658             } else {
659                 result.append('S');
660             }
661             index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
662         }
663         return index;
664     }
665 
666     /**
667      * Handles 'SC' cases
668      */
handleSC(String value, DoubleMetaphoneResult result, int index)669     private int handleSC(String value,
670                          DoubleMetaphoneResult result,
671                          int index) {
672         if (charAt(value, index + 2) == 'H') {
673             //-- Schlesinger's rule --//
674             if (contains(value, index + 3,
675                          2, "OO", "ER", "EN", "UY", "ED", "EM")) {
676                 //-- Dutch origin, e.g. "school", "schooner" --//
677                 if (contains(value, index + 3, 2, "ER", "EN")) {
678                     //-- "schermerhorn", "schenker" --//
679                     result.append("X", "SK");
680                 } else {
681                     result.append("SK");
682                 }
683             } else {
684                 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
685                     result.append('X', 'S');
686                 } else {
687                     result.append('X');
688                 }
689             }
690         } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
691             result.append('S');
692         } else {
693             result.append("SK");
694         }
695         return index + 3;
696     }
697 
698     /**
699      * Handles 'T' cases
700      */
handleT(String value, DoubleMetaphoneResult result, int index)701     private int handleT(String value,
702                         DoubleMetaphoneResult result,
703                         int index) {
704         if (contains(value, index, 4, "TION")) {
705             result.append('X');
706             index += 3;
707         } else if (contains(value, index, 3, "TIA", "TCH")) {
708             result.append('X');
709             index += 3;
710         } else if (contains(value, index, 2, "TH") || contains(value, index,
711                                                                3, "TTH")) {
712             if (contains(value, index + 2, 2, "OM", "AM") ||
713                 //-- special case "thomas", "thames" or germanic --//
714                 contains(value, 0, 4, "VAN ", "VON ") ||
715                 contains(value, 0, 3, "SCH")) {
716                 result.append('T');
717             } else {
718                 result.append('0', 'T');
719             }
720             index += 2;
721         } else {
722             result.append('T');
723             index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
724         }
725         return index;
726     }
727 
728     /**
729      * Handles 'W' cases
730      */
handleW(String value, DoubleMetaphoneResult result, int index)731     private int handleW(String value,
732                         DoubleMetaphoneResult result,
733                         int index) {
734         if (contains(value, index, 2, "WR")) {
735             //-- can also be in middle of word --//
736             result.append('R');
737             index += 2;
738         } else {
739             if (index == 0 && (isVowel(charAt(value, index + 1)) ||
740                                contains(value, index, 2, "WH"))) {
741                 if (isVowel(charAt(value, index + 1))) {
742                     //-- Wasserman should match Vasserman --//
743                     result.append('A', 'F');
744                 } else {
745                     //-- need Uomo to match Womo --//
746                     result.append('A');
747                 }
748                 index++;
749             } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
750                        contains(value, index - 1,
751                                 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
752                        contains(value, 0, 3, "SCH")) {
753                 //-- Arnow should match Arnoff --//
754                 result.appendAlternate('F');
755                 index++;
756             } else if (contains(value, index, 4, "WICZ", "WITZ")) {
757                 //-- Polish e.g. "filipowicz" --//
758                 result.append("TS", "FX");
759                 index += 4;
760             } else {
761                 index++;
762             }
763         }
764         return index;
765     }
766 
767     /**
768      * Handles 'X' cases
769      */
handleX(String value, DoubleMetaphoneResult result, int index)770     private int handleX(String value,
771                         DoubleMetaphoneResult result,
772                         int index) {
773         if (index == 0) {
774             result.append('S');
775             index++;
776         } else {
777             if (!((index == value.length() - 1) &&
778                   (contains(value, index - 3, 3, "IAU", "EAU") ||
779                    contains(value, index - 2, 2, "AU", "OU")))) {
780                 //-- French e.g. breaux --//
781                 result.append("KS");
782             }
783             index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
784         }
785         return index;
786     }
787 
788     /**
789      * Handles 'Z' cases
790      */
handleZ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)791     private int handleZ(String value, DoubleMetaphoneResult result, int index,
792                         boolean slavoGermanic) {
793         if (charAt(value, index + 1) == 'H') {
794             //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
795             result.append('J');
796             index += 2;
797         } else {
798             if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
799                 result.append("S", "TS");
800             } else {
801                 result.append('S');
802             }
803             index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
804         }
805         return index;
806     }
807 
808     //-- BEGIN CONDITIONS --//
809 
810     /**
811      * Complex condition 0 for 'C'
812      */
conditionC0(String value, int index)813     private boolean conditionC0(String value, int index) {
814         if (contains(value, index, 4, "CHIA")) {
815             return true;
816         } else if (index <= 1) {
817             return false;
818         } else if (isVowel(charAt(value, index - 2))) {
819             return false;
820         } else if (!contains(value, index - 1, 3, "ACH")) {
821             return false;
822         } else {
823             char c = charAt(value, index + 2);
824             return (c != 'I' && c != 'E')
825                     || contains(value, index - 2, 6, "BACHER", "MACHER");
826         }
827     }
828 
829     /**
830      * Complex condition 0 for 'CH'
831      */
conditionCH0(String value, int index)832     private boolean conditionCH0(String value, int index) {
833         if (index != 0) {
834             return false;
835         } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
836                    !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
837             return false;
838         } else if (contains(value, 0, 5, "CHORE")) {
839             return false;
840         } else {
841             return true;
842         }
843     }
844 
845     /**
846      * Complex condition 1 for 'CH'
847      */
conditionCH1(String value, int index)848     private boolean conditionCH1(String value, int index) {
849         return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0,
850                                                                    3, "SCH")) ||
851                 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
852                 contains(value, index + 2, 1, "T", "S") ||
853                 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
854                  (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
855     }
856 
857     /**
858      * Complex condition 0 for 'L'
859      */
conditionL0(String value, int index)860     private boolean conditionL0(String value, int index) {
861         if (index == value.length() - 3 &&
862             contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
863             return true;
864         } else if ((contains(value, index - 1, 2, "AS", "OS") ||
865                     contains(value, value.length() - 1, 1, "A", "O")) &&
866                    contains(value, index - 1, 4, "ALLE")) {
867             return true;
868         } else {
869             return false;
870         }
871     }
872 
873     /**
874      * Complex condition 0 for 'M'
875      */
conditionM0(String value, int index)876     private boolean conditionM0(String value, int index) {
877         if (charAt(value, index + 1) == 'M') {
878             return true;
879         }
880         return contains(value, index - 1, 3, "UMB")
881                 && ((index + 1) == value.length() - 1 || contains(value,
882                         index + 2, 2, "ER"));
883     }
884 
885     //-- BEGIN HELPER FUNCTIONS --//
886 
887     /**
     * Determines whether or not a value is of slavo-germanic origin. A value is
     * of slavo-germanic origin if it contains any of 'W', 'K', 'CZ', or 'WITZ'.
890      */
isSlavoGermanic(String value)891     private boolean isSlavoGermanic(String value) {
892         return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
893             value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
894     }
895 
896     /**
897      * Determines whether or not a character is a vowel or not
898      */
isVowel(char ch)899     private boolean isVowel(char ch) {
900         return VOWELS.indexOf(ch) != -1;
901     }
902 
903     /**
904      * Determines whether or not the value starts with a silent letter.  It will
905      * return <code>true</code> if the value starts with any of 'GN', 'KN',
906      * 'PN', 'WR' or 'PS'.
907      */
isSilentStart(String value)908     private boolean isSilentStart(String value) {
909         boolean result = false;
910         for (int i = 0; i < SILENT_START.length; i++) {
911             if (value.startsWith(SILENT_START[i])) {
912                 result = true;
913                 break;
914             }
915         }
916         return result;
917     }
918 
919     /**
920      * Cleans the input
921      */
cleanInput(String input)922     private String cleanInput(String input) {
923         if (input == null) {
924             return null;
925         }
926         input = input.trim();
927         if (input.length() == 0) {
928             return null;
929         }
930         return input.toUpperCase();
931     }
932 
933     /**
934      * Gets the character at index <code>index</code> if available, otherwise
935      * it returns <code>Character.MIN_VALUE</code> so that there is some sort
936      * of a default
937      */
charAt(String value, int index)938     protected char charAt(String value, int index) {
939         if (index < 0 || index >= value.length()) {
940             return Character.MIN_VALUE;
941         }
942         return value.charAt(index);
943     }
944 
945     /**
946      * Shortcut method with 1 criteria
947      */
contains(String value, int start, int length, String criteria)948     private static boolean contains(String value, int start, int length,
949                                     String criteria) {
950         return contains(value, start, length,
951                         new String[] { criteria });
952     }
953 
954     /**
955      * Shortcut method with 2 criteria
956      */
contains(String value, int start, int length, String criteria1, String criteria2)957     private static boolean contains(String value, int start, int length,
958                                     String criteria1, String criteria2) {
959         return contains(value, start, length,
960                         new String[] { criteria1, criteria2 });
961     }
962 
963     /**
964      * Shortcut method with 3 criteria
965      */
contains(String value, int start, int length, String criteria1, String criteria2, String criteria3)966     private static boolean contains(String value, int start, int length,
967                                     String criteria1, String criteria2,
968                                     String criteria3) {
969         return contains(value, start, length,
970                         new String[] { criteria1, criteria2, criteria3 });
971     }
972 
973     /**
974      * Shortcut method with 4 criteria
975      */
contains(String value, int start, int length, String criteria1, String criteria2, String criteria3, String criteria4)976     private static boolean contains(String value, int start, int length,
977                                     String criteria1, String criteria2,
978                                     String criteria3, String criteria4) {
979         return contains(value, start, length,
980                         new String[] { criteria1, criteria2, criteria3,
981                                        criteria4 });
982     }
983 
984     /**
985      * Shortcut method with 5 criteria
986      */
contains(String value, int start, int length, String criteria1, String criteria2, String criteria3, String criteria4, String criteria5)987     private static boolean contains(String value, int start, int length,
988                                     String criteria1, String criteria2,
989                                     String criteria3, String criteria4,
990                                     String criteria5) {
991         return contains(value, start, length,
992                         new String[] { criteria1, criteria2, criteria3,
993                                        criteria4, criteria5 });
994     }
995 
996     /**
997      * Shortcut method with 6 criteria
998      */
contains(String value, int start, int length, String criteria1, String criteria2, String criteria3, String criteria4, String criteria5, String criteria6)999     private static boolean contains(String value, int start, int length,
1000                                     String criteria1, String criteria2,
1001                                     String criteria3, String criteria4,
1002                                     String criteria5, String criteria6) {
1003         return contains(value, start, length,
1004                         new String[] { criteria1, criteria2, criteria3,
1005                                        criteria4, criteria5, criteria6 });
1006     }
1007 
1008     /**
     * Determines whether <code>value</code> contains any of the criteria
     * starting at index <code>start</code> and matching up to length
     * <code>length</code>
1012      */
contains(String value, int start, int length, String[] criteria)1013     protected static boolean contains(String value, int start, int length,
1014                                       String[] criteria) {
1015         boolean result = false;
1016         if (start >= 0 && start + length <= value.length()) {
1017             String target = value.substring(start, start + length);
1018 
1019             for (int i = 0; i < criteria.length; i++) {
1020                 if (target.equals(criteria[i])) {
1021                     result = true;
1022                     break;
1023                 }
1024             }
1025         }
1026         return result;
1027     }
1028 
1029     //-- BEGIN INNER CLASSES --//
1030 
1031     /**
1032      * Inner class for storing results, since there is the optional alternate
1033      * encoding.
1034      */
1035     public class DoubleMetaphoneResult {
1036 
1037         private StringBuffer primary = new StringBuffer(getMaxCodeLen());
1038         private StringBuffer alternate = new StringBuffer(getMaxCodeLen());
1039         private int maxLength;
1040 
DoubleMetaphoneResult(int maxLength)1041         public DoubleMetaphoneResult(int maxLength) {
1042             this.maxLength = maxLength;
1043         }
1044 
append(char value)1045         public void append(char value) {
1046             appendPrimary(value);
1047             appendAlternate(value);
1048         }
1049 
append(char primary, char alternate)1050         public void append(char primary, char alternate) {
1051             appendPrimary(primary);
1052             appendAlternate(alternate);
1053         }
1054 
appendPrimary(char value)1055         public void appendPrimary(char value) {
1056             if (this.primary.length() < this.maxLength) {
1057                 this.primary.append(value);
1058             }
1059         }
1060 
appendAlternate(char value)1061         public void appendAlternate(char value) {
1062             if (this.alternate.length() < this.maxLength) {
1063                 this.alternate.append(value);
1064             }
1065         }
1066 
append(String value)1067         public void append(String value) {
1068             appendPrimary(value);
1069             appendAlternate(value);
1070         }
1071 
append(String primary, String alternate)1072         public void append(String primary, String alternate) {
1073             appendPrimary(primary);
1074             appendAlternate(alternate);
1075         }
1076 
appendPrimary(String value)1077         public void appendPrimary(String value) {
1078             int addChars = this.maxLength - this.primary.length();
1079             if (value.length() <= addChars) {
1080                 this.primary.append(value);
1081             } else {
1082                 this.primary.append(value.substring(0, addChars));
1083             }
1084         }
1085 
appendAlternate(String value)1086         public void appendAlternate(String value) {
1087             int addChars = this.maxLength - this.alternate.length();
1088             if (value.length() <= addChars) {
1089                 this.alternate.append(value);
1090             } else {
1091                 this.alternate.append(value.substring(0, addChars));
1092             }
1093         }
1094 
getPrimary()1095         public String getPrimary() {
1096             return this.primary.toString();
1097         }
1098 
getAlternate()1099         public String getAlternate() {
1100             return this.alternate.toString();
1101         }
1102 
isComplete()1103         public boolean isComplete() {
1104             return this.primary.length() >= this.maxLength &&
1105                 this.alternate.length() >= this.maxLength;
1106         }
1107     }
1108 }
1109