1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License 15 */ 16 package com.android.providers.contacts.aggregation.util; 17 18 import java.util.Arrays; 19 20 /** 21 * A string distance calculator, particularly suited for name matching. 22 * <p> 23 * A detailed discussion of the topic of record linkage in general and name matching 24 * in particular can be found in this article: 25 * <blockquote> 26 * Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions". 27 * Research Report Series, RRS. 28 * </blockquote> 29 */ 30 public class NameDistance { 31 32 private static final float WINKLER_BONUS_THRESHOLD = 0.7f; 33 private static final int MIN_EXACT_PREFIX_LENGTH = 3; 34 35 private final int mMaxLength; 36 private final boolean mPrefixOnly; 37 private final boolean[] mMatchFlags1; 38 private final boolean[] mMatchFlags2; 39 40 /** 41 * Constructor. 42 * 43 * @param maxLength byte arrays are truncated if longer than this number 44 */ NameDistance(int maxLength)45 public NameDistance(int maxLength) { 46 mMaxLength = maxLength; 47 mPrefixOnly = false; 48 mMatchFlags1 = new boolean[maxLength]; 49 mMatchFlags2 = new boolean[maxLength]; 50 } 51 52 /** 53 * Constructor for a matcher that only checks if one string is the exact prefix of the other 54 */ NameDistance()55 public NameDistance() { 56 mPrefixOnly = true; 57 mMaxLength = 0; 58 mMatchFlags1 = mMatchFlags2 = null; 59 } 60 61 /** 62 * Computes a string distance between two normalized strings passed as byte arrays. 63 */ getDistance(byte bytes1[], byte bytes2[])64 public float getDistance(byte bytes1[], byte bytes2[]) { 65 byte[] array1, array2; 66 67 if (bytes1.length > bytes2.length) { 68 array2 = bytes1; 69 array1 = bytes2; 70 } else { 71 array2 = bytes2; 72 array1 = bytes1; 73 } 74 75 int length1 = array1.length; 76 if (length1 >= MIN_EXACT_PREFIX_LENGTH) { 77 boolean prefix = true; 78 for (int i = 0; i < array1.length; i++) { 79 if (array1[i] != array2[i]) { 80 prefix = false; 81 break; 82 } 83 } 84 if (prefix) { 85 return 1.0f; 86 } 87 } 88 89 if (mPrefixOnly) { 90 return 0; 91 } 92 93 if (length1 > mMaxLength) { 94 length1 = mMaxLength; 95 } 96 97 int length2 = array2.length; 98 if (length2 > mMaxLength) { 99 length2 = mMaxLength; 100 } 101 102 Arrays.fill(mMatchFlags1, 0, length1, false); 103 Arrays.fill(mMatchFlags2, 0, length2, false); 104 105 int range = length2 / 2 - 1; 106 if (range < 0) { 107 range = 0; 108 } 109 110 int matches = 0; 111 for (int i = 0; i < length1; i++) { 112 byte c1 = array1[i]; 113 114 int from = i - range; 115 if (from < 0) { 116 from = 0; 117 } 118 119 int to = i + range + 1; 120 if (to > length2) { 121 to = length2; 122 } 123 124 for (int j = from; j < to; j++) { 125 if (!mMatchFlags2[j] && c1 == array2[j]) { 126 mMatchFlags1[i] = mMatchFlags2[j] = true; 127 matches++; 128 break; 129 } 130 } 131 } 132 133 if (matches == 0) { 134 return 0f; 135 } 136 137 int transpositions = 0; 138 int j = 0; 139 for (int i = 0; i < length1; i++) { 140 if (mMatchFlags1[i]) { 141 while (!mMatchFlags2[j]) { 142 j++; 143 } 144 if (array1[i] != array2[j]) { 145 transpositions++; 146 } 147 j++; 148 } 149 } 150 151 float m = matches; 152 float jaro = ((m / length1 + m / length2 + (m - (transpositions / 2f)) / m)) / 3; 153 154 if (jaro < WINKLER_BONUS_THRESHOLD) { 155 return jaro; 156 } 157 158 // Add Winkler bonus 159 int prefix = 0; 160 for (int i = 0; i < length1; i++) { 161 if (bytes1[i] != bytes2[i]) { 162 break; 163 } 164 prefix++; 165 } 166 167 return jaro + Math.min(0.1f, 1f / length2) * prefix * (1 - jaro); 168 } 169 } 170