Jeff Davidson | a192cc2 | 2018-02-08 15:30:06 -0800 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2018 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | package android.webkit; |
| 18 | |
| 19 | import java.util.Locale; |
| 20 | import java.util.regex.MatchResult; |
| 21 | import java.util.regex.Matcher; |
| 22 | import java.util.regex.Pattern; |
| 23 | |
| 24 | /** |
| 25 | * Java implementation of legacy WebView.findAddress algorithm. |
| 26 | * |
| 27 | * @hide |
| 28 | */ |
| 29 | class FindAddress { |
| 30 | static class ZipRange { |
| 31 | int mLow; |
| 32 | int mHigh; |
| 33 | int mException1; |
| 34 | int mException2; |
| 35 | ZipRange(int low, int high, int exception1, int exception2) { |
| 36 | mLow = low; |
| 37 | mHigh = high; |
| 38 | mException1 = exception1; |
| 39 | mException2 = exception1; |
| 40 | } |
| 41 | boolean matches(String zipCode) { |
| 42 | int prefix = Integer.parseInt(zipCode.substring(0, 2)); |
| 43 | return (mLow <= prefix && prefix <= mHigh) || prefix == mException1 |
| 44 | || prefix == mException2; |
| 45 | } |
| 46 | } |
| 47 | |
| 48 | // Addresses consist of at least this many words, not including state and zip code. |
| 49 | private static final int MIN_ADDRESS_WORDS = 4; |
| 50 | |
| 51 | // Adddresses consist of at most this many words, not including state and zip code. |
| 52 | private static final int MAX_ADDRESS_WORDS = 14; |
| 53 | |
| 54 | // Addresses consist of at most this many lines. |
| 55 | private static final int MAX_ADDRESS_LINES = 5; |
| 56 | |
| 57 | // No words in an address are longer than this many characters. |
| 58 | private static final int kMaxAddressNameWordLength = 25; |
| 59 | |
| 60 | // Location name should be in the first MAX_LOCATION_NAME_DISTANCE words |
| 61 | private static final int MAX_LOCATION_NAME_DISTANCE = 5; |
| 62 | |
| 63 | private static final ZipRange[] sStateZipCodeRanges = { |
| 64 | new ZipRange(99, 99, -1, -1), // AK Alaska. |
| 65 | new ZipRange(35, 36, -1, -1), // AL Alabama. |
| 66 | new ZipRange(71, 72, -1, -1), // AR Arkansas. |
| 67 | new ZipRange(96, 96, -1, -1), // AS American Samoa. |
| 68 | new ZipRange(85, 86, -1, -1), // AZ Arizona. |
| 69 | new ZipRange(90, 96, -1, -1), // CA California. |
| 70 | new ZipRange(80, 81, -1, -1), // CO Colorado. |
| 71 | new ZipRange(6, 6, -1, -1), // CT Connecticut. |
| 72 | new ZipRange(20, 20, -1, -1), // DC District of Columbia. |
| 73 | new ZipRange(19, 19, -1, -1), // DE Delaware. |
| 74 | new ZipRange(32, 34, -1, -1), // FL Florida. |
| 75 | new ZipRange(96, 96, -1, -1), // FM Federated States of Micronesia. |
| 76 | new ZipRange(30, 31, -1, -1), // GA Georgia. |
| 77 | new ZipRange(96, 96, -1, -1), // GU Guam. |
| 78 | new ZipRange(96, 96, -1, -1), // HI Hawaii. |
| 79 | new ZipRange(50, 52, -1, -1), // IA Iowa. |
| 80 | new ZipRange(83, 83, -1, -1), // ID Idaho. |
| 81 | new ZipRange(60, 62, -1, -1), // IL Illinois. |
| 82 | new ZipRange(46, 47, -1, -1), // IN Indiana. |
| 83 | new ZipRange(66, 67, 73, -1), // KS Kansas. |
| 84 | new ZipRange(40, 42, -1, -1), // KY Kentucky. |
| 85 | new ZipRange(70, 71, -1, -1), // LA Louisiana. |
| 86 | new ZipRange(1, 2, -1, -1), // MA Massachusetts. |
| 87 | new ZipRange(20, 21, -1, -1), // MD Maryland. |
| 88 | new ZipRange(3, 4, -1, -1), // ME Maine. |
| 89 | new ZipRange(96, 96, -1, -1), // MH Marshall Islands. |
| 90 | new ZipRange(48, 49, -1, -1), // MI Michigan. |
| 91 | new ZipRange(55, 56, -1, -1), // MN Minnesota. |
| 92 | new ZipRange(63, 65, -1, -1), // MO Missouri. |
| 93 | new ZipRange(96, 96, -1, -1), // MP Northern Mariana Islands. |
| 94 | new ZipRange(38, 39, -1, -1), // MS Mississippi. |
| 95 | new ZipRange(55, 56, -1, -1), // MT Montana. |
| 96 | new ZipRange(27, 28, -1, -1), // NC North Carolina. |
| 97 | new ZipRange(58, 58, -1, -1), // ND North Dakota. |
| 98 | new ZipRange(68, 69, -1, -1), // NE Nebraska. |
| 99 | new ZipRange(3, 4, -1, -1), // NH New Hampshire. |
| 100 | new ZipRange(7, 8, -1, -1), // NJ New Jersey. |
| 101 | new ZipRange(87, 88, 86, -1), // NM New Mexico. |
| 102 | new ZipRange(88, 89, 96, -1), // NV Nevada. |
| 103 | new ZipRange(10, 14, 0, 6), // NY New York. |
| 104 | new ZipRange(43, 45, -1, -1), // OH Ohio. |
| 105 | new ZipRange(73, 74, -1, -1), // OK Oklahoma. |
| 106 | new ZipRange(97, 97, -1, -1), // OR Oregon. |
| 107 | new ZipRange(15, 19, -1, -1), // PA Pennsylvania. |
| 108 | new ZipRange(6, 6, 0, 9), // PR Puerto Rico. |
| 109 | new ZipRange(96, 96, -1, -1), // PW Palau. |
| 110 | new ZipRange(2, 2, -1, -1), // RI Rhode Island. |
| 111 | new ZipRange(29, 29, -1, -1), // SC South Carolina. |
| 112 | new ZipRange(57, 57, -1, -1), // SD South Dakota. |
| 113 | new ZipRange(37, 38, -1, -1), // TN Tennessee. |
| 114 | new ZipRange(75, 79, 87, 88), // TX Texas. |
| 115 | new ZipRange(84, 84, -1, -1), // UT Utah. |
| 116 | new ZipRange(22, 24, 20, -1), // VA Virginia. |
| 117 | new ZipRange(6, 9, -1, -1), // VI Virgin Islands. |
| 118 | new ZipRange(5, 5, -1, -1), // VT Vermont. |
| 119 | new ZipRange(98, 99, -1, -1), // WA Washington. |
| 120 | new ZipRange(53, 54, -1, -1), // WI Wisconsin. |
| 121 | new ZipRange(24, 26, -1, -1), // WV West Virginia. |
| 122 | new ZipRange(82, 83, -1, -1) // WY Wyoming. |
| 123 | }; |
| 124 | |
| 125 | // Newlines |
| 126 | private static final String NL = "\n\u000B\u000C\r\u0085\u2028\u2029"; |
| 127 | |
| 128 | // Space characters |
| 129 | private static final String SP = "\u0009\u0020\u00A0\u1680\u2000\u2001" |
| 130 | + "\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u202F" |
| 131 | + "\u205F\u3000"; |
| 132 | |
| 133 | // Whitespace |
| 134 | private static final String WS = SP + NL; |
| 135 | |
| 136 | // Characters that are considered word delimiters. |
| 137 | private static final String WORD_DELIM = ",*\u2022" + WS; |
| 138 | |
| 139 | // Lookahead for word end. |
| 140 | private static final String WORD_END = "(?=[" + WORD_DELIM + "]|$)"; |
| 141 | |
| 142 | // Address words are a sequence of non-delimiter characters. |
| 143 | private static final Pattern sWordRe = |
| 144 | Pattern.compile("[^" + WORD_DELIM + "]+" + WORD_END, Pattern.CASE_INSENSITIVE); |
| 145 | |
| 146 | // Characters that are considered suffix delimiters for house numbers. |
| 147 | private static final String HOUSE_POST_DELIM = ",\"'" + WS; |
| 148 | |
| 149 | // Lookahead for house end. |
| 150 | private static final String HOUSE_END = "(?=[" + HOUSE_POST_DELIM + "]|$)"; |
| 151 | |
| 152 | // Characters that are considered prefix delimiters for house numbers. |
| 153 | private static final String HOUSE_PRE_DELIM = ":" + HOUSE_POST_DELIM; |
| 154 | |
| 155 | // A house number component is "one" or a number, optionally |
| 156 | // followed by a single alphabetic character, or |
| 157 | private static final String HOUSE_COMPONENT = "(?:one|\\d+([a-z](?=[^a-z]|$)|st|nd|rd|th)?)"; |
| 158 | |
| 159 | // House numbers are a repetition of |HOUSE_COMPONENT|, separated by -, and followed by |
| 160 | // a delimiter character. |
| 161 | private static final Pattern sHouseNumberRe = |
| 162 | Pattern.compile(HOUSE_COMPONENT + "(?:-" + HOUSE_COMPONENT + ")*" + HOUSE_END, |
| 163 | Pattern.CASE_INSENSITIVE); |
| 164 | |
| 165 | // XXX: do we want to accept whitespace other than 0x20 in state names? |
| 166 | private static final Pattern sStateRe = Pattern.compile("(?:" |
| 167 | + "(ak|alaska)|" |
| 168 | + "(al|alabama)|" |
| 169 | + "(ar|arkansas)|" |
| 170 | + "(as|american[" + SP + "]+samoa)|" |
| 171 | + "(az|arizona)|" |
| 172 | + "(ca|california)|" |
| 173 | + "(co|colorado)|" |
| 174 | + "(ct|connecticut)|" |
| 175 | + "(dc|district[" + SP + "]+of[" + SP + "]+columbia)|" |
| 176 | + "(de|delaware)|" |
| 177 | + "(fl|florida)|" |
| 178 | + "(fm|federated[" + SP + "]+states[" + SP + "]+of[" + SP + "]+micronesia)|" |
| 179 | + "(ga|georgia)|" |
| 180 | + "(gu|guam)|" |
| 181 | + "(hi|hawaii)|" |
| 182 | + "(ia|iowa)|" |
| 183 | + "(id|idaho)|" |
| 184 | + "(il|illinois)|" |
| 185 | + "(in|indiana)|" |
| 186 | + "(ks|kansas)|" |
| 187 | + "(ky|kentucky)|" |
| 188 | + "(la|louisiana)|" |
| 189 | + "(ma|massachusetts)|" |
| 190 | + "(md|maryland)|" |
| 191 | + "(me|maine)|" |
| 192 | + "(mh|marshall[" + SP + "]+islands)|" |
| 193 | + "(mi|michigan)|" |
| 194 | + "(mn|minnesota)|" |
| 195 | + "(mo|missouri)|" |
| 196 | + "(mp|northern[" + SP + "]+mariana[" + SP + "]+islands)|" |
| 197 | + "(ms|mississippi)|" |
| 198 | + "(mt|montana)|" |
| 199 | + "(nc|north[" + SP + "]+carolina)|" |
| 200 | + "(nd|north[" + SP + "]+dakota)|" |
| 201 | + "(ne|nebraska)|" |
| 202 | + "(nh|new[" + SP + "]+hampshire)|" |
| 203 | + "(nj|new[" + SP + "]+jersey)|" |
| 204 | + "(nm|new[" + SP + "]+mexico)|" |
| 205 | + "(nv|nevada)|" |
| 206 | + "(ny|new[" + SP + "]+york)|" |
| 207 | + "(oh|ohio)|" |
| 208 | + "(ok|oklahoma)|" |
| 209 | + "(or|oregon)|" |
| 210 | + "(pa|pennsylvania)|" |
| 211 | + "(pr|puerto[" + SP + "]+rico)|" |
| 212 | + "(pw|palau)|" |
| 213 | + "(ri|rhode[" + SP + "]+island)|" |
| 214 | + "(sc|south[" + SP + "]+carolina)|" |
| 215 | + "(sd|south[" + SP + "]+dakota)|" |
| 216 | + "(tn|tennessee)|" |
| 217 | + "(tx|texas)|" |
| 218 | + "(ut|utah)|" |
| 219 | + "(va|virginia)|" |
| 220 | + "(vi|virgin[" + SP + "]+islands)|" |
| 221 | + "(vt|vermont)|" |
| 222 | + "(wa|washington)|" |
| 223 | + "(wi|wisconsin)|" |
| 224 | + "(wv|west[" + SP + "]+virginia)|" |
| 225 | + "(wy|wyoming)" |
| 226 | + ")" + WORD_END, |
| 227 | Pattern.CASE_INSENSITIVE); |
| 228 | |
| 229 | private static final Pattern sLocationNameRe = Pattern.compile("(?:" |
| 230 | + "alley|annex|arcade|ave[.]?|avenue|alameda|bayou|" |
| 231 | + "beach|bend|bluffs?|bottom|boulevard|branch|bridge|" |
| 232 | + "brooks?|burgs?|bypass|broadway|camino|camp|canyon|" |
| 233 | + "cape|causeway|centers?|circles?|cliffs?|club|common|" |
| 234 | + "corners?|course|courts?|coves?|creek|crescent|crest|" |
| 235 | + "crossing|crossroad|curve|circulo|dale|dam|divide|" |
| 236 | + "drives?|estates?|expressway|extensions?|falls?|ferry|" |
| 237 | + "fields?|flats?|fords?|forest|forges?|forks?|fort|" |
| 238 | + "freeway|gardens?|gateway|glens?|greens?|groves?|" |
| 239 | + "harbors?|haven|heights|highway|hills?|hollow|inlet|" |
| 240 | + "islands?|isle|junctions?|keys?|knolls?|lakes?|land|" |
| 241 | + "landing|lane|lights?|loaf|locks?|lodge|loop|mall|" |
| 242 | + "manors?|meadows?|mews|mills?|mission|motorway|mount|" |
| 243 | + "mountains?|neck|orchard|oval|overpass|parks?|" |
| 244 | + "parkways?|pass|passage|path|pike|pines?|plains?|" |
| 245 | + "plaza|points?|ports?|prairie|privada|radial|ramp|" |
| 246 | + "ranch|rapids?|rd[.]?|rest|ridges?|river|roads?|route|" |
| 247 | + "row|rue|run|shoals?|shores?|skyway|springs?|spurs?|" |
| 248 | + "squares?|station|stravenue|stream|st[.]?|streets?|" |
| 249 | + "summit|speedway|terrace|throughway|trace|track|" |
| 250 | + "trafficway|trail|tunnel|turnpike|underpass|unions?|" |
| 251 | + "valleys?|viaduct|views?|villages?|ville|vista|walks?|" |
| 252 | + "wall|ways?|wells?|xing|xrd)" + WORD_END, |
| 253 | Pattern.CASE_INSENSITIVE); |
| 254 | |
| 255 | private static final Pattern sSuffixedNumberRe = |
| 256 | Pattern.compile("(\\d+)(st|nd|rd|th)", Pattern.CASE_INSENSITIVE); |
| 257 | |
| 258 | private static final Pattern sZipCodeRe = |
| 259 | Pattern.compile("(?:\\d{5}(?:-\\d{4})?)" + WORD_END, Pattern.CASE_INSENSITIVE); |
| 260 | |
| 261 | private static boolean checkHouseNumber(String houseNumber) { |
| 262 | // Make sure that there are at most 5 digits. |
| 263 | int digitCount = 0; |
| 264 | for (int i = 0; i < houseNumber.length(); ++i) { |
| 265 | if (Character.isDigit(houseNumber.charAt(i))) ++digitCount; |
| 266 | } |
| 267 | if (digitCount > 5) return false; |
| 268 | |
| 269 | // Make sure that any ordinals are valid. |
| 270 | Matcher suffixMatcher = sSuffixedNumberRe.matcher(houseNumber); |
| 271 | while (suffixMatcher.find()) { |
| 272 | int num = Integer.parseInt(suffixMatcher.group(1)); |
| 273 | if (num == 0) { |
| 274 | return false; // 0th is invalid. |
| 275 | } |
| 276 | String suffix = suffixMatcher.group(2).toLowerCase(Locale.getDefault()); |
| 277 | switch (num % 10) { |
| 278 | case 1: |
| 279 | return suffix.equals(num % 100 == 11 ? "th" : "st"); |
| 280 | case 2: |
| 281 | return suffix.equals(num % 100 == 12 ? "th" : "nd"); |
| 282 | case 3: |
| 283 | return suffix.equals(num % 100 == 13 ? "th" : "rd"); |
| 284 | default: |
| 285 | return suffix.equals("th"); |
| 286 | } |
| 287 | } |
| 288 | return true; |
| 289 | } |
| 290 | |
| 291 | /** |
| 292 | * Attempt to match a house number beginnning at position offset |
| 293 | * in content. The house number must be followed by a word |
| 294 | * delimiter or the end of the string, and if offset is non-zero, |
| 295 | * then it must also be preceded by a word delimiter. |
| 296 | * |
| 297 | * @return a MatchResult if a valid house number was found. |
| 298 | */ |
| 299 | private static MatchResult matchHouseNumber(String content, int offset) { |
| 300 | if (offset > 0 && HOUSE_PRE_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null; |
| 301 | Matcher matcher = sHouseNumberRe.matcher(content).region(offset, content.length()); |
| 302 | if (matcher.lookingAt()) { |
| 303 | MatchResult matchResult = matcher.toMatchResult(); |
| 304 | if (checkHouseNumber(matchResult.group(0))) return matchResult; |
| 305 | } |
| 306 | return null; |
| 307 | } |
| 308 | |
| 309 | /** |
| 310 | * Attempt to match a US state beginnning at position offset in |
| 311 | * content. The matching state must be followed by a word |
| 312 | * delimiter or the end of the string, and if offset is non-zero, |
| 313 | * then it must also be preceded by a word delimiter. |
| 314 | * |
| 315 | * @return a MatchResult if a valid US state (or two letter code) |
| 316 | * was found. |
| 317 | */ |
| 318 | private static MatchResult matchState(String content, int offset) { |
| 319 | if (offset > 0 && WORD_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null; |
| 320 | Matcher stateMatcher = sStateRe.matcher(content).region(offset, content.length()); |
| 321 | return stateMatcher.lookingAt() ? stateMatcher.toMatchResult() : null; |
| 322 | } |
| 323 | |
| 324 | /** |
| 325 | * Test whether zipCode matches the U.S. zip code format (ddddd or |
| 326 | * ddddd-dddd) and is within the expected range, given that |
| 327 | * stateMatch is a match of sStateRe. |
| 328 | * |
| 329 | * @return true if zipCode is a valid zip code, is legal for the |
| 330 | * matched state, and is followed by a word delimiter or the end |
| 331 | * of the string. |
| 332 | */ |
| 333 | private static boolean isValidZipCode(String zipCode, MatchResult stateMatch) { |
| 334 | if (stateMatch == null) return false; |
| 335 | // Work out the index of the state, based on which group matched. |
| 336 | int stateIndex = stateMatch.groupCount(); |
| 337 | while (stateIndex > 0) { |
| 338 | if (stateMatch.group(stateIndex--) != null) break; |
| 339 | } |
| 340 | return sZipCodeRe.matcher(zipCode).matches() |
| 341 | && sStateZipCodeRanges[stateIndex].matches(zipCode); |
| 342 | } |
| 343 | |
| 344 | /** |
| 345 | * Test whether location is one of the valid locations. |
| 346 | * |
| 347 | * @return true if location starts with a valid location name |
| 348 | * followed by a word delimiter or the end of the string. |
| 349 | */ |
| 350 | private static boolean isValidLocationName(String location) { |
| 351 | return sLocationNameRe.matcher(location).matches(); |
| 352 | } |
| 353 | |
| 354 | /** |
| 355 | * Attempt to match a complete address in content, starting with |
| 356 | * houseNumberMatch. |
| 357 | * |
| 358 | * @param content The string to search. |
| 359 | * @param houseNumberMatch A matching house number to start extending. |
| 360 | * @return +ve: the end of the match |
| 361 | * +ve: the position to restart searching for house numbers, negated. |
| 362 | */ |
| 363 | private static int attemptMatch(String content, MatchResult houseNumberMatch) { |
| 364 | int restartPos = -1; |
| 365 | int nonZipMatch = -1; |
| 366 | int it = houseNumberMatch.end(); |
| 367 | int numLines = 1; |
| 368 | boolean consecutiveHouseNumbers = true; |
| 369 | boolean foundLocationName = false; |
| 370 | int wordCount = 1; |
| 371 | String lastWord = ""; |
| 372 | |
| 373 | Matcher matcher = sWordRe.matcher(content); |
| 374 | |
| 375 | for (; it < content.length(); lastWord = matcher.group(0), it = matcher.end()) { |
| 376 | if (!matcher.find(it)) { |
| 377 | // No more words in the input sequence. |
| 378 | return -content.length(); |
| 379 | } |
| 380 | if (matcher.end() - matcher.start() > kMaxAddressNameWordLength) { |
| 381 | // Word is too long to be part of an address. Fail. |
| 382 | return -matcher.end(); |
| 383 | } |
| 384 | |
| 385 | // Count the number of newlines we just consumed. |
| 386 | while (it < matcher.start()) { |
| 387 | if (NL.indexOf(content.charAt(it++)) != -1) ++numLines; |
| 388 | } |
| 389 | |
| 390 | // Consumed too many lines. Fail. |
| 391 | if (numLines > MAX_ADDRESS_LINES) break; |
| 392 | |
| 393 | // Consumed too many words. Fail. |
| 394 | if (++wordCount > MAX_ADDRESS_WORDS) break; |
| 395 | |
| 396 | if (matchHouseNumber(content, it) != null) { |
| 397 | if (consecutiveHouseNumbers && numLines > 1) { |
| 398 | // Last line ended with a number, and this this line starts with one. |
| 399 | // Restart at this number. |
| 400 | return -it; |
| 401 | } |
| 402 | // Remember the position of this match as the restart position. |
| 403 | if (restartPos == -1) restartPos = it; |
| 404 | continue; |
| 405 | } |
| 406 | |
| 407 | consecutiveHouseNumbers = false; |
| 408 | |
| 409 | if (isValidLocationName(matcher.group(0))) { |
| 410 | foundLocationName = true; |
| 411 | continue; |
| 412 | } |
| 413 | |
| 414 | if (wordCount == MAX_LOCATION_NAME_DISTANCE && !foundLocationName) { |
| 415 | // Didn't find a location name in time. Fail. |
| 416 | it = matcher.end(); |
| 417 | break; |
| 418 | } |
| 419 | |
| 420 | if (foundLocationName && wordCount > MIN_ADDRESS_WORDS) { |
| 421 | // We can now attempt to match a state. |
| 422 | MatchResult stateMatch = matchState(content, it); |
| 423 | if (stateMatch != null) { |
| 424 | if (lastWord.equals("et") && stateMatch.group(0).equals("al")) { |
| 425 | // Reject "et al" as a false postitive. |
| 426 | it = stateMatch.end(); |
| 427 | break; |
| 428 | } |
| 429 | |
| 430 | // At this point we've matched a state; try to match a zip code after it. |
| 431 | Matcher zipMatcher = sWordRe.matcher(content); |
Justin Klaassen | b8042fc | 2018-04-15 00:41:15 -0400 | [diff] [blame] | 432 | if (zipMatcher.find(stateMatch.end())) { |
| 433 | if (isValidZipCode(zipMatcher.group(0), stateMatch)) { |
| 434 | return zipMatcher.end(); |
| 435 | } |
| 436 | } else { |
| 437 | // The content ends with a state but no zip |
| 438 | // code. This is a legal match according to the |
| 439 | // documentation. N.B. This is equivalent to the |
| 440 | // original c++ implementation, which only allowed |
| 441 | // the zip code to be optional at the end of the |
| 442 | // string, which presumably is a bug. We tried |
| 443 | // relaxing this to work in other places but it |
| 444 | // caused too many false positives. |
| 445 | nonZipMatch = stateMatch.end(); |
Jeff Davidson | a192cc2 | 2018-02-08 15:30:06 -0800 | [diff] [blame] | 446 | } |
Jeff Davidson | a192cc2 | 2018-02-08 15:30:06 -0800 | [diff] [blame] | 447 | } |
| 448 | } |
| 449 | } |
| 450 | |
| 451 | if (nonZipMatch > 0) return nonZipMatch; |
| 452 | |
| 453 | return -(restartPos > 0 ? restartPos : it); |
| 454 | } |
| 455 | |
| 456 | /** |
| 457 | * Return the first matching address in content. |
| 458 | * |
| 459 | * @param content The string to search. |
| 460 | * @return The first valid address, or null if no address was matched. |
| 461 | */ |
| 462 | static String findAddress(String content) { |
| 463 | Matcher houseNumberMatcher = sHouseNumberRe.matcher(content); |
| 464 | int start = 0; |
| 465 | while (houseNumberMatcher.find(start)) { |
| 466 | if (checkHouseNumber(houseNumberMatcher.group(0))) { |
| 467 | start = houseNumberMatcher.start(); |
| 468 | int end = attemptMatch(content, houseNumberMatcher); |
| 469 | if (end > 0) { |
| 470 | return content.substring(start, end); |
| 471 | } |
| 472 | start = -end; |
| 473 | } else { |
| 474 | start = houseNumberMatcher.end(); |
| 475 | } |
| 476 | } |
| 477 | return null; |
| 478 | } |
| 479 | } |