Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/phonenumbers/phonenumbermatcher.py: 20%

311 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1"""Functionality to match phone numbers in a piece of text""" 

2 

3# Based on original Java code: 

4# java/src/com/google/i18n/phonenumbers/PhoneNumberMatch.java 

5# java/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java 

6# Copyright (C) 2011 The Libphonenumber Authors 

7# 

8# Licensed under the Apache License, Version 2.0 (the "License"); 

9# you may not use this file except in compliance with the License. 

10# You may obtain a copy of the License at 

11# 

12# http://www.apache.org/licenses/LICENSE-2.0 

13# 

14# Unless required by applicable law or agreed to in writing, software 

15# distributed under the License is distributed on an "AS IS" BASIS, 

16# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

17# See the License for the specific language governing permissions and 

18# limitations under the License. 

19import re 

20 

21# Extra regexp function; see README 

22from .re_util import fullmatch 

23from .util import UnicodeMixin, u, unicod, prnt 

24from .util import U_EMPTY_STRING, U_DASH, U_SEMICOLON, U_SLASH, U_X_LOWER, U_X_UPPER, U_PERCENT 

25from .unicode_util import Category, Block, is_letter 

26from .phonenumberutil import _MAX_LENGTH_FOR_NSN, _MAX_LENGTH_COUNTRY_CODE 

27from .phonenumberutil import _VALID_PUNCTUATION, _PLUS_CHARS, NON_DIGITS_PATTERN 

28from .phonenumberutil import _EXTN_PATTERNS_FOR_MATCHING, _REGEX_FLAGS 

29from .phonenumberutil import _SECOND_NUMBER_START_PATTERN, _UNWANTED_END_CHAR_PATTERN 

30from .phonenumberutil import MatchType, NumberParseException, PhoneNumberFormat 

31from .phonenumberutil import is_possible_number, is_valid_number, parse 

32from .phonenumberutil import normalize_digits_only, national_significant_number 

33from .phonenumberutil import _format_nsn_using_pattern, ndd_prefix_for_region 

34from .phonenumberutil import format_number, is_number_match, region_code_for_country_code 

35from .phonenumberutil import _maybe_strip_national_prefix_carrier_code 

36from .phonenumberutil import _choose_formatting_pattern_for_number 

37from .phonenumberutil import _formatting_rule_has_first_group_only 

38from .phonenumber import CountryCodeSource 

39from .phonemetadata import PhoneMetadata 

40 

41# Import auto-generated data structures 

42try: 

43 from .data import _ALT_NUMBER_FORMATS 

44except ImportError: # pragma no cover 

45 # Before the generated code exists, the data/ directory is empty. 

46 # The generation process imports this module, creating a circular 

47 # dependency. The hack below works around this. 

48 import os 

49 import sys 

50 if os.path.basename(sys.argv[0]) in ("buildmetadatafromxml.py", "buildprefixdata.py"): 

51 prnt("Failed to import generated data (but OK as during autogeneration)", file=sys.stderr) 

52 _ALT_NUMBER_FORMATS = {} 

53 else: 

54 raise 

55 

56 

def _limit(lower, upper):
    """Build a bounded regular expression quantifier, e.g. ``{0,3}``.

    Arguments:
    lower -- minimum repetition count; must not be negative.
    upper -- maximum repetition count; must be positive and not below lower.
    Returns the quantifier text "{lower,upper}".
    Raises Exception if the bounds are out of range.
    """
    if lower < 0 or upper <= 0 or upper < lower:
        raise Exception("Illegal argument to _limit")
    quantifier_template = unicod("{%d,%d}")
    return quantifier_template % (lower, upper)

62 

63 

# Building blocks for the _MATCHING_BRACKETS and _PATTERN regular expressions.
# They are kept as separate names so the assembled patterns stay readable.
_OPENING_PARENS = u("(\\[\uFF08\uFF3B")
_CLOSING_PARENS = u(")\\]\uFF09\uFF3D")
# Character class matching anything that is not one of the brackets above.
_NON_PARENS = U_EMPTY_STRING.join((u("[^"), _OPENING_PARENS, _CLOSING_PARENS, u("]")))
# Quantifier bounding the number of bracket pairs in a phone number.
_BRACKET_PAIR_LIMIT = _limit(0, 3)

# Pattern used to check that brackets are balanced. Every opening bracket
# should be closed inside the phone number and must enclose something; a
# number without any brackets is accepted as well.
#
# The very first opening bracket is allowed to stay unclosed, and since a
# leading bracket may have been dropped, a closing bracket may also come
# first. Together with the bounded pairs that follow, the sets of brackets
# in a phone number are limited to four.
_MATCHING_BRACKETS = re.compile(U_EMPTY_STRING.join((
    # Optional opening bracket at the start.
    u("(?:["), _OPENING_PARENS, u("])?"),
    # Optional text terminated by a closing bracket.
    u("(?:"), _NON_PARENS, u("+"), u("["), _CLOSING_PARENS, u("])?"),
    # Mandatory bracket-free text.
    _NON_PARENS, u("+"),
    # Bounded number of complete, non-empty bracket pairs.
    u("(?:["), _OPENING_PARENS, u("]"), _NON_PARENS,
    u("+["), _CLOSING_PARENS, u("])"), _BRACKET_PAIR_LIMIT,
    # Optional bracket-free tail.
    _NON_PARENS, u("*"),
)))

86 

# Quantifier for the number of leading (plus) characters.
_LEAD_LIMIT = _limit(0, 2)
# Quantifier for the number of consecutive punctuation characters.
_PUNCTUATION_LIMIT = _limit(0, 4)
# Maximum number of digits in one punctuation-free block. All digits may sit
# in a single block, so this must accommodate the whole national number plus
# the international country code.
_DIGIT_BLOCK_LIMIT = _MAX_LENGTH_FOR_NSN + _MAX_LENGTH_COUNTRY_CODE
# Quantifier for the number of punctuation-separated blocks. It reuses
# _DIGIT_BLOCK_LIMIT because some formats put a space between every digit.
_BLOCK_LIMIT = _limit(0, _DIGIT_BLOCK_LIMIT)

# A run of punctuation (white space included).
_PUNCTUATION = U_EMPTY_STRING.join((u("["), _VALID_PUNCTUATION, u("]"), _PUNCTUATION_LIMIT))
# A run of digits with no punctuation in between.
_DIGIT_SEQUENCE = u("\\d") + _limit(1, _DIGIT_BLOCK_LIMIT)
# Punctuation that may start a phone number: opening brackets and plus signs.
_LEAD_CLASS_CHARS = _OPENING_PARENS + _PLUS_CHARS
_LEAD_CLASS = U_EMPTY_STRING.join((u("["), _LEAD_CLASS_CHARS, u("]")))
_LEAD_PATTERN = re.compile(_LEAD_CLASS)

# Phone number pattern with optional punctuation, as used by _find(). It is
# similar to phonenumberutil._VALID_PHONE_NUMBER except that:
#  - every repetition is bounded, so the matched text has an upper bound;
#  - leading punctuation / plus signs are limited;
#  - consecutive punctuation is limited;
#  - the number of digits is limited;
#  - white space is not allowed at the start or the end;
#  - alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently not
#    supported.
_PATTERN = re.compile(
    U_EMPTY_STRING.join((
        # Optional lead characters, each followed by optional punctuation.
        u("(?:"), _LEAD_CLASS, _PUNCTUATION, u(")"), _LEAD_LIMIT,
        # First block of digits.
        _DIGIT_SEQUENCE,
        # Further punctuation-separated digit blocks.
        u("(?:"), _PUNCTUATION, _DIGIT_SEQUENCE, u(")"), _BLOCK_LIMIT,
        # Optional extension.
        u("(?:"), _EXTN_PATTERNS_FOR_MATCHING, u(")?"),
    )),
    _REGEX_FLAGS)

123 

# Detects text that looks like a publication page range rather than a phone
# number. Example citation: "Computing Complete Answers to Queries in the
# Presence of Limited Access Patterns. Chen Li. VLDB J. 12(3): 211-227
# (2003)." -- the fragment "211-227 (2003)" is not a telephone number.
_PUB_PAGES = re.compile(
    u("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")
)

# Detects dates written with "/" as the separator, for example 3/10/2011,
# 31/10/96 or 08/31/95.
_SLASH_SEPARATED_DATES = re.compile(
    u("(?:(?:[0-3]?\\d/[01]?\\d)|(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")
)

# Detects timestamps such as "2012-01-02 08:00". This expression stops after
# the hour; the trailing ":\d\d" part is matched separately by
# _TIME_STAMPS_SUFFIX.
_TIME_STAMPS = re.compile(
    u("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d +[0-2]\\d$")
)
_TIME_STAMPS_SUFFIX = re.compile(
    u(":[0-5]\\d")
)

139 

# Patterns for pulling individual phone numbers out of a longer piece of
# phone-number-like text, listed from most to least specific. White space
# comes last because it is frequently used inside a number and not only
# between two numbers. The patterns are kept separate so that a candidate is
# never broken up on more than one kind of symbol at a time, although several
# symbols of the same kind (e.g. spaces) may be grouped together.
#
# Whenever one of these matches, the text preceding the first match is
# checked as well.
_INNER_MATCHES = tuple(re.compile(inner_pattern) for inner_pattern in (
    # Split on slashes, e.g. "651-234-2345/332-445-1234".
    u("/+(.*)"),
    # The bracket is inside the capturing group because it is treated as part
    # of the phone number. Matches text like "(650) 223 3345 (754) 223 3321".
    u("(\\([^(]*)"),
    # Split on a hyphen, e.g. "12345 - 332-445-1234 is my number." The hyphen
    # only counts as a separator when it has a space on either side.
    u("(?u)(?:\\s-|-\\s)\\s*(.+)"),
    # Assorted wide hyphens. No surrounding space is required here: they may
    # be meant to separate two numbers without spaces, and they have rarely
    # been seen used within a number.
    u("(?u)[\u2012-\u2015\uFF0D]\\s*(.+)"),
    # Split on a full stop, e.g. "12345. 332-445-1234 is my number."
    u("(?u)\\.+\\s*([^.]+)"),
    # Split on white space, e.g. "3324451234 8002341234".
    u("(?u)\\s+(\\S+)"),
))

170 

171 

class Leniency(object):
    """How strictly candidate phone numbers found in text are checked.

    The values grow with strictness, so levels can be compared numerically.
    """

    # Accept numbers that are possible (is_possible_number(number)); they do
    # not have to be valid (is_valid_number(number)).
    POSSIBLE = 0

    # Accept numbers that are both possible and valid. A number written in
    # national format must include its national prefix when that prefix is
    # usually written for a number of this type.
    VALID = 1

    # Accept valid numbers whose digits are grouped in a way that is possible
    # for the locale. For a US number, "65 02 53 00 00" and "650253 0000" are
    # rejected at this level, while "650 253 0000", "650 2530000" and
    # "6502530000" are accepted. Numbers with more than one '/' symbol in the
    # national significant number are rejected too.
    #
    # Warning: this level might lower coverage, especially for regions
    # outside of country code "+1". If unsure which level to use, email the
    # discussion group libphonenumber-discuss@googlegroups.com.
    STRICT_GROUPING = 2

    # Accept valid numbers that are grouped exactly as they would have been
    # formatted, or written as one single block. For a US number,
    # "650 2530000" is rejected at this level, while "650 253 0000" and
    # "6502530000" are accepted. Numbers with more than one '/' symbol are
    # rejected too.
    #
    # Warning: this level might lower coverage, especially for regions
    # outside of country code "+1". If unsure which level to use, email the
    # discussion group libphonenumber-discuss@googlegroups.com.
    EXACT_GROUPING = 3

207 

208 

def _verify(leniency, numobj, candidate, matcher):
    """Check a parsed number against the requested leniency level.

    Arguments:
    leniency -- one of the Leniency constants.
    numobj -- the parsed phone number object.
    candidate -- the text the number was parsed from.
    matcher -- the PhoneNumberMatcher, used for the grouping checks.
    Returns a true value if the number passes verification at this level.
    Raises Exception for an unrecognised leniency value.
    """
    if leniency == Leniency.POSSIBLE:
        return is_possible_number(numobj)

    if leniency == Leniency.VALID:
        passes_basic_checks = (is_valid_number(numobj) and
                               _contains_only_valid_x_chars(numobj, candidate))
        if not passes_basic_checks:
            return False
        return _is_national_prefix_present_if_required(numobj)

    if leniency == Leniency.STRICT_GROUPING:
        return _verify_strict_grouping(numobj, candidate, matcher)

    if leniency == Leniency.EXACT_GROUPING:
        return _verify_exact_grouping(numobj, candidate, matcher)

    raise Exception("Error: unsupported Leniency value %s" % leniency)

225 

226 

def _verify_strict_grouping(numobj, candidate, matcher):
    """Verify a candidate at the Leniency.STRICT_GROUPING level.

    The number must be valid, use 'x' characters only in valid positions,
    not contain more than one slash in the national number, and carry its
    national prefix where required. If so, the digit grouping is checked
    with _all_number_groups_remain_grouped.
    """
    if not is_valid_number(numobj):
        return False
    if not _contains_only_valid_x_chars(numobj, candidate):
        return False
    if _contains_more_than_one_slash_in_national_number(numobj, candidate):
        return False
    if not _is_national_prefix_present_if_required(numobj):
        return False
    grouping_checker = _all_number_groups_remain_grouped
    return matcher._check_number_grouping_is_valid(numobj, candidate, grouping_checker)

235 

236 

def _all_number_groups_remain_grouped(numobj, normalized_candidate, formatted_number_groups):
    """Check that no expected digit group is split up inside the candidate.

    Arguments:
    numobj -- the original number we found when parsing
    normalized_candidate -- the candidate number, normalized to only contain
              ASCII digits, but with non-digits (spaces etc) retained
    formatted_number_groups -- the groups of digits that we would expect to
              see if we formatted this number
    Returns True if expectations matched.
    """
    cursor = 0
    if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
        # The candidate contained the country code, so start looking after it.
        cc_digits = str(numobj.country_code)
        cursor = normalized_candidate.find(cc_digits) + len(cc_digits)

    candidate_length = len(normalized_candidate)
    for position, group in enumerate(formatted_number_groups):
        # Each group must appear, unbroken, at or after the current cursor.
        group_start = normalized_candidate.find(group, cursor)
        if group_start < 0:
            return False
        cursor = group_start + len(group)

        # The remaining checks only concern the position right after the
        # first group (the NDC), and only when more text follows it.
        if position != 0 or cursor >= candidate_length:
            continue
        # The region is derived from the country code alone rather than from
        # the full number: countries sharing a calling code need not be told
        # apart here, and this is faster.
        region = region_code_for_country_code(numobj.country_code)
        if ndd_prefix_for_region(region, True) is None:
            continue
        if not normalized_candidate[cursor].isdigit():
            continue
        # No formatting symbol follows the NDC. For countries with national
        # prefixes the number is then only accepted when it has no formatting
        # symbol at all, except for extensions.
        nsn = national_significant_number(numobj)
        return normalized_candidate[group_start:].startswith(nsn)

    # Make sure the extension was not mistakenly consumed as the last group
    # of the subscriber number. Extensions cannot contain formatting between
    # their digits.
    extension = numobj.extension or U_EMPTY_STRING
    return extension in normalized_candidate[cursor:]

285 

286 

def _verify_exact_grouping(numobj, candidate, matcher):
    """Verify a candidate at the Leniency.EXACT_GROUPING level.

    The number must be valid, use 'x' characters only in valid positions,
    not contain more than one slash in the national number, and carry its
    national prefix where required. If so, the digit grouping is checked
    with _all_number_groups_are_exactly_present.
    """
    if not is_valid_number(numobj):
        return False
    if not _contains_only_valid_x_chars(numobj, candidate):
        return False
    if _contains_more_than_one_slash_in_national_number(numobj, candidate):
        return False
    if not _is_national_prefix_present_if_required(numobj):
        return False
    grouping_checker = _all_number_groups_are_exactly_present
    return matcher._check_number_grouping_is_valid(numobj, candidate, grouping_checker)

295 

296 

def _all_number_groups_are_exactly_present(numobj, normalized_candidate, formatted_number_groups):
    """Check that the candidate's digit groups equal the expected groups.

    Arguments:
    numobj -- the original number we found when parsing
    normalized_candidate -- the candidate number, normalized to only contain
              ASCII digits, but with non-digits (spaces etc) retained
    formatted_number_groups -- the groups of digits that we would expect to
              see if we formatted this number
    Returns True if expectations matched.
    """
    digit_groups = re.split(NON_DIGITS_PATTERN, normalized_candidate)
    # Point at the last group, stepping over it when the number has an
    # extension.
    trailing_groups_to_skip = 2 if numobj.extension is not None else 1
    cand_idx = len(digit_groups) - trailing_groups_to_skip

    # First see whether the national significant number was written as one
    # block. Containment rather than equality is used because the block may
    # carry a prefix such as a national prefix or the country code itself.
    if len(digit_groups) == 1:
        return True
    if national_significant_number(numobj) in digit_groups[cand_idx]:
        return True

    # Walk both lists backwards, leaving out the first formatted group, and
    # require every pair of groups to be identical.
    fmt_idx = len(formatted_number_groups) - 1
    while fmt_idx > 0 and cand_idx >= 0:
        if digit_groups[cand_idx] != formatted_number_groups[fmt_idx]:
            return False
        fmt_idx -= 1
        cand_idx -= 1

    # Finally the first group: a national prefix may precede it, so the
    # candidate group only has to end with the formatted group.
    if cand_idx < 0:
        return False
    return digit_groups[cand_idx].endswith(formatted_number_groups[0])

335 

336 

def _get_national_number_groups_without_pattern(numobj):
    """Split the national part of a number into its standard digit groups.

    The number is formatted as RFC3966 (without any national prefix) and the
    digit blocks between the country code and any extension are returned as
    a list.
    """
    # The formatted text looks like +CC-DG1-DG2-DGX;ext=EXT, where DG1..DGX
    # are the groups of digits.
    formatted = format_number(numobj, PhoneNumberFormat.RFC3966)

    # Everything from the semicolon onwards is the extension; leave it out.
    semicolon_at = formatted.find(U_SEMICOLON)
    body_end = semicolon_at if semicolon_at >= 0 else len(formatted)

    # The country code is followed by a '-', so the groups start after it.
    body_start = formatted.find(U_DASH) + 1

    national_part = formatted[body_start:body_end]
    return national_part.split(U_DASH)

353 

354 

def _get_national_number_groups(numobj, formatting_pattern):
    """Split the national part of a number into digit groups for a pattern.

    Only the national significant number is formatted (without any national
    prefix), using the formatting pattern passed in; the result is split on
    the dash separator and returned as a list of digit blocks.
    """
    nsn = national_significant_number(numobj)
    formatted_nsn = _format_nsn_using_pattern(nsn, formatting_pattern,
                                              PhoneNumberFormat.RFC3966)
    return formatted_nsn.split(U_DASH)

363 

364 

def _contains_more_than_one_slash_in_national_number(numobj, candidate):
    """Tell whether the national part of candidate has more than one slash.

    A slash directly after the country calling code is permitted and is not
    counted against the national number.
    """
    first_slash = candidate.find(U_SLASH)
    if first_slash < 0:
        # No slash at all is fine.
        return False
    second_slash = candidate.find(U_SLASH, first_slash + 1)
    if second_slash < 0:
        # A single slash is fine too.
        return False

    # With two slashes present, the first one is only tolerated when it
    # directly follows the country calling code.
    source = numobj.country_code_source
    candidate_has_country_code = source in (CountryCodeSource.FROM_NUMBER_WITH_PLUS_SIGN,
                                            CountryCodeSource.FROM_NUMBER_WITHOUT_PLUS_SIGN)
    if not candidate_has_country_code:
        return True
    digits_before_first_slash = normalize_digits_only(candidate[:first_slash])
    if digits_before_first_slash != unicod(numobj.country_code):
        return True
    # The first slash followed the country code; any slash after the second
    # one is then one too many.
    return U_SLASH in candidate[(second_slash + 1):]

385 

386 

def _contains_only_valid_x_chars(numobj, candidate):
    """Check that every 'x' or 'X' in candidate is in a permitted position.

    An 'x'/'X' may be (1) part of a carrier code, which always precedes the
    national significant number, or (2) an extension sign, which always
    precedes the extension number. A carrier code is assumed to be longer
    than one digit, so case (1) needs more than one consecutive 'x'/'X',
    while case (2) has exactly one. An 'x'/'X' in the final position of the
    string is ignored.
    """
    x_chars = (U_X_LOWER, U_X_UPPER)
    last_index = len(candidate) - 1
    pos = 0
    while pos < last_index:
        if candidate[pos] in x_chars:
            if candidate[pos + 1] in x_chars:
                # Carrier code case: the text from the second 'x' onwards
                # must match the national significant number.
                pos += 1
                if is_number_match(numobj, candidate[pos:]) != MatchType.NSN_MATCH:
                    return False
            elif normalize_digits_only(candidate[pos:]) != numobj.extension:
                # Extension sign case: the digits that follow must be the
                # extension.
                return False
        pos += 1
    return True

411 

412 

def _is_national_prefix_present_if_required(numobj):
    """Check that the raw input had a national prefix if one is required.

    Returns True whenever no national prefix is needed for this number;
    otherwise returns whether a national prefix and/or carrier code was found
    at the start of the raw input.
    """
    # A number written in international format never needs the prefix.
    if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
        return True

    region = region_code_for_country_code(numobj.country_code)
    metadata = PhoneMetadata.metadata_for_region(region, None)
    if metadata is None:
        return True

    # Find the rule that would be used to format this number.
    nsn = national_significant_number(numobj)
    format_rule = _choose_formatting_pattern_for_number(metadata.number_format, nsn)
    if format_rule is None:
        return True

    # A prefix is only expected when a national prefix formatting rule exists
    # and it is more than the first-group symbol ($1) with punctuation.
    prefix_rule = format_rule.national_prefix_formatting_rule
    if not prefix_rule:
        return True
    if format_rule.national_prefix_optional_when_formatting:
        # The prefix is optional here, so its presence is not checked.
        return True
    if _formatting_rule_has_first_group_only(prefix_rule):
        # National prefix not needed for this number.
        return True

    # Normalize the raw input and report whether a national prefix and/or
    # carrier code was found at its start.
    raw_digits = normalize_digits_only(numobj.raw_input)
    return _maybe_strip_national_prefix_carrier_code(raw_digits, metadata)[2]

443 

444 

445class PhoneNumberMatcher(object): 

446 """A stateful class that finds and extracts telephone numbers from text. 

447 

448 Vanity numbers (phone numbers using alphabetic digits such as '1-800-SIX-FLAGS' are 

449 not found. 

450 

451 This class is not thread-safe. 

452 """ 

453 # The potential states of a PhoneNumberMatcher. 

454 _NOT_READY = 0 

455 _READY = 1 

456 _DONE = 2 

457 

def __init__(self, text, region,
             leniency=Leniency.VALID, max_tries=65535):
    """Set up a matcher over the given text.

    Arguments:
    text -- The character sequence that we will search, None for no text.
    region -- The region (country) to assume for phone numbers not written
              in international format (with a leading plus, or with the
              international dialing prefix of the specified region). May
              be None or "ZZ" if only numbers with a leading plus should
              be considered.
    leniency -- The leniency to use when evaluating candidate phone
              numbers.
    max_tries -- The maximum number of invalid numbers to try before
              giving up on the text. This is to cover degenerate cases
              where the text has a lot of false positives in it. Must be
              >= 0.
    Raises ValueError if leniency is None or max_tries is negative.
    """
    if leniency is None:
        raise ValueError("Need a leniency value")
    tries = int(max_tries)
    if tries < 0:
        raise ValueError("Need max_tries to be positive int")
    # Text to search; a missing text is treated as empty.
    self.text = U_EMPTY_STRING if text is None else text
    # Region assumed for numbers without an international prefix (may be
    # None).
    self.preferred_region = region
    # Requested degree of validation.
    self.leniency = leniency
    # Remaining retries after matching an invalid number.
    self._max_tries = tries
    # Iteration tristate.
    self._state = PhoneNumberMatcher._NOT_READY
    # Last successful match; None unless in state _READY.
    self._last_match = None
    # Index at which the next search starts; undefined in state _DONE.
    self._search_index = 0

496 

def _find(self, index):
    """Look for the next phone number in the text at or after index.

    Arguments:
    index -- The search index to start searching at.
    Returns the phone number match found, None if none can be found.
    """
    regex_hit = _PATTERN.search(self.text, index)
    while self._max_tries > 0 and regex_hit is not None:
        begin = regex_hit.start()
        raw_candidate = self.text[begin:regex_hit.end()]

        # Cut off any extra number at the end.
        # TODO: This is the place to start when trying to support
        # extraction of multiple phone number from split notations (+41 79
        # 123 45 67 / 68).
        candidate = self._trim_after_first_match(_SECOND_NUMBER_START_PATTERN,
                                                 raw_candidate)

        phone_match = self._extract_match(candidate, begin)
        if phone_match is not None:
            return phone_match

        # Nothing usable here: spend one try and continue after the
        # candidate.
        self._max_tries -= 1
        regex_hit = _PATTERN.search(self.text, begin + len(candidate))
    return None

525 

526 def _trim_after_first_match(self, pattern, candidate): 

527 """Trims away any characters after the first match of pattern in 

528 candidate, returning the trimmed version.""" 

529 trailing_chars_match = pattern.search(candidate) 

530 if trailing_chars_match: 

531 candidate = candidate[:trailing_chars_match.start()] 

532 return candidate 

533 

@classmethod
def _is_latin_letter(cls, letter):
    """Tell whether a character is a Latin-script letter.

    Combining marks also count, on the assumption that they were added to a
    preceding Latin character.
    """
    if not is_letter(letter):
        # Combining marks are a subset of non-spacing-mark.
        if Category.get(letter) != Category.NON_SPACING_MARK:
            return False
    block = Block.get(letter)
    latin_blocks = (Block.BASIC_LATIN,
                    Block.LATIN_1_SUPPLEMENT,
                    Block.LATIN_EXTENDED_A,
                    Block.LATIN_EXTENDED_ADDITIONAL,
                    Block.LATIN_EXTENDED_B,
                    Block.COMBINING_DIACRITICAL_MARKS)
    return any(block == latin_block for latin_block in latin_blocks)

550 

@classmethod
def _is_invalid_punctuation_symbol(cls, character):
    """Tell whether character is a percent sign or a currency symbol."""
    if character == U_PERCENT:
        return True
    return Category.get(character) == Category.CURRENCY_SYMBOL

555 

def _extract_match(self, candidate, offset):
    """Try to turn a candidate string into a phone number match.

    Arguments:
    candidate -- The candidate text that might contain a phone number.
    offset -- The offset of candidate within self.text
    Returns the match found, None if none can be found
    """
    # Text that looks like a slash-separated date is not a phone number.
    if _SLASH_SEPARATED_DATES.search(candidate) is not None:
        return None

    # Neither is a timestamp: a candidate ending like one is skipped when
    # the text right after it continues with the ":\d\d" suffix.
    if _TIME_STAMPS.search(candidate) is not None:
        text_after_candidate = self.text[offset + len(candidate):]
        if _TIME_STAMPS_SUFFIX.match(text_after_candidate) is not None:
            return None

    # First try the candidate as a whole.
    whole_match = self._parse_and_verify(candidate, offset)
    if whole_match is None:
        # Otherwise a phone number may still be hiding inside the candidate.
        return self._extract_inner_match(candidate, offset)
    return whole_match

583 

def _extract_inner_match(self, candidate, offset):
    """Look for a phone number inside a candidate that failed as a whole.

    Arguments:
    candidate -- The candidate text that might contain a phone number
    offset -- The current offset of candidate within text
    Returns the match found, None if none can be found
    """
    for splitter in _INNER_MATCHES:
        hit = splitter.search(candidate)
        prefix_pending = True
        while hit and self._max_tries > 0:
            if prefix_pending:
                prefix_pending = False
                # The text in front of the first separator is a group of
                # its own and has to be tried as well.
                leading_text = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
                                                            candidate[:hit.start()])
                found = self._parse_and_verify(leading_text, offset)
                if found is not None:
                    return found
                self._max_tries -= 1
            # Now try the group captured after the separator.
            captured_text = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
                                                         hit.group(1))
            found = self._parse_and_verify(captured_text, offset + hit.start(1))
            if found is not None:
                return found
            self._max_tries -= 1
            # Resume the search one character past the start of this hit.
            hit = splitter.search(candidate, hit.start() + 1)
    return None

614 

def _parse_and_verify(self, candidate, offset):
    """Parse candidate and verify it at the configured leniency.

    The candidate is parsed with phonenumberutil.parse and checked against
    self.leniency. On success a PhoneNumberMatch is returned; otherwise
    (including when parsing raises NumberParseException) the result is None.

    Arguments:
    candidate -- The candidate match.
    offset -- The offset of candidate within self.text.
    Returns the parsed and validated phone number match, or None.
    """
    try:
        # Reject formatting which indicates that this really isn't a phone
        # number: unbalanced brackets or a publication page reference.
        if not fullmatch(_MATCHING_BRACKETS, candidate):
            return None
        if _PUB_PAGES.search(candidate):
            return None

        # At leniency VALID or stricter, also skip numbers that are
        # surrounded by Latin alphabetic characters, such as abc8005001234
        # or 8005001234def.
        if self.leniency >= Leniency.VALID:
            neighbours = []
            # The preceding character matters unless the candidate is at
            # the start of the text or begins with phone-number punctuation.
            if offset > 0 and not _LEAD_PATTERN.match(candidate):
                neighbours.append(self.text[offset - 1])
            end_index = offset + len(candidate)
            if end_index < len(self.text):
                neighbours.append(self.text[end_index])
            for neighbour in neighbours:
                # A latin letter or an invalid punctuation symbol next to
                # the candidate rules it out.
                if (self._is_invalid_punctuation_symbol(neighbour) or
                        self._is_latin_letter(neighbour)):
                    return None

        numobj = parse(candidate, self.preferred_region, keep_raw_input=True)
        if not _verify(self.leniency, numobj, candidate, self):
            return None
        # The number was created with parse(keep_raw_input=True), but for
        # now the extra values parsed are not returned.
        # TODO: stop clearing all values here and switch all users
        # over to using raw_input rather than the raw_string of
        # PhoneNumberMatch.
        numobj.country_code_source = CountryCodeSource.UNSPECIFIED
        numobj.raw_input = None
        numobj.preferred_domestic_carrier_code = None
        return PhoneNumberMatch(offset, candidate, numobj)
    except NumberParseException:
        # An unparseable candidate simply yields no match.
        return None

668 

def _check_number_grouping_is_valid(self, numobj, candidate, checker):
    """Report whether checker accepts the digit grouping of candidate.

    The checker is first tried against the groups returned by
    _get_national_number_groups_without_pattern. If that fails, each
    applicable alternate format registered for the number's country code
    is tried in turn.

    Arguments:
    numobj -- The parsed phone number.
    candidate -- The candidate text the number was parsed from.
    checker -- Callable invoked as checker(numobj, normalized_candidate,
              formatted_number_groups); a truthy result means the grouping
              is acceptable.

    Returns True as soon as one grouping is accepted, otherwise False.
    """
    normalized = normalize_digits_only(candidate, True)  # keep non-digits
    default_groups = _get_national_number_groups_without_pattern(numobj)
    if checker(numobj, normalized, default_groups):
        return True

    # The default grouping was rejected; fall back on alternate formats.
    alt_formats = _ALT_NUMBER_FORMATS.get(numobj.country_code, None)
    nsn = national_significant_number(numobj)
    if alt_formats is None:
        return False

    for alt_format in alt_formats:
        leading_patterns = alt_format.leading_digits_pattern
        # There is only one leading digits pattern for alternate formats;
        # skip this format when the pattern does not match the NSN.
        if len(leading_patterns) > 0 and not re.match(leading_patterns[0], nsn):
            continue
        alt_groups = _get_national_number_groups(numobj, alt_format)
        if checker(numobj, normalized, alt_groups):
            return True
    return False

689 

def has_next(self):
    """Indicates whether there is another match available.

    When the matcher is in the _NOT_READY state this searches for the next
    match starting at self._search_index, stores it in self._last_match
    and moves to _READY (advancing the search index past the match) or to
    _DONE when nothing was found.
    """
    if self._state == PhoneNumberMatcher._NOT_READY:
        found = self._find(self._search_index)
        self._last_match = found
        if found is not None:
            # Resume the next search right after this match.
            self._search_index = found.end
            self._state = PhoneNumberMatcher._READY
        else:
            self._state = PhoneNumberMatcher._DONE
    return self._state == PhoneNumberMatcher._READY

700 

def next(self):
    """Return the next match; raises StopIteration if none is available."""
    # has_next() locates the next match as a side effect when necessary.
    if not self.has_next():
        raise StopIteration("No next match")
    # Hand the match over and drop our own reference, so the memory is not
    # retained any longer than necessary.
    match, self._last_match = self._last_match, None
    self._state = PhoneNumberMatcher._NOT_READY
    return match

711 

def __iter__(self):
    """Yield each remaining match in turn until has_next() reports none."""
    while True:
        if not self.has_next():
            return
        yield self.next()

715 

716 

class PhoneNumberMatch(UnicodeMixin):
    """The match of a phone number within a piece of text.

    Matches may be found by iterating over a PhoneNumberMatcher, or by
    calling its has_next() and next() methods.

    A match consists of the phone number (in .number) as well as the .start
    and .end offsets of the corresponding subsequence of the searched
    text. Use .raw_string to obtain a copy of the matched subsequence.

    The following annotated example clarifies the relationship between the
    searched text, the match offsets, and the parsed number:

    >>> text = "Call me at +1 425 882-8080 for details."
    >>> country = "US"
    >>> import phonenumbers
    >>> matcher = phonenumbers.PhoneNumberMatcher(text, country)
    >>> matcher.has_next()
    True
    >>> m = matcher.next()  # Find the first phone number match
    >>> m.raw_string  # contains the phone number as it appears in the text.
    '+1 425 882-8080'
    >>> (m.start, m.end)  # define the range of the matched subsequence.
    (11, 26)
    >>> text[m.start:m.end]
    '+1 425 882-8080'
    >>> phonenumbers.parse("+1 425 882-8080", "US") == m.number
    True
    """
    def __init__(self, start, raw_string, numobj):
        """Create a match.

        Arguments:
        start -- The index into the searched text where the match begins;
                  must be >= 0.
        raw_string -- The raw substring that was matched; must not be None.
        numobj -- The matched phone number; must not be None.

        Raises ValueError (a subclass of Exception, so callers catching
        Exception are unaffected) if an argument is invalid.
        """
        if start < 0:
            raise ValueError("Start index not >= 0")
        if raw_string is None or numobj is None:
            raise ValueError("Invalid argument")
        # The start index into the text.
        self.start = start
        # The raw substring matched.
        self.raw_string = raw_string
        # The exclusive end index, derived from the start and the length
        # of the matched substring.
        self.end = self.start + len(raw_string)
        # The matched phone number.
        self.number = numobj

    def __eq__(self, other):
        """Matches are equal when offsets, raw string and number all agree."""
        if not isinstance(other, PhoneNumberMatch):
            return False
        return (self.start == other.start and
                self.raw_string == other.raw_string and
                self.end == other.end and
                self.number == other.number)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return (unicod("PhoneNumberMatch(start=%r, raw_string=%r, numobj=%r)") %
                (self.start,
                 self.raw_string,
                 self.number))

    def __unicode__(self):
        # Half-open interval notation: start is inclusive, end is exclusive.
        return unicod("PhoneNumberMatch [%s,%s) %s") % (self.start, self.end, self.raw_string)