Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/phonenumbers/phonenumbermatcher.py: 20%

1"""Functionality to match phone numbers in a piece of text"""

3# Based on original Java code:

4# java/src/com/google/i18n/phonenumbers/PhoneNumberMatch.java

5# java/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java

8# Licensed under the Apache License, Version 2.0 (the "License");

9# you may not use this file except in compliance with the License.

10# You may obtain a copy of the License at

11#

12# http://www.apache.org/licenses/LICENSE-2.0

13#

14# Unless required by applicable law or agreed to in writing, software

15# distributed under the License is distributed on an "AS IS" BASIS,

16# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

17# See the License for the specific language governing permissions and

18# limitations under the License.

19import re

21# Extra regexp function; see README

22from .re_util import fullmatch

23from .util import UnicodeMixin, u, unicod, prnt

24from .util import U_EMPTY_STRING, U_DASH, U_SEMICOLON, U_SLASH, U_X_LOWER, U_X_UPPER, U_PERCENT

25from .unicode_util import Category, Block, is_letter

26from .phonenumberutil import _MAX_LENGTH_FOR_NSN, _MAX_LENGTH_COUNTRY_CODE

27from .phonenumberutil import _VALID_PUNCTUATION, _PLUS_CHARS, NON_DIGITS_PATTERN

28from .phonenumberutil import _EXTN_PATTERNS_FOR_MATCHING, _REGEX_FLAGS

29from .phonenumberutil import _SECOND_NUMBER_START_PATTERN, _UNWANTED_END_CHAR_PATTERN

30from .phonenumberutil import MatchType, NumberParseException, PhoneNumberFormat

31from .phonenumberutil import is_possible_number, is_valid_number, parse

32from .phonenumberutil import normalize_digits_only, national_significant_number

33from .phonenumberutil import _format_nsn_using_pattern, ndd_prefix_for_region

34from .phonenumberutil import format_number, is_number_match, region_code_for_country_code

35from .phonenumberutil import _maybe_strip_national_prefix_carrier_code

36from .phonenumberutil import _choose_formatting_pattern_for_number

37from .phonenumberutil import _formatting_rule_has_first_group_only

38from .phonenumber import CountryCodeSource

39from .phonemetadata import PhoneMetadata

41# Import auto-generated data structures

try:
    from .data import _ALT_NUMBER_FORMATS
except ImportError:  # pragma no cover
    # Before the generated code exists, the data/ directory is empty.
    # The generation process imports this module, creating a circular
    # dependency. The hack below works around this: when the running script
    # is one of the two generator scripts, fall back to an empty table of
    # alternate formats; in any other situation the ImportError is re-raised.
    import os
    import sys
    if os.path.basename(sys.argv[0]) in ("buildmetadatafromxml.py", "buildprefixdata.py"):
        prnt("Failed to import generated data (but OK as during autogeneration)", file=sys.stderr)
        _ALT_NUMBER_FORMATS = {}
    else:
        raise

def _limit(lower, upper):
    """Returns a regular expression quantifier with an upper and lower limit.

    Arguments:
    lower -- The minimum number of repetitions; must be >= 0.
    upper -- The maximum number of repetitions; must be > 0 and >= lower.

    Returns the quantifier as a unicode string of the form "{lower,upper}".
    Raises ValueError if the bounds are out of range. ValueError is a subclass
    of Exception, so callers that caught the previous bare Exception still
    work.
    """
    if lower < 0 or upper <= 0 or upper < lower:
        raise ValueError("Illegal argument to _limit")
    return unicod("{%d,%d}") % (lower, upper)

# Build the MATCHING_BRACKETS and PATTERN regular expression patterns. The
# building blocks below exist to make the patterns more easily understood.
#
# The bracket sets are written as the *contents* of a regex character class:
# ASCII "(" and "[" plus their fullwidth forms (U+FF08, U+FF3B), and likewise
# for the closing characters (U+FF09, U+FF3D).
_OPENING_PARENS = u("(\\[\uFF08\uFF3B")
_CLOSING_PARENS = u(")\\]\uFF09\uFF3D")
# Character class matching any single character that is not a bracket.
_NON_PARENS = u("[^") + _OPENING_PARENS + _CLOSING_PARENS + u("]")
# Limit on the number of pairs of brackets in a phone number.
_BRACKET_PAIR_LIMIT = _limit(0, 3)

# Pattern to check that brackets match. Opening brackets should be closed
# within a phone number. This also checks that there is something inside the
# brackets. Having no brackets at all is also fine.
#
# An opening bracket at the beginning may not be closed, but subsequent ones
# should be. It's also possible that the leading bracket was dropped, so we
# shouldn't be surprised if we see a closing bracket first. We limit the sets
# of brackets in a phone number to four: the optional leading group below,
# plus up to three further bracketed groups (_BRACKET_PAIR_LIMIT).
_MATCHING_BRACKETS = re.compile(u("(?:[") + _OPENING_PARENS + u("])?") + u("(?:") + _NON_PARENS + u("+") +
                                u("[") + _CLOSING_PARENS + u("])?") +
                                _NON_PARENS + u("+") +
                                u("(?:[") + _OPENING_PARENS + u("]") + _NON_PARENS +
                                u("+[") + _CLOSING_PARENS + u("])") + _BRACKET_PAIR_LIMIT +
                                _NON_PARENS + u("*"))

# Limit on the number of leading (plus) characters. In _PATTERN this
# quantifier is applied to a group consisting of one lead character followed
# by an optional punctuation run.
_LEAD_LIMIT = _limit(0, 2)
# Limit on the number of consecutive punctuation characters.
_PUNCTUATION_LIMIT = _limit(0, 4)
# The maximum number of digits allowed in a digit-separated block. As we allow
# all digits in a single block, set high enough to accommodate the entire
# national number and the international country code.
_DIGIT_BLOCK_LIMIT = (_MAX_LENGTH_FOR_NSN + _MAX_LENGTH_COUNTRY_CODE)
# Limit on the number of blocks separated by punctuation. Use _DIGIT_BLOCK_LIMIT
# since some formats use spaces to separate each digit.
_BLOCK_LIMIT = _limit(0, _DIGIT_BLOCK_LIMIT)

# A punctuation sequence allowing white space.
_PUNCTUATION = u("[") + _VALID_PUNCTUATION + u("]") + _PUNCTUATION_LIMIT
# A digits block without punctuation.
_DIGIT_SEQUENCE = u("\\d") + _limit(1, _DIGIT_BLOCK_LIMIT)
# Punctuation that may be at the start of a phone number - brackets and plus signs.
_LEAD_CLASS_CHARS = _OPENING_PARENS + _PLUS_CHARS
_LEAD_CLASS = u("[") + _LEAD_CLASS_CHARS + u("]")
# Compiled single-character form of the lead class; used to test whether a
# candidate starts with phone-number punctuation.
_LEAD_PATTERN = re.compile(_LEAD_CLASS)

# Phone number pattern allowing optional punctuation.
# This is the phone number pattern used by _find(), similar to
# phonenumberutil._VALID_PHONE_NUMBER, but with the following differences:
#   - All captures are limited in order to place an upper bound to the text
#     matched by the pattern.
#   - Leading punctuation / plus signs are limited.
#   - Consecutive occurrences of punctuation are limited.
#   - Number of digits is limited.
#   - No whitespace is allowed at the start or end.
#   - No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently
#     supported.
#
# Layout: optional lead group(s), one mandatory digit block, then zero or more
# (punctuation + digit block) repetitions, then an optional extension.
_PATTERN = re.compile(u("(?:") + _LEAD_CLASS + _PUNCTUATION + u(")") + _LEAD_LIMIT +
                      _DIGIT_SEQUENCE + u("(?:") + _PUNCTUATION + _DIGIT_SEQUENCE + u(")") + _BLOCK_LIMIT +
                      u("(?:") + _EXTN_PATTERNS_FOR_MATCHING + u(")?"),
                      _REGEX_FLAGS)

123

# Matches strings that look like publication pages. Example: "Computing
# Complete Answers to Queries in the Presence of Limited Access Patterns.
# Chen Li. VLDB J. 12(3): 211-227 (2003)."
#
# The string "211-227 (2003)" is not a telephone number.
_PUB_PAGES = re.compile(u("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}"))

# Matches strings that look like dates using "/" as a separator. Examples:
# 3/10/2011, 31/10/96 or 08/31/95. Both day/month and month/day orders are
# accepted, followed by a two- or four-digit year.
_SLASH_SEPARATED_DATES = re.compile(u("(?:(?:[0-3]?\\d/[01]?\\d)|(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}"))

# Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
# not include the trailing ":\d\d" -- that is covered by TIME_STAMPS_SUFFIX.
# The pattern is anchored at the end ("$"), so the hour must be the last thing
# in the text being searched.
_TIME_STAMPS = re.compile(u("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d +[0-2]\\d$"))
# The ":MM" part of a timestamp.
_TIME_STAMPS_SUFFIX = re.compile(u(":[0-5]\\d"))

139

# Patterns used to extract phone numbers from a larger phone-number-like
# pattern. These are ordered according to specificity. For example,
# white-space is last since that is frequently used in numbers, not just to
# separate two numbers. We have separate patterns since we don't want to break
# up the phone-number-like text on more than one different kind of symbol at
# one time, although symbols of the same type (e.g. space) can be safely
# grouped together.
#
# Note that if there is a match, we will always check any text found up to the
# first match as well.
#
# Every pattern has exactly one capturing group: the text that is tried as a
# phone number in its own right.
_INNER_MATCHES = (
    # Breaks on the slash - e.g. "651-234-2345/332-445-1234"
    re.compile(u("/+(.*)")),
    # Note that the bracket here is inside the capturing group, since we
    # consider it part of the phone number. Will match a pattern like "(650)
    # 223 3345 (754) 223 3321".
    re.compile(u("(\\([^(]*)")),
    # Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number." We
    # require a space on either side of the hyphen for it to be considered a
    # separator.
    re.compile(u("(?u)(?:\\s-|-\\s)\\s*(.+)")),
    # Various types of wide hyphens. Note we have decided not to enforce a
    # space here, since it's possible that it's supposed to be used to break
    # two numbers without spaces, and we haven't seen many instances of it
    # used within a number.
    re.compile(u("(?u)[\u2012-\u2015\uFF0D]\\s*(.+)")),
    # Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
    re.compile(u("(?u)\\.+\\s*([^.]+)")),
    # Breaks on space - e.g. "3324451234 8002341234"
    re.compile(u("(?u)\\s+(\\S+)")))

170

171

class Leniency(object):
    """Strictness levels applied when looking for phone numbers in text.

    The values are plain integers and grow with strictness, so levels can be
    compared with the usual ordering operators."""

    # Accept anything that is a possible number (is_possible_number(number)),
    # even when it is not a valid one (is_valid_number(number)).
    POSSIBLE = 0

    # Accept numbers that are both possible and valid. A number written in
    # national format must carry its national prefix whenever that prefix is
    # normally written for numbers of its type.
    VALID = 1

    # Accept valid numbers whose digits are grouped in a way that is possible
    # for the locale. For a US number, "65 02 53 00 00" and "650253 0000" are
    # rejected here, while "650 253 0000", "650 2530000" and "6502530000" are
    # accepted. Numbers containing more than one '/' symbol in the national
    # significant number are rejected as well.
    #
    # Warning: coverage may drop at this level, particularly for regions
    # outside country code "+1". When unsure which level to pick, ask on the
    # discussion group libphonenumber-discuss@googlegroups.com.
    STRICT_GROUPING = 2

    # Accept valid numbers that are grouped exactly as we would have formatted
    # them, or that are written as one single block. For a US number,
    # "650 2530000" is rejected here, while "650 253 0000" and "6502530000"
    # are accepted. Numbers with more than one '/' symbol are rejected as
    # well.
    #
    # Warning: coverage may drop at this level, particularly for regions
    # outside country code "+1". When unsure which level to pick, ask on the
    # discussion group libphonenumber-discuss@googlegroups.com.
    EXACT_GROUPING = 3

207

208

def _verify(leniency, numobj, candidate, matcher):
    """Checks a parsed number against the requested leniency level.

    Arguments:
    leniency -- One of the Leniency constants.
    numobj -- The parsed phone number object.
    candidate -- The text the number was parsed from.
    matcher -- The PhoneNumberMatcher, handed on to the grouping checks.

    Returns True if the number is acceptable at that level. Raises Exception
    for a leniency value that is not one of the known levels."""
    if leniency == Leniency.POSSIBLE:
        return is_possible_number(numobj)

    if leniency == Leniency.VALID:
        passes_basic_checks = (is_valid_number(numobj) and
                               _contains_only_valid_x_chars(numobj, candidate))
        if not passes_basic_checks:
            return False
        return _is_national_prefix_present_if_required(numobj)

    if leniency == Leniency.STRICT_GROUPING:
        return _verify_strict_grouping(numobj, candidate, matcher)

    if leniency == Leniency.EXACT_GROUPING:
        return _verify_exact_grouping(numobj, candidate, matcher)

    raise Exception("Error: unsupported Leniency value %s" % leniency)

225

226

def _verify_strict_grouping(numobj, candidate, matcher):
    """Verification used for Leniency.STRICT_GROUPING.

    The number must be valid, use 'x' characters only in permitted ways,
    contain at most one slash in the national number, and carry its national
    prefix when required. If all of that holds, the result is whatever the
    matcher's grouping check reports when driven by
    _all_number_groups_remain_grouped."""
    if not is_valid_number(numobj):
        return False
    if not _contains_only_valid_x_chars(numobj, candidate):
        return False
    if _contains_more_than_one_slash_in_national_number(numobj, candidate):
        return False
    if not _is_national_prefix_present_if_required(numobj):
        return False
    return matcher._check_number_grouping_is_valid(
        numobj, candidate, _all_number_groups_remain_grouped)

235

236

def _all_number_groups_remain_grouped(numobj, normalized_candidate, formatted_number_groups):
    """Returns True if no expected digit group is split apart in the candidate.

    Arguments:
    numobj -- the original number we found when parsing
    normalized_candidate -- the candidate number, normalized to only contain ASCII digits,
              but with non-digits (spaces etc) retained
    formatted_number_groups -- the groups of digits that we would expect to see if we
              formatted this number
    Returns True if expectations matched.
    """
    cursor = 0
    if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
        # The candidate was written with its country code: start looking
        # just past it.
        cc_text = str(numobj.country_code)
        cursor = normalized_candidate.find(cc_text) + len(cc_text)

    candidate_length = len(normalized_candidate)
    for position, digit_group in enumerate(formatted_number_groups):
        # Every expected group has to show up, unbroken, at or after the
        # point reached so far.
        cursor = normalized_candidate.find(digit_group, cursor)
        if cursor < 0:
            return False
        cursor += len(digit_group)

        if position != 0 or cursor >= candidate_length:
            continue
        # We are right after the first group (the NDC). The region is derived
        # from the country code alone rather than from the full number: we do
        # not need to tell apart countries sharing a calling code, and this
        # is faster.
        region = region_code_for_country_code(numobj.country_code)
        if (ndd_prefix_for_region(region, True) is not None and
                normalized_candidate[cursor].isdigit()):
            # A digit directly after the NDC means there is no formatting
            # symbol there. For regions with a national prefix the number is
            # then accepted only if the whole national significant number
            # follows without any formatting symbol (extensions aside).
            nsn = national_significant_number(numobj)
            group_start = cursor - len(digit_group)
            return normalized_candidate[group_start:].startswith(nsn)

    # Guard against the extension digits having been consumed as the last
    # subscriber group: the extension, which cannot contain formatting between
    # its digits, must still be present in what is left.
    remainder = normalized_candidate[cursor:]
    return (numobj.extension or U_EMPTY_STRING) in remainder

285

286

def _verify_exact_grouping(numobj, candidate, matcher):
    """Verification used for Leniency.EXACT_GROUPING.

    The number must be valid, use 'x' characters only in permitted ways,
    contain at most one slash in the national number, and carry its national
    prefix when required. If all of that holds, the result is whatever the
    matcher's grouping check reports when driven by
    _all_number_groups_are_exactly_present."""
    if not is_valid_number(numobj):
        return False
    if not _contains_only_valid_x_chars(numobj, candidate):
        return False
    if _contains_more_than_one_slash_in_national_number(numobj, candidate):
        return False
    if not _is_national_prefix_present_if_required(numobj):
        return False
    return matcher._check_number_grouping_is_valid(
        numobj, candidate, _all_number_groups_are_exactly_present)

295

296

def _all_number_groups_are_exactly_present(numobj, normalized_candidate, formatted_number_groups):
    """Returns True if the candidate's digit groups equal the expected groups.

    Arguments:
    numobj -- the original number we found when parsing
    normalized_candidate -- the candidate number, normalized to only contain ASCII digits,
              but with non-digits (spaces etc) retained
    formatted_number_groups -- the groups of digits that we would expect to see if we
              formatted this number
    Returns True if expectations matched.
    """
    candidate_groups = re.split(NON_DIGITS_PATTERN, normalized_candidate)
    # Point at the last group of the number itself; when the number has an
    # extension, the final group is skipped.
    trailing_groups_to_skip = 2 if numobj.extension is not None else 1
    cand_idx = len(candidate_groups) - trailing_groups_to_skip

    # First see whether the national significant number was written as one
    # block. Containment rather than equality is used, because the block may
    # also carry a national prefix or the country code.
    if len(candidate_groups) == 1:
        return True
    if national_significant_number(numobj) in candidate_groups[cand_idx]:
        return True

    # Walk both lists backwards, leaving out the first formatted group, and
    # require each pair of groups to be identical.
    fmt_idx = len(formatted_number_groups) - 1
    while fmt_idx > 0 and cand_idx >= 0:
        if candidate_groups[cand_idx] != formatted_number_groups[fmt_idx]:
            return False
        fmt_idx -= 1
        cand_idx -= 1

    # The first group may be preceded by a national prefix, so the candidate
    # group only needs to end with the first formatted group.
    return (cand_idx >= 0 and
            candidate_groups[cand_idx].endswith(formatted_number_groups[0]))

335

336

def _get_national_number_groups_without_pattern(numobj):
    """Splits the national part of a number into its standard digit groups.

    The number is formatted as RFC3966 (no national prefix), which has the
    shape +CC-DG1-DG2-DGX;ext=EXT where DG1..DGX are groups of digits. The
    text after the first dash and before the extension marker is returned as
    a list of those digit groups."""
    formatted = format_number(numobj, PhoneNumberFormat.RFC3966)

    # Everything from the first semicolon on is the extension part; it is
    # left out of the split.
    semicolon_at = formatted.find(U_SEMICOLON)
    stop = semicolon_at if semicolon_at >= 0 else len(formatted)

    # The country code is followed by a '-', so the groups start right after
    # the first dash.
    start = formatted.find(U_DASH) + 1
    return formatted[start:stop].split(U_DASH)

353

354

def _get_national_number_groups(numobj, formatting_pattern):
    """Splits the national part of a number into digit groups according to
    the formatting pattern passed in.

    Only the national significant number is formatted (RFC3966 style, no
    national prefix); the result is split on the dash separator and returned
    as a list."""
    formatted_nsn = _format_nsn_using_pattern(national_significant_number(numobj),
                                              formatting_pattern,
                                              PhoneNumberFormat.RFC3966)
    return formatted_nsn.split(U_DASH)

363

364

def _contains_more_than_one_slash_in_national_number(numobj, candidate):
    """Returns True if candidate has more slashes than are permitted.

    Zero or one slash is always fine. Two slashes are tolerated only when the
    candidate was written with its country code and the first slash comes
    directly after that country code; any further slash is then illegal."""
    first_slash = candidate.find(U_SLASH)
    if first_slash < 0:
        # No slashes, this is okay.
        return False

    second_slash = candidate.find(U_SLASH, first_slash + 1)
    if second_slash < 0:
        # Only one slash, this is okay.
        return False

    written_with_country_code = numobj.country_code_source in (
        CountryCodeSource.FROM_NUMBER_WITH_PLUS_SIGN,
        CountryCodeSource.FROM_NUMBER_WITHOUT_PLUS_SIGN)
    if not written_with_country_code:
        return True
    if normalize_digits_only(candidate[:first_slash]) != unicod(numobj.country_code):
        return True

    # The first slash merely separates the country calling code, which is
    # permitted; what remains is to make sure no third slash follows.
    return U_SLASH in candidate[(second_slash + 1):]

385

386

def _contains_only_valid_x_chars(numobj, candidate):
    """Returns True if every 'x'/'X' in candidate is used in a permitted way.

    An 'x' or 'X' can be (1) part of a carrier code, in which case it always
    precedes the national significant number, or (2) an extension sign, in
    which case it always precedes the extension number. A carrier code is
    assumed to be longer than one digit, so case (1) shows up as at least two
    consecutive 'x'/'X' characters and case (2) as exactly one. An 'x' in the
    very last position of the string is ignored."""
    pos = 0
    last_index = len(candidate) - 1
    while pos < last_index:
        if candidate[pos] not in (U_X_LOWER, U_X_UPPER):
            pos += 1
            continue

        if candidate[pos + 1] in (U_X_LOWER, U_X_UPPER):
            # Carrier code case: the 'X's must be followed by something that
            # matches the national significant number.
            pos += 1
            if is_number_match(numobj, candidate[pos:]) != MatchType.NSN_MATCH:
                return False
        elif normalize_digits_only(candidate[pos:]) != numobj.extension:
            # Extension sign case: the digits after the 'x' must be exactly
            # the extension.
            return False
        pos += 1
    return True

411

412

def _is_national_prefix_present_if_required(numobj):
    """Returns False only when a required national prefix was left out.

    Numbers whose country code did not come from the default country (i.e.
    written in international format) never need a national prefix. For the
    others, the formatting rule chosen for the number decides whether a prefix
    is needed; if it is, the raw input is checked for one."""
    if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
        return True

    region = region_code_for_country_code(numobj.country_code)
    metadata = PhoneMetadata.metadata_for_region(region, None)
    if metadata is None:
        return True

    # Find the rule that would be used to format this number.
    nsn = national_significant_number(numobj)
    rule = _choose_formatting_pattern_for_number(metadata.number_format, nsn)
    if rule is None or not rule.national_prefix_formatting_rule:
        # No national prefix formatting rule at all: nothing to require.
        return True
    if rule.national_prefix_optional_when_formatting:
        # The prefix is optional for this format, so its presence is not
        # checked.
        return True
    if _formatting_rule_has_first_group_only(rule.national_prefix_formatting_rule):
        # The rule is just the first-group symbol ($1) with punctuation, so no
        # national prefix is needed for this number.
        return True

    # A prefix is required: report whether a national prefix and/or carrier
    # code could be stripped from the start of the normalized raw input.
    normalized_raw_input = normalize_digits_only(numobj.raw_input)
    return _maybe_strip_national_prefix_carrier_code(normalized_raw_input, metadata)[2]

443

444

class PhoneNumberMatcher(object):
    """A stateful class that finds and extracts telephone numbers from text.

    Vanity numbers (phone numbers using alphabetic digits such as
    '1-800-SIX-FLAGS') are not found.

    This class is not thread-safe.
    """
    # The potential states of a PhoneNumberMatcher.
    # _NOT_READY: no match is buffered; has_next() has to search for one.
    _NOT_READY = 0
    # _READY: a match is buffered in _last_match, waiting to be handed out.
    _READY = 1
    # _DONE: the last search found nothing; has_next() keeps returning False.
    _DONE = 2

457

458 def __init__(self, text, region,

459 leniency=Leniency.VALID, max_tries=65535):

460 """Creates a new instance.

461

462 Arguments:

463 text -- The character sequence that we will search, None for no text.

464 country -- The country to assume for phone numbers not written in

465 international format (with a leading plus, or with the

466 international dialing prefix of the specified region). May be

467 None or "ZZ" if only numbers with a leading plus should be

468 considered.

469 leniency -- The leniency to use when evaluating candidate phone

470 numbers.

471 max_tries -- The maximum number of invalid numbers to try before

472 giving up on the text. This is to cover degenerate cases where

473 the text has a lot of false positives in it. Must be >= 0.

474 """

475 if leniency is None:

476 raise ValueError("Need a leniency value")

477 if int(max_tries) < 0:

478 raise ValueError("Need max_tries to be positive int")

479 # The text searched for phone numbers.

480 self.text = text

481 if self.text is None:

482 self.text = U_EMPTY_STRING

483 # The region (country) to assume for phone numbers without an

484 # international prefix, possibly None.

485 self.preferred_region = region

486 # The degree of validation requested.

487 self.leniency = leniency

488 # The maximum number of retries after matching an invalid number.

489 self._max_tries = int(max_tries)

490 # The iteration tristate.

491 self._state = PhoneNumberMatcher._NOT_READY

492 # The last successful match, None unless in state _READY

493 self._last_match = None

494 # The next index to start searching at. Undefined in state _DONE

495 self._search_index = 0

496

497 def _find(self, index):

498 """Attempts to find the next subsequence in the searched sequence on or after index

499 that represents a phone number. Returns the next match, None if none was found.

500

501 Arguments:

502 index -- The search index to start searching at.

503 Returns the phone number match found, None if none can be found.

504 """

505 match = _PATTERN.search(self.text, index)

506 while self._max_tries > 0 and match is not None:

507 start = match.start()

508 candidate = self.text[start:match.end()]

509

510 # Check for extra numbers at the end.

511 # TODO: This is the place to start when trying to support

512 # extraction of multiple phone number from split notations (+41 79

513 # 123 45 67 / 68).

514 candidate = self._trim_after_first_match(_SECOND_NUMBER_START_PATTERN,

515 candidate)

516

517 match = self._extract_match(candidate, start)

518 if match is not None:

519 return match

520 # Move along

521 index = start + len(candidate)

522 self._max_tries -= 1

523 match = _PATTERN.search(self.text, index)

524 return None

525

526 def _trim_after_first_match(self, pattern, candidate):

527 """Trims away any characters after the first match of pattern in

528 candidate, returning the trimmed version."""

529 trailing_chars_match = pattern.search(candidate)

530 if trailing_chars_match:

531 candidate = candidate[:trailing_chars_match.start()]

532 return candidate

533

534 @classmethod

535 def _is_latin_letter(cls, letter):

536 """Helper method to determine if a character is a Latin-script letter

537 or not. For our purposes, combining marks should also return True

538 since we assume they have been added to a preceding Latin character."""

539 # Combining marks are a subset of non-spacing-mark

540 if (not is_letter(letter) and

541 Category.get(letter) != Category.NON_SPACING_MARK):

542 return False

543 block = Block.get(letter)

544 return (block == Block.BASIC_LATIN or

545 block == Block.LATIN_1_SUPPLEMENT or

546 block == Block.LATIN_EXTENDED_A or

547 block == Block.LATIN_EXTENDED_ADDITIONAL or

548 block == Block.LATIN_EXTENDED_B or

549 block == Block.COMBINING_DIACRITICAL_MARKS)

550

551 @classmethod

552 def _is_invalid_punctuation_symbol(cls, character):

553 return (character == U_PERCENT or

554 Category.get(character) == Category.CURRENCY_SYMBOL)

555

556 def _extract_match(self, candidate, offset):

557 """Attempts to extract a match from a candidate string.

558

559 Arguments:

560 candidate -- The candidate text that might contain a phone number.

561 offset -- The offset of candidate within self.text

562 Returns the match found, None if none can be found

563 """

564 # Skip a match that is more likely a publication page reference or a

565 # date.

566 if (_SLASH_SEPARATED_DATES.search(candidate)):

567 return None

568

569 # Skip potential time-stamps.

570 if _TIME_STAMPS.search(candidate):

571 following_text = self.text[offset + len(candidate):]

572 if _TIME_STAMPS_SUFFIX.match(following_text):

573 return None

574

575 # Try to come up with a valid match given the entire candidate.

576 match = self._parse_and_verify(candidate, offset)

577 if match is not None:

578 return match

579

580 # If that failed, try to find an "inner match" -- there might be a

581 # phone number within this candidate.

582 return self._extract_inner_match(candidate, offset)

583

    def _extract_inner_match(self, candidate, offset):
        """Attempts to extract a match from candidate if the whole candidate
        does not qualify as a match.

        Each pattern in _INNER_MATCHES is tried in turn. Every failed parse
        attempt consumes one unit of self._max_tries.

        Arguments:
        candidate -- The candidate text that might contain a phone number
        offset -- The current offset of candidate within text
        Returns the match found, None if none can be found
        """
        for possible_inner_match in _INNER_MATCHES:
            group_match = possible_inner_match.search(candidate)
            is_first_match = True
            while group_match and self._max_tries > 0:
                if is_first_match:
                    # We should handle any group before this one too.
                    group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
                                                         candidate[:group_match.start()])
                    match = self._parse_and_verify(group, offset)
                    if match is not None:
                        return match
                    self._max_tries -= 1
                    is_first_match = False
                # Note: the budget is not re-checked here, so the captured
                # group is still tried even if the attempt above used up the
                # last try.
                group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
                                                     group_match.group(1))
                match = self._parse_and_verify(group, offset + group_match.start(1))
                if match is not None:
                    return match
                self._max_tries -= 1
                # Resume one character past the start of the previous match,
                # so later matches may overlap earlier ones.
                group_match = possible_inner_match.search(candidate, group_match.start() + 1)
        return None

614

615 def _parse_and_verify(self, candidate, offset):

616 """Parses a phone number from the candidate using phonenumberutil.parse and

617 verifies it matches the requested leniency. If parsing and verification succeed, a

618 corresponding PhoneNumberMatch is returned, otherwise this method returns None.

619

620 Arguments:

621 candidate -- The candidate match.

622 offset -- The offset of candidate within self.text.

623 Returns the parsed and validated phone number match, or None.

624 """

625 try:

626 # Check the candidate doesn't contain any formatting which would

627 # indicate that it really isn't a phone number.

628 if (not fullmatch(_MATCHING_BRACKETS, candidate) or _PUB_PAGES.search(candidate)):

629 return None

630

631 # If leniency is set to VALID or stricter, we also want to skip

632 # numbers that are surrounded by Latin alphabetic characters, to

633 # skip cases like abc8005001234 or 8005001234def.

634 if self.leniency >= Leniency.VALID:

635 # If the candidate is not at the start of the text, and does

636 # not start with phone-number punctuation, check the previous

637 # character

638 if (offset > 0 and

639 not _LEAD_PATTERN.match(candidate)):

640 previous_char = self.text[offset - 1]

641 # We return None if it is a latin letter or an invalid

642 # punctuation symbol

643 if (self._is_invalid_punctuation_symbol(previous_char) or

644 self._is_latin_letter(previous_char)):

645 return None

646 last_char_index = offset + len(candidate)

647 if last_char_index < len(self.text):

648 next_char = self.text[last_char_index]

649 if (self._is_invalid_punctuation_symbol(next_char) or

650 self._is_latin_letter(next_char)):

651 return None

652

653 numobj = parse(candidate, self.preferred_region, keep_raw_input=True)

654 if _verify(self.leniency, numobj, candidate, self):

655 # We used parse(keep_raw_input=True) to create this number,

656 # but for now we don't return the extra values parsed.

657 # TODO: stop clearing all values here and switch all users

658 # over to using raw_input rather than the raw_string of

659 # PhoneNumberMatch.

660 numobj.country_code_source = CountryCodeSource.UNSPECIFIED

661 numobj.raw_input = None

662 numobj.preferred_domestic_carrier_code = None

663 return PhoneNumberMatch(offset, candidate, numobj)

664 except NumberParseException:

665 # ignore and continue

666 pass

667 return None

668

def _check_number_grouping_is_valid(self, numobj, candidate, checker):
    """Check whether the grouping of candidate is acceptable for numobj.

    The checker is first tried against the groups produced by
    _get_national_number_groups_without_pattern().  If that fails, it is
    tried against the groups of each alternate format listed in
    _ALT_NUMBER_FORMATS for the number's country code, skipping formats
    whose leading-digits pattern does not match the national significant
    number.

    Arguments:
    numobj -- The parsed phone number object.
    candidate -- The candidate text the number was parsed from.
    checker -- Callable invoked as checker(numobj, normalized_candidate,
              formatted_number_groups); a true result accepts the grouping.

    Returns True if any attempt is accepted by checker, otherwise False.
    """
    # Second argument True: keep non-digits while normalizing.
    digits_kept = normalize_digits_only(candidate, True)
    default_groups = _get_national_number_groups_without_pattern(numobj)
    if checker(numobj, digits_kept, default_groups):
        return True

    # The default grouping was rejected; fall back to alternate formats.
    alt_formats = _ALT_NUMBER_FORMATS.get(numobj.country_code, None)
    nsn = national_significant_number(numobj)
    if alt_formats is None:
        return False

    for alt_format in alt_formats:
        leading = alt_format.leading_digits_pattern
        # Alternate formats carry at most one leading digits pattern; when
        # present it must match the start of the NSN for the format to apply.
        if len(leading) > 0 and not re.compile(leading[0]).match(nsn):
            continue
        alt_groups = _get_national_number_groups(numobj, alt_format)
        if checker(numobj, digits_kept, alt_groups):
            return True
    return False

689

def has_next(self):
    """Indicates whether there is another match available.

    When the matcher is in the _NOT_READY state this runs self._find()
    from the current search index, caches the outcome in self._last_match
    and moves the state to _READY (advancing the search index to the end
    of the match) or to _DONE when nothing was found.

    Returns True if a match is ready to be returned by next().
    """
    if self._state == PhoneNumberMatcher._NOT_READY:
        found = self._find(self._search_index)
        self._last_match = found
        if found is not None:
            # Resume the following search just past this match.
            self._search_index = found.end
            self._state = PhoneNumberMatcher._READY
        else:
            self._state = PhoneNumberMatcher._DONE
    return self._state == PhoneNumberMatcher._READY

700

def next(self):
    """Return the next match.

    has_next() is consulted first, which finds the next match as a
    side-effect if necessary.

    Raises StopIteration if no next match is available.
    """
    if self.has_next():
        # Hand the cached match over and drop our reference to it so the
        # memory is not retained any longer than necessary.
        match, self._last_match = self._last_match, None
        self._state = PhoneNumberMatcher._NOT_READY
        return match
    raise StopIteration("No next match")

711

def __iter__(self):
    """Yield each remaining match in turn until has_next() reports none."""
    while True:
        if not self.has_next():
            return
        yield self.next()

715

716

class PhoneNumberMatch(UnicodeMixin):
    """The match of a phone number within a piece of text.

    Instances are meant to be treated as immutable, although this is not
    enforced.  Matches are produced by a PhoneNumberMatcher, through its
    has_next()/next() methods or by iterating over it.

    A match consists of the phone number (in .number) as well as the .start
    and .end offsets of the corresponding subsequence of the searched
    text. Use .raw_string to obtain a copy of the matched subsequence.

    The following annotated example clarifies the relationship between the
    searched text, the match offsets, and the parsed number:

    >>> text = "Call me at +1 425 882-8080 for details."
    >>> country = "US"
    >>> import phonenumbers
    >>> matcher = phonenumbers.PhoneNumberMatcher(text, country)
    >>> matcher.has_next()
    True
    >>> m = matcher.next()  # Find the first phone number match
    >>> m.raw_string  # contains the phone number as it appears in the text.
    '+1 425 882-8080'
    >>> (m.start, m.end)  # define the range of the matched subsequence.
    (11, 26)
    >>> text[m.start:m.end]
    '+1 425 882-8080'
    >>> phonenumbers.phonenumberutil.parse("+1 425 882-8080", "US") == m.number
    True
    """
    def __init__(self, start, raw_string, numobj):
        """Create a match record.

        Arguments:
        start -- The start index of the match within the searched text;
                  must be >= 0.
        raw_string -- The raw substring that was matched; must not be None.
        numobj -- The matched phone number object; must not be None.

        Raises ValueError (a subclass of Exception, so existing handlers
        that catch Exception keep working) if an argument is invalid.
        """
        if start < 0:
            raise ValueError("Start index not >= 0")
        if raw_string is None or numobj is None:
            raise ValueError("Invalid argument")
        # The start index into the text.
        self.start = start
        # The raw substring matched.
        self.raw_string = raw_string
        # The exclusive end index, derived from start and the match length.
        self.end = self.start + len(raw_string)
        # The matched phone number.
        self.number = numobj

    def __eq__(self, other):
        """Matches are equal when start, raw_string, end and number all agree."""
        if not isinstance(other, PhoneNumberMatch):
            return False
        return (self.start == other.start and
                self.raw_string == other.raw_string and
                self.end == other.end and
                self.number == other.number)

    def __ne__(self, other):
        """Inverse of __eq__."""
        return not self.__eq__(other)

    def __repr__(self):
        """Return a constructor-style representation of the match."""
        return (unicod("PhoneNumberMatch(start=%r, raw_string=%r, numobj=%r)") %
                (self.start,
                 self.raw_string,
                 self.number))

    def __unicode__(self):
        """Return the half-open [start,end) range followed by the raw string."""
        return unicod("PhoneNumberMatch [%s,%s) %s") % (self.start, self.end, self.raw_string)