Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/phonenumbers/phonenumbermatcher.py: 20%
311 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""Functionality to match phone numbers in a piece of text"""
3# Based on original Java code:
4# java/src/com/google/i18n/phonenumbers/PhoneNumberMatch.java
5# java/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java
6# Copyright (C) 2011 The Libphonenumber Authors
7#
8# Licensed under the Apache License, Version 2.0 (the "License");
9# you may not use this file except in compliance with the License.
10# You may obtain a copy of the License at
11#
12# http://www.apache.org/licenses/LICENSE-2.0
13#
14# Unless required by applicable law or agreed to in writing, software
15# distributed under the License is distributed on an "AS IS" BASIS,
16# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17# See the License for the specific language governing permissions and
18# limitations under the License.
19import re
21# Extra regexp function; see README
22from .re_util import fullmatch
23from .util import UnicodeMixin, u, unicod, prnt
24from .util import U_EMPTY_STRING, U_DASH, U_SEMICOLON, U_SLASH, U_X_LOWER, U_X_UPPER, U_PERCENT
25from .unicode_util import Category, Block, is_letter
26from .phonenumberutil import _MAX_LENGTH_FOR_NSN, _MAX_LENGTH_COUNTRY_CODE
27from .phonenumberutil import _VALID_PUNCTUATION, _PLUS_CHARS, NON_DIGITS_PATTERN
28from .phonenumberutil import _EXTN_PATTERNS_FOR_MATCHING, _REGEX_FLAGS
29from .phonenumberutil import _SECOND_NUMBER_START_PATTERN, _UNWANTED_END_CHAR_PATTERN
30from .phonenumberutil import MatchType, NumberParseException, PhoneNumberFormat
31from .phonenumberutil import is_possible_number, is_valid_number, parse
32from .phonenumberutil import normalize_digits_only, national_significant_number
33from .phonenumberutil import _format_nsn_using_pattern, ndd_prefix_for_region
34from .phonenumberutil import format_number, is_number_match, region_code_for_country_code
35from .phonenumberutil import _maybe_strip_national_prefix_carrier_code
36from .phonenumberutil import _choose_formatting_pattern_for_number
37from .phonenumberutil import _formatting_rule_has_first_group_only
38from .phonenumber import CountryCodeSource
39from .phonemetadata import PhoneMetadata
# Import auto-generated data structures
try:
    from .data import _ALT_NUMBER_FORMATS
except ImportError:  # pragma no cover
    # Before the generated code exists, the data/ directory is empty.
    # The generation process imports this module, creating a circular
    # dependency.  The hack below works around this: when running under one
    # of the generator scripts, fall back to an empty table; otherwise the
    # import failure is genuine and is re-raised.
    import os
    import sys
    if os.path.basename(sys.argv[0]) in ("buildmetadatafromxml.py", "buildprefixdata.py"):
        prnt("Failed to import generated data (but OK as during autogeneration)", file=sys.stderr)
        _ALT_NUMBER_FORMATS = {}
    else:
        raise
57def _limit(lower, upper):
58 """Returns a regular expression quantifier with an upper and lower limit."""
59 if ((lower < 0) or (upper <= 0) or (upper < lower)): 59 ↛ 60line 59 didn't jump to line 60, because the condition on line 59 was never true
60 raise Exception("Illegal argument to _limit")
61 return unicod("{%d,%d}") % (lower, upper)
# Build the MATCHING_BRACKETS and PATTERN regular expression patterns. The
# building blocks below exist to make the patterns more easily understood.
_OPENING_PARENS = u("(\\[\uFF08\uFF3B")
_CLOSING_PARENS = u(")\\]\uFF09\uFF3D")
_NON_PARENS = u("[^") + _OPENING_PARENS + _CLOSING_PARENS + u("]")
# Limit on the number of pairs of brackets in a phone number.
_BRACKET_PAIR_LIMIT = _limit(0, 3)

# Pattern to check that brackets match. Opening brackets should be closed
# within a phone number.  This also checks that there is something inside the
# brackets. Having no brackets at all is also fine.
#
# An opening bracket at the beginning may not be closed, but subsequent ones
# should be.  It's also possible that the leading bracket was dropped, so we
# shouldn't be surprised if we see a closing bracket first. We limit the sets
# of brackets in a phone number to four.
_MATCHING_BRACKETS = re.compile(u("(?:[") + _OPENING_PARENS + u("])?") + u("(?:") + _NON_PARENS + u("+") +
                                u("[") + _CLOSING_PARENS + u("])?") +
                                _NON_PARENS + u("+") +
                                u("(?:[") + _OPENING_PARENS + u("]") + _NON_PARENS +
                                u("+[") + _CLOSING_PARENS + u("])") + _BRACKET_PAIR_LIMIT +
                                _NON_PARENS + u("*"))
# Limit on the number of leading (plus) characters.
_LEAD_LIMIT = _limit(0, 2)
# Limit on the number of consecutive punctuation characters.
_PUNCTUATION_LIMIT = _limit(0, 4)
# The maximum number of digits allowed in a digit-separated block. As we allow
# all digits in a single block, set high enough to accommodate the entire
# national number and the international country code.
_DIGIT_BLOCK_LIMIT = (_MAX_LENGTH_FOR_NSN + _MAX_LENGTH_COUNTRY_CODE)
# Limit on the number of blocks separated by punctuation. Use _DIGIT_BLOCK_LIMIT
# since some formats use spaces to separate each digit.
_BLOCK_LIMIT = _limit(0, _DIGIT_BLOCK_LIMIT)

# A punctuation sequence allowing white space.
_PUNCTUATION = u("[") + _VALID_PUNCTUATION + u("]") + _PUNCTUATION_LIMIT
# A digits block without punctuation.
_DIGIT_SEQUENCE = u("\\d") + _limit(1, _DIGIT_BLOCK_LIMIT)
# Punctuation that may be at the start of a phone number - brackets and plus signs.
_LEAD_CLASS_CHARS = _OPENING_PARENS + _PLUS_CHARS
_LEAD_CLASS = u("[") + _LEAD_CLASS_CHARS + u("]")
_LEAD_PATTERN = re.compile(_LEAD_CLASS)

# Phone number pattern allowing optional punctuation.
# This is the phone number pattern used by _find(), similar to
# phonenumberutil._VALID_PHONE_NUMBER, but with the following differences:
#  - All captures are limited in order to place an upper bound to the text
#    matched by the pattern.
#  - Leading punctuation / plus signs are limited.
#  - Consecutive occurrences of punctuation are limited.
#  - Number of digits is limited.
#  - No whitespace is allowed at the start or end.
#  - No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently
#    supported.
_PATTERN = re.compile(u("(?:") + _LEAD_CLASS + _PUNCTUATION + u(")") + _LEAD_LIMIT +
                      _DIGIT_SEQUENCE + u("(?:") + _PUNCTUATION + _DIGIT_SEQUENCE + u(")") + _BLOCK_LIMIT +
                      u("(?:") + _EXTN_PATTERNS_FOR_MATCHING + u(")?"),
                      _REGEX_FLAGS)
# Matches strings that look like publication pages. Example: "Computing
# Complete Answers to Queries in the Presence of Limited Access Patterns.
# Chen Li. VLDB J. 12(3): 211-227 (2003)."
#
# The string "211-227 (2003)" is not a telephone number.
_PUB_PAGES = re.compile(u("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}"))

# Matches strings that look like dates using "/" as a separator. Examples:
# 3/10/2011, 31/10/96 or 08/31/95.
_SLASH_SEPARATED_DATES = re.compile(u("(?:(?:[0-3]?\\d/[01]?\\d)|(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}"))

# Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
# not include the trailing ":\d\d" -- that is covered by TIME_STAMPS_SUFFIX.
_TIME_STAMPS = re.compile(u("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d +[0-2]\\d$"))
_TIME_STAMPS_SUFFIX = re.compile(u(":[0-5]\\d"))
# Patterns used to extract phone numbers from a larger phone-number-like
# pattern. These are ordered according to specificity. For example,
# white-space is last since that is frequently used in numbers, not just to
# separate two numbers. We have separate patterns since we don't want to break
# up the phone-number-like text on more than one different kind of symbol at
# one time, although symbols of the same type (e.g. space) can be safely
# grouped together.
#
# Note that if there is a match, we will always check any text found up to the
# first match as well.
_INNER_MATCHES = (
    # Breaks on the slash - e.g. "651-234-2345/332-445-1234"
    re.compile(u("/+(.*)")),
    # Note that the bracket here is inside the capturing group, since we
    # consider it part of the phone number. Will match a pattern like "(650)
    # 223 3345 (754) 223 3321".
    re.compile(u("(\\([^(]*)")),
    # Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number." We
    # require a space on either side of the hyphen for it to be considered a
    # separator.
    re.compile(u("(?u)(?:\\s-|-\\s)\\s*(.+)")),
    # Various types of wide hyphens. Note we have decided not to enforce a
    # space here, since it's possible that it's supposed to be used to break
    # two numbers without spaces, and we haven't seen many instances of it
    # used within a number.
    re.compile(u("(?u)[\u2012-\u2015\uFF0D]\\s*(.+)")),
    # Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
    re.compile(u("(?u)\\.+\\s*([^.]+)")),
    # Breaks on space - e.g. "3324451234 8002341234"
    re.compile(u("(?u)\\s+(\\S+)")))
class Leniency(object):
    """Leniency when finding potential phone numbers in text segments.

    The levels here are ordered in increasing strictness."""
    # Phone numbers accepted are possible (i.e. is_possible_number(number)) but
    # not necessarily valid (is_valid_number(number)).
    POSSIBLE = 0
    # Phone numbers accepted are both possible (is_possible_number(number))
    # and valid (is_valid_number(PhoneNumber)). Numbers written in national
    # format must have their national-prefix present if it is usually written
    # for a number of this type.
    VALID = 1
    # Phone numbers accepted are valid (i.e. is_valid_number(number)) and are
    # grouped in a possible way for this locale. For example, a US number
    # written as "65 02 53 00 00" and "650253 0000" are not accepted at this
    # leniency level, whereas "650 253 0000", "650 2530000" or "6502530000"
    # are.
    # Numbers with more than one '/' symbol in the national significant number
    # are also dropped at this level.
    #
    # Warning: This level might result in lower coverage especially for
    # regions outside of country code "+1". If you are not sure about which
    # level to use, email the discussion group
    # libphonenumber-discuss@googlegroups.com.
    STRICT_GROUPING = 2
    # Phone numbers accepted are valid (i.e. is_valid_number(number)) and are
    # grouped in the same way that we would have formatted it, or as a single
    # block. For example, a US number written as "650 2530000" is not accepted
    # at this leniency level, whereas "650 253 0000" or "6502530000" are.
    # Numbers with more than one '/' symbol are also dropped at this level.
    # Warning: This level might result in lower coverage especially for
    # regions outside of country code "+1". If you are not sure about which
    # level to use, email the discussion group
    # libphonenumber-discuss@googlegroups.com.
    EXACT_GROUPING = 3
def _verify(leniency, numobj, candidate, matcher):
    """Returns True if number is a verified number according to the
    leniency.

    Arguments:
    leniency -- One of the Leniency constants selecting the checks to apply.
    numobj -- The parsed phone number object to verify.
    candidate -- The text the number was parsed from.
    matcher -- The matcher whose _check_number_grouping_is_valid method is
              used by the two grouping leniency levels.
    Raises Exception if leniency is not one of the known Leniency values.
    """
    if leniency == Leniency.POSSIBLE:
        return is_possible_number(numobj)
    elif leniency == Leniency.VALID:
        if (not is_valid_number(numobj) or
            not _contains_only_valid_x_chars(numobj, candidate)):
            return False
        return _is_national_prefix_present_if_required(numobj)
    elif leniency == Leniency.STRICT_GROUPING:
        return _verify_strict_grouping(numobj, candidate, matcher)
    elif leniency == Leniency.EXACT_GROUPING:
        return _verify_exact_grouping(numobj, candidate, matcher)
    else:
        raise Exception("Error: unsupported Leniency value %s" % leniency)
def _verify_strict_grouping(numobj, candidate, matcher):
    """Returns True if numobj passes the Leniency.STRICT_GROUPING checks.

    The number must be valid, contain only valid 'x' characters, not contain
    more than one slash in the national number and have its national prefix
    present if required; the digit grouping of candidate is then checked via
    the matcher using _all_number_groups_remain_grouped.
    """
    if (not is_valid_number(numobj) or
        not _contains_only_valid_x_chars(numobj, candidate) or
        _contains_more_than_one_slash_in_national_number(numobj, candidate) or
        not _is_national_prefix_present_if_required(numobj)):
        return False
    return matcher._check_number_grouping_is_valid(numobj, candidate,
                                                   _all_number_groups_remain_grouped)
def _all_number_groups_remain_grouped(numobj, normalized_candidate, formatted_number_groups):
    """Returns True if the groups of digits found in our candidate phone number match our
    expectations.

    Arguments:
    numobj -- the original number we found when parsing
    normalized_candidate -- the candidate number, normalized to only contain ASCII digits,
              but with non-digits (spaces etc) retained
    formatted_number_groups -- the groups of digits that we would expect to see if we
              formatted this number
    Returns True if expectations matched.
    """
    from_index = 0
    if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
        # First skip the country code if the normalized candidate contained it.
        country_code = str(numobj.country_code)
        from_index = normalized_candidate.find(country_code) + len(country_code)
    # Check each group of consecutive digits are not broken into separate
    # groupings in the candidate string.
    for ii, formatted_number_group in enumerate(formatted_number_groups):
        # Fails if the substring of normalized_candidate starting from
        # from_index doesn't contain the consecutive digits in
        # formatted_number_group.
        from_index = normalized_candidate.find(formatted_number_group, from_index)
        if from_index < 0:
            return False
        # Moves from_index forward.
        from_index += len(formatted_number_group)
        if (ii == 0 and from_index < len(normalized_candidate)):
            # We are at the position right after the NDC.  We get the region
            # used for formatting information based on the country code in the
            # phone number, rather than the number itself, as we do not need
            # to distinguish between different countries with the same country
            # calling code and this is faster.
            region = region_code_for_country_code(numobj.country_code)
            if (ndd_prefix_for_region(region, True) is not None and
                normalized_candidate[from_index].isdigit()):
                # This means there is no formatting symbol after the NDC. In
                # this case, we only accept the number if there is no
                # formatting symbol at all in the number, except for
                # extensions.  This is only important for countries with
                # national prefixes.
                nsn = national_significant_number(numobj)
                return normalized_candidate[(from_index - len(formatted_number_group)):].startswith(nsn)
    # The check here makes sure that we haven't mistakenly already used the extension to
    # match the last group of the subscriber number. Note the extension cannot have
    # formatting in-between digits.
    return (normalized_candidate[from_index:].find(numobj.extension or U_EMPTY_STRING) != -1)
def _verify_exact_grouping(numobj, candidate, matcher):
    """Returns True if numobj passes the Leniency.EXACT_GROUPING checks.

    The number must be valid, contain only valid 'x' characters, not contain
    more than one slash in the national number and have its national prefix
    present if required; the digit grouping of candidate is then checked via
    the matcher using _all_number_groups_are_exactly_present.
    """
    if (not is_valid_number(numobj) or
        not _contains_only_valid_x_chars(numobj, candidate) or
        _contains_more_than_one_slash_in_national_number(numobj, candidate) or
        not _is_national_prefix_present_if_required(numobj)):
        return False
    return matcher._check_number_grouping_is_valid(numobj, candidate,
                                                   _all_number_groups_are_exactly_present)
def _all_number_groups_are_exactly_present(numobj, normalized_candidate, formatted_number_groups):
    """Returns True if the groups of digits found in our candidate phone number match our
    expectations.

    Arguments:
    numobj -- the original number we found when parsing
    normalized_candidate -- the candidate number, normalized to only contain ASCII digits,
              but with non-digits (spaces etc) retained
    formatted_number_groups -- the groups of digits that we would expect to see if we
              formatted this number
    Returns True if expectations matched.
    """
    candidate_groups = re.split(NON_DIGITS_PATTERN, normalized_candidate)
    # Set this to the last group, skipping it if the number has an extension.
    if numobj.extension is not None:
        candidate_number_group_index = len(candidate_groups) - 2
    else:
        candidate_number_group_index = len(candidate_groups) - 1
    # First we check if the national significant number is formatted as a
    # block.  We use contains and not equals, since the national significant
    # number may be present with a prefix such as a national number prefix, or
    # the country code itself.
    if (len(candidate_groups) == 1 or
        candidate_groups[candidate_number_group_index].find(national_significant_number(numobj)) != -1):
        return True
    # Starting from the end, go through in reverse, excluding the first group,
    # and check the candidate and number groups are the same.
    formatted_number_group_index = len(formatted_number_groups) - 1
    while (formatted_number_group_index > 0 and candidate_number_group_index >= 0):
        if (candidate_groups[candidate_number_group_index] !=
            formatted_number_groups[formatted_number_group_index]):
            return False
        formatted_number_group_index -= 1
        candidate_number_group_index -= 1
    # Now check the first group. There may be a national prefix at the start, so we only check
    # that the candidate group ends with the formatted number group.
    return (candidate_number_group_index >= 0 and
            candidate_groups[candidate_number_group_index].endswith(formatted_number_groups[0]))
def _get_national_number_groups_without_pattern(numobj):
    """Helper method to get the national-number part of a number, formatted without any national
    prefix, and return it as a set of digit blocks that would be formatted together following
    standard formatting rules.

    Returns a list of digit-group strings."""
    # This will be in the format +CC-DG1-DG2-DGX;ext=EXT where DG1..DGX represents groups of
    # digits.
    rfc3966_format = format_number(numobj, PhoneNumberFormat.RFC3966)
    # We remove the extension part from the formatted string before splitting
    # it into different groups.
    end_index = rfc3966_format.find(U_SEMICOLON)
    if end_index < 0:
        end_index = len(rfc3966_format)

    # The country-code will have a '-' following it.
    start_index = rfc3966_format.find(U_DASH) + 1
    return rfc3966_format[start_index:end_index].split(U_DASH)
def _get_national_number_groups(numobj, formatting_pattern):
    """Helper method to get the national-number part of a number, formatted without any national
    prefix, and return it as a set of digit blocks that should be formatted together according to
    the formatting pattern passed in.

    Returns a list of digit-group strings."""
    # If a format is provided, we format the NSN only, and split that according to the separator.
    nsn = national_significant_number(numobj)
    return _format_nsn_using_pattern(nsn, formatting_pattern,
                                     PhoneNumberFormat.RFC3966).split(U_DASH)
def _contains_more_than_one_slash_in_national_number(numobj, candidate):
    """Returns True if candidate has too many slashes to be accepted.

    Zero or one slash is always acceptable.  Two slashes are acceptable only
    when the candidate carries a country calling code and the text before the
    first slash is exactly that country code; any further slash is not.

    Arguments:
    numobj -- The number parsed from candidate.
    candidate -- The candidate text being checked.
    """
    first_slash_in_body_index = candidate.find(U_SLASH)
    if first_slash_in_body_index < 0:
        # No slashes, this is okay.
        return False
    # Now look for a second one.
    second_slash_in_body_index = candidate.find(U_SLASH, first_slash_in_body_index + 1)
    if second_slash_in_body_index < 0:
        # Only one slash, this is okay.
        return False

    # If the first slash is after the country calling code, this is permitted.
    candidate_has_country_code = (numobj.country_code_source == CountryCodeSource.FROM_NUMBER_WITH_PLUS_SIGN or
                                  numobj.country_code_source == CountryCodeSource.FROM_NUMBER_WITHOUT_PLUS_SIGN)
    if (candidate_has_country_code and
        normalize_digits_only(candidate[:first_slash_in_body_index]) ==
        unicod(numobj.country_code)):
        # Any more slashes and this is illegal.
        return (candidate[(second_slash_in_body_index + 1):].find(U_SLASH) != -1)
    return True
def _contains_only_valid_x_chars(numobj, candidate):
    """Returns True if every 'x' or 'X' in candidate is used in a valid way.

    Arguments:
    numobj -- The number parsed from candidate.
    candidate -- The candidate text being checked.
    """
    # The characters 'x' and 'X' can be (1) a carrier code, in which case they
    # always precede the national significant number or (2) an extension sign,
    # in which case they always precede the extension number. We assume a
    # carrier code is more than 1 digit, so the first case has to have more
    # than 1 consecutive 'x' or 'X', whereas the second case can only have
    # exactly 1 'x' or 'X'. We ignore the character if it appears as the last
    # character of the string.
    ii = 0
    while ii < (len(candidate) - 1):
        if (candidate[ii] == U_X_LOWER or candidate[ii] == U_X_UPPER):
            next_char = candidate[ii + 1]
            if (next_char == U_X_LOWER or next_char == U_X_UPPER):
                # This is the carrier code case, in which the 'X's always
                # precede the national significant number.
                ii += 1
                if is_number_match(numobj, candidate[ii:]) != MatchType.NSN_MATCH:
                    return False
            # This is the extension sign case, in which the 'x' or 'X' should
            # always precede the extension number.
            elif normalize_digits_only(candidate[ii:]) != numobj.extension:
                return False
        ii += 1
    return True
def _is_national_prefix_present_if_required(numobj):
    """Returns True unless a required national prefix is missing from numobj.

    The check only applies to numbers whose country code came from the
    default country; numobj.raw_input is read to look for the prefix.
    """
    # First, check how we deduced the country code. If it was written in
    # international format, then the national prefix is not required.
    if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
        return True
    phone_number_region = region_code_for_country_code(numobj.country_code)
    metadata = PhoneMetadata.metadata_for_region(phone_number_region, None)
    if metadata is None:
        return True
    # Check if a national prefix should be present when formatting this number.
    national_number = national_significant_number(numobj)
    format_rule = _choose_formatting_pattern_for_number(metadata.number_format,
                                                        national_number)
    # To do this, we check that a national prefix formatting rule was present
    # and that it wasn't just the first-group symbol ($1) with punctuation.
    if (format_rule is not None and
        format_rule.national_prefix_formatting_rule):
        if format_rule.national_prefix_optional_when_formatting:
            # The national-prefix is optional in these cases, so we don't need
            # to check if it was present.
            return True
        if _formatting_rule_has_first_group_only(format_rule.national_prefix_formatting_rule):
            # National Prefix not needed for this number.
            return True
        # Normalize the remainder.
        raw_input = normalize_digits_only(numobj.raw_input)
        # Check if we found a national prefix and/or carrier code at the start of the raw input,
        # and return the result.
        return _maybe_strip_national_prefix_carrier_code(raw_input, metadata)[2]
    return True
class PhoneNumberMatcher(object):
    """A stateful class that finds and extracts telephone numbers from text.

    Vanity numbers (phone numbers using alphabetic digits such as '1-800-SIX-FLAGS') are
    not found.

    This class is not thread-safe.
    """
    # The potential states of a PhoneNumberMatcher.
    _NOT_READY = 0
    _READY = 1
    _DONE = 2

    def __init__(self, text, region,
                 leniency=Leniency.VALID, max_tries=65535):
        """Creates a new instance.

        Arguments:
        text -- The character sequence that we will search, None for no text.
        region -- The region (country) to assume for phone numbers not
                  written in international format (with a leading plus, or
                  with the international dialing prefix of the specified
                  region). May be None or "ZZ" if only numbers with a leading
                  plus should be considered.
        leniency -- The leniency to use when evaluating candidate phone
                  numbers.
        max_tries -- The maximum number of invalid numbers to try before
                  giving up on the text.  This is to cover degenerate cases where
                  the text has a lot of false positives in it. Must be >= 0.
        Raises ValueError if leniency is None or max_tries is negative.
        """
        if leniency is None:
            raise ValueError("Need a leniency value")
        if int(max_tries) < 0:
            raise ValueError("Need max_tries to be positive int")
        # The text searched for phone numbers.
        self.text = text
        if self.text is None:
            self.text = U_EMPTY_STRING
        # The region (country) to assume for phone numbers without an
        # international prefix, possibly None.
        self.preferred_region = region
        # The degree of validation requested.
        self.leniency = leniency
        # The maximum number of retries after matching an invalid number.
        self._max_tries = int(max_tries)
        # The iteration tristate.
        self._state = PhoneNumberMatcher._NOT_READY
        # The last successful match, None unless in state _READY
        self._last_match = None
        # The next index to start searching at. Undefined in state _DONE
        self._search_index = 0

    def _find(self, index):
        """Attempts to find the next subsequence in the searched sequence on or after index
        that represents a phone number. Returns the next match, None if none was found.

        Arguments:
        index -- The search index to start searching at.
        Returns the phone number match found, None if none can be found.
        """
        match = _PATTERN.search(self.text, index)
        while self._max_tries > 0 and match is not None:
            start = match.start()
            candidate = self.text[start:match.end()]

            # Check for extra numbers at the end.
            # TODO: This is the place to start when trying to support
            # extraction of multiple phone number from split notations (+41 79
            # 123 45 67 / 68).
            candidate = self._trim_after_first_match(_SECOND_NUMBER_START_PATTERN,
                                                     candidate)

            match = self._extract_match(candidate, start)
            if match is not None:
                return match
            # Move along
            index = start + len(candidate)
            self._max_tries -= 1
            match = _PATTERN.search(self.text, index)
        return None

    def _trim_after_first_match(self, pattern, candidate):
        """Trims away any characters after the first match of pattern in
        candidate, returning the trimmed version."""
        trailing_chars_match = pattern.search(candidate)
        if trailing_chars_match:
            candidate = candidate[:trailing_chars_match.start()]
        return candidate

    @classmethod
    def _is_latin_letter(cls, letter):
        """Helper method to determine if a character is a Latin-script letter
        or not. For our purposes, combining marks should also return True
        since we assume they have been added to a preceding Latin character."""
        # Combining marks are a subset of non-spacing-mark
        if (not is_letter(letter) and
            Category.get(letter) != Category.NON_SPACING_MARK):
            return False
        block = Block.get(letter)
        return (block == Block.BASIC_LATIN or
                block == Block.LATIN_1_SUPPLEMENT or
                block == Block.LATIN_EXTENDED_A or
                block == Block.LATIN_EXTENDED_ADDITIONAL or
                block == Block.LATIN_EXTENDED_B or
                block == Block.COMBINING_DIACRITICAL_MARKS)

    @classmethod
    def _is_invalid_punctuation_symbol(cls, character):
        """Returns True if character is a percent sign or a currency symbol."""
        return (character == U_PERCENT or
                Category.get(character) == Category.CURRENCY_SYMBOL)

    def _extract_match(self, candidate, offset):
        """Attempts to extract a match from a candidate string.

        Arguments:
        candidate -- The candidate text that might contain a phone number.
        offset -- The offset of candidate within self.text
        Returns the match found, None if none can be found
        """
        # Skip a match that is more likely a publication page reference or a
        # date.
        if (_SLASH_SEPARATED_DATES.search(candidate)):
            return None

        # Skip potential time-stamps.
        if _TIME_STAMPS.search(candidate):
            following_text = self.text[offset + len(candidate):]
            if _TIME_STAMPS_SUFFIX.match(following_text):
                return None

        # Try to come up with a valid match given the entire candidate.
        match = self._parse_and_verify(candidate, offset)
        if match is not None:
            return match

        # If that failed, try to find an "inner match" -- there might be a
        # phone number within this candidate.
        return self._extract_inner_match(candidate, offset)

    def _extract_inner_match(self, candidate, offset):
        """Attempts to extract a match from candidate if the whole candidate
        does not qualify as a match.

        Arguments:
        candidate -- The candidate text that might contain a phone number
        offset -- The current offset of candidate within text
        Returns the match found, None if none can be found
        """
        for possible_inner_match in _INNER_MATCHES:
            group_match = possible_inner_match.search(candidate)
            is_first_match = True
            while group_match and self._max_tries > 0:
                if is_first_match:
                    # We should handle any group before this one too.
                    group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
                                                         candidate[:group_match.start()])
                    match = self._parse_and_verify(group, offset)
                    if match is not None:
                        return match
                    self._max_tries -= 1
                    is_first_match = False
                group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
                                                     group_match.group(1))
                match = self._parse_and_verify(group, offset + group_match.start(1))
                if match is not None:
                    return match
                self._max_tries -= 1
                group_match = possible_inner_match.search(candidate, group_match.start() + 1)
        return None

    def _parse_and_verify(self, candidate, offset):
        """Parses a phone number from the candidate using phonenumberutil.parse and
        verifies it matches the requested leniency. If parsing and verification succeed, a
        corresponding PhoneNumberMatch is returned, otherwise this method returns None.

        Arguments:
        candidate -- The candidate match.
        offset -- The offset of candidate within self.text.
        Returns the parsed and validated phone number match, or None.
        """
        try:
            # Check the candidate doesn't contain any formatting which would
            # indicate that it really isn't a phone number.
            if (not fullmatch(_MATCHING_BRACKETS, candidate) or _PUB_PAGES.search(candidate)):
                return None

            # If leniency is set to VALID or stricter, we also want to skip
            # numbers that are surrounded by Latin alphabetic characters, to
            # skip cases like abc8005001234 or 8005001234def.
            if self.leniency >= Leniency.VALID:
                # If the candidate is not at the start of the text, and does
                # not start with phone-number punctuation, check the previous
                # character
                if (offset > 0 and
                    not _LEAD_PATTERN.match(candidate)):
                    previous_char = self.text[offset - 1]
                    # We return None if it is a latin letter or an invalid
                    # punctuation symbol
                    if (self._is_invalid_punctuation_symbol(previous_char) or
                        self._is_latin_letter(previous_char)):
                        return None
                last_char_index = offset + len(candidate)
                if last_char_index < len(self.text):
                    next_char = self.text[last_char_index]
                    if (self._is_invalid_punctuation_symbol(next_char) or
                        self._is_latin_letter(next_char)):
                        return None

            numobj = parse(candidate, self.preferred_region, keep_raw_input=True)
            if _verify(self.leniency, numobj, candidate, self):
                # We used parse(keep_raw_input=True) to create this number,
                # but for now we don't return the extra values parsed.
                # TODO: stop clearing all values here and switch all users
                # over to using raw_input rather than the raw_string of
                # PhoneNumberMatch.
                numobj.country_code_source = CountryCodeSource.UNSPECIFIED
                numobj.raw_input = None
                numobj.preferred_domestic_carrier_code = None
                return PhoneNumberMatch(offset, candidate, numobj)
        except NumberParseException:
            # ignore and continue
            pass
        return None

    def _check_number_grouping_is_valid(self, numobj, candidate, checker):
        """Returns True if checker accepts the digit grouping of candidate.

        Arguments:
        numobj -- The number parsed from candidate.
        candidate -- The candidate text.
        checker -- A callable taking (numobj, normalized_candidate,
                  formatted_number_groups) and returning a boolean.  It is
                  tried first with the groups from the standard formatting,
                  then with each applicable alternate format.
        """
        normalized_candidate = normalize_digits_only(candidate, True)  # keep non-digits
        formatted_number_groups = _get_national_number_groups_without_pattern(numobj)
        if checker(numobj, normalized_candidate, formatted_number_groups):
            return True
        # If this didn't pass, see if there are any alternate formats that match, and try them instead.
        alternate_formats = _ALT_NUMBER_FORMATS.get(numobj.country_code, None)
        nsn = national_significant_number(numobj)
        if alternate_formats is not None:
            for alternate_format in alternate_formats:
                if len(alternate_format.leading_digits_pattern) > 0:
                    # There is only one leading digits pattern for alternate formats.
                    pattern = re.compile(alternate_format.leading_digits_pattern[0])
                    if not pattern.match(nsn):
                        # Leading digits don't match; try another one.
                        continue
                formatted_number_groups = _get_national_number_groups(numobj, alternate_format)
                if checker(numobj, normalized_candidate, formatted_number_groups):
                    return True
        return False

    def has_next(self):
        """Indicates whether there is another match available"""
        if self._state == PhoneNumberMatcher._NOT_READY:
            self._last_match = self._find(self._search_index)
            if self._last_match is None:
                self._state = PhoneNumberMatcher._DONE
            else:
                self._search_index = self._last_match.end
                self._state = PhoneNumberMatcher._READY
        return (self._state == PhoneNumberMatcher._READY)

    def next(self):
        """Return the next match; raises StopIteration if no next match available"""
        # Check the state and find the next match as a side-effect if necessary.
        if not self.has_next():
            raise StopIteration("No next match")
        # Don't retain that memory any longer than necessary.
        result = self._last_match
        self._last_match = None
        self._state = PhoneNumberMatcher._NOT_READY
        return result

    def __iter__(self):
        """Yields each remaining match in turn until has_next() is False."""
        while self.has_next():
            yield self.next()
class PhoneNumberMatch(UnicodeMixin):
    """The immutable match of a phone number within a piece of text.

    Matches may be found by iterating over a PhoneNumberMatcher, or by using
    its has_next() and next() methods.

    A match consists of the phone number (in .number) as well as the .start
    and .end offsets of the corresponding subsequence of the searched
    text. Use .raw_string to obtain a copy of the matched subsequence.

    The following annotated example clarifies the relationship between the
    searched text, the match offsets, and the parsed number:

    >>> text = "Call me at +1 425 882-8080 for details."
    >>> country = "US"
    >>> import phonenumbers
    >>> matcher = phonenumbers.PhoneNumberMatcher(text, country)
    >>> matcher.has_next()
    True
    >>> m = matcher.next()  # Find the first phone number match
    >>> m.raw_string  # contains the phone number as it appears in the text.
    "+1 425 882-8080"
    >>> (m.start, m.end)  # define the range of the matched subsequence.
    (11, 26)
    >>> text[m.start:m.end]
    "+1 425 882-8080"
    >>> phonenumbers.phonenumberutil.parse("+1 425 882-8080", "US") == m.number
    True
    """
    def __init__(self, start, raw_string, numobj):
        """Creates a match.

        Arguments:
        start -- The start index of the match within the searched text;
                  must be >= 0.
        raw_string -- The matched substring; must not be None.
        numobj -- The parsed phone number object; must not be None.
        Raises Exception if any argument is invalid.
        """
        if start < 0:
            raise Exception("Start index not >= 0")
        if raw_string is None or numobj is None:
            raise Exception("Invalid argument")
        # The start index into the text.
        self.start = start
        # The raw substring matched.
        self.raw_string = raw_string
        # The exclusive end index, derived from start and the match length.
        self.end = self.start + len(raw_string)
        # The matched phone number.
        self.number = numobj

    def __eq__(self, other):
        if not isinstance(other, PhoneNumberMatch):
            return False
        return (self.start == other.start and
                self.raw_string == other.raw_string and
                self.end == other.end and
                self.number == other.number)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return (unicod("PhoneNumberMatch(start=%r, raw_string=%r, numobj=%r)") %
                (self.start,
                 self.raw_string,
                 self.number))

    def __unicode__(self):
        return unicod("PhoneNumberMatch [%s,%s) %s") % (self.start, self.end, self.raw_string)