Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/diff_match_patch/diff_match_patch.py: 4%
972 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1#!/usr/bin/python3
3"""Diff Match and Patch
4Copyright 2018 The diff-match-patch Authors.
5https://github.com/google/diff-match-patch
7Licensed under the Apache License, Version 2.0 (the "License");
8you may not use this file except in compliance with the License.
9You may obtain a copy of the License at
11 http://www.apache.org/licenses/LICENSE-2.0
13Unless required by applicable law or agreed to in writing, software
14distributed under the License is distributed on an "AS IS" BASIS,
15WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16See the License for the specific language governing permissions and
17limitations under the License.
18"""
20"""Functions for diff, match and patch.
22Computes the difference between two texts to create a patch.
23Applies the patch onto another text, allowing for errors.
24"""
26__author__ = "fraser@google.com (Neil Fraser)"
28import re
29import sys
30import time
31import urllib.parse
34class diff_match_patch:
35 """Class containing the diff, match and patch methods.
37 Also contains the behaviour settings.
38 """
40 def __init__(self):
41 """Inits a diff_match_patch object with default settings.
42 Redefine these in your program to override the defaults.
43 """
45 # Number of seconds to map a diff before giving up (0 for infinity).
46 self.Diff_Timeout = 1.0
47 # Cost of an empty edit operation in terms of edit characters.
48 self.Diff_EditCost = 4
49 # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
50 self.Match_Threshold = 0.5
51 # How far to search for a match (0 = exact location, 1000+ = broad match).
52 # A match this many characters away from the expected location will add
53 # 1.0 to the score (0.0 is a perfect match).
54 self.Match_Distance = 1000
55 # When deleting a large block of text (over ~64 characters), how close do
56 # the contents have to be to match the expected contents. (0.0 = perfection,
57 # 1.0 = very loose). Note that Match_Threshold controls how closely the
58 # end points of a delete need to match.
59 self.Patch_DeleteThreshold = 0.5
60 # Chunk size for context length.
61 self.Patch_Margin = 4
63 # The number of bits in an int.
64 # Python has no maximum, thus to disable patch splitting set to 0.
65 # However to avoid long patches in certain pathological cases, use 32.
66 # Multiple short patches (using native ints) are much faster than long ones.
67 self.Match_MaxBits = 32
69 # DIFF FUNCTIONS
71 # The data structure representing a diff is an array of tuples:
72 # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
73 # which means: delete "Hello", add "Goodbye" and keep " world."
74 DIFF_DELETE = -1
75 DIFF_INSERT = 1
76 DIFF_EQUAL = 0
78 def diff_main(self, text1, text2, checklines=True, deadline=None):
79 """Find the differences between two texts. Simplifies the problem by
80 stripping any common prefix or suffix off the texts before diffing.
82 Args:
83 text1: Old string to be diffed.
84 text2: New string to be diffed.
85 checklines: Optional speedup flag. If present and false, then don't run
86 a line-level diff first to identify the changed areas.
87 Defaults to true, which does a faster, slightly less optimal diff.
88 deadline: Optional time when the diff should be complete by. Used
89 internally for recursive calls. Users should set DiffTimeout instead.
91 Returns:
92 Array of changes.
93 """
94 # Set a deadline by which time the diff must be complete.
95 if deadline == None:
96 # Unlike in most languages, Python counts time in seconds.
97 if self.Diff_Timeout <= 0:
98 deadline = sys.maxsize
99 else:
100 deadline = time.time() + self.Diff_Timeout
102 # Check for null inputs.
103 if text1 == None or text2 == None:
104 raise ValueError("Null inputs. (diff_main)")
106 # Check for equality (speedup).
107 if text1 == text2:
108 if text1:
109 return [(self.DIFF_EQUAL, text1)]
110 return []
112 # Trim off common prefix (speedup).
113 commonlength = self.diff_commonPrefix(text1, text2)
114 commonprefix = text1[:commonlength]
115 text1 = text1[commonlength:]
116 text2 = text2[commonlength:]
118 # Trim off common suffix (speedup).
119 commonlength = self.diff_commonSuffix(text1, text2)
120 if commonlength == 0:
121 commonsuffix = ""
122 else:
123 commonsuffix = text1[-commonlength:]
124 text1 = text1[:-commonlength]
125 text2 = text2[:-commonlength]
127 # Compute the diff on the middle block.
128 diffs = self.diff_compute(text1, text2, checklines, deadline)
130 # Restore the prefix and suffix.
131 if commonprefix:
132 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
133 if commonsuffix:
134 diffs.append((self.DIFF_EQUAL, commonsuffix))
135 self.diff_cleanupMerge(diffs)
136 return diffs
138 def diff_compute(self, text1, text2, checklines, deadline):
139 """Find the differences between two texts. Assumes that the texts do not
140 have any common prefix or suffix.
142 Args:
143 text1: Old string to be diffed.
144 text2: New string to be diffed.
145 checklines: Speedup flag. If false, then don't run a line-level diff
146 first to identify the changed areas.
147 If true, then run a faster, slightly less optimal diff.
148 deadline: Time when the diff should be complete by.
150 Returns:
151 Array of changes.
152 """
153 if not text1:
154 # Just add some text (speedup).
155 return [(self.DIFF_INSERT, text2)]
157 if not text2:
158 # Just delete some text (speedup).
159 return [(self.DIFF_DELETE, text1)]
161 if len(text1) > len(text2):
162 (longtext, shorttext) = (text1, text2)
163 else:
164 (shorttext, longtext) = (text1, text2)
165 i = longtext.find(shorttext)
166 if i != -1:
167 # Shorter text is inside the longer text (speedup).
168 diffs = [
169 (self.DIFF_INSERT, longtext[:i]),
170 (self.DIFF_EQUAL, shorttext),
171 (self.DIFF_INSERT, longtext[i + len(shorttext) :]),
172 ]
173 # Swap insertions for deletions if diff is reversed.
174 if len(text1) > len(text2):
175 diffs[0] = (self.DIFF_DELETE, diffs[0][1])
176 diffs[2] = (self.DIFF_DELETE, diffs[2][1])
177 return diffs
179 if len(shorttext) == 1:
180 # Single character string.
181 # After the previous speedup, the character can't be an equality.
182 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
184 # Check to see if the problem can be split in two.
185 hm = self.diff_halfMatch(text1, text2)
186 if hm:
187 # A half-match was found, sort out the return data.
188 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
189 # Send both pairs off for separate processing.
190 diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)
191 diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)
192 # Merge the results.
193 return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
195 if checklines and len(text1) > 100 and len(text2) > 100:
196 return self.diff_lineMode(text1, text2, deadline)
198 return self.diff_bisect(text1, text2, deadline)
200 def diff_lineMode(self, text1, text2, deadline):
201 """Do a quick line-level diff on both strings, then rediff the parts for
202 greater accuracy.
203 This speedup can produce non-minimal diffs.
205 Args:
206 text1: Old string to be diffed.
207 text2: New string to be diffed.
208 deadline: Time when the diff should be complete by.
210 Returns:
211 Array of changes.
212 """
214 # Scan the text on a line-by-line basis first.
215 (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
217 diffs = self.diff_main(text1, text2, False, deadline)
219 # Convert the diff back to original text.
220 self.diff_charsToLines(diffs, linearray)
221 # Eliminate freak matches (e.g. blank lines)
222 self.diff_cleanupSemantic(diffs)
224 # Rediff any replacement blocks, this time character-by-character.
225 # Add a dummy entry at the end.
226 diffs.append((self.DIFF_EQUAL, ""))
227 pointer = 0
228 count_delete = 0
229 count_insert = 0
230 text_delete = ""
231 text_insert = ""
232 while pointer < len(diffs):
233 if diffs[pointer][0] == self.DIFF_INSERT:
234 count_insert += 1
235 text_insert += diffs[pointer][1]
236 elif diffs[pointer][0] == self.DIFF_DELETE:
237 count_delete += 1
238 text_delete += diffs[pointer][1]
239 elif diffs[pointer][0] == self.DIFF_EQUAL:
240 # Upon reaching an equality, check for prior redundancies.
241 if count_delete >= 1 and count_insert >= 1:
242 # Delete the offending records and add the merged ones.
243 subDiff = self.diff_main(text_delete, text_insert, False, deadline)
244 diffs[pointer - count_delete - count_insert : pointer] = subDiff
245 pointer = pointer - count_delete - count_insert + len(subDiff)
246 count_insert = 0
247 count_delete = 0
248 text_delete = ""
249 text_insert = ""
251 pointer += 1
253 diffs.pop() # Remove the dummy entry at the end.
255 return diffs
257 def diff_bisect(self, text1, text2, deadline):
258 """Find the 'middle snake' of a diff, split the problem in two
259 and return the recursively constructed diff.
260 See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
262 Args:
263 text1: Old string to be diffed.
264 text2: New string to be diffed.
265 deadline: Time at which to bail if not yet complete.
267 Returns:
268 Array of diff tuples.
269 """
271 # Cache the text lengths to prevent multiple calls.
272 text1_length = len(text1)
273 text2_length = len(text2)
274 max_d = (text1_length + text2_length + 1) // 2
275 v_offset = max_d
276 v_length = 2 * max_d
277 v1 = [-1] * v_length
278 v1[v_offset + 1] = 0
279 v2 = v1[:]
280 delta = text1_length - text2_length
281 # If the total number of characters is odd, then the front path will
282 # collide with the reverse path.
283 front = delta % 2 != 0
284 # Offsets for start and end of k loop.
285 # Prevents mapping of space beyond the grid.
286 k1start = 0
287 k1end = 0
288 k2start = 0
289 k2end = 0
290 for d in range(max_d):
291 # Bail out if deadline is reached.
292 if time.time() > deadline:
293 break
295 # Walk the front path one step.
296 for k1 in range(-d + k1start, d + 1 - k1end, 2):
297 k1_offset = v_offset + k1
298 if k1 == -d or (k1 != d and v1[k1_offset - 1] < v1[k1_offset + 1]):
299 x1 = v1[k1_offset + 1]
300 else:
301 x1 = v1[k1_offset - 1] + 1
302 y1 = x1 - k1
303 while (
304 x1 < text1_length and y1 < text2_length and text1[x1] == text2[y1]
305 ):
306 x1 += 1
307 y1 += 1
308 v1[k1_offset] = x1
309 if x1 > text1_length:
310 # Ran off the right of the graph.
311 k1end += 2
312 elif y1 > text2_length:
313 # Ran off the bottom of the graph.
314 k1start += 2
315 elif front:
316 k2_offset = v_offset + delta - k1
317 if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:
318 # Mirror x2 onto top-left coordinate system.
319 x2 = text1_length - v2[k2_offset]
320 if x1 >= x2:
321 # Overlap detected.
322 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
324 # Walk the reverse path one step.
325 for k2 in range(-d + k2start, d + 1 - k2end, 2):
326 k2_offset = v_offset + k2
327 if k2 == -d or (k2 != d and v2[k2_offset - 1] < v2[k2_offset + 1]):
328 x2 = v2[k2_offset + 1]
329 else:
330 x2 = v2[k2_offset - 1] + 1
331 y2 = x2 - k2
332 while (
333 x2 < text1_length
334 and y2 < text2_length
335 and text1[-x2 - 1] == text2[-y2 - 1]
336 ):
337 x2 += 1
338 y2 += 1
339 v2[k2_offset] = x2
340 if x2 > text1_length:
341 # Ran off the left of the graph.
342 k2end += 2
343 elif y2 > text2_length:
344 # Ran off the top of the graph.
345 k2start += 2
346 elif not front:
347 k1_offset = v_offset + delta - k2
348 if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:
349 x1 = v1[k1_offset]
350 y1 = v_offset + x1 - k1_offset
351 # Mirror x2 onto top-left coordinate system.
352 x2 = text1_length - x2
353 if x1 >= x2:
354 # Overlap detected.
355 return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
357 # Diff took too long and hit the deadline or
358 # number of diffs equals number of characters, no commonality at all.
359 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
361 def diff_bisectSplit(self, text1, text2, x, y, deadline):
362 """Given the location of the 'middle snake', split the diff in two parts
363 and recurse.
365 Args:
366 text1: Old string to be diffed.
367 text2: New string to be diffed.
368 x: Index of split point in text1.
369 y: Index of split point in text2.
370 deadline: Time at which to bail if not yet complete.
372 Returns:
373 Array of diff tuples.
374 """
375 text1a = text1[:x]
376 text2a = text2[:y]
377 text1b = text1[x:]
378 text2b = text2[y:]
380 # Compute both diffs serially.
381 diffs = self.diff_main(text1a, text2a, False, deadline)
382 diffsb = self.diff_main(text1b, text2b, False, deadline)
384 return diffs + diffsb
386 def diff_linesToChars(self, text1, text2):
387 """Split two texts into an array of strings. Reduce the texts to a string
388 of hashes where each Unicode character represents one line.
390 Args:
391 text1: First string.
392 text2: Second string.
394 Returns:
395 Three element tuple, containing the encoded text1, the encoded text2 and
396 the array of unique strings. The zeroth element of the array of unique
397 strings is intentionally blank.
398 """
399 lineArray = [] # e.g. lineArray[4] == "Hello\n"
400 lineHash = {} # e.g. lineHash["Hello\n"] == 4
402 # "\x00" is a valid character, but various debuggers don't like it.
403 # So we'll insert a junk entry to avoid generating a null character.
404 lineArray.append("")
406 def diff_linesToCharsMunge(text):
407 """Split a text into an array of strings. Reduce the texts to a string
408 of hashes where each Unicode character represents one line.
409 Modifies linearray and linehash through being a closure.
411 Args:
412 text: String to encode.
414 Returns:
415 Encoded string.
416 """
417 chars = []
418 # Walk the text, pulling out a substring for each line.
419 # text.split('\n') would would temporarily double our memory footprint.
420 # Modifying text would create many large strings to garbage collect.
421 lineStart = 0
422 lineEnd = -1
423 while lineEnd < len(text) - 1:
424 lineEnd = text.find("\n", lineStart)
425 if lineEnd == -1:
426 lineEnd = len(text) - 1
427 line = text[lineStart : lineEnd + 1]
429 if line in lineHash:
430 chars.append(chr(lineHash[line]))
431 else:
432 if len(lineArray) == maxLines:
433 # Bail out at 1114111 because chr(1114112) throws.
434 line = text[lineStart:]
435 lineEnd = len(text)
436 lineArray.append(line)
437 lineHash[line] = len(lineArray) - 1
438 chars.append(chr(len(lineArray) - 1))
439 lineStart = lineEnd + 1
440 return "".join(chars)
442 # Allocate 2/3rds of the space for text1, the rest for text2.
443 maxLines = 666666
444 chars1 = diff_linesToCharsMunge(text1)
445 maxLines = 1114111
446 chars2 = diff_linesToCharsMunge(text2)
447 return (chars1, chars2, lineArray)
449 def diff_charsToLines(self, diffs, lineArray):
450 """Rehydrate the text in a diff from a string of line hashes to real lines
451 of text.
453 Args:
454 diffs: Array of diff tuples.
455 lineArray: Array of unique strings.
456 """
457 for i in range(len(diffs)):
458 text = []
459 for char in diffs[i][1]:
460 text.append(lineArray[ord(char)])
461 diffs[i] = (diffs[i][0], "".join(text))
463 def diff_commonPrefix(self, text1, text2):
464 """Determine the common prefix of two strings.
466 Args:
467 text1: First string.
468 text2: Second string.
470 Returns:
471 The number of characters common to the start of each string.
472 """
473 # Quick check for common null cases.
474 if not text1 or not text2 or text1[0] != text2[0]:
475 return 0
476 # Binary search.
477 # Performance analysis: https://neil.fraser.name/news/2007/10/09/
478 pointermin = 0
479 pointermax = min(len(text1), len(text2))
480 pointermid = pointermax
481 pointerstart = 0
482 while pointermin < pointermid:
483 if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
484 pointermin = pointermid
485 pointerstart = pointermin
486 else:
487 pointermax = pointermid
488 pointermid = (pointermax - pointermin) // 2 + pointermin
489 return pointermid
491 def diff_commonSuffix(self, text1, text2):
492 """Determine the common suffix of two strings.
494 Args:
495 text1: First string.
496 text2: Second string.
498 Returns:
499 The number of characters common to the end of each string.
500 """
501 # Quick check for common null cases.
502 if not text1 or not text2 or text1[-1] != text2[-1]:
503 return 0
504 # Binary search.
505 # Performance analysis: https://neil.fraser.name/news/2007/10/09/
506 pointermin = 0
507 pointermax = min(len(text1), len(text2))
508 pointermid = pointermax
509 pointerend = 0
510 while pointermin < pointermid:
511 if (
512 text1[-pointermid : len(text1) - pointerend]
513 == text2[-pointermid : len(text2) - pointerend]
514 ):
515 pointermin = pointermid
516 pointerend = pointermin
517 else:
518 pointermax = pointermid
519 pointermid = (pointermax - pointermin) // 2 + pointermin
520 return pointermid
522 def diff_commonOverlap(self, text1, text2):
523 """Determine if the suffix of one string is the prefix of another.
525 Args:
526 text1 First string.
527 text2 Second string.
529 Returns:
530 The number of characters common to the end of the first
531 string and the start of the second string.
532 """
533 # Cache the text lengths to prevent multiple calls.
534 text1_length = len(text1)
535 text2_length = len(text2)
536 # Eliminate the null case.
537 if text1_length == 0 or text2_length == 0:
538 return 0
539 # Truncate the longer string.
540 if text1_length > text2_length:
541 text1 = text1[-text2_length:]
542 elif text1_length < text2_length:
543 text2 = text2[:text1_length]
544 text_length = min(text1_length, text2_length)
545 # Quick check for the worst case.
546 if text1 == text2:
547 return text_length
549 # Start by looking for a single character match
550 # and increase length until no match is found.
551 # Performance analysis: https://neil.fraser.name/news/2010/11/04/
552 best = 0
553 length = 1
554 while True:
555 pattern = text1[-length:]
556 found = text2.find(pattern)
557 if found == -1:
558 return best
559 length += found
560 if found == 0 or text1[-length:] == text2[:length]:
561 best = length
562 length += 1
564 def diff_halfMatch(self, text1, text2):
565 """Do the two texts share a substring which is at least half the length of
566 the longer text?
567 This speedup can produce non-minimal diffs.
569 Args:
570 text1: First string.
571 text2: Second string.
573 Returns:
574 Five element Array, containing the prefix of text1, the suffix of text1,
575 the prefix of text2, the suffix of text2 and the common middle. Or None
576 if there was no match.
577 """
578 if self.Diff_Timeout <= 0:
579 # Don't risk returning a non-optimal diff if we have unlimited time.
580 return None
581 if len(text1) > len(text2):
582 (longtext, shorttext) = (text1, text2)
583 else:
584 (shorttext, longtext) = (text1, text2)
585 if len(longtext) < 4 or len(shorttext) * 2 < len(longtext):
586 return None # Pointless.
588 def diff_halfMatchI(longtext, shorttext, i):
589 """Does a substring of shorttext exist within longtext such that the
590 substring is at least half the length of longtext?
591 Closure, but does not reference any external variables.
593 Args:
594 longtext: Longer string.
595 shorttext: Shorter string.
596 i: Start index of quarter length substring within longtext.
598 Returns:
599 Five element Array, containing the prefix of longtext, the suffix of
600 longtext, the prefix of shorttext, the suffix of shorttext and the
601 common middle. Or None if there was no match.
602 """
603 seed = longtext[i : i + len(longtext) // 4]
604 best_common = ""
605 j = shorttext.find(seed)
606 while j != -1:
607 prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
608 suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
609 if len(best_common) < suffixLength + prefixLength:
610 best_common = (
611 shorttext[j - suffixLength : j]
612 + shorttext[j : j + prefixLength]
613 )
614 best_longtext_a = longtext[: i - suffixLength]
615 best_longtext_b = longtext[i + prefixLength :]
616 best_shorttext_a = shorttext[: j - suffixLength]
617 best_shorttext_b = shorttext[j + prefixLength :]
618 j = shorttext.find(seed, j + 1)
620 if len(best_common) * 2 >= len(longtext):
621 return (
622 best_longtext_a,
623 best_longtext_b,
624 best_shorttext_a,
625 best_shorttext_b,
626 best_common,
627 )
628 else:
629 return None
631 # First check if the second quarter is the seed for a half-match.
632 hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4)
633 # Check again based on the third quarter.
634 hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2)
635 if not hm1 and not hm2:
636 return None
637 elif not hm2:
638 hm = hm1
639 elif not hm1:
640 hm = hm2
641 else:
642 # Both matched. Select the longest.
643 if len(hm1[4]) > len(hm2[4]):
644 hm = hm1
645 else:
646 hm = hm2
648 # A half-match was found, sort out the return data.
649 if len(text1) > len(text2):
650 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
651 else:
652 (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
653 return (text1_a, text1_b, text2_a, text2_b, mid_common)
655 def diff_cleanupSemantic(self, diffs):
656 """Reduce the number of edits by eliminating semantically trivial
657 equalities.
659 Args:
660 diffs: Array of diff tuples.
661 """
662 changes = False
663 equalities = [] # Stack of indices where equalities are found.
664 lastEquality = None # Always equal to diffs[equalities[-1]][1]
665 pointer = 0 # Index of current position.
666 # Number of chars that changed prior to the equality.
667 length_insertions1, length_deletions1 = 0, 0
668 # Number of chars that changed after the equality.
669 length_insertions2, length_deletions2 = 0, 0
670 while pointer < len(diffs):
671 if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
672 equalities.append(pointer)
673 length_insertions1, length_insertions2 = length_insertions2, 0
674 length_deletions1, length_deletions2 = length_deletions2, 0
675 lastEquality = diffs[pointer][1]
676 else: # An insertion or deletion.
677 if diffs[pointer][0] == self.DIFF_INSERT:
678 length_insertions2 += len(diffs[pointer][1])
679 else:
680 length_deletions2 += len(diffs[pointer][1])
681 # Eliminate an equality that is smaller or equal to the edits on both
682 # sides of it.
683 if (
684 lastEquality
685 and (
686 len(lastEquality) <= max(length_insertions1, length_deletions1)
687 )
688 and (
689 len(lastEquality) <= max(length_insertions2, length_deletions2)
690 )
691 ):
692 # Duplicate record.
693 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality))
694 # Change second copy to insert.
695 diffs[equalities[-1] + 1] = (
696 self.DIFF_INSERT,
697 diffs[equalities[-1] + 1][1],
698 )
699 # Throw away the equality we just deleted.
700 equalities.pop()
701 # Throw away the previous equality (it needs to be reevaluated).
702 if len(equalities):
703 equalities.pop()
704 if len(equalities):
705 pointer = equalities[-1]
706 else:
707 pointer = -1
708 # Reset the counters.
709 length_insertions1, length_deletions1 = 0, 0
710 length_insertions2, length_deletions2 = 0, 0
711 lastEquality = None
712 changes = True
713 pointer += 1
715 # Normalize the diff.
716 if changes:
717 self.diff_cleanupMerge(diffs)
718 self.diff_cleanupSemanticLossless(diffs)
720 # Find any overlaps between deletions and insertions.
721 # e.g: <del>abcxxx</del><ins>xxxdef</ins>
722 # -> <del>abc</del>xxx<ins>def</ins>
723 # e.g: <del>xxxabc</del><ins>defxxx</ins>
724 # -> <ins>def</ins>xxx<del>abc</del>
725 # Only extract an overlap if it is as big as the edit ahead or behind it.
726 pointer = 1
727 while pointer < len(diffs):
728 if (
729 diffs[pointer - 1][0] == self.DIFF_DELETE
730 and diffs[pointer][0] == self.DIFF_INSERT
731 ):
732 deletion = diffs[pointer - 1][1]
733 insertion = diffs[pointer][1]
734 overlap_length1 = self.diff_commonOverlap(deletion, insertion)
735 overlap_length2 = self.diff_commonOverlap(insertion, deletion)
736 if overlap_length1 >= overlap_length2:
737 if (
738 overlap_length1 >= len(deletion) / 2.0
739 or overlap_length1 >= len(insertion) / 2.0
740 ):
741 # Overlap found. Insert an equality and trim the surrounding edits.
742 diffs.insert(
743 pointer, (self.DIFF_EQUAL, insertion[:overlap_length1])
744 )
745 diffs[pointer - 1] = (
746 self.DIFF_DELETE,
747 deletion[: len(deletion) - overlap_length1],
748 )
749 diffs[pointer + 1] = (
750 self.DIFF_INSERT,
751 insertion[overlap_length1:],
752 )
753 pointer += 1
754 else:
755 if (
756 overlap_length2 >= len(deletion) / 2.0
757 or overlap_length2 >= len(insertion) / 2.0
758 ):
759 # Reverse overlap found.
760 # Insert an equality and swap and trim the surrounding edits.
761 diffs.insert(
762 pointer, (self.DIFF_EQUAL, deletion[:overlap_length2])
763 )
764 diffs[pointer - 1] = (
765 self.DIFF_INSERT,
766 insertion[: len(insertion) - overlap_length2],
767 )
768 diffs[pointer + 1] = (
769 self.DIFF_DELETE,
770 deletion[overlap_length2:],
771 )
772 pointer += 1
773 pointer += 1
774 pointer += 1
776 def diff_cleanupSemanticLossless(self, diffs):
777 """Look for single edits surrounded on both sides by equalities
778 which can be shifted sideways to align the edit to a word boundary.
779 e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
781 Args:
782 diffs: Array of diff tuples.
783 """
785 def diff_cleanupSemanticScore(one, two):
786 """Given two strings, compute a score representing whether the
787 internal boundary falls on logical boundaries.
788 Scores range from 6 (best) to 0 (worst).
789 Closure, but does not reference any external variables.
791 Args:
792 one: First string.
793 two: Second string.
795 Returns:
796 The score.
797 """
798 if not one or not two:
799 # Edges are the best.
800 return 6
802 # Each port of this function behaves slightly differently due to
803 # subtle differences in each language's definition of things like
804 # 'whitespace'. Since this function's purpose is largely cosmetic,
805 # the choice has been made to use each language's native features
806 # rather than force total conformity.
807 char1 = one[-1]
808 char2 = two[0]
809 nonAlphaNumeric1 = not char1.isalnum()
810 nonAlphaNumeric2 = not char2.isalnum()
811 whitespace1 = nonAlphaNumeric1 and char1.isspace()
812 whitespace2 = nonAlphaNumeric2 and char2.isspace()
813 lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n")
814 lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n")
815 blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one)
816 blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two)
818 if blankLine1 or blankLine2:
819 # Five points for blank lines.
820 return 5
821 elif lineBreak1 or lineBreak2:
822 # Four points for line breaks.
823 return 4
824 elif nonAlphaNumeric1 and not whitespace1 and whitespace2:
825 # Three points for end of sentences.
826 return 3
827 elif whitespace1 or whitespace2:
828 # Two points for whitespace.
829 return 2
830 elif nonAlphaNumeric1 or nonAlphaNumeric2:
831 # One point for non-alphanumeric.
832 return 1
833 return 0
835 pointer = 1
836 # Intentionally ignore the first and last element (don't need checking).
837 while pointer < len(diffs) - 1:
838 if (
839 diffs[pointer - 1][0] == self.DIFF_EQUAL
840 and diffs[pointer + 1][0] == self.DIFF_EQUAL
841 ):
842 # This is a single edit surrounded by equalities.
843 equality1 = diffs[pointer - 1][1]
844 edit = diffs[pointer][1]
845 equality2 = diffs[pointer + 1][1]
847 # First, shift the edit as far left as possible.
848 commonOffset = self.diff_commonSuffix(equality1, edit)
849 if commonOffset:
850 commonString = edit[-commonOffset:]
851 equality1 = equality1[:-commonOffset]
852 edit = commonString + edit[:-commonOffset]
853 equality2 = commonString + equality2
855 # Second, step character by character right, looking for the best fit.
856 bestEquality1 = equality1
857 bestEdit = edit
858 bestEquality2 = equality2
859 bestScore = diff_cleanupSemanticScore(
860 equality1, edit
861 ) + diff_cleanupSemanticScore(edit, equality2)
862 while edit and equality2 and edit[0] == equality2[0]:
863 equality1 += edit[0]
864 edit = edit[1:] + equality2[0]
865 equality2 = equality2[1:]
866 score = diff_cleanupSemanticScore(
867 equality1, edit
868 ) + diff_cleanupSemanticScore(edit, equality2)
869 # The >= encourages trailing rather than leading whitespace on edits.
870 if score >= bestScore:
871 bestScore = score
872 bestEquality1 = equality1
873 bestEdit = edit
874 bestEquality2 = equality2
876 if diffs[pointer - 1][1] != bestEquality1:
877 # We have an improvement, save it back to the diff.
878 if bestEquality1:
879 diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
880 else:
881 del diffs[pointer - 1]
882 pointer -= 1
883 diffs[pointer] = (diffs[pointer][0], bestEdit)
884 if bestEquality2:
885 diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
886 else:
887 del diffs[pointer + 1]
888 pointer -= 1
889 pointer += 1
891 # Define some regex patterns for matching boundaries.
892 BLANKLINEEND = re.compile(r"\n\r?\n$")
893 BLANKLINESTART = re.compile(r"^\r?\n\r?\n")
895 def diff_cleanupEfficiency(self, diffs):
896 """Reduce the number of edits by eliminating operationally trivial
897 equalities.
899 Args:
900 diffs: Array of diff tuples.
901 """
902 changes = False
903 equalities = [] # Stack of indices where equalities are found.
904 lastEquality = None # Always equal to diffs[equalities[-1]][1]
905 pointer = 0 # Index of current position.
906 pre_ins = False # Is there an insertion operation before the last equality.
907 pre_del = False # Is there a deletion operation before the last equality.
908 post_ins = False # Is there an insertion operation after the last equality.
909 post_del = False # Is there a deletion operation after the last equality.
910 while pointer < len(diffs):
911 if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
912 if len(diffs[pointer][1]) < self.Diff_EditCost and (
913 post_ins or post_del
914 ):
915 # Candidate found.
916 equalities.append(pointer)
917 pre_ins = post_ins
918 pre_del = post_del
919 lastEquality = diffs[pointer][1]
920 else:
921 # Not a candidate, and can never become one.
922 equalities = []
923 lastEquality = None
925 post_ins = post_del = False
926 else: # An insertion or deletion.
927 if diffs[pointer][0] == self.DIFF_DELETE:
928 post_del = True
929 else:
930 post_ins = True
932 # Five types to be split:
933 # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
934 # <ins>A</ins>X<ins>C</ins><del>D</del>
935 # <ins>A</ins><del>B</del>X<ins>C</ins>
936 # <ins>A</del>X<ins>C</ins><del>D</del>
937 # <ins>A</ins><del>B</del>X<del>C</del>
939 if lastEquality and (
940 (pre_ins and pre_del and post_ins and post_del)
941 or (
942 (len(lastEquality) < self.Diff_EditCost / 2)
943 and (pre_ins + pre_del + post_ins + post_del) == 3
944 )
945 ):
946 # Duplicate record.
947 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality))
948 # Change second copy to insert.
949 diffs[equalities[-1] + 1] = (
950 self.DIFF_INSERT,
951 diffs[equalities[-1] + 1][1],
952 )
953 equalities.pop() # Throw away the equality we just deleted.
954 lastEquality = None
955 if pre_ins and pre_del:
956 # No changes made which could affect previous entry, keep going.
957 post_ins = post_del = True
958 equalities = []
959 else:
960 if len(equalities):
961 equalities.pop() # Throw away the previous equality.
962 if len(equalities):
963 pointer = equalities[-1]
964 else:
965 pointer = -1
966 post_ins = post_del = False
967 changes = True
968 pointer += 1
970 if changes:
971 self.diff_cleanupMerge(diffs)
973 def diff_cleanupMerge(self, diffs):
974 """Reorder and merge like edit sections. Merge equalities.
975 Any edit section can move as long as it doesn't cross an equality.
977 Args:
978 diffs: Array of diff tuples.
979 """
980 diffs.append((self.DIFF_EQUAL, "")) # Add a dummy entry at the end.
981 pointer = 0
982 count_delete = 0
983 count_insert = 0
984 text_delete = ""
985 text_insert = ""
986 while pointer < len(diffs):
987 if diffs[pointer][0] == self.DIFF_INSERT:
988 count_insert += 1
989 text_insert += diffs[pointer][1]
990 pointer += 1
991 elif diffs[pointer][0] == self.DIFF_DELETE:
992 count_delete += 1
993 text_delete += diffs[pointer][1]
994 pointer += 1
995 elif diffs[pointer][0] == self.DIFF_EQUAL:
996 # Upon reaching an equality, check for prior redundancies.
997 if count_delete + count_insert > 1:
998 if count_delete != 0 and count_insert != 0:
999 # Factor out any common prefixies.
1000 commonlength = self.diff_commonPrefix(text_insert, text_delete)
1001 if commonlength != 0:
1002 x = pointer - count_delete - count_insert - 1
1003 if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
1004 diffs[x] = (
1005 diffs[x][0],
1006 diffs[x][1] + text_insert[:commonlength],
1007 )
1008 else:
1009 diffs.insert(
1010 0, (self.DIFF_EQUAL, text_insert[:commonlength])
1011 )
1012 pointer += 1
1013 text_insert = text_insert[commonlength:]
1014 text_delete = text_delete[commonlength:]
1015 # Factor out any common suffixies.
1016 commonlength = self.diff_commonSuffix(text_insert, text_delete)
1017 if commonlength != 0:
1018 diffs[pointer] = (
1019 diffs[pointer][0],
1020 text_insert[-commonlength:] + diffs[pointer][1],
1021 )
1022 text_insert = text_insert[:-commonlength]
1023 text_delete = text_delete[:-commonlength]
1024 # Delete the offending records and add the merged ones.
1025 new_ops = []
1026 if len(text_delete) != 0:
1027 new_ops.append((self.DIFF_DELETE, text_delete))
1028 if len(text_insert) != 0:
1029 new_ops.append((self.DIFF_INSERT, text_insert))
1030 pointer -= count_delete + count_insert
1031 diffs[pointer : pointer + count_delete + count_insert] = new_ops
1032 pointer += len(new_ops) + 1
1033 elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
1034 # Merge this equality with the previous one.
1035 diffs[pointer - 1] = (
1036 diffs[pointer - 1][0],
1037 diffs[pointer - 1][1] + diffs[pointer][1],
1038 )
1039 del diffs[pointer]
1040 else:
1041 pointer += 1
1043 count_insert = 0
1044 count_delete = 0
1045 text_delete = ""
1046 text_insert = ""
1048 if diffs[-1][1] == "":
1049 diffs.pop() # Remove the dummy entry at the end.
1051 # Second pass: look for single edits surrounded on both sides by equalities
1052 # which can be shifted sideways to eliminate an equality.
1053 # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
1054 changes = False
1055 pointer = 1
1056 # Intentionally ignore the first and last element (don't need checking).
1057 while pointer < len(diffs) - 1:
1058 if (
1059 diffs[pointer - 1][0] == self.DIFF_EQUAL
1060 and diffs[pointer + 1][0] == self.DIFF_EQUAL
1061 ):
1062 # This is a single edit surrounded by equalities.
1063 if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
1064 # Shift the edit over the previous equality.
1065 if diffs[pointer - 1][1] != "":
1066 diffs[pointer] = (
1067 diffs[pointer][0],
1068 diffs[pointer - 1][1]
1069 + diffs[pointer][1][: -len(diffs[pointer - 1][1])],
1070 )
1071 diffs[pointer + 1] = (
1072 diffs[pointer + 1][0],
1073 diffs[pointer - 1][1] + diffs[pointer + 1][1],
1074 )
1075 del diffs[pointer - 1]
1076 changes = True
1077 elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
1078 # Shift the edit over the next equality.
1079 diffs[pointer - 1] = (
1080 diffs[pointer - 1][0],
1081 diffs[pointer - 1][1] + diffs[pointer + 1][1],
1082 )
1083 diffs[pointer] = (
1084 diffs[pointer][0],
1085 diffs[pointer][1][len(diffs[pointer + 1][1]) :]
1086 + diffs[pointer + 1][1],
1087 )
1088 del diffs[pointer + 1]
1089 changes = True
1090 pointer += 1
1092 # If shifts were made, the diff needs reordering and another shift sweep.
1093 if changes:
1094 self.diff_cleanupMerge(diffs)
1096 def diff_xIndex(self, diffs, loc):
1097 """loc is a location in text1, compute and return the equivalent location
1098 in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
1100 Args:
1101 diffs: Array of diff tuples.
1102 loc: Location within text1.
1104 Returns:
1105 Location within text2.
1106 """
1107 chars1 = 0
1108 chars2 = 0
1109 last_chars1 = 0
1110 last_chars2 = 0
1111 for x in range(len(diffs)):
1112 (op, text) = diffs[x]
1113 if op != self.DIFF_INSERT: # Equality or deletion.
1114 chars1 += len(text)
1115 if op != self.DIFF_DELETE: # Equality or insertion.
1116 chars2 += len(text)
1117 if chars1 > loc: # Overshot the location.
1118 break
1119 last_chars1 = chars1
1120 last_chars2 = chars2
1122 if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
1123 # The location was deleted.
1124 return last_chars2
1125 # Add the remaining len(character).
1126 return last_chars2 + (loc - last_chars1)
1128 def diff_prettyHtml(self, diffs):
1129 """Convert a diff array into a pretty HTML report.
1131 Args:
1132 diffs: Array of diff tuples.
1134 Returns:
1135 HTML representation.
1136 """
1137 html = []
1138 for (op, data) in diffs:
1139 text = (
1140 data.replace("&", "&")
1141 .replace("<", "<")
1142 .replace(">", ">")
1143 .replace("\n", "¶<br>")
1144 )
1145 if op == self.DIFF_INSERT:
1146 html.append('<ins style="background:#e6ffe6;">%s</ins>' % text)
1147 elif op == self.DIFF_DELETE:
1148 html.append('<del style="background:#ffe6e6;">%s</del>' % text)
1149 elif op == self.DIFF_EQUAL:
1150 html.append("<span>%s</span>" % text)
1151 return "".join(html)
1153 def diff_text1(self, diffs):
1154 """Compute and return the source text (all equalities and deletions).
1156 Args:
1157 diffs: Array of diff tuples.
1159 Returns:
1160 Source text.
1161 """
1162 text = []
1163 for (op, data) in diffs:
1164 if op != self.DIFF_INSERT:
1165 text.append(data)
1166 return "".join(text)
1168 def diff_text2(self, diffs):
1169 """Compute and return the destination text (all equalities and insertions).
1171 Args:
1172 diffs: Array of diff tuples.
1174 Returns:
1175 Destination text.
1176 """
1177 text = []
1178 for (op, data) in diffs:
1179 if op != self.DIFF_DELETE:
1180 text.append(data)
1181 return "".join(text)
1183 def diff_levenshtein(self, diffs):
1184 """Compute the Levenshtein distance; the number of inserted, deleted or
1185 substituted characters.
1187 Args:
1188 diffs: Array of diff tuples.
1190 Returns:
1191 Number of changes.
1192 """
1193 levenshtein = 0
1194 insertions = 0
1195 deletions = 0
1196 for (op, data) in diffs:
1197 if op == self.DIFF_INSERT:
1198 insertions += len(data)
1199 elif op == self.DIFF_DELETE:
1200 deletions += len(data)
1201 elif op == self.DIFF_EQUAL:
1202 # A deletion and an insertion is one substitution.
1203 levenshtein += max(insertions, deletions)
1204 insertions = 0
1205 deletions = 0
1206 levenshtein += max(insertions, deletions)
1207 return levenshtein
1209 def diff_toDelta(self, diffs):
1210 """Crush the diff into an encoded string which describes the operations
1211 required to transform text1 into text2.
1212 E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1213 Operations are tab-separated. Inserted text is escaped using %xx notation.
1215 Args:
1216 diffs: Array of diff tuples.
1218 Returns:
1219 Delta text.
1220 """
1221 text = []
1222 for (op, data) in diffs:
1223 if op == self.DIFF_INSERT:
1224 # High ascii will raise UnicodeDecodeError. Use Unicode instead.
1225 data = data.encode("utf-8")
1226 text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
1227 elif op == self.DIFF_DELETE:
1228 text.append("-%d" % len(data))
1229 elif op == self.DIFF_EQUAL:
1230 text.append("=%d" % len(data))
1231 return "\t".join(text)
1233 def diff_fromDelta(self, text1, delta):
1234 """Given the original text1, and an encoded string which describes the
1235 operations required to transform text1 into text2, compute the full diff.
1237 Args:
1238 text1: Source string for the diff.
1239 delta: Delta text.
1241 Returns:
1242 Array of diff tuples.
1244 Raises:
1245 ValueError: If invalid input.
1246 """
1247 diffs = []
1248 pointer = 0 # Cursor in text1
1249 tokens = delta.split("\t")
1250 for token in tokens:
1251 if token == "":
1252 # Blank tokens are ok (from a trailing \t).
1253 continue
1254 # Each token begins with a one character parameter which specifies the
1255 # operation of this token (delete, insert, equality).
1256 param = token[1:]
1257 if token[0] == "+":
1258 param = urllib.parse.unquote(param)
1259 diffs.append((self.DIFF_INSERT, param))
1260 elif token[0] == "-" or token[0] == "=":
1261 try:
1262 n = int(param)
1263 except ValueError:
1264 raise ValueError("Invalid number in diff_fromDelta: " + param)
1265 if n < 0:
1266 raise ValueError("Negative number in diff_fromDelta: " + param)
1267 text = text1[pointer : pointer + n]
1268 pointer += n
1269 if token[0] == "=":
1270 diffs.append((self.DIFF_EQUAL, text))
1271 else:
1272 diffs.append((self.DIFF_DELETE, text))
1273 else:
1274 # Anything else is an error.
1275 raise ValueError(
1276 "Invalid diff operation in diff_fromDelta: " + token[0]
1277 )
1278 if pointer != len(text1):
1279 raise ValueError(
1280 "Delta length (%d) does not equal source text length (%d)."
1281 % (pointer, len(text1))
1282 )
1283 return diffs
1285 # MATCH FUNCTIONS
1287 def match_main(self, text, pattern, loc):
1288 """Locate the best instance of 'pattern' in 'text' near 'loc'.
1290 Args:
1291 text: The text to search.
1292 pattern: The pattern to search for.
1293 loc: The location to search around.
1295 Returns:
1296 Best match index or -1.
1297 """
1298 # Check for null inputs.
1299 if text == None or pattern == None:
1300 raise ValueError("Null inputs. (match_main)")
1302 loc = max(0, min(loc, len(text)))
1303 if text == pattern:
1304 # Shortcut (potentially not guaranteed by the algorithm)
1305 return 0
1306 elif not text:
1307 # Nothing to match.
1308 return -1
1309 elif text[loc : loc + len(pattern)] == pattern:
1310 # Perfect match at the perfect spot! (Includes case of null pattern)
1311 return loc
1312 else:
1313 # Do a fuzzy compare.
1314 match = self.match_bitap(text, pattern, loc)
1315 return match
1317 def match_bitap(self, text, pattern, loc):
1318 """Locate the best instance of 'pattern' in 'text' near 'loc' using the
1319 Bitap algorithm.
1321 Args:
1322 text: The text to search.
1323 pattern: The pattern to search for.
1324 loc: The location to search around.
1326 Returns:
1327 Best match index or -1.
1328 """
1329 # Python doesn't have a maxint limit, so ignore this check.
1330 # if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1331 # raise ValueError("Pattern too long for this application.")
1333 # Initialise the alphabet.
1334 s = self.match_alphabet(pattern)
1336 def match_bitapScore(e, x):
1337 """Compute and return the score for a match with e errors and x location.
1338 Accesses loc and pattern through being a closure.
1340 Args:
1341 e: Number of errors in match.
1342 x: Location of match.
1344 Returns:
1345 Overall score for match (0.0 = good, 1.0 = bad).
1346 """
1347 accuracy = float(e) / len(pattern)
1348 proximity = abs(loc - x)
1349 if not self.Match_Distance:
1350 # Dodge divide by zero error.
1351 return proximity and 1.0 or accuracy
1352 return accuracy + (proximity / float(self.Match_Distance))
1354 # Highest score beyond which we give up.
1355 score_threshold = self.Match_Threshold
1356 # Is there a nearby exact match? (speedup)
1357 best_loc = text.find(pattern, loc)
1358 if best_loc != -1:
1359 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1360 # What about in the other direction? (speedup)
1361 best_loc = text.rfind(pattern, loc + len(pattern))
1362 if best_loc != -1:
1363 score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1365 # Initialise the bit arrays.
1366 matchmask = 1 << (len(pattern) - 1)
1367 best_loc = -1
1369 bin_max = len(pattern) + len(text)
1370 # Empty initialization added to appease pychecker.
1371 last_rd = None
1372 for d in range(len(pattern)):
1373 # Scan for the best match each iteration allows for one more error.
1374 # Run a binary search to determine how far from 'loc' we can stray at
1375 # this error level.
1376 bin_min = 0
1377 bin_mid = bin_max
1378 while bin_min < bin_mid:
1379 if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1380 bin_min = bin_mid
1381 else:
1382 bin_max = bin_mid
1383 bin_mid = (bin_max - bin_min) // 2 + bin_min
1385 # Use the result from this iteration as the maximum for the next.
1386 bin_max = bin_mid
1387 start = max(1, loc - bin_mid + 1)
1388 finish = min(loc + bin_mid, len(text)) + len(pattern)
1390 rd = [0] * (finish + 2)
1391 rd[finish + 1] = (1 << d) - 1
1392 for j in range(finish, start - 1, -1):
1393 if len(text) <= j - 1:
1394 # Out of range.
1395 charMatch = 0
1396 else:
1397 charMatch = s.get(text[j - 1], 0)
1398 if d == 0: # First pass: exact match.
1399 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1400 else: # Subsequent passes: fuzzy match.
1401 rd[j] = (
1402 (((rd[j + 1] << 1) | 1) & charMatch)
1403 | (((last_rd[j + 1] | last_rd[j]) << 1) | 1)
1404 | last_rd[j + 1]
1405 )
1406 if rd[j] & matchmask:
1407 score = match_bitapScore(d, j - 1)
1408 # This match will almost certainly be better than any existing match.
1409 # But check anyway.
1410 if score <= score_threshold:
1411 # Told you so.
1412 score_threshold = score
1413 best_loc = j - 1
1414 if best_loc > loc:
1415 # When passing loc, don't exceed our current distance from loc.
1416 start = max(1, 2 * loc - best_loc)
1417 else:
1418 # Already passed loc, downhill from here on in.
1419 break
1420 # No hope for a (better) match at greater error levels.
1421 if match_bitapScore(d + 1, loc) > score_threshold:
1422 break
1423 last_rd = rd
1424 return best_loc
1426 def match_alphabet(self, pattern):
1427 """Initialise the alphabet for the Bitap algorithm.
1429 Args:
1430 pattern: The text to encode.
1432 Returns:
1433 Hash of character locations.
1434 """
1435 s = {}
1436 for char in pattern:
1437 s[char] = 0
1438 for i in range(len(pattern)):
1439 s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1440 return s
1442 # PATCH FUNCTIONS
1444 def patch_addContext(self, patch, text):
1445 """Increase the context until it is unique,
1446 but don't let the pattern expand beyond Match_MaxBits.
1448 Args:
1449 patch: The patch to grow.
1450 text: Source text.
1451 """
1452 if len(text) == 0:
1453 return
1454 pattern = text[patch.start2 : patch.start2 + patch.length1]
1455 padding = 0
1457 # Look for the first and last matches of pattern in text. If two different
1458 # matches are found, increase the pattern length.
1459 while text.find(pattern) != text.rfind(pattern) and (
1460 self.Match_MaxBits == 0
1461 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - self.Patch_Margin
1462 ):
1463 padding += self.Patch_Margin
1464 pattern = text[
1465 max(0, patch.start2 - padding) : patch.start2 + patch.length1 + padding
1466 ]
1467 # Add one chunk for good luck.
1468 padding += self.Patch_Margin
1470 # Add the prefix.
1471 prefix = text[max(0, patch.start2 - padding) : patch.start2]
1472 if prefix:
1473 patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1474 # Add the suffix.
1475 suffix = text[
1476 patch.start2 + patch.length1 : patch.start2 + patch.length1 + padding
1477 ]
1478 if suffix:
1479 patch.diffs.append((self.DIFF_EQUAL, suffix))
1481 # Roll back the start points.
1482 patch.start1 -= len(prefix)
1483 patch.start2 -= len(prefix)
1484 # Extend lengths.
1485 patch.length1 += len(prefix) + len(suffix)
1486 patch.length2 += len(prefix) + len(suffix)
1488 def patch_make(self, a, b=None, c=None):
1489 """Compute a list of patches to turn text1 into text2.
1490 Use diffs if provided, otherwise compute it ourselves.
1491 There are four ways to call this function, depending on what data is
1492 available to the caller:
1493 Method 1:
1494 a = text1, b = text2
1495 Method 2:
1496 a = diffs
1497 Method 3 (optimal):
1498 a = text1, b = diffs
1499 Method 4 (deprecated, use method 3):
1500 a = text1, b = text2, c = diffs
1502 Args:
1503 a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1504 text2 (method 2).
1505 b: text2 (methods 1,4) or Array of diff tuples for text1 to
1506 text2 (method 3) or undefined (method 2).
1507 c: Array of diff tuples for text1 to text2 (method 4) or
1508 undefined (methods 1,2,3).
1510 Returns:
1511 Array of Patch objects.
1512 """
1513 text1 = None
1514 diffs = None
1515 if isinstance(a, str) and isinstance(b, str) and c is None:
1516 # Method 1: text1, text2
1517 # Compute diffs from text1 and text2.
1518 text1 = a
1519 diffs = self.diff_main(text1, b, True)
1520 if len(diffs) > 2:
1521 self.diff_cleanupSemantic(diffs)
1522 self.diff_cleanupEfficiency(diffs)
1523 elif isinstance(a, list) and b is None and c is None:
1524 # Method 2: diffs
1525 # Compute text1 from diffs.
1526 diffs = a
1527 text1 = self.diff_text1(diffs)
1528 elif isinstance(a, str) and isinstance(b, list) and c is None:
1529 # Method 3: text1, diffs
1530 text1 = a
1531 diffs = b
1532 elif isinstance(a, str) and isinstance(b, str) and isinstance(c, list):
1533 # Method 4: text1, text2, diffs
1534 # text2 is not used.
1535 text1 = a
1536 diffs = c
1537 else:
1538 raise ValueError("Unknown call format to patch_make.")
1540 if not diffs:
1541 return [] # Get rid of the None case.
1542 patches = []
1543 patch = patch_obj()
1544 char_count1 = 0 # Number of characters into the text1 string.
1545 char_count2 = 0 # Number of characters into the text2 string.
1546 prepatch_text = text1 # Recreate the patches to determine context info.
1547 postpatch_text = text1
1548 for x in range(len(diffs)):
1549 (diff_type, diff_text) = diffs[x]
1550 if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1551 # A new patch starts here.
1552 patch.start1 = char_count1
1553 patch.start2 = char_count2
1554 if diff_type == self.DIFF_INSERT:
1555 # Insertion
1556 patch.diffs.append(diffs[x])
1557 patch.length2 += len(diff_text)
1558 postpatch_text = (
1559 postpatch_text[:char_count2]
1560 + diff_text
1561 + postpatch_text[char_count2:]
1562 )
1563 elif diff_type == self.DIFF_DELETE:
1564 # Deletion.
1565 patch.length1 += len(diff_text)
1566 patch.diffs.append(diffs[x])
1567 postpatch_text = (
1568 postpatch_text[:char_count2]
1569 + postpatch_text[char_count2 + len(diff_text) :]
1570 )
1571 elif (
1572 diff_type == self.DIFF_EQUAL
1573 and len(diff_text) <= 2 * self.Patch_Margin
1574 and len(patch.diffs) != 0
1575 and len(diffs) != x + 1
1576 ):
1577 # Small equality inside a patch.
1578 patch.diffs.append(diffs[x])
1579 patch.length1 += len(diff_text)
1580 patch.length2 += len(diff_text)
1582 if diff_type == self.DIFF_EQUAL and len(diff_text) >= 2 * self.Patch_Margin:
1583 # Time for a new patch.
1584 if len(patch.diffs) != 0:
1585 self.patch_addContext(patch, prepatch_text)
1586 patches.append(patch)
1587 patch = patch_obj()
1588 # Unlike Unidiff, our patch lists have a rolling context.
1589 # https://github.com/google/diff-match-patch/wiki/Unidiff
1590 # Update prepatch text & pos to reflect the application of the
1591 # just completed patch.
1592 prepatch_text = postpatch_text
1593 char_count1 = char_count2
1595 # Update the current character count.
1596 if diff_type != self.DIFF_INSERT:
1597 char_count1 += len(diff_text)
1598 if diff_type != self.DIFF_DELETE:
1599 char_count2 += len(diff_text)
1601 # Pick up the leftover patch if not empty.
1602 if len(patch.diffs) != 0:
1603 self.patch_addContext(patch, prepatch_text)
1604 patches.append(patch)
1605 return patches
1607 def patch_deepCopy(self, patches):
1608 """Given an array of patches, return another array that is identical.
1610 Args:
1611 patches: Array of Patch objects.
1613 Returns:
1614 Array of Patch objects.
1615 """
1616 patchesCopy = []
1617 for patch in patches:
1618 patchCopy = patch_obj()
1619 # No need to deep copy the tuples since they are immutable.
1620 patchCopy.diffs = patch.diffs[:]
1621 patchCopy.start1 = patch.start1
1622 patchCopy.start2 = patch.start2
1623 patchCopy.length1 = patch.length1
1624 patchCopy.length2 = patch.length2
1625 patchesCopy.append(patchCopy)
1626 return patchesCopy
1628 def patch_apply(self, patches, text):
1629 """Merge a set of patches onto the text. Return a patched text, as well
1630 as a list of true/false values indicating which patches were applied.
1632 Args:
1633 patches: Array of Patch objects.
1634 text: Old text.
1636 Returns:
1637 Two element Array, containing the new text and an array of boolean values.
1638 """
1639 if not patches:
1640 return (text, [])
1642 # Deep copy the patches so that no changes are made to originals.
1643 patches = self.patch_deepCopy(patches)
1645 nullPadding = self.patch_addPadding(patches)
1646 text = nullPadding + text + nullPadding
1647 self.patch_splitMax(patches)
1649 # delta keeps track of the offset between the expected and actual location
1650 # of the previous patch. If there are patches expected at positions 10 and
1651 # 20, but the first patch was found at 12, delta is 2 and the second patch
1652 # has an effective expected position of 22.
1653 delta = 0
1654 results = []
1655 for patch in patches:
1656 expected_loc = patch.start2 + delta
1657 text1 = self.diff_text1(patch.diffs)
1658 end_loc = -1
1659 if len(text1) > self.Match_MaxBits:
1660 # patch_splitMax will only provide an oversized pattern in the case of
1661 # a monster delete.
1662 start_loc = self.match_main(
1663 text, text1[: self.Match_MaxBits], expected_loc
1664 )
1665 if start_loc != -1:
1666 end_loc = self.match_main(
1667 text,
1668 text1[-self.Match_MaxBits :],
1669 expected_loc + len(text1) - self.Match_MaxBits,
1670 )
1671 if end_loc == -1 or start_loc >= end_loc:
1672 # Can't find valid trailing context. Drop this patch.
1673 start_loc = -1
1674 else:
1675 start_loc = self.match_main(text, text1, expected_loc)
1676 if start_loc == -1:
1677 # No match found. :(
1678 results.append(False)
1679 # Subtract the delta for this failed patch from subsequent patches.
1680 delta -= patch.length2 - patch.length1
1681 else:
1682 # Found a match. :)
1683 results.append(True)
1684 delta = start_loc - expected_loc
1685 if end_loc == -1:
1686 text2 = text[start_loc : start_loc + len(text1)]
1687 else:
1688 text2 = text[start_loc : end_loc + self.Match_MaxBits]
1689 if text1 == text2:
1690 # Perfect match, just shove the replacement text in.
1691 text = (
1692 text[:start_loc]
1693 + self.diff_text2(patch.diffs)
1694 + text[start_loc + len(text1) :]
1695 )
1696 else:
1697 # Imperfect match.
1698 # Run a diff to get a framework of equivalent indices.
1699 diffs = self.diff_main(text1, text2, False)
1700 if (
1701 len(text1) > self.Match_MaxBits
1702 and self.diff_levenshtein(diffs) / float(len(text1))
1703 > self.Patch_DeleteThreshold
1704 ):
1705 # The end points match, but the content is unacceptably bad.
1706 results[-1] = False
1707 else:
1708 self.diff_cleanupSemanticLossless(diffs)
1709 index1 = 0
1710 for (op, data) in patch.diffs:
1711 if op != self.DIFF_EQUAL:
1712 index2 = self.diff_xIndex(diffs, index1)
1713 if op == self.DIFF_INSERT: # Insertion
1714 text = (
1715 text[: start_loc + index2]
1716 + data
1717 + text[start_loc + index2 :]
1718 )
1719 elif op == self.DIFF_DELETE: # Deletion
1720 text = (
1721 text[: start_loc + index2]
1722 + text[
1723 start_loc
1724 + self.diff_xIndex(diffs, index1 + len(data)) :
1725 ]
1726 )
1727 if op != self.DIFF_DELETE:
1728 index1 += len(data)
1729 # Strip the padding off.
1730 text = text[len(nullPadding) : -len(nullPadding)]
1731 return (text, results)
1733 def patch_addPadding(self, patches):
1734 """Add some padding on text start and end so that edges can match
1735 something. Intended to be called only from within patch_apply.
1737 Args:
1738 patches: Array of Patch objects.
1740 Returns:
1741 The padding string added to each side.
1742 """
1743 paddingLength = self.Patch_Margin
1744 nullPadding = ""
1745 for x in range(1, paddingLength + 1):
1746 nullPadding += chr(x)
1748 # Bump all the patches forward.
1749 for patch in patches:
1750 patch.start1 += paddingLength
1751 patch.start2 += paddingLength
1753 # Add some padding on start of first diff.
1754 patch = patches[0]
1755 diffs = patch.diffs
1756 if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1757 # Add nullPadding equality.
1758 diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1759 patch.start1 -= paddingLength # Should be 0.
1760 patch.start2 -= paddingLength # Should be 0.
1761 patch.length1 += paddingLength
1762 patch.length2 += paddingLength
1763 elif paddingLength > len(diffs[0][1]):
1764 # Grow first equality.
1765 extraLength = paddingLength - len(diffs[0][1])
1766 newText = nullPadding[len(diffs[0][1]) :] + diffs[0][1]
1767 diffs[0] = (diffs[0][0], newText)
1768 patch.start1 -= extraLength
1769 patch.start2 -= extraLength
1770 patch.length1 += extraLength
1771 patch.length2 += extraLength
1773 # Add some padding on end of last diff.
1774 patch = patches[-1]
1775 diffs = patch.diffs
1776 if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1777 # Add nullPadding equality.
1778 diffs.append((self.DIFF_EQUAL, nullPadding))
1779 patch.length1 += paddingLength
1780 patch.length2 += paddingLength
1781 elif paddingLength > len(diffs[-1][1]):
1782 # Grow last equality.
1783 extraLength = paddingLength - len(diffs[-1][1])
1784 newText = diffs[-1][1] + nullPadding[:extraLength]
1785 diffs[-1] = (diffs[-1][0], newText)
1786 patch.length1 += extraLength
1787 patch.length2 += extraLength
1789 return nullPadding
1791 def patch_splitMax(self, patches):
1792 """Look through the patches and break up any which are longer than the
1793 maximum limit of the match algorithm.
1794 Intended to be called only from within patch_apply.
1796 Args:
1797 patches: Array of Patch objects.
1798 """
1799 patch_size = self.Match_MaxBits
1800 if patch_size == 0:
1801 # Python has the option of not splitting strings due to its ability
1802 # to handle integers of arbitrary precision.
1803 return
1804 for x in range(len(patches)):
1805 if patches[x].length1 <= patch_size:
1806 continue
1807 bigpatch = patches[x]
1808 # Remove the big old patch.
1809 del patches[x]
1810 x -= 1
1811 start1 = bigpatch.start1
1812 start2 = bigpatch.start2
1813 precontext = ""
1814 while len(bigpatch.diffs) != 0:
1815 # Create one of several smaller patches.
1816 patch = patch_obj()
1817 empty = True
1818 patch.start1 = start1 - len(precontext)
1819 patch.start2 = start2 - len(precontext)
1820 if precontext:
1821 patch.length1 = patch.length2 = len(precontext)
1822 patch.diffs.append((self.DIFF_EQUAL, precontext))
1824 while (
1825 len(bigpatch.diffs) != 0
1826 and patch.length1 < patch_size - self.Patch_Margin
1827 ):
1828 (diff_type, diff_text) = bigpatch.diffs[0]
1829 if diff_type == self.DIFF_INSERT:
1830 # Insertions are harmless.
1831 patch.length2 += len(diff_text)
1832 start2 += len(diff_text)
1833 patch.diffs.append(bigpatch.diffs.pop(0))
1834 empty = False
1835 elif (
1836 diff_type == self.DIFF_DELETE
1837 and len(patch.diffs) == 1
1838 and patch.diffs[0][0] == self.DIFF_EQUAL
1839 and len(diff_text) > 2 * patch_size
1840 ):
1841 # This is a large deletion. Let it pass in one chunk.
1842 patch.length1 += len(diff_text)
1843 start1 += len(diff_text)
1844 empty = False
1845 patch.diffs.append((diff_type, diff_text))
1846 del bigpatch.diffs[0]
1847 else:
1848 # Deletion or equality. Only take as much as we can stomach.
1849 diff_text = diff_text[
1850 : patch_size - patch.length1 - self.Patch_Margin
1851 ]
1852 patch.length1 += len(diff_text)
1853 start1 += len(diff_text)
1854 if diff_type == self.DIFF_EQUAL:
1855 patch.length2 += len(diff_text)
1856 start2 += len(diff_text)
1857 else:
1858 empty = False
1860 patch.diffs.append((diff_type, diff_text))
1861 if diff_text == bigpatch.diffs[0][1]:
1862 del bigpatch.diffs[0]
1863 else:
1864 bigpatch.diffs[0] = (
1865 bigpatch.diffs[0][0],
1866 bigpatch.diffs[0][1][len(diff_text) :],
1867 )
1869 # Compute the head context for the next patch.
1870 precontext = self.diff_text2(patch.diffs)
1871 precontext = precontext[-self.Patch_Margin :]
1872 # Append the end context for this patch.
1873 postcontext = self.diff_text1(bigpatch.diffs)[: self.Patch_Margin]
1874 if postcontext:
1875 patch.length1 += len(postcontext)
1876 patch.length2 += len(postcontext)
1877 if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1878 patch.diffs[-1] = (
1879 self.DIFF_EQUAL,
1880 patch.diffs[-1][1] + postcontext,
1881 )
1882 else:
1883 patch.diffs.append((self.DIFF_EQUAL, postcontext))
1885 if not empty:
1886 x += 1
1887 patches.insert(x, patch)
1889 def patch_toText(self, patches):
1890 """Take a list of patches and return a textual representation.
1892 Args:
1893 patches: Array of Patch objects.
1895 Returns:
1896 Text representation of patches.
1897 """
1898 text = []
1899 for patch in patches:
1900 text.append(str(patch))
1901 return "".join(text)
1903 def patch_fromText(self, textline):
1904 """Parse a textual representation of patches and return a list of patch
1905 objects.
1907 Args:
1908 textline: Text representation of patches.
1910 Returns:
1911 Array of Patch objects.
1913 Raises:
1914 ValueError: If invalid input.
1915 """
1916 patches = []
1917 if not textline:
1918 return patches
1919 text = textline.split("\n")
1920 while len(text) != 0:
1921 m = re.match(r"^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1922 if not m:
1923 raise ValueError("Invalid patch string: " + text[0])
1924 patch = patch_obj()
1925 patches.append(patch)
1926 patch.start1 = int(m.group(1))
1927 if m.group(2) == "":
1928 patch.start1 -= 1
1929 patch.length1 = 1
1930 elif m.group(2) == "0":
1931 patch.length1 = 0
1932 else:
1933 patch.start1 -= 1
1934 patch.length1 = int(m.group(2))
1936 patch.start2 = int(m.group(3))
1937 if m.group(4) == "":
1938 patch.start2 -= 1
1939 patch.length2 = 1
1940 elif m.group(4) == "0":
1941 patch.length2 = 0
1942 else:
1943 patch.start2 -= 1
1944 patch.length2 = int(m.group(4))
1946 del text[0]
1948 while len(text) != 0:
1949 if text[0]:
1950 sign = text[0][0]
1951 else:
1952 sign = ""
1953 line = urllib.parse.unquote(text[0][1:])
1954 if sign == "+":
1955 # Insertion.
1956 patch.diffs.append((self.DIFF_INSERT, line))
1957 elif sign == "-":
1958 # Deletion.
1959 patch.diffs.append((self.DIFF_DELETE, line))
1960 elif sign == " ":
1961 # Minor equality.
1962 patch.diffs.append((self.DIFF_EQUAL, line))
1963 elif sign == "@":
1964 # Start of next patch.
1965 break
1966 elif sign == "":
1967 # Blank line? Whatever.
1968 pass
1969 else:
1970 # WTF?
1971 raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
1972 del text[0]
1973 return patches
1976class patch_obj:
1977 """Class representing one patch operation.
1978 """
1980 def __init__(self):
1981 """Initializes with an empty list of diffs.
1982 """
1983 self.diffs = []
1984 self.start1 = None
1985 self.start2 = None
1986 self.length1 = 0
1987 self.length2 = 0
1989 def __str__(self):
1990 """Emulate GNU diff's format.
1991 Header: @@ -382,8 +481,9 @@
1992 Indices are printed as 1-based, not 0-based.
1994 Returns:
1995 The GNU diff string.
1996 """
1997 if self.length1 == 0:
1998 coords1 = str(self.start1) + ",0"
1999 elif self.length1 == 1:
2000 coords1 = str(self.start1 + 1)
2001 else:
2002 coords1 = str(self.start1 + 1) + "," + str(self.length1)
2003 if self.length2 == 0:
2004 coords2 = str(self.start2) + ",0"
2005 elif self.length2 == 1:
2006 coords2 = str(self.start2 + 1)
2007 else:
2008 coords2 = str(self.start2 + 1) + "," + str(self.length2)
2009 text = ["@@ -", coords1, " +", coords2, " @@\n"]
2010 # Escape the body of the patch with %xx notation.
2011 for (op, data) in self.diffs:
2012 if op == diff_match_patch.DIFF_INSERT:
2013 text.append("+")
2014 elif op == diff_match_patch.DIFF_DELETE:
2015 text.append("-")
2016 elif op == diff_match_patch.DIFF_EQUAL:
2017 text.append(" ")
2018 # High ascii will raise UnicodeDecodeError. Use Unicode instead.
2019 data = data.encode("utf-8")
2020 text.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
2021 return "".join(text)