# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Contains basic c-functions which usually contain performance critical code.
Keeping this code separate from the beginning makes it easier to out-source
it into c later, if required"""

import zlib
from gitdb.util import byte_ord
decompressobj = zlib.decompressobj

import mmap
from itertools import islice
from functools import reduce

from gitdb.const import NULL_BYTE, BYTE_SPACE
from gitdb.utils.encoding import force_text
from gitdb.typ import (
    str_blob_type,
    str_commit_type,
    str_tree_type,
    str_tag_type,
)

# NOTE: the chunk buffers in this module hold bytes, so BytesIO is required here;
# StringIO would raise a TypeError when fed byte slices in compress() below.
from io import BytesIO

# INVARIANTS
OFS_DELTA = 6
REF_DELTA = 7
delta_types = (OFS_DELTA, REF_DELTA)

type_id_to_type_map = {
    0: b'',                  # EXT 1
    1: str_commit_type,
    2: str_tree_type,
    3: str_blob_type,
    4: str_tag_type,
    5: b'',                  # EXT 2
    OFS_DELTA: "OFS_DELTA",  # OFFSET DELTA
    REF_DELTA: "REF_DELTA"   # REFERENCE DELTA
}

type_to_type_id_map = {
    str_commit_type: 1,
    str_tree_type: 2,
    str_blob_type: 3,
    str_tag_type: 4,
    "OFS_DELTA": OFS_DELTA,
    "REF_DELTA": REF_DELTA,
}

# used when dealing with larger streams
chunk_size = 1000 * mmap.PAGESIZE

__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info',
           'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data',
           'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header')


#{ Structures

def _set_delta_rbound(d, size):
    """Truncate the given delta to the given size
    :param size: size relative to our target offset, may not be 0, must be smaller or equal
        to our size
    :return: d"""
    d.ts = size

    # NOTE: data is truncated automatically when applying the delta
    # MUST NOT DO THIS HERE
    return d


def _move_delta_lbound(d, bytes):
    """Move the delta by the given amount of bytes, reducing its size so that its
    right bound stays static
    :param bytes: amount of bytes to move, must be smaller than delta size
    :return: d"""
    if bytes == 0:
        return

    d.to += bytes
    d.so += bytes
    d.ts -= bytes
    if d.data is not None:
        d.data = d.data[bytes:]
    # END handle data

    return d


def delta_duplicate(src):
    return DeltaChunk(src.to, src.ts, src.so, src.data)


def delta_chunk_apply(dc, bbuf, write):
    """Apply own data to the target buffer
    :param bbuf: buffer providing source bytes for copy operations
    :param write: write method to call with data to write"""
    if dc.data is None:
        # COPY DATA FROM SOURCE
        write(bbuf[dc.so:dc.so + dc.ts])
    else:
        # APPEND DATA
        # what's faster: if + 4 function calls or just a write with a slice ?
        # Considering data can be larger than 127 bytes now, it should be worth it
        if dc.ts < len(dc.data):
            write(dc.data[:dc.ts])
        else:
            write(dc.data)
        # END handle truncation
    # END handle chunk mode


class DeltaChunk:

    """Represents a piece of a delta, it can either add new data, or copy existing
    one from a source buffer"""
    __slots__ = (
        'to',    # start offset in the target buffer in bytes
        'ts',    # size of this chunk in the target buffer in bytes
        'so',    # start offset in the source buffer in bytes or None
        'data',  # chunk of bytes to be added to the target buffer,
                 # DeltaChunkList to use as base, or None
    )

    def __init__(self, to, ts, so, data):
        self.to = to
        self.ts = ts
        self.so = so
        self.data = data

    def __repr__(self):
        return "DeltaChunk(%i, %i, %s, %s)" % (self.to, self.ts, self.so, self.data or "")

    #{ Interface

    def rbound(self):
        return self.to + self.ts

    def has_data(self):
        """:return: True if the instance has data to add to the target stream"""
        return self.data is not None

    #} END interface
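
# Illustrative sketch (not part of the library API): the two chunk flavours in
# action. A chunk with data=None copies ts bytes from the source buffer at
# offset so; a chunk with data appends (up to ts bytes of) that data.
#
# >>> out = []
# >>> delta_chunk_apply(DeltaChunk(0, 5, 6, None), b'hello world', out.append)
# >>> delta_chunk_apply(DeltaChunk(5, 1, 0, b'!'), b'hello world', out.append)
# >>> b''.join(out)
# b'world!'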


def _closest_index(dcl, absofs):
    """:return: index at which the given absofs should be inserted. The index points
    to the DeltaChunk with a target buffer absofs that equals or is greater than
    absofs.
    **Note:** global method for performance only, it belongs to DeltaChunkList"""
    lo = 0
    hi = len(dcl)
    while lo < hi:
        # integer division is required here: a float index would fail on python 3
        mid = (lo + hi) // 2
        dc = dcl[mid]
        if dc.to > absofs:
            hi = mid
        elif dc.rbound() > absofs or dc.to == absofs:
            return mid
        else:
            lo = mid + 1
        # END handle bound
    # END for each delta absofs
    return len(dcl) - 1


def delta_list_apply(dcl, bbuf, write):
    """Apply the chain's changes and write the final result using the passed
    write function.
    :param bbuf: base buffer containing the base of all deltas contained in this
        list. It will only be used if the chunk in question does not have a base
        chain.
    :param write: function taking a string of bytes to write to the output"""
    for dc in dcl:
        delta_chunk_apply(dc, bbuf, write)
    # END for each dc


def delta_list_slice(dcl, absofs, size, ndcl):
    """Append the subsection of dcl at the given absolute offset, with the given
    size in bytes, to ndcl.
    :return: None"""
    cdi = _closest_index(dcl, absofs)  # delta start index
    cd = dcl[cdi]
    slen = len(dcl)
    lappend = ndcl.append

    if cd.to != absofs:
        tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
        _move_delta_lbound(tcd, absofs - cd.to)
        tcd.ts = min(tcd.ts, size)
        lappend(tcd)
        size -= tcd.ts
        cdi += 1
    # END lbound overlap handling

    while cdi < slen and size:
        # are we larger than the current block
        cd = dcl[cdi]
        if cd.ts <= size:
            lappend(DeltaChunk(cd.to, cd.ts, cd.so, cd.data))
            size -= cd.ts
        else:
            tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
            tcd.ts = size
            lappend(tcd)
            size -= tcd.ts
            break
        # END handle size
        cdi += 1
    # END for each chunk


class DeltaChunkList(list):

    """List with special functionality to deal with DeltaChunks.
    There are two types of lists we represent. The one was created bottom-up, working
    towards the latest delta, the other kind was created top-down, working from the
    latest delta down to the earliest ancestor."""

    __slots__ = tuple()

    def rbound(self):
        """:return: rightmost extent in bytes, absolute"""
        if len(self) == 0:
            return 0
        return self[-1].rbound()

    def lbound(self):
        """:return: leftmost byte at which this chunklist starts"""
        if len(self) == 0:
            return 0
        return self[0].to

    def size(self):
        """:return: size of bytes as measured by our delta chunks"""
        return self.rbound() - self.lbound()

    def apply(self, bbuf, write):
        """Only used by public clients, internally we only use the global routines
        for performance"""
        return delta_list_apply(self, bbuf, write)

    def compress(self):
        """Alter the list to reduce the amount of nodes. Currently we concatenate
        add-chunks
        :return: self"""
        slen = len(self)
        if slen < 2:
            return self
        i = 0

        first_data_index = None
        while i < slen:
            dc = self[i]
            i += 1
            if dc.data is None:
                if first_data_index is not None and i - 2 - first_data_index > 1:
                    # if first_data_index is not None:
                    nd = BytesIO()                  # new data
                    so = self[first_data_index].to  # start offset in target buffer
                    for x in range(first_data_index, i - 1):
                        xdc = self[x]
                        nd.write(xdc.data[:xdc.ts])
                    # END collect data

                    del(self[first_data_index:i - 1])
                    buf = nd.getvalue()
                    self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf))

                    slen = len(self)
                    i = first_data_index + 1

                # END concatenate data
                first_data_index = None
                continue
            # END skip non-data chunks

            if first_data_index is None:
                first_data_index = i - 1
        # END iterate list

        # if slen_orig != len(self):
        #     print "INFO: Reduced delta list len to %f %% of former size" % ((float(len(self)) / slen_orig) * 100)
        return self

    def check_integrity(self, target_size=-1):
        """Verify the list has non-overlapping chunks only, and the total size matches
        target_size
        :param target_size: if not -1, the total size of the chain must be target_size
        :raise AssertionError: if the size doesn't match"""
        if target_size > -1:
            assert self[-1].rbound() == target_size
            assert reduce(lambda x, y: x + y, (d.ts for d in self), 0) == target_size
        # END target size verification

        if len(self) < 2:
            return

        # check data
        for dc in self:
            assert dc.ts > 0
            if dc.has_data():
                assert len(dc.data) >= dc.ts
        # END for each dc

        left = islice(self, 0, len(self) - 1)
        right = iter(self)
        next(right)
        # this is very pythonic - we might have just used index based access here,
        # but this could actually be faster
        for lft, rgt in zip(left, right):
            assert lft.rbound() == rgt.to
            assert lft.to + lft.ts == rgt.to
        # END for each pair


class TopdownDeltaChunkList(DeltaChunkList):

    """Represents a list which is generated by feeding its ancestor streams one by
    one"""
    __slots__ = tuple()

    def connect_with_next_base(self, bdcl):
        """Connect this chain with the next level of our base delta chunklist.
        The goal in this game is to mark as many of our chunks rigid, hence they
        cannot be changed by any of the upcoming bases anymore. Once all our
        chunks are marked like that, we can stop all processing
        :param bdcl: data chunk list being one of our bases. They must be fed in
            consecutively and in order, towards the earliest ancestor delta
        :return: True if processing was done. Use it to abort processing of
            remaining streams if False is returned"""
        nfc = 0  # number of frozen chunks
        dci = 0  # delta chunk index
        slen = len(self)  # len of self
        ccl = list()  # temporary list
        while dci < slen:
            dc = self[dci]
            dci += 1

            # all add-chunks which are already topmost don't need additional processing
            if dc.data is not None:
                nfc += 1
                continue
            # END skip add chunks

            # copy chunks
            # integrate the portion of the base list into ourselves. Lists
            # don't support efficient insertion ( just one at a time ), but for now
            # we live with it. Internally, it's all just a 32/64bit pointer, and
            # the portions of moved memory should be smallish. Maybe we just rebuild
            # ourselves in order to reduce the amount of insertions ...
            del(ccl[:])
            delta_list_slice(bdcl, dc.so, dc.ts, ccl)

            # move the target bounds into place to match with our chunk
            ofs = dc.to - dc.so
            for cdc in ccl:
                cdc.to += ofs
            # END update target bounds

            if len(ccl) == 1:
                self[dci - 1] = ccl[0]
            else:
                # maybe try to compute the expenses here, and pick the right algorithm
                # It would normally be faster than copying everything physically though
                # TODO: Use a deque here, and decide by the index whether to extend
                # or extend left !
                post_dci = self[dci:]
                del(self[dci - 1:])  # include deletion of dc
                self.extend(ccl)
                self.extend(post_dci)

                slen = len(self)
                dci += len(ccl) - 1  # deleted dc, added rest

            # END handle chunk replacement
        # END for each chunk

        if nfc == slen:
            return False
        # END handle completeness
        return True


#} END structures


#{ Routines

def is_loose_object(m):
    """
    :return: True if the file contained in memory map m appears to be a loose object.
        Only the first two bytes are needed"""
    # byte_ord tolerates both single-byte strings (python 2) and ints (python 3)
    b0, b1 = map(byte_ord, m[:2])
    word = (b0 << 8) + b1
    return b0 == 0x78 and (word % 31) == 0
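
# Illustrative sketch (assuming zlib's default settings): a loose object is
# zlib-deflated, so its first byte is 0x78 and the first two bytes, read as a
# big-endian word, are a multiple of 31 (the zlib header checksum rule).
#
# >>> is_loose_object(zlib.compress(b'blob 3\x00foo'))
# True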


def loose_object_header_info(m):
    """
    :return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
        object as well as its uncompressed size in bytes.
    :param m: memory map from which to read the compressed object data"""
    decompress_size = 8192  # is used in cgit as well
    hdr = decompressobj().decompress(m, decompress_size)
    type_name, size = hdr[:hdr.find(NULL_BYTE)].split(BYTE_SPACE)

    return type_name, int(size)
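
# Illustrative sketch: the header is '<type> <size>\0', deflated together with
# the payload, so decompressing a small prefix is enough to read it back.
#
# >>> loose_object_header_info(zlib.compress(b'blob 3\x00foo'))
# (b'blob', 3)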


def pack_object_header_info(data):
    """
    :return: tuple(type_id, uncompressed_size_in_bytes, byte_offset)
        The type_id should be interpreted according to the ``type_id_to_type_map`` map
        The byte-offset specifies the start of the actual zlib compressed datastream
    :param data: random-access memory, like a string or memory map"""
    c = byte_ord(data[0])   # first byte
    i = 1                   # next char to read
    type_id = (c >> 4) & 7  # numeric type
    size = c & 15           # starting size
    s = 4                   # starting bit-shift size
    while c & 0x80:
        c = byte_ord(data[i])
        i += 1
        size += (c & 0x7f) << s
        s += 7
    # END character loop
    # end performance at expense of maintenance ...
    return (type_id, size, i)
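
# Illustrative sketch: the first byte packs the type id (bits 4-6) and the low
# four size bits; each continuation byte (MSB set) contributes 7 more size bits.
#
# >>> pack_object_header_info(b'\x94\x1f')   # type 1 (commit), size 500
# (1, 500, 2)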


def create_pack_object_header(obj_type, obj_size):
    """
    :return: bytearray defining the pack header comprised of the object type
        and its uncompressed size in bytes

    :param obj_type: pack type_id of the object
    :param obj_size: uncompressed size in bytes of the following object stream"""
    c = 0              # 1 byte
    hdr = bytearray()  # output string

    c = (obj_type << 4) | (obj_size & 0xf)
    obj_size >>= 4
    while obj_size:
        hdr.append(c | 0x80)
        c = obj_size & 0x7f
        obj_size >>= 7
    # END until size is consumed
    hdr.append(c)
    # end handle interpreter
    return hdr
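
# Illustrative sketch: the header round-trips through pack_object_header_info.
#
# >>> hdr = create_pack_object_header(1, 500)
# >>> bytes(hdr)
# b'\x94\x1f'
# >>> pack_object_header_info(hdr)
# (1, 500, 2)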


def msb_size(data, offset=0):
    """
    :return: tuple(read_bytes, size) read the msb size from the given random
        access data starting at the given byte offset"""
    size = 0
    i = 0
    l = len(data)
    hit_msb = False
    while i < l:
        c = data[i + offset]
        size |= (c & 0x7f) << i * 7
        i += 1
        if not c & 0x80:
            hit_msb = True
            break
        # END check msb bit
    # END while in range
    # end performance ...
    if not hit_msb:
        raise AssertionError("Could not find terminating MSB byte in data stream")
    return i + offset, size
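
# Illustrative sketch: little-endian base-128 ("MSB") varint, 7 payload bits
# per byte, with the MSB set on every byte except the last.
#
# >>> msb_size(b'\x0b')           # small sizes fit into a single byte
# (1, 11)
# >>> msb_size(b'\xe5\x8e\x26')   # 0x65 + (0x0e << 7) + (0x26 << 14)
# (3, 624485)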


def loose_object_header(type, size):
    """
    :return: bytes representing the loose object header, which is immediately
        followed by the content stream of size 'size'"""
    return ('%s %i\0' % (force_text(type), size)).encode('ascii')
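
# Illustrative sketch: the header is the type name, a space, the decimal size,
# and a NUL terminator.
#
# >>> loose_object_header(b'blob', 12)
# b'blob 12\x00'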


def write_object(type, size, read, write, chunk_size=chunk_size):
    """
    Write the object as identified by type, size and source_stream into the
    target_stream

    :param type: type string of the object
    :param size: amount of bytes to write from source_stream
    :param read: read method of a stream providing the content data
    :param write: write method of the output stream, which must return the
        number of bytes written
    :return: the actual amount of bytes written to the stream, which includes the header"""
    tbw = 0  # total num bytes written

    # WRITE HEADER: type SP size NULL
    tbw += write(loose_object_header(type, size))
    tbw += stream_copy(read, write, size, chunk_size)

    return tbw
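
# Illustrative sketch using in-memory streams:
#
# >>> source, target = BytesIO(b'foo'), BytesIO()
# >>> write_object(b'blob', 3, source.read, target.write)
# 10
# >>> target.getvalue()
# b'blob 3\x00foo'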


def stream_copy(read, write, size, chunk_size):
    """
    Copy a stream up to size bytes using the provided read and write methods,
    in chunks of chunk_size

    **Note:** it's much like the stream_copy utility, but operates just using methods"""
    dbw = 0  # num data bytes written

    # WRITE ALL DATA UP TO SIZE
    while True:
        cs = min(chunk_size, size - dbw)
        # NOTE: not all write methods return the amount of written bytes, like
        # mmap.write. It's bad, but we just deal with it ... perhaps it's not
        # even less efficient
        # data_len = write(read(cs))
        # dbw += data_len
        data = read(cs)
        data_len = len(data)
        dbw += data_len
        write(data)
        if data_len < cs or dbw == size:
            break
        # END check for stream end
    # END duplicate data
    return dbw
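
# Illustrative sketch: copying a bounded prefix between two file-like objects.
#
# >>> src, dst = BytesIO(b'abcdefgh'), BytesIO()
# >>> stream_copy(src.read, dst.write, 5, 2)   # 5 bytes, 2 bytes at a time
# 5
# >>> dst.getvalue()
# b'abcde'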


def connect_deltas(dstreams):
    """
    Read the condensed delta chunk information from dstream and merge its information
    into a list of existing delta chunks

    :param dstreams: iterable of delta stream objects, the delta to be applied last
        comes first, then all its ancestors in order
    :return: DeltaChunkList, containing all operations to apply"""
    tdcl = None  # topmost dcl

    dcl = tdcl = TopdownDeltaChunkList()
    for dsi, ds in enumerate(dstreams):
        # print "Stream", dsi
        db = ds.read()
        delta_buf_size = ds.size

        # read header
        i, base_size = msb_size(db)
        i, target_size = msb_size(db, i)

        # interpret opcodes
        tbw = 0  # amount of target bytes written
        while i < delta_buf_size:
            # byte_ord keeps this working on python 2 strings and python 3 bytes alike
            c = byte_ord(db[i])
            i += 1
            if c & 0x80:
                cp_off, cp_size = 0, 0
                if (c & 0x01):
                    cp_off = byte_ord(db[i])
                    i += 1
                if (c & 0x02):
                    cp_off |= (byte_ord(db[i]) << 8)
                    i += 1
                if (c & 0x04):
                    cp_off |= (byte_ord(db[i]) << 16)
                    i += 1
                if (c & 0x08):
                    cp_off |= (byte_ord(db[i]) << 24)
                    i += 1
                if (c & 0x10):
                    cp_size = byte_ord(db[i])
                    i += 1
                if (c & 0x20):
                    cp_size |= (byte_ord(db[i]) << 8)
                    i += 1
                if (c & 0x40):
                    cp_size |= (byte_ord(db[i]) << 16)
                    i += 1

                if not cp_size:
                    cp_size = 0x10000

                rbound = cp_off + cp_size
                if (rbound < cp_size or
                        rbound > base_size):
                    break

                dcl.append(DeltaChunk(tbw, cp_size, cp_off, None))
                tbw += cp_size
            elif c:
                # NOTE: in C, the data chunks should probably be concatenated here.
                # In python, we do it as a post-process
                dcl.append(DeltaChunk(tbw, c, 0, db[i:i + c]))
                i += c
                tbw += c
            else:
                raise ValueError("unexpected delta opcode 0")
            # END handle command byte
        # END while processing delta data

        dcl.compress()

        # merge the lists !
        if dsi > 0:
            if not tdcl.connect_with_next_base(dcl):
                break
        # END handle merge

        # prepare next base
        dcl = DeltaChunkList()
    # END for each delta stream

    return tdcl
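
# Illustrative sketch (DummyStream is a hypothetical stand-in; real callers pass
# pack delta stream objects exposing read() and size). The delta below declares
# an 11-byte base and a 6-byte target, copies 5 bytes from base offset 0, then
# adds the literal byte b'!'.
#
# >>> class DummyStream:
# ...     def __init__(self, buf):
# ...         self.buf, self.size = buf, len(buf)
# ...     def read(self):
# ...         return self.buf
# >>> delta = bytes([11, 6, 0x91, 0x00, 0x05, 0x01]) + b'!'
# >>> out = BytesIO()
# >>> connect_deltas([DummyStream(delta)]).apply(b'hello world', out.write)
# >>> out.getvalue()
# b'hello!'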


def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write):
    """
    Apply data from a delta buffer using a source buffer to the target file

    :param src_buf: random access data from which the delta was created
    :param src_buf_size: size of the source buffer in bytes
    :param delta_buf: random access delta data
    :param delta_buf_size: size of the delta buffer in bytes
    :param write: write method taking a chunk of bytes

    **Note:** transcribed to python from the similar routine in patch-delta.c"""
    i = 0
    db = delta_buf
    while i < delta_buf_size:
        c = db[i]
        i += 1
        if c & 0x80:
            cp_off, cp_size = 0, 0
            if (c & 0x01):
                cp_off = db[i]
                i += 1
            if (c & 0x02):
                cp_off |= (db[i] << 8)
                i += 1
            if (c & 0x04):
                cp_off |= (db[i] << 16)
                i += 1
            if (c & 0x08):
                cp_off |= (db[i] << 24)
                i += 1
            if (c & 0x10):
                cp_size = db[i]
                i += 1
            if (c & 0x20):
                cp_size |= (db[i] << 8)
                i += 1
            if (c & 0x40):
                cp_size |= (db[i] << 16)
                i += 1

            if not cp_size:
                cp_size = 0x10000

            rbound = cp_off + cp_size
            if (rbound < cp_size or
                    rbound > src_buf_size):
                break
            write(src_buf[cp_off:cp_off + cp_size])
        elif c:
            write(db[i:i + c])
            i += c
        else:
            raise ValueError("unexpected delta opcode 0")
        # END handle command byte
    # END while processing delta data

    # yes, lets use the exact same error message that git uses :)
    assert i == delta_buf_size, "delta replay has gone wild"
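
# Illustrative sketch: unlike connect_deltas, this takes the opcode stream with
# the two msb size headers already stripped. Copy opcode 0x91 (an offset byte
# and a size byte follow), then an add opcode inserting one literal byte.
#
# >>> out = BytesIO()
# >>> delta = bytes([0x91, 0x00, 0x05, 0x01]) + b'!'
# >>> apply_delta_data(b'hello world', 11, delta, len(delta), out.write)
# >>> out.getvalue()
# b'hello!'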


def is_equal_canonical_sha(canonical_length, match, sha1):
    """
    :return: True if the given lhs and rhs 20 byte binary shas are equal.
        The comparison will take the canonical_length of the match sha into account,
        hence for uneven canonical representations only the highest 4 bits of the
        last match byte are compared
    :param match: less than 20 byte sha
    :param sha1: 20 byte sha"""
    binary_length = canonical_length // 2
    if match[:binary_length] != sha1[:binary_length]:
        return False

    if canonical_length - binary_length and \
            (byte_ord(match[-1]) ^ byte_ord(sha1[len(match) - 1])) & 0xf0:
        return False
    # END handle uneven canonical length
    return True
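
# Illustrative sketch: matching the 5-hex-digit prefix '89abc' (packed into 3
# bytes with a zero pad nibble) against a 20-byte sha whose hex starts with
# '89abcd'. Only the high nibble of the odd trailing byte is compared.
#
# >>> sha = bytes.fromhex('89abcdef' + '00' * 16)
# >>> is_equal_canonical_sha(5, bytes.fromhex('89abc0'), sha)
# True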

#} END routines


try:
    from gitdb_speedups._perf import connect_deltas
except ImportError:
    pass