Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/gitdb/fun.py: 13%

342 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Contains basic C functions which usually contain performance-critical code.
Keeping this code separate from the beginning makes it easier to out-source
it into C later, if required"""

import zlib
from gitdb.util import byte_ord
decompressobj = zlib.decompressobj

import mmap
from itertools import islice
from functools import reduce

from gitdb.const import NULL_BYTE, BYTE_SPACE
from gitdb.utils.encoding import force_text
from gitdb.typ import (
    str_blob_type,
    str_commit_type,
    str_tree_type,
    str_tag_type,
)

from io import BytesIO

# INVARIANTS
OFS_DELTA = 6
REF_DELTA = 7
delta_types = (OFS_DELTA, REF_DELTA)

type_id_to_type_map = {
    0: b'',                  # EXT 1
    1: str_commit_type,
    2: str_tree_type,
    3: str_blob_type,
    4: str_tag_type,
    5: b'',                  # EXT 2
    OFS_DELTA: "OFS_DELTA",  # OFFSET DELTA
    REF_DELTA: "REF_DELTA"   # REFERENCE DELTA
}

type_to_type_id_map = {
    str_commit_type: 1,
    str_tree_type: 2,
    str_blob_type: 3,
    str_tag_type: 4,
    "OFS_DELTA": OFS_DELTA,
    "REF_DELTA": REF_DELTA,
}

# used when dealing with larger streams
chunk_size = 1000 * mmap.PAGESIZE

__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info',
           'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data',
           'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header')


#{ Structures

def _set_delta_rbound(d, size):
    """Truncate the given delta to the given size
    :param size: size relative to our target offset; may not be 0 and must be smaller
        than or equal to our size
    :return: d"""
    d.ts = size

    # NOTE: data is truncated automatically when applying the delta
    # MUST NOT DO THIS HERE
    return d


def _move_delta_lbound(d, bytes):
    """Move the delta by the given amount of bytes, reducing its size so that its
    right bound stays static
    :param bytes: amount of bytes to move, must be smaller than delta size
    :return: d"""
    if bytes == 0:
        return

    d.to += bytes
    d.so += bytes
    d.ts -= bytes
    if d.data is not None:
        d.data = d.data[bytes:]
    # END handle data

    return d


def delta_duplicate(src):
    return DeltaChunk(src.to, src.ts, src.so, src.data)


def delta_chunk_apply(dc, bbuf, write):
    """Apply own data to the target buffer
    :param bbuf: buffer providing source bytes for copy operations
    :param write: write method to call with data to write"""
    if dc.data is None:
        # COPY DATA FROM SOURCE
        write(bbuf[dc.so:dc.so + dc.ts])
    else:
        # APPEND DATA
        # what's faster: if + 4 function calls or just a write with a slice ?
        # Considering data can be larger than 127 bytes now, it should be worth it
        if dc.ts < len(dc.data):
            write(dc.data[:dc.ts])
        else:
            write(dc.data)
        # END handle truncation
    # END handle chunk mode


class DeltaChunk:

    """Represents a piece of a delta; it can either add new data, or copy existing
    data from a source buffer"""
    __slots__ = (
        'to',    # start offset in the target buffer in bytes
        'ts',    # size of this chunk in the target buffer in bytes
        'so',    # start offset in the source buffer in bytes or None
        'data',  # chunk of bytes to be added to the target buffer,
                 # DeltaChunkList to use as base, or None
    )

    def __init__(self, to, ts, so, data):
        self.to = to
        self.ts = ts
        self.so = so
        self.data = data

    def __repr__(self):
        return "DeltaChunk(%i, %i, %s, %s)" % (self.to, self.ts, self.so, self.data or "")

    #{ Interface

    def rbound(self):
        return self.to + self.ts

    def has_data(self):
        """:return: True if the instance has data to add to the target stream"""
        return self.data is not None

    #} END interface
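
# Illustrative sketch (not part of the original module): a chunk without data copies
# dc.ts bytes from the base buffer at dc.so, while a chunk carrying data appends its
# own bytes; applying both in target order rebuilds the target buffer.
#
#   out = []
#   base = b"0123456789"
#   delta_chunk_apply(DeltaChunk(to=0, ts=4, so=2, data=None), base, out.append)    # copy b"2345"
#   delta_chunk_apply(DeltaChunk(to=4, ts=3, so=0, data=b"abc"), base, out.append)  # add  b"abc"
#   assert b"".join(out) == b"2345abc"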


def _closest_index(dcl, absofs):
    """:return: index at which the given absofs should be inserted. The index points
    to the DeltaChunk with a target buffer absofs that equals or is greater than
    absofs.
    **Note:** global method for performance only, it belongs to DeltaChunkList"""
    lo = 0
    hi = len(dcl)
    while lo < hi:
        mid = (lo + hi) // 2
        dc = dcl[mid]
        if dc.to > absofs:
            hi = mid
        elif dc.rbound() > absofs or dc.to == absofs:
            return mid
        else:
            lo = mid + 1
        # END handle bound
    # END for each delta absofs
    return len(dcl) - 1
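
# Illustrative sketch (hypothetical chunk list): binary search for the chunk that
# covers a given absolute target offset.
#
#   dcl = [DeltaChunk(0, 4, 0, None), DeltaChunk(4, 3, 0, None), DeltaChunk(7, 5, 0, None)]
#   assert _closest_index(dcl, 0) == 0    # exact start of the first chunk
#   assert _closest_index(dcl, 5) == 1    # offset 5 falls inside the second chunk [4, 7)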


def delta_list_apply(dcl, bbuf, write):
    """Apply the chain's changes and write the final result using the passed
    write function.
    :param bbuf: base buffer containing the base of all deltas contained in this
        list. It will only be used if the chunk in question does not have a base
        chain.
    :param write: function taking a string of bytes to write to the output"""
    for dc in dcl:
        delta_chunk_apply(dc, bbuf, write)
    # END for each dc


def delta_list_slice(dcl, absofs, size, ndcl):
    """Append the subsection of this list which starts at the given absolute offset
    and spans the given size in bytes onto ndcl.
    :return: None"""
    cdi = _closest_index(dcl, absofs)  # delta start index
    cd = dcl[cdi]
    slen = len(dcl)
    lappend = ndcl.append

    if cd.to != absofs:
        tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
        _move_delta_lbound(tcd, absofs - cd.to)
        tcd.ts = min(tcd.ts, size)
        lappend(tcd)
        size -= tcd.ts
        cdi += 1
    # END lbound overlap handling

    while cdi < slen and size:
        # are we larger than the current block
        cd = dcl[cdi]
        if cd.ts <= size:
            lappend(DeltaChunk(cd.to, cd.ts, cd.so, cd.data))
            size -= cd.ts
        else:
            tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
            tcd.ts = size
            lappend(tcd)
            size -= tcd.ts
            break
        # END handle size
        cdi += 1
    # END for each chunk


class DeltaChunkList(list):

    """List with special functionality to deal with DeltaChunks.
    There are two types of lists we represent. One was created bottom-up, working
    towards the latest delta; the other kind was created top-down, working from the
    latest delta down to the earliest ancestor. This attribute is queryable
    after all processing with is_reversed."""

    __slots__ = tuple()

    def rbound(self):
        """:return: rightmost extent in bytes, absolute"""
        if len(self) == 0:
            return 0
        return self[-1].rbound()

    def lbound(self):
        """:return: leftmost byte at which this chunklist starts"""
        if len(self) == 0:
            return 0
        return self[0].to

    def size(self):
        """:return: size in bytes as measured by our delta chunks"""
        return self.rbound() - self.lbound()

    def apply(self, bbuf, write):
        """Only used by public clients, internally we only use the global routines
        for performance"""
        return delta_list_apply(self, bbuf, write)

    def compress(self):
        """Alter the list to reduce the amount of nodes. Currently we concatenate
        add-chunks
        :return: self"""
        slen = len(self)
        if slen < 2:
            return self
        i = 0

        first_data_index = None
        while i < slen:
            dc = self[i]
            i += 1
            if dc.data is None:
                if first_data_index is not None and i - 2 - first_data_index > 1:
                    # if first_data_index is not None:
                    nd = BytesIO()                  # new data
                    so = self[first_data_index].to  # start offset in target buffer
                    for x in range(first_data_index, i - 1):
                        xdc = self[x]
                        nd.write(xdc.data[:xdc.ts])
                    # END collect data

                    del(self[first_data_index:i - 1])
                    buf = nd.getvalue()
                    self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf))

                    slen = len(self)
                    i = first_data_index + 1

                # END concatenate data
                first_data_index = None
                continue
            # END skip non-data chunks

            if first_data_index is None:
                first_data_index = i - 1
        # END iterate list

        # if slen_orig != len(self):
        #     print("INFO: Reduced delta list len to %f %% of former size" % ((float(len(self)) / slen_orig) * 100))
        return self

    def check_integrity(self, target_size=-1):
        """Verify the list has non-overlapping chunks only, and the total size matches
        target_size
        :param target_size: if not -1, the total size of the chain must be target_size
        :raise AssertionError: if the size doesn't match"""
        if target_size > -1:
            assert self[-1].rbound() == target_size
            assert reduce(lambda x, y: x + y, (d.ts for d in self), 0) == target_size
        # END target size verification

        if len(self) < 2:
            return

        # check data
        for dc in self:
            assert dc.ts > 0
            if dc.has_data():
                assert len(dc.data) >= dc.ts
        # END for each dc

        left = islice(self, 0, len(self) - 1)
        right = iter(self)
        next(right)
        # this is very pythonic - we might have just used index based access here,
        # but this could actually be faster
        for lft, rgt in zip(left, right):
            assert lft.rbound() == rgt.to
            assert lft.to + lft.ts == rgt.to
        # END for each pair


class TopdownDeltaChunkList(DeltaChunkList):

    """Represents a list which is generated by feeding its ancestor streams one by
    one"""
    __slots__ = tuple()

    def connect_with_next_base(self, bdcl):
        """Connect this chain with the next level of our base delta chunklist.
        The goal in this game is to mark as many of our chunks rigid, hence they
        cannot be changed by any of the upcoming bases anymore. Once all our
        chunks are marked like that, we can stop all processing
        :param bdcl: data chunk list being one of our bases. They must be fed in
            consecutively and in order, towards the earliest ancestor delta
        :return: True if processing was done. Use it to abort processing of
            remaining streams if False is returned"""
        nfc = 0           # number of frozen chunks
        dci = 0           # delta chunk index
        slen = len(self)  # len of self
        ccl = list()      # temporary list
        while dci < slen:
            dc = self[dci]
            dci += 1

            # all add-chunks which are already topmost don't need additional processing
            if dc.data is not None:
                nfc += 1
                continue
            # END skip add chunks

            # copy chunks
            # integrate the portion of the base list into ourselves. Lists
            # don't support efficient insertion (just one at a time), but for now
            # we live with it. Internally, it's all just a 32/64 bit pointer, and
            # the portions of moved memory should be smallish. Maybe we just rebuild
            # ourselves in order to reduce the amount of insertions ...
            del(ccl[:])
            delta_list_slice(bdcl, dc.so, dc.ts, ccl)

            # move the target bounds into place to match with our chunk
            ofs = dc.to - dc.so
            for cdc in ccl:
                cdc.to += ofs
            # END update target bounds

            if len(ccl) == 1:
                self[dci - 1] = ccl[0]
            else:
                # maybe try to compute the expenses here, and pick the right algorithm
                # It would normally be faster than copying everything physically though
                # TODO: Use a deque here, and decide by the index whether to extend
                # or extend left !
                post_dci = self[dci:]
                del(self[dci - 1:])  # include deletion of dc
                self.extend(ccl)
                self.extend(post_dci)

                slen = len(self)
                dci += len(ccl) - 1  # deleted dc, added rest

            # END handle chunk replacement
        # END for each chunk

        if nfc == slen:
            return False
        # END handle completeness
        return True


#} END structures

#{ Routines

def is_loose_object(m):
    """
    :return: True if the file contained in memory map m appears to be a loose object.
        Only the first two bytes are needed"""
    b0, b1 = map(byte_ord, m[:2])
    word = (b0 << 8) + b1
    return b0 == 0x78 and (word % 31) == 0
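
# Illustrative check (hypothetical data): a zlib stream starts with 0x78 and its first
# two bytes, read as a big-endian 16 bit value, are divisible by 31.
#
#   assert is_loose_object(bytes([0x78, 0x9c]))   # (0x78 << 8) + 0x9c == 30876 == 31 * 996
#   assert not is_loose_object(b"PK\x03\x04")     # a zip archive is not a loose object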


def loose_object_header_info(m):
    """
    :return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
        object as well as its uncompressed size in bytes.
    :param m: memory map from which to read the compressed object data"""
    decompress_size = 8192  # is used in cgit as well
    hdr = decompressobj().decompress(m, decompress_size)
    type_name, size = hdr[:hdr.find(NULL_BYTE)].split(BYTE_SPACE)

    return type_name, int(size)
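
# Illustrative round trip (hypothetical data): a loose object is the zlib-compressed
# concatenation of b"<type> <size>\0" and the payload, so only the header needs to be
# inflated to learn type and size.
#
#   assert loose_object_header_info(zlib.compress(b"blob 11\x00hello world")) == (b"blob", 11)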


def pack_object_header_info(data):
    """
    :return: tuple(type_id, uncompressed_size_in_bytes, byte_offset)
        The type_id should be interpreted according to the ``type_id_to_type_map`` map.
        The byte offset specifies the start of the actual zlib compressed data stream.
    :param data: random-access memory, like a string or memory map"""
    c = byte_ord(data[0])   # first byte
    i = 1                   # next char to read
    type_id = (c >> 4) & 7  # numeric type
    size = c & 15           # starting size
    s = 4                   # starting bit-shift size
    while c & 0x80:
        c = byte_ord(data[i])
        i += 1
        size += (c & 0x7f) << s
        s += 7
    # END character loop
    # end performance at expense of maintenance ...
    return (type_id, size, i)
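
# Illustrative decode (hypothetical header bytes): 0x94 is 1001_0100, i.e. MSB set
# (another size byte follows), type 001 (commit) and low size nibble 4; 0x01 adds
# 1 << 4 = 16, so the entry is a commit of 20 uncompressed bytes starting at offset 2.
#
#   assert pack_object_header_info(bytes([0x94, 0x01])) == (1, 20, 2)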


def create_pack_object_header(obj_type, obj_size):
    """
    :return: bytearray defining the pack header comprised of the object type
        and its uncompressed size in bytes

    :param obj_type: pack type_id of the object
    :param obj_size: uncompressed size in bytes of the following object stream"""
    c = 0              # 1 byte
    hdr = bytearray()  # output bytes

    c = (obj_type << 4) | (obj_size & 0xf)
    obj_size >>= 4
    while obj_size:
        hdr.append(c | 0x80)
        c = obj_size & 0x7f
        obj_size >>= 7
    # END until size is consumed
    hdr.append(c)
    # end handle interpreter
    return hdr
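
# Illustrative encode (hypothetical values), mirroring pack_object_header_info above:
#
#   assert bytes(create_pack_object_header(1, 20)) == bytes([0x94, 0x01])  # commit, 20 bytes
#   assert bytes(create_pack_object_header(3, 5)) == bytes([0x35])         # blob, 5 bytes, fits one byte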


def msb_size(data, offset=0):
    """
    :return: tuple(end_offset, size) -- read the MSB size from the given random
        access data starting at the given byte offset; end_offset is the offset of
        the first byte after the size"""
    size = 0
    i = 0
    l = len(data)
    hit_msb = False
    while i < l:
        c = data[i + offset]
        size |= (c & 0x7f) << i * 7
        i += 1
        if not c & 0x80:
            hit_msb = True
            break
        # END check msb bit
    # END while in range
    # end performance ...
    if not hit_msb:
        raise AssertionError("Could not find terminating MSB byte in data stream")
    return i + offset, size
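
# Illustrative decode (hypothetical bytes): sizes are stored little-endian in groups of
# 7 bits, with the high bit marking continuation.
#
#   assert msb_size(bytes([0x05])) == (1, 5)           # single byte, MSB clear
#   assert msb_size(bytes([0x91, 0x02])) == (2, 273)   # 0x11 + (0x02 << 7)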


def loose_object_header(type, size):
    """
    :return: bytes representing the loose object header, which is immediately
        followed by the content stream of size 'size'"""
    return ('%s %i\0' % (force_text(type), size)).encode('ascii')
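
# Illustrative result (hypothetical values):
#
#   assert loose_object_header(b"blob", 11) == b"blob 11\x00"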


def write_object(type, size, read, write, chunk_size=chunk_size):
    """
    Write the object as identified by type, size and source_stream into the
    target_stream

    :param type: type string of the object
    :param size: amount of bytes to write from source_stream
    :param read: read method of a stream providing the content data
    :param write: write method of the output stream
    :return: The actual amount of bytes written to the stream, which includes the header"""
    tbw = 0  # total num bytes written

    # WRITE HEADER: type SP size NULL
    tbw += write(loose_object_header(type, size))
    tbw += stream_copy(read, write, size, chunk_size)

    return tbw


def stream_copy(read, write, size, chunk_size):
    """
    Copy a stream up to size bytes using the provided read and write methods,
    in chunks of chunk_size

    **Note:** it's much like the stream_copy utility, but operates just using methods"""
    dbw = 0  # num data bytes written

    # WRITE ALL DATA UP TO SIZE
    while True:
        cs = min(chunk_size, size - dbw)
        # NOTE: not all write methods return the amount of written bytes, like
        # mmap.write. It's bad, but we just deal with it ... perhaps it's not
        # even less efficient
        # data_len = write(read(cs))
        # dbw += data_len
        data = read(cs)
        data_len = len(data)
        dbw += data_len
        write(data)
        if data_len < cs or dbw == size:
            break
        # END check for stream end
    # END duplicate data
    return dbw
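
# Illustrative sketch (hypothetical streams): copying 10 bytes between two in-memory
# streams in chunks of 4 bytes.
#
#   from io import BytesIO
#   src, dst = BytesIO(b"x" * 10), BytesIO()
#   assert stream_copy(src.read, dst.write, 10, 4) == 10   # copied as 4 + 4 + 2 bytes
#   assert dst.getvalue() == b"x" * 10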


def connect_deltas(dstreams):
    """
    Read the condensed delta chunk information from dstreams and merge their information
    into a list of existing delta chunks

    :param dstreams: iterable of delta stream objects, the delta to be applied last
        comes first, then all its ancestors in order
    :return: DeltaChunkList, containing all operations to apply"""
    tdcl = None  # topmost dcl

    dcl = tdcl = TopdownDeltaChunkList()
    for dsi, ds in enumerate(dstreams):
        # print("Stream", dsi)
        db = ds.read()
        delta_buf_size = ds.size

        # read header
        i, base_size = msb_size(db)
        i, target_size = msb_size(db, i)

        # interpret opcodes
        tbw = 0  # amount of target bytes written
        while i < delta_buf_size:
            c = db[i]
            i += 1
            if c & 0x80:
                cp_off, cp_size = 0, 0
                if (c & 0x01):
                    cp_off = db[i]
                    i += 1
                if (c & 0x02):
                    cp_off |= (db[i] << 8)
                    i += 1
                if (c & 0x04):
                    cp_off |= (db[i] << 16)
                    i += 1
                if (c & 0x08):
                    cp_off |= (db[i] << 24)
                    i += 1
                if (c & 0x10):
                    cp_size = db[i]
                    i += 1
                if (c & 0x20):
                    cp_size |= (db[i] << 8)
                    i += 1
                if (c & 0x40):
                    cp_size |= (db[i] << 16)
                    i += 1

                if not cp_size:
                    cp_size = 0x10000

                rbound = cp_off + cp_size
                if (rbound < cp_size or
                        rbound > base_size):
                    break

                dcl.append(DeltaChunk(tbw, cp_size, cp_off, None))
                tbw += cp_size
            elif c:
                # NOTE: in C, the data chunks should probably be concatenated here.
                # In python, we do it as a post-process
                dcl.append(DeltaChunk(tbw, c, 0, db[i:i + c]))
                i += c
                tbw += c
            else:
                raise ValueError("unexpected delta opcode 0")
            # END handle command byte
        # END while processing delta data

        dcl.compress()

        # merge the lists !
        if dsi > 0:
            if not tdcl.connect_with_next_base(dcl):
                break
        # END handle merge

        # prepare next base
        dcl = DeltaChunkList()
    # END for each delta stream

    return tdcl


def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write):
    """
    Apply data from a delta buffer using a source buffer to the target file

    :param src_buf: random access data from which the delta was created
    :param src_buf_size: size of the source buffer in bytes
    :param delta_buf: random access delta data
    :param delta_buf_size: size of the delta buffer in bytes
    :param write: write method taking a chunk of bytes

    **Note:** transcribed to python from the similar routine in patch-delta.c"""
    i = 0
    db = delta_buf
    while i < delta_buf_size:
        c = db[i]
        i += 1
        if c & 0x80:
            cp_off, cp_size = 0, 0
            if (c & 0x01):
                cp_off = db[i]
                i += 1
            if (c & 0x02):
                cp_off |= (db[i] << 8)
                i += 1
            if (c & 0x04):
                cp_off |= (db[i] << 16)
                i += 1
            if (c & 0x08):
                cp_off |= (db[i] << 24)
                i += 1
            if (c & 0x10):
                cp_size = db[i]
                i += 1
            if (c & 0x20):
                cp_size |= (db[i] << 8)
                i += 1
            if (c & 0x40):
                cp_size |= (db[i] << 16)
                i += 1

            if not cp_size:
                cp_size = 0x10000

            rbound = cp_off + cp_size
            if (rbound < cp_size or
                    rbound > src_buf_size):
                break
            write(src_buf[cp_off:cp_off + cp_size])
        elif c:
            write(db[i:i + c])
            i += c
        else:
            raise ValueError("unexpected delta opcode 0")
        # END handle command byte
    # END while processing delta data

    # yes, lets use the exact same error message that git uses :)
    assert i == delta_buf_size, "delta replay has gone wild"
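
# Illustrative sketch (hypothetical delta): one copy opcode (0x91: copy, with one offset
# byte and one size byte) followed by one literal-add opcode (0x01: add the next byte).
# Note the buffer starts directly at the opcodes, i.e. after the two msb_size headers
# of a raw git delta.
#
#   out = []
#   delta = bytes([0x91, 0x06, 0x05, 0x01]) + b"!"   # copy src[6:11], then add b"!"
#   apply_delta_data(b"hello world", 11, delta, len(delta), out.append)
#   assert b"".join(out) == b"world!"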


def is_equal_canonical_sha(canonical_length, match, sha1):
    """
    :return: True if the given binary shas are equal
        The comparison will take the canonical_length of the match sha into account,
        hence it will only use the high nibble of the last byte for uneven canonical
        representations
    :param match: less than 20 byte sha
    :param sha1: 20 byte sha"""
    binary_length = canonical_length // 2
    if match[:binary_length] != sha1[:binary_length]:
        return False

    if canonical_length - binary_length and \
            (byte_ord(match[-1]) ^ byte_ord(sha1[len(match) - 1])) & 0xf0:
        return False
    # END handle uneven canonical length
    return True
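
# Illustrative sketch (hypothetical shas): comparing an abbreviated sha of odd hex
# length ("12345", padded to the 3 bytes 0x12 0x34 0x50) against a full 20 byte sha.
#
#   full = bytes.fromhex("1234567890" * 4)
#   assert is_equal_canonical_sha(5, bytes.fromhex("123450"), full)        # "12345" matches
#   assert not is_equal_canonical_sha(5, bytes.fromhex("123460"), full)    # "12346" does not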


#} END routines


try:
    from gitdb_speedups._perf import connect_deltas
except ImportError:
    pass