Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/gitdb/fun.py: 13%

342 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Contains basic C functions which usually contain performance-critical code.
Keeping this code separate from the beginning makes it easier to out-source
it into C later, if required"""

import zlib
from gitdb.util import byte_ord
decompressobj = zlib.decompressobj

import mmap
from itertools import islice
from functools import reduce

from gitdb.const import NULL_BYTE, BYTE_SPACE
from gitdb.utils.encoding import force_text
from gitdb.typ import (
    str_blob_type,
    str_commit_type,
    str_tree_type,
    str_tag_type,
)

from io import BytesIO

# INVARIANTS
OFS_DELTA = 6
REF_DELTA = 7
delta_types = (OFS_DELTA, REF_DELTA)

type_id_to_type_map = {
    0: b'',                  # EXT 1
    1: str_commit_type,
    2: str_tree_type,
    3: str_blob_type,
    4: str_tag_type,
    5: b'',                  # EXT 2
    OFS_DELTA: "OFS_DELTA",  # OFFSET DELTA
    REF_DELTA: "REF_DELTA"   # REFERENCE DELTA
}

type_to_type_id_map = {
    str_commit_type: 1,
    str_tree_type: 2,
    str_blob_type: 3,
    str_tag_type: 4,
    "OFS_DELTA": OFS_DELTA,
    "REF_DELTA": REF_DELTA,
}

# used when dealing with larger streams
chunk_size = 1000 * mmap.PAGESIZE

__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info',
           'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data',
           'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header')


#{ Structures

def _set_delta_rbound(d, size):
    """Truncate the given delta to the given size
    :param size: size relative to our target offset; may not be 0 and must be smaller
        than or equal to our size
    :return: d"""
    d.ts = size

    # NOTE: data is truncated automatically when applying the delta
    # MUST NOT DO THIS HERE
    return d


def _move_delta_lbound(d, bytes):
    """Move the delta by the given amount of bytes, reducing its size so that its
    right bound stays static
    :param bytes: amount of bytes to move, must be smaller than delta size
    :return: d"""
    if bytes == 0:
        return

    d.to += bytes
    d.so += bytes
    d.ts -= bytes
    if d.data is not None:
        d.data = d.data[bytes:]
    # END handle data

    return d


def delta_duplicate(src):
    return DeltaChunk(src.to, src.ts, src.so, src.data)


def delta_chunk_apply(dc, bbuf, write):
    """Apply own data to the target buffer
    :param bbuf: buffer providing source bytes for copy operations
    :param write: write method to call with data to write"""
    if dc.data is None:
        # COPY DATA FROM SOURCE
        write(bbuf[dc.so:dc.so + dc.ts])
    else:
        # APPEND DATA
        # what's faster: if + 4 function calls or just a write with a slice ?
        # Considering data can be larger than 127 bytes now, it should be worth it
        if dc.ts < len(dc.data):
            write(dc.data[:dc.ts])
        else:
            write(dc.data)
        # END handle truncation
    # END handle chunk mode


class DeltaChunk:

    """Represents a piece of a delta; it can either add new data, or copy existing
    data from a source buffer"""
    __slots__ = (
        'to',    # start offset in the target buffer in bytes
        'ts',    # size of this chunk in the target buffer in bytes
        'so',    # start offset in the source buffer in bytes or None
        'data',  # chunk of bytes to be added to the target buffer,
                 # DeltaChunkList to use as base, or None
    )

    def __init__(self, to, ts, so, data):
        self.to = to
        self.ts = ts
        self.so = so
        self.data = data

    def __repr__(self):
        return "DeltaChunk(%i, %i, %s, %s)" % (self.to, self.ts, self.so, self.data or "")

    #{ Interface

    def rbound(self):
        return self.to + self.ts

    def has_data(self):
        """:return: True if the instance has data to add to the target stream"""
        return self.data is not None

    #} END interface
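
# Illustrative sketch (not part of the original module): a chunk without data copies
# dc.ts bytes from the base buffer at dc.so, while a chunk carrying data appends its
# own bytes; applying both in target order rebuilds the target buffer.
#
#   out = []
#   base = b"0123456789"
#   delta_chunk_apply(DeltaChunk(to=0, ts=4, so=2, data=None), base, out.append)    # copy b"2345"
#   delta_chunk_apply(DeltaChunk(to=4, ts=3, so=0, data=b"abc"), base, out.append)  # add  b"abc"
#   assert b"".join(out) == b"2345abc"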


def _closest_index(dcl, absofs):
    """:return: index at which the given absofs should be inserted. The index points
    to the DeltaChunk with a target buffer absofs that equals or is greater than
    absofs.
    **Note:** global method for performance only, it belongs to DeltaChunkList"""
    lo = 0
    hi = len(dcl)
    while lo < hi:
        mid = (lo + hi) // 2
        dc = dcl[mid]
        if dc.to > absofs:
            hi = mid
        elif dc.rbound() > absofs or dc.to == absofs:
            return mid
        else:
            lo = mid + 1
        # END handle bound
    # END for each delta absofs
    return len(dcl) - 1
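
# Illustrative sketch (hypothetical chunk list): binary search for the chunk that
# covers a given absolute target offset.
#
#   dcl = [DeltaChunk(0, 4, 0, None), DeltaChunk(4, 3, 0, None), DeltaChunk(7, 5, 0, None)]
#   assert _closest_index(dcl, 0) == 0    # exact start of the first chunk
#   assert _closest_index(dcl, 5) == 1    # offset 5 falls inside the second chunk [4, 7)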


def delta_list_apply(dcl, bbuf, write):
    """Apply the chain's changes and write the final result using the passed
    write function.
    :param bbuf: base buffer containing the base of all deltas contained in this
        list. It will only be used if the chunk in question does not have a base
        chain.
    :param write: function taking a string of bytes to write to the output"""
    for dc in dcl:
        delta_chunk_apply(dc, bbuf, write)
    # END for each dc


def delta_list_slice(dcl, absofs, size, ndcl):
    """Append the subsection of this list which starts at the given absolute offset
    and spans the given size in bytes onto ndcl.
    :return: None"""
    cdi = _closest_index(dcl, absofs)  # delta start index
    cd = dcl[cdi]
    slen = len(dcl)
    lappend = ndcl.append

    if cd.to != absofs:
        tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
        _move_delta_lbound(tcd, absofs - cd.to)
        tcd.ts = min(tcd.ts, size)
        lappend(tcd)
        size -= tcd.ts
        cdi += 1
    # END lbound overlap handling

    while cdi < slen and size:
        # are we larger than the current block
        cd = dcl[cdi]
        if cd.ts <= size:
            lappend(DeltaChunk(cd.to, cd.ts, cd.so, cd.data))
            size -= cd.ts
        else:
            tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
            tcd.ts = size
            lappend(tcd)
            size -= tcd.ts
            break
        # END handle size
        cdi += 1
    # END for each chunk


class DeltaChunkList(list):

    """List with special functionality to deal with DeltaChunks.
    There are two types of lists we represent. One was created bottom-up, working
    towards the latest delta; the other kind was created top-down, working from the
    latest delta down to the earliest ancestor. This attribute is queryable
    after all processing with is_reversed."""

    __slots__ = tuple()

    def rbound(self):
        """:return: rightmost extent in bytes, absolute"""
        if len(self) == 0:
            return 0
        return self[-1].rbound()

    def lbound(self):
        """:return: leftmost byte at which this chunklist starts"""
        if len(self) == 0:
            return 0
        return self[0].to

    def size(self):
        """:return: size in bytes as measured by our delta chunks"""
        return self.rbound() - self.lbound()

    def apply(self, bbuf, write):
        """Only used by public clients, internally we only use the global routines
        for performance"""
        return delta_list_apply(self, bbuf, write)

    def compress(self):
        """Alter the list to reduce the amount of nodes. Currently we concatenate
        add-chunks
        :return: self"""
        slen = len(self)
        if slen < 2:
            return self
        i = 0

        first_data_index = None
        while i < slen:
            dc = self[i]
            i += 1
            if dc.data is None:
                if first_data_index is not None and i - 2 - first_data_index > 1:
                    # if first_data_index is not None:
                    nd = BytesIO()                  # new data
                    so = self[first_data_index].to  # start offset in target buffer
                    for x in range(first_data_index, i - 1):
                        xdc = self[x]
                        nd.write(xdc.data[:xdc.ts])
                    # END collect data

                    del(self[first_data_index:i - 1])
                    buf = nd.getvalue()
                    self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf))

                    slen = len(self)
                    i = first_data_index + 1

                # END concatenate data
                first_data_index = None
                continue
            # END skip non-data chunks

            if first_data_index is None:
                first_data_index = i - 1
        # END iterate list

        # if slen_orig != len(self):
        #     print("INFO: Reduced delta list len to %f %% of former size" % ((float(len(self)) / slen_orig) * 100))
        return self

    def check_integrity(self, target_size=-1):
        """Verify the list has non-overlapping chunks only, and the total size matches
        target_size
        :param target_size: if not -1, the total size of the chain must be target_size
        :raise AssertionError: if the size doesn't match"""
        if target_size > -1:
            assert self[-1].rbound() == target_size
            assert reduce(lambda x, y: x + y, (d.ts for d in self), 0) == target_size
        # END target size verification

        if len(self) < 2:
            return

        # check data
        for dc in self:
            assert dc.ts > 0
            if dc.has_data():
                assert len(dc.data) >= dc.ts
        # END for each dc

        left = islice(self, 0, len(self) - 1)
        right = iter(self)
        next(right)
        # this is very pythonic - we might have just used index based access here,
        # but this could actually be faster
        for lft, rgt in zip(left, right):
            assert lft.rbound() == rgt.to
            assert lft.to + lft.ts == rgt.to
        # END for each pair


class TopdownDeltaChunkList(DeltaChunkList):

    """Represents a list which is generated by feeding its ancestor streams one by
    one"""
    __slots__ = tuple()

    def connect_with_next_base(self, bdcl):
        """Connect this chain with the next level of our base delta chunklist.
        The goal in this game is to mark as many of our chunks rigid, hence they
        cannot be changed by any of the upcoming bases anymore. Once all our
        chunks are marked like that, we can stop all processing
        :param bdcl: data chunk list being one of our bases. They must be fed in
            consecutively and in order, towards the earliest ancestor delta
        :return: True if processing was done. Use it to abort processing of
            remaining streams if False is returned"""
        nfc = 0           # number of frozen chunks
        dci = 0           # delta chunk index
        slen = len(self)  # len of self
        ccl = list()      # temporary list
        while dci < slen:
            dc = self[dci]
            dci += 1

            # all add-chunks which are already topmost don't need additional processing
            if dc.data is not None:
                nfc += 1
                continue
            # END skip add chunks

            # copy chunks
            # integrate the portion of the base list into ourselves. Lists
            # don't support efficient insertion (just one at a time), but for now
            # we live with it. Internally, it's all just a 32/64 bit pointer, and
            # the portions of moved memory should be smallish. Maybe we just rebuild
            # ourselves in order to reduce the amount of insertions ...
            del(ccl[:])
            delta_list_slice(bdcl, dc.so, dc.ts, ccl)

            # move the target bounds into place to match with our chunk
            ofs = dc.to - dc.so
            for cdc in ccl:
                cdc.to += ofs
            # END update target bounds

            if len(ccl) == 1:
                self[dci - 1] = ccl[0]
            else:
                # maybe try to compute the expenses here, and pick the right algorithm
                # It would normally be faster than copying everything physically though
                # TODO: Use a deque here, and decide by the index whether to extend
                # or extend left !
                post_dci = self[dci:]
                del(self[dci - 1:])  # include deletion of dc
                self.extend(ccl)
                self.extend(post_dci)

                slen = len(self)
                dci += len(ccl) - 1  # deleted dc, added rest

            # END handle chunk replacement
        # END for each chunk

        if nfc == slen:
            return False
        # END handle completeness
        return True


#} END structures

#{ Routines

def is_loose_object(m):
    """
    :return: True if the file contained in memory map m appears to be a loose object.
        Only the first two bytes are needed"""
    b0, b1 = map(byte_ord, m[:2])
    word = (b0 << 8) + b1
    return b0 == 0x78 and (word % 31) == 0
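
# Illustrative check (hypothetical data): a zlib stream starts with 0x78 and its first
# two bytes, read as a big-endian 16 bit value, are divisible by 31.
#
#   assert is_loose_object(bytes([0x78, 0x9c]))   # (0x78 << 8) + 0x9c == 30876 == 31 * 996
#   assert not is_loose_object(b"PK\x03\x04")     # a zip archive is not a loose object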


def loose_object_header_info(m):
    """
    :return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
        object as well as its uncompressed size in bytes.
    :param m: memory map from which to read the compressed object data"""
    decompress_size = 8192  # is used in cgit as well
    hdr = decompressobj().decompress(m, decompress_size)
    type_name, size = hdr[:hdr.find(NULL_BYTE)].split(BYTE_SPACE)

    return type_name, int(size)
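
# Illustrative round trip (hypothetical data): a loose object is the zlib-compressed
# concatenation of b"<type> <size>\0" and the payload, so only the header needs to be
# inflated to learn type and size.
#
#   assert loose_object_header_info(zlib.compress(b"blob 11\x00hello world")) == (b"blob", 11)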


def pack_object_header_info(data):
    """
    :return: tuple(type_id, uncompressed_size_in_bytes, byte_offset)
        The type_id should be interpreted according to the ``type_id_to_type_map`` map.
        The byte offset specifies the start of the actual zlib compressed data stream.
    :param data: random-access memory, like a string or memory map"""
    c = byte_ord(data[0])   # first byte
    i = 1                   # next char to read
    type_id = (c >> 4) & 7  # numeric type
    size = c & 15           # starting size
    s = 4                   # starting bit-shift size
    while c & 0x80:
        c = byte_ord(data[i])
        i += 1
        size += (c & 0x7f) << s
        s += 7
    # END character loop
    # end performance at expense of maintenance ...
    return (type_id, size, i)
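
# Illustrative decode (hypothetical header bytes): 0x94 is 1001_0100, i.e. MSB set
# (another size byte follows), type 001 (commit) and low size nibble 4; 0x01 adds
# 1 << 4 = 16, so the entry is a commit of 20 uncompressed bytes starting at offset 2.
#
#   assert pack_object_header_info(bytes([0x94, 0x01])) == (1, 20, 2)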


def create_pack_object_header(obj_type, obj_size):
    """
    :return: bytearray defining the pack header comprised of the object type
        and its uncompressed size in bytes

    :param obj_type: pack type_id of the object
    :param obj_size: uncompressed size in bytes of the following object stream"""
    c = 0              # 1 byte
    hdr = bytearray()  # output bytes

    c = (obj_type << 4) | (obj_size & 0xf)
    obj_size >>= 4
    while obj_size:
        hdr.append(c | 0x80)
        c = obj_size & 0x7f
        obj_size >>= 7
    # END until size is consumed
    hdr.append(c)
    # end handle interpreter
    return hdr
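
# Illustrative encode (hypothetical values), mirroring pack_object_header_info above:
#
#   assert bytes(create_pack_object_header(1, 20)) == bytes([0x94, 0x01])  # commit, 20 bytes
#   assert bytes(create_pack_object_header(3, 5)) == bytes([0x35])         # blob, 5 bytes, fits one byte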


def msb_size(data, offset=0):
    """
    :return: tuple(end_offset, size) -- read the MSB size from the given random
        access data starting at the given byte offset; end_offset is the offset of
        the first byte after the size"""
    size = 0
    i = 0
    l = len(data)
    hit_msb = False
    while i < l:
        c = data[i + offset]
        size |= (c & 0x7f) << i * 7
        i += 1
        if not c & 0x80:
            hit_msb = True
            break
        # END check msb bit
    # END while in range
    # end performance ...
    if not hit_msb:
        raise AssertionError("Could not find terminating MSB byte in data stream")
    return i + offset, size
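
# Illustrative decode (hypothetical bytes): sizes are stored little-endian in groups of
# 7 bits, with the high bit marking continuation.
#
#   assert msb_size(bytes([0x05])) == (1, 5)           # single byte, MSB clear
#   assert msb_size(bytes([0x91, 0x02])) == (2, 273)   # 0x11 + (0x02 << 7)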


def loose_object_header(type, size):
    """
    :return: bytes representing the loose object header, which is immediately
        followed by the content stream of size 'size'"""
    return ('%s %i\0' % (force_text(type), size)).encode('ascii')
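
# Illustrative result (hypothetical values):
#
#   assert loose_object_header(b"blob", 11) == b"blob 11\x00"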


def write_object(type, size, read, write, chunk_size=chunk_size):
    """
    Write the object as identified by type, size and source_stream into the
    target_stream

    :param type: type string of the object
    :param size: amount of bytes to write from source_stream
    :param read: read method of a stream providing the content data
    :param write: write method of the output stream
    :return: The actual amount of bytes written to the stream, which includes the header"""
    tbw = 0  # total num bytes written

    # WRITE HEADER: type SP size NULL
    tbw += write(loose_object_header(type, size))
    tbw += stream_copy(read, write, size, chunk_size)

    return tbw


def stream_copy(read, write, size, chunk_size):
    """
    Copy a stream up to size bytes using the provided read and write methods,
    in chunks of chunk_size

    **Note:** it's much like the stream_copy utility, but operates just using methods"""
    dbw = 0  # num data bytes written

    # WRITE ALL DATA UP TO SIZE
    while True:
        cs = min(chunk_size, size - dbw)
        # NOTE: not all write methods return the amount of written bytes, like
        # mmap.write. It's bad, but we just deal with it ... perhaps it's not
        # even less efficient
        # data_len = write(read(cs))
        # dbw += data_len
        data = read(cs)
        data_len = len(data)
        dbw += data_len
        write(data)
        if data_len < cs or dbw == size:
            break
        # END check for stream end
    # END duplicate data
    return dbw
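
# Illustrative sketch (hypothetical streams): copying 10 bytes between two in-memory
# streams in chunks of 4 bytes.
#
#   from io import BytesIO
#   src, dst = BytesIO(b"x" * 10), BytesIO()
#   assert stream_copy(src.read, dst.write, 10, 4) == 10   # copied as 4 + 4 + 2 bytes
#   assert dst.getvalue() == b"x" * 10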


def connect_deltas(dstreams):
    """
    Read the condensed delta chunk information from dstreams and merge their information
    into a list of existing delta chunks

    :param dstreams: iterable of delta stream objects, the delta to be applied last
        comes first, then all its ancestors in order
    :return: DeltaChunkList, containing all operations to apply"""
    tdcl = None  # topmost dcl

    dcl = tdcl = TopdownDeltaChunkList()
    for dsi, ds in enumerate(dstreams):
        # print("Stream", dsi)
        db = ds.read()
        delta_buf_size = ds.size

        # read header
        i, base_size = msb_size(db)
        i, target_size = msb_size(db, i)

        # interpret opcodes
        tbw = 0  # amount of target bytes written
        while i < delta_buf_size:
            c = db[i]
            i += 1
            if c & 0x80:
                cp_off, cp_size = 0, 0
                if (c & 0x01):
                    cp_off = db[i]
                    i += 1
                if (c & 0x02):
                    cp_off |= (db[i] << 8)
                    i += 1
                if (c & 0x04):
                    cp_off |= (db[i] << 16)
                    i += 1
                if (c & 0x08):
                    cp_off |= (db[i] << 24)
                    i += 1
                if (c & 0x10):
                    cp_size = db[i]
                    i += 1
                if (c & 0x20):
                    cp_size |= (db[i] << 8)
                    i += 1
                if (c & 0x40):
                    cp_size |= (db[i] << 16)
                    i += 1

                if not cp_size:
                    cp_size = 0x10000

                rbound = cp_off + cp_size
                if (rbound < cp_size or
                        rbound > base_size):
                    break

                dcl.append(DeltaChunk(tbw, cp_size, cp_off, None))
                tbw += cp_size
            elif c:
                # NOTE: in C, the data chunks should probably be concatenated here.
                # In python, we do it as a post-process
                dcl.append(DeltaChunk(tbw, c, 0, db[i:i + c]))
                i += c
                tbw += c
            else:
                raise ValueError("unexpected delta opcode 0")
            # END handle command byte
        # END while processing delta data

        dcl.compress()

        # merge the lists !
        if dsi > 0:
            if not tdcl.connect_with_next_base(dcl):
                break
        # END handle merge

        # prepare next base
        dcl = DeltaChunkList()
    # END for each delta stream

    return tdcl


def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write):
    """
    Apply data from a delta buffer using a source buffer to the target file

    :param src_buf: random access data from which the delta was created
    :param src_buf_size: size of the source buffer in bytes
    :param delta_buf: random access delta data
    :param delta_buf_size: size of the delta buffer in bytes
    :param write: write method taking a chunk of bytes

    **Note:** transcribed to python from the similar routine in patch-delta.c"""
    i = 0
    db = delta_buf
    while i < delta_buf_size:
        c = db[i]
        i += 1
        if c & 0x80:
            cp_off, cp_size = 0, 0
            if (c & 0x01):
                cp_off = db[i]
                i += 1
            if (c & 0x02):
                cp_off |= (db[i] << 8)
                i += 1
            if (c & 0x04):
                cp_off |= (db[i] << 16)
                i += 1
            if (c & 0x08):
                cp_off |= (db[i] << 24)
                i += 1
            if (c & 0x10):
                cp_size = db[i]
                i += 1
            if (c & 0x20):
                cp_size |= (db[i] << 8)
                i += 1
            if (c & 0x40):
                cp_size |= (db[i] << 16)
                i += 1

            if not cp_size:
                cp_size = 0x10000

            rbound = cp_off + cp_size
            if (rbound < cp_size or
                    rbound > src_buf_size):
                break
            write(src_buf[cp_off:cp_off + cp_size])
        elif c:
            write(db[i:i + c])
            i += c
        else:
            raise ValueError("unexpected delta opcode 0")
        # END handle command byte
    # END while processing delta data

    # yes, lets use the exact same error message that git uses :)
    assert i == delta_buf_size, "delta replay has gone wild"
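
# Illustrative sketch (hypothetical delta): one copy opcode (0x91: copy, with one offset
# byte and one size byte) followed by one literal-add opcode (0x01: add the next byte).
# Note the buffer starts directly at the opcodes, i.e. after the two msb_size headers
# of a raw git delta.
#
#   out = []
#   delta = bytes([0x91, 0x06, 0x05, 0x01]) + b"!"   # copy src[6:11], then add b"!"
#   apply_delta_data(b"hello world", 11, delta, len(delta), out.append)
#   assert b"".join(out) == b"world!"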


def is_equal_canonical_sha(canonical_length, match, sha1):
    """
    :return: True if the given binary shas are equal
        The comparison will take the canonical_length of the match sha into account,
        hence it will only use the high nibble of the last byte for uneven canonical
        representations
    :param match: less than 20 byte sha
    :param sha1: 20 byte sha"""
    binary_length = canonical_length // 2
    if match[:binary_length] != sha1[:binary_length]:
        return False

    if canonical_length - binary_length and \
            (byte_ord(match[-1]) ^ byte_ord(sha1[len(match) - 1])) & 0xf0:
        return False
    # END handle uneven canonical length
    return True
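
# Illustrative sketch (hypothetical shas): comparing an abbreviated sha of odd hex
# length ("12345", padded to the 3 bytes 0x12 0x34 0x50) against a full 20 byte sha.
#
#   full = bytes.fromhex("1234567890" * 4)
#   assert is_equal_canonical_sha(5, bytes.fromhex("123450"), full)        # "12345" matches
#   assert not is_equal_canonical_sha(5, bytes.fromhex("123460"), full)    # "12346" does not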


#} END routines


try:
    from gitdb_speedups._perf import connect_deltas
except ImportError:
    pass