Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/numpy/lib/_datasource.py: 18%

176 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1"""A file interface for handling local and remote data files. 

2 

3The goal of datasource is to abstract some of the file system operations 

4when dealing with data files so the researcher doesn't have to know all the 

5low-level details. Through datasource, a researcher can obtain and use a 

6file with one function call, regardless of location of the file. 

7 

8DataSource is meant to augment standard python libraries, not replace them. 

9It should work seamlessly with standard file IO operations and the os 

10module. 

11 

12DataSource files can originate locally or remotely: 

13 

14- local files : '/home/guido/src/local/data.txt' 

15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt' 

16 

17DataSource files can also be compressed or uncompressed. Currently only 

18gzip, bz2 and xz are supported. 

19 

20Example:: 

21 

22 >>> # Create a DataSource, use os.curdir (default) for local storage. 

23 >>> from numpy import DataSource 

24 >>> ds = DataSource() 

25 >>> 

26 >>> # Open a remote file. 

27 >>> # DataSource downloads the file, stores it locally in: 

28 >>> # './www.google.com/index.html' 

29 >>> # opens the file and returns a file object. 

30 >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP 

31 >>> 

32 >>> # Use the file as you normally would 

33 >>> fp.read() # doctest: +SKIP 

34 >>> fp.close() # doctest: +SKIP 

35 

36""" 

37import os 

38import io 

39 

40from numpy.core.overrides import set_module 

41 

42 

43_open = open 

44 

45 

def _check_mode(mode, encoding, newline):
    """Check mode and that encoding and newline are compatible.

    A mode containing ``"t"`` is treated as text mode. Any mode without
    ``"t"`` is treated as binary mode, in which `encoding` and `newline`
    must both be None.

    Parameters
    ----------
    mode : str
        File open mode.
    encoding : str or None
        File encoding.
    newline : str or None
        Newline for text files.

    Raises
    ------
    ValueError
        If `mode` contains both ``"t"`` and ``"b"``, or if `encoding` or
        `newline` is not None while `mode` has no ``"t"``.

    """
    if "t" in mode:
        # Text mode: the only invalid combination is mixing in "b".
        if "b" in mode:
            raise ValueError(f"Invalid mode: {mode!r}")
        return

    # No "t" flag: text-only arguments are rejected.
    if encoding is not None:
        raise ValueError("Argument 'encoding' not supported in binary mode")
    if newline is not None:
        raise ValueError("Argument 'newline' not supported in binary mode")

67 

68 

# Using a class instead of a module-level dictionary
# to reduce the initial 'import numpy' overhead by
# deferring the import of lzma, bz2 and gzip until needed

# TODO: .zip support, .tar support?
class _FileOpeners:
    """
    Container for different methods to open (un-)compressed files.

    `_FileOpeners` contains a dictionary that holds one method for each
    supported file format. Attribute lookup is implemented in such a way
    that an instance of `_FileOpeners` itself can be indexed with the keys
    of that dictionary. Currently uncompressed files as well as files
    compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    The compression modules are imported lazily, on the first call to
    `keys` or the first item lookup. A module that cannot be imported is
    skipped, so its extensions are simply absent from the mapping.

    Examples
    --------
    >>> import gzip
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz', '.xz', '.lzma']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # Only the opener for uncompressed files is registered eagerly.
        self._loaded = False
        self._file_openers = {None: io.open}

    def _load(self):
        """Import the compression modules once and register their openers."""
        if self._loaded:
            return

        try:
            import bz2
        except ImportError:
            pass
        else:
            self._file_openers[".bz2"] = bz2.open

        try:
            import gzip
        except ImportError:
            pass
        else:
            self._file_openers[".gz"] = gzip.open

        try:
            import lzma
            lzma_open = lzma.open
        except (ImportError, AttributeError):
            # There are incompatible backports of lzma that do not have the
            # lzma.open attribute, so catch that as well as ImportError.
            pass
        else:
            self._file_openers[".xz"] = lzma_open
            self._file_openers[".lzma"] = lzma_open

        self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Parameters
        ----------
        None

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers)

    def __getitem__(self, key):
        """Return the opener registered for `key`; raise KeyError if none."""
        self._load()
        return self._file_openers[key]


# Shared module-level instance used by the rest of this module.
_file_openers = _FileOpeners()

154 

def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    A URL `path` is downloaded first, stored below the `destpath`
    directory of a `DataSource`, and then opened from that local copy.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `io.open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    This is a convenience function: it builds a `DataSource` for
    `destpath` and hands back the result of that instance's ``open``
    method. The `DataSource` is local to this call.

    """
    source = DataSource(destpath)
    return source.open(path, mode, encoding=encoding, newline=newline)

194 

195 

@set_module('numpy')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  The attribute may be missing if
        # __init__ did not run to completion, hence the getattr default.
        if getattr(self, '_istmpdest', False):
            import shutil

            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        """
        ext = os.path.splitext(filename)[1]
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing."""

        # Currently only used to test the bz2 files.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : {tuple}
            `zip_ext` includes the leading dot (e.g. ``'.gz'``) and is None
            when `filename` has no recognised compression extension.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The list always starts with `filename` itself.  When `filename`
        does not already carry a known compression extension, one entry
        per supported extension is appended.
        """
        names = [filename]
        if not self._iszip(filename):
            # The None key stands for "uncompressed" and adds no suffix.
            names.extend(filename + zipext
                         for zipext in _file_openers.keys() if zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test for
        #       for compressed versions of files.

        parsed = urlparse(path)
        return bool(parsed.scheme and parsed.netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        # We import these here because importing them is slow and
        # a significant fraction of numpy's total import time.
        import shutil
        from urllib.request import urlopen

        upath = self.abspath(path)

        # ensure directory exists (exist_ok avoids a check-then-create race)
        os.makedirs(os.path.dirname(upath), exist_ok=True)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl, _open(upath, 'wb') as f:
                shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  None is returned when no
        candidate exists.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        parsed = urlparse(path)
        netloc = self._sanitize_relative_path(parsed.netloc)
        upath = self._sanitize_relative_path(parsed.path)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a full pass leaves the path unchanged.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            # We import this here because importing urllib is slow and
            # a significant fraction of numpy's total import time.
            from urllib.request import urlopen
            from urllib.error import URLError

            try:
                # The handle is only opened to probe reachability.
                with urlopen(path):
                    return True
            except URLError:
                return False
        return False

    @staticmethod
    def _adjust_mode_for_ext(ext, mode):
        """Return `mode` adapted to the opener selected by `ext`.

        `ext` is the extension as returned by `_splitzipext`, i.e. with
        the leading dot or None.  For ``'.bz2'`` every ``'+'`` is removed
        from `mode`, since `bz2.open` does not accept update modes; all
        other extensions leave `mode` untouched.
        """
        if ext == '.bz2':
            return mode.replace("+", "")
        return mode

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` contains ``'w'`` or ``'+'``.
        FileNotFoundError
            If neither `path` nor a compressed variant of it can be found.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise FileNotFoundError(f"{path} not found.")

        _fname, ext = self._splitzipext(found)
        mode = self._adjust_mode_for_ext(ext, mode)
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)

534 

535 

class Repository (DataSource):
    """
    Repository(baseurl, destpath='.')

    A data repository where multiple DataSource's share a base
    URL/directory.

    `Repository` extends `DataSource` by prepending a base URL (or
    directory) to all the files it handles. Use `Repository` when you will
    be working with multiple files from one base URL.  Initialize
    `Repository` with the base URL, then refer to each file by its filename
    only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        super().__init__(destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        super().__del__()

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary.

        `path` is returned unchanged when it already contains the base
        URL; otherwise the base URL is joined in front of it.
        """
        # A plain substring test also copes with an empty base URL, for
        # which str.split would raise ValueError (empty separator).
        if self._baseurl in path:
            return path  # path contains baseurl already
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        return super()._findfile(self._fullpath(path))

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        return super().abspath(self._fullpath(path))

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        return super().exists(self._fullpath(path))

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        return super().open(self._fullpath(path), mode,
                            encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the base URL of the repository is a remote URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if self._isurl(self._baseurl):
            raise NotImplementedError(
                  "Directory listing of URLs, not supported yet.")
        return os.listdir(self._baseurl)