Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/numpy/lib/_datasource.py: 18%
176 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""A file interface for handling local and remote data files.
3The goal of datasource is to abstract some of the file system operations
4when dealing with data files so the researcher doesn't have to know all the
5low-level details. Through datasource, a researcher can obtain and use a
6file with one function call, regardless of location of the file.
8DataSource is meant to augment standard python libraries, not replace them.
9It should work seamlessly with standard file IO operations and the os
10module.
12DataSource files can originate locally or remotely:
14- local files : '/home/guido/src/local/data.txt'
15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
17DataSource files can also be compressed or uncompressed. Currently only
18gzip, bz2 and xz are supported.
20Example::
22 >>> # Create a DataSource, use os.curdir (default) for local storage.
23 >>> from numpy import DataSource
24 >>> ds = DataSource()
25 >>>
26 >>> # Open a remote file.
27 >>> # DataSource downloads the file, stores it locally in:
28 >>> # './www.google.com/index.html'
29 >>> # opens the file and returns a file object.
30 >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP
31 >>>
32 >>> # Use the file as you normally would
33 >>> fp.read() # doctest: +SKIP
34 >>> fp.close() # doctest: +SKIP
36"""
37import os
38import io
40from numpy.core.overrides import set_module
43_open = open
46def _check_mode(mode, encoding, newline):
47 """Check mode and that encoding and newline are compatible.
49 Parameters
50 ----------
51 mode : str
52 File open mode.
53 encoding : str
54 File encoding.
55 newline : str
56 Newline for text files.
58 """
59 if "t" in mode:
60 if "b" in mode:
61 raise ValueError("Invalid mode: %r" % (mode,))
62 else:
63 if encoding is not None:
64 raise ValueError("Argument 'encoding' not supported in binary mode")
65 if newline is not None:
66 raise ValueError("Argument 'newline' not supported in binary mode")
69# Using a class instead of a module-level dictionary
70# to reduce the initial 'import numpy' overhead by
71# deferring the import of lzma, bz2 and gzip until needed
73# TODO: .zip support, .tar support?
74class _FileOpeners:
75 """
76 Container for different methods to open (un-)compressed files.
78 `_FileOpeners` contains a dictionary that holds one method for each
79 supported file format. Attribute lookup is implemented in such a way
80 that an instance of `_FileOpeners` itself can be indexed with the keys
81 of that dictionary. Currently uncompressed files as well as files
82 compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.
84 Notes
85 -----
86 `_file_openers`, an instance of `_FileOpeners`, is made available for
87 use in the `_datasource` module.
89 Examples
90 --------
91 >>> import gzip
92 >>> np.lib._datasource._file_openers.keys()
93 [None, '.bz2', '.gz', '.xz', '.lzma']
94 >>> np.lib._datasource._file_openers['.gz'] is gzip.open
95 True
97 """
99 def __init__(self):
100 self._loaded = False
101 self._file_openers = {None: io.open}
103 def _load(self):
104 if self._loaded:
105 return
107 try:
108 import bz2
109 self._file_openers[".bz2"] = bz2.open
110 except ImportError:
111 pass
113 try:
114 import gzip
115 self._file_openers[".gz"] = gzip.open
116 except ImportError:
117 pass
119 try:
120 import lzma
121 self._file_openers[".xz"] = lzma.open
122 self._file_openers[".lzma"] = lzma.open
123 except (ImportError, AttributeError):
124 # There are incompatible backports of lzma that do not have the
125 # lzma.open attribute, so catch that as well as ImportError.
126 pass
128 self._loaded = True
130 def keys(self):
131 """
132 Return the keys of currently supported file openers.
134 Parameters
135 ----------
136 None
138 Returns
139 -------
140 keys : list
141 The keys are None for uncompressed files and the file extension
142 strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
143 methods.
145 """
146 self._load()
147 return list(self._file_openers.keys())
149 def __getitem__(self, key):
150 self._load()
151 return self._file_openers[key]
153_file_openers = _FileOpeners()
def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    A URL given as ``path`` is downloaded into the `DataSource` `destpath`
    directory first and then opened from that local copy.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `io.open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    Shorthand for building a `DataSource` on `destpath` and calling its
    ``open`` method with the remaining arguments.

    """
    source = DataSource(destpath)
    opened = source.open(path, mode, encoding=encoding, newline=newline)
    return opened
@set_module('numpy')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs. The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            # Any falsy destpath (None, '') selects a fresh temporary
            # directory, which `__del__` removes again.
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories. The attribute may be missing when
        # __init__ failed before it was assigned, hence the default.
        if getattr(self, '_istmpdest', False):
            import shutil

            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename has a supported compression extension.

        Only the file extension is inspected; the file is not opened.

        """
        fname, ext = os.path.splitext(filename)
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing.

        Only the characters ``"w"`` and ``"+"`` are treated as write
        indicators.

        """
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : {tuple}
            `zip_ext` includes the leading dot (e.g. ``'.gz'``) and is None
            when `filename` has no supported compression extension.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The list starts with `filename` itself. Unless `filename` already
        carries a supported compression extension, one entry per supported
        extension is appended.

        """
        names = [filename]
        if not self._iszip(filename):
            for zipext in _file_openers.keys():
                # Skip the None key that stands for uncompressed files.
                if zipext:
                    names.append(filename+zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location. Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        # We import these here because importing them is slow and
        # a significant fraction of numpy's total import time.
        import shutil
        from urllib.request import urlopen

        upath = self.abspath(path)

        # Ensure the directory exists. exist_ok avoids failing when the
        # directory is created between a separate check and the creation.
        os.makedirs(os.path.dirname(upath), exist_ok=True)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl:
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found. None is returned when no
        candidate exists.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                # A candidate that is still a URL has to be downloaded
                # before it can be opened.
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        # Both components are sanitised so the joined result stays below
        # self._destpath.
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a pass leaves the path unchanged, because each
        # stripping step can expose a new leading separator or dot.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            # str.lstrip removes leading characters from the given set, so
            # every leading '.' is dropped here.
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                # Opening the URL is the whole test; the response is closed
                # again as soon as the block exits.
                with urlopen(path):
                    return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` requests writing.
        FileNotFoundError
            If `path` (or a compressed variant of it) cannot be found.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise FileNotFoundError(f"{path} not found.")

        _fname, ext = self._splitzipext(found)
        # The extension returned by os.path.splitext keeps its leading dot,
        # and str.replace returns a new string, so the result has to be
        # compared against '.bz2' and assigned back for the "+" to be
        # dropped from the mode handed to the bz2 opener.
        if ext == '.bz2':
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)
class Repository (DataSource):
    """
    Repository(baseurl, destpath='.')

    A data repository where multiple DataSource's share a base
    URL/directory.

    `Repository` builds on `DataSource` and puts a base URL (or directory)
    in front of every file it handles. It is meant for working with several
    files that live under one base location: create the `Repository` with
    that base, then address each file by its filename only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Set up the DataSource part, then remember the shared `baseurl`."""
        DataSource.__init__(self, destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        # Cleanup is entirely delegated to DataSource.
        DataSource.__del__(self)

    def _fullpath(self, path):
        """Return `path` with baseurl in front unless it already contains it."""
        pieces = path.split(self._baseurl, 2)
        if len(pieces) != 1:
            # baseurl occurs somewhere in path: use path as given.
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Run the DataSource search on ``path`` expanded with baseurl."""
        expanded = self._fullpath(path)
        return DataSource._findfile(self, expanded)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        For an URL, the result is the location where the file exists
        locally, or where it would be placed once opened through the
        `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        expanded = self._fullpath(path)
        return DataSource.abspath(self, expanded)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        `path` is looked for as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        For an URL, True is returned both when the file is stored locally
        in the `DataSource` directory and when the remote URL is valid.
        `DataSource` does not discriminate between the two, the file is
        accessible if it exists in either location.

        """
        expanded = self._fullpath(path)
        return DataSource.exists(self, expanded)

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        An URL is downloaded, stored in the DataSource directory and
        opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        expanded = self._fullpath(path)
        return DataSource.open(self, expanded, mode,
                               encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository base is an URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        base = self._baseurl
        if not self._isurl(base):
            return os.listdir(base)
        raise NotImplementedError(
            "Directory listing of URLs, not supported yet.")