Improve compatibility of FilePath with GDAL's VSI system (#2856)

* Improve compatibility of FilePath with GDAL's VSI system

Resolves #2850

* Pyparsing 3.1 has changes

* Store and track file objects, not file wrappers
This commit is contained in:
Sean Gillies 2023-06-26 11:01:31 -06:00 committed by GitHub
parent e2dcbcef13
commit 0560836d5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 80 additions and 37 deletions

View File

@ -4,7 +4,10 @@ Changes
1.3.8 (2023-06-26)
------------------
- Prevent a crash when accessing the block shapes of a multidataset HDF5 file (#).
- Rasterio's Python file VSI plugin is now compatible with GDAL VRTs such as
the one used for boundless reads of datasets (#2856).
- Prevent a crash when accessing the block shapes of a multidataset HDF5 file
(#2859).
- Add a workaround for a GDAL multithreading bug introduced in 3.6.0 (#2851).
1.3.7 (2023-05-22)

View File

@ -81,7 +81,7 @@ except ImportError:
have_vsi_plugin = False
__all__ = ['band', 'open', 'pad', 'Env', 'CRS']
__version__ = "1.3.7dev"
__version__ = "1.3.8dev"
__gdal_version__ = gdal_version()
__proj_version__ = ".".join([str(version) for version in get_proj_version()])
__geos_version__ = ".".join([str(version) for version in get_geos_version()])

View File

@ -69,7 +69,7 @@ cdef bytes FILESYSTEM_PREFIX_BYTES = FILESYSTEM_PREFIX.encode("ascii")
# Currently the only way to "create" a file in the filesystem is to add
# an entry to this dictionary. GDAL will then Open the path later.
cdef _FILESYSTEM_INFO = {}
cdef _OPEN_FILE_OBJS = set()
cdef int install_filepath_plugin(VSIFilesystemPluginCallbacksStruct *callbacks_struct):
"""Install handlers for python file-like objects if it isn't already installed."""
@ -97,13 +97,33 @@ cdef void uninstall_filepath_plugin(VSIFilesystemPluginCallbacksStruct *callback
## Filesystem Functions
def clone_file_obj(fobj):
    """Return a new file object for the same content as `fobj`.

    Supports BytesIO, MemoryFile, fsspec files, and Python file objects.
    The strategy is chosen by duck typing, checked in this order:

    * objects with an ``fs`` attribute are reopened through
      ``fobj.fs.open(fobj.path, fobj.mode)``;
    * objects with a ``getbuffer`` attribute are rebuilt by passing that
      buffer to their own class;
    * anything else is reopened with the builtin ``open`` using
      ``fobj.name`` and ``fobj.mode``.
    """
    # Filesystem-backed files: ask the owning filesystem to open the path again.
    if hasattr(fobj, "fs"):
        return fobj.fs.open(fobj.path, fobj.mode)

    # In-memory files: construct a sibling instance from the exposed buffer.
    if hasattr(fobj, "getbuffer"):
        return fobj.__class__(fobj.getbuffer())

    # Fallback: reopen by name with the same mode.
    return open(fobj.name, fobj.mode)
cdef void* filepath_open(void *pUserData, const char *pszFilename, const char *pszAccess) with gil:
"""Access existing open file-like object in the virtual filesystem.
"""Access files in the virtual filesystem.
This function is mandatory in the GDAL Filesystem Plugin API.
This function returns clones of the file wrappers stored in
_FILESYSTEM_INFO. GDAL may call this function multiple times per
filename and each result must be separately seekable.
"""
cdef object file_wrapper
cdef object file_obj
if pszAccess != b"r" and pszAccess != b"rb":
log.error("FilePath is currently a read-only interface.")
@ -115,36 +135,33 @@ cdef void* filepath_open(void *pUserData, const char *pszFilename, const char *p
cdef dict filesystem_info = <object>pUserData
try:
file_wrapper = filesystem_info[pszFilename]
file_obj = clone_file_obj(filesystem_info[pszFilename])
except KeyError:
log.info("Object not found in virtual filesystem: filename=%r", pszFilename)
return NULL
if not hasattr(file_wrapper, "_file_obj"):
log.error("Unexpected file object found in FilePath filesystem.")
return NULL
return <void *>file_wrapper
# Open file wrappers are kept in this set and removed when closed.
_OPEN_FILE_OBJS.add(file_obj)
return <void *>file_obj
## File functions
cdef vsi_l_offset filepath_tell(void *pFile) with gil:
cdef object file_wrapper = <object>pFile
cdef object file_obj = file_wrapper._file_obj
cdef object file_obj = <object>pFile
cdef long pos = file_obj.tell()
return <vsi_l_offset>pos
cdef int filepath_seek(void *pFile, vsi_l_offset nOffset, int nWhence) except -1 with gil:
cdef object file_wrapper = <object>pFile
cdef object file_obj = file_wrapper._file_obj
cdef object file_obj = <object>pFile
# TODO: Add "seekable" check?
file_obj.seek(nOffset, nWhence)
return 0
cdef size_t filepath_read(void *pFile, void *pBuffer, size_t nSize, size_t nCount) with gil:
cdef object file_wrapper = <object>pFile
cdef object file_obj = file_wrapper._file_obj
cdef object file_obj = <object>pFile
cdef bytes python_data = file_obj.read(nSize * nCount)
cdef int num_bytes = len(python_data)
# NOTE: We have to cast to char* first, otherwise Cython doesn't do the conversion properly
@ -153,11 +170,8 @@ cdef size_t filepath_read(void *pFile, void *pBuffer, size_t nSize, size_t nCoun
cdef int filepath_close(void *pFile) except -1 with gil:
# Optional
cdef object file_wrapper = <object>pFile
cdef object file_obj = file_wrapper._file_obj
file_obj.seek(0)
_ = _FILESYSTEM_INFO.pop(file_wrapper._filepath_path, None)
cdef object file_obj = <object>pFile
_OPEN_FILE_OBJS.remove(file_obj)
return 0
@ -183,19 +197,18 @@ cdef class FilePathBase:
# auxiliary files.
self._dirname = dirname or str(uuid4())
if filename:
# GDAL's SRTMHGT driver requires the filename to be "correct" (match
# the bounds being written)
self.name = "{0}{1}/{2}".format(FILESYSTEM_PREFIX, self._dirname, filename)
else:
self.name = "{0}{1}/{1}".format(FILESYSTEM_PREFIX, self._dirname)
# GDAL's SRTMHGT driver requires the filename to be "correct" (match
# the bounds being written).
self._filename = filename or self._dirname
self.name = "{0}{1}/{2}".format(FILESYSTEM_PREFIX, self._dirname, self._filename)
self._path = self.name.encode('utf-8')
self._filepath_path = self._path[len(FILESYSTEM_PREFIX):]
self._file_obj = filelike_obj
self.mode = "r"
_FILESYSTEM_INFO[self._filepath_path] = self._file_obj
self.closed = False
_FILESYSTEM_INFO[self._filepath_path] = self
def exists(self):
"""Test if the in-memory file exists.
@ -234,4 +247,5 @@ cdef class FilePathBase:
to the user.
"""
_ = _FILESYSTEM_INFO.pop(self._filepath_path)
self.closed = True

View File

@ -1182,10 +1182,12 @@ cdef class MemoryFileBase:
cdef VSILFILE *fp = NULL
if file_or_bytes:
if hasattr(file_or_bytes, 'read'):
if hasattr(file_or_bytes, "read"):
initial_bytes = file_or_bytes.read()
elif isinstance(file_or_bytes, bytes):
initial_bytes = file_or_bytes
elif hasattr(file_or_bytes, "itemsize"):
initial_bytes = bytes(file_or_bytes)
else:
raise TypeError(
"Constructor argument must be a file opened in binary "
@ -1196,16 +1198,11 @@ cdef class MemoryFileBase:
# Make an in-memory directory specific to this dataset to help organize
# auxiliary files.
self._dirname = dirname or str(uuid4())
VSIMkdir("/vsimem/{0}".format(self._dirname).encode("utf-8"), 0666)
self._filename = filename or f"{self._dirname}.{ext.lstrip('.')}"
if filename:
# GDAL's SRTMHGT driver requires the filename to be "correct" (match
# the bounds being written)
self.name = "/vsimem/{0}/{1}".format(self._dirname, filename)
else:
# GDAL 2.1 requires a .zip extension for zipped files.
self.name = "/vsimem/{0}/{0}.{1}".format(self._dirname, ext.lstrip('.'))
VSIMkdir(f"/vsimem/{self._dirname}".encode('utf-8'), 0666)
self.name = f"/vsimem/{self._dirname}/{self._filename}"
self._path = self.name.encode('utf-8')
self._initial_bytes = initial_bytes

View File

@ -11,3 +11,4 @@ matplotlib
numpy>=1.10
snuggs~=1.4.0
setuptools>=20.0
pyparsing~=3.1

View File

@ -10,6 +10,7 @@ import pytest
import rasterio
from rasterio.enums import MaskFlags
from rasterio.shutil import copyfiles
from rasterio.windows import Window
try:
from rasterio.io import FilePath
@ -52,6 +53,33 @@ def test_initial_bytes(rgb_file_object):
with vsifile.open() as src:
assert src.driver == 'GTiff'
assert src.count == 3
assert src.dtypes == ("uint8", "uint8", "uint8")
assert src.read().shape == (3, 718, 791)
def test_initial_bytes_boundless(rgb_file_object):
    """A dataset opened from a FilePath supports boundless windowed reads."""
    # The requested window is 800 x 800 and the read is boundless; the
    # result is expected to have exactly the requested shape.
    oversized_window = Window(0, 0, 800, 800)

    with FilePath(rgb_file_object) as vsifile, vsifile.open() as src:
        assert src.driver == "GTiff"
        assert src.count == 3
        assert src.dtypes == ("uint8", "uint8", "uint8")

        data = src.read(window=oversized_window, boundless=True)
        assert data.shape == (3, 800, 800)
def test_filepath_vrt(rgb_file_object):
    """A VRT document built from a FilePath dataset can itself be opened."""
    from rasterio.vrt import _boundless_vrt_doc

    expected_dtypes = ('uint8', 'uint8', 'uint8')
    expected_shape = (3, 718, 791)

    with FilePath(rgb_file_object) as vsifile:
        with vsifile.open() as dataset:
            # Build the VRT document from the FilePath-backed dataset and
            # open it while that dataset is still open.
            document = _boundless_vrt_doc(dataset)
            with rasterio.open(document) as vrt:
                assert vrt.driver == "VRT"
                assert vrt.count == 3
                assert vrt.dtypes == expected_dtypes
                assert vrt.read().shape == expected_shape