Skip to content

Commit 3bf9f85

Browse files
authored
Include all V2 codecs as filters (#713)
1 parent 6f5347d commit 3bf9f85

File tree

12 files changed

+86
-38
lines changed

12 files changed

+86
-38
lines changed

conftest.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ def pytest_addoption(parser):
2929
action="store_true",
3030
help="runs tests requiring a network connection",
3131
)
32+
parser.addoption(
33+
"--run-slow-tests",
34+
action="store_true",
35+
help="runs slow tests",
36+
)
3237
parser.addoption(
3338
"--run-minio-tests",
3439
action="store_true",
@@ -44,6 +49,8 @@ def pytest_runtest_setup(item):
4449
)
4550
if "minio" in item.keywords and not item.config.getoption("--run-minio-tests"):
4651
pytest.skip("set --run-minio-tests to run tests requiring docker and minio")
52+
if "slow" in item.keywords and not item.config.getoption("--run-slow-tests"):
53+
pytest.skip("set --run-slow-tests to run slow tests")
4754

4855

4956
def _xarray_subset():

docs/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ vds = open_virtual_mfdataset(urls, parser = parser, registry = registry)
103103
print(vds)
104104
```
105105

106-
The magic of VirtualiZarr is that you can persist the virtual dataset to disk in a chunk references format such as [Icechunk][https://icechunk.io/],
106+
The magic of VirtualiZarr is that you can persist the virtual dataset to disk in a chunk references format such as [Icechunk](https://icechunk.io/),
107107
meaning that the work of constructing the single coherent dataset only needs to happen once.
108108
For subsequent data access, you can use [xarray.open_zarr][] to open that Icechunk store, which on object storage is
109109
far faster than using [xarray.open_mfdataset][] to open the original non-cloud-optimized files.

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ run-mypy = { cmd = "mypy virtualizarr" }
178178
# Using '--dist loadscope' (rather than default of '--dist load' when '-n auto'
179179
# is used), reduces test hangs that appear to be macOS-related.
180180
run-tests = { cmd = "pytest -n auto --dist loadscope --run-network-tests --verbose --durations=10" }
181+
run-tests-including-slow = { cmd = "pytest -n auto --dist loadscope --run-network-tests --run-slow-tests --verbose --durations=10" }
181182
run-tests-no-network = { cmd = "pytest -n auto --verbose" }
182183
run-tests-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov=virtualizarr --cov=term-missing" }
183184
run-tests-xml-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov=virtualizarr --cov-report=xml" }
@@ -224,6 +225,7 @@ show_error_codes = true
224225
module = [
225226
"docker",
226227
"fsspec.*",
228+
"s3fs.*",
227229
"h5py",
228230
"kerchunk.*",
229231
"minio",
@@ -305,6 +307,7 @@ markers = [
305307
# this warning: "PytestUnknownMarkWarning: Unknown pytest.mark.flaky"
306308
"flaky: flaky tests",
307309
"network: marks test requiring internet (select with '--run-network-tests')",
310+
"slow: marks test as slow (select with '--run-slow-tests')",
308311
"minio: marks test requiring docker and minio (select with '--run-minio-tests')",
309312
]
310313
filterwarnings = [

virtualizarr/codecs.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,11 @@
2020
]
2121

2222

23-
def numcodec_config_to_configurable(num_codec: dict) -> dict:
23+
def zarr_codec_config_to_v3(num_codec: dict) -> dict:
2424
"""
2525
Convert a numcodecs codec into a zarr v3 configurable.
2626
"""
27+
# TODO: Special case Blosc codec
2728
if num_codec["id"].startswith("numcodecs."):
2829
return num_codec
2930

@@ -32,6 +33,19 @@ def numcodec_config_to_configurable(num_codec: dict) -> dict:
3233
return {"name": name, "configuration": num_codec_copy}
3334

3435

36+
def zarr_codec_config_to_v2(num_codec: dict) -> dict:
37+
"""
38+
Convert a numcodecs codec into a zarr v2 configurable.
39+
"""
40+
# TODO: Special case Blosc codec
41+
if name := num_codec.get("name", None):
42+
return {"id": name, **num_codec["configuration"]}
43+
elif num_codec.get("id", None):
44+
return num_codec
45+
else:
46+
raise ValueError(f"Expected a valid Zarr V2 or V3 codec dict, got {num_codec}")
47+
48+
3549
def extract_codecs(
3650
codecs: CodecPipeline,
3751
) -> DeconstructedCodecPipeline:

virtualizarr/parsers/hdf/hdf.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
import numpy as np
1010

11-
from virtualizarr.codecs import numcodec_config_to_configurable
11+
from virtualizarr.codecs import zarr_codec_config_to_v3
1212
from virtualizarr.manifests import (
1313
ChunkEntry,
1414
ChunkManifest,
@@ -71,9 +71,7 @@ def _construct_manifest_array(
7171
encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype)
7272
attrs["_FillValue"] = encoded_cf_fill_value
7373

74-
codec_configs = [
75-
numcodec_config_to_configurable(codec.get_config()) for codec in codecs
76-
]
74+
codec_configs = [zarr_codec_config_to_v3(codec.get_config()) for codec in codecs]
7775

7876
fill_value = dataset.fillvalue.item()
7977
dims = tuple(_dataset_dims(dataset, group=group))

virtualizarr/parsers/kerchunk/translator.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from zarr.core.metadata import ArrayV3Metadata
1010

1111
from virtualizarr.codecs import (
12-
numcodec_config_to_configurable,
12+
zarr_codec_config_to_v3,
1313
)
1414
from virtualizarr.manifests import (
1515
ChunkManifest,
@@ -65,9 +65,7 @@ def from_kerchunk_refs(decoded_arr_refs_zarray, zattrs) -> "ArrayV3Metadata":
6565

6666
# Ensure compressor is a list before unpacking
6767
codec_configs = [*filters, *(compressor if compressor is not None else [])]
68-
numcodec_configs = [
69-
numcodec_config_to_configurable(config) for config in codec_configs
70-
]
68+
numcodec_configs = [zarr_codec_config_to_v3(config) for config in codec_configs]
7169
dimension_names = decoded_arr_refs_zarray["dimension_names"]
7270
return create_v3_array_metadata(
7371
chunk_shape=tuple(decoded_arr_refs_zarray["chunks"]),

virtualizarr/tests/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
requires_network = pytest.mark.network
77
requires_minio = pytest.mark.minio
8+
slow_test = pytest.mark.slow
89

910

1011
def _importorskip(

virtualizarr/tests/test_integration.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pytest
88
import xarray as xr
99
import xarray.testing as xrt
10-
from obstore.store import LocalStore
10+
from obstore.store import LocalStore, from_url
1111

1212
from conftest import ARRAYBYTES_CODEC, ZLIB_CODEC
1313
from virtualizarr import open_virtual_dataset
@@ -25,7 +25,9 @@
2525
has_icechunk,
2626
has_kerchunk,
2727
requires_kerchunk,
28+
requires_network,
2829
requires_zarr_python,
30+
slow_test,
2931
)
3032

3133
icechunk = pytest.importorskip("icechunk")
@@ -519,3 +521,35 @@ def test_convert_relative_paths_to_urls(self, netcdf4_file, local_registry):
519521
path = manifest["0.0.0"]["path"]
520522

521523
assert path == expected_path
524+
525+
526+
@requires_kerchunk
527+
@requires_network
528+
@slow_test
529+
def test_roundtrip_dataset_with_multiple_compressors():
530+
# Regression test to make sure we can load data with a compression and a shuffle codec
531+
# TODO: Simplify this test to not require network access
532+
import s3fs
533+
534+
bucket = "s3://nex-gddp-cmip6"
535+
path = "NEX-GDDP-CMIP6/ACCESS-CM2/ssp126/r1i1p1f1/tasmax/tasmax_day_ACCESS-CM2_ssp126_r1i1p1f1_gn_2015_v2.0.nc"
536+
url = f"{bucket}/{path}"
537+
store = from_url(bucket, region="us-west-2", skip_signature=True)
538+
registry = ObjectStoreRegistry({bucket: store})
539+
parser = HDFParser()
540+
vds = open_virtual_dataset(
541+
url=url, parser=parser, registry=registry, loadable_variables=[]
542+
)
543+
544+
ds_refs = vds.vz.to_kerchunk(format="dict")
545+
fs = s3fs.S3FileSystem(anon=True)
546+
with (
547+
xr.open_dataset(fs.open(url), engine="h5netcdf", decode_times=True) as expected,
548+
xr.open_dataset(
549+
ds_refs,
550+
decode_times=True,
551+
engine="kerchunk",
552+
storage_options={"remote_options": {"anon": True}},
553+
) as observed,
554+
):
555+
xr.testing.assert_allclose(expected, observed)

virtualizarr/tests/test_parsers/test_dmrpp.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from virtualizarr.parsers import DMRPPParser, HDFParser
1111
from virtualizarr.parsers.dmrpp import DMRParser
1212
from virtualizarr.registry import ObjectStoreRegistry
13-
from virtualizarr.tests import requires_network
13+
from virtualizarr.tests import requires_network, slow_test
1414
from virtualizarr.tests.utils import obstore_local, obstore_s3
1515
from virtualizarr.xarray import open_virtual_dataset
1616

@@ -346,6 +346,7 @@ def dmrparser(dmrpp_xml_str: str, filepath: str) -> DMRParser:
346346
return DMRParser(root=ET.fromstring(dmrpp_xml_str), data_filepath=filepath)
347347

348348

349+
@slow_test
349350
@requires_network
350351
@pytest.mark.parametrize("data_url, dmrpp_url", urls)
351352
def test_NASA_dmrpp(data_url, dmrpp_url):
@@ -373,6 +374,7 @@ def test_NASA_dmrpp(data_url, dmrpp_url):
373374

374375

375376
@requires_network
377+
@slow_test
376378
@pytest.mark.parametrize("data_url, dmrpp_url", urls)
377379
def test_NASA_dmrpp_load(data_url, dmrpp_url):
378380
store = obstore_s3(

virtualizarr/tests/test_writers/test_kerchunk.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ def testconvert_v3_to_v2_metadata(array_v3_metadata):
214214
assert v2_metadata.dtype.to_native_dtype() == np.dtype("int32")
215215
assert v2_metadata.chunks == chunks
216216
assert v2_metadata.fill_value == 0
217-
compressor_config = v2_metadata.compressor.get_config()
217+
compressor_config = v2_metadata.filters[1].get_config()
218218
assert compressor_config["id"] == "blosc"
219219
assert compressor_config["cname"] == "zstd"
220220
assert compressor_config["clevel"] == 5

0 commit comments

Comments
 (0)