Dieu M. Nguyen
08/29/2023, 9:01 PM
The `to_zarr()` step is getting errors.
# Create a branch on a lakeFS repo, build a random xarray Dataset, and
# write the first 200 time slices to a Zarr store on that branch.
import dask.array as da
import xarray as xr
import numpy as np

# lakeFS API client (access_key_id / secret_access_key / endpoint_url are
# defined earlier in the notebook -- TODO confirm they point at the lakeFS
# server, not at S3 itself).
configuration = lakefs_client.Configuration()
configuration.username = access_key_id
configuration.password = secret_access_key
configuration.host = endpoint_url
client = LakeFSClient(configuration)

repo = "zarr-test"
branch = "zarr-store"
client.branches.create_branch(
    repository=repo,
    branch_creation=models.BranchCreation(
        name=branch,
        source="main"))

# Create a reproducible random array using the seeded RandomState.
# BUG FIX: the original created `state` with seed 1234 but then overwrote
# `arr` with an unseeded `da.random.random(...)` call, silently discarding
# the seed. Keep only the seeded draw.
state = da.random.RandomState(1234)
shape = (180, 360, 400)
chunk_shape = (36, 72, 200)
nlats, nlons, ntimes = shape
arr = state.random(shape, chunks=chunk_shape)

ds = xr.Dataset(
    data_vars={
        "precipitation": xr.DataArray(arr, dims=('lat', 'lon', 'time'))
    },
    coords={
        "lat": xr.DataArray(np.linspace(-90, 90, num=nlats, endpoint=False), dims='lat'),
        "lon": xr.DataArray(np.linspace(-180, 180, num=nlons, endpoint=False), dims='lon'),
        "time": xr.date_range(start="2000-06-01", freq="D", periods=ntimes)
    },
    attrs={
        "description": "GPM IMERG test dataset"
    }
)

# Write the first 200 time slices.
ds_0 = ds.isel(time=slice(0, 200))

# BUG FIX: `to_zarr` resolves cloud paths through fsspec/s3fs, which only
# understands the "s3://" scheme -- "s3a://" is a Hadoop/Spark convention
# and fails here (surfacing as PermissionError: Access Denied). Route s3fs
# to the lakeFS S3 gateway explicitly via storage_options.
gateway_path = f's3://{repo}/{branch}/precipitation_data.zarr'
task = ds_0.to_zarr(gateway_path,
                    storage_options={
                        "key": access_key_id,
                        "secret": secret_access_key,
                        "endpoint_url": endpoint_url,
                    },
                    mode='w',
                    compute=False)
Without `zarr_version`, I get `PermissionError: Access Denied`. If I set `zarr_version=3`, I get `KeyError: 'zarr.json'`. Maybe I am setting the `s3a_gateway_path` incorrectly?

Amit Kesarwani
08/29/2023, 9:18 PM
08/29/2023, 9:18 PMfrom pyspark.sql import SparkSession
spark = SparkSession.builder.appName("lakeFS / Jupyter") \
.config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
.config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint) \
.config("spark.hadoop.fs.s3a.path.style.access", "true") \
.config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey) \
.config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey) \
.getOrCreate()
spark.sparkContext.setLogLevel("INFO")
spark
Dieu M. Nguyen
08/29/2023, 9:45 PM
If anyone has gotten `xarray.to_zarr()` working with lakeFS, please let me know!
Something else I tried is to write the Zarr store (which can be thought of as a directory of subdirectories and files) locally, and try to upload it with `client.objects.upload_object()`. But `content` is expected to be IOBase and, I believe, only a single file, not a group like the Zarr store. So does lakeFS support writing Zarr?

Barak Amar
# Open an s3fs filesystem pointed at the lakeFS S3 gateway and build a
# zarr-compatible key/value mapper for a path on a branch.
s3 = s3fs.S3FileSystem(
    anon=False,
    # BUG FIX: Slack's link markup wrapped the endpoint in angle brackets
    # ("<http://localhost:8000>"); s3fs needs the bare URL.
    endpoint_url="http://localhost:8000",
    key="AKIAIOSFODNN7EXAMPLE",
    secret="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
)
# lakeFS objects are addressed as <repo>/<branch>/<key>.
s3_path = f"{repo}/{branch}/{name}"
s3store = s3.get_mapper(s3_path)
Oz Katz
# Minimal example: write an xarray DataArray as Zarr to a lakeFS branch
# through the lakeFS S3 gateway.
# requires:
# pip install s3fs numpy xarray zarr
import numpy as np
import xarray as xr

# s3fs storage options for the lakeFS S3 gateway.
# BUG FIX: Slack's link markup wrapped the endpoint in angle brackets
# ("<http://localhost:8000>"); s3fs needs the bare URL.
lakefs = {
    "key": "AKIAIOSFOLQUICKSTART",
    "secret": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
    "endpoint_url": "http://localhost:8000",
}

data = np.random.rand(2000, 5000)
# Name the DataArray so to_zarr can store it as a named variable
# (an unnamed array cannot be serialized as a dataset variable).
arr = xr.DataArray(data, name="data")
# BUG FIX: angle brackets stripped from the Slack-mangled path as well;
# plain "s3://" scheme resolves against endpoint_url in storage_options.
arr.to_zarr('s3://my-repository/my-branch/some/path/', storage_options=lakefs)
Dieu M. Nguyen
08/30/2023, 4:34 PM